middleman-blog-similar 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -3
  3. data/CHANGELOG.md +5 -0
  4. data/Gemfile +6 -0
  5. data/README.md +63 -19
  6. data/features/damerau_levenshtein.feature +5 -5
  7. data/features/levenshtein.feature +5 -5
  8. data/features/word_frequency.feature +15 -0
  9. data/fixtures/test-app/source/2014-05-08-article0.md +1 -0
  10. data/fixtures/test-app/source/2014-05-10-article2.md +2 -1
  11. data/fixtures/test-app/source/2014-05-11-article3.md +2 -1
  12. data/fixtures/test-app/source/2014-05-14-article6.md +1 -1
  13. data/fixtures/test-app/source/layouts/article.slim +2 -2
  14. data/lib/middleman-blog-similar/algorithm.rb +19 -0
  15. data/lib/middleman-blog-similar/algorithm/damerau_levenshtein.rb +7 -0
  16. data/lib/middleman-blog-similar/algorithm/levenshtein.rb +7 -0
  17. data/lib/middleman-blog-similar/algorithm/unigrams.csv +21089 -0
  18. data/lib/middleman-blog-similar/algorithm/word_frequency.rb +69 -0
  19. data/lib/middleman-blog-similar/algorithm/word_frequency/mecab.rb +20 -0
  20. data/lib/middleman-blog-similar/algorithm/word_frequency/tree_tagger.rb +20 -0
  21. data/lib/middleman-blog-similar/blog_article_extensions.rb +20 -11
  22. data/lib/middleman-blog-similar/extension.rb +10 -7
  23. data/lib/middleman-blog-similar/helpers.rb +6 -12
  24. data/lib/middleman-blog-similar/version.rb +1 -1
  25. data/middleman-blog-similar.gemspec +3 -2
  26. data/spec/middleman-blog-similar/algorithm/damerau_levenshtein_spec.rb +42 -0
  27. data/spec/middleman-blog-similar/algorithm/levenshtein_spec.rb +42 -0
  28. data/spec/middleman-blog-similar/algorithm/word_frequency/mecab_spec.rb +41 -0
  29. data/spec/middleman-blog-similar/algorithm/word_frequency/tree_tagger_spec.rb +52 -0
  30. data/spec/middleman-blog-similar/algorithm/word_frequency_spec.rb +73 -0
  31. data/spec/middleman-blog-similar/algorithm_spec.rb +40 -0
  32. data/spec/spec_helper.rb +26 -2
  33. metadata +37 -7
  34. data/lib/middleman-blog-similar/engines/base.rb +0 -27
  35. data/lib/middleman-blog-similar/engines/damerau_levenshtein.rb +0 -15
  36. data/lib/middleman-blog-similar/engines/levenshtein.rb +0 -15
  37. data/spec/middleman-blog-similar/engines/base_spec.rb +0 -4
@@ -0,0 +1,69 @@
1
+ # encoding: utf-8
2
+
3
+ require 'fast-stemmer'
4
+ require 'csv'
5
+
6
+ # logic ported from https://plugins.trac.wordpress.org/browser/wordpress-23-related-posts-plugin/trunk/recommendations.php
7
+
8
# Similarity algorithm that compares articles by the words they share.
#
# Each article is reduced to a table of lower-cased, stemmed words and their
# counts (see #word_freq); the article's tags are folded into the same table
# with an extra weight (see #tag_weight).
class Middleman::Blog::Similar::Algorithm::WordFrequency < ::Middleman::Blog::Similar::Algorithm
  # Cache for the unigram table. Being a class variable it is shared with
  # subclasses, so the CSV file is read at most once per process.
  @@unigrams = nil

  class << self
    # @return [String] path of the tab-separated unigram table that sits
    #   next to this source file
    def unigrams_path
      File.join File.dirname(__FILE__), 'unigrams.csv'
    end

    # Lazily loads the unigram table.
    #
    # Only rows with exactly five columns are used: the first column becomes
    # the key and the fifth column, converted with #to_f, becomes the value.
    #
    # @return [Hash{String => Float}]
    def unigrams
      if @@unigrams.nil?
        table = {}
        # Options are given as trailing arguments rather than a braced Hash:
        # Ruby 3 no longer converts a positional Hash into keyword options.
        CSV.foreach(unigrams_path, :col_sep => "\t") do |row|
          table[row[0]] = row[4].to_f if row.length == 5
        end
        # Publish the cache only after a complete read, so a failure while
        # reading does not leave an empty or partial table behind.
        @@unigrams = table
      end
      @@unigrams
    end
  end

  # Computes the distance between this article and +a+.
  #
  # Starts from 0xffffff and, for every word present in both articles,
  # subtracts the product of the two articles' counts for that word, so the
  # value shrinks as the articles share more (and more frequent) words.
  #
  # @param a [#similarity_algorithm] article to compare against; its
  #   algorithm must respond to #word_freq
  # @return [Numeric]
  def distance a
    d = 0xffffff
    other_freq = a.similarity_algorithm.word_freq
    word_freq.each do |word, freq|
      d -= other_freq[word] * freq if other_freq.key?(word)
    end
    d
  end

  # Splits the article's tag-stripped body and its title into raw tokens.
  #
  # @return [Array<String>]
  def words
    re = /[\t\s\n,\.、。 ]/
    article.untagged_body.split(re) + article.title.split(re)
  end

  # Builds the word => weight table for the article.
  #
  # Tokens are lower-cased, and stemmed when they consist of word characters
  # only; each occurrence counts 1. Every tag is lower-cased, stemmed and
  # adds #tag_weight. The result is ordered by key and then by weight,
  # exactly as before, so the heaviest entries come last.
  #
  # @return [Hash{String => Integer}]
  def generate_word_freq
    word_freq = Hash.new(0)
    words.each do |word|
      # Non-destructive downcase: do not mutate strings handed out by #words.
      word = word.downcase
      word = word.stem if word =~ /^\w+$/
      word_freq[word] += 1
    end
    article.tags.each do |tag|
      word_freq[tag.downcase.stem] += tag_weight
    end
    Hash[ word_freq.sort_by{|k, v| k }.sort_by{|k, v| v } ]
  end

  # @return [Hash{String => Integer}] memoized result of #generate_word_freq
  def word_freq
    @word_freq ||= generate_word_freq
  end

  # Lists the words of the article, heaviest first.
  #
  # Uses the memoized table instead of rebuilding it on every call.
  #
  # @return [Array<String>]
  def generate_tags
    word_freq.keys.reverse
  end

  # @return [Array<String>] memoized result of #generate_tags
  def tags
    @tags ||= generate_tags
  end

  # @return [Integer] weight added to the table for each article tag
  def tag_weight
    5
  end

  # @return [Hash{String => Float}] the class-level unigram table
  def unigrams
    self.class.unigrams
  end
end
@@ -0,0 +1,20 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ require 'middleman-blog-similar/algorithm/word_frequency'
4
+
5
# Word-frequency algorithm whose tokens come from the external `mecab`
# command instead of a regular-expression split.
class Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab < ::Middleman::Blog::Similar::Algorithm::WordFrequency
  # Kept for callers that reference it; nothing in this class raises it.
  class CommandNotFound < StandardError; end

  # Feeds the article's tag-stripped body and title to `mecab` and keeps the
  # first field of every output line whose second field starts with 名詞.
  #
  # The command still runs through the shell with stderr discarded, so a
  # missing `mecab` yields an empty list, as before.
  #
  # @return [Array<String>]
  def words
    require 'open3' # lazy: only this method needs it

    # Same bytes IO#puts would have written: one trailing newline per part.
    input = [article.untagged_body, article.title].map { |text|
      text = text.to_s
      text.end_with?("\n") ? text : "#{text}\n"
    }.join

    # capture2 feeds stdin and drains stdout concurrently. Writing the whole
    # input before reading any output can block forever once the pipe
    # buffers fill up on a large article.
    output, _status = Open3.capture2('mecab 2>/dev/null', :stdin_data => input)

    res = []
    output.each_line do |line|
      word, pos = line.split(/[\t\s]+/)
      res << word if pos && pos.start_with?('名詞')
    end
    res
  end
end
@@ -0,0 +1,20 @@
1
+ require 'middleman-blog-similar/algorithm/word_frequency'
2
+
3
# Word-frequency algorithm whose tokens come from an external tagger command
# named by ENV['TREETAGGER_COMMAND'].
class Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger < ::Middleman::Blog::Similar::Algorithm::WordFrequency
  # Raised by #words when ENV['TREETAGGER_COMMAND'] is missing or blank.
  class CommandNotFound < StandardError; end

  # Feeds the article's tag-stripped body and title to the tagger command
  # and keeps the first field of every output line whose second field is
  # one of NN, JJ or NP.
  #
  # @return [Array<String>]
  # @raise [CommandNotFound] when ENV['TREETAGGER_COMMAND'] is unset or blank
  def words
    command = ENV['TREETAGGER_COMMAND']
    if command.nil? || command.strip.empty?
      raise CommandNotFound, "You need to set the tree tagger command with ENV['TREETAGGER_COMMAND']"
    end

    require 'open3' # lazy: only this method needs it

    # Same bytes IO#puts would have written: one trailing newline per part.
    input = [article.untagged_body, article.title].map { |text|
      text = text.to_s
      text.end_with?("\n") ? text : "#{text}\n"
    }.join

    # capture2 feeds stdin and drains stdout concurrently. Writing the whole
    # input before reading any output can block forever once the pipe
    # buffers fill up on a large article.
    output, _status = Open3.capture2("#{ command } 2>/dev/null", :stdin_data => input)

    res = []
    output.each_line do |line|
      word, pos = line.split(/\s+/)
      # Blank or single-column lines carry no tag; skip them instead of
      # calling [] on nil.
      next unless pos
      # http://courses.washington.edu/hypertxt/csar-v02/penntable.html
      # NOTE(review): the slice keeps up to three characters, so only tags
      # that are exactly NN, JJ or NP match (NNS, JJR, NPS do not) - confirm
      # this is intended.
      res << word if %w{NN JJ NP}.include? pos[0..2]
    end
    res
  end
end
@@ -1,14 +1,23 @@
1
- module Middleman
2
- module Blog
3
- module Similar
4
- module BlogArticleExtensions
5
- def similar_articles
6
- if !@similar_articles && (engine = app.similarity_engine)
7
- @similar_articles = engine.new(self).similar_articles
8
- end
9
- @similar_articles || []
10
- end
11
- end
1
module Middleman
  module Blog
    module Similar
      # Methods mixed into sitemap resources so that an article can reach its
      # similarity algorithm. The host object must provide #app and #body.
      module BlogArticleExtensions
        # Articles similar to this one, as reported by the algorithm.
        #
        # @return [Array] memoized list; empty when no algorithm is configured
        def similar_articles
          if !@similar_articles && similarity_algorithm
            @similar_articles = similarity_algorithm.similar_articles
          end
          @similar_articles || []
        end

        # Words extracted from this article by the algorithm.
        #
        # The previous guard (`unless @words && similarity_algorithm`) entered
        # the branch when no algorithm was configured and called #words on
        # nil. The guard now mirrors #similar_articles.
        #
        # @return [Array] memoized list; empty when no algorithm is configured
        def words
          if !@words && similarity_algorithm
            @words = similarity_algorithm.words
          end
          @words || []
        end

        # Algorithm instance bound to this article, built on first use from
        # the class returned by app.similarity_algorithm.
        #
        # @return [Object, nil] nil when the app has no algorithm configured
        def similarity_algorithm
          if !@similarity_algorithm && (algorithm = app.similarity_algorithm)
            @similarity_algorithm = algorithm.new self
          end
          @similarity_algorithm
        end

        # @return [String] #body with every `<...>` sequence removed
        def untagged_body
          body.gsub(/<[^>]*>/ui,'')
        end
      end
    end
  end
end
@@ -1,25 +1,28 @@
1
1
  require 'middleman-blog-similar/blog_article_extensions'
2
2
  require 'middleman-blog-similar/helpers'
3
- require 'middleman-blog-similar/engines/base'
3
+ require 'middleman-blog-similar/algorithm'
4
4
 
5
5
  module Middleman
6
6
  module Blog
7
7
  class SimilarExtension < ::Middleman::Extension
8
8
 
9
- option :engine, :levenshtein, 'Similar lookup engine'
9
+ option :algorithm, :word_frequency, 'Similar lookup algorithm'
10
10
 
11
11
  self.defined_helpers = [ Middleman::Blog::Similar::Helpers ]
12
12
 
13
13
  def after_configuration
14
14
  require 'middleman-blog/blog_article'
15
15
  ::Middleman::Sitemap::Resource.send :include, Middleman::Blog::Similar::BlogArticleExtensions
16
- engine = options[:engine].to_s
16
+ algorithm = options[:algorithm].to_s
17
17
  begin
18
- require "middleman-blog-similar/engines/#{engine}"
19
- engine = ::Middleman::Blog::Similar::Engines.const_get engine.camelize
20
- app.set :similarity_engine, engine
18
+ require "middleman-blog-similar/algorithm/#{algorithm}"
19
+ ns = ::Middleman::Blog::Similar::Algorithm
20
+ algorithm.split('/').each do|n|
21
+ ns = ns.const_get n.camelize
22
+ end
23
+ app.set :similarity_algorithm, ns
21
24
  rescue LoadError => e
22
- app.logger.error "Requested similar engine '#{engine}' not found."
25
+ app.logger.error "Requested similar algorithm '#{algorithm}' not found."
23
26
  raise e
24
27
  end
25
28
  end
@@ -1,15 +1,9 @@
1
- module Middleman
2
- module Blog
3
- module Similar
4
- module Helpers
5
- def similar_articles
6
- if is_blog_article?
7
- current_article.similar_articles
8
- else
9
- []
10
- end
11
- end
12
- end
1
# Template helpers registered by the extension.
module Middleman::Blog::Similar::Helpers
  # Similar articles for the page being rendered.
  #
  # @return [Array] the current article's similar articles, or an empty
  #   list when the current page is not a blog article
  def similar_articles
    return [] unless is_blog_article?

    current_article.similar_articles
  end
end
@@ -1,7 +1,7 @@
1
1
  module Middleman
2
2
  module Blog
3
3
  module Similar
4
- VERSION = "1.0.0"
4
+ VERSION = "1.1.0"
5
5
  end
6
6
  end
7
7
  end
@@ -15,6 +15,7 @@ Gem::Specification.new do |s|
15
15
  s.files = `git ls-files -z`.split("\0")
16
16
  s.test_files = `git ls-files -z -- {fixtures,features,spec}/*`.split("\0")
17
17
  s.require_paths = ["lib"]
18
- s.add_runtime_dependency("middleman-core", ["~> 3.2"])
19
- s.add_runtime_dependency("middleman-blog", ["~> 3.5"])
18
+ s.add_runtime_dependency("middleman-core", ["~> 3.2" ])
19
+ s.add_runtime_dependency("middleman-blog", ["~> 3.5" ])
20
+ s.add_runtime_dependency("fast-stemmer", ["~> 1.0.2"])
20
21
  end
@@ -0,0 +1,42 @@
1
+ require 'spec_helper'
2
+ require 'middleman-blog-similar/algorithm/damerau_levenshtein'
3
+
4
# Checks that activating :damerau_levenshtein wires the algorithm class into
# the app and the article, and pins the resulting article ordering.
describe 'Middleman::Blog::Similar::Algorithm::DamerauLevenshtein' do
  let(:app) do
    middleman_app('test-app') do
      activate :similar, :algorithm => :damerau_levenshtein
    end
  end
  let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/08/article0.html' }
  let(:algorithm) { article.similarity_algorithm }

  describe(:app) do
    describe(:similarity_algorithm) do
      subject { app.similarity_algorithm }
      it { should be ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }
    end
  end

  describe(:similarity_algorithm) do
    subject { algorithm }
    it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }

    describe(:similar_articles) do
      subject { algorithm.similar_articles.map { |similar| similar.url } }
      it do
        should eq(%w[
          /2014/05/13/article5.html
          /2014/05/09/article1.html
          /2014/05/12/article4.html
          /2014/05/14/article6.html
          /2014/05/10/article2.html
          /2014/05/11/article3.html
        ])
      end
    end
  end

  describe(:article) do
    describe(:similarity_algorithm) do
      subject { algorithm }
      it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'spec_helper'
2
+ require 'middleman-blog-similar/algorithm/levenshtein'
3
+
4
# Checks that activating :levenshtein wires the algorithm class into the app
# and the article, and pins the resulting article ordering.
describe 'Middleman::Blog::Similar::Algorithm::Levenshtein' do
  let(:app) do
    middleman_app('test-app') do
      activate :similar, :algorithm => :levenshtein
    end
  end
  let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/08/article0.html' }
  let(:algorithm) { article.similarity_algorithm }

  describe(:app) do
    describe(:similarity_algorithm) do
      subject { app.similarity_algorithm }
      it { should be ::Middleman::Blog::Similar::Algorithm::Levenshtein }
    end
  end

  describe(:similarity_algorithm) do
    subject { algorithm }
    it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::Levenshtein }

    describe(:similar_articles) do
      subject { algorithm.similar_articles.map { |similar| similar.url } }
      it do
        should eq(%w[
          /2014/05/13/article5.html
          /2014/05/09/article1.html
          /2014/05/12/article4.html
          /2014/05/14/article6.html
          /2014/05/10/article2.html
          /2014/05/11/article3.html
        ])
      end
    end
  end

  describe(:article) do
    describe(:similarity_algorithm) do
      subject { algorithm }
      it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::Levenshtein }
    end
  end
end
@@ -0,0 +1,41 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ require 'spec_helper'
4
+ require 'middleman-blog-similar/algorithm/word_frequency/mecab'
5
+
6
# Checks that activating :'word_frequency/mecab' wires the algorithm class
# into the app and the article, and pins the tags produced for article3 when
# the mecab command is available.
describe 'Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab' do
  let(:app) {
    middleman_app('test-app') {
      activate :similar, :algorithm => :'word_frequency/mecab'
    }
  }
  let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/11/article3.html' }
  let(:algorithm) { article.similarity_algorithm }
  describe(:app) {
    describe(:similarity_algorithm) {
      subject { app.similarity_algorithm }
      it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
    }
  }
  describe(:similarity_algorithm) {
    subject { algorithm }
    it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
  }
  describe(:tags) {
    describe(:output) {
      # Backticks/%x return the command's output as a String, which is truthy
      # even when empty, so the old `if %x{which mecab}` could never reach the
      # pending branch. Kernel#system reports whether the command succeeded.
      if system('which mecab > /dev/null 2>&1')
        subject { algorithm.tags }
        it { should eq ["fox","の","国家","さん","方","誰","私","坊ちゃん","立脚","西洋","矛盾","相違","発会","時分","昨日","講演","教師","拡張","悪口","尻","変","結果","開始","周旋","向","叫び","反駁","反抗","前","人間","ネルソン","よう","関係","なん","ため","それ","そう","院","この世","お話","隙","expect","articl","3"] }
      else
        pending "mecab is not installed."
      end
    }
  }
  describe(:article) {
    describe(:similarity_algorithm) {
      subject { algorithm }
      it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
    }
  }

end
@@ -0,0 +1,52 @@
1
+ require 'spec_helper'
2
+ require 'middleman-blog-similar/algorithm/word_frequency/tree_tagger'
3
+
4
# Checks that activating :'word_frequency/tree_tagger' wires the algorithm
# class into the app and the article, pins the tags for article2 when a
# tagger command is configured, and checks the error raised when it is not.
describe 'Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger' do
  let(:app) do
    middleman_app('test-app') do
      activate :similar, :algorithm => :'word_frequency/tree_tagger'
    end
  end
  let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/10/article2.html' }
  let(:algorithm) { article.similarity_algorithm }

  describe(:app) do
    describe(:similarity_algorithm) do
      subject { app.similarity_algorithm }
      it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
    end
  end

  describe(:similarity_algorithm) do
    subject { algorithm }
    it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
  end

  describe(:tags) do
    describe(:output) do
      # Decided once, when the spec file is loaded.
      if ENV['TREETAGGER_COMMAND']
        subject { algorithm.tags }
        it { should eq ["quick", "fox", "dog", "brown", "lazi", "articl"] }
      else
        pending "ENV['TREETAGGER_COMMAND'] not set."
      end
    end

    context('if command path is not set') do
      subject { lambda { algorithm.tags } }

      # Hide the variable for the example, then put it back if it was set.
      before do
        @cmd = ENV['TREETAGGER_COMMAND']
        ENV['TREETAGGER_COMMAND'] = nil
      end
      after do
        ENV['TREETAGGER_COMMAND'] = @cmd if @cmd
      end

      describe('raises error') do
        it { should raise_error Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger::CommandNotFound }
      end
    end
  end

  describe(:article) do
    describe(:similarity_algorithm) do
      subject { algorithm }
      it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
    end
  end
end
@@ -0,0 +1,73 @@
1
+ require 'spec_helper'
2
+ require 'middleman-blog-similar/algorithm/word_frequency'
3
+
4
# Checks that activating :word_frequency wires the algorithm class into the
# app and the article, and pins the unigram table size, the article ordering,
# the tags and the word/weight table computed for article2.
describe 'Middleman::Blog::Similar::Algorithm::WordFrequency' do
  let(:app) do
    middleman_app('test-app') do
      activate :similar, :algorithm => :word_frequency
    end
  end
  let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/10/article2.html' }
  let(:algorithm) { article.similarity_algorithm }

  describe(:app) do
    describe(:similarity_algorithm) do
      subject { app.similarity_algorithm }
      it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency }
    end
  end

  describe(:algorithm) do
    subject { algorithm }
    it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency }
  end

  describe(:unigrams) do
    describe('length of keys') do
      subject { algorithm.unigrams.keys.length }
      it { should be 21089 }
    end

    describe('class') do
      subject { algorithm.unigrams }
      it { should be_a_kind_of Hash }
    end
  end

  describe(:similar_articles) do
    subject { algorithm.similar_articles.map { |similar| similar.url } }
    it do
      should eq(%w[
        /2014/05/11/article3.html
        /2014/05/08/article0.html
        /2014/05/12/article4.html
        /2014/05/13/article5.html
        /2014/05/09/article1.html
        /2014/05/14/article6.html
      ])
    end
  end

  describe(:tags) do
    subject { algorithm.tags }
    it { should eq(%w[fox quick dog brown the jump lazi over articl 2]) }
  end

  describe(:word_freq) do
    subject { algorithm.word_freq }
    it do
      should eq({
        "brown"  => 2,
        "dog"    => 3,
        "fox"    => 6,
        "jump"   => 1,
        "lazi"   => 1,
        "over"   => 1,
        "quick"  => 6,
        "the"    => 2,
        "2"      => 1,
        "articl" => 1
      })
    end
  end

  describe(:article) do
    describe(:similarity_algorithm) do
      subject { algorithm }
      it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency }
    end
  end
end