middleman-blog-similar 1.1.1 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +33 -0
  5. data/.travis.yml +12 -21
  6. data/Gemfile +20 -35
  7. data/Guardfile +9 -9
  8. data/README.md +57 -82
  9. data/Rakefile +9 -7
  10. data/features/default.feature +21 -0
  11. data/features/support/env.rb +20 -10
  12. data/fixtures/test-app/source/{2014-05-08-article0.md → 2014-05-08-article0.html.md} +2 -1
  13. data/fixtures/test-app/source/{2014-05-09-article1.md → 2014-05-09-article1.html.md} +2 -0
  14. data/fixtures/test-app/source/{2014-05-10-article2.md → 2014-05-10-article2.html.md} +1 -1
  15. data/fixtures/test-app/source/{2014-05-11-article3.md → 2014-05-11-article3.html.md} +0 -0
  16. data/fixtures/test-app/source/{2014-05-12-article4.md → 2014-05-12-article4.html.md} +1 -0
  17. data/fixtures/test-app/source/{2014-05-13-article5.md → 2014-05-13-article5.html.md} +1 -0
  18. data/fixtures/test-app/source/{2014-05-14-article6.md → 2014-05-14-article6.html.md} +1 -0
  19. data/fixtures/test-app/source/index.html.slim +1 -0
  20. data/fixtures/test-app/source/layout.slim +1 -1
  21. data/fixtures/test-app/source/layouts/article.slim +6 -6
  22. data/lib/middleman-blog-similar.rb +3 -3
  23. data/lib/middleman-blog-similar/blog_article_extensions.rb +8 -20
  24. data/lib/middleman-blog-similar/database.rb +70 -0
  25. data/lib/middleman-blog-similar/extension.rb +37 -15
  26. data/lib/middleman-blog-similar/helpers.rb +12 -6
  27. data/lib/middleman-blog-similar/models/article.rb +33 -0
  28. data/lib/middleman-blog-similar/models/migration.rb +32 -0
  29. data/lib/middleman-blog-similar/models/tag.rb +10 -0
  30. data/lib/middleman-blog-similar/models/tagging.rb +10 -0
  31. data/lib/middleman-blog-similar/resource_list_manipulator.rb +19 -0
  32. data/lib/middleman-blog-similar/tagger/entagger.rb +17 -0
  33. data/lib/middleman-blog-similar/tagger/mecab.rb +19 -0
  34. data/lib/middleman-blog-similar/tagger/tags.rb +13 -0
  35. data/lib/middleman-blog-similar/version.rb +1 -1
  36. data/lib/middleman_extension.rb +0 -1
  37. data/middleman-blog-similar.gemspec +18 -13
  38. data/spec/middleman-blog-similar/extension_spec.rb +44 -1
  39. data/spec/middleman-blog-similar/tagger_spec.rb +24 -0
  40. data/spec/spec_helper.rb +33 -24
  41. metadata +77 -72
  42. data/features/damerau_levenshtein.feature +0 -20
  43. data/features/levenshtein.feature +0 -20
  44. data/features/word_frequency.feature +0 -15
  45. data/lib/middleman-blog-similar/algorithm.rb +0 -19
  46. data/lib/middleman-blog-similar/algorithm/damerau_levenshtein.rb +0 -7
  47. data/lib/middleman-blog-similar/algorithm/levenshtein.rb +0 -7
  48. data/lib/middleman-blog-similar/algorithm/unigrams.csv +0 -21089
  49. data/lib/middleman-blog-similar/algorithm/word_frequency.rb +0 -69
  50. data/lib/middleman-blog-similar/algorithm/word_frequency/mecab.rb +0 -22
  51. data/lib/middleman-blog-similar/algorithm/word_frequency/tree_tagger.rb +0 -20
  52. data/spec/helper_spec.rb +0 -4
  53. data/spec/middleman-blog-similar/algorithm/damerau_levenshtein_spec.rb +0 -42
  54. data/spec/middleman-blog-similar/algorithm/levenshtein_spec.rb +0 -42
  55. data/spec/middleman-blog-similar/algorithm/word_frequency/mecab_spec.rb +0 -41
  56. data/spec/middleman-blog-similar/algorithm/word_frequency/tree_tagger_spec.rb +0 -52
  57. data/spec/middleman-blog-similar/algorithm/word_frequency_spec.rb +0 -73
  58. data/spec/middleman-blog-similar/algorithm_spec.rb +0 -40
@@ -1,69 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require 'fast-stemmer'
4
- require 'csv'
5
-
6
- # logic ported from https://plugins.trac.wordpress.org/browser/wordpress-23-related-posts-plugin/trunk/recommendations.php
7
-
8
- class Middleman::Blog::Similar::Algorithm::WordFrequency < ::Middleman::Blog::Similar::Algorithm
9
- @@unigrams = nil
10
- class << self
11
- def unigrams_path
12
- File.join File.dirname(__FILE__), 'unigrams.csv'
13
- end
14
- def unigrams
15
- if @@unigrams.nil?
16
- @@unigrams = {}
17
- CSV.foreach(unigrams_path, { :col_sep => "\t" }) do|row|
18
- @@unigrams[row[0]] = row[4].to_f if row.length == 5
19
- end
20
- end
21
- @@unigrams
22
- end
23
- end
24
- def distance a
25
- d = 0xffffff
26
- wf = a.similarity_algorithm.word_freq
27
- word_freq.each do|word, freq|
28
- if wf.has_key? word
29
- d -= wf[word] * freq
30
- end
31
- end
32
- d
33
- end
34
- def words
35
- re = /[\t\s\n,\.、。 ]/
36
- article.untagged_body.split(re) + article.title.split(re)
37
- end
38
- def generate_word_freq
39
- suitable_words = unigrams.dup
40
- word_freq= {}
41
- words.each do|word|
42
- word.downcase!
43
- word = word.stem if word =~ /^\w+$/
44
- word_freq[word] ||= 0
45
- word_freq[word] += 1
46
- end
47
- selected_words = {}
48
- word_freq.each do|word, freq|
49
- selected_words[word] = unigrams[word] * Math.sqrt(freq) if unigrams[word]
50
- end
51
- article.tags.each do|tag|
52
- tag = tag.downcase.stem
53
- word_freq[tag] ||= 0
54
- word_freq[tag] += tag_weight
55
- end
56
- Hash[ word_freq.sort_by{|k, v| k }.sort_by{|k, v| v } ]
57
- end
58
- def word_freq
59
- @word_freq ||= generate_word_freq
60
- end
61
- def generate_tags
62
- generate_word_freq.keys.reverse
63
- end
64
- def tags
65
- @tags ||= generate_tags
66
- end
67
- def tag_weight ; 5 ; end
68
- def unigrams ; self.class.unigrams ; end
69
- end
@@ -1,22 +0,0 @@
1
- # -*- coding: UTF-8 -*-
2
-
3
- require 'middleman-blog-similar/algorithm/word_frequency'
4
-
5
- class Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab < ::Middleman::Blog::Similar::Algorithm::WordFrequency
6
- class CommandNotFound < StandardError; end
7
- def words
8
- res = []
9
- IO.popen("mecab 2>/dev/null", 'r+') {|f|
10
- f.puts article.untagged_body
11
- f.puts article.title
12
- f.close_write
13
- while line = f.gets
14
- word, pos = line.split(/[\t\s]+/)
15
- next unless pos
16
- pos = pos.split(',')
17
- res << word if pos[0] == '名詞' && %w{一般 固有名詞}.include?(pos[1])
18
- end
19
- }
20
- res
21
- end
22
- end
@@ -1,20 +0,0 @@
1
- require 'middleman-blog-similar/algorithm/word_frequency'
2
-
3
- class Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger < ::Middleman::Blog::Similar::Algorithm::WordFrequency
4
- class CommandNotFound < StandardError; end
5
- def words
6
- raise CommandNotFound.new "You need to tree tagger command with ENV['TREETAGGER_COMMAND']" unless ENV['TREETAGGER_COMMAND']
7
- res = []
8
- IO.popen("#{ ENV['TREETAGGER_COMMAND'] } 2>/dev/null", 'r+') {|f|
9
- f.puts article.untagged_body
10
- f.puts article.title
11
- f.close_write
12
- while line = f.gets
13
- word, pos = line.split(/\s+/)
14
- # http://courses.washington.edu/hypertxt/csar-v02/penntable.html
15
- res << word if %w{NN JJ NP}.include? pos[0..2]
16
- end
17
- }
18
- res
19
- end
20
- end
data/spec/helper_spec.rb DELETED
@@ -1,4 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe "Middleman::Blog::BlogArticle" do
4
- end
@@ -1,42 +0,0 @@
1
- require 'spec_helper'
2
- require 'middleman-blog-similar/algorithm/damerau_levenshtein'
3
-
4
- describe 'Middleman::Blog::Similar::Algorithm::DamerauLevenshtein' do
5
- let(:app) {
6
- middleman_app('test-app') {
7
- activate :similar, :algorithm => :damerau_levenshtein
8
- }
9
- }
10
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/08/article0.html' }
11
- let(:algorithm) { article.similarity_algorithm }
12
- describe(:app) {
13
- describe(:similarity_algorithm) {
14
- subject { app.similarity_algorithm }
15
- it { should be ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }
16
- }
17
- }
18
- describe(:similarity_algorithm) {
19
- subject { algorithm }
20
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }
21
- describe(:similar_articles) {
22
- subject { algorithm.similar_articles.map(&:url) }
23
- it {
24
- should eq [
25
- "/2014/05/13/article5.html",
26
- "/2014/05/09/article1.html",
27
- "/2014/05/12/article4.html",
28
- "/2014/05/14/article6.html",
29
- "/2014/05/10/article2.html",
30
- "/2014/05/11/article3.html"
31
- ]
32
- }
33
- }
34
- }
35
- describe(:article) {
36
- describe(:similarity_algorithm) {
37
- subject { algorithm }
38
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }
39
- }
40
- }
41
-
42
- end
@@ -1,42 +0,0 @@
1
- require 'spec_helper'
2
- require 'middleman-blog-similar/algorithm/levenshtein'
3
-
4
- describe 'Middleman::Blog::Similar::Algorithm::Levenshtein' do
5
- let(:app) {
6
- middleman_app('test-app') {
7
- activate :similar, :algorithm => :levenshtein
8
- }
9
- }
10
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/08/article0.html' }
11
- let(:algorithm) { article.similarity_algorithm }
12
- describe(:app) {
13
- describe(:similarity_algorithm) {
14
- subject { app.similarity_algorithm }
15
- it { should be ::Middleman::Blog::Similar::Algorithm::Levenshtein }
16
- }
17
- }
18
- describe(:similarity_algorithm) {
19
- subject { algorithm }
20
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::Levenshtein }
21
- describe(:similar_articles) {
22
- subject { algorithm.similar_articles.map(&:url) }
23
- it {
24
- should eq [
25
- "/2014/05/13/article5.html",
26
- "/2014/05/09/article1.html",
27
- "/2014/05/12/article4.html",
28
- "/2014/05/14/article6.html",
29
- "/2014/05/10/article2.html",
30
- "/2014/05/11/article3.html"
31
- ]
32
- }
33
- }
34
- }
35
- describe(:article) {
36
- describe(:similarity_algorithm) {
37
- subject { algorithm }
38
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::Levenshtein }
39
- }
40
- }
41
-
42
- end
@@ -1,41 +0,0 @@
1
- # -*- coding: UTF-8 -*-
2
-
3
- require 'spec_helper'
4
- require 'middleman-blog-similar/algorithm/word_frequency/mecab'
5
-
6
- describe 'Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab' do
7
- let(:app) {
8
- middleman_app('test-app') {
9
- activate :similar, :algorithm => :'word_frequency/mecab'
10
- }
11
- }
12
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/11/article3.html' }
13
- let(:algorithm) { article.similarity_algorithm }
14
- describe(:app) {
15
- describe(:similarity_algorithm) {
16
- subject { app.similarity_algorithm }
17
- it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
18
- }
19
- }
20
- describe(:similarity_algorithm) {
21
- subject { algorithm }
22
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
23
- }
24
- describe(:tags) {
25
- describe(:output) {
26
- if %x{which mecab}
27
- subject { algorithm.tags }
28
- it { should eq ["fox", "国家", "隙", "教師", "悪口", "尻", "坊ちゃん", "時分", "向", "叫び", "人間", "ネルソン", "この世", "西洋", "expect", "articl"] }
29
- else
30
- pending "mecab is not installed."
31
- end
32
- }
33
- }
34
- describe(:article) {
35
- describe(:similarity_algorithm) {
36
- subject { algorithm }
37
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
38
- }
39
- }
40
-
41
- end
@@ -1,52 +0,0 @@
1
- require 'spec_helper'
2
- require 'middleman-blog-similar/algorithm/word_frequency/tree_tagger'
3
-
4
- describe 'Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger' do
5
- let(:app) {
6
- middleman_app('test-app') {
7
- activate :similar, :algorithm => :'word_frequency/tree_tagger'
8
- }
9
- }
10
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/10/article2.html' }
11
- let(:algorithm) { article.similarity_algorithm }
12
- describe(:app) {
13
- describe(:similarity_algorithm) {
14
- subject { app.similarity_algorithm }
15
- it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
16
- }
17
- }
18
- describe(:similarity_algorithm) {
19
- subject { algorithm }
20
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
21
- }
22
- describe(:tags) {
23
- describe(:output) {
24
- if ENV['TREETAGGER_COMMAND']
25
- subject { algorithm.tags }
26
- it { should eq ["quick", "fox", "dog", "brown", "lazi", "articl"] }
27
- else
28
- pending "ENV['TREETAGGER_COMMAND'] not set."
29
- end
30
- }
31
- context('if command path is not set') {
32
- subject { -> { algorithm.tags } }
33
- before {
34
- @cmd = ENV['TREETAGGER_COMMAND']
35
- ENV['TREETAGGER_COMMAND'] = nil
36
- }
37
- after {
38
- ENV['TREETAGGER_COMMAND'] = @cmd if @cmd
39
- }
40
- describe('raises error') {
41
- it { should raise_error Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger::CommandNotFound }
42
- }
43
- }
44
- }
45
- describe(:article) {
46
- describe(:similarity_algorithm) {
47
- subject { algorithm }
48
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
49
- }
50
- }
51
-
52
- end
@@ -1,73 +0,0 @@
1
- require 'spec_helper'
2
- require 'middleman-blog-similar/algorithm/word_frequency'
3
-
4
- describe 'Middleman::Blog::Similar::Algorithm::WordFrequency' do
5
- let(:app) {
6
- middleman_app('test-app') {
7
- activate :similar, :algorithm => :word_frequency
8
- }
9
- }
10
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/10/article2.html' }
11
- let(:algorithm) { article.similarity_algorithm }
12
- describe(:app) {
13
- describe(:similarity_algorithm) {
14
- subject { app.similarity_algorithm }
15
- it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency }
16
- }
17
- }
18
- describe(:algorithm) {
19
- subject { algorithm }
20
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency }
21
- }
22
- describe(:unigrams) {
23
- describe('length of keys') {
24
- subject { algorithm.unigrams.keys.length }
25
- it { should be 21089 }
26
- }
27
- describe('class') {
28
- subject { algorithm.unigrams }
29
- it { should be_a_kind_of Hash }
30
- }
31
- }
32
- describe(:similar_articles) {
33
- subject { algorithm.similar_articles.map(&:url) }
34
- it {
35
- should eq [
36
- "/2014/05/11/article3.html",
37
- "/2014/05/08/article0.html",
38
- "/2014/05/12/article4.html",
39
- "/2014/05/13/article5.html",
40
- "/2014/05/09/article1.html",
41
- "/2014/05/14/article6.html"
42
- ]
43
- }
44
- }
45
- describe(:tags) {
46
- subject { algorithm.tags }
47
- it { should eq ["fox", "quick", "dog", "brown", "the", "jump", "lazi", "over", "articl", "2"] }
48
- }
49
- describe(:word_freq) {
50
- subject { algorithm.word_freq }
51
- it {
52
- should eq({
53
- "brown" => 2,
54
- "dog" => 3,
55
- "fox" => 6,
56
- "jump" => 1,
57
- "lazi" => 1,
58
- "over" => 1,
59
- "quick" => 6,
60
- "the" => 2,
61
- "2" => 1,
62
- "articl" => 1
63
- })
64
- }
65
- }
66
- describe(:article) {
67
- describe(:similarity_algorithm) {
68
- subject { algorithm }
69
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency }
70
- }
71
- }
72
-
73
- end
@@ -1,40 +0,0 @@
1
- require 'spec_helper'
2
- require 'middleman-blog-similar/algorithm/word_frequency'
3
-
4
- describe 'Middleman::Blog::Similar::Algorithm' do
5
- let(:app) {
6
- middleman_app('test-app') {
7
- activate :similar
8
- }
9
- }
10
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/08/article0.html' }
11
- let(:algorithm) { article.similarity_algorithm }
12
- describe(:app) {
13
- describe(:similarity_algorithm) {
14
- subject { app.similarity_algorithm }
15
- it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency }
16
- }
17
- }
18
- describe(:similarity_algorithm) {
19
- describe(:articles) {
20
- subject { algorithm.articles.map(&:url) }
21
- it {
22
- should eq [
23
- "/2014/05/14/article6.html",
24
- "/2014/05/13/article5.html",
25
- "/2014/05/12/article4.html",
26
- "/2014/05/11/article3.html",
27
- "/2014/05/10/article2.html",
28
- "/2014/05/09/article1.html",
29
- "/2014/05/08/article0.html"
30
- ]
31
- }
32
- }
33
- }
34
- describe(:article) {
35
- describe(:similarity_algorithm) {
36
- subject { algorithm }
37
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency }
38
- }
39
- }
40
- end