middleman-blog-similar 1.1.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +33 -0
  5. data/.travis.yml +12 -21
  6. data/Gemfile +20 -35
  7. data/Guardfile +9 -9
  8. data/README.md +57 -82
  9. data/Rakefile +9 -7
  10. data/features/default.feature +21 -0
  11. data/features/support/env.rb +20 -10
  12. data/fixtures/test-app/source/{2014-05-08-article0.md → 2014-05-08-article0.html.md} +2 -1
  13. data/fixtures/test-app/source/{2014-05-09-article1.md → 2014-05-09-article1.html.md} +2 -0
  14. data/fixtures/test-app/source/{2014-05-10-article2.md → 2014-05-10-article2.html.md} +1 -1
  15. data/fixtures/test-app/source/{2014-05-11-article3.md → 2014-05-11-article3.html.md} +0 -0
  16. data/fixtures/test-app/source/{2014-05-12-article4.md → 2014-05-12-article4.html.md} +1 -0
  17. data/fixtures/test-app/source/{2014-05-13-article5.md → 2014-05-13-article5.html.md} +1 -0
  18. data/fixtures/test-app/source/{2014-05-14-article6.md → 2014-05-14-article6.html.md} +1 -0
  19. data/fixtures/test-app/source/index.html.slim +1 -0
  20. data/fixtures/test-app/source/layout.slim +1 -1
  21. data/fixtures/test-app/source/layouts/article.slim +6 -6
  22. data/lib/middleman-blog-similar.rb +3 -3
  23. data/lib/middleman-blog-similar/blog_article_extensions.rb +8 -20
  24. data/lib/middleman-blog-similar/database.rb +70 -0
  25. data/lib/middleman-blog-similar/extension.rb +37 -15
  26. data/lib/middleman-blog-similar/helpers.rb +12 -6
  27. data/lib/middleman-blog-similar/models/article.rb +33 -0
  28. data/lib/middleman-blog-similar/models/migration.rb +32 -0
  29. data/lib/middleman-blog-similar/models/tag.rb +10 -0
  30. data/lib/middleman-blog-similar/models/tagging.rb +10 -0
  31. data/lib/middleman-blog-similar/resource_list_manipulator.rb +19 -0
  32. data/lib/middleman-blog-similar/tagger/entagger.rb +17 -0
  33. data/lib/middleman-blog-similar/tagger/mecab.rb +19 -0
  34. data/lib/middleman-blog-similar/tagger/tags.rb +13 -0
  35. data/lib/middleman-blog-similar/version.rb +1 -1
  36. data/lib/middleman_extension.rb +0 -1
  37. data/middleman-blog-similar.gemspec +18 -13
  38. data/spec/middleman-blog-similar/extension_spec.rb +44 -1
  39. data/spec/middleman-blog-similar/tagger_spec.rb +24 -0
  40. data/spec/spec_helper.rb +33 -24
  41. metadata +77 -72
  42. data/features/damerau_levenshtein.feature +0 -20
  43. data/features/levenshtein.feature +0 -20
  44. data/features/word_frequency.feature +0 -15
  45. data/lib/middleman-blog-similar/algorithm.rb +0 -19
  46. data/lib/middleman-blog-similar/algorithm/damerau_levenshtein.rb +0 -7
  47. data/lib/middleman-blog-similar/algorithm/levenshtein.rb +0 -7
  48. data/lib/middleman-blog-similar/algorithm/unigrams.csv +0 -21089
  49. data/lib/middleman-blog-similar/algorithm/word_frequency.rb +0 -69
  50. data/lib/middleman-blog-similar/algorithm/word_frequency/mecab.rb +0 -22
  51. data/lib/middleman-blog-similar/algorithm/word_frequency/tree_tagger.rb +0 -20
  52. data/spec/helper_spec.rb +0 -4
  53. data/spec/middleman-blog-similar/algorithm/damerau_levenshtein_spec.rb +0 -42
  54. data/spec/middleman-blog-similar/algorithm/levenshtein_spec.rb +0 -42
  55. data/spec/middleman-blog-similar/algorithm/word_frequency/mecab_spec.rb +0 -41
  56. data/spec/middleman-blog-similar/algorithm/word_frequency/tree_tagger_spec.rb +0 -52
  57. data/spec/middleman-blog-similar/algorithm/word_frequency_spec.rb +0 -73
  58. data/spec/middleman-blog-similar/algorithm_spec.rb +0 -40
data/lib/middleman-blog-similar/algorithm/word_frequency.rb DELETED
@@ -1,69 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require 'fast-stemmer'
4
- require 'csv'
5
-
6
- # logic ported from https://plugins.trac.wordpress.org/browser/wordpress-23-related-posts-plugin/trunk/recommendations.php
7
-
8
- class Middleman::Blog::Similar::Algorithm::WordFrequency < ::Middleman::Blog::Similar::Algorithm
9
- @@unigrams = nil
10
- class << self
11
- def unigrams_path
12
- File.join File.dirname(__FILE__), 'unigrams.csv'
13
- end
14
- def unigrams
15
- if @@unigrams.nil?
16
- @@unigrams = {}
17
- CSV.foreach(unigrams_path, { :col_sep => "\t" }) do|row|
18
- @@unigrams[row[0]] = row[4].to_f if row.length == 5
19
- end
20
- end
21
- @@unigrams
22
- end
23
- end
24
- def distance a
25
- d = 0xffffff
26
- wf = a.similarity_algorithm.word_freq
27
- word_freq.each do|word, freq|
28
- if wf.has_key? word
29
- d -= wf[word] * freq
30
- end
31
- end
32
- d
33
- end
34
- def words
35
- re = /[\t\s\n,\.、。 ]/
36
- article.untagged_body.split(re) + article.title.split(re)
37
- end
38
- def generate_word_freq
39
- suitable_words = unigrams.dup
40
- word_freq= {}
41
- words.each do|word|
42
- word.downcase!
43
- word = word.stem if word =~ /^\w+$/
44
- word_freq[word] ||= 0
45
- word_freq[word] += 1
46
- end
47
- selected_words = {}
48
- word_freq.each do|word, freq|
49
- selected_words[word] = unigrams[word] * Math.sqrt(freq) if unigrams[word]
50
- end
51
- article.tags.each do|tag|
52
- tag = tag.downcase.stem
53
- word_freq[tag] ||= 0
54
- word_freq[tag] += tag_weight
55
- end
56
- Hash[ word_freq.sort_by{|k, v| k }.sort_by{|k, v| v } ]
57
- end
58
- def word_freq
59
- @word_freq ||= generate_word_freq
60
- end
61
- def generate_tags
62
- generate_word_freq.keys.reverse
63
- end
64
- def tags
65
- @tags ||= generate_tags
66
- end
67
- def tag_weight ; 5 ; end
68
- def unigrams ; self.class.unigrams ; end
69
- end
data/lib/middleman-blog-similar/algorithm/word_frequency/mecab.rb DELETED
@@ -1,22 +0,0 @@
1
- # -*- coding: UTF-8 -*-
2
-
3
- require 'middleman-blog-similar/algorithm/word_frequency'
4
-
5
- class Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab < ::Middleman::Blog::Similar::Algorithm::WordFrequency
6
- class CommandNotFound < StandardError; end
7
- def words
8
- res = []
9
- IO.popen("mecab 2>/dev/null", 'r+') {|f|
10
- f.puts article.untagged_body
11
- f.puts article.title
12
- f.close_write
13
- while line = f.gets
14
- word, pos = line.split(/[\t\s]+/)
15
- next unless pos
16
- pos = pos.split(',')
17
- res << word if pos[0] == '名詞' && %w{一般 固有名詞}.include?(pos[1])
18
- end
19
- }
20
- res
21
- end
22
- end
data/lib/middleman-blog-similar/algorithm/word_frequency/tree_tagger.rb DELETED
@@ -1,20 +0,0 @@
1
- require 'middleman-blog-similar/algorithm/word_frequency'
2
-
3
- class Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger < ::Middleman::Blog::Similar::Algorithm::WordFrequency
4
- class CommandNotFound < StandardError; end
5
- def words
6
- raise CommandNotFound.new "You need to tree tagger command with ENV['TREETAGGER_COMMAND']" unless ENV['TREETAGGER_COMMAND']
7
- res = []
8
- IO.popen("#{ ENV['TREETAGGER_COMMAND'] } 2>/dev/null", 'r+') {|f|
9
- f.puts article.untagged_body
10
- f.puts article.title
11
- f.close_write
12
- while line = f.gets
13
- word, pos = line.split(/\s+/)
14
- # http://courses.washington.edu/hypertxt/csar-v02/penntable.html
15
- res << word if %w{NN JJ NP}.include? pos[0..2]
16
- end
17
- }
18
- res
19
- end
20
- end
data/spec/helper_spec.rb DELETED
@@ -1,4 +0,0 @@
1
- require 'spec_helper'
2
-
3
- describe "Middleman::Blog::BlogArticle" do
4
- end
data/spec/middleman-blog-similar/algorithm/damerau_levenshtein_spec.rb DELETED
@@ -1,42 +0,0 @@
1
- require 'spec_helper'
2
- require 'middleman-blog-similar/algorithm/damerau_levenshtein'
3
-
4
- describe 'Middleman::Blog::Similar::Algorithm::DamerauLevenshtein' do
5
- let(:app) {
6
- middleman_app('test-app') {
7
- activate :similar, :algorithm => :damerau_levenshtein
8
- }
9
- }
10
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/08/article0.html' }
11
- let(:algorithm) { article.similarity_algorithm }
12
- describe(:app) {
13
- describe(:similarity_algorithm) {
14
- subject { app.similarity_algorithm }
15
- it { should be ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }
16
- }
17
- }
18
- describe(:similarity_algorithm) {
19
- subject { algorithm }
20
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }
21
- describe(:similar_articles) {
22
- subject { algorithm.similar_articles.map(&:url) }
23
- it {
24
- should eq [
25
- "/2014/05/13/article5.html",
26
- "/2014/05/09/article1.html",
27
- "/2014/05/12/article4.html",
28
- "/2014/05/14/article6.html",
29
- "/2014/05/10/article2.html",
30
- "/2014/05/11/article3.html"
31
- ]
32
- }
33
- }
34
- }
35
- describe(:article) {
36
- describe(:similarity_algorithm) {
37
- subject { algorithm }
38
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }
39
- }
40
- }
41
-
42
- end
data/spec/middleman-blog-similar/algorithm/levenshtein_spec.rb DELETED
@@ -1,42 +0,0 @@
1
- require 'spec_helper'
2
- require 'middleman-blog-similar/algorithm/levenshtein'
3
-
4
- describe 'Middleman::Blog::Similar::Algorithm::Levenshtein' do
5
- let(:app) {
6
- middleman_app('test-app') {
7
- activate :similar, :algorithm => :levenshtein
8
- }
9
- }
10
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/08/article0.html' }
11
- let(:algorithm) { article.similarity_algorithm }
12
- describe(:app) {
13
- describe(:similarity_algorithm) {
14
- subject { app.similarity_algorithm }
15
- it { should be ::Middleman::Blog::Similar::Algorithm::Levenshtein }
16
- }
17
- }
18
- describe(:similarity_algorithm) {
19
- subject { algorithm }
20
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::Levenshtein }
21
- describe(:similar_articles) {
22
- subject { algorithm.similar_articles.map(&:url) }
23
- it {
24
- should eq [
25
- "/2014/05/13/article5.html",
26
- "/2014/05/09/article1.html",
27
- "/2014/05/12/article4.html",
28
- "/2014/05/14/article6.html",
29
- "/2014/05/10/article2.html",
30
- "/2014/05/11/article3.html"
31
- ]
32
- }
33
- }
34
- }
35
- describe(:article) {
36
- describe(:similarity_algorithm) {
37
- subject { algorithm }
38
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::Levenshtein }
39
- }
40
- }
41
-
42
- end
data/spec/middleman-blog-similar/algorithm/word_frequency/mecab_spec.rb DELETED
@@ -1,41 +0,0 @@
1
- # -*- coding: UTF-8 -*-
2
-
3
- require 'spec_helper'
4
- require 'middleman-blog-similar/algorithm/word_frequency/mecab'
5
-
6
- describe 'Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab' do
7
- let(:app) {
8
- middleman_app('test-app') {
9
- activate :similar, :algorithm => :'word_frequency/mecab'
10
- }
11
- }
12
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/11/article3.html' }
13
- let(:algorithm) { article.similarity_algorithm }
14
- describe(:app) {
15
- describe(:similarity_algorithm) {
16
- subject { app.similarity_algorithm }
17
- it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
18
- }
19
- }
20
- describe(:similarity_algorithm) {
21
- subject { algorithm }
22
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
23
- }
24
- describe(:tags) {
25
- describe(:output) {
26
- if %x{which mecab}
27
- subject { algorithm.tags }
28
- it { should eq ["fox", "国家", "隙", "教師", "悪口", "尻", "坊ちゃん", "時分", "向", "叫び", "人間", "ネルソン", "この世", "西洋", "expect", "articl"] }
29
- else
30
- pending "mecab is not installed."
31
- end
32
- }
33
- }
34
- describe(:article) {
35
- describe(:similarity_algorithm) {
36
- subject { algorithm }
37
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
38
- }
39
- }
40
-
41
- end
data/spec/middleman-blog-similar/algorithm/word_frequency/tree_tagger_spec.rb DELETED
@@ -1,52 +0,0 @@
1
- require 'spec_helper'
2
- require 'middleman-blog-similar/algorithm/word_frequency/tree_tagger'
3
-
4
- describe 'Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger' do
5
- let(:app) {
6
- middleman_app('test-app') {
7
- activate :similar, :algorithm => :'word_frequency/tree_tagger'
8
- }
9
- }
10
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/10/article2.html' }
11
- let(:algorithm) { article.similarity_algorithm }
12
- describe(:app) {
13
- describe(:similarity_algorithm) {
14
- subject { app.similarity_algorithm }
15
- it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
16
- }
17
- }
18
- describe(:similarity_algorithm) {
19
- subject { algorithm }
20
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
21
- }
22
- describe(:tags) {
23
- describe(:output) {
24
- if ENV['TREETAGGER_COMMAND']
25
- subject { algorithm.tags }
26
- it { should eq ["quick", "fox", "dog", "brown", "lazi", "articl"] }
27
- else
28
- pending "ENV['TREETAGGER_COMMAND'] not set."
29
- end
30
- }
31
- context('if command path is not set') {
32
- subject { -> { algorithm.tags } }
33
- before {
34
- @cmd = ENV['TREETAGGER_COMMAND']
35
- ENV['TREETAGGER_COMMAND'] = nil
36
- }
37
- after {
38
- ENV['TREETAGGER_COMMAND'] = @cmd if @cmd
39
- }
40
- describe('raises error') {
41
- it { should raise_error Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger::CommandNotFound }
42
- }
43
- }
44
- }
45
- describe(:article) {
46
- describe(:similarity_algorithm) {
47
- subject { algorithm }
48
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
49
- }
50
- }
51
-
52
- end
data/spec/middleman-blog-similar/algorithm/word_frequency_spec.rb DELETED
@@ -1,73 +0,0 @@
1
- require 'spec_helper'
2
- require 'middleman-blog-similar/algorithm/word_frequency'
3
-
4
- describe 'Middleman::Blog::Similar::Algorithm::WordFrequency' do
5
- let(:app) {
6
- middleman_app('test-app') {
7
- activate :similar, :algorithm => :word_frequency
8
- }
9
- }
10
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/10/article2.html' }
11
- let(:algorithm) { article.similarity_algorithm }
12
- describe(:app) {
13
- describe(:similarity_algorithm) {
14
- subject { app.similarity_algorithm }
15
- it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency }
16
- }
17
- }
18
- describe(:algorithm) {
19
- subject { algorithm }
20
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency }
21
- }
22
- describe(:unigrams) {
23
- describe('length of keys') {
24
- subject { algorithm.unigrams.keys.length }
25
- it { should be 21089 }
26
- }
27
- describe('class') {
28
- subject { algorithm.unigrams }
29
- it { should be_a_kind_of Hash }
30
- }
31
- }
32
- describe(:similar_articles) {
33
- subject { algorithm.similar_articles.map(&:url) }
34
- it {
35
- should eq [
36
- "/2014/05/11/article3.html",
37
- "/2014/05/08/article0.html",
38
- "/2014/05/12/article4.html",
39
- "/2014/05/13/article5.html",
40
- "/2014/05/09/article1.html",
41
- "/2014/05/14/article6.html"
42
- ]
43
- }
44
- }
45
- describe(:tags) {
46
- subject { algorithm.tags }
47
- it { should eq ["fox", "quick", "dog", "brown", "the", "jump", "lazi", "over", "articl", "2"] }
48
- }
49
- describe(:word_freq) {
50
- subject { algorithm.word_freq }
51
- it {
52
- should eq({
53
- "brown" => 2,
54
- "dog" => 3,
55
- "fox" => 6,
56
- "jump" => 1,
57
- "lazi" => 1,
58
- "over" => 1,
59
- "quick" => 6,
60
- "the" => 2,
61
- "2" => 1,
62
- "articl" => 1
63
- })
64
- }
65
- }
66
- describe(:article) {
67
- describe(:similarity_algorithm) {
68
- subject { algorithm }
69
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency }
70
- }
71
- }
72
-
73
- end
data/spec/middleman-blog-similar/algorithm_spec.rb DELETED
@@ -1,40 +0,0 @@
1
- require 'spec_helper'
2
- require 'middleman-blog-similar/algorithm/word_frequency'
3
-
4
- describe 'Middleman::Blog::Similar::Algorithm' do
5
- let(:app) {
6
- middleman_app('test-app') {
7
- activate :similar
8
- }
9
- }
10
- let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/08/article0.html' }
11
- let(:algorithm) { article.similarity_algorithm }
12
- describe(:app) {
13
- describe(:similarity_algorithm) {
14
- subject { app.similarity_algorithm }
15
- it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency }
16
- }
17
- }
18
- describe(:similarity_algorithm) {
19
- describe(:articles) {
20
- subject { algorithm.articles.map(&:url) }
21
- it {
22
- should eq [
23
- "/2014/05/14/article6.html",
24
- "/2014/05/13/article5.html",
25
- "/2014/05/12/article4.html",
26
- "/2014/05/11/article3.html",
27
- "/2014/05/10/article2.html",
28
- "/2014/05/09/article1.html",
29
- "/2014/05/08/article0.html"
30
- ]
31
- }
32
- }
33
- }
34
- describe(:article) {
35
- describe(:similarity_algorithm) {
36
- subject { algorithm }
37
- it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency }
38
- }
39
- }
40
- end