middleman-blog-similar 1.1.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/.rubocop.yml +33 -0
- data/.travis.yml +12 -21
- data/Gemfile +20 -35
- data/Guardfile +9 -9
- data/README.md +57 -82
- data/Rakefile +9 -7
- data/features/default.feature +21 -0
- data/features/support/env.rb +20 -10
- data/fixtures/test-app/source/{2014-05-08-article0.md → 2014-05-08-article0.html.md} +2 -1
- data/fixtures/test-app/source/{2014-05-09-article1.md → 2014-05-09-article1.html.md} +2 -0
- data/fixtures/test-app/source/{2014-05-10-article2.md → 2014-05-10-article2.html.md} +1 -1
- data/fixtures/test-app/source/{2014-05-11-article3.md → 2014-05-11-article3.html.md} +0 -0
- data/fixtures/test-app/source/{2014-05-12-article4.md → 2014-05-12-article4.html.md} +1 -0
- data/fixtures/test-app/source/{2014-05-13-article5.md → 2014-05-13-article5.html.md} +1 -0
- data/fixtures/test-app/source/{2014-05-14-article6.md → 2014-05-14-article6.html.md} +1 -0
- data/fixtures/test-app/source/index.html.slim +1 -0
- data/fixtures/test-app/source/layout.slim +1 -1
- data/fixtures/test-app/source/layouts/article.slim +6 -6
- data/lib/middleman-blog-similar.rb +3 -3
- data/lib/middleman-blog-similar/blog_article_extensions.rb +8 -20
- data/lib/middleman-blog-similar/database.rb +70 -0
- data/lib/middleman-blog-similar/extension.rb +37 -15
- data/lib/middleman-blog-similar/helpers.rb +12 -6
- data/lib/middleman-blog-similar/models/article.rb +33 -0
- data/lib/middleman-blog-similar/models/migration.rb +32 -0
- data/lib/middleman-blog-similar/models/tag.rb +10 -0
- data/lib/middleman-blog-similar/models/tagging.rb +10 -0
- data/lib/middleman-blog-similar/resource_list_manipulator.rb +19 -0
- data/lib/middleman-blog-similar/tagger/entagger.rb +17 -0
- data/lib/middleman-blog-similar/tagger/mecab.rb +19 -0
- data/lib/middleman-blog-similar/tagger/tags.rb +13 -0
- data/lib/middleman-blog-similar/version.rb +1 -1
- data/lib/middleman_extension.rb +0 -1
- data/middleman-blog-similar.gemspec +18 -13
- data/spec/middleman-blog-similar/extension_spec.rb +44 -1
- data/spec/middleman-blog-similar/tagger_spec.rb +24 -0
- data/spec/spec_helper.rb +33 -24
- metadata +77 -72
- data/features/damerau_levenshtein.feature +0 -20
- data/features/levenshtein.feature +0 -20
- data/features/word_frequency.feature +0 -15
- data/lib/middleman-blog-similar/algorithm.rb +0 -19
- data/lib/middleman-blog-similar/algorithm/damerau_levenshtein.rb +0 -7
- data/lib/middleman-blog-similar/algorithm/levenshtein.rb +0 -7
- data/lib/middleman-blog-similar/algorithm/unigrams.csv +0 -21089
- data/lib/middleman-blog-similar/algorithm/word_frequency.rb +0 -69
- data/lib/middleman-blog-similar/algorithm/word_frequency/mecab.rb +0 -22
- data/lib/middleman-blog-similar/algorithm/word_frequency/tree_tagger.rb +0 -20
- data/spec/helper_spec.rb +0 -4
- data/spec/middleman-blog-similar/algorithm/damerau_levenshtein_spec.rb +0 -42
- data/spec/middleman-blog-similar/algorithm/levenshtein_spec.rb +0 -42
- data/spec/middleman-blog-similar/algorithm/word_frequency/mecab_spec.rb +0 -41
- data/spec/middleman-blog-similar/algorithm/word_frequency/tree_tagger_spec.rb +0 -52
- data/spec/middleman-blog-similar/algorithm/word_frequency_spec.rb +0 -73
- data/spec/middleman-blog-similar/algorithm_spec.rb +0 -40
@@ -1,69 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require 'fast-stemmer'
|
4
|
-
require 'csv'
|
5
|
-
|
6
|
-
# logic ported from https://plugins.trac.wordpress.org/browser/wordpress-23-related-posts-plugin/trunk/recommendations.php
|
7
|
-
|
8
|
-
# Similarity algorithm that compares articles by the words they share.
#
# Every article is reduced to a table of lower-cased (and, for plain \w+
# tokens, stemmed) words with their occurrence counts; each article tag adds
# +tag_weight+ to its entry. Two articles are considered closer the more
# weighted words they have in common (see #distance).
class Middleman::Blog::Similar::Algorithm::WordFrequency < ::Middleman::Blog::Similar::Algorithm
  # Cache for the unigram table. Kept as a class variable so that the table is
  # read once and the same Hash is visible from subclasses.
  @@unigrams = nil

  class << self
    # Path of the tab-separated unigram table stored next to this file.
    def unigrams_path
      File.join File.dirname(__FILE__), 'unigrams.csv'
    end

    # Returns the unigram table as a Hash (column 0 => column 4 as Float),
    # reading the file on first use. Rows that do not have exactly five
    # columns are skipped.
    #
    # The cache is only published after the whole file has been read, so a
    # failure half-way through does not leave a partial table behind.
    def unigrams
      @@unigrams ||= begin
        table = {}
        # Options are passed as keywords; a positional options Hash is
        # rejected by CSV.foreach on Ruby 3.
        CSV.foreach(unigrams_path, col_sep: "\t") do |row|
          table[row[0]] = row[4].to_f if row.length == 5
        end
        table
      end
    end
  end

  # Distance between this article and article +a+; smaller is more similar.
  #
  # Starts from 0xffffff and, for every word present in both word tables,
  # subtracts the product of the two counts.
  def distance(a)
    other = a.similarity_algorithm.word_freq
    word_freq.reduce(0xffffff) do |d, (word, freq)|
      other.key?(word) ? d - other[word] * freq : d
    end
  end

  # Raw tokens of the article body (tags stripped) followed by those of the
  # title. Tokens are separated by whitespace (including the ideographic
  # space), commas and Latin/Japanese full stops. Empty strings produced by
  # adjacent separators are dropped so they are not counted as a word.
  def words
    separator = /[\s,.、。\u3000]/
    tokens = article.untagged_body.split(separator) + article.title.split(separator)
    tokens.reject(&:empty?)
  end

  # Builds the word => count table for the article.
  #
  # Returns a Hash ordered by count ascending. The two-pass sort (by word,
  # then by count) is kept as-is: the relative order of words with equal
  # counts therefore depends on how sort_by treats ties, which Ruby does not
  # guarantee to be stable.
  def generate_word_freq
    counts = Hash.new(0)
    words.each do |word|
      word = word.downcase
      word = word.stem if word =~ /\A\w+\z/
      counts[word] += 1
    end
    article.tags.each { |tag| counts[tag.downcase.stem] += tag_weight }
    # Hash[] yields a plain Hash, so the default of 0 does not leak out.
    Hash[counts.sort_by { |word, _| word }.sort_by { |_, count| count }]
  end

  # Memoized word => count table (see #generate_word_freq).
  def word_freq
    @word_freq ||= generate_word_freq
  end

  # Words of the article, highest count first. Uses the memoized table so the
  # article is not tokenized a second time.
  def generate_tags
    word_freq.keys.reverse
  end

  # Memoized result of #generate_tags.
  def tags
    @tags ||= generate_tags
  end

  # Amount added to a word's count for each article tag equal to it.
  def tag_weight
    5
  end

  # Instance-level shortcut for the class-level unigram table.
  def unigrams
    self.class.unigrams
  end
end
|
@@ -1,22 +0,0 @@
|
|
1
|
-
# -*- coding: UTF-8 -*-
|
2
|
-
|
3
|
-
require 'middleman-blog-similar/algorithm/word_frequency'
|
4
|
-
|
5
|
-
# Word-frequency algorithm that obtains its word list from the +mecab+
# command instead of splitting on separators.
class Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab < ::Middleman::Blog::Similar::Algorithm::WordFrequency
  # Declared for callers; nothing in this class raises it.
  class CommandNotFound < StandardError; end

  # Pipes the article body (tags stripped) and title through +mecab+ and
  # returns the first column of every output line whose feature list starts
  # with 名詞 (noun) followed by 一般 (general) or 固有名詞 (proper noun).
  # Lines without a second column are skipped. The command's stderr is
  # discarded.
  def words
    nouns = []
    IO.popen("mecab 2>/dev/null", 'r+') do |io|
      io.puts article.untagged_body
      io.puts article.title
      # Signal end of input so the command can finish and flush its output.
      io.close_write
      io.each_line do |line|
        surface, features = line.split(/[\t\s]+/)
        next unless features
        kind, subkind = features.split(',')
        nouns << surface if kind == '名詞' && %w{一般 固有名詞}.include?(subkind)
      end
    end
    nouns
  end
end
|
@@ -1,20 +0,0 @@
|
|
1
|
-
require 'middleman-blog-similar/algorithm/word_frequency'
|
2
|
-
|
3
|
-
# Word-frequency algorithm that obtains its word list from an external
# tagger command named by ENV['TREETAGGER_COMMAND'].
class Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger < ::Middleman::Blog::Similar::Algorithm::WordFrequency
  # Raised by #words when ENV['TREETAGGER_COMMAND'] is not set.
  class CommandNotFound < StandardError; end

  # Pipes the article body (tags stripped) and title through the configured
  # command and returns the first column of every output line whose tag
  # (second column) begins with NN, JJ or NP.
  #
  # Raises CommandNotFound when ENV['TREETAGGER_COMMAND'] is not set.
  def words
    command = ENV['TREETAGGER_COMMAND']
    raise CommandNotFound, "You need to tree tagger command with ENV['TREETAGGER_COMMAND']" unless command

    selected = []
    IO.popen("#{ command } 2>/dev/null", 'r+') do |io|
      io.puts article.untagged_body
      io.puts article.title
      # Signal end of input so the command can finish and flush its output.
      io.close_write
      io.each_line do |line|
        word, pos = line.split(/\s+/)
        # Blank or single-column lines carry no tag; skip them instead of
        # calling a String method on nil.
        next unless pos
        # http://courses.washington.edu/hypertxt/csar-v02/penntable.html
        # Compare the two-character tag prefix, so that longer tags such as
        # NNS, NPS, JJR and JJS are accepted as well.
        selected << word if %w{NN JJ NP}.include?(pos[0, 2])
      end
    end
    selected
  end
end
|
data/spec/helper_spec.rb
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'middleman-blog-similar/algorithm/damerau_levenshtein'
|
3
|
-
|
4
|
-
# Activates :similar with the :damerau_levenshtein algorithm on the
# 'test-app' fixture and checks which algorithm class is selected and in
# which order similar articles are returned for article0.
describe 'Middleman::Blog::Similar::Algorithm::DamerauLevenshtein' do
  let(:app) do
    middleman_app('test-app') do
      activate :similar, algorithm: :damerau_levenshtein
    end
  end
  let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/08/article0.html' }
  let(:algorithm) { article.similarity_algorithm }

  describe :app do
    describe :similarity_algorithm do
      subject { app.similarity_algorithm }

      # The application exposes the algorithm class itself.
      it { should be ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }
    end
  end

  describe :similarity_algorithm do
    subject { algorithm }

    # The article exposes an instance of that class.
    it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }

    describe :similar_articles do
      subject { algorithm.similar_articles.map(&:url) }

      it do
        should eq(%w[
          /2014/05/13/article5.html
          /2014/05/09/article1.html
          /2014/05/12/article4.html
          /2014/05/14/article6.html
          /2014/05/10/article2.html
          /2014/05/11/article3.html
        ])
      end
    end
  end

  describe :article do
    describe :similarity_algorithm do
      subject { algorithm }

      it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::DamerauLevenshtein }
    end
  end
end
|
@@ -1,42 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'middleman-blog-similar/algorithm/levenshtein'
|
3
|
-
|
4
|
-
# Activates :similar with the :levenshtein algorithm on the 'test-app'
# fixture and checks which algorithm class is selected and in which order
# similar articles are returned for article0.
describe 'Middleman::Blog::Similar::Algorithm::Levenshtein' do
  let(:app) do
    middleman_app('test-app') do
      activate :similar, algorithm: :levenshtein
    end
  end
  let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/08/article0.html' }
  let(:algorithm) { article.similarity_algorithm }

  describe :app do
    describe :similarity_algorithm do
      subject { app.similarity_algorithm }

      # The application exposes the algorithm class itself.
      it { should be ::Middleman::Blog::Similar::Algorithm::Levenshtein }
    end
  end

  describe :similarity_algorithm do
    subject { algorithm }

    # The article exposes an instance of that class.
    it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::Levenshtein }

    describe :similar_articles do
      subject { algorithm.similar_articles.map(&:url) }

      it do
        should eq(%w[
          /2014/05/13/article5.html
          /2014/05/09/article1.html
          /2014/05/12/article4.html
          /2014/05/14/article6.html
          /2014/05/10/article2.html
          /2014/05/11/article3.html
        ])
      end
    end
  end

  describe :article do
    describe :similarity_algorithm do
      subject { algorithm }

      it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::Levenshtein }
    end
  end
end
|
@@ -1,41 +0,0 @@
|
|
1
|
-
# -*- coding: UTF-8 -*-
|
2
|
-
|
3
|
-
require 'spec_helper'
|
4
|
-
require 'middleman-blog-similar/algorithm/word_frequency/mecab'
|
5
|
-
|
6
|
-
# Activates :similar with the 'word_frequency/mecab' algorithm on the
# 'test-app' fixture and checks the selected algorithm class and, when the
# mecab command is available, the tags produced for article3.
describe 'Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab' do
  let(:app) do
    middleman_app('test-app') do
      activate :similar, algorithm: :'word_frequency/mecab'
    end
  end
  let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/11/article3.html' }
  let(:algorithm) { article.similarity_algorithm }

  describe :app do
    describe :similarity_algorithm do
      subject { app.similarity_algorithm }

      it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
    end
  end

  describe :similarity_algorithm do
    subject { algorithm }

    it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
  end

  describe :tags do
    describe :output do
      # Backticks return a String, which is truthy even when empty, so the
      # availability check uses the exit status of `which` instead.
      if system('which mecab > /dev/null 2>&1')
        subject { algorithm.tags }

        it { should eq ["fox", "国家", "隙", "教師", "悪口", "尻", "坊ちゃん", "時分", "向", "叫び", "人間", "ネルソン", "この世", "西洋", "expect", "articl"] }
      else
        pending "mecab is not installed."
      end
    end
  end

  describe :article do
    describe :similarity_algorithm do
      subject { algorithm }

      it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::Mecab }
    end
  end
end
|
@@ -1,52 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'middleman-blog-similar/algorithm/word_frequency/tree_tagger'
|
3
|
-
|
4
|
-
# Activates :similar with the 'word_frequency/tree_tagger' algorithm on the
# 'test-app' fixture. Checks the selected algorithm class, the tags for
# article2 when ENV['TREETAGGER_COMMAND'] is set, and the error raised when
# it is not.
describe 'Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger' do
  let(:app) do
    middleman_app('test-app') do
      activate :similar, algorithm: :'word_frequency/tree_tagger'
    end
  end
  let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/10/article2.html' }
  let(:algorithm) { article.similarity_algorithm }

  describe :app do
    describe :similarity_algorithm do
      subject { app.similarity_algorithm }

      it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
    end
  end

  describe :similarity_algorithm do
    subject { algorithm }

    it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
  end

  describe :tags do
    describe :output do
      # Evaluated when the spec file is loaded, not per example.
      if ENV['TREETAGGER_COMMAND']
        subject { algorithm.tags }

        it { should eq(%w[quick fox dog brown lazi articl]) }
      else
        pending "ENV['TREETAGGER_COMMAND'] not set."
      end
    end

    context 'if command path is not set' do
      subject { -> { algorithm.tags } }

      # Clear the variable for the example and put any previous value back.
      before do
        @cmd = ENV['TREETAGGER_COMMAND']
        ENV['TREETAGGER_COMMAND'] = nil
      end

      after do
        ENV['TREETAGGER_COMMAND'] = @cmd if @cmd
      end

      describe 'raises error' do
        it { should raise_error Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger::CommandNotFound }
      end
    end
  end

  describe :article do
    describe :similarity_algorithm do
      subject { algorithm }

      it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency::TreeTagger }
    end
  end
end
|
@@ -1,73 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'middleman-blog-similar/algorithm/word_frequency'
|
3
|
-
|
4
|
-
# Activates :similar with the :word_frequency algorithm on the 'test-app'
# fixture and, for article2, checks the algorithm class, the unigram table,
# the similar-article order, the tags and the word-count table.
describe 'Middleman::Blog::Similar::Algorithm::WordFrequency' do
  let(:app) do
    middleman_app('test-app') do
      activate :similar, algorithm: :word_frequency
    end
  end
  let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/10/article2.html' }
  let(:algorithm) { article.similarity_algorithm }

  describe :app do
    describe :similarity_algorithm do
      subject { app.similarity_algorithm }

      it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency }
    end
  end

  describe :algorithm do
    subject { algorithm }

    it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency }
  end

  describe :unigrams do
    describe 'length of keys' do
      subject { algorithm.unigrams.keys.length }

      it { should be 21089 }
    end

    describe 'class' do
      subject { algorithm.unigrams }

      it { should be_a_kind_of Hash }
    end
  end

  describe :similar_articles do
    subject { algorithm.similar_articles.map(&:url) }

    it do
      should eq(%w[
        /2014/05/11/article3.html
        /2014/05/08/article0.html
        /2014/05/12/article4.html
        /2014/05/13/article5.html
        /2014/05/09/article1.html
        /2014/05/14/article6.html
      ])
    end
  end

  describe :tags do
    subject { algorithm.tags }

    it { should eq(%w[fox quick dog brown the jump lazi over articl 2]) }
  end

  describe :word_freq do
    subject { algorithm.word_freq }

    it do
      should eq(
        'brown' => 2,
        'dog' => 3,
        'fox' => 6,
        'jump' => 1,
        'lazi' => 1,
        'over' => 1,
        'quick' => 6,
        'the' => 2,
        '2' => 1,
        'articl' => 1
      )
    end
  end

  describe :article do
    describe :similarity_algorithm do
      subject { algorithm }

      it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency }
    end
  end
end
|
@@ -1,40 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'middleman-blog-similar/algorithm/word_frequency'
|
3
|
-
|
4
|
-
# Activates :similar without options on the 'test-app' fixture and checks
# that WordFrequency is the algorithm in use and that the algorithm lists
# the fixture's articles in the expected order.
describe 'Middleman::Blog::Similar::Algorithm' do
  let(:app) do
    middleman_app('test-app') do
      activate :similar
    end
  end
  let(:article) { app.sitemap.find_resource_by_destination_path '/2014/05/08/article0.html' }
  let(:algorithm) { article.similarity_algorithm }

  describe :app do
    describe :similarity_algorithm do
      subject { app.similarity_algorithm }

      # No :algorithm option was given, so this is the default choice.
      it { should be ::Middleman::Blog::Similar::Algorithm::WordFrequency }
    end
  end

  describe :similarity_algorithm do
    describe :articles do
      subject { algorithm.articles.map(&:url) }

      it do
        should eq(%w[
          /2014/05/14/article6.html
          /2014/05/13/article5.html
          /2014/05/12/article4.html
          /2014/05/11/article3.html
          /2014/05/10/article2.html
          /2014/05/09/article1.html
          /2014/05/08/article0.html
        ])
      end
    end
  end

  describe :article do
    describe :similarity_algorithm do
      subject { algorithm }

      it { should be_a_kind_of ::Middleman::Blog::Similar::Algorithm::WordFrequency }
    end
  end
end
|