middleman-blog-similar 1.1.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/.rubocop.yml +33 -0
- data/.travis.yml +12 -21
- data/Gemfile +20 -35
- data/Guardfile +9 -9
- data/README.md +57 -82
- data/Rakefile +9 -7
- data/features/default.feature +21 -0
- data/features/support/env.rb +20 -10
- data/fixtures/test-app/source/{2014-05-08-article0.md → 2014-05-08-article0.html.md} +2 -1
- data/fixtures/test-app/source/{2014-05-09-article1.md → 2014-05-09-article1.html.md} +2 -0
- data/fixtures/test-app/source/{2014-05-10-article2.md → 2014-05-10-article2.html.md} +1 -1
- data/fixtures/test-app/source/{2014-05-11-article3.md → 2014-05-11-article3.html.md} +0 -0
- data/fixtures/test-app/source/{2014-05-12-article4.md → 2014-05-12-article4.html.md} +1 -0
- data/fixtures/test-app/source/{2014-05-13-article5.md → 2014-05-13-article5.html.md} +1 -0
- data/fixtures/test-app/source/{2014-05-14-article6.md → 2014-05-14-article6.html.md} +1 -0
- data/fixtures/test-app/source/index.html.slim +1 -0
- data/fixtures/test-app/source/layout.slim +1 -1
- data/fixtures/test-app/source/layouts/article.slim +6 -6
- data/lib/middleman-blog-similar.rb +3 -3
- data/lib/middleman-blog-similar/blog_article_extensions.rb +8 -20
- data/lib/middleman-blog-similar/database.rb +70 -0
- data/lib/middleman-blog-similar/extension.rb +37 -15
- data/lib/middleman-blog-similar/helpers.rb +12 -6
- data/lib/middleman-blog-similar/models/article.rb +33 -0
- data/lib/middleman-blog-similar/models/migration.rb +32 -0
- data/lib/middleman-blog-similar/models/tag.rb +10 -0
- data/lib/middleman-blog-similar/models/tagging.rb +10 -0
- data/lib/middleman-blog-similar/resource_list_manipulator.rb +19 -0
- data/lib/middleman-blog-similar/tagger/entagger.rb +17 -0
- data/lib/middleman-blog-similar/tagger/mecab.rb +19 -0
- data/lib/middleman-blog-similar/tagger/tags.rb +13 -0
- data/lib/middleman-blog-similar/version.rb +1 -1
- data/lib/middleman_extension.rb +0 -1
- data/middleman-blog-similar.gemspec +18 -13
- data/spec/middleman-blog-similar/extension_spec.rb +44 -1
- data/spec/middleman-blog-similar/tagger_spec.rb +24 -0
- data/spec/spec_helper.rb +33 -24
- metadata +77 -72
- data/features/damerau_levenshtein.feature +0 -20
- data/features/levenshtein.feature +0 -20
- data/features/word_frequency.feature +0 -15
- data/lib/middleman-blog-similar/algorithm.rb +0 -19
- data/lib/middleman-blog-similar/algorithm/damerau_levenshtein.rb +0 -7
- data/lib/middleman-blog-similar/algorithm/levenshtein.rb +0 -7
- data/lib/middleman-blog-similar/algorithm/unigrams.csv +0 -21089
- data/lib/middleman-blog-similar/algorithm/word_frequency.rb +0 -69
- data/lib/middleman-blog-similar/algorithm/word_frequency/mecab.rb +0 -22
- data/lib/middleman-blog-similar/algorithm/word_frequency/tree_tagger.rb +0 -20
- data/spec/helper_spec.rb +0 -4
- data/spec/middleman-blog-similar/algorithm/damerau_levenshtein_spec.rb +0 -42
- data/spec/middleman-blog-similar/algorithm/levenshtein_spec.rb +0 -42
- data/spec/middleman-blog-similar/algorithm/word_frequency/mecab_spec.rb +0 -41
- data/spec/middleman-blog-similar/algorithm/word_frequency/tree_tagger_spec.rb +0 -52
- data/spec/middleman-blog-similar/algorithm/word_frequency_spec.rb +0 -73
- data/spec/middleman-blog-similar/algorithm_spec.rb +0 -40
File without changes
|
@@ -2,16 +2,16 @@ html
|
|
2
2
|
head
|
3
3
|
meta charset="utf-8"
|
4
4
|
title= current_article.title
|
5
|
-
body
|
5
|
+
body data-similar-article-count=similar_articles.count
|
6
6
|
.container
|
7
7
|
h1= current_article.title
|
8
|
+
p.tags= current_article.tags.join ', '
|
8
9
|
= yield
|
9
10
|
|
10
11
|
h2 Similar Entries
|
11
12
|
ul
|
12
|
-
- similar_articles.
|
13
|
+
- similar_articles.each_with_index do|article, index|
|
13
14
|
li class="a#{index}"
|
14
|
-
= link_to article.
|
15
|
-
|
16
|
-
|
17
|
-
= current_article.app.similarity_algorithm
|
15
|
+
= link_to article.url do
|
16
|
+
span.title= article.title
|
17
|
+
span.tags= article.tags.join ', '
|
@@ -1,7 +1,7 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'middleman-core'
|
2
|
+
require 'middleman-blog-similar/version'
|
3
3
|
|
4
4
|
::Middleman::Extensions.register(:similar) do
|
5
|
-
require
|
5
|
+
require 'middleman-blog-similar/extension'
|
6
6
|
::Middleman::Blog::SimilarExtension
|
7
7
|
end
|
@@ -1,23 +1,11 @@
|
|
1
|
-
module Middleman
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
module Middleman
  module Blog
    module Similar
      # Mixin that gives a sitemap resource a +similar_articles+ lookup.
      #
      # The host object must respond to +locals+ and return a hash-like
      # object; the similarity database is read from its +:similar_db+ entry.
      module BlogArticleExtensions
        # Looks up the resources similar to this one.
        #
        # @return [Array] whatever +find_similar+ returns for this resource,
        #   or an empty array when no database has been attached to the
        #   resource's locals.
        def similar_articles
          db = locals[:similar_db]
          # A resource that never had the database injected has nothing to
          # compare against; answer "no similar articles" instead of raising
          # NoMethodError on nil.
          db ? db.find_similar(self) : []
        end
      end
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'sqlite3'
|
2
|
+
require 'active_record'
|
3
|
+
require 'middleman-blog-similar/models/article'
|
4
|
+
require 'middleman-blog-similar/models/tag'
|
5
|
+
require 'middleman-blog-similar/models/tagging'
|
6
|
+
require 'middleman-blog-similar/models/migration'
|
7
|
+
|
8
|
+
module Middleman
  module Blog
    module Similar
      # SQLite-backed index of blog articles and the tags produced for them.
      #
      # Articles are keyed by their +page_id+; a SHA1 digest of the source
      # file is stored so unchanged articles are not re-tagged.
      class Database
        # @return [Array<Array>] the +[weight, callable]+ pairs given to
        #   {#initialize}.
        attr_reader :taggers

        # Connects ActiveRecord to the SQLite database and applies the schema.
        #
        # @param path [String] database path handed to the sqlite3 adapter
        # @param taggers [Array<Array>] pairs of +[weight, callable]+; the
        #   callable receives a resource and returns tag names
        def initialize(path, taggers)
          ActiveRecord::Base.establish_connection(
            adapter: 'sqlite3',
            database: path
          )
          Migration.apply
          @taggers = taggers
          @id_map = {}
        end

        # Indexes every blog article in +resources+ and rebuilds the
        # page_id => resource map used by {#find_similar}.
        #
        # Rows for articles that are no longer present are removed, but only
        # when at least one article was seen (an empty pass leaves the stored
        # rows untouched, as before).
        #
        # @param resources [Enumerable] sitemap resources; anything that is
        #   not a Middleman::Blog::BlogArticle is skipped
        def store_articles(resources)
          @id_map = {}
          ActiveRecord::Base.transaction do
            ids = []
            resources.each do |res|
              next unless res.is_a?(Middleman::Blog::BlogArticle)
              execute_article res
              ids << res.page_id
              @id_map[res.page_id.to_s] = res
            end
            purge_articles_except ids unless ids.empty?
          end
        end

        # Stores or refreshes the tags of a single article.
        #
        # Nothing is written when a row with the same page_id and source
        # digest already exists.
        #
        # @param resource [#source_file, #page_id] the article resource
        # @return the resource's page_id
        def execute_article(resource)
          source_file = resource.source_file
          page_id = resource.page_id
          digest = ::Digest::SHA1.file(source_file).hexdigest
          return page_id if Article.exists?(digest: digest, page_id: page_id)
          article = Article.find_or_create_by(page_id: page_id)
          new_tagging_ids = []
          @taggers.each do |weight, tagger|
            tagger.call(resource).map(&:downcase).each do |tag_name|
              tag = Tag.find_or_create_by name: tag_name
              tagging = Tagging.find_or_create_by tag_id: tag.id, article_id: article.id
              tagging.weight = weight
              tagging.save!
              new_tagging_ids << tagging.id
            end
          end
          # Always drop taggings that were not just (re)created. Skipping this
          # when the new tag list is empty would leave an article that lost
          # all its tags permanently matched on its old ones.
          article.taggings.where.not(id: new_tagging_ids).delete_all
          article.update! digest: digest
          page_id
        end

        # Finds the resources similar to +article+.
        #
        # @param article [#page_id] the resource to compare against
        # @return [Array] resources from the last {#store_articles} pass,
        #   in the order given by Article#similar_article_page_ids; page ids
        #   that are not in the current map are omitted
        def find_similar(article)
          record = Article.find_by(page_id: article.page_id)
          return [] unless record
          similar = record.similar_article_page_ids.map do |page_id|
            @id_map[page_id]
          end
          # Rows may outlive the resources they were built from (e.g. a
          # database kept between builds); never hand nil to templates.
          similar.compact
        end

        private

        # Deletes article rows whose page_id is not in +page_ids+, together
        # with their taggings so no tagging points at a missing article.
        def purge_articles_except(page_ids)
          stale = Article.where.not(page_id: page_ids)
          Tagging.where(article_id: stale.select(:id)).delete_all
          stale.delete_all
        end
      end
    end
  end
end
|
@@ -1,32 +1,54 @@
|
|
1
1
|
require 'middleman-blog-similar/blog_article_extensions'
|
2
2
|
require 'middleman-blog-similar/helpers'
|
3
|
-
require 'middleman-blog-similar/
|
3
|
+
require 'middleman-blog-similar/resource_list_manipulator'
|
4
|
+
require 'middleman-blog-similar/database'
|
4
5
|
|
5
6
|
module Middleman
  module Blog
    # Middleman extension that wires the similarity database into the sitemap.
    #
    # Options:
    # * +tagger+ - which tagger(s) produce the tags of an article. Accepted
    #   forms, as handled in {#after_configuration}:
    #   * a String/Symbol naming a bundled tagger (weight 1),
    #   * a Hash of +name => weight+ or +key => [weight, tagger]+,
    #   * any object responding to +call+ (weight 1).
    # * +db+ - SQLite database path, resolved against the app root unless it
    #   is +':memory:'+.
    class SimilarExtension < ::Middleman::Extension
      option :tagger, :tags, 'Article tagger'
      option :db, '.similar.db', 'SQLite3 Database'

      self.defined_helpers = [Middleman::Blog::Similar::Helpers]

      # Builds the tagger list, opens the database and registers the resource
      # list manipulator that keeps the database in sync with the sitemap.
      #
      # @raise [RuntimeError] when the +tagger+ option has an unsupported type
      def after_configuration
        require 'middleman-blog/blog_article'
        ::Middleman::Sitemap::Resource.send :include, Middleman::Blog::Similar::BlogArticleExtensions

        @taggers = []
        case options.tagger
        when String, Symbol
          @taggers << [1, load_tagger(options.tagger)]
        when Hash
          options.tagger.each do |name, spec|
            # An Array value carries [weight, tagger]; otherwise the key is
            # the tagger and the value is its weight.
            weight, tagger = spec.is_a?(Array) ? [spec[0], spec[1]] : [spec, name]
            @taggers << [weight, load_tagger(tagger)]
          end
        else
          # load_tagger already passes ready-made tagger objects through, so
          # accept a bare callable here as well instead of rejecting it.
          unless options.tagger.respond_to?(:call)
            raise "Invalid type for tagger option: #{options.tagger.class}"
          end
          @taggers << [1, options.tagger]
        end
        db_path = options.db
        db_path = File.expand_path(options.db, app.root) if db_path != ':memory:'
        @db = Middleman::Blog::Similar::Database.new db_path, @taggers
        @resource_list_manipulator = Middleman::Blog::Similar::ResourceListManipulator.new app, @db
        @app.sitemap.register_resource_list_manipulator :blog_similar, @resource_list_manipulator
      end

      # Resolves a tagger specification to a tagger object.
      #
      # A String/Symbol such as +'mecab'+ requires
      # +middleman-blog-similar/tagger/mecab+ and instantiates the matching
      # class under Middleman::Blog::Similar::Tagger; path segments separated
      # by +/+ become nested constants. Anything else is returned unchanged.
      #
      # @param tagger [String, Symbol, Object]
      # @return [Object] the tagger instance
      # @raise [LoadError] when the tagger file cannot be required (the
      #   failure is logged first)
      def load_tagger(tagger)
        return tagger unless tagger.is_a?(String) || tagger.is_a?(Symbol)
        require "middleman-blog-similar/tagger/#{tagger}"
        ns = ::Middleman::Blog::Similar::Tagger
        tagger.to_s.split('/').each do |n|
          ns = ns.const_get n.camelize
        end
        ns.new
      rescue LoadError
        app.logger.error "Requested similar tagger '#{tagger}' not found."
        # Bare raise re-raises the rescued LoadError unchanged.
        raise
      end
    end
  end
end
|
@@ -1,9 +1,15 @@
|
|
1
|
-
module Middleman
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
1
|
+
module Middleman
  module Blog
    module Similar
      # Template helpers exposed by the extension.
      module Helpers
        # Similar articles for the page being rendered.
        #
        # Relies on +is_blog_article?+ and +current_article+ being available
        # on the host object.
        #
        # @return the current article's +similar_articles+, or an empty array
        #   when the current page is not a blog article
        def similar_articles
          return [] unless is_blog_article?

          current_article.similar_articles
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Middleman
  module Blog
    module Similar
      # Row for one indexed blog article, linked to its tags through taggings.
      class Article < ActiveRecord::Base
        has_many :taggings
        has_many :tags, through: :taggings

        # Page ids of the other articles that share at least one tag with
        # this article, ordered by the SQL below (shared-tag count times
        # tagging weight, then page_id, both descending).
        #
        # @return [Array<String>] page ids; empty when this article has no
        #   tags. Taggings whose article row no longer exists are ignored.
        def similar_article_page_ids
          # Return a plain Array on every path so callers always get the
          # same type.
          return [] if tags.empty?
          # http://stackoverflow.com/a/22472153
          # +id+ is this record's own primary key, not external input.
          # NOTE(review): rtr.weight is not aggregated in the ORDER BY, so the
          # weight used per group comes from a single row of that group;
          # confirm this is intended when taggers with different weights are
          # combined.
          res = ActiveRecord::Base.connection.select_all <<-SQL
            SELECT rtr.article_id FROM taggings AS rtr
            INNER JOIN taggings rtr2
            ON (rtr2.tag_id = rtr.tag_id AND rtr2.article_id = #{id})
            LEFT JOIN
            (SELECT * FROM taggings WHERE article_id = #{id}) AS r
            ON rtr.tag_id = r.tag_id
            LEFT JOIN articles a ON a.id = rtr.article_id
            WHERE rtr.article_id != #{id}
            GROUP BY rtr.article_id
            HAVING COUNT(*) > 0
            ORDER BY COUNT(*) * rtr.weight DESC, a.page_id DESC
          SQL
          # Enumerate the result directly rather than via the deprecated
          # Result#to_hash; each row is a Hash keyed by column name.
          ids = res.map { |row| row['article_id'] }
          page_id_map = self.class.where(id: ids).pluck(:id, :page_id).to_h
          # Keep the SQL ordering; drop ids that have no article row.
          ids.map { |article_id| page_id_map[article_id] }.compact
        end
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Middleman
  module Blog
    module Similar
      # Creates the tables used by the similarity database.
      module Migration
        # Defines the schema on the current ActiveRecord connection. Each
        # table is created only when it does not exist yet, so calling this
        # against an already-populated database leaves its tables alone.
        def self.apply
          ActiveRecord::Schema.define(version: 201703240752) do # rubocop:disable Style/NumericLiterals
            conn = ActiveRecord::Base.connection

            unless conn.data_source_exists? 'articles'
              create_table :articles do |table|
                # Uniqueness belongs to the index definition; a bare
                # `unique: true` column option does not create a unique index.
                table.column :page_id, :string, index: { unique: true }
                table.column :digest, :string, index: true
              end
            end

            unless conn.data_source_exists? 'tags'
              create_table :tags do |table|
                table.column :name, :string, index: { unique: true }
              end
            end

            unless conn.data_source_exists? 'taggings'
              create_table :taggings do |table|
                table.references :article, foreign_key: true
                table.references :tag, foreign_key: true
                table.column :weight, :integer, default: 1, null: false
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Middleman
  module Blog
    module Similar
      # Sitemap resource-list manipulator that attaches the similarity
      # database to every resource and hands the list to the database.
      class ResourceListManipulator
        # +article+ is exposed for compatibility; nothing in this class
        # assigns it.
        attr_reader :article, :app, :db

        # @param app the Middleman application
        # @param db the database; must respond to +store_articles+
        def initialize(app, db)
          @app = app
          @db = db
        end

        # Adds +similar_db+ to each resource's locals, then passes the whole
        # list to the database.
        #
        # @param resources [Array] the sitemap resources
        # @return [Array] the same +resources+ object that was passed in
        def manipulate_resource_list(resources)
          resources.each do |resource|
            resource.add_metadata(locals: { similar_db: db })
          end
          db.store_articles(resources)
          resources
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'engtagger'
|
2
|
+
|
3
|
+
module Middleman
  module Blog
    module Similar
      module Tagger
        # Tagger that uses EngTagger to extract the nouns of an article body.
        class Entagger
          # @param article [#body] object whose +body+ is an HTML string
          # @return [Array<String>] the nouns found in the body once markup
          #   tags are stripped; empty when EngTagger finds nothing to tag
          def call(article)
            tgr = EngTagger.new
            plain_text = article.body.gsub(%r{</?[^>]+>}, '')
            tagged = tgr.add_tags plain_text
            nouns = tgr.get_nouns(tagged)
            # get_nouns yields nil instead of a hash when there was no text
            # to tag (e.g. a body that is empty after stripping markup);
            # treat that as "no tags" rather than calling #keys on nil.
            nouns ? nouns.keys : []
          end
        end
      end
    end
  end
end
|