middleman-blog-similar 1.1.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rspec +2 -0
- data/.rubocop.yml +33 -0
- data/.travis.yml +12 -21
- data/Gemfile +20 -35
- data/Guardfile +9 -9
- data/README.md +57 -82
- data/Rakefile +9 -7
- data/features/default.feature +21 -0
- data/features/support/env.rb +20 -10
- data/fixtures/test-app/source/{2014-05-08-article0.md → 2014-05-08-article0.html.md} +2 -1
- data/fixtures/test-app/source/{2014-05-09-article1.md → 2014-05-09-article1.html.md} +2 -0
- data/fixtures/test-app/source/{2014-05-10-article2.md → 2014-05-10-article2.html.md} +1 -1
- data/fixtures/test-app/source/{2014-05-11-article3.md → 2014-05-11-article3.html.md} +0 -0
- data/fixtures/test-app/source/{2014-05-12-article4.md → 2014-05-12-article4.html.md} +1 -0
- data/fixtures/test-app/source/{2014-05-13-article5.md → 2014-05-13-article5.html.md} +1 -0
- data/fixtures/test-app/source/{2014-05-14-article6.md → 2014-05-14-article6.html.md} +1 -0
- data/fixtures/test-app/source/index.html.slim +1 -0
- data/fixtures/test-app/source/layout.slim +1 -1
- data/fixtures/test-app/source/layouts/article.slim +6 -6
- data/lib/middleman-blog-similar.rb +3 -3
- data/lib/middleman-blog-similar/blog_article_extensions.rb +8 -20
- data/lib/middleman-blog-similar/database.rb +70 -0
- data/lib/middleman-blog-similar/extension.rb +37 -15
- data/lib/middleman-blog-similar/helpers.rb +12 -6
- data/lib/middleman-blog-similar/models/article.rb +33 -0
- data/lib/middleman-blog-similar/models/migration.rb +32 -0
- data/lib/middleman-blog-similar/models/tag.rb +10 -0
- data/lib/middleman-blog-similar/models/tagging.rb +10 -0
- data/lib/middleman-blog-similar/resource_list_manipulator.rb +19 -0
- data/lib/middleman-blog-similar/tagger/entagger.rb +17 -0
- data/lib/middleman-blog-similar/tagger/mecab.rb +19 -0
- data/lib/middleman-blog-similar/tagger/tags.rb +13 -0
- data/lib/middleman-blog-similar/version.rb +1 -1
- data/lib/middleman_extension.rb +0 -1
- data/middleman-blog-similar.gemspec +18 -13
- data/spec/middleman-blog-similar/extension_spec.rb +44 -1
- data/spec/middleman-blog-similar/tagger_spec.rb +24 -0
- data/spec/spec_helper.rb +33 -24
- metadata +77 -72
- data/features/damerau_levenshtein.feature +0 -20
- data/features/levenshtein.feature +0 -20
- data/features/word_frequency.feature +0 -15
- data/lib/middleman-blog-similar/algorithm.rb +0 -19
- data/lib/middleman-blog-similar/algorithm/damerau_levenshtein.rb +0 -7
- data/lib/middleman-blog-similar/algorithm/levenshtein.rb +0 -7
- data/lib/middleman-blog-similar/algorithm/unigrams.csv +0 -21089
- data/lib/middleman-blog-similar/algorithm/word_frequency.rb +0 -69
- data/lib/middleman-blog-similar/algorithm/word_frequency/mecab.rb +0 -22
- data/lib/middleman-blog-similar/algorithm/word_frequency/tree_tagger.rb +0 -20
- data/spec/helper_spec.rb +0 -4
- data/spec/middleman-blog-similar/algorithm/damerau_levenshtein_spec.rb +0 -42
- data/spec/middleman-blog-similar/algorithm/levenshtein_spec.rb +0 -42
- data/spec/middleman-blog-similar/algorithm/word_frequency/mecab_spec.rb +0 -41
- data/spec/middleman-blog-similar/algorithm/word_frequency/tree_tagger_spec.rb +0 -52
- data/spec/middleman-blog-similar/algorithm/word_frequency_spec.rb +0 -73
- data/spec/middleman-blog-similar/algorithm_spec.rb +0 -40
File without changes
|
@@ -2,16 +2,16 @@ html
|
|
2
2
|
head
|
3
3
|
meta charset="utf-8"
|
4
4
|
title= current_article.title
|
5
|
-
body
|
5
|
+
body data-similar-article-count=similar_articles.count
|
6
6
|
.container
|
7
7
|
h1= current_article.title
|
8
|
+
p.tags= current_article.tags.join ', '
|
8
9
|
= yield
|
9
10
|
|
10
11
|
h2 Similar Entries
|
11
12
|
ul
|
12
|
-
- similar_articles.
|
13
|
+
- similar_articles.each_with_index do|article, index|
|
13
14
|
li class="a#{index}"
|
14
|
-
= link_to article.
|
15
|
-
|
16
|
-
|
17
|
-
= current_article.app.similarity_algorithm
|
15
|
+
= link_to article.url do
|
16
|
+
span.title= article.title
|
17
|
+
span.tags= article.tags.join ', '
|
@@ -1,7 +1,7 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'middleman-core'
|
2
|
+
require 'middleman-blog-similar/version'
|
3
3
|
|
4
4
|
::Middleman::Extensions.register(:similar) do
|
5
|
-
require
|
5
|
+
require 'middleman-blog-similar/extension'
|
6
6
|
::Middleman::Blog::SimilarExtension
|
7
7
|
end
|
@@ -1,23 +1,11 @@
|
|
1
|
-
module Middleman
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
module Middleman
|
2
|
+
module Blog
|
3
|
+
module Similar
|
4
|
+
module BlogArticleExtensions
|
5
|
+
def similar_articles
|
6
|
+
locals[:similar_db].find_similar(self)
|
7
|
+
end
|
8
|
+
end
|
5
9
|
end
|
6
|
-
@similar_articles || []
|
7
|
-
end
|
8
|
-
def words
|
9
|
-
unless @words && similarity_algorithm
|
10
|
-
@words = similarity_algorithm.words
|
11
|
-
end
|
12
|
-
@words
|
13
|
-
end
|
14
|
-
def similarity_algorithm
|
15
|
-
if !@similarity_algorithm && (algorithm = app.similarity_algorithm)
|
16
|
-
@similarity_algorithm = algorithm.new self
|
17
|
-
end
|
18
|
-
@similarity_algorithm
|
19
|
-
end
|
20
|
-
def untagged_body
|
21
|
-
body.gsub(/<[^>]*>/ui,'')
|
22
10
|
end
|
23
11
|
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'sqlite3'
|
2
|
+
require 'active_record'
|
3
|
+
require 'middleman-blog-similar/models/article'
|
4
|
+
require 'middleman-blog-similar/models/tag'
|
5
|
+
require 'middleman-blog-similar/models/tagging'
|
6
|
+
require 'middleman-blog-similar/models/migration'
|
7
|
+
|
8
|
+
module Middleman
|
9
|
+
module Blog
|
10
|
+
module Similar
|
11
|
+
class Database
|
12
|
+
attr_reader :taggers
|
13
|
+
def initialize(path, taggers)
|
14
|
+
ActiveRecord::Base.establish_connection(
|
15
|
+
adapter: 'sqlite3',
|
16
|
+
database: path
|
17
|
+
)
|
18
|
+
Migration.apply
|
19
|
+
@taggers = taggers
|
20
|
+
@id_map = {}
|
21
|
+
end
|
22
|
+
|
23
|
+
def store_articles(resources)
|
24
|
+
@id_map = {}
|
25
|
+
ActiveRecord::Base.transaction do
|
26
|
+
ids = []
|
27
|
+
resources.each do |res|
|
28
|
+
next unless res.is_a?(Middleman::Blog::BlogArticle)
|
29
|
+
execute_article res
|
30
|
+
ids << res.page_id
|
31
|
+
@id_map[res.page_id.to_s] = res
|
32
|
+
end
|
33
|
+
Article.where.not(page_id: ids).delete_all unless ids.empty?
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def execute_article(resource)
|
38
|
+
source_file = resource.source_file
|
39
|
+
page_id = resource.page_id
|
40
|
+
digest = ::Digest::SHA1.file(source_file).hexdigest
|
41
|
+
return page_id if Article.exists?(digest: digest, page_id: page_id)
|
42
|
+
article = Article.find_or_create_by(page_id: page_id)
|
43
|
+
new_tagging_ids = []
|
44
|
+
@taggers.each do |tagger|
|
45
|
+
tagger[1].call(resource).map(&:downcase).each do |tag_name|
|
46
|
+
tag = Tag.find_or_create_by name: tag_name
|
47
|
+
tagging = Tagging.find_or_create_by tag_id: tag.id, article_id: article.id
|
48
|
+
tagging.weight = tagger[0]
|
49
|
+
tagging.save!
|
50
|
+
new_tagging_ids << tagging.id
|
51
|
+
end
|
52
|
+
end
|
53
|
+
if new_tagging_ids.any?
|
54
|
+
article.taggings.where.not(id: new_tagging_ids).delete_all
|
55
|
+
end
|
56
|
+
article.update! digest: digest
|
57
|
+
page_id
|
58
|
+
end
|
59
|
+
|
60
|
+
def find_similar(article)
|
61
|
+
article = Article.find_by(page_id: article.page_id)
|
62
|
+
return [] unless article
|
63
|
+
article.similar_article_page_ids.map do |page_id|
|
64
|
+
@id_map[page_id]
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
@@ -1,32 +1,54 @@
|
|
1
1
|
require 'middleman-blog-similar/blog_article_extensions'
|
2
2
|
require 'middleman-blog-similar/helpers'
|
3
|
-
require 'middleman-blog-similar/
|
3
|
+
require 'middleman-blog-similar/resource_list_manipulator'
|
4
|
+
require 'middleman-blog-similar/database'
|
4
5
|
|
5
6
|
module Middleman
|
6
7
|
module Blog
|
7
8
|
class SimilarExtension < ::Middleman::Extension
|
9
|
+
option :tagger, :tags, 'Article tagger'
|
10
|
+
option :db, '.similar.db', 'SQLite3 Database'
|
8
11
|
|
9
|
-
|
10
|
-
|
11
|
-
self.defined_helpers = [ Middleman::Blog::Similar::Helpers ]
|
12
|
+
self.defined_helpers = [Middleman::Blog::Similar::Helpers]
|
12
13
|
|
13
14
|
def after_configuration
|
14
15
|
require 'middleman-blog/blog_article'
|
15
16
|
::Middleman::Sitemap::Resource.send :include, Middleman::Blog::Similar::BlogArticleExtensions
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
17
|
+
|
18
|
+
@taggers = []
|
19
|
+
case options.tagger
|
20
|
+
when String, Symbol
|
21
|
+
@taggers << [1, load_tagger(options.tagger)]
|
22
|
+
when Hash
|
23
|
+
options.tagger.each do |k, v|
|
24
|
+
if v.is_a?(Array)
|
25
|
+
k = v[1]
|
26
|
+
v = v[0]
|
27
|
+
end
|
28
|
+
@taggers << [v, load_tagger(k)]
|
29
|
+
end
|
30
|
+
else
|
31
|
+
raise "Invalid type for tagger option: #{options.tagger.class}"
|
27
32
|
end
|
33
|
+
db_path = options.db
|
34
|
+
db_path = File.expand_path(options.db, app.root) if db_path != ':memory:'
|
35
|
+
@db = Middleman::Blog::Similar::Database.new db_path, @taggers
|
36
|
+
@resource_list_manipulator = Middleman::Blog::Similar::ResourceListManipulator.new app, @db
|
37
|
+
@app.sitemap.register_resource_list_manipulator :blog_similar, @resource_list_manipulator
|
28
38
|
end
|
29
39
|
|
40
|
+
def load_tagger(tagger)
|
41
|
+
return tagger unless tagger.is_a?(String) || tagger.is_a?(Symbol)
|
42
|
+
require "middleman-blog-similar/tagger/#{tagger}"
|
43
|
+
ns = ::Middleman::Blog::Similar::Tagger
|
44
|
+
tagger.to_s.split('/').each do |n|
|
45
|
+
ns = ns.const_get n.camelize
|
46
|
+
end
|
47
|
+
ns.new
|
48
|
+
rescue LoadError => e
|
49
|
+
app.logger.error "Requested similar tagger '#{tagger}' not found."
|
50
|
+
raise e
|
51
|
+
end
|
30
52
|
end
|
31
53
|
end
|
32
54
|
end
|
@@ -1,9 +1,15 @@
|
|
1
|
-
module Middleman
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
1
|
+
module Middleman
|
2
|
+
module Blog
|
3
|
+
module Similar
|
4
|
+
module Helpers
|
5
|
+
def similar_articles
|
6
|
+
if is_blog_article?
|
7
|
+
current_article.similar_articles
|
8
|
+
else
|
9
|
+
[]
|
10
|
+
end
|
11
|
+
end
|
12
|
+
end
|
7
13
|
end
|
8
14
|
end
|
9
15
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Middleman
|
2
|
+
module Blog
|
3
|
+
module Similar
|
4
|
+
class Article < ActiveRecord::Base
|
5
|
+
has_many :taggings
|
6
|
+
has_many :tags, through: :taggings
|
7
|
+
def similar_article_page_ids
|
8
|
+
return self.class.none if tags.empty?
|
9
|
+
# http://stackoverflow.com/a/22472153
|
10
|
+
res = ActiveRecord::Base.connection.select_all "
|
11
|
+
SELECT rtr.article_id FROM taggings AS rtr
|
12
|
+
INNER JOIN taggings rtr2
|
13
|
+
ON (rtr2.tag_id = rtr.tag_id AND rtr2.article_id = #{id})
|
14
|
+
LEFT JOIN
|
15
|
+
(SELECT * FROM taggings WHERE article_id = #{id}) AS r
|
16
|
+
ON rtr.tag_id = r.tag_id
|
17
|
+
LEFT JOIN articles a ON a.id = rtr.article_id
|
18
|
+
WHERE rtr.article_id != #{id}
|
19
|
+
GROUP BY rtr.article_id
|
20
|
+
HAVING COUNT(*) > 0
|
21
|
+
ORDER BY COUNT(*) * rtr.weight DESC, a.page_id DESC"
|
22
|
+
ids = res.to_hash.map { |h| h['article_id'] }
|
23
|
+
page_id_map = {}
|
24
|
+
articles = self.class.where(id: ids).select(:id, :page_id)
|
25
|
+
articles.each do |a|
|
26
|
+
page_id_map[a.id] = a.page_id
|
27
|
+
end
|
28
|
+
ids.map { |id| page_id_map[id] }
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Middleman
|
2
|
+
module Blog
|
3
|
+
module Similar
|
4
|
+
module Migration
|
5
|
+
def self.apply
|
6
|
+
ActiveRecord::Schema.define(version: 201703240752) do # rubocop:disable Style/NumericLiterals
|
7
|
+
unless ActiveRecord::Base.connection.data_source_exists? 'articles'
|
8
|
+
create_table :articles do |table|
|
9
|
+
table.column :page_id, :string, index: true, unique: true
|
10
|
+
table.column :digest, :string, index: true
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
unless ActiveRecord::Base.connection.data_source_exists? 'tags'
|
15
|
+
create_table :tags do |table|
|
16
|
+
table.column :name, :string, index: true, unique: true
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
unless ActiveRecord::Base.connection.data_source_exists? 'taggings'
|
21
|
+
create_table :taggings do |table|
|
22
|
+
table.references :article, foreign_key: true
|
23
|
+
table.references :tag, foreign_key: true
|
24
|
+
table.column :weight, :integer, default: 1, null: false
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Middleman
|
2
|
+
module Blog
|
3
|
+
module Similar
|
4
|
+
class ResourceListManipulator
|
5
|
+
attr_reader :article, :app, :db
|
6
|
+
def initialize(app, db)
|
7
|
+
@app = app
|
8
|
+
@db = db
|
9
|
+
end
|
10
|
+
|
11
|
+
def manipulate_resource_list(resources)
|
12
|
+
resources.each { |res| res.add_metadata locals: { similar_db: @db } }
|
13
|
+
@db.store_articles resources
|
14
|
+
resources
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'engtagger'
|
2
|
+
|
3
|
+
module Middleman
|
4
|
+
module Blog
|
5
|
+
module Similar
|
6
|
+
module Tagger
|
7
|
+
class Entagger
|
8
|
+
def call(article)
|
9
|
+
tgr = EngTagger.new
|
10
|
+
tagged = tgr.add_tags article.body.gsub(%r{</?[^>]+>}, '')
|
11
|
+
tgr.get_nouns(tagged).keys
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|