middleman-blog-similar 1.1.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +33 -0
  5. data/.travis.yml +12 -21
  6. data/Gemfile +20 -35
  7. data/Guardfile +9 -9
  8. data/README.md +57 -82
  9. data/Rakefile +9 -7
  10. data/features/default.feature +21 -0
  11. data/features/support/env.rb +20 -10
  12. data/fixtures/test-app/source/{2014-05-08-article0.md → 2014-05-08-article0.html.md} +2 -1
  13. data/fixtures/test-app/source/{2014-05-09-article1.md → 2014-05-09-article1.html.md} +2 -0
  14. data/fixtures/test-app/source/{2014-05-10-article2.md → 2014-05-10-article2.html.md} +1 -1
  15. data/fixtures/test-app/source/{2014-05-11-article3.md → 2014-05-11-article3.html.md} +0 -0
  16. data/fixtures/test-app/source/{2014-05-12-article4.md → 2014-05-12-article4.html.md} +1 -0
  17. data/fixtures/test-app/source/{2014-05-13-article5.md → 2014-05-13-article5.html.md} +1 -0
  18. data/fixtures/test-app/source/{2014-05-14-article6.md → 2014-05-14-article6.html.md} +1 -0
  19. data/fixtures/test-app/source/index.html.slim +1 -0
  20. data/fixtures/test-app/source/layout.slim +1 -1
  21. data/fixtures/test-app/source/layouts/article.slim +6 -6
  22. data/lib/middleman-blog-similar.rb +3 -3
  23. data/lib/middleman-blog-similar/blog_article_extensions.rb +8 -20
  24. data/lib/middleman-blog-similar/database.rb +70 -0
  25. data/lib/middleman-blog-similar/extension.rb +37 -15
  26. data/lib/middleman-blog-similar/helpers.rb +12 -6
  27. data/lib/middleman-blog-similar/models/article.rb +33 -0
  28. data/lib/middleman-blog-similar/models/migration.rb +32 -0
  29. data/lib/middleman-blog-similar/models/tag.rb +10 -0
  30. data/lib/middleman-blog-similar/models/tagging.rb +10 -0
  31. data/lib/middleman-blog-similar/resource_list_manipulator.rb +19 -0
  32. data/lib/middleman-blog-similar/tagger/entagger.rb +17 -0
  33. data/lib/middleman-blog-similar/tagger/mecab.rb +19 -0
  34. data/lib/middleman-blog-similar/tagger/tags.rb +13 -0
  35. data/lib/middleman-blog-similar/version.rb +1 -1
  36. data/lib/middleman_extension.rb +0 -1
  37. data/middleman-blog-similar.gemspec +18 -13
  38. data/spec/middleman-blog-similar/extension_spec.rb +44 -1
  39. data/spec/middleman-blog-similar/tagger_spec.rb +24 -0
  40. data/spec/spec_helper.rb +33 -24
  41. metadata +77 -72
  42. data/features/damerau_levenshtein.feature +0 -20
  43. data/features/levenshtein.feature +0 -20
  44. data/features/word_frequency.feature +0 -15
  45. data/lib/middleman-blog-similar/algorithm.rb +0 -19
  46. data/lib/middleman-blog-similar/algorithm/damerau_levenshtein.rb +0 -7
  47. data/lib/middleman-blog-similar/algorithm/levenshtein.rb +0 -7
  48. data/lib/middleman-blog-similar/algorithm/unigrams.csv +0 -21089
  49. data/lib/middleman-blog-similar/algorithm/word_frequency.rb +0 -69
  50. data/lib/middleman-blog-similar/algorithm/word_frequency/mecab.rb +0 -22
  51. data/lib/middleman-blog-similar/algorithm/word_frequency/tree_tagger.rb +0 -20
  52. data/spec/helper_spec.rb +0 -4
  53. data/spec/middleman-blog-similar/algorithm/damerau_levenshtein_spec.rb +0 -42
  54. data/spec/middleman-blog-similar/algorithm/levenshtein_spec.rb +0 -42
  55. data/spec/middleman-blog-similar/algorithm/word_frequency/mecab_spec.rb +0 -41
  56. data/spec/middleman-blog-similar/algorithm/word_frequency/tree_tagger_spec.rb +0 -52
  57. data/spec/middleman-blog-similar/algorithm/word_frequency_spec.rb +0 -73
  58. data/spec/middleman-blog-similar/algorithm_spec.rb +0 -40
@@ -1,7 +1,8 @@
1
1
  ---
2
2
  title: Article 0
3
3
  date: 2014-05-08 07:00
4
- tags: dog
4
+ tags: dog, cat, brown
5
+ category: test
5
6
  ---
6
7
 
7
8
  content
@@ -1,6 +1,8 @@
1
1
  ---
2
2
  title: Article 1
3
3
  date: 2014-05-09 07:00
4
+ tags: dog, cat
5
+ category: test
4
6
  ---
5
7
 
6
8
  content 1
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  title: Article 2
3
3
  date: 2014-05-10 07:00
4
- tags: Quick, Fox
4
+ tags: quick, Fox
5
5
  ---
6
6
 
7
7
  The quick brown fox jumps over the lazy dog dog dog brown
@@ -1,6 +1,7 @@
1
1
  ---
2
2
  title: Article 4
3
3
  date: 2014-05-12 07:00
4
+ tags: dog, cat, fox
4
5
  ---
5
6
 
6
7
  content test 4
@@ -1,6 +1,7 @@
1
1
  ---
2
2
  title: Article 5
3
3
  date: 2014-05-13 07:00
4
+ tags: dog
4
5
  ---
5
6
 
6
7
  content!
@@ -1,6 +1,7 @@
1
1
  ---
2
2
  title: Article 6
3
3
  date: 2014-05-14 07:00
4
+ tags: dog, Brown, cat
4
5
  ---
5
6
 
6
7
  contents tests 6
@@ -1,6 +1,7 @@
1
1
  ---
2
2
  description: My Description
3
3
  title: My Title
4
+ tags: dog, cat, quick, brown
4
5
  ---
5
6
 
6
7
  h1 Hello index
@@ -2,6 +2,6 @@ html
2
2
  head
3
3
  meta charset="utf-8"
4
4
  title= current_resource.data.title
5
- body
5
+ body data-similar-article-count=similar_articles.count
6
6
  .container
7
7
  = yield
@@ -2,16 +2,16 @@ html
2
2
  head
3
3
  meta charset="utf-8"
4
4
  title= current_article.title
5
- body
5
+ body data-similar-article-count=similar_articles.count
6
6
  .container
7
7
  h1= current_article.title
8
+ p.tags= current_article.tags.join ', '
8
9
  = yield
9
10
 
10
11
  h2 Similar Entries
11
12
  ul
12
- - similar_articles.first(5).each_with_index do|article, index|
13
+ - similar_articles.each_with_index do|article, index|
13
14
  li class="a#{index}"
14
- = link_to article.title, article.url
15
-
16
- blockquote.algorithm
17
- = current_article.app.similarity_algorithm
15
+ = link_to article.url do
16
+ span.title= article.title
17
+ span.tags= article.tags.join ', '
@@ -1,7 +1,7 @@
1
- require "middleman-core"
2
- require "middleman-blog-similar/version"
1
+ require 'middleman-core'
2
+ require 'middleman-blog-similar/version'
3
3
 
4
4
  ::Middleman::Extensions.register(:similar) do
5
- require "middleman-blog-similar/extension"
5
+ require 'middleman-blog-similar/extension'
6
6
  ::Middleman::Blog::SimilarExtension
7
7
  end
@@ -1,23 +1,11 @@
1
- module Middleman::Blog::Similar::BlogArticleExtensions
2
- def similar_articles
3
- if !@similar_articles && similarity_algorithm
4
- @similar_articles = similarity_algorithm.similar_articles
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ module BlogArticleExtensions
5
+ def similar_articles
6
+ locals[:similar_db].find_similar(self)
7
+ end
8
+ end
5
9
  end
6
- @similar_articles || []
7
- end
8
- def words
9
- unless @words && similarity_algorithm
10
- @words = similarity_algorithm.words
11
- end
12
- @words
13
- end
14
- def similarity_algorithm
15
- if !@similarity_algorithm && (algorithm = app.similarity_algorithm)
16
- @similarity_algorithm = algorithm.new self
17
- end
18
- @similarity_algorithm
19
- end
20
- def untagged_body
21
- body.gsub(/<[^>]*>/ui,'')
22
10
  end
23
11
  end
@@ -0,0 +1,70 @@
1
+ require 'sqlite3'
2
+ require 'active_record'
3
+ require 'middleman-blog-similar/models/article'
4
+ require 'middleman-blog-similar/models/tag'
5
+ require 'middleman-blog-similar/models/tagging'
6
+ require 'middleman-blog-similar/models/migration'
7
+
8
+ module Middleman
9
+ module Blog
10
+ module Similar
11
+ class Database
12
+ attr_reader :taggers
13
+ def initialize(path, taggers)
14
+ ActiveRecord::Base.establish_connection(
15
+ adapter: 'sqlite3',
16
+ database: path
17
+ )
18
+ Migration.apply
19
+ @taggers = taggers
20
+ @id_map = {}
21
+ end
22
+
23
+ def store_articles(resources)
24
+ @id_map = {}
25
+ ActiveRecord::Base.transaction do
26
+ ids = []
27
+ resources.each do |res|
28
+ next unless res.is_a?(Middleman::Blog::BlogArticle)
29
+ execute_article res
30
+ ids << res.page_id
31
+ @id_map[res.page_id.to_s] = res
32
+ end
33
+ Article.where.not(page_id: ids).delete_all unless ids.empty?
34
+ end
35
+ end
36
+
37
+ def execute_article(resource)
38
+ source_file = resource.source_file
39
+ page_id = resource.page_id
40
+ digest = ::Digest::SHA1.file(source_file).hexdigest
41
+ return page_id if Article.exists?(digest: digest, page_id: page_id)
42
+ article = Article.find_or_create_by(page_id: page_id)
43
+ new_tagging_ids = []
44
+ @taggers.each do |tagger|
45
+ tagger[1].call(resource).map(&:downcase).each do |tag_name|
46
+ tag = Tag.find_or_create_by name: tag_name
47
+ tagging = Tagging.find_or_create_by tag_id: tag.id, article_id: article.id
48
+ tagging.weight = tagger[0]
49
+ tagging.save!
50
+ new_tagging_ids << tagging.id
51
+ end
52
+ end
53
+ if new_tagging_ids.any?
54
+ article.taggings.where.not(id: new_tagging_ids).delete_all
55
+ end
56
+ article.update! digest: digest
57
+ page_id
58
+ end
59
+
60
+ def find_similar(article)
61
+ article = Article.find_by(page_id: article.page_id)
62
+ return [] unless article
63
+ article.similar_article_page_ids.map do |page_id|
64
+ @id_map[page_id]
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
@@ -1,32 +1,54 @@
1
1
  require 'middleman-blog-similar/blog_article_extensions'
2
2
  require 'middleman-blog-similar/helpers'
3
- require 'middleman-blog-similar/algorithm'
3
+ require 'middleman-blog-similar/resource_list_manipulator'
4
+ require 'middleman-blog-similar/database'
4
5
 
5
6
  module Middleman
6
7
  module Blog
7
8
  class SimilarExtension < ::Middleman::Extension
9
+ option :tagger, :tags, 'Article tagger'
10
+ option :db, '.similar.db', 'SQLite3 Database'
8
11
 
9
- option :algorithm, :word_frequency, 'Similar lookup algorithm'
10
-
11
- self.defined_helpers = [ Middleman::Blog::Similar::Helpers ]
12
+ self.defined_helpers = [Middleman::Blog::Similar::Helpers]
12
13
 
13
14
  def after_configuration
14
15
  require 'middleman-blog/blog_article'
15
16
  ::Middleman::Sitemap::Resource.send :include, Middleman::Blog::Similar::BlogArticleExtensions
16
- algorithm = options[:algorithm].to_s
17
- begin
18
- require "middleman-blog-similar/algorithm/#{algorithm}"
19
- ns = ::Middleman::Blog::Similar::Algorithm
20
- algorithm.split('/').each do|n|
21
- ns = ns.const_get n.camelize
22
- end
23
- app.set :similarity_algorithm, ns
24
- rescue LoadError => e
25
- app.logger.error "Requested similar algorithm '#{algorithm}' not found."
26
- raise e
17
+
18
+ @taggers = []
19
+ case options.tagger
20
+ when String, Symbol
21
+ @taggers << [1, load_tagger(options.tagger)]
22
+ when Hash
23
+ options.tagger.each do |k, v|
24
+ if v.is_a?(Array)
25
+ k = v[1]
26
+ v = v[0]
27
+ end
28
+ @taggers << [v, load_tagger(k)]
29
+ end
30
+ else
31
+ raise "Invalid type for tagger option: #{options.tagger.class}"
27
32
  end
33
+ db_path = options.db
34
+ db_path = File.expand_path(options.db, app.root) if db_path != ':memory:'
35
+ @db = Middleman::Blog::Similar::Database.new db_path, @taggers
36
+ @resource_list_manipulator = Middleman::Blog::Similar::ResourceListManipulator.new app, @db
37
+ @app.sitemap.register_resource_list_manipulator :blog_similar, @resource_list_manipulator
28
38
  end
29
39
 
40
+ def load_tagger(tagger)
41
+ return tagger unless tagger.is_a?(String) || tagger.is_a?(Symbol)
42
+ require "middleman-blog-similar/tagger/#{tagger}"
43
+ ns = ::Middleman::Blog::Similar::Tagger
44
+ tagger.to_s.split('/').each do |n|
45
+ ns = ns.const_get n.camelize
46
+ end
47
+ ns.new
48
+ rescue LoadError => e
49
+ app.logger.error "Requested similar tagger '#{tagger}' not found."
50
+ raise e
51
+ end
30
52
  end
31
53
  end
32
54
  end
@@ -1,9 +1,15 @@
1
- module Middleman::Blog::Similar::Helpers
2
- def similar_articles
3
- if is_blog_article?
4
- current_article.similar_articles
5
- else
6
- []
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ module Helpers
5
+ def similar_articles
6
+ if is_blog_article?
7
+ current_article.similar_articles
8
+ else
9
+ []
10
+ end
11
+ end
12
+ end
7
13
  end
8
14
  end
9
15
  end
@@ -0,0 +1,33 @@
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ class Article < ActiveRecord::Base
5
+ has_many :taggings
6
+ has_many :tags, through: :taggings
7
+ def similar_article_page_ids
8
+ return self.class.none if tags.empty?
9
+ # http://stackoverflow.com/a/22472153
10
+ res = ActiveRecord::Base.connection.select_all "
11
+ SELECT rtr.article_id FROM taggings AS rtr
12
+ INNER JOIN taggings rtr2
13
+ ON (rtr2.tag_id = rtr.tag_id AND rtr2.article_id = #{id})
14
+ LEFT JOIN
15
+ (SELECT * FROM taggings WHERE article_id = #{id}) AS r
16
+ ON rtr.tag_id = r.tag_id
17
+ LEFT JOIN articles a ON a.id = rtr.article_id
18
+ WHERE rtr.article_id != #{id}
19
+ GROUP BY rtr.article_id
20
+ HAVING COUNT(*) > 0
21
+ ORDER BY COUNT(*) * rtr.weight DESC, a.page_id DESC"
22
+ ids = res.to_hash.map { |h| h['article_id'] }
23
+ page_id_map = {}
24
+ articles = self.class.where(id: ids).select(:id, :page_id)
25
+ articles.each do |a|
26
+ page_id_map[a.id] = a.page_id
27
+ end
28
+ ids.map { |id| page_id_map[id] }
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,32 @@
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ module Migration
5
+ def self.apply
6
+ ActiveRecord::Schema.define(version: 201703240752) do # rubocop:disable Style/NumericLiterals
7
+ unless ActiveRecord::Base.connection.data_source_exists? 'articles'
8
+ create_table :articles do |table|
9
+ table.column :page_id, :string, index: true, unique: true
10
+ table.column :digest, :string, index: true
11
+ end
12
+ end
13
+
14
+ unless ActiveRecord::Base.connection.data_source_exists? 'tags'
15
+ create_table :tags do |table|
16
+ table.column :name, :string, index: true, unique: true
17
+ end
18
+ end
19
+
20
+ unless ActiveRecord::Base.connection.data_source_exists? 'taggings'
21
+ create_table :taggings do |table|
22
+ table.references :article, foreign_key: true
23
+ table.references :tag, foreign_key: true
24
+ table.column :weight, :integer, default: 1, null: false
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,10 @@
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ class Tag < ActiveRecord::Base
5
+ has_many :taggings
6
+ has_many :articles, through: :taggings
7
+ end
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,10 @@
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ class Tagging < ActiveRecord::Base
5
+ belongs_to :article
6
+ belongs_to :tag
7
+ end
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,19 @@
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ class ResourceListManipulator
5
+ attr_reader :article, :app, :db
6
+ def initialize(app, db)
7
+ @app = app
8
+ @db = db
9
+ end
10
+
11
+ def manipulate_resource_list(resources)
12
+ resources.each { |res| res.add_metadata locals: { similar_db: @db } }
13
+ @db.store_articles resources
14
+ resources
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,17 @@
1
+ require 'engtagger'
2
+
3
+ module Middleman
4
+ module Blog
5
+ module Similar
6
+ module Tagger
7
+ class Entagger
8
+ def call(article)
9
+ tgr = EngTagger.new
10
+ tagged = tgr.add_tags article.body.gsub(%r{</?[^>]+>}, '')
11
+ tgr.get_nouns(tagged).keys
12
+ end
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end