middleman-blog-similar 1.1.1 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +33 -0
  5. data/.travis.yml +12 -21
  6. data/Gemfile +20 -35
  7. data/Guardfile +9 -9
  8. data/README.md +57 -82
  9. data/Rakefile +9 -7
  10. data/features/default.feature +21 -0
  11. data/features/support/env.rb +20 -10
  12. data/fixtures/test-app/source/{2014-05-08-article0.md → 2014-05-08-article0.html.md} +2 -1
  13. data/fixtures/test-app/source/{2014-05-09-article1.md → 2014-05-09-article1.html.md} +2 -0
  14. data/fixtures/test-app/source/{2014-05-10-article2.md → 2014-05-10-article2.html.md} +1 -1
  15. data/fixtures/test-app/source/{2014-05-11-article3.md → 2014-05-11-article3.html.md} +0 -0
  16. data/fixtures/test-app/source/{2014-05-12-article4.md → 2014-05-12-article4.html.md} +1 -0
  17. data/fixtures/test-app/source/{2014-05-13-article5.md → 2014-05-13-article5.html.md} +1 -0
  18. data/fixtures/test-app/source/{2014-05-14-article6.md → 2014-05-14-article6.html.md} +1 -0
  19. data/fixtures/test-app/source/index.html.slim +1 -0
  20. data/fixtures/test-app/source/layout.slim +1 -1
  21. data/fixtures/test-app/source/layouts/article.slim +6 -6
  22. data/lib/middleman-blog-similar.rb +3 -3
  23. data/lib/middleman-blog-similar/blog_article_extensions.rb +8 -20
  24. data/lib/middleman-blog-similar/database.rb +70 -0
  25. data/lib/middleman-blog-similar/extension.rb +37 -15
  26. data/lib/middleman-blog-similar/helpers.rb +12 -6
  27. data/lib/middleman-blog-similar/models/article.rb +33 -0
  28. data/lib/middleman-blog-similar/models/migration.rb +32 -0
  29. data/lib/middleman-blog-similar/models/tag.rb +10 -0
  30. data/lib/middleman-blog-similar/models/tagging.rb +10 -0
  31. data/lib/middleman-blog-similar/resource_list_manipulator.rb +19 -0
  32. data/lib/middleman-blog-similar/tagger/entagger.rb +17 -0
  33. data/lib/middleman-blog-similar/tagger/mecab.rb +19 -0
  34. data/lib/middleman-blog-similar/tagger/tags.rb +13 -0
  35. data/lib/middleman-blog-similar/version.rb +1 -1
  36. data/lib/middleman_extension.rb +0 -1
  37. data/middleman-blog-similar.gemspec +18 -13
  38. data/spec/middleman-blog-similar/extension_spec.rb +44 -1
  39. data/spec/middleman-blog-similar/tagger_spec.rb +24 -0
  40. data/spec/spec_helper.rb +33 -24
  41. metadata +77 -72
  42. data/features/damerau_levenshtein.feature +0 -20
  43. data/features/levenshtein.feature +0 -20
  44. data/features/word_frequency.feature +0 -15
  45. data/lib/middleman-blog-similar/algorithm.rb +0 -19
  46. data/lib/middleman-blog-similar/algorithm/damerau_levenshtein.rb +0 -7
  47. data/lib/middleman-blog-similar/algorithm/levenshtein.rb +0 -7
  48. data/lib/middleman-blog-similar/algorithm/unigrams.csv +0 -21089
  49. data/lib/middleman-blog-similar/algorithm/word_frequency.rb +0 -69
  50. data/lib/middleman-blog-similar/algorithm/word_frequency/mecab.rb +0 -22
  51. data/lib/middleman-blog-similar/algorithm/word_frequency/tree_tagger.rb +0 -20
  52. data/spec/helper_spec.rb +0 -4
  53. data/spec/middleman-blog-similar/algorithm/damerau_levenshtein_spec.rb +0 -42
  54. data/spec/middleman-blog-similar/algorithm/levenshtein_spec.rb +0 -42
  55. data/spec/middleman-blog-similar/algorithm/word_frequency/mecab_spec.rb +0 -41
  56. data/spec/middleman-blog-similar/algorithm/word_frequency/tree_tagger_spec.rb +0 -52
  57. data/spec/middleman-blog-similar/algorithm/word_frequency_spec.rb +0 -73
  58. data/spec/middleman-blog-similar/algorithm_spec.rb +0 -40
@@ -1,7 +1,8 @@
1
1
  ---
2
2
  title: Article 0
3
3
  date: 2014-05-08 07:00
4
- tags: dog
4
+ tags: dog, cat, brown
5
+ category: test
5
6
  ---
6
7
 
7
8
  content
@@ -1,6 +1,8 @@
1
1
  ---
2
2
  title: Article 1
3
3
  date: 2014-05-09 07:00
4
+ tags: dog, cat
5
+ category: test
4
6
  ---
5
7
 
6
8
  content 1
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  title: Article 2
3
3
  date: 2014-05-10 07:00
4
- tags: Quick, Fox
4
+ tags: quick, Fox
5
5
  ---
6
6
 
7
7
  The quick brown fox jumps over the lazy dog dog dog brown
@@ -1,6 +1,7 @@
1
1
  ---
2
2
  title: Article 4
3
3
  date: 2014-05-12 07:00
4
+ tags: dog, cat, fox
4
5
  ---
5
6
 
6
7
  content test 4
@@ -1,6 +1,7 @@
1
1
  ---
2
2
  title: Article 5
3
3
  date: 2014-05-13 07:00
4
+ tags: dog
4
5
  ---
5
6
 
6
7
  content!
@@ -1,6 +1,7 @@
1
1
  ---
2
2
  title: Article 6
3
3
  date: 2014-05-14 07:00
4
+ tags: dog, Brown, cat
4
5
  ---
5
6
 
6
7
  contents tests 6
@@ -1,6 +1,7 @@
1
1
  ---
2
2
  description: My Description
3
3
  title: My Title
4
+ tags: dog, cat, quick, brown
4
5
  ---
5
6
 
6
7
  h1 Hello index
@@ -2,6 +2,6 @@ html
2
2
  head
3
3
  meta charset="utf-8"
4
4
  title= current_resource.data.title
5
- body
5
+ body data-similar-article-count=similar_articles.count
6
6
  .container
7
7
  = yield
@@ -2,16 +2,16 @@ html
2
2
  head
3
3
  meta charset="utf-8"
4
4
  title= current_article.title
5
- body
5
+ body data-similar-article-count=similar_articles.count
6
6
  .container
7
7
  h1= current_article.title
8
+ p.tags= current_article.tags.join ', '
8
9
  = yield
9
10
 
10
11
  h2 Similar Entries
11
12
  ul
12
- - similar_articles.first(5).each_with_index do|article, index|
13
+ - similar_articles.each_with_index do|article, index|
13
14
  li class="a#{index}"
14
- = link_to article.title, article.url
15
-
16
- blockquote.algorithm
17
- = current_article.app.similarity_algorithm
15
+ = link_to article.url do
16
+ span.title= article.title
17
+ span.tags= article.tags.join ', '
@@ -1,7 +1,7 @@
1
- require "middleman-core"
2
- require "middleman-blog-similar/version"
1
+ require 'middleman-core'
2
+ require 'middleman-blog-similar/version'
3
3
 
4
4
  ::Middleman::Extensions.register(:similar) do
5
- require "middleman-blog-similar/extension"
5
+ require 'middleman-blog-similar/extension'
6
6
  ::Middleman::Blog::SimilarExtension
7
7
  end
@@ -1,23 +1,11 @@
1
- module Middleman::Blog::Similar::BlogArticleExtensions
2
- def similar_articles
3
- if !@similar_articles && similarity_algorithm
4
- @similar_articles = similarity_algorithm.similar_articles
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ module BlogArticleExtensions
5
+ def similar_articles
6
+ locals[:similar_db].find_similar(self)
7
+ end
8
+ end
5
9
  end
6
- @similar_articles || []
7
- end
8
- def words
9
- unless @words && similarity_algorithm
10
- @words = similarity_algorithm.words
11
- end
12
- @words
13
- end
14
- def similarity_algorithm
15
- if !@similarity_algorithm && (algorithm = app.similarity_algorithm)
16
- @similarity_algorithm = algorithm.new self
17
- end
18
- @similarity_algorithm
19
- end
20
- def untagged_body
21
- body.gsub(/<[^>]*>/ui,'')
22
10
  end
23
11
  end
@@ -0,0 +1,70 @@
1
+ require 'sqlite3'
2
+ require 'active_record'
3
+ require 'middleman-blog-similar/models/article'
4
+ require 'middleman-blog-similar/models/tag'
5
+ require 'middleman-blog-similar/models/tagging'
6
+ require 'middleman-blog-similar/models/migration'
7
+
8
+ module Middleman
9
+ module Blog
10
+ module Similar
11
+ class Database
12
+ attr_reader :taggers
13
+ def initialize(path, taggers)
14
+ ActiveRecord::Base.establish_connection(
15
+ adapter: 'sqlite3',
16
+ database: path
17
+ )
18
+ Migration.apply
19
+ @taggers = taggers
20
+ @id_map = {}
21
+ end
22
+
23
+ def store_articles(resources)
24
+ @id_map = {}
25
+ ActiveRecord::Base.transaction do
26
+ ids = []
27
+ resources.each do |res|
28
+ next unless res.is_a?(Middleman::Blog::BlogArticle)
29
+ execute_article res
30
+ ids << res.page_id
31
+ @id_map[res.page_id.to_s] = res
32
+ end
33
+ Article.where.not(page_id: ids).delete_all unless ids.empty?
34
+ end
35
+ end
36
+
37
+ def execute_article(resource)
38
+ source_file = resource.source_file
39
+ page_id = resource.page_id
40
+ digest = ::Digest::SHA1.file(source_file).hexdigest
41
+ return page_id if Article.exists?(digest: digest, page_id: page_id)
42
+ article = Article.find_or_create_by(page_id: page_id)
43
+ new_tagging_ids = []
44
+ @taggers.each do |tagger|
45
+ tagger[1].call(resource).map(&:downcase).each do |tag_name|
46
+ tag = Tag.find_or_create_by name: tag_name
47
+ tagging = Tagging.find_or_create_by tag_id: tag.id, article_id: article.id
48
+ tagging.weight = tagger[0]
49
+ tagging.save!
50
+ new_tagging_ids << tagging.id
51
+ end
52
+ end
53
+ if new_tagging_ids.any?
54
+ article.taggings.where.not(id: new_tagging_ids).delete_all
55
+ end
56
+ article.update! digest: digest
57
+ page_id
58
+ end
59
+
60
+ def find_similar(article)
61
+ article = Article.find_by(page_id: article.page_id)
62
+ return [] unless article
63
+ article.similar_article_page_ids.map do |page_id|
64
+ @id_map[page_id]
65
+ end
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
@@ -1,32 +1,54 @@
1
1
  require 'middleman-blog-similar/blog_article_extensions'
2
2
  require 'middleman-blog-similar/helpers'
3
- require 'middleman-blog-similar/algorithm'
3
+ require 'middleman-blog-similar/resource_list_manipulator'
4
+ require 'middleman-blog-similar/database'
4
5
 
5
6
  module Middleman
6
7
  module Blog
7
8
  class SimilarExtension < ::Middleman::Extension
9
+ option :tagger, :tags, 'Article tagger'
10
+ option :db, '.similar.db', 'SQLite3 Database'
8
11
 
9
- option :algorithm, :word_frequency, 'Similar lookup algorithm'
10
-
11
- self.defined_helpers = [ Middleman::Blog::Similar::Helpers ]
12
+ self.defined_helpers = [Middleman::Blog::Similar::Helpers]
12
13
 
13
14
  def after_configuration
14
15
  require 'middleman-blog/blog_article'
15
16
  ::Middleman::Sitemap::Resource.send :include, Middleman::Blog::Similar::BlogArticleExtensions
16
- algorithm = options[:algorithm].to_s
17
- begin
18
- require "middleman-blog-similar/algorithm/#{algorithm}"
19
- ns = ::Middleman::Blog::Similar::Algorithm
20
- algorithm.split('/').each do|n|
21
- ns = ns.const_get n.camelize
22
- end
23
- app.set :similarity_algorithm, ns
24
- rescue LoadError => e
25
- app.logger.error "Requested similar algorithm '#{algorithm}' not found."
26
- raise e
17
+
18
+ @taggers = []
19
+ case options.tagger
20
+ when String, Symbol
21
+ @taggers << [1, load_tagger(options.tagger)]
22
+ when Hash
23
+ options.tagger.each do |k, v|
24
+ if v.is_a?(Array)
25
+ k = v[1]
26
+ v = v[0]
27
+ end
28
+ @taggers << [v, load_tagger(k)]
29
+ end
30
+ else
31
+ raise "Invalid type for tagger option: #{options.tagger.class}"
27
32
  end
33
+ db_path = options.db
34
+ db_path = File.expand_path(options.db, app.root) if db_path != ':memory:'
35
+ @db = Middleman::Blog::Similar::Database.new db_path, @taggers
36
+ @resource_list_manipulator = Middleman::Blog::Similar::ResourceListManipulator.new app, @db
37
+ @app.sitemap.register_resource_list_manipulator :blog_similar, @resource_list_manipulator
28
38
  end
29
39
 
40
+ def load_tagger(tagger)
41
+ return tagger unless tagger.is_a?(String) || tagger.is_a?(Symbol)
42
+ require "middleman-blog-similar/tagger/#{tagger}"
43
+ ns = ::Middleman::Blog::Similar::Tagger
44
+ tagger.to_s.split('/').each do |n|
45
+ ns = ns.const_get n.camelize
46
+ end
47
+ ns.new
48
+ rescue LoadError => e
49
+ app.logger.error "Requested similar tagger '#{tagger}' not found."
50
+ raise e
51
+ end
30
52
  end
31
53
  end
32
54
  end
@@ -1,9 +1,15 @@
1
- module Middleman::Blog::Similar::Helpers
2
- def similar_articles
3
- if is_blog_article?
4
- current_article.similar_articles
5
- else
6
- []
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ module Helpers
5
+ def similar_articles
6
+ if is_blog_article?
7
+ current_article.similar_articles
8
+ else
9
+ []
10
+ end
11
+ end
12
+ end
7
13
  end
8
14
  end
9
15
  end
@@ -0,0 +1,33 @@
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ class Article < ActiveRecord::Base
5
+ has_many :taggings
6
+ has_many :tags, through: :taggings
7
+ def similar_article_page_ids
8
+ return self.class.none if tags.empty?
9
+ # http://stackoverflow.com/a/22472153
10
+ res = ActiveRecord::Base.connection.select_all "
11
+ SELECT rtr.article_id FROM taggings AS rtr
12
+ INNER JOIN taggings rtr2
13
+ ON (rtr2.tag_id = rtr.tag_id AND rtr2.article_id = #{id})
14
+ LEFT JOIN
15
+ (SELECT * FROM taggings WHERE article_id = #{id}) AS r
16
+ ON rtr.tag_id = r.tag_id
17
+ LEFT JOIN articles a ON a.id = rtr.article_id
18
+ WHERE rtr.article_id != #{id}
19
+ GROUP BY rtr.article_id
20
+ HAVING COUNT(*) > 0
21
+ ORDER BY COUNT(*) * rtr.weight DESC, a.page_id DESC"
22
+ ids = res.to_hash.map { |h| h['article_id'] }
23
+ page_id_map = {}
24
+ articles = self.class.where(id: ids).select(:id, :page_id)
25
+ articles.each do |a|
26
+ page_id_map[a.id] = a.page_id
27
+ end
28
+ ids.map { |id| page_id_map[id] }
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,32 @@
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ module Migration
5
+ def self.apply
6
+ ActiveRecord::Schema.define(version: 201703240752) do # rubocop:disable Style/NumericLiterals
7
+ unless ActiveRecord::Base.connection.data_source_exists? 'articles'
8
+ create_table :articles do |table|
9
+ table.column :page_id, :string, index: true, unique: true
10
+ table.column :digest, :string, index: true
11
+ end
12
+ end
13
+
14
+ unless ActiveRecord::Base.connection.data_source_exists? 'tags'
15
+ create_table :tags do |table|
16
+ table.column :name, :string, index: true, unique: true
17
+ end
18
+ end
19
+
20
+ unless ActiveRecord::Base.connection.data_source_exists? 'taggings'
21
+ create_table :taggings do |table|
22
+ table.references :article, foreign_key: true
23
+ table.references :tag, foreign_key: true
24
+ table.column :weight, :integer, default: 1, null: false
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,10 @@
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ class Tag < ActiveRecord::Base
5
+ has_many :taggings
6
+ has_many :articles, through: :taggings
7
+ end
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,10 @@
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ class Tagging < ActiveRecord::Base
5
+ belongs_to :article
6
+ belongs_to :tag
7
+ end
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,19 @@
1
+ module Middleman
2
+ module Blog
3
+ module Similar
4
+ class ResourceListManipulator
5
+ attr_reader :article, :app, :db
6
+ def initialize(app, db)
7
+ @app = app
8
+ @db = db
9
+ end
10
+
11
+ def manipulate_resource_list(resources)
12
+ resources.each { |res| res.add_metadata locals: { similar_db: @db } }
13
+ @db.store_articles resources
14
+ resources
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,17 @@
1
+ require 'engtagger'
2
+
3
+ module Middleman
4
+ module Blog
5
+ module Similar
6
+ module Tagger
7
+ class Entagger
8
+ def call(article)
9
+ tgr = EngTagger.new
10
+ tagged = tgr.add_tags article.body.gsub(%r{</?[^>]+>}, '')
11
+ tgr.get_nouns(tagged).keys
12
+ end
13
+ end
14
+ end
15
+ end
16
+ end
17
+ end