jekyll-related-blog-posts 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/jekyll-related-blog-posts.rb +47 -101
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: faa98f92d223adef6735d0c20410214d48c6b08fd7ef76acd7797eccb8200588
4
- data.tar.gz: 6a59b5733d1e62387510cdf3ea96c353a522d3ff231d6c8de059b454ca236f0e
3
+ metadata.gz: 3ec8e865c5b63044c782f0be8b21b91fcd97ba7ad4f49bcd76c77d4e7b560ccf
4
+ data.tar.gz: ab1be4898ff69303b4620901782851c76934eaeebefbbc1e8766b4cade444ba2
5
5
  SHA512:
6
- metadata.gz: bdd965321a679ec0d709d0424f5c70baa58541831479b4b983a3539af980a4b127314592c37d3877090eacef7137e567b613ffc504399f5e9ec7eb64100940aa
7
- data.tar.gz: 03d96f7be1c06510090c6b085f3975195d0f0a3857d17050ba080cb977732a334b93b99aff9d3dfc3cc0c5eb336ff4a6f5fb94ada21ee3a7f09822312e78712c
6
+ metadata.gz: cf21deb16f506691cbc3733a3dbd40e332f31b1d6f7365cd3349c844df9a454c07386d0b8adc6506c7361e320cb2dd87a3908fd94a57feda86cc1b75ccb501d1
7
+ data.tar.gz: e464db2de9d96407d0e7d9888db372d76427005cbeb24a3f15261c31121f802bae0a6659a742f8e776179b4daab477b06cb82e2c26347add28d77224ea31dabe
data/lib/jekyll-related-blog-posts.rb CHANGED
@@ -1,96 +1,62 @@
1
1
  require 'rubygems'
2
2
  require 'jekyll'
3
- require 'singleton'
4
- require 'tokenizer'
5
- require 'yaml'
6
- require 'liquid'
7
3
  require 'fast_stemmer'
8
4
  require 'stopwords'
9
5
  require 'pqueue'
10
6
  require 'nmatrix'
11
- require 'nmatrix/lapacke'
12
7
 
13
- module Amadeusz
8
+ module SangsooNam
14
9
  module Jekyll
15
- class RelatedPosts
16
- include Singleton
17
-
10
+ class TFIDFRelatedPosts
18
11
  def initialize
19
- @posts = Array.new
12
+ @docs = Array.new
20
13
  @keywords = Array.new
21
- @tokenizer = Tokenizer::WhitespaceTokenizer.new(:en)
14
+ @tags_and_categories = Array.new
22
15
  @stopwords_filter = Stopwords::Snowball::Filter.new('en')
23
16
  end
24
17
 
25
18
  def add_post(post)
26
- post = {
27
- url: post.url,
28
- title: post.data['title'].dup,
29
- content: (stem(post.content) + stem(post.data['title']))
19
+ tags = post.data['tags'].map { |e| "@tag:" + e }.map(&:to_sym)
20
+ categories = post.data['categories'].map { |e| "@category:" + e }.map(&:to_sym)
21
+ doc = {
22
+ post: post,
23
+ content: (stem(post.data['title']) + tags + categories)
30
24
  }
31
-
32
- @posts << post
33
- @keywords += post[:content]
34
- @keywords.uniq!
25
+ @docs << doc
26
+ @keywords += doc[:content]
27
+ @tags_and_categories += tags + categories
35
28
  end
36
29
 
37
- def build!(site)
38
- conf = config(site)
39
- @weights = keywords_weights(conf['weights'])
40
- related = find_releated(conf['max_count'], conf['min_score'], conf['accuracy'])
41
- template = Liquid::Template.parse(File.read(template_path(site)))
42
-
43
- @posts.each do |post|
44
- filename = File.join(site.config['destination'], post[:url])
45
- filename = File.join(filename, 'index.html') if File.directory? filename
46
- rendered = File.read(filename)
47
-
48
- output = template.render('related_posts' => related[post])
30
+ def build(site)
31
+ @keywords.uniq!
32
+ @tags_and_categories.uniq!
33
+ @weights = custom_weights(@tags_and_categories)
34
+ related = build_related_docs_with_score(site.config['related_posts_count'] || 4)
49
35
 
50
- rendered.gsub! '<related-posts />', output
51
- File.write(filename, rendered)
36
+ @docs.each do |doc|
37
+ doc[:post].instance_variable_set(:@related_posts,related[doc].map { |x| x[:post] })
52
38
  end
53
39
  end
54
40
 
55
41
  private
56
42
 
57
- def config(site)
58
- builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), '_config.yml')
59
- defaults = YAML.load_file(builtin_file)
60
-
61
- defaults['related'].merge(site.config['related'] || {})
62
- end
63
-
64
- def template_path(site)
65
- site_file = File.join(site.config['source'], site.config['layouts_dir'], 'related.html')
66
- builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), 'related.html')
67
-
68
- if File.exist? site_file
69
- site_file
70
- else
71
- builtin_file
72
- end
73
- end
74
-
75
- def find_releated(count = 5, min_score = -10.0, accuracy = 1.0)
76
- dc = document_correleation(accuracy)
43
+ def build_related_docs_with_score(count = 8)
44
+ dc = document_correleation
77
45
  result = Hash.new
78
- count = [count, @posts.size].min
46
+ count = [count, @docs.size].min
79
47
 
80
- @posts.each_with_index do |post, index|
81
- queue = PQueue.new(dc.row(index).each_with_index.select{|s,_| s>=min_score}) do |a, b|
48
+ @docs.each_with_index do |doc, index|
49
+ queue = PQueue.new(dc.row(index).each_with_index) do |a, b|
82
50
  a[0] > b[0]
83
51
  end
84
52
 
85
- result[post] = []
53
+ result[doc] = []
86
54
  count.times do
87
55
  score, id = queue.pop
88
- break unless score
89
56
  begin
90
- result[post] << {
91
- 'score' => score,
92
- 'url' => @posts[id][:url],
93
- 'title' => @posts[id][:title]
57
+ result[doc] << {
58
+ score: score,
59
+ post: @docs[id][:post]
94
60
  }
95
61
  rescue
96
62
  break
@@ -101,27 +67,8 @@ module Jekyll
101
67
  return result
102
68
  end
103
69
 
104
- def lsi(matrix, accuracy)
105
- degree = (@keywords.size * accuracy - 1).floor
106
- u, sigma, vt = matrix.transpose.gesdd
107
-
108
- u2 = u.slice(0..degree, 0..degree)
109
- sigma_d = NMatrix.zeros([degree+1, @posts.size])
110
- sigma.each_with_indices do |v, i, j|
111
- break if i > degree
112
- sigma_d[i, i] = v
113
- end
114
-
115
- return u2.dot(sigma_d).dot(vt).transpose
116
- end
117
-
118
- def document_correleation(accuracy = 1.0)
119
- if accuracy == 1.0
120
- scores = tfidf
121
- else
122
- scores = lsi(tfidf, accuracy)
123
- end
124
-
70
+ def document_correleation()
71
+ scores = tfidf
125
72
  result = scores.dot(scores.transpose)
126
73
 
127
74
  result.each_with_indices do |_, u, v|
@@ -136,11 +83,11 @@ module Jekyll
136
83
  end
137
84
 
138
85
  def bag_of_words
139
- result = NMatrix.new([@posts.size, @keywords.size], 0.0)
140
- @max = NMatrix.new([@posts.size], 0.0)
86
+ result = NMatrix.new([@docs.size, @keywords.size], 0.0)
87
+ @max = NMatrix.new([@docs.size], 0.0)
141
88
 
142
89
  result.each_with_indices do |_, pi, ki|
143
- result[pi, ki] = @posts[pi][:content].count(@keywords[ki])
90
+ result[pi, ki] = @docs[pi][:content].count(@keywords[ki])
144
91
 
145
92
  if result[pi, ki] > @max[pi]
146
93
  @max[pi] = result[pi, ki]
@@ -162,15 +109,11 @@ module Jekyll
162
109
  return result
163
110
  end
164
111
 
165
- def keywords_weights(weights)
112
+ def custom_weights(terms, weight = 8.0)
166
113
  result = NMatrix.new([1, @keywords.size], 1.0)
167
114
 
168
- weights.each do |word, weight|
169
- keyword = word.to_s.stem.to_sym
170
-
171
- next unless @keywords.include? keyword
172
-
173
- result[0, @keywords.index(keyword)] = weight
115
+ terms.each do |term|
116
+ result[0, @keywords.index(term)] = weight
174
117
  end
175
118
 
176
119
  return result
@@ -202,20 +145,23 @@ module Jekyll
202
145
  end
203
146
 
204
147
  def stem(data)
205
- tokenized = @tokenizer.tokenize(data.gsub(/[^a-z \t'_\-\n.,+]/i, '')).map(&:downcase)
148
+ data = data.gsub(/{%.+%}/, ' ') # Replace liquid templates
149
+ tokenized = data.scan(/\w+/).map(&:downcase)
206
150
  filtered = @stopwords_filter.filter(tokenized)
207
- stemmed = filtered.map(&:stem).select{|s| not s.empty?}.map(&:to_sym)
208
-
151
+ stemmed = filtered.map(&:stem).select{|s| s.length > 1}.map(&:to_sym)
209
152
  return stemmed
210
153
  end
211
154
  end
212
155
  end
213
156
  end
214
157
 
215
- Jekyll::Hooks.register :posts, :pre_render do |post|
216
- Amadeusz::Jekyll::RelatedPosts.instance.add_post(post)
217
- end
158
+ Jekyll::Hooks.register :site, :pre_render do |site|
159
+ Jekyll.logger.info("Building TFIDF index...")
160
+ tfidf = SangsooNam::Jekyll::TFIDFRelatedPosts.new
161
+ site.posts.docs.each do |x|
162
+ tfidf.add_post(x)
163
+ end
218
164
 
219
- Jekyll::Hooks.register :site, :post_write do |site|
220
- Amadeusz::Jekyll::RelatedPosts.instance.build! site
165
+ Jekyll.logger.info("Replaceing Related Posts...")
166
+ tfidf.build(site)
221
167
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jekyll-related-blog-posts
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manpreet singh