jekyll-related-blog-posts 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/jekyll-related-blog-posts.rb +101 -47
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0c8751f960fdd690fb4a0c4c9899f9b425b9108cbf9cf5580f01e4afad33580b
4
- data.tar.gz: ffcffde09d1e6c7ce0830295429b663c9577ac5b723c5039857ed3e23a7238b1
3
+ metadata.gz: faa98f92d223adef6735d0c20410214d48c6b08fd7ef76acd7797eccb8200588
4
+ data.tar.gz: 6a59b5733d1e62387510cdf3ea96c353a522d3ff231d6c8de059b454ca236f0e
5
5
  SHA512:
6
- metadata.gz: 003dd525f02a5fd168430eb04d553070b7d77fce4433fc941b555402f95a65c9851a82abd1c190fee064d5b722d72ed668b28b81ce86f7d5e5851447fc6dfedb
7
- data.tar.gz: 8986f4ec51761d582bcbb06e5bbb651e04a6f87f04dc7cb8e10e110cea9ab0c0dd38fbdac4edaa74741b1b894a83291c5e122c12c2c06c6e3adc6f4caf96a1d2
6
+ metadata.gz: bdd965321a679ec0d709d0424f5c70baa58541831479b4b983a3539af980a4b127314592c37d3877090eacef7137e567b613ffc504399f5e9ec7eb64100940aa
7
+ data.tar.gz: 03d96f7be1c06510090c6b085f3975195d0f0a3857d17050ba080cb977732a334b93b99aff9d3dfc3cc0c5eb336ff4a6f5fb94ada21ee3a7f09822312e78712c
@@ -1,62 +1,96 @@
1
1
  require 'rubygems'
2
2
  require 'jekyll'
3
+ require 'singleton'
4
+ require 'tokenizer'
5
+ require 'yaml'
6
+ require 'liquid'
3
7
  require 'fast_stemmer'
4
8
  require 'stopwords'
5
9
  require 'pqueue'
6
10
  require 'nmatrix'
11
+ require 'nmatrix/lapacke'
7
12
 
8
- module SangsooNam
13
+ module Amadeusz
9
14
  module Jekyll
10
- class TFIDFRelatedPosts
15
+ class RelatedPosts
16
+ include Singleton
17
+
11
18
  def initialize
12
- @docs = Array.new
19
+ @posts = Array.new
13
20
  @keywords = Array.new
14
- @tags_and_categories = Array.new
21
+ @tokenizer = Tokenizer::WhitespaceTokenizer.new(:en)
15
22
  @stopwords_filter = Stopwords::Snowball::Filter.new('en')
16
23
  end
17
24
 
18
25
  def add_post(post)
19
- tags = post.data['tags'].map { |e| "@tag:" + e }.map(&:to_sym)
20
- categories = post.data['categories'].map { |e| "@category:" + e }.map(&:to_sym)
21
- doc = {
22
- post: post,
23
- content: (stem(post.content) + stem(post.data['title']) + tags + categories)
26
+ post = {
27
+ url: post.url,
28
+ title: post.data['title'].dup,
29
+ content: (stem(post.content) + stem(post.data['title']))
24
30
  }
25
- @docs << doc
26
- @keywords += doc[:content]
27
- @tags_and_categories += tags + categories
28
- end
29
31
 
30
- def build(site)
32
+ @posts << post
33
+ @keywords += post[:content]
31
34
  @keywords.uniq!
32
- @tags_and_categories.uniq!
33
- @weights = custom_weights(@tags_and_categories)
34
- related = build_related_docs_with_score(site.config['related_posts_count'] || 4)
35
+ end
36
+
37
+ def build!(site)
38
+ conf = config(site)
39
+ @weights = keywords_weights(conf['weights'])
40
+ related = find_releated(conf['max_count'], conf['min_score'], conf['accuracy'])
41
+ template = Liquid::Template.parse(File.read(template_path(site)))
35
42
 
36
- @docs.each do |doc|
37
- doc[:post].instance_variable_set(:@related_posts,related[doc].map { |x| x[:post] })
43
+ @posts.each do |post|
44
+ filename = File.join(site.config['destination'], post[:url])
45
+ filename = File.join(filename, 'index.html') if File.directory? filename
46
+ rendered = File.read(filename)
47
+
48
+ output = template.render('related_posts' => related[post])
49
+
50
+ rendered.gsub! '<related-posts />', output
51
+ File.write(filename, rendered)
38
52
  end
39
53
  end
40
54
 
41
55
  private
42
56
 
43
- def build_related_docs_with_score(count = 8)
44
- dc = document_correleation
57
+ def config(site)
58
+ builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), '_config.yml')
59
+ defaults = YAML.load_file(builtin_file)
60
+
61
+ defaults['related'].merge(site.config['related'] || {})
62
+ end
63
+
64
+ def template_path(site)
65
+ site_file = File.join(site.config['source'], site.config['layouts_dir'], 'related.html')
66
+ builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), 'related.html')
67
+
68
+ if File.exist? site_file
69
+ site_file
70
+ else
71
+ builtin_file
72
+ end
73
+ end
74
+
75
+ def find_releated(count = 5, min_score = -10.0, accuracy = 1.0)
76
+ dc = document_correleation(accuracy)
45
77
  result = Hash.new
46
- count = [count, @docs.size].min
78
+ count = [count, @posts.size].min
47
79
 
48
- @docs.each_with_index do |doc, index|
49
- queue = PQueue.new(dc.row(index).each_with_index) do |a, b|
80
+ @posts.each_with_index do |post, index|
81
+ queue = PQueue.new(dc.row(index).each_with_index.select{|s,_| s>=min_score}) do |a, b|
50
82
  a[0] > b[0]
51
83
  end
52
84
 
53
- result[doc] = []
85
+ result[post] = []
54
86
  count.times do
55
87
  score, id = queue.pop
88
+ break unless score
56
89
  begin
57
- result[doc] << {
58
- score: score,
59
- post: @docs[id][:post]
90
+ result[post] << {
91
+ 'score' => score,
92
+ 'url' => @posts[id][:url],
93
+ 'title' => @posts[id][:title]
60
94
  }
61
95
  rescue
62
96
  break
@@ -67,8 +101,27 @@ module Jekyll
67
101
  return result
68
102
  end
69
103
 
70
- def document_correleation()
71
- scores = tfidf
104
+ def lsi(matrix, accuracy)
105
+ degree = (@keywords.size * accuracy - 1).floor
106
+ u, sigma, vt = matrix.transpose.gesdd
107
+
108
+ u2 = u.slice(0..degree, 0..degree)
109
+ sigma_d = NMatrix.zeros([degree+1, @posts.size])
110
+ sigma.each_with_indices do |v, i, j|
111
+ break if i > degree
112
+ sigma_d[i, i] = v
113
+ end
114
+
115
+ return u2.dot(sigma_d).dot(vt).transpose
116
+ end
117
+
118
+ def document_correleation(accuracy = 1.0)
119
+ if accuracy == 1.0
120
+ scores = tfidf
121
+ else
122
+ scores = lsi(tfidf, accuracy)
123
+ end
124
+
72
125
  result = scores.dot(scores.transpose)
73
126
 
74
127
  result.each_with_indices do |_, u, v|
@@ -83,11 +136,11 @@ module Jekyll
83
136
  end
84
137
 
85
138
  def bag_of_words
86
- result = NMatrix.new([@docs.size, @keywords.size], 0.0)
87
- @max = NMatrix.new([@docs.size], 0.0)
139
+ result = NMatrix.new([@posts.size, @keywords.size], 0.0)
140
+ @max = NMatrix.new([@posts.size], 0.0)
88
141
 
89
142
  result.each_with_indices do |_, pi, ki|
90
- result[pi, ki] = @docs[pi][:content].count(@keywords[ki])
143
+ result[pi, ki] = @posts[pi][:content].count(@keywords[ki])
91
144
 
92
145
  if result[pi, ki] > @max[pi]
93
146
  @max[pi] = result[pi, ki]
@@ -109,11 +162,15 @@ module Jekyll
109
162
  return result
110
163
  end
111
164
 
112
- def custom_weights(terms, weight = 8.0)
165
+ def keywords_weights(weights)
113
166
  result = NMatrix.new([1, @keywords.size], 1.0)
114
167
 
115
- terms.each do |term|
116
- result[0, @keywords.index(term)] = weight
168
+ weights.each do |word, weight|
169
+ keyword = word.to_s.stem.to_sym
170
+
171
+ next unless @keywords.include? keyword
172
+
173
+ result[0, @keywords.index(keyword)] = weight
117
174
  end
118
175
 
119
176
  return result
@@ -145,23 +202,20 @@ module Jekyll
145
202
  end
146
203
 
147
204
  def stem(data)
148
- data = data.gsub(/{%.+%}/, ' ') # Replace liquid templates
149
- tokenized = data.scan(/\w+/).map(&:downcase)
205
+ tokenized = @tokenizer.tokenize(data.gsub(/[^a-z \t'_\-\n.,+]/i, '')).map(&:downcase)
150
206
  filtered = @stopwords_filter.filter(tokenized)
151
- stemmed = filtered.map(&:stem).select{|s| s.length > 1}.map(&:to_sym)
207
+ stemmed = filtered.map(&:stem).select{|s| not s.empty?}.map(&:to_sym)
208
+
152
209
  return stemmed
153
210
  end
154
211
  end
155
212
  end
156
213
  end
157
214
 
158
- Jekyll::Hooks.register :site, :pre_render do |site|
159
- Jekyll.logger.info("Building TFIDF index...")
160
- tfidf = SangsooNam::Jekyll::TFIDFRelatedPosts.new
161
- site.posts.docs.each do |x|
162
- tfidf.add_post(x)
163
- end
215
+ Jekyll::Hooks.register :posts, :pre_render do |post|
216
+ Amadeusz::Jekyll::RelatedPosts.instance.add_post(post)
217
+ end
164
218
 
165
- Jekyll.logger.info("Replaceing Related Posts...")
166
- tfidf.build(site)
219
+ Jekyll::Hooks.register :site, :post_write do |site|
220
+ Amadeusz::Jekyll::RelatedPosts.instance.build! site
167
221
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jekyll-related-blog-posts
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manpreet singh