jekyll-related-blog-posts 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/jekyll-related-blog-posts.rb +101 -47
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0c8751f960fdd690fb4a0c4c9899f9b425b9108cbf9cf5580f01e4afad33580b
4
- data.tar.gz: ffcffde09d1e6c7ce0830295429b663c9577ac5b723c5039857ed3e23a7238b1
3
+ metadata.gz: faa98f92d223adef6735d0c20410214d48c6b08fd7ef76acd7797eccb8200588
4
+ data.tar.gz: 6a59b5733d1e62387510cdf3ea96c353a522d3ff231d6c8de059b454ca236f0e
5
5
  SHA512:
6
- metadata.gz: 003dd525f02a5fd168430eb04d553070b7d77fce4433fc941b555402f95a65c9851a82abd1c190fee064d5b722d72ed668b28b81ce86f7d5e5851447fc6dfedb
7
- data.tar.gz: 8986f4ec51761d582bcbb06e5bbb651e04a6f87f04dc7cb8e10e110cea9ab0c0dd38fbdac4edaa74741b1b894a83291c5e122c12c2c06c6e3adc6f4caf96a1d2
6
+ metadata.gz: bdd965321a679ec0d709d0424f5c70baa58541831479b4b983a3539af980a4b127314592c37d3877090eacef7137e567b613ffc504399f5e9ec7eb64100940aa
7
+ data.tar.gz: 03d96f7be1c06510090c6b085f3975195d0f0a3857d17050ba080cb977732a334b93b99aff9d3dfc3cc0c5eb336ff4a6f5fb94ada21ee3a7f09822312e78712c
@@ -1,62 +1,96 @@
1
1
  require 'rubygems'
2
2
  require 'jekyll'
3
+ require 'singleton'
4
+ require 'tokenizer'
5
+ require 'yaml'
6
+ require 'liquid'
3
7
  require 'fast_stemmer'
4
8
  require 'stopwords'
5
9
  require 'pqueue'
6
10
  require 'nmatrix'
11
+ require 'nmatrix/lapacke'
7
12
 
8
- module SangsooNam
13
+ module Amadeusz
9
14
  module Jekyll
10
- class TFIDFRelatedPosts
15
+ class RelatedPosts
16
+ include Singleton
17
+
11
18
  def initialize
12
- @docs = Array.new
19
+ @posts = Array.new
13
20
  @keywords = Array.new
14
- @tags_and_categories = Array.new
21
+ @tokenizer = Tokenizer::WhitespaceTokenizer.new(:en)
15
22
  @stopwords_filter = Stopwords::Snowball::Filter.new('en')
16
23
  end
17
24
 
18
25
  def add_post(post)
19
- tags = post.data['tags'].map { |e| "@tag:" + e }.map(&:to_sym)
20
- categories = post.data['categories'].map { |e| "@category:" + e }.map(&:to_sym)
21
- doc = {
22
- post: post,
23
- content: (stem(post.content) + stem(post.data['title']) + tags + categories)
26
+ post = {
27
+ url: post.url,
28
+ title: post.data['title'].dup,
29
+ content: (stem(post.content) + stem(post.data['title']))
24
30
  }
25
- @docs << doc
26
- @keywords += doc[:content]
27
- @tags_and_categories += tags + categories
28
- end
29
31
 
30
- def build(site)
32
+ @posts << post
33
+ @keywords += post[:content]
31
34
  @keywords.uniq!
32
- @tags_and_categories.uniq!
33
- @weights = custom_weights(@tags_and_categories)
34
- related = build_related_docs_with_score(site.config['related_posts_count'] || 4)
35
+ end
36
+
37
+ def build!(site)
38
+ conf = config(site)
39
+ @weights = keywords_weights(conf['weights'])
40
+ related = find_releated(conf['max_count'], conf['min_score'], conf['accuracy'])
41
+ template = Liquid::Template.parse(File.read(template_path(site)))
35
42
 
36
- @docs.each do |doc|
37
- doc[:post].instance_variable_set(:@related_posts,related[doc].map { |x| x[:post] })
43
+ @posts.each do |post|
44
+ filename = File.join(site.config['destination'], post[:url])
45
+ filename = File.join(filename, 'index.html') if File.directory? filename
46
+ rendered = File.read(filename)
47
+
48
+ output = template.render('related_posts' => related[post])
49
+
50
+ rendered.gsub! '<related-posts />', output
51
+ File.write(filename, rendered)
38
52
  end
39
53
  end
40
54
 
41
55
  private
42
56
 
43
- def build_related_docs_with_score(count = 8)
44
- dc = document_correleation
57
+ def config(site)
58
+ builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), '_config.yml')
59
+ defaults = YAML.load_file(builtin_file)
60
+
61
+ defaults['related'].merge(site.config['related'] || {})
62
+ end
63
+
64
+ def template_path(site)
65
+ site_file = File.join(site.config['source'], site.config['layouts_dir'], 'related.html')
66
+ builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), 'related.html')
67
+
68
+ if File.exist? site_file
69
+ site_file
70
+ else
71
+ builtin_file
72
+ end
73
+ end
74
+
75
+ def find_releated(count = 5, min_score = -10.0, accuracy = 1.0)
76
+ dc = document_correleation(accuracy)
45
77
  result = Hash.new
46
- count = [count, @docs.size].min
78
+ count = [count, @posts.size].min
47
79
 
48
- @docs.each_with_index do |doc, index|
49
- queue = PQueue.new(dc.row(index).each_with_index) do |a, b|
80
+ @posts.each_with_index do |post, index|
81
+ queue = PQueue.new(dc.row(index).each_with_index.select{|s,_| s>=min_score}) do |a, b|
50
82
  a[0] > b[0]
51
83
  end
52
84
 
53
- result[doc] = []
85
+ result[post] = []
54
86
  count.times do
55
87
  score, id = queue.pop
88
+ break unless score
56
89
  begin
57
- result[doc] << {
58
- score: score,
59
- post: @docs[id][:post]
90
+ result[post] << {
91
+ 'score' => score,
92
+ 'url' => @posts[id][:url],
93
+ 'title' => @posts[id][:title]
60
94
  }
61
95
  rescue
62
96
  break
@@ -67,8 +101,27 @@ module Jekyll
67
101
  return result
68
102
  end
69
103
 
70
- def document_correleation()
71
- scores = tfidf
104
+ def lsi(matrix, accuracy)
105
+ degree = (@keywords.size * accuracy - 1).floor
106
+ u, sigma, vt = matrix.transpose.gesdd
107
+
108
+ u2 = u.slice(0..degree, 0..degree)
109
+ sigma_d = NMatrix.zeros([degree+1, @posts.size])
110
+ sigma.each_with_indices do |v, i, j|
111
+ break if i > degree
112
+ sigma_d[i, i] = v
113
+ end
114
+
115
+ return u2.dot(sigma_d).dot(vt).transpose
116
+ end
117
+
118
+ def document_correleation(accuracy = 1.0)
119
+ if accuracy == 1.0
120
+ scores = tfidf
121
+ else
122
+ scores = lsi(tfidf, accuracy)
123
+ end
124
+
72
125
  result = scores.dot(scores.transpose)
73
126
 
74
127
  result.each_with_indices do |_, u, v|
@@ -83,11 +136,11 @@ module Jekyll
83
136
  end
84
137
 
85
138
  def bag_of_words
86
- result = NMatrix.new([@docs.size, @keywords.size], 0.0)
87
- @max = NMatrix.new([@docs.size], 0.0)
139
+ result = NMatrix.new([@posts.size, @keywords.size], 0.0)
140
+ @max = NMatrix.new([@posts.size], 0.0)
88
141
 
89
142
  result.each_with_indices do |_, pi, ki|
90
- result[pi, ki] = @docs[pi][:content].count(@keywords[ki])
143
+ result[pi, ki] = @posts[pi][:content].count(@keywords[ki])
91
144
 
92
145
  if result[pi, ki] > @max[pi]
93
146
  @max[pi] = result[pi, ki]
@@ -109,11 +162,15 @@ module Jekyll
109
162
  return result
110
163
  end
111
164
 
112
- def custom_weights(terms, weight = 8.0)
165
+ def keywords_weights(weights)
113
166
  result = NMatrix.new([1, @keywords.size], 1.0)
114
167
 
115
- terms.each do |term|
116
- result[0, @keywords.index(term)] = weight
168
+ weights.each do |word, weight|
169
+ keyword = word.to_s.stem.to_sym
170
+
171
+ next unless @keywords.include? keyword
172
+
173
+ result[0, @keywords.index(keyword)] = weight
117
174
  end
118
175
 
119
176
  return result
@@ -145,23 +202,20 @@ module Jekyll
145
202
  end
146
203
 
147
204
  def stem(data)
148
- data = data.gsub(/{%.+%}/, ' ') # Replace liquid templates
149
- tokenized = data.scan(/\w+/).map(&:downcase)
205
+ tokenized = @tokenizer.tokenize(data.gsub(/[^a-z \t'_\-\n.,+]/i, '')).map(&:downcase)
150
206
  filtered = @stopwords_filter.filter(tokenized)
151
- stemmed = filtered.map(&:stem).select{|s| s.length > 1}.map(&:to_sym)
207
+ stemmed = filtered.map(&:stem).select{|s| not s.empty?}.map(&:to_sym)
208
+
152
209
  return stemmed
153
210
  end
154
211
  end
155
212
  end
156
213
  end
157
214
 
158
- Jekyll::Hooks.register :site, :pre_render do |site|
159
- Jekyll.logger.info("Building TFIDF index...")
160
- tfidf = SangsooNam::Jekyll::TFIDFRelatedPosts.new
161
- site.posts.docs.each do |x|
162
- tfidf.add_post(x)
163
- end
215
+ Jekyll::Hooks.register :posts, :pre_render do |post|
216
+ Amadeusz::Jekyll::RelatedPosts.instance.add_post(post)
217
+ end
164
218
 
165
- Jekyll.logger.info("Replaceing Related Posts...")
166
- tfidf.build(site)
219
+ Jekyll::Hooks.register :site, :post_write do |site|
220
+ Amadeusz::Jekyll::RelatedPosts.instance.build! site
167
221
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jekyll-related-blog-posts
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manpreet singh