jekyll-related-blog-posts 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/jekyll-related-blog-posts.rb +47 -101
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: faa98f92d223adef6735d0c20410214d48c6b08fd7ef76acd7797eccb8200588
4
- data.tar.gz: 6a59b5733d1e62387510cdf3ea96c353a522d3ff231d6c8de059b454ca236f0e
3
+ metadata.gz: 3ec8e865c5b63044c782f0be8b21b91fcd97ba7ad4f49bcd76c77d4e7b560ccf
4
+ data.tar.gz: ab1be4898ff69303b4620901782851c76934eaeebefbbc1e8766b4cade444ba2
5
5
  SHA512:
6
- metadata.gz: bdd965321a679ec0d709d0424f5c70baa58541831479b4b983a3539af980a4b127314592c37d3877090eacef7137e567b613ffc504399f5e9ec7eb64100940aa
7
- data.tar.gz: 03d96f7be1c06510090c6b085f3975195d0f0a3857d17050ba080cb977732a334b93b99aff9d3dfc3cc0c5eb336ff4a6f5fb94ada21ee3a7f09822312e78712c
6
+ metadata.gz: cf21deb16f506691cbc3733a3dbd40e332f31b1d6f7365cd3349c844df9a454c07386d0b8adc6506c7361e320cb2dd87a3908fd94a57feda86cc1b75ccb501d1
7
+ data.tar.gz: e464db2de9d96407d0e7d9888db372d76427005cbeb24a3f15261c31121f802bae0a6659a742f8e776179b4daab477b06cb82e2c26347add28d77224ea31dabe
@@ -1,96 +1,62 @@
1
1
  require 'rubygems'
2
2
  require 'jekyll'
3
- require 'singleton'
4
- require 'tokenizer'
5
- require 'yaml'
6
- require 'liquid'
7
3
  require 'fast_stemmer'
8
4
  require 'stopwords'
9
5
  require 'pqueue'
10
6
  require 'nmatrix'
11
- require 'nmatrix/lapacke'
12
7
 
13
- module Amadeusz
8
+ module SangsooNam
14
9
  module Jekyll
15
- class RelatedPosts
16
- include Singleton
17
-
10
+ class TFIDFRelatedPosts
18
11
  def initialize
19
- @posts = Array.new
12
+ @docs = Array.new
20
13
  @keywords = Array.new
21
- @tokenizer = Tokenizer::WhitespaceTokenizer.new(:en)
14
+ @tags_and_categories = Array.new
22
15
  @stopwords_filter = Stopwords::Snowball::Filter.new('en')
23
16
  end
24
17
 
25
18
  def add_post(post)
26
- post = {
27
- url: post.url,
28
- title: post.data['title'].dup,
29
- content: (stem(post.content) + stem(post.data['title']))
19
+ tags = post.data['tags'].map { |e| "@tag:" + e }.map(&:to_sym)
20
+ categories = post.data['categories'].map { |e| "@category:" + e }.map(&:to_sym)
21
+ doc = {
22
+ post: post,
23
+ content: (stem(post.data['title']) + tags + categories)
30
24
  }
31
-
32
- @posts << post
33
- @keywords += post[:content]
34
- @keywords.uniq!
25
+ @docs << doc
26
+ @keywords += doc[:content]
27
+ @tags_and_categories += tags + categories
35
28
  end
36
29
 
37
- def build!(site)
38
- conf = config(site)
39
- @weights = keywords_weights(conf['weights'])
40
- related = find_releated(conf['max_count'], conf['min_score'], conf['accuracy'])
41
- template = Liquid::Template.parse(File.read(template_path(site)))
42
-
43
- @posts.each do |post|
44
- filename = File.join(site.config['destination'], post[:url])
45
- filename = File.join(filename, 'index.html') if File.directory? filename
46
- rendered = File.read(filename)
47
-
48
- output = template.render('related_posts' => related[post])
30
+ def build(site)
31
+ @keywords.uniq!
32
+ @tags_and_categories.uniq!
33
+ @weights = custom_weights(@tags_and_categories)
34
+ related = build_related_docs_with_score(site.config['related_posts_count'] || 4)
49
35
 
50
- rendered.gsub! '<related-posts />', output
51
- File.write(filename, rendered)
36
+ @docs.each do |doc|
37
+ doc[:post].instance_variable_set(:@related_posts,related[doc].map { |x| x[:post] })
52
38
  end
53
39
  end
54
40
 
55
41
  private
56
42
 
57
- def config(site)
58
- builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), '_config.yml')
59
- defaults = YAML.load_file(builtin_file)
60
-
61
- defaults['related'].merge(site.config['related'] || {})
62
- end
63
-
64
- def template_path(site)
65
- site_file = File.join(site.config['source'], site.config['layouts_dir'], 'related.html')
66
- builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), 'related.html')
67
-
68
- if File.exist? site_file
69
- site_file
70
- else
71
- builtin_file
72
- end
73
- end
74
-
75
- def find_releated(count = 5, min_score = -10.0, accuracy = 1.0)
76
- dc = document_correleation(accuracy)
43
+ def build_related_docs_with_score(count = 8)
44
+ dc = document_correleation
77
45
  result = Hash.new
78
- count = [count, @posts.size].min
46
+ count = [count, @docs.size].min
79
47
 
80
- @posts.each_with_index do |post, index|
81
- queue = PQueue.new(dc.row(index).each_with_index.select{|s,_| s>=min_score}) do |a, b|
48
+ @docs.each_with_index do |doc, index|
49
+ queue = PQueue.new(dc.row(index).each_with_index) do |a, b|
82
50
  a[0] > b[0]
83
51
  end
84
52
 
85
- result[post] = []
53
+ result[doc] = []
86
54
  count.times do
87
55
  score, id = queue.pop
88
- break unless score
89
56
  begin
90
- result[post] << {
91
- 'score' => score,
92
- 'url' => @posts[id][:url],
93
- 'title' => @posts[id][:title]
57
+ result[doc] << {
58
+ score: score,
59
+ post: @docs[id][:post]
94
60
  }
95
61
  rescue
96
62
  break
@@ -101,27 +67,8 @@ module Jekyll
101
67
  return result
102
68
  end
103
69
 
104
- def lsi(matrix, accuracy)
105
- degree = (@keywords.size * accuracy - 1).floor
106
- u, sigma, vt = matrix.transpose.gesdd
107
-
108
- u2 = u.slice(0..degree, 0..degree)
109
- sigma_d = NMatrix.zeros([degree+1, @posts.size])
110
- sigma.each_with_indices do |v, i, j|
111
- break if i > degree
112
- sigma_d[i, i] = v
113
- end
114
-
115
- return u2.dot(sigma_d).dot(vt).transpose
116
- end
117
-
118
- def document_correleation(accuracy = 1.0)
119
- if accuracy == 1.0
120
- scores = tfidf
121
- else
122
- scores = lsi(tfidf, accuracy)
123
- end
124
-
70
+ def document_correleation()
71
+ scores = tfidf
125
72
  result = scores.dot(scores.transpose)
126
73
 
127
74
  result.each_with_indices do |_, u, v|
@@ -136,11 +83,11 @@ module Jekyll
136
83
  end
137
84
 
138
85
  def bag_of_words
139
- result = NMatrix.new([@posts.size, @keywords.size], 0.0)
140
- @max = NMatrix.new([@posts.size], 0.0)
86
+ result = NMatrix.new([@docs.size, @keywords.size], 0.0)
87
+ @max = NMatrix.new([@docs.size], 0.0)
141
88
 
142
89
  result.each_with_indices do |_, pi, ki|
143
- result[pi, ki] = @posts[pi][:content].count(@keywords[ki])
90
+ result[pi, ki] = @docs[pi][:content].count(@keywords[ki])
144
91
 
145
92
  if result[pi, ki] > @max[pi]
146
93
  @max[pi] = result[pi, ki]
@@ -162,15 +109,11 @@ module Jekyll
162
109
  return result
163
110
  end
164
111
 
165
- def keywords_weights(weights)
112
+ def custom_weights(terms, weight = 8.0)
166
113
  result = NMatrix.new([1, @keywords.size], 1.0)
167
114
 
168
- weights.each do |word, weight|
169
- keyword = word.to_s.stem.to_sym
170
-
171
- next unless @keywords.include? keyword
172
-
173
- result[0, @keywords.index(keyword)] = weight
115
+ terms.each do |term|
116
+ result[0, @keywords.index(term)] = weight
174
117
  end
175
118
 
176
119
  return result
@@ -202,20 +145,23 @@ module Jekyll
202
145
  end
203
146
 
204
147
  def stem(data)
205
- tokenized = @tokenizer.tokenize(data.gsub(/[^a-z \t'_\-\n.,+]/i, '')).map(&:downcase)
148
+ data = data.gsub(/{%.+%}/, ' ') # Replace liquid templates
149
+ tokenized = data.scan(/\w+/).map(&:downcase)
206
150
  filtered = @stopwords_filter.filter(tokenized)
207
- stemmed = filtered.map(&:stem).select{|s| not s.empty?}.map(&:to_sym)
208
-
151
+ stemmed = filtered.map(&:stem).select{|s| s.length > 1}.map(&:to_sym)
209
152
  return stemmed
210
153
  end
211
154
  end
212
155
  end
213
156
  end
214
157
 
215
- Jekyll::Hooks.register :posts, :pre_render do |post|
216
- Amadeusz::Jekyll::RelatedPosts.instance.add_post(post)
217
- end
158
+ Jekyll::Hooks.register :site, :pre_render do |site|
159
+ Jekyll.logger.info("Building TFIDF index...")
160
+ tfidf = SangsooNam::Jekyll::TFIDFRelatedPosts.new
161
+ site.posts.docs.each do |x|
162
+ tfidf.add_post(x)
163
+ end
218
164
 
219
- Jekyll::Hooks.register :site, :post_write do |site|
220
- Amadeusz::Jekyll::RelatedPosts.instance.build! site
165
+ Jekyll.logger.info("Replaceing Related Posts...")
166
+ tfidf.build(site)
221
167
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jekyll-related-blog-posts
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Manpreet singh