jekyll-related-blog-posts 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/jekyll-related-blog-posts.rb +101 -47
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: faa98f92d223adef6735d0c20410214d48c6b08fd7ef76acd7797eccb8200588
|
4
|
+
data.tar.gz: 6a59b5733d1e62387510cdf3ea96c353a522d3ff231d6c8de059b454ca236f0e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bdd965321a679ec0d709d0424f5c70baa58541831479b4b983a3539af980a4b127314592c37d3877090eacef7137e567b613ffc504399f5e9ec7eb64100940aa
|
7
|
+
data.tar.gz: 03d96f7be1c06510090c6b085f3975195d0f0a3857d17050ba080cb977732a334b93b99aff9d3dfc3cc0c5eb336ff4a6f5fb94ada21ee3a7f09822312e78712c
|
data/lib/jekyll-related-blog-posts.rb
CHANGED
@@ -1,62 +1,96 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'jekyll'
|
3
|
+
require 'singleton'
|
4
|
+
require 'tokenizer'
|
5
|
+
require 'yaml'
|
6
|
+
require 'liquid'
|
3
7
|
require 'fast_stemmer'
|
4
8
|
require 'stopwords'
|
5
9
|
require 'pqueue'
|
6
10
|
require 'nmatrix'
|
11
|
+
require 'nmatrix/lapacke'
|
7
12
|
|
8
|
-
module
|
13
|
+
module Amadeusz
|
9
14
|
module Jekyll
|
10
|
-
class
|
15
|
+
class RelatedPosts
|
16
|
+
include Singleton
|
17
|
+
|
11
18
|
def initialize
|
12
|
-
@
|
19
|
+
@posts = Array.new
|
13
20
|
@keywords = Array.new
|
14
|
-
@
|
21
|
+
@tokenizer = Tokenizer::WhitespaceTokenizer.new(:en)
|
15
22
|
@stopwords_filter = Stopwords::Snowball::Filter.new('en')
|
16
23
|
end
|
17
24
|
|
18
25
|
def add_post(post)
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
content: (stem(post.content) + stem(post.data['title']) + tags + categories)
|
26
|
+
post = {
|
27
|
+
url: post.url,
|
28
|
+
title: post.data['title'].dup,
|
29
|
+
content: (stem(post.content) + stem(post.data['title']))
|
24
30
|
}
|
25
|
-
@docs << doc
|
26
|
-
@keywords += doc[:content]
|
27
|
-
@tags_and_categories += tags + categories
|
28
|
-
end
|
29
31
|
|
30
|
-
|
32
|
+
@posts << post
|
33
|
+
@keywords += post[:content]
|
31
34
|
@keywords.uniq!
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
+
end
|
36
|
+
|
37
|
+
def build!(site)
|
38
|
+
conf = config(site)
|
39
|
+
@weights = keywords_weights(conf['weights'])
|
40
|
+
related = find_releated(conf['max_count'], conf['min_score'], conf['accuracy'])
|
41
|
+
template = Liquid::Template.parse(File.read(template_path(site)))
|
35
42
|
|
36
|
-
@
|
37
|
-
|
43
|
+
@posts.each do |post|
|
44
|
+
filename = File.join(site.config['destination'], post[:url])
|
45
|
+
filename = File.join(filename, 'index.html') if File.directory? filename
|
46
|
+
rendered = File.read(filename)
|
47
|
+
|
48
|
+
output = template.render('related_posts' => related[post])
|
49
|
+
|
50
|
+
rendered.gsub! '<related-posts />', output
|
51
|
+
File.write(filename, rendered)
|
38
52
|
end
|
39
53
|
end
|
40
54
|
|
41
55
|
private
|
42
56
|
|
43
|
-
def
|
44
|
-
|
57
|
+
def config(site)
|
58
|
+
builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), '_config.yml')
|
59
|
+
defaults = YAML.load_file(builtin_file)
|
60
|
+
|
61
|
+
defaults['related'].merge(site.config['related'] || {})
|
62
|
+
end
|
63
|
+
|
64
|
+
def template_path(site)
|
65
|
+
site_file = File.join(site.config['source'], site.config['layouts_dir'], 'related.html')
|
66
|
+
builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), 'related.html')
|
67
|
+
|
68
|
+
if File.exist? site_file
|
69
|
+
site_file
|
70
|
+
else
|
71
|
+
builtin_file
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
def find_releated(count = 5, min_score = -10.0, accuracy = 1.0)
|
76
|
+
dc = document_correleation(accuracy)
|
45
77
|
result = Hash.new
|
46
|
-
count = [count, @
|
78
|
+
count = [count, @posts.size].min
|
47
79
|
|
48
|
-
@
|
49
|
-
queue = PQueue.new(dc.row(index).each_with_index) do |a, b|
|
80
|
+
@posts.each_with_index do |post, index|
|
81
|
+
queue = PQueue.new(dc.row(index).each_with_index.select{|s,_| s>=min_score}) do |a, b|
|
50
82
|
a[0] > b[0]
|
51
83
|
end
|
52
84
|
|
53
|
-
result[
|
85
|
+
result[post] = []
|
54
86
|
count.times do
|
55
87
|
score, id = queue.pop
|
88
|
+
break unless score
|
56
89
|
begin
|
57
|
-
result[
|
58
|
-
score
|
59
|
-
|
90
|
+
result[post] << {
|
91
|
+
'score' => score,
|
92
|
+
'url' => @posts[id][:url],
|
93
|
+
'title' => @posts[id][:title]
|
60
94
|
}
|
61
95
|
rescue
|
62
96
|
break
|
@@ -67,8 +101,27 @@ module Jekyll
|
|
67
101
|
return result
|
68
102
|
end
|
69
103
|
|
70
|
-
def
|
71
|
-
|
104
|
+
def lsi(matrix, accuracy)
|
105
|
+
degree = (@keywords.size * accuracy - 1).floor
|
106
|
+
u, sigma, vt = matrix.transpose.gesdd
|
107
|
+
|
108
|
+
u2 = u.slice(0..degree, 0..degree)
|
109
|
+
sigma_d = NMatrix.zeros([degree+1, @posts.size])
|
110
|
+
sigma.each_with_indices do |v, i, j|
|
111
|
+
break if i > degree
|
112
|
+
sigma_d[i, i] = v
|
113
|
+
end
|
114
|
+
|
115
|
+
return u2.dot(sigma_d).dot(vt).transpose
|
116
|
+
end
|
117
|
+
|
118
|
+
def document_correleation(accuracy = 1.0)
|
119
|
+
if accuracy == 1.0
|
120
|
+
scores = tfidf
|
121
|
+
else
|
122
|
+
scores = lsi(tfidf, accuracy)
|
123
|
+
end
|
124
|
+
|
72
125
|
result = scores.dot(scores.transpose)
|
73
126
|
|
74
127
|
result.each_with_indices do |_, u, v|
|
@@ -83,11 +136,11 @@ module Jekyll
|
|
83
136
|
end
|
84
137
|
|
85
138
|
def bag_of_words
|
86
|
-
result = NMatrix.new([@
|
87
|
-
@max = NMatrix.new([@
|
139
|
+
result = NMatrix.new([@posts.size, @keywords.size], 0.0)
|
140
|
+
@max = NMatrix.new([@posts.size], 0.0)
|
88
141
|
|
89
142
|
result.each_with_indices do |_, pi, ki|
|
90
|
-
result[pi, ki] = @
|
143
|
+
result[pi, ki] = @posts[pi][:content].count(@keywords[ki])
|
91
144
|
|
92
145
|
if result[pi, ki] > @max[pi]
|
93
146
|
@max[pi] = result[pi, ki]
|
@@ -109,11 +162,15 @@ module Jekyll
|
|
109
162
|
return result
|
110
163
|
end
|
111
164
|
|
112
|
-
def
|
165
|
+
def keywords_weights(weights)
|
113
166
|
result = NMatrix.new([1, @keywords.size], 1.0)
|
114
167
|
|
115
|
-
|
116
|
-
|
168
|
+
weights.each do |word, weight|
|
169
|
+
keyword = word.to_s.stem.to_sym
|
170
|
+
|
171
|
+
next unless @keywords.include? keyword
|
172
|
+
|
173
|
+
result[0, @keywords.index(keyword)] = weight
|
117
174
|
end
|
118
175
|
|
119
176
|
return result
|
@@ -145,23 +202,20 @@ module Jekyll
|
|
145
202
|
end
|
146
203
|
|
147
204
|
def stem(data)
|
148
|
-
|
149
|
-
tokenized = data.scan(/\w+/).map(&:downcase)
|
205
|
+
tokenized = @tokenizer.tokenize(data.gsub(/[^a-z \t'_\-\n.,+]/i, '')).map(&:downcase)
|
150
206
|
filtered = @stopwords_filter.filter(tokenized)
|
151
|
-
stemmed = filtered.map(&:stem).select{|s| s.
|
207
|
+
stemmed = filtered.map(&:stem).select{|s| not s.empty?}.map(&:to_sym)
|
208
|
+
|
152
209
|
return stemmed
|
153
210
|
end
|
154
211
|
end
|
155
212
|
end
|
156
213
|
end
|
157
214
|
|
158
|
-
Jekyll::Hooks.register :
|
159
|
-
Jekyll.
|
160
|
-
|
161
|
-
site.posts.docs.each do |x|
|
162
|
-
tfidf.add_post(x)
|
163
|
-
end
|
215
|
+
Jekyll::Hooks.register :posts, :pre_render do |post|
|
216
|
+
Amadeusz::Jekyll::RelatedPosts.instance.add_post(post)
|
217
|
+
end
|
164
218
|
|
165
|
-
|
166
|
-
|
219
|
+
Jekyll::Hooks.register :site, :post_write do |site|
|
220
|
+
Amadeusz::Jekyll::RelatedPosts.instance.build! site
|
167
221
|
end
|