jekyll-related-blog-posts 0.1.0 → 0.1.1
Sign up to get free protection for your applications and access to all features.
- checksums.yaml +4 -4
- data/lib/jekyll-related-blog-posts.rb +101 -47
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: faa98f92d223adef6735d0c20410214d48c6b08fd7ef76acd7797eccb8200588
|
4
|
+
data.tar.gz: 6a59b5733d1e62387510cdf3ea96c353a522d3ff231d6c8de059b454ca236f0e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bdd965321a679ec0d709d0424f5c70baa58541831479b4b983a3539af980a4b127314592c37d3877090eacef7137e567b613ffc504399f5e9ec7eb64100940aa
|
7
|
+
data.tar.gz: 03d96f7be1c06510090c6b085f3975195d0f0a3857d17050ba080cb977732a334b93b99aff9d3dfc3cc0c5eb336ff4a6f5fb94ada21ee3a7f09822312e78712c
|
@@ -1,62 +1,96 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'jekyll'
|
3
|
+
require 'singleton'
|
4
|
+
require 'tokenizer'
|
5
|
+
require 'yaml'
|
6
|
+
require 'liquid'
|
3
7
|
require 'fast_stemmer'
|
4
8
|
require 'stopwords'
|
5
9
|
require 'pqueue'
|
6
10
|
require 'nmatrix'
|
11
|
+
require 'nmatrix/lapacke'
|
7
12
|
|
8
|
-
module
|
13
|
+
module Amadeusz
|
9
14
|
module Jekyll
|
10
|
-
class
|
15
|
+
class RelatedPosts
|
16
|
+
include Singleton
|
17
|
+
|
11
18
|
def initialize
|
12
|
-
@
|
19
|
+
@posts = Array.new
|
13
20
|
@keywords = Array.new
|
14
|
-
@
|
21
|
+
@tokenizer = Tokenizer::WhitespaceTokenizer.new(:en)
|
15
22
|
@stopwords_filter = Stopwords::Snowball::Filter.new('en')
|
16
23
|
end
|
17
24
|
|
18
25
|
def add_post(post)
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
content: (stem(post.content) + stem(post.data['title']) + tags + categories)
|
26
|
+
post = {
|
27
|
+
url: post.url,
|
28
|
+
title: post.data['title'].dup,
|
29
|
+
content: (stem(post.content) + stem(post.data['title']))
|
24
30
|
}
|
25
|
-
@docs << doc
|
26
|
-
@keywords += doc[:content]
|
27
|
-
@tags_and_categories += tags + categories
|
28
|
-
end
|
29
31
|
|
30
|
-
|
32
|
+
@posts << post
|
33
|
+
@keywords += post[:content]
|
31
34
|
@keywords.uniq!
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
+
end
|
36
|
+
|
37
|
+
def build!(site)
|
38
|
+
conf = config(site)
|
39
|
+
@weights = keywords_weights(conf['weights'])
|
40
|
+
related = find_releated(conf['max_count'], conf['min_score'], conf['accuracy'])
|
41
|
+
template = Liquid::Template.parse(File.read(template_path(site)))
|
35
42
|
|
36
|
-
@
|
37
|
-
|
43
|
+
@posts.each do |post|
|
44
|
+
filename = File.join(site.config['destination'], post[:url])
|
45
|
+
filename = File.join(filename, 'index.html') if File.directory? filename
|
46
|
+
rendered = File.read(filename)
|
47
|
+
|
48
|
+
output = template.render('related_posts' => related[post])
|
49
|
+
|
50
|
+
rendered.gsub! '<related-posts />', output
|
51
|
+
File.write(filename, rendered)
|
38
52
|
end
|
39
53
|
end
|
40
54
|
|
41
55
|
private
|
42
56
|
|
43
|
-
def
|
44
|
-
|
57
|
+
def config(site)
|
58
|
+
builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), '_config.yml')
|
59
|
+
defaults = YAML.load_file(builtin_file)
|
60
|
+
|
61
|
+
defaults['related'].merge(site.config['related'] || {})
|
62
|
+
end
|
63
|
+
|
64
|
+
def template_path(site)
|
65
|
+
site_file = File.join(site.config['source'], site.config['layouts_dir'], 'related.html')
|
66
|
+
builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), 'related.html')
|
67
|
+
|
68
|
+
if File.exist? site_file
|
69
|
+
site_file
|
70
|
+
else
|
71
|
+
builtin_file
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
def find_releated(count = 5, min_score = -10.0, accuracy = 1.0)
|
76
|
+
dc = document_correleation(accuracy)
|
45
77
|
result = Hash.new
|
46
|
-
count = [count, @
|
78
|
+
count = [count, @posts.size].min
|
47
79
|
|
48
|
-
@
|
49
|
-
queue = PQueue.new(dc.row(index).each_with_index) do |a, b|
|
80
|
+
@posts.each_with_index do |post, index|
|
81
|
+
queue = PQueue.new(dc.row(index).each_with_index.select{|s,_| s>=min_score}) do |a, b|
|
50
82
|
a[0] > b[0]
|
51
83
|
end
|
52
84
|
|
53
|
-
result[
|
85
|
+
result[post] = []
|
54
86
|
count.times do
|
55
87
|
score, id = queue.pop
|
88
|
+
break unless score
|
56
89
|
begin
|
57
|
-
result[
|
58
|
-
score
|
59
|
-
|
90
|
+
result[post] << {
|
91
|
+
'score' => score,
|
92
|
+
'url' => @posts[id][:url],
|
93
|
+
'title' => @posts[id][:title]
|
60
94
|
}
|
61
95
|
rescue
|
62
96
|
break
|
@@ -67,8 +101,27 @@ module Jekyll
|
|
67
101
|
return result
|
68
102
|
end
|
69
103
|
|
70
|
-
def
|
71
|
-
|
104
|
+
def lsi(matrix, accuracy)
|
105
|
+
degree = (@keywords.size * accuracy - 1).floor
|
106
|
+
u, sigma, vt = matrix.transpose.gesdd
|
107
|
+
|
108
|
+
u2 = u.slice(0..degree, 0..degree)
|
109
|
+
sigma_d = NMatrix.zeros([degree+1, @posts.size])
|
110
|
+
sigma.each_with_indices do |v, i, j|
|
111
|
+
break if i > degree
|
112
|
+
sigma_d[i, i] = v
|
113
|
+
end
|
114
|
+
|
115
|
+
return u2.dot(sigma_d).dot(vt).transpose
|
116
|
+
end
|
117
|
+
|
118
|
+
def document_correleation(accuracy = 1.0)
|
119
|
+
if accuracy == 1.0
|
120
|
+
scores = tfidf
|
121
|
+
else
|
122
|
+
scores = lsi(tfidf, accuracy)
|
123
|
+
end
|
124
|
+
|
72
125
|
result = scores.dot(scores.transpose)
|
73
126
|
|
74
127
|
result.each_with_indices do |_, u, v|
|
@@ -83,11 +136,11 @@ module Jekyll
|
|
83
136
|
end
|
84
137
|
|
85
138
|
def bag_of_words
|
86
|
-
result = NMatrix.new([@
|
87
|
-
@max = NMatrix.new([@
|
139
|
+
result = NMatrix.new([@posts.size, @keywords.size], 0.0)
|
140
|
+
@max = NMatrix.new([@posts.size], 0.0)
|
88
141
|
|
89
142
|
result.each_with_indices do |_, pi, ki|
|
90
|
-
result[pi, ki] = @
|
143
|
+
result[pi, ki] = @posts[pi][:content].count(@keywords[ki])
|
91
144
|
|
92
145
|
if result[pi, ki] > @max[pi]
|
93
146
|
@max[pi] = result[pi, ki]
|
@@ -109,11 +162,15 @@ module Jekyll
|
|
109
162
|
return result
|
110
163
|
end
|
111
164
|
|
112
|
-
def
|
165
|
+
def keywords_weights(weights)
|
113
166
|
result = NMatrix.new([1, @keywords.size], 1.0)
|
114
167
|
|
115
|
-
|
116
|
-
|
168
|
+
weights.each do |word, weight|
|
169
|
+
keyword = word.to_s.stem.to_sym
|
170
|
+
|
171
|
+
next unless @keywords.include? keyword
|
172
|
+
|
173
|
+
result[0, @keywords.index(keyword)] = weight
|
117
174
|
end
|
118
175
|
|
119
176
|
return result
|
@@ -145,23 +202,20 @@ module Jekyll
|
|
145
202
|
end
|
146
203
|
|
147
204
|
def stem(data)
|
148
|
-
|
149
|
-
tokenized = data.scan(/\w+/).map(&:downcase)
|
205
|
+
tokenized = @tokenizer.tokenize(data.gsub(/[^a-z \t'_\-\n.,+]/i, '')).map(&:downcase)
|
150
206
|
filtered = @stopwords_filter.filter(tokenized)
|
151
|
-
stemmed = filtered.map(&:stem).select{|s| s.
|
207
|
+
stemmed = filtered.map(&:stem).select{|s| not s.empty?}.map(&:to_sym)
|
208
|
+
|
152
209
|
return stemmed
|
153
210
|
end
|
154
211
|
end
|
155
212
|
end
|
156
213
|
end
|
157
214
|
|
158
|
-
Jekyll::Hooks.register :
|
159
|
-
Jekyll.
|
160
|
-
|
161
|
-
site.posts.docs.each do |x|
|
162
|
-
tfidf.add_post(x)
|
163
|
-
end
|
215
|
+
Jekyll::Hooks.register :posts, :pre_render do |post|
|
216
|
+
Amadeusz::Jekyll::RelatedPosts.instance.add_post(post)
|
217
|
+
end
|
164
218
|
|
165
|
-
|
166
|
-
|
219
|
+
Jekyll::Hooks.register :site, :post_write do |site|
|
220
|
+
Amadeusz::Jekyll::RelatedPosts.instance.build! site
|
167
221
|
end
|