jekyll-related-blog-posts 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/jekyll-related-blog-posts.rb +47 -101
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ec8e865c5b63044c782f0be8b21b91fcd97ba7ad4f49bcd76c77d4e7b560ccf
|
4
|
+
data.tar.gz: ab1be4898ff69303b4620901782851c76934eaeebefbbc1e8766b4cade444ba2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cf21deb16f506691cbc3733a3dbd40e332f31b1d6f7365cd3349c844df9a454c07386d0b8adc6506c7361e320cb2dd87a3908fd94a57feda86cc1b75ccb501d1
|
7
|
+
data.tar.gz: e464db2de9d96407d0e7d9888db372d76427005cbeb24a3f15261c31121f802bae0a6659a742f8e776179b4daab477b06cb82e2c26347add28d77224ea31dabe
|
data/lib/jekyll-related-blog-posts.rb
CHANGED
@@ -1,96 +1,62 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'jekyll'
|
3
|
-
require 'singleton'
|
4
|
-
require 'tokenizer'
|
5
|
-
require 'yaml'
|
6
|
-
require 'liquid'
|
7
3
|
require 'fast_stemmer'
|
8
4
|
require 'stopwords'
|
9
5
|
require 'pqueue'
|
10
6
|
require 'nmatrix'
|
11
|
-
require 'nmatrix/lapacke'
|
12
7
|
|
13
|
-
module
|
8
|
+
module SangsooNam
|
14
9
|
module Jekyll
|
15
|
-
class
|
16
|
-
include Singleton
|
17
|
-
|
10
|
+
class TFIDFRelatedPosts
|
18
11
|
def initialize
|
19
|
-
@
|
12
|
+
@docs = Array.new
|
20
13
|
@keywords = Array.new
|
21
|
-
@
|
14
|
+
@tags_and_categories = Array.new
|
22
15
|
@stopwords_filter = Stopwords::Snowball::Filter.new('en')
|
23
16
|
end
|
24
17
|
|
25
18
|
def add_post(post)
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
19
|
+
tags = post.data['tags'].map { |e| "@tag:" + e }.map(&:to_sym)
|
20
|
+
categories = post.data['categories'].map { |e| "@category:" + e }.map(&:to_sym)
|
21
|
+
doc = {
|
22
|
+
post: post,
|
23
|
+
content: (stem(post.data['title']) + tags + categories)
|
30
24
|
}
|
31
|
-
|
32
|
-
@
|
33
|
-
@
|
34
|
-
@keywords.uniq!
|
25
|
+
@docs << doc
|
26
|
+
@keywords += doc[:content]
|
27
|
+
@tags_and_categories += tags + categories
|
35
28
|
end
|
36
29
|
|
37
|
-
def build
|
38
|
-
|
39
|
-
@
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
@posts.each do |post|
|
44
|
-
filename = File.join(site.config['destination'], post[:url])
|
45
|
-
filename = File.join(filename, 'index.html') if File.directory? filename
|
46
|
-
rendered = File.read(filename)
|
47
|
-
|
48
|
-
output = template.render('related_posts' => related[post])
|
30
|
+
def build(site)
|
31
|
+
@keywords.uniq!
|
32
|
+
@tags_and_categories.uniq!
|
33
|
+
@weights = custom_weights(@tags_and_categories)
|
34
|
+
related = build_related_docs_with_score(site.config['related_posts_count'] || 4)
|
49
35
|
|
50
|
-
|
51
|
-
|
36
|
+
@docs.each do |doc|
|
37
|
+
doc[:post].instance_variable_set(:@related_posts,related[doc].map { |x| x[:post] })
|
52
38
|
end
|
53
39
|
end
|
54
40
|
|
55
41
|
private
|
56
42
|
|
57
|
-
def
|
58
|
-
|
59
|
-
defaults = YAML.load_file(builtin_file)
|
60
|
-
|
61
|
-
defaults['related'].merge(site.config['related'] || {})
|
62
|
-
end
|
63
|
-
|
64
|
-
def template_path(site)
|
65
|
-
site_file = File.join(site.config['source'], site.config['layouts_dir'], 'related.html')
|
66
|
-
builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), 'related.html')
|
67
|
-
|
68
|
-
if File.exist? site_file
|
69
|
-
site_file
|
70
|
-
else
|
71
|
-
builtin_file
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
def find_releated(count = 5, min_score = -10.0, accuracy = 1.0)
|
76
|
-
dc = document_correleation(accuracy)
|
43
|
+
def build_related_docs_with_score(count = 8)
|
44
|
+
dc = document_correleation
|
77
45
|
result = Hash.new
|
78
|
-
count = [count, @
|
46
|
+
count = [count, @docs.size].min
|
79
47
|
|
80
|
-
@
|
81
|
-
queue = PQueue.new(dc.row(index).each_with_index
|
48
|
+
@docs.each_with_index do |doc, index|
|
49
|
+
queue = PQueue.new(dc.row(index).each_with_index) do |a, b|
|
82
50
|
a[0] > b[0]
|
83
51
|
end
|
84
52
|
|
85
|
-
result[
|
53
|
+
result[doc] = []
|
86
54
|
count.times do
|
87
55
|
score, id = queue.pop
|
88
|
-
break unless score
|
89
56
|
begin
|
90
|
-
result[
|
91
|
-
|
92
|
-
|
93
|
-
'title' => @posts[id][:title]
|
57
|
+
result[doc] << {
|
58
|
+
score: score,
|
59
|
+
post: @docs[id][:post]
|
94
60
|
}
|
95
61
|
rescue
|
96
62
|
break
|
@@ -101,27 +67,8 @@ module Jekyll
|
|
101
67
|
return result
|
102
68
|
end
|
103
69
|
|
104
|
-
def
|
105
|
-
|
106
|
-
u, sigma, vt = matrix.transpose.gesdd
|
107
|
-
|
108
|
-
u2 = u.slice(0..degree, 0..degree)
|
109
|
-
sigma_d = NMatrix.zeros([degree+1, @posts.size])
|
110
|
-
sigma.each_with_indices do |v, i, j|
|
111
|
-
break if i > degree
|
112
|
-
sigma_d[i, i] = v
|
113
|
-
end
|
114
|
-
|
115
|
-
return u2.dot(sigma_d).dot(vt).transpose
|
116
|
-
end
|
117
|
-
|
118
|
-
def document_correleation(accuracy = 1.0)
|
119
|
-
if accuracy == 1.0
|
120
|
-
scores = tfidf
|
121
|
-
else
|
122
|
-
scores = lsi(tfidf, accuracy)
|
123
|
-
end
|
124
|
-
|
70
|
+
def document_correleation()
|
71
|
+
scores = tfidf
|
125
72
|
result = scores.dot(scores.transpose)
|
126
73
|
|
127
74
|
result.each_with_indices do |_, u, v|
|
@@ -136,11 +83,11 @@ module Jekyll
|
|
136
83
|
end
|
137
84
|
|
138
85
|
def bag_of_words
|
139
|
-
result = NMatrix.new([@
|
140
|
-
@max = NMatrix.new([@
|
86
|
+
result = NMatrix.new([@docs.size, @keywords.size], 0.0)
|
87
|
+
@max = NMatrix.new([@docs.size], 0.0)
|
141
88
|
|
142
89
|
result.each_with_indices do |_, pi, ki|
|
143
|
-
result[pi, ki] = @
|
90
|
+
result[pi, ki] = @docs[pi][:content].count(@keywords[ki])
|
144
91
|
|
145
92
|
if result[pi, ki] > @max[pi]
|
146
93
|
@max[pi] = result[pi, ki]
|
@@ -162,15 +109,11 @@ module Jekyll
|
|
162
109
|
return result
|
163
110
|
end
|
164
111
|
|
165
|
-
def
|
112
|
+
def custom_weights(terms, weight = 8.0)
|
166
113
|
result = NMatrix.new([1, @keywords.size], 1.0)
|
167
114
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
next unless @keywords.include? keyword
|
172
|
-
|
173
|
-
result[0, @keywords.index(keyword)] = weight
|
115
|
+
terms.each do |term|
|
116
|
+
result[0, @keywords.index(term)] = weight
|
174
117
|
end
|
175
118
|
|
176
119
|
return result
|
@@ -202,20 +145,23 @@ module Jekyll
|
|
202
145
|
end
|
203
146
|
|
204
147
|
def stem(data)
|
205
|
-
|
148
|
+
data = data.gsub(/{%.+%}/, ' ') # Replace liquid templates
|
149
|
+
tokenized = data.scan(/\w+/).map(&:downcase)
|
206
150
|
filtered = @stopwords_filter.filter(tokenized)
|
207
|
-
stemmed = filtered.map(&:stem).select{|s|
|
208
|
-
|
151
|
+
stemmed = filtered.map(&:stem).select{|s| s.length > 1}.map(&:to_sym)
|
209
152
|
return stemmed
|
210
153
|
end
|
211
154
|
end
|
212
155
|
end
|
213
156
|
end
|
214
157
|
|
215
|
-
Jekyll::Hooks.register :
|
216
|
-
|
217
|
-
|
158
|
+
Jekyll::Hooks.register :site, :pre_render do |site|
|
159
|
+
Jekyll.logger.info("Building TFIDF index...")
|
160
|
+
tfidf = SangsooNam::Jekyll::TFIDFRelatedPosts.new
|
161
|
+
site.posts.docs.each do |x|
|
162
|
+
tfidf.add_post(x)
|
163
|
+
end
|
218
164
|
|
219
|
-
Jekyll
|
220
|
-
|
165
|
+
Jekyll.logger.info("Replaceing Related Posts...")
|
166
|
+
tfidf.build(site)
|
221
167
|
end
|