jekyll-related-blog-posts 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/jekyll-related-blog-posts.rb +47 -101
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ec8e865c5b63044c782f0be8b21b91fcd97ba7ad4f49bcd76c77d4e7b560ccf
|
4
|
+
data.tar.gz: ab1be4898ff69303b4620901782851c76934eaeebefbbc1e8766b4cade444ba2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cf21deb16f506691cbc3733a3dbd40e332f31b1d6f7365cd3349c844df9a454c07386d0b8adc6506c7361e320cb2dd87a3908fd94a57feda86cc1b75ccb501d1
|
7
|
+
data.tar.gz: e464db2de9d96407d0e7d9888db372d76427005cbeb24a3f15261c31121f802bae0a6659a742f8e776179b4daab477b06cb82e2c26347add28d77224ea31dabe
|
@@ -1,96 +1,62 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'jekyll'
|
3
|
-
require 'singleton'
|
4
|
-
require 'tokenizer'
|
5
|
-
require 'yaml'
|
6
|
-
require 'liquid'
|
7
3
|
require 'fast_stemmer'
|
8
4
|
require 'stopwords'
|
9
5
|
require 'pqueue'
|
10
6
|
require 'nmatrix'
|
11
|
-
require 'nmatrix/lapacke'
|
12
7
|
|
13
|
-
module
|
8
|
+
module SangsooNam
|
14
9
|
module Jekyll
|
15
|
-
class
|
16
|
-
include Singleton
|
17
|
-
|
10
|
+
class TFIDFRelatedPosts
|
18
11
|
def initialize
|
19
|
-
@
|
12
|
+
@docs = Array.new
|
20
13
|
@keywords = Array.new
|
21
|
-
@
|
14
|
+
@tags_and_categories = Array.new
|
22
15
|
@stopwords_filter = Stopwords::Snowball::Filter.new('en')
|
23
16
|
end
|
24
17
|
|
25
18
|
def add_post(post)
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
19
|
+
tags = post.data['tags'].map { |e| "@tag:" + e }.map(&:to_sym)
|
20
|
+
categories = post.data['categories'].map { |e| "@category:" + e }.map(&:to_sym)
|
21
|
+
doc = {
|
22
|
+
post: post,
|
23
|
+
content: (stem(post.data['title']) + tags + categories)
|
30
24
|
}
|
31
|
-
|
32
|
-
@
|
33
|
-
@
|
34
|
-
@keywords.uniq!
|
25
|
+
@docs << doc
|
26
|
+
@keywords += doc[:content]
|
27
|
+
@tags_and_categories += tags + categories
|
35
28
|
end
|
36
29
|
|
37
|
-
def build
|
38
|
-
|
39
|
-
@
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
@posts.each do |post|
|
44
|
-
filename = File.join(site.config['destination'], post[:url])
|
45
|
-
filename = File.join(filename, 'index.html') if File.directory? filename
|
46
|
-
rendered = File.read(filename)
|
47
|
-
|
48
|
-
output = template.render('related_posts' => related[post])
|
30
|
+
def build(site)
|
31
|
+
@keywords.uniq!
|
32
|
+
@tags_and_categories.uniq!
|
33
|
+
@weights = custom_weights(@tags_and_categories)
|
34
|
+
related = build_related_docs_with_score(site.config['related_posts_count'] || 4)
|
49
35
|
|
50
|
-
|
51
|
-
|
36
|
+
@docs.each do |doc|
|
37
|
+
doc[:post].instance_variable_set(:@related_posts,related[doc].map { |x| x[:post] })
|
52
38
|
end
|
53
39
|
end
|
54
40
|
|
55
41
|
private
|
56
42
|
|
57
|
-
def
|
58
|
-
|
59
|
-
defaults = YAML.load_file(builtin_file)
|
60
|
-
|
61
|
-
defaults['related'].merge(site.config['related'] || {})
|
62
|
-
end
|
63
|
-
|
64
|
-
def template_path(site)
|
65
|
-
site_file = File.join(site.config['source'], site.config['layouts_dir'], 'related.html')
|
66
|
-
builtin_file = File.join(File.absolute_path(File.dirname(__FILE__)), 'related.html')
|
67
|
-
|
68
|
-
if File.exist? site_file
|
69
|
-
site_file
|
70
|
-
else
|
71
|
-
builtin_file
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
def find_releated(count = 5, min_score = -10.0, accuracy = 1.0)
|
76
|
-
dc = document_correleation(accuracy)
|
43
|
+
def build_related_docs_with_score(count = 8)
|
44
|
+
dc = document_correleation
|
77
45
|
result = Hash.new
|
78
|
-
count = [count, @
|
46
|
+
count = [count, @docs.size].min
|
79
47
|
|
80
|
-
@
|
81
|
-
queue = PQueue.new(dc.row(index).each_with_index
|
48
|
+
@docs.each_with_index do |doc, index|
|
49
|
+
queue = PQueue.new(dc.row(index).each_with_index) do |a, b|
|
82
50
|
a[0] > b[0]
|
83
51
|
end
|
84
52
|
|
85
|
-
result[
|
53
|
+
result[doc] = []
|
86
54
|
count.times do
|
87
55
|
score, id = queue.pop
|
88
|
-
break unless score
|
89
56
|
begin
|
90
|
-
result[
|
91
|
-
|
92
|
-
|
93
|
-
'title' => @posts[id][:title]
|
57
|
+
result[doc] << {
|
58
|
+
score: score,
|
59
|
+
post: @docs[id][:post]
|
94
60
|
}
|
95
61
|
rescue
|
96
62
|
break
|
@@ -101,27 +67,8 @@ module Jekyll
|
|
101
67
|
return result
|
102
68
|
end
|
103
69
|
|
104
|
-
def
|
105
|
-
|
106
|
-
u, sigma, vt = matrix.transpose.gesdd
|
107
|
-
|
108
|
-
u2 = u.slice(0..degree, 0..degree)
|
109
|
-
sigma_d = NMatrix.zeros([degree+1, @posts.size])
|
110
|
-
sigma.each_with_indices do |v, i, j|
|
111
|
-
break if i > degree
|
112
|
-
sigma_d[i, i] = v
|
113
|
-
end
|
114
|
-
|
115
|
-
return u2.dot(sigma_d).dot(vt).transpose
|
116
|
-
end
|
117
|
-
|
118
|
-
def document_correleation(accuracy = 1.0)
|
119
|
-
if accuracy == 1.0
|
120
|
-
scores = tfidf
|
121
|
-
else
|
122
|
-
scores = lsi(tfidf, accuracy)
|
123
|
-
end
|
124
|
-
|
70
|
+
def document_correleation()
|
71
|
+
scores = tfidf
|
125
72
|
result = scores.dot(scores.transpose)
|
126
73
|
|
127
74
|
result.each_with_indices do |_, u, v|
|
@@ -136,11 +83,11 @@ module Jekyll
|
|
136
83
|
end
|
137
84
|
|
138
85
|
def bag_of_words
|
139
|
-
result = NMatrix.new([@
|
140
|
-
@max = NMatrix.new([@
|
86
|
+
result = NMatrix.new([@docs.size, @keywords.size], 0.0)
|
87
|
+
@max = NMatrix.new([@docs.size], 0.0)
|
141
88
|
|
142
89
|
result.each_with_indices do |_, pi, ki|
|
143
|
-
result[pi, ki] = @
|
90
|
+
result[pi, ki] = @docs[pi][:content].count(@keywords[ki])
|
144
91
|
|
145
92
|
if result[pi, ki] > @max[pi]
|
146
93
|
@max[pi] = result[pi, ki]
|
@@ -162,15 +109,11 @@ module Jekyll
|
|
162
109
|
return result
|
163
110
|
end
|
164
111
|
|
165
|
-
def
|
112
|
+
def custom_weights(terms, weight = 8.0)
|
166
113
|
result = NMatrix.new([1, @keywords.size], 1.0)
|
167
114
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
next unless @keywords.include? keyword
|
172
|
-
|
173
|
-
result[0, @keywords.index(keyword)] = weight
|
115
|
+
terms.each do |term|
|
116
|
+
result[0, @keywords.index(term)] = weight
|
174
117
|
end
|
175
118
|
|
176
119
|
return result
|
@@ -202,20 +145,23 @@ module Jekyll
|
|
202
145
|
end
|
203
146
|
|
204
147
|
def stem(data)
|
205
|
-
|
148
|
+
data = data.gsub(/{%.+%}/, ' ') # Replace liquid templates
|
149
|
+
tokenized = data.scan(/\w+/).map(&:downcase)
|
206
150
|
filtered = @stopwords_filter.filter(tokenized)
|
207
|
-
stemmed = filtered.map(&:stem).select{|s|
|
208
|
-
|
151
|
+
stemmed = filtered.map(&:stem).select{|s| s.length > 1}.map(&:to_sym)
|
209
152
|
return stemmed
|
210
153
|
end
|
211
154
|
end
|
212
155
|
end
|
213
156
|
end
|
214
157
|
|
215
|
-
Jekyll::Hooks.register :
|
216
|
-
|
217
|
-
|
158
|
+
Jekyll::Hooks.register :site, :pre_render do |site|
|
159
|
+
Jekyll.logger.info("Building TFIDF index...")
|
160
|
+
tfidf = SangsooNam::Jekyll::TFIDFRelatedPosts.new
|
161
|
+
site.posts.docs.each do |x|
|
162
|
+
tfidf.add_post(x)
|
163
|
+
end
|
218
164
|
|
219
|
-
Jekyll
|
220
|
-
|
165
|
+
Jekyll.logger.info("Replaceing Related Posts...")
|
166
|
+
tfidf.build(site)
|
221
167
|
end
|