jekyll-related-blog-posts 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/jekyll-related-blog-posts.rb +167 -0
  3. metadata +117 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 0c8751f960fdd690fb4a0c4c9899f9b425b9108cbf9cf5580f01e4afad33580b
4
+ data.tar.gz: ffcffde09d1e6c7ce0830295429b663c9577ac5b723c5039857ed3e23a7238b1
5
+ SHA512:
6
+ metadata.gz: 003dd525f02a5fd168430eb04d553070b7d77fce4433fc941b555402f95a65c9851a82abd1c190fee064d5b722d72ed668b28b81ce86f7d5e5851447fc6dfedb
7
+ data.tar.gz: 8986f4ec51761d582bcbb06e5bbb651e04a6f87f04dc7cb8e10e110cea9ab0c0dd38fbdac4edaa74741b1b894a83291c5e122c12c2c06c6e3adc6f4caf96a1d2
@@ -0,0 +1,167 @@
1
+ require 'rubygems'
2
+ require 'jekyll'
3
+ require 'fast_stemmer'
4
+ require 'stopwords'
5
+ require 'pqueue'
6
+ require 'nmatrix'
7
+
8
module SangsooNam
  module Jekyll
    # Ranks the posts of a site against each other by TF-IDF similarity.
    #
    # Each post is reduced to a list of symbols: stemmed words from its
    # content and title plus one "@tag:..." / "@category:..." term per tag
    # and category. Tag and category terms are weighted more heavily than
    # ordinary words. Pairwise similarity is the Tanimoto coefficient of the
    # TF-IDF vectors: a.b / (|a|^2 + |b|^2 - a.b).
    #
    # Usage: call #add_post for every post, then #build once.
    class TFIDFRelatedPosts
      # Number of related posts used when the site config does not set
      # 'related_posts_count'.
      DEFAULT_RELATED_COUNT = 4

      # Multiplier applied to tag and category terms.
      TAXONOMY_WEIGHT = 8.0

      def initialize
        @docs = []
        @keywords = []
        @tags_and_categories = []
        @stopwords_filter = Stopwords::Snowball::Filter.new('en')
      end

      # Registers a post in the index.
      #
      # @param post [#content, #data] object exposing the raw content and a
      #   data hash with 'title', 'tags' and 'categories' entries
      # @return [Array<Symbol>] the accumulated tag/category terms
      def add_post(post)
        tags = taxonomy_terms('tag', post.data['tags'])
        categories = taxonomy_terms('category', post.data['categories'])
        doc = {
          post: post,
          content: stem(post.content) + stem(post.data['title']) + tags + categories
        }

        @docs << doc
        # concat mutates in place instead of re-allocating the arrays per post.
        @keywords.concat(doc[:content])
        @tags_and_categories.concat(tags).concat(categories)
      end

      # Computes the related posts of every registered post and stores them
      # in each post's @related_posts instance variable.
      #
      # @param site [#config] object whose config hash may carry
      #   'related_posts_count'
      # @return [Array<Hash>] the registered documents
      def build(site)
        @keywords.uniq!
        @tags_and_categories.uniq!
        # Nothing to rank; also avoids building a zero-sized matrix.
        return @docs if @docs.empty? || @keywords.empty?

        @weights = custom_weights(@tags_and_categories)
        related = build_related_docs_with_score(site.config['related_posts_count'] || DEFAULT_RELATED_COUNT)

        @docs.each do |doc|
          doc[:post].instance_variable_set(:@related_posts, related[doc].map { |entry| entry[:post] })
        end
      end

      private

      # Builds the "@<kind>:<value>" terms for a post's tags or categories.
      # Accepts nil and non-string values (e.g. a numeric tag).
      def taxonomy_terms(kind, values)
        Array(values).map { |value| :"@#{kind}:#{value}" }
      end

      # Maps each keyword to its column index in the term matrices.
      def keyword_columns
        @keywords.each_with_index.to_h
      end

      # Picks, for every document, the `count` most similar other documents.
      #
      # @param count [Integer] maximum number of related posts per document
      # @return [Hash{Hash => Array<Hash>}] identity-keyed map from document
      #   to a list of { score:, post: } entries, best match first
      def build_related_docs_with_score(count = DEFAULT_RELATED_COUNT)
        similarity = document_correleation
        # A post is never related to itself, so at most size - 1 candidates.
        limit = [[count, @docs.size - 1].min, 0].max
        # Documents are large hashes; compare by identity so lookups do not
        # hash their whole content.
        result = {}.compare_by_identity

        @docs.each_with_index do |doc, index|
          candidates = (0...@docs.size).reject { |other| other == index }
          # Highest score first; ties fall back to document order so the
          # outcome is deterministic.
          ranked = candidates.sort_by { |other| [-similarity[index, other], other] }
          result[doc] = ranked.first(limit).map do |other|
            { score: similarity[index, other], post: @docs[other][:post] }
          end
        end

        result
      end

      # Pairwise Tanimoto similarity of the TF-IDF vectors.
      #
      # @return [NMatrix] square matrix with a zero diagonal
      def document_correleation
        scores = tfidf
        result = scores.dot(scores.transpose)
        size = result.rows
        # Capture the squared norms before the matrix is overwritten in place;
        # every off-diagonal cell needs them for its denominator.
        norms = Array.new(size) { |i| result[i, i] }

        size.times do |u|
          size.times do |v|
            if u == v
              result[u, v] = 0.0
            else
              denominator = norms[u] + norms[v] - result[u, v]
              # The denominator is zero only when both vectors are all zero.
              result[u, v] = denominator > 0 ? result[u, v] / denominator : 0.0
            end
          end
        end

        result
      end

      # Raw term counts as a documents x keywords matrix. Also records the
      # per-document counts (column index => count) in @term_counts so later
      # steps only visit non-zero cells.
      #
      # @return [NMatrix]
      def bag_of_words
        columns = keyword_columns
        result = NMatrix.new([@docs.size, @keywords.size], 0.0)

        @term_counts = @docs.map do |doc|
          doc[:content].each_with_object(Hash.new(0)) do |term, counts|
            counts[columns.fetch(term)] += 1
          end
        end

        @term_counts.each_with_index do |counts, row|
          counts.each { |column, occurrences| result[row, column] = occurrences.to_f }
        end

        result
      end

      # Weighted term frequency: count * weight / highest count in the document.
      #
      # @return [NMatrix]
      def term_frequency
        result = bag_of_words

        @term_counts.each_with_index do |counts, row|
          # Documents without terms have no cells to visit, so the zero
          # peak is never used as a divisor.
          peak = counts.values.max.to_f
          counts.each_key do |column|
            result[row, column] = result[row, column] * @weights[column] / peak
          end
        end

        result
      end

      # Per-keyword weights: `weight` for the given terms, 1.0 otherwise.
      #
      # @param terms [Array<Symbol>] terms to boost
      # @param weight [Float] multiplier for those terms
      # @return [Array<Float>] one weight per keyword column
      def custom_weights(terms, weight = TAXONOMY_WEIGHT)
        columns = keyword_columns
        weights = Array.new(@keywords.size, 1.0)

        terms.each do |term|
          column = columns[term]
          weights[column] = weight if column
        end

        weights
      end

      # Inverse document frequency, log(N / df), per keyword column. Relies
      # on @term_counts, so #bag_of_words must have run first.
      #
      # @return [Array<Float>]
      def inverse_document_frequency
        document_frequency = Array.new(@keywords.size, 0)
        @term_counts.each do |counts|
          # Each document counts once per term, however often it repeats it.
          counts.each_key { |column| document_frequency[column] += 1 }
        end

        total = @docs.size.to_f
        document_frequency.map { |df| df > 0 ? Math.log(total / df) : 0.0 }
      end

      # TF-IDF scores as a documents x keywords matrix.
      #
      # @return [NMatrix]
      def tfidf
        result = term_frequency
        idf = inverse_document_frequency

        @term_counts.each_with_index do |counts, row|
          counts.each_key { |column| result[row, column] *= idf[column] }
        end

        result
      end

      # Turns text into a list of stemmed, stop-word-free terms.
      #
      # @param data [String, nil] raw text
      # @return [Array<Symbol>] stems longer than one character
      def stem(data)
        # Non-greedy so text between two Liquid tags on one line is kept;
        # /m lets a single tag span several lines.
        text = data.to_s.gsub(/\{%.+?%\}/m, ' ')
        tokens = text.scan(/\w+/).map(&:downcase)
        filtered = @stopwords_filter.filter(tokens)
        filtered.map(&:stem).select { |s| s.length > 1 }.map(&:to_sym)
      end
    end
  end
end
157
+
158
# Before the site is rendered, index every post and overwrite each post's
# related posts with the TF-IDF ranking.
Jekyll::Hooks.register :site, :pre_render do |site|
  Jekyll.logger.info("Building TFIDF index...")
  tfidf = SangsooNam::Jekyll::TFIDFRelatedPosts.new
  site.posts.docs.each { |post| tfidf.add_post(post) }

  Jekyll.logger.info("Replacing Related Posts...")
  tfidf.build(site)
end
metadata ADDED
@@ -0,0 +1,117 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jekyll-related-blog-posts
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Manpreet singh
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2022-05-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: jekyll
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '3.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '3.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: stopwords-filter
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.4'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.4'
41
+ - !ruby/object:Gem::Dependency
42
+ name: fast-stemmer
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pqueue
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.1'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.1'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nmatrix
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.2'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.2'
83
+ description: Jekyll plugin to show related posts based on the content, tags, and categories. The
84
+ similarity is calculated using TF-IDF (term frequency-inverse document frequency).
85
+ Since tags and categories are user-defined values, those are considered with higher
86
+ weights than the content when calculating similarity.
87
+ email:
88
+ - ms4110415@gmail.com
89
+ executables: []
90
+ extensions: []
91
+ extra_rdoc_files: []
92
+ files:
93
+ - lib/jekyll-related-blog-posts.rb
94
+ homepage: https://github.com/ManpreetChoudhary/jekyll-related-posts
95
+ licenses:
96
+ - MIT
97
+ metadata: {}
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubygems_version: 3.1.2
114
+ signing_key:
115
+ specification_version: 4
116
+ summary: Jekyll plugin to show related posts based on the content, tags, and categories.
117
+ test_files: []