jekyll-related-blog-posts 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/jekyll-related-blog-posts.rb +167 -0
  3. metadata +117 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 0c8751f960fdd690fb4a0c4c9899f9b425b9108cbf9cf5580f01e4afad33580b
4
+ data.tar.gz: ffcffde09d1e6c7ce0830295429b663c9577ac5b723c5039857ed3e23a7238b1
5
+ SHA512:
6
+ metadata.gz: 003dd525f02a5fd168430eb04d553070b7d77fce4433fc941b555402f95a65c9851a82abd1c190fee064d5b722d72ed668b28b81ce86f7d5e5851447fc6dfedb
7
+ data.tar.gz: 8986f4ec51761d582bcbb06e5bbb651e04a6f87f04dc7cb8e10e110cea9ab0c0dd38fbdac4edaa74741b1b894a83291c5e122c12c2c06c6e3adc6f4caf96a1d2
@@ -0,0 +1,167 @@
1
+ require 'rubygems'
2
+ require 'jekyll'
3
+ require 'fast_stemmer'
4
+ require 'stopwords'
5
+ require 'pqueue'
6
+ require 'nmatrix'
7
+
8
module SangsooNam
  module Jekyll
    # Ranks posts by textual similarity and stores, for every post, the list of
    # its most similar siblings in the post's +@related_posts+ instance variable.
    #
    # Each post becomes a bag of stemmed words (content + title) plus one
    # synthetic term per tag and per category. Terms are scored with TF-IDF,
    # tag/category terms receive an extra weight, and posts are compared with
    # the ratio a.b / (a.a + b.b - a.b) of their score vectors.
    #
    # Usage: call #add_post once per post, then #build once.
    class TFIDFRelatedPosts
      def initialize
        @docs = []                 # one { post:, content: } hash per added post
        @keywords = []             # every term seen; de-duplicated in #build
        @tags_and_categories = []  # the subset of terms that get extra weight
        @stopwords_filter = Stopwords::Snowball::Filter.new('en')
      end

      # Registers a post for indexing.
      #
      # @param post [#content, #data] object exposing the raw body via
      #   +content+ and front matter via +data+ ('title', 'tags', 'categories').
      #   Missing title, tags or categories are treated as empty.
      def add_post(post)
        tags = labelled_terms('@tag:', post.data['tags'])
        categories = labelled_terms('@category:', post.data['categories'])
        doc = {
          post: post,
          content: stem(post.content) + stem(post.data['title']) + tags + categories
        }

        @docs << doc
        @keywords += doc[:content]
        @tags_and_categories += tags + categories
      end

      # Computes similarities between all added posts and assigns each post its
      # related posts, best match first.
      #
      # @param site [#config] site whose config may define
      #   'related_posts_count' (defaults to 4 when absent).
      # @return [nil, Array] nil when there is nothing to index.
      def build(site)
        # Nothing to correlate; leave the posts untouched instead of building
        # zero-sized matrices.
        return if @docs.empty? || @keywords.empty?

        @keywords.uniq!
        @tags_and_categories.uniq!
        @weights = custom_weights(@tags_and_categories)
        related = build_related_docs_with_score(site.config['related_posts_count'] || 4)

        @docs.each do |doc|
          doc[:post].instance_variable_set(:@related_posts, related[doc].map { |x| x[:post] })
        end
      end

      private

      # Turns front-matter values into prefixed symbol terms, e.g.
      # labelled_terms('@tag:', ['ruby']) => [:"@tag:ruby"]. nil yields [].
      def labelled_terms(prefix, values)
        Array(values).map { |value| "#{prefix}#{value}".to_sym }
      end

      # Picks, for every document, the +count+ highest scoring other documents.
      #
      # @return [Hash] doc => [{ score:, post: }, ...] ordered best first.
      def build_related_docs_with_score(count = 8)
        dc = document_correleation
        result = {}.compare_by_identity
        # A post is never offered as related to itself, so there are at most
        # size - 1 candidates per post.
        count = [count, @docs.size - 1].min

        @docs.each_with_index do |doc, index|
          candidates = dc.row(index).each_with_index.reject { |_, id| id == index }
          queue = PQueue.new(candidates) { |a, b| a[0] > b[0] }

          result[doc] = []
          count.times do
            entry = queue.pop
            break if entry.nil? # queue exhausted

            score, id = entry
            result[doc] << { score: score, post: @docs[id][:post] }
          end
        end

        result
      end

      # Builds the document-by-document similarity matrix. Cell (u, v) holds
      # a.b / (a.a + b.b - a.b) for the TF-IDF rows of documents u and v;
      # the diagonal is 0.0.
      def document_correleation
        scores = tfidf
        result = scores.dot(scores.transpose)

        # Snapshot the squared norms first: the loop below overwrites the
        # diagonal, and every off-diagonal cell needs the original values.
        norms = Array.new(@docs.size) { |i| result[i, i] }

        result.each_with_indices do |_, u, v|
          if u == v
            result[u, v] = 0.0
          else
            similarity = result[u, v]
            denominator = norms[u] + norms[v] - similarity
            # Two all-zero vectors would give 0 / 0; score them as unrelated.
            result[u, v] = denominator.zero? ? 0.0 : similarity / denominator
          end
        end

        result
      end

      # Counts how often each keyword occurs in each document. Also records the
      # per-document maximum count in @max and keeps a copy of the raw counts
      # in @bag_of_words for #inverse_document_frequency.
      def bag_of_words
        result = NMatrix.new([@docs.size, @keywords.size], 0.0)
        @max = NMatrix.new([@docs.size], 0.0)

        # Tally each document once instead of rescanning it for every keyword.
        counts = @docs.map do |doc|
          doc[:content].each_with_object(Hash.new(0)) { |term, tally| tally[term] += 1 }
        end

        result.each_with_indices do |_, pi, ki|
          result[pi, ki] = counts[pi][@keywords[ki]]
          @max[pi] = result[pi, ki] if result[pi, ki] > @max[pi]
        end

        @bag_of_words = result.dup
        result
      end

      # Weighted term frequency: raw counts times the per-term weight, divided
      # by the document's highest raw count.
      def term_frequency
        result = bag_of_words

        result.rows.times do |r|
          result[r, 0..-1] *= @weights
          # A document without any term has max 0 and an all-zero row; skip
          # the division so the row stays 0.0 rather than becoming NaN.
          result[r, 0..-1] /= @max[r] if @max[r] > 0
        end

        result
      end

      # Builds a 1 x keywords row of multipliers: +weight+ for every term in
      # +terms+, 1.0 for everything else.
      def custom_weights(terms, weight = 8.0)
        result = NMatrix.new([1, @keywords.size], 1.0)
        positions = @keywords.each_with_index.to_h

        terms.each do |term|
          column = positions[term]
          result[0, column] = weight if column
        end

        result
      end

      # Builds a 1 x keywords row holding log(N / df) per term, where df is the
      # number of documents containing the term. Relies on @bag_of_words, so
      # #bag_of_words must have run first (see #tfidf).
      def inverse_document_frequency
        result = NMatrix.new([1, @keywords.size], 0.0)

        @bag_of_words.each_column do |column|
          # Seed with 0.0: without a seed, reduce starts from the first cell's
          # raw count rather than from "zero documents seen".
          occurences = column.reduce(0.0) { |sum, c| sum + (c > 0 ? 1.0 : 0.0) }

          result[0, column.offset[1]] = Math.log(column.size / occurences) if occurences > 0
        end

        result
      end

      # Multiplies every term-frequency row by the IDF row.
      def tfidf
        result = term_frequency # must run first: it populates @bag_of_words
        idf = inverse_document_frequency

        result.rows.times do |r|
          result[r, 0..-1] *= idf
        end

        result
      end

      # Tokenises text into lowercase, stop-word-filtered, stemmed symbols.
      # Tokens whose stem is a single character are dropped.
      #
      # @param data [String, nil] raw text; nil is treated as empty.
      # @return [Array<Symbol>]
      def stem(data)
        # Non-greedy so each {% ... %} tag is removed on its own; a greedy
        # match would also swallow the text between two tags.
        text = data.to_s.gsub(/\{%.+?%\}/m, ' ')
        tokenized = text.scan(/\w+/).map(&:downcase)
        filtered = @stopwords_filter.filter(tokenized)
        filtered.map(&:stem).select { |s| s.length > 1 }.map(&:to_sym)
      end
    end
  end
end
157
+
158
# Before the site is rendered, index every post and replace each post's
# related-posts list with the TF-IDF based ranking.
Jekyll::Hooks.register :site, :pre_render do |site|
  Jekyll.logger.info("Building TFIDF index...")
  tfidf = SangsooNam::Jekyll::TFIDFRelatedPosts.new
  site.posts.docs.each { |post| tfidf.add_post(post) }

  Jekyll.logger.info("Replacing Related Posts...")
  tfidf.build(site)
end
metadata ADDED
@@ -0,0 +1,117 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jekyll-related-blog-posts
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Manpreet singh
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2022-05-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: jekyll
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '3.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '3.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: stopwords-filter
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.4'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.4'
41
+ - !ruby/object:Gem::Dependency
42
+ name: fast-stemmer
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pqueue
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.1'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.1'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nmatrix
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.2'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.2'
83
+ description: Jekyll plugin to show related posts based on the content, tags, and categories. The
84
+ similarity is calculated using TF-IDF (term frequency-inverse document frequency).
85
+ Since tags and categories are user-defined values, they are given higher
86
+ weights than the content when calculating similarity.
87
+ email:
88
+ - ms4110415@gmail.com
89
+ executables: []
90
+ extensions: []
91
+ extra_rdoc_files: []
92
+ files:
93
+ - lib/jekyll-related-blog-posts.rb
94
+ homepage: https://github.com/ManpreetChoudhary/jekyll-related-posts
95
+ licenses:
96
+ - MIT
97
+ metadata: {}
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubygems_version: 3.1.2
114
+ signing_key:
115
+ specification_version: 4
116
+ summary: Jekyll plugin to show related posts based on the content, tags, and categories.
117
+ test_files: []