jekyll-tfidf-related-posts 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: be9718e2724638708bdd9f40b2c9a6a8b9eb37a8
4
+ data.tar.gz: c26b700ff4ec87fbb228a835bd487d33c0754191
5
+ SHA512:
6
+ metadata.gz: 9d5e168c31cc4c799174bb099006ac00321d130a09ed5445dbad5f56d6fd35c056e7e579f10ae191c523cf11678a03e8b4905268e24c491178a89a23aec25b42
7
+ data.tar.gz: 9f3d690f6be343d64a15793ec1edc521bbb83f828f4beff3d85d69b976e96691661b2285658d59d7c6694be2c3e652c33af669473aa30a84f305d6877b811631
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ Gemfile.lock
2
+ *.gem
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in jekyll-tfidf-related-posts.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2019 Sangsoo Nam
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,13 @@
1
+ # jekyll-tfidf-related-posts
2
+
3
+ [![DUB](https://img.shields.io/dub/l/vibe-d.svg)]()
4
+
5
+ [Jekyll](http://jekyllrb.com) plugin to show related posts based on the content, tags, and categories. The similarity is calculated using TF-IDF (term frequency-inverse document frequency). Since tags and categories are user-defined values, they are given higher weights than the content when calculating similarity.
6
+
7
+ ### Configuration
8
+
9
+ By default, 4 related posts are shown for each post. You can change this number in `_config.yml`:
10
+
11
+ ```
12
+ related_posts_count: 8
13
+ ```
@@ -0,0 +1,23 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
# Gem specification for the jekyll-tfidf-related-posts plugin.
Gem::Specification.new do |spec|
  spec.name          = "jekyll-tfidf-related-posts"
  spec.version       = "0.1.0"
  spec.authors       = ["Sangsoo Nam"]
  spec.email         = ["sangsoo.ted@gmail.com"]
  spec.summary       = %q{Jekyll plugin to show related posts based on the content, tags, and categories.}
  spec.description   = %q{[Jekyll](http://jekyllrb.com) plugin to show related posts based on the content, tags, and categories. The similarity is calculated using TF-IDF (term frequency-inverse document frequency). Since tags and categories are user-defined values, they are given higher weights than the content when calculating similarity.}
  spec.homepage      = "https://github.com/SangsooNam/jekyll-tfidf-related-posts"
  spec.license       = "MIT"

  # Package every file tracked by git; `-z` separates paths with NUL so that
  # file names containing whitespace are split correctly.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.require_paths = ["lib"]

  # Runtime dependencies required by lib/jekyll-tfidf-related-posts.rb.
  spec.add_dependency "jekyll", "~> 3.0"
  spec.add_dependency "stopwords-filter", "~> 0.4"
  spec.add_dependency "fast-stemmer", "~> 1.0"
  spec.add_dependency "pqueue", "~> 2.1"
  spec.add_dependency "nmatrix", "~> 0.2"
end
@@ -0,0 +1,167 @@
1
+ require 'rubygems'
2
+ require 'jekyll'
3
+ require 'fast_stemmer'
4
+ require 'stopwords'
5
+ require 'pqueue'
6
+ require 'nmatrix'
7
+
8
module SangsooNam
  module Jekyll
    # Builds a TF-IDF index over a set of posts and assigns each post the
    # posts most similar to it.
    #
    # Usage: call {#add_post} once per post, then {#build} once. Every post is
    # turned into a list of symbols: stemmed words from its content and title,
    # plus one "@tag:<name>" / "@category:<name>" symbol per tag and category.
    # Tag and category symbols are given a higher weight than ordinary words.
    class TFIDFRelatedPosts
      def initialize
        @docs = []
        @keywords = []
        @tags_and_categories = []
        @stopwords_filter = Stopwords::Snowball::Filter.new('en')
      end

      # Registers a post with the index.
      #
      # @param post [#content, #data] object exposing the post body through
      #   `content` and a hash with 'title', 'tags' and 'categories' through
      #   `data`
      def add_post(post)
        tags = prefixed_terms('@tag:', post.data['tags'])
        categories = prefixed_terms('@category:', post.data['categories'])
        doc = {
          post: post,
          content: stem(post.content) + stem(post.data['title']) + tags + categories
        }
        @docs << doc
        @keywords.concat(doc[:content])
        @tags_and_categories.concat(tags)
        @tags_and_categories.concat(categories)
      end

      # Computes similarities between all registered posts and stores the
      # result on each post in its `@related_posts` instance variable.
      #
      # @param site [#config] object whose `config` hash may define
      #   'related_posts_count' (defaults to 4)
      # @return [Array<Hash>] the registered documents
      def build(site)
        # Nothing to index; also avoids creating zero-sized matrices.
        return @docs if @docs.empty?

        @keywords.uniq!
        @tags_and_categories.uniq!
        @weights = custom_weights(@tags_and_categories)
        related = build_related_docs_with_score(site.config['related_posts_count'] || 4)

        @docs.each do |doc|
          doc[:post].instance_variable_set(:@related_posts, related[doc].map { |x| x[:post] })
        end
      end

      private

      # Turns tag/category values into prefixed symbols. Values are
      # interpolated, so non-string values (for example a numeric tag such as
      # 2019) and a missing (nil) list are handled.
      def prefixed_terms(prefix, values)
        Array(values).map { |value| "#{prefix}#{value}".to_sym }
      end

      # For every document, picks up to `count` other documents with the
      # highest similarity score, best first.
      #
      # @return [Hash{Hash => Array<Hash>}] document => [{score:, post:}, ...]
      def build_related_docs_with_score(count = 8)
        dc = document_correleation
        result = {}
        # A document is never related to itself, so at most size - 1 entries.
        limit = [count, @docs.size - 1].min

        @docs.each_with_index do |doc, index|
          queue = PQueue.new(dc.row(index).each_with_index) do |a, b|
            a[0] > b[0]
          end

          related = []
          while related.size < limit
            score, id = queue.pop
            break if id.nil?        # queue exhausted
            next if id == index     # skip the document itself

            related << { score: score, post: @docs[id][:post] }
          end
          result[doc] = related
        end

        result
      end

      # Pairwise similarity matrix of the documents.
      #
      # For documents u and v with TF-IDF vectors a and b the score is
      #   a.b / (a.a + b.b - a.b)
      # (the extended Jaccard / Tanimoto form). The diagonal is set to 0.0.
      def document_correleation
        scores = tfidf
        result = scores.dot(scores.transpose)
        # Capture the squared norms before the matrix is modified in place;
        # the diagonal is overwritten with 0.0 during the loop below.
        norms = Array.new(result.rows) { |i| result[i, i] }

        result.each_with_indices do |_, u, v|
          if u == v
            result[u, v] = 0.0
          else
            dot = result[u, v]
            denominator = norms[u] + norms[v] - dot
            # Two all-zero vectors would give 0/0; treat them as unrelated.
            result[u, v] = denominator.zero? ? 0.0 : dot / denominator
          end
        end

        result
      end

      # Documents x keywords matrix of raw term counts. Also records the
      # largest count per document in @max and keeps a copy of the counts in
      # @bag_of_words for the IDF computation.
      def bag_of_words
        result = NMatrix.new([@docs.size, @keywords.size], 0.0)
        @max = NMatrix.new([@docs.size], 0.0)
        # Count each document's terms once instead of rescanning the content
        # for every keyword.
        counts = @docs.map do |doc|
          doc[:content].each_with_object(Hash.new(0)) { |term, tally| tally[term] += 1 }
        end

        result.each_with_indices do |_, pi, ki|
          result[pi, ki] = counts[pi][@keywords[ki]]
          @max[pi] = result[pi, ki] if result[pi, ki] > @max[pi]
        end

        @bag_of_words = result.dup
        result
      end

      # Term frequency: raw counts scaled by the custom weights and divided
      # by the document's largest raw count.
      def term_frequency
        result = bag_of_words

        result.rows.times do |r|
          # A document without any term stays all-zero (avoids 0/0 = NaN).
          next if @max[r].zero?

          result[r, 0..-1] *= @weights
          result[r, 0..-1] /= @max[r]
        end

        result
      end

      # 1 x keywords row vector holding `weight` for each given term and 1.0
      # for every other keyword.
      def custom_weights(terms, weight = 8.0)
        result = NMatrix.new([1, @keywords.size], 1.0)

        terms.each do |term|
          result[0, @keywords.index(term)] = weight
        end

        result
      end

      # 1 x keywords row vector of log(N / df), where N is the number of
      # documents and df the number of documents containing the keyword.
      # Must be called after #bag_of_words has populated @bag_of_words.
      def inverse_document_frequency
        result = NMatrix.new([1, @keywords.size], 0.0)

        @bag_of_words.each_column do |column|
          # Seed with 0.0 so the first document is counted as 0/1 like every
          # other one, not by its raw term count.
          occurences = column.reduce(0.0) do |m, c|
            m + (c > 0 ? 1.0 : 0.0)
          end

          result[0, column.offset[1]] = Math.log(column.size / occurences) if occurences > 0
        end

        result
      end

      # Documents x keywords matrix of TF-IDF scores.
      def tfidf
        result = term_frequency
        idf = inverse_document_frequency

        result.rows.times do |r|
          result[r, 0..-1] *= idf
        end

        result
      end

      # Tokenizes text into stemmed symbols: strips Liquid tags, lowercases,
      # removes stop words, stems, and drops one-character stems.
      #
      # @param data [#to_s] raw text (nil is treated as empty)
      # @return [Array<Symbol>]
      def stem(data)
        # Non-greedy so that text between two Liquid tags on one line is kept.
        data = data.to_s.gsub(/\{%.+?%\}/, ' ')
        tokenized = data.scan(/\w+/).map(&:downcase)
        filtered = @stopwords_filter.filter(tokenized)
        filtered.map(&:stem).select { |s| s.length > 1 }.map(&:to_sym)
      end
    end
  end
end
157
+
158
# Before the site is rendered, index every post and replace each post's
# related posts with the TF-IDF based selection.
Jekyll::Hooks.register :site, :pre_render do |site|
  Jekyll.logger.info("Building TFIDF index...")
  tfidf = SangsooNam::Jekyll::TFIDFRelatedPosts.new
  site.posts.docs.each do |post|
    tfidf.add_post(post)
  end

  Jekyll.logger.info("Replacing Related Posts...")
  tfidf.build(site)
end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jekyll-tfidf-related-posts
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Sangsoo Nam
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-01-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: jekyll
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: stopwords-filter
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.4'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.4'
41
+ - !ruby/object:Gem::Dependency
42
+ name: fast-stemmer
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pqueue
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.1'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.1'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nmatrix
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.2'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.2'
83
+ description: "[Jekyll](http://jekyllrb.com) plugin to show related posts based on
84
+ the content, tags, and categories. The similarity is calculated using TF-IDF(term
85
+ frequency-inverted document frequency). Since tags and categories are use-defined
86
+ values, those are considered with higher weights than a content while calculating."
87
+ email:
88
+ - sangsoo.ted@gmail.com
89
+ executables: []
90
+ extensions: []
91
+ extra_rdoc_files: []
92
+ files:
93
+ - ".gitignore"
94
+ - Gemfile
95
+ - LICENSE.txt
96
+ - README.md
97
+ - jekyll-tfidf-related-posts.gemspec
98
+ - lib/jekyll-tfidf-related-posts.rb
99
+ homepage: https://github.com/SangsooNam/jekyll-tfidf-related-posts
100
+ licenses:
101
+ - MIT
102
+ metadata: {}
103
+ post_install_message:
104
+ rdoc_options: []
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 2.6.11
120
+ signing_key:
121
+ specification_version: 4
122
+ summary: Jekyll plugin to show related posts based on the content, tags, and categories.
123
+ test_files: []