jekyll_ranked_search 0.0.1 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 78ba1763b6b2bca798b128851cd24a67c5f8f47b980d9201c8045d33b40834e1
4
- data.tar.gz: 0babe4848299d150103574360b54117207858f6a2171cd46dfc4c48fb971e041
3
+ metadata.gz: 81e9541ca2a4139827dfb10a8ad23bb3f493f9465d95c8c0d8f18f55affeb10f
4
+ data.tar.gz: cc861e099846391ab537624f4a509e7bf97987c5f68851335a613fe8fbadb6e9
5
5
  SHA512:
6
- metadata.gz: 729b789788706222be8f96680bd7a6d7839eccbd5eb66f04a1e8956e176e49c123c085ea2b52c2bdec32521904a788a1ea2f31a3b5443bf621a760e76960479a
7
- data.tar.gz: 227866c90b5e664291d7a13a0da968e5e2c3b0c07972ec7ab20d574b0a0a25d9b1094b1202962c7b46f216d83e56b8646ad283b5af5c21574be84295180f4e88
6
+ metadata.gz: cb47470cf5b035428a768d83804dd92703e754e3776f85655094f136e02d1523e0995c884595caf02716a29fe8cf7f416fd05f4549b18ca121c13fbad0ec252e
7
+ data.tar.gz: 5258f44e86d40cc44666aae5841ffd7e50c7a51b5c60dcf7666300177b278ffff0c4e89f5b4a6b2d62c6117f21edea8f8a0ee75ed076a8def7b010a572472a1d
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require "set"
2
4
  require "redcarpet"
3
5
  require "redcarpet/render_strip"
@@ -46,13 +48,28 @@ class TfidfConverter < Jekyll::Generator
46
48
  # Create vocabulary
47
49
  docs.each_with_index do |post, idx|
48
50
  content = markdown.render(post.content)
51
+
52
+ # Tokenize content before applying any other transformations
53
+ tokenized = self.tokenize_words "#{post.data['title']} #{content}"
54
+
49
55
  # Replace newlines with wide spaces and bullet points
50
- # TODO: Remove trailing bullet point
51
- content.gsub!(/\n/, ' • ')
52
- # TODO: Use first n words instead of characters
53
- content = markdown.render(content)
54
- content = content[..512] # The first 512 characters of the post
56
+ divider = " "
57
+ content.gsub!(/\n/, divider)
58
+
59
+ # Remove trailing divider
60
+ if content.end_with?(divider)
61
+ content = content[0..-4]
62
+ end
55
63
 
64
+ # Take first n words of post
65
+ n_words = 40
66
+ splitted_content = content.split(" ")
67
+ word_count = splitted_content.length
68
+ content = splitted_content[..n_words].join(" ") # The first n words of the post
69
+ if word_count > n_words
70
+ content += "..."
71
+ end
72
+
56
73
  processed_docs.push({
57
74
  title: post.data['title'],
58
75
  url: post.url,
@@ -60,7 +77,6 @@ class TfidfConverter < Jekyll::Generator
60
77
  text: content,
61
78
  })
62
79
 
63
- tokenized = self.tokenize_words "#{post.data['title']} #{content}"
64
80
  token_seen = false
65
81
  tokenized.each do |word|
66
82
  if !bow.include?(word)
@@ -94,8 +110,6 @@ class TfidfConverter < Jekyll::Generator
94
110
  tfidf = {}
95
111
  tf.each do |idx, freq|
96
112
  token_idx, doc_idx = idx.split(',').map { |i| i.to_i }
97
- # puts "token idx: #{token_idx}"
98
- # puts df
99
113
  _idf = Math.log(total_docs / df[token_idx] + 0.00001)
100
114
 
101
115
  # Exponential decay over time (boost newer posts)
@@ -109,17 +123,18 @@ class TfidfConverter < Jekyll::Generator
109
123
  end
110
124
 
111
125
  def tokenize_words(doc)
112
- # TODO: Better tokenization
126
+ # Load the stopword list once and cache it for later calls
113
127
  @stopwords ||= self.load_stopwords
114
- # replace_chars = /[-_:;@#,¿?¡!'"“”‘’`\/\(\)\[\]\{\}]/i
128
+
129
+ # Split document into tokens
115
130
  splitted_doc = doc.strip.downcase.split
116
- splitted_doc.delete_if { |word| @stopwords.include?(word) }
117
131
 
132
+ # Remove stopwords in place
133
+ splitted_doc.delete_if { |word| @stopwords.include?(word) }
118
134
 
119
135
  # Remove special characters (anywhere in the word, not only at the edges)
120
- splitted_doc.map! { |word| word.gsub(/[^a-z0-9\s]/i, '') }
136
+ splitted_doc.map! { |word| word.gsub(/[^a-z0-9_\/\-\s]/i, '') }
121
137
 
122
- # splitted_doc.map! { |word| word.tr("@#!?.:;[]()", "") }
123
138
  splitted_doc
124
139
  end
125
140
 
@@ -153,5 +168,4 @@ class TfidfConverter < Jekyll::Generator
153
168
  end
154
169
  page
155
170
  end
156
-
157
171
  end
data/lib/search.js ADDED
@@ -0,0 +1,265 @@
1
+ import {LitElement, html, css} from 'https://cdn.jsdelivr.net/gh/lit/dist@2/core/lit-core.min.js';
2
+ import 'https://cdn.jsdelivr.net/npm/@github/relative-time-element';
3
+
4
+
5
+ class SearchBox extends LitElement {
6
+ static properties = {
7
+ _isLoading: {state: true, type: Boolean},
8
+ _data: {state: false, type: Array},
9
+ _results : {state: true, type: Array},
10
+ _open: {state: true, type: Boolean},
11
+ _placeholder: {state: true, type: String},
12
+
13
+ // Lazy loading
14
+ lazy: {type: Boolean, attribute: true},
15
+ };
16
+
17
+ /**
18
+ * Constructor. Sets up default values.
19
+ */
20
+ constructor() {
21
+ super();
22
+
23
+ // No data initially loaded
24
+ this._data = [];
25
+
26
+ // Results are initially empty
27
+ this._results = [];
28
+
29
+ // Start in closed state, show no results
30
+ this._open = false;
31
+
32
+ // Default to not loading
33
+ this._isLoading = false;
34
+
35
+ // Disable lazy loading by default
36
+ this.lazy = false;
37
+
38
+ this._placeholder = "Search...";
39
+ }
40
+
41
+ static styles = css`
42
+ :host {
43
+ position: relative;
44
+ display: block;
45
+ }
46
+
47
+ input#q {
48
+ box-sizing: border-box;
49
+ width: 100%;
50
+ // margin: 0 auto;
51
+ padding: .4em;
52
+ border: 1px solid #ccc;
53
+ font-size: 1.2em;
54
+ border-radius: 4px;
55
+ box-shadow: 1px 1px 3px #AAA;
56
+ z-index: 11;
57
+ }
58
+
59
+ #results {
60
+ position: absolute;
61
+ width: 100%;
62
+ margin-top: 4px;
63
+ z-index: 10;
64
+ background-color: #F6F6F6;
65
+ border-radius: 4px;
66
+ box-shadow: 1px 1px 2px #888;
67
+ }
68
+
69
+ .hide {
70
+ display: none;
71
+ }
72
+
73
+ .resultItem {
74
+ text-decoration: none;
75
+ color: #333;
76
+ padding: .4em;
77
+ display: flex;
78
+ flex-direction: column;
79
+ }
80
+
81
+ .resultItem:hover {
82
+ background-color: #F0F0F0;
83
+ }
84
+ .resultItem .title {
85
+ color: #1756a9;
86
+ font-weight: 500;
87
+ }
88
+ .resultItem .datetime {
89
+ color: #666;
90
+ font-size: .8em;
91
+ }
92
+ .resultItem .excerpt {
93
+ font-size: .8em;
94
+ }
95
+ .resultItemActive {
96
+ background-color: #F0F0F0;
97
+ }
98
+ `;
99
+
100
+ connectedCallback() {
101
+ super.connectedCallback();
102
+
103
+ // Load data if lazy loading is disabled
104
+ if (!this.lazy) {
105
+ this.loadData();
106
+ }
107
+
108
+ document.addEventListener('click', (event) => {
109
+ if (!event.composedPath().includes(this) && this._open) {
110
+ this.toggle();
111
+ }
112
+ });
113
+ }
114
+
115
+ toggle() {
116
+ this._open = !this._open;
117
+ }
118
+
119
+ openIfResults() {
120
+ if (this._results.length > 0) {
121
+ this._open = true;
122
+ }
123
+ }
124
+
125
+ close() {
126
+ this._open = false;
127
+ }
128
+
129
+ async loadData() {
130
+ // Set state during loading
131
+ this._isLoading = true;
132
+ this.updatePlaceholder();
133
+
134
+ const response = await fetch("/search.json");
135
+ const jsonData = await response.json();
136
+ jsonData.word2doc = new Map(Object.entries(jsonData.word2doc));
137
+ jsonData.bow = new Map(Object.entries(jsonData.bow));
138
+ jsonData.tfidf = new Map(Object.entries(jsonData.tfidf));
139
+ this._data = jsonData;
140
+
141
+ // Cleanup state
142
+ this._isLoading = false;
143
+ this.updatePlaceholder();
144
+ }
145
+
146
+ disconnectedCallback() {
147
+ super.disconnectedCallback();
148
+ }
149
+
150
+ search(event) {
151
+ if (event.key === "Escape") {
152
+ this.close();
153
+ return;
154
+ }
155
+ const query = event.target.value.toLowerCase().trim();
156
+ if (query === "") {
157
+ this._results = [];
158
+ this.close();
159
+ return;
160
+ }
161
+
162
+ // Split query into word-tokens
163
+ const tokens = query.split(" ");
164
+
165
+ // Find token ids for each token
166
+ let tokenIds = new Set();
167
+ for (const token of tokens) {
168
+ if (token === "") {
169
+ continue;
170
+ }
171
+ if (this._data.bow.has(token)) {
172
+ tokenIds.add(this._data.bow.get(token));
173
+ } else {
174
+ // If one of the tokens is not available, we can return immediately
175
+ // as there will be no results
176
+ this._results = [];
177
+ this.close();
178
+ return;
179
+ }
180
+ }
181
+
182
+ // Convert tokenIds to array
183
+ tokenIds = [...tokenIds];
184
+
185
+ // Initialize docs with first token
186
+ // Subsequent tokens need to intersect with this set
187
+ let docs = new Set(this._data.word2doc.get(tokenIds[0].toString()));
188
+
189
+ for (const tokenId of tokenIds.slice(1)) {
190
+ // Find document candidates
191
+ const docCandidates = new Set(this._data.word2doc.get(tokenId.toString()));
192
+ docs = new Set([...docs].filter((x) => docCandidates.has(x)));
193
+ }
194
+
195
+ // Calculate TF-IDF
196
+ let results = new Map();
197
+ for (const doc of docs) {
198
+ let score = 0;
199
+ for (const tokenId of tokenIds) {
200
+ if (this._data.tfidf.has(`${tokenId},${doc}`)) {
201
+ score += this._data.tfidf.get(`${tokenId},${doc}`);
202
+ }
203
+ }
204
+ results.set(doc, score);
205
+ }
206
+
207
+ // Sort by score
208
+ const candidates = [...results.entries()].sort((a, b) => b[1] - a[1]).map((a) => a[0]);
209
+
210
+ // Get top n results
211
+ this._results = candidates.map((idx) => this._data.docs[idx]).slice(0, 8);
212
+ this._open = true;
213
+ }
214
+
215
+ updatePlaceholder() {
216
+ if (this._isLoading) {
217
+ this._placeholder = "Loading...";
218
+ return;
219
+ }
220
+ if (this._data && this._data.docs && this._data.docs.length > 0) {
221
+ let plural = "";
222
+ if (this._data.docs.length !== 1) {
223
+ plural = "s";
224
+ }
225
+ this._placeholder = "Search in " + this._data.docs.length + ` post${plural}...`;
226
+ return;
227
+ }
228
+ }
229
+
230
+ /**
231
+ * Event triggered on search box focus.
232
+ */
233
+ focus(_) {
234
+ if (this.lazy && this._data.length === 0 && !this._isLoading) {
235
+ this.loadData();
236
+ }
237
+ }
238
+
239
+ render() {
240
+ return html`<div>
241
+ <input id="q" type="text" placeholder="${this._placeholder}" @keyup="${this.search}" @click=${this.openIfResults} @focus=${this.focus}>
242
+ ${this._open ? html`
243
+ <div id="results">
244
+ ${this._results.map((result) => html`
245
+ <a class="resultItem" href="${result.url}">
246
+ <div>
247
+ <span class="title">${result.title}</span>
248
+ <span class="datetime">
249
+ <relative-time datetime="${result.date}">
250
+ ${result.date}
251
+ </relative-time>
252
+ </span>
253
+ </div>
254
+ <div class="excerpt">
255
+ ${result.text}
256
+ </div>
257
+ </a>
258
+ `)}
259
+ </div>
260
+ ` : ""}
261
+
262
+ </div>`;
263
+ }
264
+ }
265
+ customElements.define('search-box', SearchBox);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jekyll_ranked_search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Friedrich Ewald
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-06-22 00:00:00.000000000 Z
11
+ date: 2023-06-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redcarpet
@@ -24,13 +24,14 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '3.6'
27
- description: Search for Jekyll posts using TF-IDF
27
+ description: Offline search plugin for Jekyll posts using TF-IDF
28
28
  email: freddiemailster@gmail.com
29
29
  executables: []
30
30
  extensions: []
31
31
  extra_rdoc_files: []
32
32
  files:
33
33
  - lib/jekyll_ranked_search.rb
34
+ - lib/search.js
34
35
  - lib/search.json
35
36
  - lib/stopwords.txt
36
37
  homepage: https://github.com/f-ewald/jekyll_ranked_search
@@ -45,7 +46,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
45
46
  requirements:
46
47
  - - ">="
47
48
  - !ruby/object:Gem::Version
48
- version: '0'
49
+ version: 2.5.0
49
50
  required_rubygems_version: !ruby/object:Gem::Requirement
50
51
  requirements:
51
52
  - - ">="
@@ -55,5 +56,5 @@ requirements: []
55
56
  rubygems_version: 3.4.13
56
57
  signing_key:
57
58
  specification_version: 4
58
- summary: TF-IDF search for Jekyll posts
59
+ summary: TF-IDF offline search for Jekyll posts
59
60
  test_files: []