jekyll_ranked_search 0.0.1 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/jekyll_ranked_search.rb +28 -14
- data/lib/search.js +265 -0
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 81e9541ca2a4139827dfb10a8ad23bb3f493f9465d95c8c0d8f18f55affeb10f
|
4
|
+
data.tar.gz: cc861e099846391ab537624f4a509e7bf97987c5f68851335a613fe8fbadb6e9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cb47470cf5b035428a768d83804dd92703e754e3776f85655094f136e02d1523e0995c884595caf02716a29fe8cf7f416fd05f4549b18ca121c13fbad0ec252e
|
7
|
+
data.tar.gz: 5258f44e86d40cc44666aae5841ffd7e50c7a51b5c60dcf7666300177b278ffff0c4e89f5b4a6b2d62c6117f21edea8f8a0ee75ed076a8def7b010a572472a1d
|
data/lib/jekyll_ranked_search.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "set"
|
2
4
|
require "redcarpet"
|
3
5
|
require "redcarpet/render_strip"
|
@@ -46,13 +48,28 @@ class TfidfConverter < Jekyll::Generator
|
|
46
48
|
# Create vocabulary
|
47
49
|
docs.each_with_index do |post, idx|
|
48
50
|
content = markdown.render(post.content)
|
51
|
+
|
52
|
+
# Tokenize content before applying any other transformations
|
53
|
+
tokenized = self.tokenize_words "#{post.data['title']} #{content}"
|
54
|
+
|
49
55
|
# Replace newlines with wide spaces and bullet points
|
50
|
-
|
51
|
-
content.gsub!(/\n/,
|
52
|
-
|
53
|
-
|
54
|
-
|
56
|
+
divider = " • "
|
57
|
+
content.gsub!(/\n/, divider)
|
58
|
+
|
59
|
+
# Remove trailing divider
|
60
|
+
if content.end_with?(divider)
|
61
|
+
content = content[0..-4]
|
62
|
+
end
|
55
63
|
|
64
|
+
# Take first n words of post
|
65
|
+
n_words = 40
|
66
|
+
splitted_content = content.split(" ")
|
67
|
+
word_count = splitted_content.length
|
68
|
+
content = splitted_content[..n_words].join(" ") # The first n words of the post
|
69
|
+
if word_count > n_words
|
70
|
+
content += "..."
|
71
|
+
end
|
72
|
+
|
56
73
|
processed_docs.push({
|
57
74
|
title: post.data['title'],
|
58
75
|
url: post.url,
|
@@ -60,7 +77,6 @@ class TfidfConverter < Jekyll::Generator
|
|
60
77
|
text: content,
|
61
78
|
})
|
62
79
|
|
63
|
-
tokenized = self.tokenize_words "#{post.data['title']} #{content}"
|
64
80
|
token_seen = false
|
65
81
|
tokenized.each do |word|
|
66
82
|
if !bow.include?(word)
|
@@ -94,8 +110,6 @@ class TfidfConverter < Jekyll::Generator
|
|
94
110
|
tfidf = {}
|
95
111
|
tf.each do |idx, freq|
|
96
112
|
token_idx, doc_idx = idx.split(',').map { |i| i.to_i }
|
97
|
-
# puts "token idx: #{token_idx}"
|
98
|
-
# puts df
|
99
113
|
_idf = Math.log(total_docs / df[token_idx] + 0.00001)
|
100
114
|
|
101
115
|
# Exponential decay over time (boost newer posts)
|
@@ -109,17 +123,18 @@ class TfidfConverter < Jekyll::Generator
|
|
109
123
|
end
|
110
124
|
|
111
125
|
def tokenize_words(doc)
|
112
|
-
#
|
126
|
+
# Load the stopword list once and cache it for later calls
|
113
127
|
@stopwords ||= self.load_stopwords
|
114
|
-
|
128
|
+
|
129
|
+
# Split document into tokens
|
115
130
|
splitted_doc = doc.strip.downcase.split
|
116
|
-
splitted_doc.delete_if { |word| @stopwords.include?(word) }
|
117
131
|
|
132
|
+
# Remove stopwords in place
|
133
|
+
splitted_doc.delete_if { |word| @stopwords.include?(word) }
|
118
134
|
|
119
135
|
# Remove special characters (anywhere in the word, not just at the edges)
|
120
|
-
splitted_doc.map! { |word| word.gsub(/[^a-z0-
|
136
|
+
splitted_doc.map! { |word| word.gsub(/[^a-z0-9_\/\-\s]/i, '') }
|
121
137
|
|
122
|
-
# splitted_doc.map! { |word| word.tr("@#!?.:;[]()", "") }
|
123
138
|
splitted_doc
|
124
139
|
end
|
125
140
|
|
@@ -153,5 +168,4 @@ class TfidfConverter < Jekyll::Generator
|
|
153
168
|
end
|
154
169
|
page
|
155
170
|
end
|
156
|
-
|
157
171
|
end
|
data/lib/search.js
ADDED
@@ -0,0 +1,265 @@
|
|
1
|
+
import {LitElement, html, css} from 'https://cdn.jsdelivr.net/gh/lit/dist@2/core/lit-core.min.js';
|
2
|
+
import 'https://cdn.jsdelivr.net/npm/@github/relative-time-element';
|
3
|
+
|
4
|
+
|
5
|
+
class SearchBox extends LitElement {
|
6
|
+
static properties = {
|
7
|
+
_isLoading: {state: true, type: Boolean},
|
8
|
+
_data: {state: false, type: Array},
|
9
|
+
_results : {state: true, type: Array},
|
10
|
+
_open: {state: true, type: Boolean},
|
11
|
+
_placeholder: {state: true, type: String},
|
12
|
+
|
13
|
+
// Lazy loading
|
14
|
+
lazy: {type: Boolean, attribute: true},
|
15
|
+
};
|
16
|
+
|
17
|
+
/**
|
18
|
+
* Constructor. Sets up default values.
|
19
|
+
*/
|
20
|
+
constructor() {
|
21
|
+
super();
|
22
|
+
|
23
|
+
// No data initially loaded
|
24
|
+
this._data = [];
|
25
|
+
|
26
|
+
// Results are initially empty
|
27
|
+
this._results = [];
|
28
|
+
|
29
|
+
// Start in closed state, show no results
|
30
|
+
this._open = false;
|
31
|
+
|
32
|
+
// Default to not loading
|
33
|
+
this._isLoading = false;
|
34
|
+
|
35
|
+
// Disable lazy loading by default
|
36
|
+
this.lazy = false;
|
37
|
+
|
38
|
+
this._placeholder = "Search...";
|
39
|
+
}
|
40
|
+
|
41
|
+
static styles = css`
|
42
|
+
:host {
|
43
|
+
position: relative;
|
44
|
+
display: block;
|
45
|
+
}
|
46
|
+
|
47
|
+
input#q {
|
48
|
+
box-sizing: border-box;
|
49
|
+
width: 100%;
|
50
|
+
// margin: 0 auto;
|
51
|
+
padding: .4em;
|
52
|
+
border: 1px solid #ccc;
|
53
|
+
font-size: 1.2em;
|
54
|
+
border-radius: 4px;
|
55
|
+
box-shadow: 1px 1px 3px #AAA;
|
56
|
+
z-index: 11;
|
57
|
+
}
|
58
|
+
|
59
|
+
#results {
|
60
|
+
position: absolute;
|
61
|
+
width: 100%;
|
62
|
+
margin-top: 4px;
|
63
|
+
z-index: 10;
|
64
|
+
background-color: #F6F6F6;
|
65
|
+
border-radius: 4px;
|
66
|
+
box-shadow: 1px 1px 2px #888;
|
67
|
+
}
|
68
|
+
|
69
|
+
.hide {
|
70
|
+
display: none;
|
71
|
+
}
|
72
|
+
|
73
|
+
.resultItem {
|
74
|
+
text-decoration: none;
|
75
|
+
color: #333;
|
76
|
+
padding: .4em;
|
77
|
+
display: flex;
|
78
|
+
flex-direction: column;
|
79
|
+
}
|
80
|
+
|
81
|
+
.resultItem:hover {
|
82
|
+
background-color: #F0F0F0;
|
83
|
+
}
|
84
|
+
.resultItem .title {
|
85
|
+
color: #1756a9;
|
86
|
+
font-weight: 500;
|
87
|
+
}
|
88
|
+
.resultItem .datetime {
|
89
|
+
color: #666;
|
90
|
+
font-size: .8em;
|
91
|
+
}
|
92
|
+
.resultItem .excerpt {
|
93
|
+
font-size: .8em;
|
94
|
+
}
|
95
|
+
.resultItemActive {
|
96
|
+
background-color: #F0F0F0;
|
97
|
+
}
|
98
|
+
`;
|
99
|
+
|
100
|
+
connectedCallback() {
|
101
|
+
super.connectedCallback();
|
102
|
+
|
103
|
+
// Load data if lazy loading is disabled
|
104
|
+
if (!this.lazy) {
|
105
|
+
this.loadData();
|
106
|
+
}
|
107
|
+
|
108
|
+
document.addEventListener('click', (event) => {
|
109
|
+
if (!event.composedPath().includes(this) && this._open) {
|
110
|
+
this.toggle();
|
111
|
+
}
|
112
|
+
});
|
113
|
+
}
|
114
|
+
|
115
|
+
toggle() {
|
116
|
+
this._open = !this._open;
|
117
|
+
}
|
118
|
+
|
119
|
+
openIfResults() {
|
120
|
+
if (this._results.length > 0) {
|
121
|
+
this._open = true;
|
122
|
+
}
|
123
|
+
}
|
124
|
+
|
125
|
+
close() {
|
126
|
+
this._open = false;
|
127
|
+
}
|
128
|
+
|
129
|
+
async loadData() {
|
130
|
+
// Set state during loading
|
131
|
+
this._isLoading = true;
|
132
|
+
this.updatePlaceholder();
|
133
|
+
|
134
|
+
const response = await fetch("/search.json");
|
135
|
+
const jsonData = await response.json();
|
136
|
+
jsonData.word2doc = new Map(Object.entries(jsonData.word2doc));
|
137
|
+
jsonData.bow = new Map(Object.entries(jsonData.bow));
|
138
|
+
jsonData.tfidf = new Map(Object.entries(jsonData.tfidf));
|
139
|
+
this._data = jsonData;
|
140
|
+
|
141
|
+
// Cleanup state
|
142
|
+
this._isLoading = false;
|
143
|
+
this.updatePlaceholder();
|
144
|
+
}
|
145
|
+
|
146
|
+
disconnectedCallback() {
|
147
|
+
super.disconnectedCallback();
|
148
|
+
}
|
149
|
+
|
150
|
+
search(event) {
|
151
|
+
if (event.key === "Escape") {
|
152
|
+
this.close();
|
153
|
+
return;
|
154
|
+
}
|
155
|
+
const query = event.target.value.toLowerCase().trim();
|
156
|
+
if (query === "") {
|
157
|
+
this._results = [];
|
158
|
+
this.close();
|
159
|
+
return;
|
160
|
+
}
|
161
|
+
|
162
|
+
// Split query into word-tokens
|
163
|
+
const tokens = query.split(" ");
|
164
|
+
|
165
|
+
// Find token ids for each token
|
166
|
+
let tokenIds = new Set();
|
167
|
+
for (const token of tokens) {
|
168
|
+
if (token === "") {
|
169
|
+
continue;
|
170
|
+
}
|
171
|
+
if (this._data.bow.has(token)) {
|
172
|
+
tokenIds.add(this._data.bow.get(token));
|
173
|
+
} else {
|
174
|
+
// If one of the tokens is not available, we can return immediately
|
175
|
+
// as there will be no results
|
176
|
+
this._results = [];
|
177
|
+
this.close();
|
178
|
+
return;
|
179
|
+
}
|
180
|
+
}
|
181
|
+
|
182
|
+
// Convert tokenIds to array
|
183
|
+
tokenIds = [...tokenIds];
|
184
|
+
|
185
|
+
// Initialize docs with first token
|
186
|
+
// Subsequent tokens need to intersect with this set
|
187
|
+
let docs = new Set(this._data.word2doc.get(tokenIds[0].toString()));
|
188
|
+
|
189
|
+
for (const tokenId of tokenIds.slice(1)) {
|
190
|
+
// Find document candidates
|
191
|
+
const docCandidates = new Set(this._data.word2doc.get(tokenId.toString()));
|
192
|
+
docs = new Set([...docs].filter((x) => docCandidates.has(x)));
|
193
|
+
}
|
194
|
+
|
195
|
+
// Calculate TF-IDF
|
196
|
+
let results = new Map();
|
197
|
+
for (const doc of docs) {
|
198
|
+
let score = 0;
|
199
|
+
for (const tokenId of tokenIds) {
|
200
|
+
if (this._data.tfidf.has(`${tokenId},${doc}`)) {
|
201
|
+
score += this._data.tfidf.get(`${tokenId},${doc}`);
|
202
|
+
}
|
203
|
+
}
|
204
|
+
results.set(doc, score);
|
205
|
+
}
|
206
|
+
|
207
|
+
// Sort by score
|
208
|
+
const candidates = [...results.entries()].sort((a, b) => b[1] - a[1]).map((a) => a[0]);
|
209
|
+
|
210
|
+
// Get top n results
|
211
|
+
this._results = candidates.map((idx) => this._data.docs[idx]).slice(0, 8);
|
212
|
+
this._open = true;
|
213
|
+
}
|
214
|
+
|
215
|
+
updatePlaceholder() {
|
216
|
+
if (this._isLoading) {
|
217
|
+
this._placeholder = "Loading...";
|
218
|
+
return;
|
219
|
+
}
|
220
|
+
if (this._data && this._data.docs && this._data.docs.length > 0) {
|
221
|
+
let plural = "";
|
222
|
+
if (this._data.docs.length !== 1) {
|
223
|
+
plural = "s";
|
224
|
+
}
|
225
|
+
this._placeholder = "Search in " + this._data.docs.length + ` post${plural}...`;
|
226
|
+
return;
|
227
|
+
}
|
228
|
+
}
|
229
|
+
|
230
|
+
/**
|
231
|
+
* Event triggered on search box focus.
|
232
|
+
*/
|
233
|
+
focus(_) {
|
234
|
+
if (this.lazy && this._data.length === 0 && !this._isLoading) {
|
235
|
+
this.loadData();
|
236
|
+
}
|
237
|
+
}
|
238
|
+
|
239
|
+
render() {
|
240
|
+
return html`<div>
|
241
|
+
<input id="q" type="text" placeholder="${this._placeholder}" @keyup="${this.search}" @click=${this.openIfResults} @focus=${this.focus}>
|
242
|
+
${this._open ? html`
|
243
|
+
<div id="results">
|
244
|
+
${this._results.map((result) => html`
|
245
|
+
<a class="resultItem" href="${result.url}">
|
246
|
+
<div>
|
247
|
+
<span class="title">${result.title}</span>
|
248
|
+
<span class="datetime">
|
249
|
+
<relative-time datetime="${result.date}">
|
250
|
+
${result.date}
|
251
|
+
</relative-time>
|
252
|
+
</span>
|
253
|
+
</div>
|
254
|
+
<div class="excerpt">
|
255
|
+
${result.text}
|
256
|
+
</div>
|
257
|
+
</a>
|
258
|
+
`)}
|
259
|
+
</div>
|
260
|
+
` : ""}
|
261
|
+
|
262
|
+
</div>`;
|
263
|
+
}
|
264
|
+
}
|
265
|
+
customElements.define('search-box', SearchBox);
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jekyll_ranked_search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Friedrich Ewald
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-06-
|
11
|
+
date: 2023-06-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redcarpet
|
@@ -24,13 +24,14 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '3.6'
|
27
|
-
description:
|
27
|
+
description: Offline search plugin for Jekyll posts using TF-IDF
|
28
28
|
email: freddiemailster@gmail.com
|
29
29
|
executables: []
|
30
30
|
extensions: []
|
31
31
|
extra_rdoc_files: []
|
32
32
|
files:
|
33
33
|
- lib/jekyll_ranked_search.rb
|
34
|
+
- lib/search.js
|
34
35
|
- lib/search.json
|
35
36
|
- lib/stopwords.txt
|
36
37
|
homepage: https://github.com/f-ewald/jekyll_ranked_search
|
@@ -45,7 +46,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
45
46
|
requirements:
|
46
47
|
- - ">="
|
47
48
|
- !ruby/object:Gem::Version
|
48
|
-
version:
|
49
|
+
version: 2.5.0
|
49
50
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
51
|
requirements:
|
51
52
|
- - ">="
|
@@ -55,5 +56,5 @@ requirements: []
|
|
55
56
|
rubygems_version: 3.4.13
|
56
57
|
signing_key:
|
57
58
|
specification_version: 4
|
58
|
-
summary: TF-IDF search for Jekyll posts
|
59
|
+
summary: TF-IDF offline search for Jekyll posts
|
59
60
|
test_files: []
|