jekyll_ranked_search 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/jekyll_ranked_search.rb +28 -14
- data/lib/search.js +223 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4a717e9e1526e49b484e6b84067aa22daef311e21160e80a5c87a5a18fd6aeeb
|
4
|
+
data.tar.gz: 5f6403859df289ecd20971ecb775d08bcc88487616df544ebacc1c2e74805fe6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7c268065e1ffe5dbd646feca161ed346dfc649c52d2b4f7269e1cab98e17a3a6180cb70c81a82a8c9b081b4b3a4aa7a6d5dd4d7d9b3ea10610a20210610e0f46
|
7
|
+
data.tar.gz: 73822909d296b0d24a711cbffe27ceb906d6319bc419932ce5dc8be94774d1840339aba7eb58634c126c75a836bc4b4279275eab8ae431d47b17ffc9df0127d4
|
data/lib/jekyll_ranked_search.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "set"
|
2
4
|
require "redcarpet"
|
3
5
|
require "redcarpet/render_strip"
|
@@ -46,13 +48,28 @@ class TfidfConverter < Jekyll::Generator
|
|
46
48
|
# Create vocabulary
|
47
49
|
docs.each_with_index do |post, idx|
|
48
50
|
content = markdown.render(post.content)
|
51
|
+
|
52
|
+
# Tokenize content before applying any other transformations
|
53
|
+
tokenized = self.tokenize_words "#{post.data['title']} #{content}"
|
54
|
+
|
49
55
|
# Replace newlines with wide spaces and bullet points
|
50
|
-
|
51
|
-
content.gsub!(/\n/,
|
52
|
-
|
53
|
-
|
54
|
-
|
56
|
+
divider = " • "
|
57
|
+
content.gsub!(/\n/, divider)
|
58
|
+
|
59
|
+
# Remove trailing divider
|
60
|
+
if content.end_with?(divider)
|
61
|
+
content = content[0..-4]
|
62
|
+
end
|
55
63
|
|
64
|
+
# Take first n words of post
|
65
|
+
n_words = 40
|
66
|
+
splitted_content = content.split(" ")
|
67
|
+
word_count = splitted_content.length
|
68
|
+
content = splitted_content[..n_words].join(" ") # The first n words of the post
|
69
|
+
if word_count > n_words
|
70
|
+
content += "..."
|
71
|
+
end
|
72
|
+
|
56
73
|
processed_docs.push({
|
57
74
|
title: post.data['title'],
|
58
75
|
url: post.url,
|
@@ -60,7 +77,6 @@ class TfidfConverter < Jekyll::Generator
|
|
60
77
|
text: content,
|
61
78
|
})
|
62
79
|
|
63
|
-
tokenized = self.tokenize_words "#{post.data['title']} #{content}"
|
64
80
|
token_seen = false
|
65
81
|
tokenized.each do |word|
|
66
82
|
if !bow.include?(word)
|
@@ -94,8 +110,6 @@ class TfidfConverter < Jekyll::Generator
|
|
94
110
|
tfidf = {}
|
95
111
|
tf.each do |idx, freq|
|
96
112
|
token_idx, doc_idx = idx.split(',').map { |i| i.to_i }
|
97
|
-
# puts "token idx: #{token_idx}"
|
98
|
-
# puts df
|
99
113
|
_idf = Math.log(total_docs / df[token_idx] + 0.00001)
|
100
114
|
|
101
115
|
# Exponential decay over time (boost newer posts)
|
@@ -109,17 +123,18 @@ class TfidfConverter < Jekyll::Generator
|
|
109
123
|
end
|
110
124
|
|
111
125
|
def tokenize_words(doc)
|
112
|
-
#
|
126
|
+
# Load the stopword list once and cache it
|
113
127
|
@stopwords ||= self.load_stopwords
|
114
|
-
|
128
|
+
|
129
|
+
# Split document into tokens
|
115
130
|
splitted_doc = doc.strip.downcase.split
|
116
|
-
splitted_doc.delete_if { |word| @stopwords.include?(word) }
|
117
131
|
|
132
|
+
# Remove stopwords in place
|
133
|
+
splitted_doc.delete_if { |word| @stopwords.include?(word) }
|
118
134
|
|
119
135
|
# Remove special characters anywhere in the token (keeps letters, digits, "_", "/" and "-")
|
120
|
-
splitted_doc.map! { |word| word.gsub(/[^a-z0-
|
136
|
+
splitted_doc.map! { |word| word.gsub(/[^a-z0-9_\/\-\s]/i, '') }
|
121
137
|
|
122
|
-
# splitted_doc.map! { |word| word.tr("@#!?.:;[]()", "") }
|
123
138
|
splitted_doc
|
124
139
|
end
|
125
140
|
|
@@ -153,5 +168,4 @@ class TfidfConverter < Jekyll::Generator
|
|
153
168
|
end
|
154
169
|
page
|
155
170
|
end
|
156
|
-
|
157
171
|
end
|
data/lib/search.js
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
import {LitElement, html, css} from 'https://cdn.jsdelivr.net/gh/lit/dist@2/core/lit-core.min.js';
|
2
|
+
import 'https://cdn.jsdelivr.net/npm/@github/relative-time-element';
|
3
|
+
|
4
|
+
|
5
|
+
class SearchBox extends LitElement {
|
6
|
+
static properties = {
|
7
|
+
_data: {state: true, type: Array},
|
8
|
+
_results : {state: true, type: Array},
|
9
|
+
_open: {state: true, type: Boolean},
|
10
|
+
};
|
11
|
+
|
12
|
+
constructor() {
|
13
|
+
super();
|
14
|
+
this._data = [];
|
15
|
+
this._results = [];
|
16
|
+
this._open = false;
|
17
|
+
}
|
18
|
+
|
19
|
+
static styles = css`
|
20
|
+
:host {
|
21
|
+
position: relative;
|
22
|
+
display: block;
|
23
|
+
}
|
24
|
+
|
25
|
+
input#q {
|
26
|
+
box-sizing: border-box;
|
27
|
+
width: 100%;
|
28
|
+
// margin: 0 auto;
|
29
|
+
padding: .4em;
|
30
|
+
border: 1px solid #ccc;
|
31
|
+
font-size: 1.2em;
|
32
|
+
border-radius: 4px;
|
33
|
+
box-shadow: 1px 1px 3px #AAA;
|
34
|
+
z-index: 11;
|
35
|
+
}
|
36
|
+
|
37
|
+
#results {
|
38
|
+
position: absolute;
|
39
|
+
width: 100%;
|
40
|
+
margin-top: 4px;
|
41
|
+
z-index: 10;
|
42
|
+
background-color: #F6F6F6;
|
43
|
+
border-radius: 4px;
|
44
|
+
box-shadow: 1px 1px 2px #888;
|
45
|
+
}
|
46
|
+
|
47
|
+
.hide {
|
48
|
+
display: none;
|
49
|
+
}
|
50
|
+
|
51
|
+
.resultItem {
|
52
|
+
text-decoration: none;
|
53
|
+
color: #333;
|
54
|
+
padding: .4em;
|
55
|
+
display: flex;
|
56
|
+
flex-direction: column;
|
57
|
+
}
|
58
|
+
|
59
|
+
.resultItem:hover {
|
60
|
+
background-color: #F0F0F0;
|
61
|
+
}
|
62
|
+
.resultItem .title {
|
63
|
+
color: #1756a9;
|
64
|
+
font-weight: 500;
|
65
|
+
}
|
66
|
+
.resultItem .datetime {
|
67
|
+
color: #666;
|
68
|
+
font-size: .8em;
|
69
|
+
}
|
70
|
+
.resultItem .excerpt {
|
71
|
+
font-size: .8em;
|
72
|
+
}
|
73
|
+
.resultItemActive {
|
74
|
+
background-color: #F0F0F0;
|
75
|
+
}
|
76
|
+
`;
|
77
|
+
|
78
|
+
connectedCallback() {
|
79
|
+
super.connectedCallback();
|
80
|
+
this.loadData();
|
81
|
+
|
82
|
+
document.addEventListener('click', (event) => {
|
83
|
+
if (!event.composedPath().includes(this) && this._open) {
|
84
|
+
this.toggle();
|
85
|
+
}
|
86
|
+
});
|
87
|
+
|
88
|
+
// Register arrow keys
|
89
|
+
|
90
|
+
}
|
91
|
+
|
92
|
+
toggle() {
|
93
|
+
this._open = !this._open;
|
94
|
+
}
|
95
|
+
|
96
|
+
openIfResults() {
|
97
|
+
if (this._results.length > 0) {
|
98
|
+
this._open = true;
|
99
|
+
}
|
100
|
+
}
|
101
|
+
|
102
|
+
close() {
|
103
|
+
this._open = false;
|
104
|
+
}
|
105
|
+
|
106
|
+
async loadData() {
|
107
|
+
const response = await fetch("/search.json");
|
108
|
+
const jsonData = await response.json();
|
109
|
+
jsonData.word2doc = new Map(Object.entries(jsonData.word2doc));
|
110
|
+
jsonData.bow = new Map(Object.entries(jsonData.bow));
|
111
|
+
jsonData.tfidf = new Map(Object.entries(jsonData.tfidf));
|
112
|
+
this._data = jsonData;
|
113
|
+
}
|
114
|
+
|
115
|
+
disconnectedCallback() {
|
116
|
+
super.disconnectedCallback();
|
117
|
+
}
|
118
|
+
|
119
|
+
search(event) {
|
120
|
+
if (event.key === "Escape") {
|
121
|
+
this.close();
|
122
|
+
return;
|
123
|
+
}
|
124
|
+
const query = event.target.value.toLowerCase().trim();
|
125
|
+
if (query === "") {
|
126
|
+
this._results = [];
|
127
|
+
this.close();
|
128
|
+
return;
|
129
|
+
}
|
130
|
+
|
131
|
+
// Split query into word-tokens
|
132
|
+
const tokens = query.split(" ");
|
133
|
+
|
134
|
+
// Find token ids for each token
|
135
|
+
let tokenIds = new Set();
|
136
|
+
for (const token of tokens) {
|
137
|
+
if (token === "") {
|
138
|
+
continue;
|
139
|
+
}
|
140
|
+
if (this._data.bow.has(token)) {
|
141
|
+
tokenIds.add(this._data.bow.get(token));
|
142
|
+
} else {
|
143
|
+
// If one of the tokens is not available, we can return immediately
|
144
|
+
// as there will be no results
|
145
|
+
this._results = [];
|
146
|
+
this.close();
|
147
|
+
return;
|
148
|
+
}
|
149
|
+
}
|
150
|
+
|
151
|
+
// Convert tokenIds to array
|
152
|
+
tokenIds = [...tokenIds];
|
153
|
+
|
154
|
+
// Initialize docs with first token
|
155
|
+
// Subsequent tokens need to intersect with this set
|
156
|
+
let docs = new Set(this._data.word2doc.get(tokenIds[0].toString()));
|
157
|
+
|
158
|
+
for (const tokenId of tokenIds.slice(1)) {
|
159
|
+
// Find document candidates
|
160
|
+
const docCandidates = new Set(this._data.word2doc.get(tokenId.toString()));
|
161
|
+
// console.log("intersection", docCandidates, docs);
|
162
|
+
docs = new Set([...docs].filter((x) => docCandidates.has(x)));
|
163
|
+
}
|
164
|
+
|
165
|
+
// Calculate TF-IDF
|
166
|
+
let results = new Map();
|
167
|
+
for (const doc of docs) {
|
168
|
+
let score = 0;
|
169
|
+
for (const tokenId of tokenIds) {
|
170
|
+
if (this._data.tfidf.has(`${tokenId},${doc}`)) {
|
171
|
+
score += this._data.tfidf.get(`${tokenId},${doc}`);
|
172
|
+
}
|
173
|
+
}
|
174
|
+
results.set(doc, score);
|
175
|
+
}
|
176
|
+
|
177
|
+
// Sort by score
|
178
|
+
const candidates = [...results.entries()].sort((a, b) => b[1] - a[1]).map((a) => a[0]);
|
179
|
+
|
180
|
+
// Get top n results
|
181
|
+
this._results = candidates.map((idx) => this._data.docs[idx]).slice(0, 8);
|
182
|
+
this._open = true;
|
183
|
+
}
|
184
|
+
|
185
|
+
placeholder() {
|
186
|
+
if (this._data && this._data.docs && this._data.docs.length > 0) {
|
187
|
+
let plural = "";
|
188
|
+
if (this._data.docs.length !== 1) {
|
189
|
+
plural = "s";
|
190
|
+
}
|
191
|
+
return "Search in " + this._data.docs.length + ` post${plural}...`;
|
192
|
+
} else {
|
193
|
+
return "Loading...";
|
194
|
+
}
|
195
|
+
}
|
196
|
+
|
197
|
+
render() {
|
198
|
+
return html`<div>
|
199
|
+
<input id="q" type="text" placeholder="${this.placeholder()}" @keyup="${this.search}" @click=${this.openIfResults}>
|
200
|
+
${this._open ? html`
|
201
|
+
<div id="results">
|
202
|
+
${this._results.map((result) => html`
|
203
|
+
<a class="resultItem" href="${result.url}">
|
204
|
+
<div>
|
205
|
+
<span class="title">${result.title}</span>
|
206
|
+
<span class="datetime">
|
207
|
+
<relative-time datetime="${result.date}">
|
208
|
+
${result.date}
|
209
|
+
</relative-time>
|
210
|
+
</span>
|
211
|
+
</div>
|
212
|
+
<div class="excerpt">
|
213
|
+
${result.text}
|
214
|
+
</div>
|
215
|
+
</a>
|
216
|
+
`)}
|
217
|
+
</div>
|
218
|
+
` : ""}
|
219
|
+
|
220
|
+
</div>`;
|
221
|
+
}
|
222
|
+
}
|
223
|
+
customElements.define('search-box', SearchBox);
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jekyll_ranked_search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Friedrich Ewald
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-06-
|
11
|
+
date: 2023-06-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redcarpet
|
@@ -24,13 +24,14 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '3.6'
|
27
|
-
description:
|
27
|
+
description: Offline search plugin for Jekyll posts using TF-IDF
|
28
28
|
email: freddiemailster@gmail.com
|
29
29
|
executables: []
|
30
30
|
extensions: []
|
31
31
|
extra_rdoc_files: []
|
32
32
|
files:
|
33
33
|
- lib/jekyll_ranked_search.rb
|
34
|
+
- lib/search.js
|
34
35
|
- lib/search.json
|
35
36
|
- lib/stopwords.txt
|
36
37
|
homepage: https://github.com/f-ewald/jekyll_ranked_search
|