jekyll_ranked_search 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/jekyll_ranked_search.rb +28 -14
- data/lib/search.js +223 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4a717e9e1526e49b484e6b84067aa22daef311e21160e80a5c87a5a18fd6aeeb
|
4
|
+
data.tar.gz: 5f6403859df289ecd20971ecb775d08bcc88487616df544ebacc1c2e74805fe6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7c268065e1ffe5dbd646feca161ed346dfc649c52d2b4f7269e1cab98e17a3a6180cb70c81a82a8c9b081b4b3a4aa7a6d5dd4d7d9b3ea10610a20210610e0f46
|
7
|
+
data.tar.gz: 73822909d296b0d24a711cbffe27ceb906d6319bc419932ce5dc8be94774d1840339aba7eb58634c126c75a836bc4b4279275eab8ae431d47b17ffc9df0127d4
|
data/lib/jekyll_ranked_search.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "set"
|
2
4
|
require "redcarpet"
|
3
5
|
require "redcarpet/render_strip"
|
@@ -46,13 +48,28 @@ class TfidfConverter < Jekyll::Generator
|
|
46
48
|
# Create vocabulary
|
47
49
|
docs.each_with_index do |post, idx|
|
48
50
|
content = markdown.render(post.content)
|
51
|
+
|
52
|
+
# Tokenize content before applying any other transformations
|
53
|
+
tokenized = self.tokenize_words "#{post.data['title']} #{content}"
|
54
|
+
|
49
55
|
# Replace newlines with wide spaces and bullet points
|
50
|
-
|
51
|
-
content.gsub!(/\n/,
|
52
|
-
|
53
|
-
|
54
|
-
|
56
|
+
divider = " • "
|
57
|
+
content.gsub!(/\n/, divider)
|
58
|
+
|
59
|
+
# Remove trailing divider
|
60
|
+
if content.end_with?(divider)
|
61
|
+
content = content[0..-4]
|
62
|
+
end
|
55
63
|
|
64
|
+
# Take first n words of post
|
65
|
+
n_words = 40
|
66
|
+
splitted_content = content.split(" ")
|
67
|
+
word_count = splitted_content.length
|
68
|
+
content = splitted_content[..n_words].join(" ") # The first n words of the post
|
69
|
+
if word_count > n_words
|
70
|
+
content += "..."
|
71
|
+
end
|
72
|
+
|
56
73
|
processed_docs.push({
|
57
74
|
title: post.data['title'],
|
58
75
|
url: post.url,
|
@@ -60,7 +77,6 @@ class TfidfConverter < Jekyll::Generator
|
|
60
77
|
text: content,
|
61
78
|
})
|
62
79
|
|
63
|
-
tokenized = self.tokenize_words "#{post.data['title']} #{content}"
|
64
80
|
token_seen = false
|
65
81
|
tokenized.each do |word|
|
66
82
|
if !bow.include?(word)
|
@@ -94,8 +110,6 @@ class TfidfConverter < Jekyll::Generator
|
|
94
110
|
tfidf = {}
|
95
111
|
tf.each do |idx, freq|
|
96
112
|
token_idx, doc_idx = idx.split(',').map { |i| i.to_i }
|
97
|
-
# puts "token idx: #{token_idx}"
|
98
|
-
# puts df
|
99
113
|
_idf = Math.log(total_docs / df[token_idx] + 0.00001)
|
100
114
|
|
101
115
|
# Exponential decay over time (boost newer posts)
|
@@ -109,17 +123,18 @@ class TfidfConverter < Jekyll::Generator
|
|
109
123
|
end
|
110
124
|
|
111
125
|
def tokenize_words(doc)
|
112
|
-
#
|
126
|
+
# Remove stopwords from document
|
113
127
|
@stopwords ||= self.load_stopwords
|
114
|
-
|
128
|
+
|
129
|
+
# Split document into tokens
|
115
130
|
splitted_doc = doc.strip.downcase.split
|
116
|
-
splitted_doc.delete_if { |word| @stopwords.include?(word) }
|
117
131
|
|
132
|
+
# Remove stopwords in place
|
133
|
+
splitted_doc.delete_if { |word| @stopwords.include?(word) }
|
118
134
|
|
119
135
|
# Remove special characters (only at beginning and end)
|
120
|
-
splitted_doc.map! { |word| word.gsub(/[^a-z0-
|
136
|
+
splitted_doc.map! { |word| word.gsub(/[^a-z0-9_\/\-\s]/i, '') }
|
121
137
|
|
122
|
-
# splitted_doc.map! { |word| word.tr("@#!?.:;[]()", "") }
|
123
138
|
splitted_doc
|
124
139
|
end
|
125
140
|
|
@@ -153,5 +168,4 @@ class TfidfConverter < Jekyll::Generator
|
|
153
168
|
end
|
154
169
|
page
|
155
170
|
end
|
156
|
-
|
157
171
|
end
|
data/lib/search.js
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
import {LitElement, html, css} from 'https://cdn.jsdelivr.net/gh/lit/dist@2/core/lit-core.min.js';
|
2
|
+
import 'https://cdn.jsdelivr.net/npm/@github/relative-time-element';
|
3
|
+
|
4
|
+
|
5
|
+
class SearchBox extends LitElement {
|
6
|
+
static properties = {
|
7
|
+
_data: {state: true, type: Array},
|
8
|
+
_results : {state: true, type: Array},
|
9
|
+
_open: {state: true, type: Boolean},
|
10
|
+
};
|
11
|
+
|
12
|
+
constructor() {
|
13
|
+
super();
|
14
|
+
this._data = [];
|
15
|
+
this._results = [];
|
16
|
+
this._open = false;
|
17
|
+
}
|
18
|
+
|
19
|
+
static styles = css`
|
20
|
+
:host {
|
21
|
+
position: relative;
|
22
|
+
display: block;
|
23
|
+
}
|
24
|
+
|
25
|
+
input#q {
|
26
|
+
box-sizing: border-box;
|
27
|
+
width: 100%;
|
28
|
+
// margin: 0 auto;
|
29
|
+
padding: .4em;
|
30
|
+
border: 1px solid #ccc;
|
31
|
+
font-size: 1.2em;
|
32
|
+
border-radius: 4px;
|
33
|
+
box-shadow: 1px 1px 3px #AAA;
|
34
|
+
z-index: 11;
|
35
|
+
}
|
36
|
+
|
37
|
+
#results {
|
38
|
+
position: absolute;
|
39
|
+
width: 100%;
|
40
|
+
margin-top: 4px;
|
41
|
+
z-index: 10;
|
42
|
+
background-color: #F6F6F6;
|
43
|
+
border-radius: 4px;
|
44
|
+
box-shadow: 1px 1px 2px #888;
|
45
|
+
}
|
46
|
+
|
47
|
+
.hide {
|
48
|
+
display: none;
|
49
|
+
}
|
50
|
+
|
51
|
+
.resultItem {
|
52
|
+
text-decoration: none;
|
53
|
+
color: #333;
|
54
|
+
padding: .4em;
|
55
|
+
display: flex;
|
56
|
+
flex-direction: column;
|
57
|
+
}
|
58
|
+
|
59
|
+
.resultItem:hover {
|
60
|
+
background-color: #F0F0F0;
|
61
|
+
}
|
62
|
+
.resultItem .title {
|
63
|
+
color: #1756a9;
|
64
|
+
font-weight: 500;
|
65
|
+
}
|
66
|
+
.resultItem .datetime {
|
67
|
+
color: #666;
|
68
|
+
font-size: .8em;
|
69
|
+
}
|
70
|
+
.resultItem .excerpt {
|
71
|
+
font-size: .8em;
|
72
|
+
}
|
73
|
+
.resultItemActive {
|
74
|
+
background-color: #F0F0F0;
|
75
|
+
}
|
76
|
+
`;
|
77
|
+
|
78
|
+
connectedCallback() {
|
79
|
+
super.connectedCallback();
|
80
|
+
this.loadData();
|
81
|
+
|
82
|
+
document.addEventListener('click', (event) => {
|
83
|
+
if (!event.composedPath().includes(this) && this._open) {
|
84
|
+
this.toggle();
|
85
|
+
}
|
86
|
+
});
|
87
|
+
|
88
|
+
// Register arrow keys
|
89
|
+
|
90
|
+
}
|
91
|
+
|
92
|
+
toggle() {
|
93
|
+
this._open = !this._open;
|
94
|
+
}
|
95
|
+
|
96
|
+
openIfResults() {
|
97
|
+
if (this._results.length > 0) {
|
98
|
+
this._open = true;
|
99
|
+
}
|
100
|
+
}
|
101
|
+
|
102
|
+
close() {
|
103
|
+
this._open = false;
|
104
|
+
}
|
105
|
+
|
106
|
+
async loadData() {
|
107
|
+
const response = await fetch("/search.json");
|
108
|
+
const jsonData = await response.json();
|
109
|
+
jsonData.word2doc = new Map(Object.entries(jsonData.word2doc));
|
110
|
+
jsonData.bow = new Map(Object.entries(jsonData.bow));
|
111
|
+
jsonData.tfidf = new Map(Object.entries(jsonData.tfidf));
|
112
|
+
this._data = jsonData;
|
113
|
+
}
|
114
|
+
|
115
|
+
disconnectedCallback() {
|
116
|
+
super.disconnectedCallback();
|
117
|
+
}
|
118
|
+
|
119
|
+
search(event) {
|
120
|
+
if (event.key === "Escape") {
|
121
|
+
this.close();
|
122
|
+
return;
|
123
|
+
}
|
124
|
+
const query = event.target.value.toLowerCase().trim();
|
125
|
+
if (query === "") {
|
126
|
+
this._results = [];
|
127
|
+
this.close();
|
128
|
+
return;
|
129
|
+
}
|
130
|
+
|
131
|
+
// Split query into word-tokens
|
132
|
+
const tokens = query.split(" ");
|
133
|
+
|
134
|
+
// Find token ids for each token
|
135
|
+
let tokenIds = new Set();
|
136
|
+
for (const token of tokens) {
|
137
|
+
if (token === "") {
|
138
|
+
continue;
|
139
|
+
}
|
140
|
+
if (this._data.bow.has(token)) {
|
141
|
+
tokenIds.add(this._data.bow.get(token));
|
142
|
+
} else {
|
143
|
+
// If one of the tokens is not available, we can return immediately
|
144
|
+
// as there will be no results
|
145
|
+
this._results = [];
|
146
|
+
this.close();
|
147
|
+
return;
|
148
|
+
}
|
149
|
+
}
|
150
|
+
|
151
|
+
// Convert tokenIds to array
|
152
|
+
tokenIds = [...tokenIds];
|
153
|
+
|
154
|
+
// Initialize docs with first token
|
155
|
+
// Subsequent tokens need to intersect with this set
|
156
|
+
let docs = new Set(this._data.word2doc.get(tokenIds[0].toString()));
|
157
|
+
|
158
|
+
for (const tokenId of tokenIds.slice(1)) {
|
159
|
+
// Find document candidates
|
160
|
+
const docCandidates = new Set(this._data.word2doc.get(tokenId.toString()));
|
161
|
+
// console.log("intersection", docCandidates, docs);
|
162
|
+
docs = new Set([...docs].filter((x) => docCandidates.has(x)));
|
163
|
+
}
|
164
|
+
|
165
|
+
// Calculate TF-IDF
|
166
|
+
let results = new Map();
|
167
|
+
for (const doc of docs) {
|
168
|
+
let score = 0;
|
169
|
+
for (const tokenId of tokenIds) {
|
170
|
+
if (this._data.tfidf.has(`${tokenId},${doc}`)) {
|
171
|
+
score += this._data.tfidf.get(`${tokenId},${doc}`);
|
172
|
+
}
|
173
|
+
}
|
174
|
+
results.set(doc, score);
|
175
|
+
}
|
176
|
+
|
177
|
+
// Sort by score
|
178
|
+
const candidates = [...results.entries()].sort((a, b) => b[1] - a[1]).map((a) => a[0]);
|
179
|
+
|
180
|
+
// Get top n results
|
181
|
+
this._results = candidates.map((idx) => this._data.docs[idx]).slice(0, 8);
|
182
|
+
this._open = true;
|
183
|
+
}
|
184
|
+
|
185
|
+
placeholder() {
|
186
|
+
if (this._data && this._data.docs && this._data.docs.length > 0) {
|
187
|
+
let plural = "";
|
188
|
+
if (this._data.docs.length !== 1) {
|
189
|
+
plural = "s";
|
190
|
+
}
|
191
|
+
return "Search in " + this._data.docs.length + ` post${plural}...`;
|
192
|
+
} else {
|
193
|
+
return "Loading...";
|
194
|
+
}
|
195
|
+
}
|
196
|
+
|
197
|
+
render() {
|
198
|
+
return html`<div>
|
199
|
+
<input id="q" type="text" placeholder="${this.placeholder()}" @keyup="${this.search}" @click=${this.openIfResults}>
|
200
|
+
${this._open ? html`
|
201
|
+
<div id="results">
|
202
|
+
${this._results.map((result) => html`
|
203
|
+
<a class="resultItem" href="${result.url}">
|
204
|
+
<div>
|
205
|
+
<span class="title">${result.title}</span>
|
206
|
+
<span class="datetime">
|
207
|
+
<relative-time datetime="${result.date}">
|
208
|
+
${result.date}
|
209
|
+
</relative-time>
|
210
|
+
</span>
|
211
|
+
</div>
|
212
|
+
<div class="excerpt">
|
213
|
+
${result.text}
|
214
|
+
</div>
|
215
|
+
</a>
|
216
|
+
`)}
|
217
|
+
</div>
|
218
|
+
` : ""}
|
219
|
+
|
220
|
+
</div>`;
|
221
|
+
}
|
222
|
+
}
|
223
|
+
customElements.define('search-box', SearchBox);
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jekyll_ranked_search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Friedrich Ewald
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-06-
|
11
|
+
date: 2023-06-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redcarpet
|
@@ -24,13 +24,14 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '3.6'
|
27
|
-
description:
|
27
|
+
description: Offline search plugin for Jekyll posts using TF-IDF
|
28
28
|
email: freddiemailster@gmail.com
|
29
29
|
executables: []
|
30
30
|
extensions: []
|
31
31
|
extra_rdoc_files: []
|
32
32
|
files:
|
33
33
|
- lib/jekyll_ranked_search.rb
|
34
|
+
- lib/search.js
|
34
35
|
- lib/search.json
|
35
36
|
- lib/stopwords.txt
|
36
37
|
homepage: https://github.com/f-ewald/jekyll_ranked_search
|