jekyll_ranked_search 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 78ba1763b6b2bca798b128851cd24a67c5f8f47b980d9201c8045d33b40834e1
4
- data.tar.gz: 0babe4848299d150103574360b54117207858f6a2171cd46dfc4c48fb971e041
3
+ metadata.gz: 4a717e9e1526e49b484e6b84067aa22daef311e21160e80a5c87a5a18fd6aeeb
4
+ data.tar.gz: 5f6403859df289ecd20971ecb775d08bcc88487616df544ebacc1c2e74805fe6
5
5
  SHA512:
6
- metadata.gz: 729b789788706222be8f96680bd7a6d7839eccbd5eb66f04a1e8956e176e49c123c085ea2b52c2bdec32521904a788a1ea2f31a3b5443bf621a760e76960479a
7
- data.tar.gz: 227866c90b5e664291d7a13a0da968e5e2c3b0c07972ec7ab20d574b0a0a25d9b1094b1202962c7b46f216d83e56b8646ad283b5af5c21574be84295180f4e88
6
+ metadata.gz: 7c268065e1ffe5dbd646feca161ed346dfc649c52d2b4f7269e1cab98e17a3a6180cb70c81a82a8c9b081b4b3a4aa7a6d5dd4d7d9b3ea10610a20210610e0f46
7
+ data.tar.gz: 73822909d296b0d24a711cbffe27ceb906d6319bc419932ce5dc8be94774d1840339aba7eb58634c126c75a836bc4b4279275eab8ae431d47b17ffc9df0127d4
data/lib/jekyll_ranked_search.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require "set"
2
4
  require "redcarpet"
3
5
  require "redcarpet/render_strip"
@@ -46,13 +48,28 @@ class TfidfConverter < Jekyll::Generator
46
48
  # Create vocabulary
47
49
  docs.each_with_index do |post, idx|
48
50
  content = markdown.render(post.content)
51
+
52
+ # Tokenize content before applying any other transformations
53
+ tokenized = self.tokenize_words "#{post.data['title']} #{content}"
54
+
49
55
  # Replace newlines with wide spaces and bullet points
50
- # TODO: Remove trailing bullet point
51
- content.gsub!(/\n/, ' • ')
52
- # TODO: Use first n words instead of characters
53
- content = markdown.render(content)
54
- content = content[..512] # The first 512 characters of the post
56
+ divider = " "
57
+ content.gsub!(/\n/, divider)
58
+
59
+ # Remove trailing divider
60
+ if content.end_with?(divider)
61
+ content = content[0..-4]
62
+ end
55
63
 
64
+ # Take first n words of post
65
+ n_words = 40
66
+ splitted_content = content.split(" ")
67
+ word_count = splitted_content.length
68
+ content = splitted_content[..n_words].join(" ") # The first n words of the post
69
+ if word_count > n_words
70
+ content += "..."
71
+ end
72
+
56
73
  processed_docs.push({
57
74
  title: post.data['title'],
58
75
  url: post.url,
@@ -60,7 +77,6 @@ class TfidfConverter < Jekyll::Generator
60
77
  text: content,
61
78
  })
62
79
 
63
- tokenized = self.tokenize_words "#{post.data['title']} #{content}"
64
80
  token_seen = false
65
81
  tokenized.each do |word|
66
82
  if !bow.include?(word)
@@ -94,8 +110,6 @@ class TfidfConverter < Jekyll::Generator
94
110
  tfidf = {}
95
111
  tf.each do |idx, freq|
96
112
  token_idx, doc_idx = idx.split(',').map { |i| i.to_i }
97
- # puts "token idx: #{token_idx}"
98
- # puts df
99
113
  _idf = Math.log(total_docs / df[token_idx] + 0.00001)
100
114
 
101
115
  # Exponential decay over time (boost newer posts)
@@ -109,17 +123,18 @@ class TfidfConverter < Jekyll::Generator
109
123
  end
110
124
 
111
125
  def tokenize_words(doc)
112
- # TODO: Better tokenization
126
+ # Remove stopwords from document
113
127
  @stopwords ||= self.load_stopwords
114
- # replace_chars = /[-_:;@#,¿?¡!'"“”‘’`\/\(\)\[\]\{\}]/i
128
+
129
+ # Split document into tokens
115
130
  splitted_doc = doc.strip.downcase.split
116
- splitted_doc.delete_if { |word| @stopwords.include?(word) }
117
131
 
132
+ # Remove stopwords in place
133
+ splitted_doc.delete_if { |word| @stopwords.include?(word) }
118
134
 
119
135
  # Remove special characters (only at beginning and end)
120
- splitted_doc.map! { |word| word.gsub(/[^a-z0-9\s]/i, '') }
136
+ splitted_doc.map! { |word| word.gsub(/[^a-z0-9_\/\-\s]/i, '') }
121
137
 
122
- # splitted_doc.map! { |word| word.tr("@#!?.:;[]()", "") }
123
138
  splitted_doc
124
139
  end
125
140
 
@@ -153,5 +168,4 @@ class TfidfConverter < Jekyll::Generator
153
168
  end
154
169
  page
155
170
  end
156
-
157
171
  end
data/lib/search.js ADDED
@@ -0,0 +1,223 @@
1
+ import {LitElement, html, css} from 'https://cdn.jsdelivr.net/gh/lit/dist@2/core/lit-core.min.js';
2
+ import 'https://cdn.jsdelivr.net/npm/@github/relative-time-element';
3
+
4
+
5
+ class SearchBox extends LitElement {
6
+ static properties = {
7
+ _data: {state: true, type: Array},
8
+ _results : {state: true, type: Array},
9
+ _open: {state: true, type: Boolean},
10
+ };
11
+
12
+ constructor() {
13
+ super();
14
+ this._data = [];
15
+ this._results = [];
16
+ this._open = false;
17
+ }
18
+
19
+ static styles = css`
20
+ :host {
21
+ position: relative;
22
+ display: block;
23
+ }
24
+
25
+ input#q {
26
+ box-sizing: border-box;
27
+ width: 100%;
28
+ // margin: 0 auto;
29
+ padding: .4em;
30
+ border: 1px solid #ccc;
31
+ font-size: 1.2em;
32
+ border-radius: 4px;
33
+ box-shadow: 1px 1px 3px #AAA;
34
+ z-index: 11;
35
+ }
36
+
37
+ #results {
38
+ position: absolute;
39
+ width: 100%;
40
+ margin-top: 4px;
41
+ z-index: 10;
42
+ background-color: #F6F6F6;
43
+ border-radius: 4px;
44
+ box-shadow: 1px 1px 2px #888;
45
+ }
46
+
47
+ .hide {
48
+ display: none;
49
+ }
50
+
51
+ .resultItem {
52
+ text-decoration: none;
53
+ color: #333;
54
+ padding: .4em;
55
+ display: flex;
56
+ flex-direction: column;
57
+ }
58
+
59
+ .resultItem:hover {
60
+ background-color: #F0F0F0;
61
+ }
62
+ .resultItem .title {
63
+ color: #1756a9;
64
+ font-weight: 500;
65
+ }
66
+ .resultItem .datetime {
67
+ color: #666;
68
+ font-size: .8em;
69
+ }
70
+ .resultItem .excerpt {
71
+ font-size: .8em;
72
+ }
73
+ .resultItemActive {
74
+ background-color: #F0F0F0;
75
+ }
76
+ `;
77
+
78
+ connectedCallback() {
79
+ super.connectedCallback();
80
+ this.loadData();
81
+
82
+ document.addEventListener('click', (event) => {
83
+ if (!event.composedPath().includes(this) && this._open) {
84
+ this.toggle();
85
+ }
86
+ });
87
+
88
+ // Register arrow keys
89
+
90
+ }
91
+
92
+ toggle() {
93
+ this._open = !this._open;
94
+ }
95
+
96
+ openIfResults() {
97
+ if (this._results.length > 0) {
98
+ this._open = true;
99
+ }
100
+ }
101
+
102
+ close() {
103
+ this._open = false;
104
+ }
105
+
106
+ async loadData() {
107
+ const response = await fetch("/search.json");
108
+ const jsonData = await response.json();
109
+ jsonData.word2doc = new Map(Object.entries(jsonData.word2doc));
110
+ jsonData.bow = new Map(Object.entries(jsonData.bow));
111
+ jsonData.tfidf = new Map(Object.entries(jsonData.tfidf));
112
+ this._data = jsonData;
113
+ }
114
+
115
+ disconnectedCallback() {
116
+ super.disconnectedCallback();
117
+ }
118
+
119
+ search(event) {
120
+ if (event.key === "Escape") {
121
+ this.close();
122
+ return;
123
+ }
124
+ const query = event.target.value.toLowerCase().trim();
125
+ if (query === "") {
126
+ this._results = [];
127
+ this.close();
128
+ return;
129
+ }
130
+
131
+ // Split query into word-tokens
132
+ const tokens = query.split(" ");
133
+
134
+ // Find token ids for each token
135
+ let tokenIds = new Set();
136
+ for (const token of tokens) {
137
+ if (token === "") {
138
+ continue;
139
+ }
140
+ if (this._data.bow.has(token)) {
141
+ tokenIds.add(this._data.bow.get(token));
142
+ } else {
143
+ // If one of the tokens is not available, we can return immediately
144
+ // as there will be no results
145
+ this._results = [];
146
+ this.close();
147
+ return;
148
+ }
149
+ }
150
+
151
+ // Convert tokenIds to array
152
+ tokenIds = [...tokenIds];
153
+
154
+ // Initialize docs with first token
155
+ // Subsequent token need to interset with this set
156
+ let docs = new Set(this._data.word2doc.get(tokenIds[0].toString()));
157
+
158
+ for (const tokenId of tokenIds.slice(1)) {
159
+ // Find document candidates
160
+ const docCandidates = new Set(this._data.word2doc.get(tokenId.toString()));
161
+ // console.log("intersection", docCandidates, docs);
162
+ docs = new Set([...docs].filter((x) => docCandidates.has(x)));
163
+ }
164
+
165
+ // Calculate TF-IDF
166
+ let results = new Map();
167
+ for (const doc of docs) {
168
+ let score = 0;
169
+ for (const tokenId of tokenIds) {
170
+ if (this._data.tfidf.has(`${tokenId},${doc}`)) {
171
+ score += this._data.tfidf.get(`${tokenId},${doc}`);
172
+ }
173
+ }
174
+ results.set(doc, score);
175
+ }
176
+
177
+ // Sort by score
178
+ const candidates = [...results.entries()].sort((a, b) => b[1] - a[1]).map((a) => a[0]);
179
+
180
+ // Get top n results
181
+ this._results = candidates.map((idx) => this._data.docs[idx]).slice(0, 8);
182
+ this._open = true;
183
+ }
184
+
185
+ placeholder() {
186
+ if (this._data && this._data.docs && this._data.docs.length > 0) {
187
+ let plural = "";
188
+ if (this._data.docs.length !== 1) {
189
+ plural = "s";
190
+ }
191
+ return "Search in " + this._data.docs.length + ` post${plural}...`;
192
+ } else {
193
+ return "Loading...";
194
+ }
195
+ }
196
+
197
+ render() {
198
+ return html`<div>
199
+ <input id="q" type="text" placeholder="${this.placeholder()}" @keyup="${this.search}" @click=${this.openIfResults}>
200
+ ${this._open ? html`
201
+ <div id="results">
202
+ ${this._results.map((result) => html`
203
+ <a class="resultItem" href="${result.url}">
204
+ <div>
205
+ <span class="title">${result.title}</span>
206
+ <span class="datetime">
207
+ <relative-time datetime="${result.date}">
208
+ ${result.date}
209
+ </relative-time>
210
+ </span>
211
+ </div>
212
+ <div class="excerpt">
213
+ ${result.text}
214
+ </div>
215
+ </a>
216
+ `)}
217
+ </div>
218
+ ` : ""}
219
+
220
+ </div>`;
221
+ }
222
+ }
223
+ customElements.define('search-box', SearchBox);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jekyll_ranked_search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Friedrich Ewald
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-06-22 00:00:00.000000000 Z
11
+ date: 2023-06-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redcarpet
@@ -24,13 +24,14 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '3.6'
27
- description: Search for Jekyll posts using TF-IDF
27
+ description: Offline search plugin for Jekyll posts using TF-IDF
28
28
  email: freddiemailster@gmail.com
29
29
  executables: []
30
30
  extensions: []
31
31
  extra_rdoc_files: []
32
32
  files:
33
33
  - lib/jekyll_ranked_search.rb
34
+ - lib/search.js
34
35
  - lib/search.json
35
36
  - lib/stopwords.txt
36
37
  homepage: https://github.com/f-ewald/jekyll_ranked_search