jekyll_ranked_search 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 78ba1763b6b2bca798b128851cd24a67c5f8f47b980d9201c8045d33b40834e1
4
- data.tar.gz: 0babe4848299d150103574360b54117207858f6a2171cd46dfc4c48fb971e041
3
+ metadata.gz: 4a717e9e1526e49b484e6b84067aa22daef311e21160e80a5c87a5a18fd6aeeb
4
+ data.tar.gz: 5f6403859df289ecd20971ecb775d08bcc88487616df544ebacc1c2e74805fe6
5
5
  SHA512:
6
- metadata.gz: 729b789788706222be8f96680bd7a6d7839eccbd5eb66f04a1e8956e176e49c123c085ea2b52c2bdec32521904a788a1ea2f31a3b5443bf621a760e76960479a
7
- data.tar.gz: 227866c90b5e664291d7a13a0da968e5e2c3b0c07972ec7ab20d574b0a0a25d9b1094b1202962c7b46f216d83e56b8646ad283b5af5c21574be84295180f4e88
6
+ metadata.gz: 7c268065e1ffe5dbd646feca161ed346dfc649c52d2b4f7269e1cab98e17a3a6180cb70c81a82a8c9b081b4b3a4aa7a6d5dd4d7d9b3ea10610a20210610e0f46
7
+ data.tar.gz: 73822909d296b0d24a711cbffe27ceb906d6319bc419932ce5dc8be94774d1840339aba7eb58634c126c75a836bc4b4279275eab8ae431d47b17ffc9df0127d4
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require "set"
2
4
  require "redcarpet"
3
5
  require "redcarpet/render_strip"
@@ -46,13 +48,28 @@ class TfidfConverter < Jekyll::Generator
46
48
  # Create vocabulary
47
49
  docs.each_with_index do |post, idx|
48
50
  content = markdown.render(post.content)
51
+
52
+ # Tokenize content before applying any other transformations
53
+ tokenized = self.tokenize_words "#{post.data['title']} #{content}"
54
+
49
55
  # Replace newlines with wide spaces and bullet points
50
- # TODO: Remove trailing bullet point
51
- content.gsub!(/\n/, ' • ')
52
- # TODO: Use first n words instead of characters
53
- content = markdown.render(content)
54
- content = content[..512] # The first 512 characters of the post
56
+ divider = " "
57
+ content.gsub!(/\n/, divider)
58
+
59
+ # Remove trailing divider
60
+ if content.end_with?(divider)
61
+ content = content[0..-4]
62
+ end
55
63
 
64
+ # Take first n words of post
65
+ n_words = 40
66
+ splitted_content = content.split(" ")
67
+ word_count = splitted_content.length
68
+ content = splitted_content[..n_words].join(" ") # The first n words of the post
69
+ if word_count > n_words
70
+ content += "..."
71
+ end
72
+
56
73
  processed_docs.push({
57
74
  title: post.data['title'],
58
75
  url: post.url,
@@ -60,7 +77,6 @@ class TfidfConverter < Jekyll::Generator
60
77
  text: content,
61
78
  })
62
79
 
63
- tokenized = self.tokenize_words "#{post.data['title']} #{content}"
64
80
  token_seen = false
65
81
  tokenized.each do |word|
66
82
  if !bow.include?(word)
@@ -94,8 +110,6 @@ class TfidfConverter < Jekyll::Generator
94
110
  tfidf = {}
95
111
  tf.each do |idx, freq|
96
112
  token_idx, doc_idx = idx.split(',').map { |i| i.to_i }
97
- # puts "token idx: #{token_idx}"
98
- # puts df
99
113
  _idf = Math.log(total_docs / df[token_idx] + 0.00001)
100
114
 
101
115
  # Exponential decay over time (boost newer posts)
@@ -109,17 +123,18 @@ class TfidfConverter < Jekyll::Generator
109
123
  end
110
124
 
111
125
  def tokenize_words(doc)
112
- # TODO: Better tokenization
126
+ # Load the stopword list (cached after the first call)
113
127
  @stopwords ||= self.load_stopwords
114
- # replace_chars = /[-_:;@#,¿?¡!'"“”‘’`\/\(\)\[\]\{\}]/i
128
+
129
+ # Split document into tokens
115
130
  splitted_doc = doc.strip.downcase.split
116
- splitted_doc.delete_if { |word| @stopwords.include?(word) }
117
131
 
132
+ # Remove stopwords in place
133
+ splitted_doc.delete_if { |word| @stopwords.include?(word) }
118
134
 
119
135
  # Remove special characters (anywhere in the word, not only at the beginning and end)
120
- splitted_doc.map! { |word| word.gsub(/[^a-z0-9\s]/i, '') }
136
+ splitted_doc.map! { |word| word.gsub(/[^a-z0-9_\/\-\s]/i, '') }
121
137
 
122
- # splitted_doc.map! { |word| word.tr("@#!?.:;[]()", "") }
123
138
  splitted_doc
124
139
  end
125
140
 
@@ -153,5 +168,4 @@ class TfidfConverter < Jekyll::Generator
153
168
  end
154
169
  page
155
170
  end
156
-
157
171
  end
data/lib/search.js ADDED
@@ -0,0 +1,223 @@
1
+ import {LitElement, html, css} from 'https://cdn.jsdelivr.net/gh/lit/dist@2/core/lit-core.min.js';
2
+ import 'https://cdn.jsdelivr.net/npm/@github/relative-time-element';
3
+
4
+
5
+ class SearchBox extends LitElement {
6
+ static properties = {
7
+ _data: {state: true, type: Array},
8
+ _results : {state: true, type: Array},
9
+ _open: {state: true, type: Boolean},
10
+ };
11
+
12
+ constructor() {
13
+ super();
14
+ this._data = [];
15
+ this._results = [];
16
+ this._open = false;
17
+ }
18
+
19
+ static styles = css`
20
+ :host {
21
+ position: relative;
22
+ display: block;
23
+ }
24
+
25
+ input#q {
26
+ box-sizing: border-box;
27
+ width: 100%;
28
+ // margin: 0 auto;
29
+ padding: .4em;
30
+ border: 1px solid #ccc;
31
+ font-size: 1.2em;
32
+ border-radius: 4px;
33
+ box-shadow: 1px 1px 3px #AAA;
34
+ z-index: 11;
35
+ }
36
+
37
+ #results {
38
+ position: absolute;
39
+ width: 100%;
40
+ margin-top: 4px;
41
+ z-index: 10;
42
+ background-color: #F6F6F6;
43
+ border-radius: 4px;
44
+ box-shadow: 1px 1px 2px #888;
45
+ }
46
+
47
+ .hide {
48
+ display: none;
49
+ }
50
+
51
+ .resultItem {
52
+ text-decoration: none;
53
+ color: #333;
54
+ padding: .4em;
55
+ display: flex;
56
+ flex-direction: column;
57
+ }
58
+
59
+ .resultItem:hover {
60
+ background-color: #F0F0F0;
61
+ }
62
+ .resultItem .title {
63
+ color: #1756a9;
64
+ font-weight: 500;
65
+ }
66
+ .resultItem .datetime {
67
+ color: #666;
68
+ font-size: .8em;
69
+ }
70
+ .resultItem .excerpt {
71
+ font-size: .8em;
72
+ }
73
+ .resultItemActive {
74
+ background-color: #F0F0F0;
75
+ }
76
+ `;
77
+
78
+ connectedCallback() {
79
+ super.connectedCallback();
80
+ this.loadData();
81
+
82
+ document.addEventListener('click', (event) => {
83
+ if (!event.composedPath().includes(this) && this._open) {
84
+ this.toggle();
85
+ }
86
+ });
87
+
88
+ // Register arrow keys
89
+
90
+ }
91
+
92
+ toggle() {
93
+ this._open = !this._open;
94
+ }
95
+
96
+ openIfResults() {
97
+ if (this._results.length > 0) {
98
+ this._open = true;
99
+ }
100
+ }
101
+
102
+ close() {
103
+ this._open = false;
104
+ }
105
+
106
+ async loadData() {
107
+ const response = await fetch("/search.json");
108
+ const jsonData = await response.json();
109
+ jsonData.word2doc = new Map(Object.entries(jsonData.word2doc));
110
+ jsonData.bow = new Map(Object.entries(jsonData.bow));
111
+ jsonData.tfidf = new Map(Object.entries(jsonData.tfidf));
112
+ this._data = jsonData;
113
+ }
114
+
115
+ disconnectedCallback() {
116
+ super.disconnectedCallback();
117
+ }
118
+
119
+ search(event) {
120
+ if (event.key === "Escape") {
121
+ this.close();
122
+ return;
123
+ }
124
+ const query = event.target.value.toLowerCase().trim();
125
+ if (query === "") {
126
+ this._results = [];
127
+ this.close();
128
+ return;
129
+ }
130
+
131
+ // Split query into word-tokens
132
+ const tokens = query.split(" ");
133
+
134
+ // Find token ids for each token
135
+ let tokenIds = new Set();
136
+ for (const token of tokens) {
137
+ if (token === "") {
138
+ continue;
139
+ }
140
+ if (this._data.bow.has(token)) {
141
+ tokenIds.add(this._data.bow.get(token));
142
+ } else {
143
+ // If one of the tokens is not available, we can return immediately
144
+ // as there will be no results
145
+ this._results = [];
146
+ this.close();
147
+ return;
148
+ }
149
+ }
150
+
151
+ // Convert tokenIds to array
152
+ tokenIds = [...tokenIds];
153
+
154
+ // Initialize docs with first token
155
+ // Subsequent token need to interset with this set
156
+ let docs = new Set(this._data.word2doc.get(tokenIds[0].toString()));
157
+
158
+ for (const tokenId of tokenIds.slice(1)) {
159
+ // Find document candidates
160
+ const docCandidates = new Set(this._data.word2doc.get(tokenId.toString()));
161
+ // console.log("intersection", docCandidates, docs);
162
+ docs = new Set([...docs].filter((x) => docCandidates.has(x)));
163
+ }
164
+
165
+ // Calculate TF-IDF
166
+ let results = new Map();
167
+ for (const doc of docs) {
168
+ let score = 0;
169
+ for (const tokenId of tokenIds) {
170
+ if (this._data.tfidf.has(`${tokenId},${doc}`)) {
171
+ score += this._data.tfidf.get(`${tokenId},${doc}`);
172
+ }
173
+ }
174
+ results.set(doc, score);
175
+ }
176
+
177
+ // Sort by score
178
+ const candidates = [...results.entries()].sort((a, b) => b[1] - a[1]).map((a) => a[0]);
179
+
180
+ // Get top n results
181
+ this._results = candidates.map((idx) => this._data.docs[idx]).slice(0, 8);
182
+ this._open = true;
183
+ }
184
+
185
+ placeholder() {
186
+ if (this._data && this._data.docs && this._data.docs.length > 0) {
187
+ let plural = "";
188
+ if (this._data.docs.length !== 1) {
189
+ plural = "s";
190
+ }
191
+ return "Search in " + this._data.docs.length + ` post${plural}...`;
192
+ } else {
193
+ return "Loading...";
194
+ }
195
+ }
196
+
197
+ render() {
198
+ return html`<div>
199
+ <input id="q" type="text" placeholder="${this.placeholder()}" @keyup="${this.search}" @click=${this.openIfResults}>
200
+ ${this._open ? html`
201
+ <div id="results">
202
+ ${this._results.map((result) => html`
203
+ <a class="resultItem" href="${result.url}">
204
+ <div>
205
+ <span class="title">${result.title}</span>
206
+ <span class="datetime">
207
+ <relative-time datetime="${result.date}">
208
+ ${result.date}
209
+ </relative-time>
210
+ </span>
211
+ </div>
212
+ <div class="excerpt">
213
+ ${result.text}
214
+ </div>
215
+ </a>
216
+ `)}
217
+ </div>
218
+ ` : ""}
219
+
220
+ </div>`;
221
+ }
222
+ }
223
+ customElements.define('search-box', SearchBox);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jekyll_ranked_search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Friedrich Ewald
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-06-22 00:00:00.000000000 Z
11
+ date: 2023-06-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redcarpet
@@ -24,13 +24,14 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '3.6'
27
- description: Search for Jekyll posts using TF-IDF
27
+ description: Offline search plugin for Jekyll posts using TF-IDF
28
28
  email: freddiemailster@gmail.com
29
29
  executables: []
30
30
  extensions: []
31
31
  extra_rdoc_files: []
32
32
  files:
33
33
  - lib/jekyll_ranked_search.rb
34
+ - lib/search.js
34
35
  - lib/search.json
35
36
  - lib/stopwords.txt
36
37
  homepage: https://github.com/f-ewald/jekyll_ranked_search