RubyGems - jekyll_ranked_search - Versions diffs - 0.0.1 → 0.0.2 - Mend

jekyll_ranked_search 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

checksums.yaml +4 -4
data/lib/jekyll_ranked_search.rb +28 -14
data/lib/search.js +223 -0
metadata +4 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 78ba1763b6b2bca798b128851cd24a67c5f8f47b980d9201c8045d33b40834e1
-  data.tar.gz: 0babe4848299d150103574360b54117207858f6a2171cd46dfc4c48fb971e041
+  metadata.gz: 4a717e9e1526e49b484e6b84067aa22daef311e21160e80a5c87a5a18fd6aeeb
+  data.tar.gz: 5f6403859df289ecd20971ecb775d08bcc88487616df544ebacc1c2e74805fe6
 SHA512:
-  metadata.gz: 729b789788706222be8f96680bd7a6d7839eccbd5eb66f04a1e8956e176e49c123c085ea2b52c2bdec32521904a788a1ea2f31a3b5443bf621a760e76960479a
-  data.tar.gz: 227866c90b5e664291d7a13a0da968e5e2c3b0c07972ec7ab20d574b0a0a25d9b1094b1202962c7b46f216d83e56b8646ad283b5af5c21574be84295180f4e88
+  metadata.gz: 7c268065e1ffe5dbd646feca161ed346dfc649c52d2b4f7269e1cab98e17a3a6180cb70c81a82a8c9b081b4b3a4aa7a6d5dd4d7d9b3ea10610a20210610e0f46
+  data.tar.gz: 73822909d296b0d24a711cbffe27ceb906d6319bc419932ce5dc8be94774d1840339aba7eb58634c126c75a836bc4b4279275eab8ae431d47b17ffc9df0127d4

data/lib/jekyll_ranked_search.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require "set"
 require "redcarpet"
 require "redcarpet/render_strip"
@@ -46,13 +48,28 @@ class TfidfConverter < Jekyll::Generator
     # Create vocabulary
     docs.each_with_index do |post, idx|
       content = markdown.render(post.content)
+      # Tokenize content before applying any other transformations
+      tokenized = self.tokenize_words "#{post.data['title']} #{content}"
       # Replace newlines with wide spaces and bullet points
-      # TODO: Remove trailing bullet point
-      content.gsub!(/\n/, ' • ')
-      # TODO: Use first n words instead of characters
-      content = markdown.render(content)
-      content = content[..512]  # The first 512 characters of the post
+      divider = " • "
+      content.gsub!(/\n/, divider)
+      # Remove trailing divider
+      if content.end_with?(divider)
+        content = content[0..-4]
+      end
+      # Take first n words of post
+      n_words = 40
+      splitted_content = content.split(" ")
+      word_count = splitted_content.length
+      content = splitted_content[..n_words].join(" ")  # The first n words of the post
+      if word_count > n_words
+        content += "..."
+      end
       processed_docs.push({
         title: post.data['title'],
         url: post.url,
@@ -60,7 +77,6 @@ class TfidfConverter < Jekyll::Generator
         text: content,
       })
-      tokenized = self.tokenize_words "#{post.data['title']} #{content}"
       token_seen = false
       tokenized.each do |word|
         if !bow.include?(word)
@@ -94,8 +110,6 @@ class TfidfConverter < Jekyll::Generator
     tfidf = {}
     tf.each do |idx, freq|
       token_idx, doc_idx = idx.split(',').map { |i| i.to_i }
-      # puts "token idx: #{token_idx}"
-      # puts df
       _idf = Math.log(total_docs / df[token_idx] + 0.00001)
       # Exponential decay over time (boost newer posts)
@@ -109,17 +123,18 @@ class TfidfConverter < Jekyll::Generator
   end
   def tokenize_words(doc)
-    # TODO: Better tokenization
+    # Load the stopword list once and memoize it
     @stopwords ||= self.load_stopwords
-    # replace_chars = /[-_:;@#,¿?¡!'"“”‘’`\/\(\)\[\]\{\}]/i
+    # Split document into tokens
     splitted_doc = doc.strip.downcase.split
-    splitted_doc.delete_if { |word| @stopwords.include?(word) }
+    # Remove stopwords in place
+    splitted_doc.delete_if { |word| @stopwords.include?(word) }
     # Strip every character outside the allowed set from each word (anywhere, not only at the ends)
-    splitted_doc.map! { |word| word.gsub(/[^a-z0-9\s]/i, '') }
+    splitted_doc.map! { |word| word.gsub(/[^a-z0-9_\/\-\s]/i, '') }
-    # splitted_doc.map! { |word| word.tr("@#!?.:;[]()", "") }
     splitted_doc
   end
@@ -153,5 +168,4 @@ class TfidfConverter < Jekyll::Generator
     end
     page
   end
 end

data/lib/search.js ADDED Viewed

@@ -0,0 +1,223 @@
+import {LitElement, html, css} from 'https://cdn.jsdelivr.net/gh/lit/dist@2/core/lit-core.min.js';
+import 'https://cdn.jsdelivr.net/npm/@github/relative-time-element';
+class SearchBox extends LitElement {
+  static properties = {
+    _data: {state: true, type: Array},
+    _results : {state: true, type: Array},
+    _open: {state: true, type: Boolean},
+  };
+  constructor() {
+    super();
+    this._data = [];
+    this._results = [];
+    this._open = false;
+  }
+  static styles = css`
+    :host {
+      position: relative;
+      display: block;
+    }
+    input#q {
+      box-sizing: border-box;
+      width: 100%;
+      // margin: 0 auto;
+      padding: .4em;
+      border: 1px solid #ccc;
+      font-size: 1.2em;
+      border-radius: 4px;
+      box-shadow: 1px 1px 3px #AAA;
+      z-index: 11;
+    }
+    #results {
+      position: absolute;
+      width: 100%;
+      margin-top: 4px;
+      z-index: 10;
+      background-color: #F6F6F6;
+      border-radius: 4px;
+      box-shadow: 1px 1px 2px #888;
+    }
+    .hide {
+      display: none;
+    }
+    .resultItem {
+      text-decoration: none;
+      color: #333;
+      padding: .4em;
+      display: flex;
+      flex-direction: column;
+    }
+    .resultItem:hover {
+      background-color: #F0F0F0;
+    }
+    .resultItem .title {
+      color: #1756a9;
+      font-weight: 500;
+    }
+    .resultItem .datetime {
+      color: #666;
+      font-size: .8em;
+    }
+    .resultItem .excerpt {
+      font-size: .8em;
+    }
+    .resultItemActive {
+      background-color: #F0F0F0;
+    }
+  `;
+  connectedCallback() {
+    super.connectedCallback();
+    this.loadData();
+    document.addEventListener('click', (event) => {
+      if (!event.composedPath().includes(this) && this._open) {
+          this.toggle();
+      }
+  });
+    // Register arrow keys
+  }
+  toggle() {
+    this._open = !this._open;
+  }
+  openIfResults() {
+    if (this._results.length > 0) {
+      this._open = true;
+    }
+  }
+  close() {
+    this._open = false;
+  }
+  async loadData() {
+    const response = await fetch("/search.json");
+    const jsonData = await response.json();
+    jsonData.word2doc = new Map(Object.entries(jsonData.word2doc));
+    jsonData.bow = new Map(Object.entries(jsonData.bow));
+    jsonData.tfidf = new Map(Object.entries(jsonData.tfidf));
+    this._data = jsonData;
+  }
+  disconnectedCallback() {
+    super.disconnectedCallback();
+  }
+  search(event) {
+    if (event.key === "Escape") {
+      this.close();
+      return;
+    }
+    const query = event.target.value.toLowerCase().trim();
+    if (query === "") {
+      this._results = [];
+      this.close();
+      return;
+    }
+    // Split query into word-tokens
+    const tokens = query.split(" ");
+    // Find token ids for each token
+    let tokenIds = new Set();
+    for (const token of tokens) {
+      if (token === "") {
+        continue;
+      }
+      if (this._data.bow.has(token)) {
+        tokenIds.add(this._data.bow.get(token));
+      } else {
+        // If one of the tokens is not available, we can return immediately
+        // as there will be no results
+        this._results = [];
+        this.close();
+        return;
+      }
+    }
+    // Convert tokenIds to array
+    tokenIds = [...tokenIds];
+    // Initialize docs with first token
+    // Subsequent tokens need to intersect with this set
+    let docs = new Set(this._data.word2doc.get(tokenIds[0].toString()));
+    for (const tokenId of tokenIds.slice(1)) {
+      // Find document candidates
+      const docCandidates = new Set(this._data.word2doc.get(tokenId.toString()));
+      // console.log("intersection", docCandidates, docs);
+      docs = new Set([...docs].filter((x) => docCandidates.has(x)));
+    }
+    // Calculate TF-IDF
+    let results = new Map();
+    for (const doc of docs) {
+      let score = 0;
+      for (const tokenId of tokenIds) {
+        if (this._data.tfidf.has(`${tokenId},${doc}`)) {
+          score += this._data.tfidf.get(`${tokenId},${doc}`);
+        }
+      }
+      results.set(doc, score);
+    }
+    // Sort by score
+    const candidates = [...results.entries()].sort((a, b) => b[1] - a[1]).map((a) => a[0]);
+    // Get top n results
+    this._results = candidates.map((idx) => this._data.docs[idx]).slice(0, 8);
+    this._open = true;
+  }
+  placeholder() {
+    if (this._data && this._data.docs && this._data.docs.length > 0) {
+      let plural = "";
+      if (this._data.docs.length !== 1) {
+        plural = "s";
+      }
+      return "Search in " + this._data.docs.length + ` post${plural}...`;
+    } else {
+      return "Loading...";
+    }
+  }
+  render() {
+    return html`<div>
+      <input id="q" type="text" placeholder="${this.placeholder()}" @keyup="${this.search}" @click=${this.openIfResults}>
+      ${this._open ? html`
+        <div id="results">
+          ${this._results.map((result) => html`
+            <a class="resultItem" href="${result.url}">
+              <div>
+                <span class="title">${result.title}</span>
+                <span class="datetime">
+                  <relative-time datetime="${result.date}">
+                    ${result.date}
+                  </relative-time>
+                </span>
+              </div>
+              <div class="excerpt">
+                ${result.text}
+              </div>
+            </a>
+          `)}
+        </div>
+      ` : ""}
+    </div>`;
+  }
+}
+customElements.define('search-box', SearchBox);

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: jekyll_ranked_search
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - Friedrich Ewald
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-06-22 00:00:00.000000000 Z
+date: 2023-06-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: redcarpet
@@ -24,13 +24,14 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.6'
-description: Search for Jekyll posts using TF-IDF
+description: Offline search plugin for Jekyll posts using TF-IDF
 email: freddiemailster@gmail.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
 - lib/jekyll_ranked_search.rb
+- lib/search.js
 - lib/search.json
 - lib/stopwords.txt
 homepage: https://github.com/f-ewald/jekyll_ranked_search