RubyGems - jekyll_ranked_search - Versions diffs - 0.0.1 → 0.0.3 - Mend

jekyll_ranked_search 0.0.1 → 0.0.3

Files changed (4) hide show

checksums.yaml +4 -4
data/lib/jekyll_ranked_search.rb +28 -14
data/lib/search.js +265 -0
metadata +6 -5

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 78ba1763b6b2bca798b128851cd24a67c5f8f47b980d9201c8045d33b40834e1
-  data.tar.gz: 0babe4848299d150103574360b54117207858f6a2171cd46dfc4c48fb971e041
+  metadata.gz: 81e9541ca2a4139827dfb10a8ad23bb3f493f9465d95c8c0d8f18f55affeb10f
+  data.tar.gz: cc861e099846391ab537624f4a509e7bf97987c5f68851335a613fe8fbadb6e9
 SHA512:
-  metadata.gz: 729b789788706222be8f96680bd7a6d7839eccbd5eb66f04a1e8956e176e49c123c085ea2b52c2bdec32521904a788a1ea2f31a3b5443bf621a760e76960479a
-  data.tar.gz: 227866c90b5e664291d7a13a0da968e5e2c3b0c07972ec7ab20d574b0a0a25d9b1094b1202962c7b46f216d83e56b8646ad283b5af5c21574be84295180f4e88
+  metadata.gz: cb47470cf5b035428a768d83804dd92703e754e3776f85655094f136e02d1523e0995c884595caf02716a29fe8cf7f416fd05f4549b18ca121c13fbad0ec252e
+  data.tar.gz: 5258f44e86d40cc44666aae5841ffd7e50c7a51b5c60dcf7666300177b278ffff0c4e89f5b4a6b2d62c6117f21edea8f8a0ee75ed076a8def7b010a572472a1d

data/lib/jekyll_ranked_search.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require "set"
 require "redcarpet"
 require "redcarpet/render_strip"
@@ -46,13 +48,28 @@ class TfidfConverter < Jekyll::Generator
     # Create vocabulary
     docs.each_with_index do |post, idx|
       content = markdown.render(post.content)
+      # Tokenize content before applying any other transformations
+      tokenized = self.tokenize_words "#{post.data['title']} #{content}"
       # Replace newlines with wide spaces and bullet points
-      # TODO: Remove trailing bullet point
-      content.gsub!(/\n/, ' • ')
-      # TODO: Use first n words instead of characters
-      content = markdown.render(content)
-      content = content[..512]  # The first 512 characters of the post
+      divider = " • "
+      content.gsub!(/\n/, divider)
+      # Remove trailing divider
+      if content.end_with?(divider)
+        content = content[0..-4]
+      end
+      # Take first n words of post
+      n_words = 40
+      splitted_content = content.split(" ")
+      word_count = splitted_content.length
+      content = splitted_content[..n_words].join(" ")  # The first n words of the post
+      if word_count > n_words
+        content += "..."
+      end
       processed_docs.push({
         title: post.data['title'],
         url: post.url,
@@ -60,7 +77,6 @@ class TfidfConverter < Jekyll::Generator
         text: content,
       })
-      tokenized = self.tokenize_words "#{post.data['title']} #{content}"
       token_seen = false
       tokenized.each do |word|
         if !bow.include?(word)
@@ -94,8 +110,6 @@ class TfidfConverter < Jekyll::Generator
     tfidf = {}
     tf.each do |idx, freq|
       token_idx, doc_idx = idx.split(',').map { |i| i.to_i }
-      # puts "token idx: #{token_idx}"
-      # puts df
       _idf = Math.log(total_docs / df[token_idx] + 0.00001)
       # Exponential decay over time (boost newer posts)
@@ -109,17 +123,18 @@ class TfidfConverter < Jekyll::Generator
   end
   def tokenize_words(doc)
-    # TODO: Better tokenization
+    # Load the stopword list once and cache it for later calls
     @stopwords ||= self.load_stopwords
-    # replace_chars = /[-_:;@#,¿?¡!'"“”‘’`\/\(\)\[\]\{\}]/i
+    # Split document into tokens
     splitted_doc = doc.strip.downcase.split
-    splitted_doc.delete_if { |word| @stopwords.include?(word) }
+    # Remove stopwords in place
+    splitted_doc.delete_if { |word| @stopwords.include?(word) }
     # Remove special characters (anywhere in the word, not only at the edges)
-    splitted_doc.map! { |word| word.gsub(/[^a-z0-9\s]/i, '') }
+    splitted_doc.map! { |word| word.gsub(/[^a-z0-9_\/\-\s]/i, '') }
-    # splitted_doc.map! { |word| word.tr("@#!?.:;[]()", "") }
     splitted_doc
   end
@@ -153,5 +168,4 @@ class TfidfConverter < Jekyll::Generator
     end
     page
   end
 end

data/lib/search.js ADDED Viewed

@@ -0,0 +1,265 @@
+import {LitElement, html, css} from 'https://cdn.jsdelivr.net/gh/lit/dist@2/core/lit-core.min.js';
+import 'https://cdn.jsdelivr.net/npm/@github/relative-time-element';
+class SearchBox extends LitElement {
+  static properties = {
+    _isLoading: {state: true, type: Boolean},
+    _data: {state: false, type: Array},
+    _results : {state: true, type: Array},
+    _open: {state: true, type: Boolean},
+    _placeholder: {state: true, type: String},
+    // Lazy loading
+    lazy: {type: Boolean, attribute: true},
+  };
+  /**
+   * Constructor. Sets up default values.
+   */
+  constructor() {
+    super();
+    // No data initially loaded
+    this._data = [];
+    // Results are initially empty
+    this._results = [];
+    // Start in closed state, show no results
+    this._open = false;
+    // Default to not loading
+    this._isLoading = false;
+    // Disable lazy loading by default
+    this.lazy = false;
+    this._placeholder = "Search...";
+  }
+  static styles = css`
+    :host {
+      position: relative;
+      display: block;
+    }
+    input#q {
+      box-sizing: border-box;
+      width: 100%;
+      // margin: 0 auto;
+      padding: .4em;
+      border: 1px solid #ccc;
+      font-size: 1.2em;
+      border-radius: 4px;
+      box-shadow: 1px 1px 3px #AAA;
+      z-index: 11;
+    }
+    #results {
+      position: absolute;
+      width: 100%;
+      margin-top: 4px;
+      z-index: 10;
+      background-color: #F6F6F6;
+      border-radius: 4px;
+      box-shadow: 1px 1px 2px #888;
+    }
+    .hide {
+      display: none;
+    }
+    .resultItem {
+      text-decoration: none;
+      color: #333;
+      padding: .4em;
+      display: flex;
+      flex-direction: column;
+    }
+    .resultItem:hover {
+      background-color: #F0F0F0;
+    }
+    .resultItem .title {
+      color: #1756a9;
+      font-weight: 500;
+    }
+    .resultItem .datetime {
+      color: #666;
+      font-size: .8em;
+    }
+    .resultItem .excerpt {
+      font-size: .8em;
+    }
+    .resultItemActive {
+      background-color: #F0F0F0;
+    }
+  `;
+  connectedCallback() {
+    super.connectedCallback();
+    // Load data if lazy loading is disabled
+    if (!this.lazy) {
+      this.loadData();
+    }
+    document.addEventListener('click', (event) => {
+      if (!event.composedPath().includes(this) && this._open) {
+          this.toggle();
+      }
+    });
+  }
+  toggle() {
+    this._open = !this._open;
+  }
+  openIfResults() {
+    if (this._results.length > 0) {
+      this._open = true;
+    }
+  }
+  close() {
+    this._open = false;
+  }
+  async loadData() {
+    // Set state during loading
+    this._isLoading = true;
+    this.updatePlaceholder();
+    const response = await fetch("/search.json");
+    const jsonData = await response.json();
+    jsonData.word2doc = new Map(Object.entries(jsonData.word2doc));
+    jsonData.bow = new Map(Object.entries(jsonData.bow));
+    jsonData.tfidf = new Map(Object.entries(jsonData.tfidf));
+    this._data = jsonData;
+    // Cleanup state
+    this._isLoading = false;
+    this.updatePlaceholder();
+  }
+  disconnectedCallback() {
+    super.disconnectedCallback();
+  }
+  search(event) {
+    if (event.key === "Escape") {
+      this.close();
+      return;
+    }
+    const query = event.target.value.toLowerCase().trim();
+    if (query === "") {
+      this._results = [];
+      this.close();
+      return;
+    }
+    // Split query into word-tokens
+    const tokens = query.split(" ");
+    // Find token ids for each token
+    let tokenIds = new Set();
+    for (const token of tokens) {
+      if (token === "") {
+        continue;
+      }
+      if (this._data.bow.has(token)) {
+        tokenIds.add(this._data.bow.get(token));
+      } else {
+        // If one of the tokens is not available, we can return immediately
+        // as there will be no results
+        this._results = [];
+        this.close();
+        return;
+      }
+    }
+    // Convert tokenIds to array
+    tokenIds = [...tokenIds];
+    // Initialize docs with first token
+    // Subsequent tokens need to intersect with this set
+    let docs = new Set(this._data.word2doc.get(tokenIds[0].toString()));
+    for (const tokenId of tokenIds.slice(1)) {
+      // Find document candidates
+      const docCandidates = new Set(this._data.word2doc.get(tokenId.toString()));
+      docs = new Set([...docs].filter((x) => docCandidates.has(x)));
+    }
+    // Calculate TF-IDF
+    let results = new Map();
+    for (const doc of docs) {
+      let score = 0;
+      for (const tokenId of tokenIds) {
+        if (this._data.tfidf.has(`${tokenId},${doc}`)) {
+          score += this._data.tfidf.get(`${tokenId},${doc}`);
+        }
+      }
+      results.set(doc, score);
+    }
+    // Sort by score
+    const candidates = [...results.entries()].sort((a, b) => b[1] - a[1]).map((a) => a[0]);
+    // Get top n results
+    this._results = candidates.map((idx) => this._data.docs[idx]).slice(0, 8);
+    this._open = true;
+  }
+  updatePlaceholder() {
+    if (this._isLoading) {
+      this._placeholder = "Loading...";
+      return;
+    }
+    if (this._data && this._data.docs && this._data.docs.length > 0) {
+      let plural = "";
+      if (this._data.docs.length !== 1) {
+        plural = "s";
+      }
+      this._placeholder = "Search in " + this._data.docs.length + ` post${plural}...`;
+      return;
+    }
+  }
+  /**
+   * Event triggered on search box focus.
+   */
+  focus(_) {
+    if (this.lazy && this._data.length === 0 && !this._isLoading) {
+      this.loadData();
+    }
+  }
+  render() {
+    return html`<div>
+      <input id="q" type="text" placeholder="${this._placeholder}" @keyup="${this.search}" @click=${this.openIfResults} @focus=${this.focus}>
+      ${this._open ? html`
+        <div id="results">
+          ${this._results.map((result) => html`
+            <a class="resultItem" href="${result.url}">
+              <div>
+                <span class="title">${result.title}</span>
+                <span class="datetime">
+                  <relative-time datetime="${result.date}">
+                    ${result.date}
+                  </relative-time>
+                </span>
+              </div>
+              <div class="excerpt">
+                ${result.text}
+              </div>
+            </a>
+          `)}
+        </div>
+      ` : ""}
+    </div>`;
+  }
+}
+customElements.define('search-box', SearchBox);

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: jekyll_ranked_search
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.3
 platform: ruby
 authors:
 - Friedrich Ewald
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-06-22 00:00:00.000000000 Z
+date: 2023-06-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: redcarpet
@@ -24,13 +24,14 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.6'
-description: Search for Jekyll posts using TF-IDF
+description: Offline search plugin for Jekyll posts using TF-IDF
 email: freddiemailster@gmail.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
 - lib/jekyll_ranked_search.rb
+- lib/search.js
 - lib/search.json
 - lib/stopwords.txt
 homepage: https://github.com/f-ewald/jekyll_ranked_search
@@ -45,7 +46,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 2.5.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
@@ -55,5 +56,5 @@ requirements: []
 rubygems_version: 3.4.13
 signing_key:
 specification_version: 4
-summary: TF-IDF search for Jekyll posts
+summary: TF-IDF offline search for Jekyll posts
 test_files: []