RubyGems - algoliasearch-jekyll - Versions diffs - 0.2.0 → 0.2.1 - Mend

algoliasearch-jekyll 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f1af115b167749491e0cfc3fbdfca8f7cbd0bfb1
-  data.tar.gz: 6cdc40cf3148a33400178ba9bb9076095adebd31
+  metadata.gz: 3ab982386891076f076e8a420e4f7bce3ae6c4c3
+  data.tar.gz: 3ee4daebb86e545421496ee3b7d1badd1e7484ed
 SHA512:
-  metadata.gz: 371291f704b4029819eb5dbb59de2e3ac2ac90c8973ae678fb1890824f3c9b4470782b1bfe8b7cbb528a374e80eeb4e274440c53a5561efc5bb3d703d4e19ead
-  data.tar.gz: c508df8e04d78ae5db324ef678cb27ae46f5e067030a45a373dc399f25eca244263ba3bddf998e35b965fed31fb374d1e565014e6fe7a06631f4e4831811b097
+  metadata.gz: 5a52c45e78b71da6d978fc8550c18350002a039b1433c5ece654992a3bcf508eadd7435dea2d977db359c2118f03fcaf2b02c2ef4da05db34cdb9c8c3c7af5e8
+  data.tar.gz: 127b1475369e4196e81eaac262da037e41cecfc9e2df38ea49dfa80aeebbe4300cd84865362891808152b8838b81b35cf7533c86809af02909a3af452a5a358a

data/lib/push.rb CHANGED Viewed

@@ -41,9 +41,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
       # Exclude files manually excluded from config
       excluded_files = @config['algolia']['excluded_files']
-      unless excluded_files.nil?
-        return false if excluded_files.include?(file.name)
-      end
+      return false if excluded_files && excluded_files.include?(file.name)
       true
     end
@@ -60,6 +58,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
           new_items = AlgoliaSearchRecordExtractor.new(file).extract
           next if new_items.nil?
           items += new_items
         end
         AlgoliaSearchJekyllPush.push(items)
@@ -96,7 +95,7 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
         exit 1
       end
-      unless @config['algolia']['application_id']
+      unless @config['algolia'] && @config['algolia']['application_id']
         Jekyll.logger.error 'Algolia Error: No application ID defined'
         Jekyll.logger.warn '  Please set your application id in the '\
                            '_config.yml file, like so:'
@@ -124,13 +123,12 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
         Jekyll.logger.warn '    https://www.algolia.com/explorer'
         exit 1
       end
-      true
+      nil
     end
     # Get index settings
     def configure_index(index)
       settings = {
-        typoTolerance: true,
         distinct: true,
         attributeForDistinct: 'title',
         attributesForFaceting: %w(tags type title),
@@ -149,18 +147,14 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
           css_selector
           css_selector_parent
         ),
-        customRanking: ['desc(posted_at)', 'desc(title_weight)'],
+        customRanking: ['desc(posted_at)', 'desc(weight)'],
         highlightPreTag: '<span class="algolia__result-highlight">',
         highlightPostTag: '</span>'
       }
       # Merge default settings with user custom ones
-      if @config['algolia'].key?('settings')
-        custom_settings = {}
-        @config['algolia']['settings'].each do |key, value|
-          custom_settings[key.to_sym] = value
-        end
-        settings.merge!(custom_settings)
+      (@config['algolia']['settings'] || []).each do |key, value|
+        settings[key.to_sym] = value
       end
       index.set_settings(settings)

data/lib/record_extractor.rb ADDED Viewed

@@ -0,0 +1,184 @@
+require 'algoliasearch'
+require 'nokogiri'
+require 'json'
+# Given an HTML file as input, will return an array of records to index
+class AlgoliaSearchRecordExtractor
+  def initialize(file)
+    @file = file
+    default_config = {
+      'record_css_selector' => 'p'
+    }
+    @config = default_config.merge(file.site.config['algolia'])
+  end
+  # Hook to modify a record after extracting
+  def custom_hook_each(item, _node)
+    item
+  end
+  # Hook to modify all records after extracting
+  def custom_hook_all(items)
+    items
+  end
+  # Returns metadata from the current file
+  def metadata
+    return metadata_page if @file.is_a?(Jekyll::Page)
+    return metadata_post if @file.is_a?(Jekyll::Post)
+    {}
+  end
+  # Extract a list of tags
+  def tags
+    return nil unless @file.respond_to? :tags
+    # Some plugins will extend the tags from simple strings to full featured
+    # objects. We'll simply call .to_s to always have a string
+    @file.tags.map(&:to_s)
+  end
+  # Extract metadata from a post
+  def metadata_post
+    {
+      type: 'post',
+      url: @file.url,
+      title: @file.title,
+      slug: @file.slug,
+      posted_at: @file.date.to_time.to_i,
+      tags: tags
+    }
+  end
+  # Extract metadata from a page
+  def metadata_page
+    {
+      type: 'page',
+      url: @file.url,
+      title: @file['title'],
+      slug: @file.basename
+    }
+  end
+  # Get the list of all HTML nodes to index
+  def html_nodes
+    document = Nokogiri::HTML(@file.content)
+    document.css(@config['record_css_selector'])
+  end
+  # Get the closest heading parent
+  def node_heading_parent(node, level = 'h7')
+    headings = %w(h1 h2 h3 h4 h5 h6)
+    # If initially called on a heading, we must not accept headings of the same
+    # or a lower level, only stronger (higher-level) ones
+    level = node.name if level == 'h7' && headings.include?(node.name)
+    previous = node.previous_element
+    # No previous element, we go up to the parent
+    unless previous
+      parent = node.parent
+      # No more parent, then no heading found
+      return nil if parent.name == 'body'
+      return node_heading_parent(parent, level)
+    end
+    # This is a heading, we return it
+    return previous if headings.include?(previous.name) && previous.name < level
+    node_heading_parent(previous, level)
+  end
+  # Get all the parent headings of the specified node
+  def node_hierarchy(node, memo = { level: 7 })
+    previous = node_heading_parent(node)
+    # No previous heading, we can stop the recursion
+    unless previous
+      memo.delete(:level)
+      return memo
+    end
+    tag_name = previous.name
+    level = tag_name.gsub('h', '').to_i
+    content = previous.content
+    # Skip if the item already has a title of a higher level
+    return node_hierarchy(previous, memo) if level >= memo[:level]
+    memo[:level] = level
+    # Add to the memo and continue
+    memo[tag_name.to_sym] = content
+    node_hierarchy(previous, memo)
+  end
+  # Return the raw HTML of the element to index
+  def node_raw_html(node)
+    node.to_s
+  end
+  # Return the text of the element, sanitized to be displayed
+  def node_text(node)
+    node.content.gsub('<', '&lt;').gsub('>', '&gt;')
+  end
+  # Returns a unique string of hierarchy from title to h6, used for distinct
+  def unique_hierarchy(data)
+    headings = %w(title h1 h2 h3 h4 h5 h6)
+    headings.map { |heading| data[heading.to_sym] }.compact.join(' > ')
+  end
+  # Returns a CSS selector for the node: its id selector when an id is set,
+  # otherwise its Nokogiri CSS path with the leading 'html > body > ' removed
+  def node_css_selector(node)
+    return nil if node.nil?
+    # Use the CSS id if one is set
+    return "##{node['id']}" if node['id']
+    # Default Nokogiri selector
+    node.css_path.gsub('html > body > ', '')
+  end
+  # Returns a custom numeric value representing how relevant to its hierarchy
+  # this record is. This value can be used in the custom ranking to display more
+  # relevant records first.
+  def weight(data)
+    # Get list of unique words in headings
+    title_words = %i(title h1 h2 h3 h4 h5 h6)
+                  .select { |title| data.key?(title) }
+                  .map { |title| data[title].split(/\W+/) }
+                  .flatten
+                  .compact
+                  .map(&:downcase)
+                  .uniq
+    # Intersect words in headings with words in text
+    text_words = data[:text].downcase.split(/\W+/)
+    (title_words & text_words).size
+  end
+  def extract
+    items = []
+    html_nodes.each_with_index do |node, index|
+      next unless node.text.size > 0
+      item = metadata.clone
+      item[:objectID] = "#{item[:slug]}_#{index}"
+      item.merge!(node_hierarchy(node))
+      item[:tag_name] = node.name
+      item[:raw_html] = node_raw_html(node)
+      item[:text] = node_text(node)
+      item[:unique_hierarchy] = unique_hierarchy(item)
+      item[:css_selector] = node_css_selector(node)
+      item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
+      item[:weight] = weight(item)
+      # We pass item through the user defined custom hook
+      item = custom_hook_each(item, node)
+      next if item.nil?
+      items << item
+    end
+    custom_hook_all(items)
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: algoliasearch-jekyll
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Tim Carry
@@ -117,6 +117,7 @@ extra_rdoc_files: []
 files:
 - lib/algoliasearch-jekyll.rb
 - lib/push.rb
+- lib/record_extractor.rb
 homepage: https://github.com/algolia/algoliasearch-jekyll
 licenses:
 - MIT