RubyGems - algoliasearch-jekyll - Versions diffs - 0.9.1 → 1.0.0.beta.pre.1 - Mend

algoliasearch-jekyll 0.9.1 → 1.0.0.beta.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

checksums.yaml +4 -4
data/.travis.yml +3 -4
data/CONTRIBUTING.md +8 -1
data/Gemfile +4 -5
data/README.md +318 -11
data/Rakefile +7 -12
data/algoliasearch-jekyll.gemspec +66 -62
data/gemfiles/jekyll_v2.gemfile +3 -3
data/gemfiles/jekyll_v3.gemfile +4 -4
data/gemfiles/jekyll_v3_1_3.gemfile +24 -0
data/gemfiles/jekyll_v3_1_6.gemfile +24 -0
data/lib/algoliasearch-jekyll.rb +1 -3
data/lib/credential_checker.rb +2 -1
data/lib/error_handler.rb +6 -0
data/lib/push.rb +81 -19
data/lib/record_extractor.rb +120 -140
data/lib/utils.rb +13 -0
data/lib/version.rb +1 -1
data/scripts/release +13 -12
data/scripts/test_v3 +1 -1
data/scripts/watch +4 -0
data/spec/error_handler_spec.rb +17 -0
data/spec/fixtures/jekyll_version_2/404.html +8 -0
data/spec/fixtures/jekyll_version_2/404.md +9 -0
data/spec/fixtures/jekyll_version_2/_my-collection/collection-item.md +3 -0
data/spec/fixtures/jekyll_version_2/_posts/2015-07-02-test-post.md +1 -1
data/spec/fixtures/jekyll_version_2/about.md +3 -0
data/spec/fixtures/jekyll_version_2/front_matter.md +15 -0
data/spec/fixtures/jekyll_version_2/index.html +3 -1
data/spec/fixtures/jekyll_version_2/only-divs.md +15 -0
data/spec/fixtures/jekyll_version_2/only-paragraphs.md +15 -0
data/spec/fixtures/jekyll_version_3/404.html +8 -0
data/spec/fixtures/jekyll_version_3/404.md +9 -0
data/spec/fixtures/jekyll_version_3/_config.yml +1 -1
data/spec/fixtures/jekyll_version_3/_my-collection/collection-item.md +3 -0
data/spec/fixtures/jekyll_version_3/_posts/2015-07-02-test-post.md +1 -1
data/spec/fixtures/jekyll_version_3/about.md +3 -0
data/spec/fixtures/jekyll_version_3/front_matter.md +15 -0
data/spec/fixtures/jekyll_version_3/index.html +4 -1
data/spec/fixtures/jekyll_version_3/only-divs.md +15 -0
data/spec/fixtures/jekyll_version_3/only-paragraphs.md +15 -0
data/spec/push_spec.rb +211 -8
data/spec/record_extractor_spec.rb +296 -358
data/spec/spec_helper.rb +32 -11
data/txt/record_too_big +19 -0
metadata +40 -51
data/scripts/watch +0 -1

data/gemfiles/jekyll_v2.gemfile CHANGED

@@ -5,8 +5,9 @@ source "http://rubygems.org"
 gem "algoliasearch", "~> 1.4"
 gem "appraisal", "~> 2.1.0"
 gem "awesome_print", "~> 1.6"
-gem "json", ">= 1.8.6"
-gem "nokogiri", '~> 1.7', '>= 1.7.2'
+gem "json", "~> 1.8"
+gem "nokogiri", "~> 1.6"
+gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
 gem "verbal_expressions", "~> 0.1.5"
 gem "jekyll", "~> 2.5"
@@ -19,5 +20,4 @@ group :development do
   gem "rspec", "~> 3.0"
   gem "rubocop", "~> 0.31"
   gem "simplecov", "~> 0.10"
-  gem "rack", "< 2"
 end

data/gemfiles/jekyll_v3.gemfile CHANGED

@@ -5,10 +5,11 @@ source "http://rubygems.org"
 gem "algoliasearch", "~> 1.4"
 gem "appraisal", "~> 2.1.0"
 gem "awesome_print", "~> 1.6"
-gem "json", ">= 1.8.6"
-gem "nokogiri", '~> 1.7', '>= 1.7.2'
+gem "json", "~> 1.8"
+gem "nokogiri", "~> 1.6"
+gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
 gem "verbal_expressions", "~> 0.1.5"
-gem "jekyll", "~> 3.0"
+gem "jekyll", "3.1.6"
 gem "jekyll-paginate", "~> 1.1.0"
 group :development do
@@ -20,5 +21,4 @@ group :development do
   gem "rspec", "~> 3.0"
   gem "rubocop", "~> 0.31"
   gem "simplecov", "~> 0.10"
-  gem "rack", "< 2"
 end

data/gemfiles/jekyll_v3_1_3.gemfile ADDED

@@ -0,0 +1,24 @@
+# This file was generated by Appraisal
+source "http://rubygems.org"
+gem "algoliasearch", "~> 1.4"
+gem "appraisal", "~> 2.1.0"
+gem "awesome_print", "~> 1.6"
+gem "json", "~> 1.8"
+gem "nokogiri", "~> 1.6"
+gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
+gem "verbal_expressions", "~> 0.1.5"
+gem "jekyll", "3.1.3"
+gem "jekyll-paginate", "~> 1.1.0"
+group :development do
+  gem "coveralls", "~> 0.8"
+  gem "flay", "~> 2.6"
+  gem "flog", "~> 4.3"
+  gem "guard-rspec", "~> 4.6"
+  gem "jeweler", "~> 2.0"
+  gem "rspec", "~> 3.0"
+  gem "rubocop", "~> 0.31"
+  gem "simplecov", "~> 0.10"
+end

data/gemfiles/jekyll_v3_1_6.gemfile ADDED

@@ -0,0 +1,24 @@
+# This file was generated by Appraisal
+source "http://rubygems.org"
+gem "algoliasearch", "~> 1.4"
+gem "appraisal", "~> 2.1.0"
+gem "awesome_print", "~> 1.6"
+gem "json", "~> 1.8"
+gem "nokogiri", "~> 1.6"
+gem "html-hierarchy-extractor", :path => "../../html-hierarchy-extractor/"
+gem "verbal_expressions", "~> 0.1.5"
+gem "jekyll", "3.1.6"
+gem "jekyll-paginate", "~> 1.1.0"
+group :development do
+  gem "coveralls", "~> 0.8"
+  gem "flay", "~> 2.6"
+  gem "flog", "~> 4.3"
+  gem "guard-rspec", "~> 4.6"
+  gem "jeweler", "~> 2.0"
+  gem "rspec", "~> 3.0"
+  gem "rubocop", "~> 0.31"
+  gem "simplecov", "~> 0.10"
+end

data/lib/algoliasearch-jekyll.rb CHANGED

@@ -1,12 +1,10 @@
 require 'rubygems'
 require 'bundler/setup'
 require 'awesome_print'
 require_relative './version'
 require_relative './push'
-# `jekyll algolia` main entry
+# Registering the `jekyll algolia push` command
 class AlgoliaSearchJekyll < Jekyll::Command
   class << self
     def init_with_program(prog)

data/lib/credential_checker.rb CHANGED

@@ -3,7 +3,8 @@ require 'nokogiri'
 require 'json'
 require_relative './error_handler.rb'
-# Given an HTML file as input, will return an array of records to index
+# Will check that all the needed credentials are correctly given by the user
+# before starting any push process
 class AlgoliaSearchCredentialChecker
   attr_accessor :config, :logger

data/lib/error_handler.rb CHANGED

@@ -82,6 +82,12 @@ class AlgoliaSearchErrorHandler
       return 'check_key_acl_to_tmp_index'
     end
+    # Pushed record is above the 10KB limit
+    if error['http_error'] == 400 &&
+       error['json']['message'] =~ /^Record is too big/
+      return 'record_too_big'
+    end
     false
   end
 end

data/lib/push.rb CHANGED

@@ -1,12 +1,12 @@
 require 'algoliasearch'
-require 'nokogiri'
 require 'json'
+require 'nokogiri'
 require_relative './version'
 require_relative './record_extractor'
 require_relative './credential_checker'
 require_relative './error_handler'
-# `jekyll algolia push` command
+# `jekyll algolia push` main command
 class AlgoliaSearchJekyllPush < Jekyll::Command
   class << self
     attr_accessor :options, :config
@@ -22,30 +22,42 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
       @args = args
       @options = options
       @config = config
+      @checker = AlgoliaSearchCredentialChecker.new(@config)
       @is_verbose = @config['verbose']
       @is_dry_run = @config['dry_run']
+      @is_lazy_update = lazy_update?
       self
     end
+    # Check if the lazy update feature is enabled or not (default to false)
+    def lazy_update?
+      return false unless @config['algolia']
+      return true if @config['algolia']['lazy_update']
+      false
+    end
     # Check if the specified file should be indexed (we exclude static files,
     # robots.txt and custom defined exclusions).
     def indexable?(file)
+      # Excluding all static assets (images, fonts, etc)
       return false if file.is_a?(Jekyll::StaticFile)
-      basename = File.basename(file.path)
-      extname = File.extname(basename)[1..-1]
-      # Keep only markdown and html files
+      # Jekyll auto-converts markdown to HTML, so if the file is neither
+      # markdown or HTML, we should probably not index it
       allowed_extensions = %w(html)
       if @config['markdown_ext']
         allowed_extensions += @config['markdown_ext'].split(',')
       end
-      if @config['algolia']
-        allowed_extensions += (@config['algolia']['allowed_extensions'] || [])
-      end
-      return false unless allowed_extensions.include?(extname)
+      extname = File.extname(File.basename(file.path))
+      return false unless allowed_extensions.include?(extname[1..-1])
+      # We should not index GitHub pages 404 pages
+      # https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
+      basename_no_ext = File.basename(file.path, extname)
+      return false if basename_no_ext == '404'
+      # Users can also define their own blacklist and hooks to exclude files
       return false if excluded_file?(file)
       true
@@ -53,14 +65,17 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
     # Check if the file is in the list of excluded files
     def excluded_file?(file)
+      # Blacklist of pages generated by Jekyll that we know should not be
+      # indexing
       excluded = [
-        %r{^page([0-9]*)/index\.html}
+        /^index\.html$/, # Index page
+        %r{^page([0-9]*)/index\.html} # Pagination pages
       ]
+      # User-provided blacklist
       if @config['algolia']
         excluded += (@config['algolia']['excluded_files'] || [])
       end
-      # Exclude files explicitly excluded in _config
       excluded.each do |pattern|
         pattern = /#{Regexp.quote(pattern)}/ if pattern.is_a? String
         return true if file.path =~ pattern
@@ -88,14 +103,17 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
         items = []
         is_verbose = config['verbose']
         each_site_file do |file|
+          # Skip files that should not be indexed
           next unless AlgoliaSearchJekyllPush.indexable?(file)
           Jekyll.logger.info "Extracting data from #{file.path}" if is_verbose
           new_items = AlgoliaSearchRecordExtractor.new(file).extract
           next if new_items.nil?
           ap new_items if is_verbose
           items += new_items
         end
         AlgoliaSearchJekyllPush.push(items)
       end
@@ -178,14 +196,11 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
       end
     end
-    def push(items)
-      checker = AlgoliaSearchCredentialChecker.new(@config)
-      checker.assert_valid
-      Jekyll.logger.info '=== DRY RUN ===' if @is_dry_run
+    # Greedy update will push all the records to a temporary index, then
+    # override the existing index with this temp one
+    def greedy_update(items)
       # Add items to a temp index, then rename it
-      index_name = checker.index_name
+      index_name = @checker.index_name
       index_name_tmp = "#{index_name}_tmp"
       batch_add_items(items, create_index(index_name_tmp))
       Algolia.move_index(index_name_tmp, index_name) unless @is_dry_run
@@ -193,5 +208,52 @@ class AlgoliaSearchJekyllPush < Jekyll::Command
       Jekyll.logger.info "Indexing of #{items.size} items " \
                          "in #{index_name} done."
     end
+    # Lazy update will minimize the number of operations by only pushing new
+    # data and deleting old data
+    def lazy_update(items)
+      index = create_index(@checker.index_name)
+      remote = remote_ids(index)
+      local = items.map { |item| item[:objectID] }
+      delete_remote_not_in_local(index, local, remote)
+      add_local_not_in_remote(index, items, local, remote)
+    end
+    # Array of all objectID in the remote index
+    def remote_ids(index)
+      list = []
+      index.browse(attributesToRetrieve: 'objectID') do |hit|
+        list << hit['objectID']
+      end
+      list
+    end
+    # Delete all remote items that are no longer in the local items
+    def delete_remote_not_in_local(index, local, remote)
+      list = remote - local
+      Jekyll.logger.info "Deleting #{list.size} items"
+      index.delete_objects!(list) unless list.empty?
+    end
+    # Push all local items that are not yet in the index
+    def add_local_not_in_remote(index, items, local, remote)
+      list = local - remote
+      return Jekyll.logger.info "Adding #{list.size} items" if list.empty?
+      items_to_push = items.select do |item|
+        list.include?(item[:objectID])
+      end
+      batch_add_items(items_to_push, index)
+    end
+    def push(items)
+      checker = AlgoliaSearchCredentialChecker.new(@config)
+      checker.assert_valid
+      Jekyll.logger.info '=== DRY RUN ===' if @is_dry_run
+      @is_lazy_update ? lazy_update(items) : greedy_update(items)
+    end
   end
 end

data/lib/record_extractor.rb CHANGED

@@ -1,6 +1,8 @@
 require 'algoliasearch'
 require 'nokogiri'
 require 'json'
+require 'html-hierarchy-extractor'
+require_relative './utils'
 # Given an HTML file as input, will return an array of records to index
 class AlgoliaSearchRecordExtractor
@@ -25,189 +27,167 @@ class AlgoliaSearchRecordExtractor
     items
   end
-  # Returns metadata from the current file
-  def metadata
-    metadata = {}
-    @file.data.each { |key, value| metadata[key.to_sym] = value }
+  ##
+  # Return the type of the Jekyll element
+  # It can be either page, post or document
+  def type
+    classname = @file.class.name
+    subclass = classname.split('::')[1]
+    type = subclass.downcase
-    metadata[:type] = @file.class.name.split('::')[1].downcase
-    metadata[:url] = @file.url
+    # In Jekyll v2, Page, Post and Document have their own class
+    return type if AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
-    metadata[:slug] = slug
+    # In Jekyll v3, Post are actually a specific type of Documents
+    if type == 'document'
+      collection_name = @file.collection.label
+      return 'post' if collection_name == 'posts'
+    end
-    metadata[:posted_at] = @file.date.to_time.to_i if @file.respond_to? :date
-    metadata[:tags] = tags
+    type
+  end
-    metadata
+  ##
+  # Return the url of the page
+  def url
+    @file.url
   end
+  ##
+  # Return the title of the page
+  def title
+    @file.data['title']
+  end
+  ##
   # Returns the slug of the document
   def slug
-    # Jekyll v3 has it in data
-    return @file.data['slug'] if @file.data.key?('slug')
-    # Old Jekyll v2 has it at the root
-    return @file.slug if @file.respond_to? :slug
-    # Otherwise, we guess it from the filename
+    # We can guess the slug from the filename for all documents
     basename = File.basename(@file.path)
     extname = File.extname(basename)
-    File.basename(basename, extname)
-  end
-  # Extract a list of tags
-  def tags
-    tags = nil
+    slug = File.basename(basename, extname)
-    # Jekyll v3 has it in data, while v2 have it at the root
-    if @file.data.key?('tags')
-      tags = @file.data['tags']
-    elsif @file.respond_to? :tags
-      tags = @file.tags
-    end
-    return tags if tags.nil?
+    # Jekyll v3 posts have it in data
+    return @file.data['slug'] if @file.data.key?('slug')
-    # Anyway, we force cast it to string as some plugins will extend the tags to
-    # full featured objects
-    tags.map(&:to_s)
-  end
+    # Jekyll v2 posts have a specific slug method
+    return @file.slug if @file.respond_to?(:slug)
-  # Get the list of all HTML nodes to index
-  def html_nodes
-    document = Nokogiri::HTML(@file.content)
-    document.css(@config['record_css_selector'])
+    slug
   end
-  # Check if node is a heading
-  def node_heading?(node)
-    %w(h1 h2 h3 h4 h5 h6).include?(node.name)
-  end
+  ##
+  # Get an array of tags of the document
+  def tags
+    tags = []
-  # Get the closest heading parent
-  def node_heading_parent(node, level = 'h7')
-    # If initially called on a heading, we only accept stronger headings
-    level = node.name if level == 'h7' && node_heading?(node)
+    is_v2 = AlgoliaSearchUtils.restrict_jekyll_version(less_than: '3.0')
+    is_v3 = AlgoliaSearchUtils.restrict_jekyll_version(more_than: '3.0')
+    has_tags_method = @file.respond_to?(:tags)
+    has_tags_data = @file.data.key?('tags')
-    previous = node.previous_element
+    # Starting from Jekyll v3, all tags are in data['tags']
+    tags = @file.data['tags'] if is_v3 && has_tags_data
-    # No previous element, we go up to the parent
-    unless previous
-      parent = node.parent
-      # No more parent, then no heading found
-      return nil if parent.name == 'body'
-      return node_heading_parent(parent, level)
+    # In Jekyll v2, tags are in data['tags'], or in .tags
+    if is_v2
+      tags = @file.tags if has_tags_method
+      tags = @file.data['tags'] if tags.empty? && has_tags_data
     end
-    # This is a heading, we return it
-    return previous if node_heading?(previous) && previous.name < level
-    node_heading_parent(previous, level)
+    # Some extension extends the tags with custom classes, so we make sure we
+    # cast them as strings
+    tags.map(&:to_s)
   end
-  # Get all the parent headings of the specified node
-  # If the node itself is a heading, we include it
-  def node_hierarchy(node, state = { level: 7 })
-    tag_name = node.name
-    level = tag_name.delete('h').to_i
+  ##
+  # Get the post date timestamp
+  def date
+    return nil unless @file.respond_to?(:date)
-    if node_heading?(node) && level < state[:level]
-      state[tag_name.to_sym] = node_text(node)
-      state[:level] = level
-    end
-    heading = node_heading_parent(node)
-    # No previous heading, we can stop the recursion
-    unless heading
-      state.delete(:level)
-      return state
-    end
-    node_hierarchy(heading, state)
+    @file.date.to_time.to_i
   end
-  # Return the raw HTML of the element to index
-  def node_raw_html(node)
-    node.to_s
-  end
+  ##
+  # Get the collection name of a document
+  def collection
+    return nil unless @file.respond_to?(:collection)
-  # Return the text of the element, sanitized to be displayed
-  def node_text(node)
-    node.content.gsub('<', '&lt;').gsub('>', '&gt;')
-  end
+    collection_name = @file.collection.label
-  # Returns a unique string of hierarchy from title to h6, used for distinct
-  def unique_hierarchy(data)
-    headings = %w(title h1 h2 h3 h4 h5 h6)
-    headings.map { |heading| data[heading.to_sym] }.compact.join(' > ')
+    # In Jekyll v3, posts are actually a collection
+    return nil if collection_name == 'posts'
+    collection_name
   end
-  # Returns a hash of two CSS selectors. One for the node itself, and one its
-  # closest heading parent
-  def node_css_selector(node)
-    return nil if node.nil?
+  ##
+  # Get a hash of all front-matter data
+  def front_matter
+    raw_data = @file.data
-    # Use the CSS id if one is set
-    return "##{node['id']}" if node['id']
-    # Default Nokogiri selector
-    node.css_path.gsub('html > body > ', '')
-  end
+    # We clean some keys that will be handled by specific methods
+    attributes_to_remove = %w(title tags slug url date type)
+    attributes_to_remove.each do |attribute|
+      raw_data.delete(attribute)
+    end
-  # The more words are in common between this node and its parent heading, the
-  # higher the score
-  def weight_heading_relevance(data)
-    # Get list of unique words in headings
-    title_words = %i(title h1 h2 h3 h4 h5 h6)
-                  .select { |title| data.key?(title) }
-                  .map { |title| data[title].to_s.split(/\W+/) }
-                  .flatten
-                  .compact
-                  .map(&:downcase)
-                  .uniq
-    # Intersect words in headings with words in test
-    text_words = data[:text].downcase.split(/\W+/)
-    (title_words & text_words).size
-  end
+    # Convert to symbols
+    data = {}
+    raw_data.each do |key, value|
+      data[key.to_sym] = value
+    end
-  # Returns a weight based on the tag_name
-  def weight_tag_name(item)
-    tag_name = item[:tag_name]
-    # No a heading, no weight
-    return 0 unless %w(h1 h2 h3 h4 h5 h6).include?(tag_name)
-    # h1: 100, h2: 90, ..., h6: 50
-    100 - (tag_name.delete('h').to_i - 1) * 10
+    data
   end
-  # Returns an object of all weights
-  def weight(item, index)
-    {
-      tag_name: weight_tag_name(item),
-      heading_relevance: weight_heading_relevance(item),
-      position: index
+  ##
+  # Get the list of all node data
+  def hierarchy_nodes
+    extractor_options = {
+      css_selector: @config['record_css_selector']
     }
+    HTMLHierarchyExtractor.new(
+      @file.content,
+      options: extractor_options
+    ).extract
   end
+  # Extract all records from the page and return the list
   def extract
+    # Getting all hierarchical nodes from the HTML input
+    raw_items = hierarchy_nodes
+    # Shared attributes relative to the page that all records will have
+    shared_attributes = {
+      type: type,
+      url: url,
+      title: title,
+      slug: slug,
+      date: date,
+      collection: collection,
+      tags: tags
+    }
+    # Remove empty attributes
+    shared_attributes = shared_attributes.delete_if do |_, value|
+      value.nil?
+    end
+    # Enriching with page metadata
     items = []
-    html_nodes.each_with_index do |node, index|
-      next if node.text.empty?
-      item = metadata.clone
-      item.merge!(node_hierarchy(node))
-      item[:tag_name] = node.name
-      item[:raw_html] = node_raw_html(node)
-      item[:text] = node_text(node)
-      item[:unique_hierarchy] = unique_hierarchy(item)
-      item[:css_selector] = node_css_selector(node)
-      item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
-      item[:weight] = weight(item, index)
-      # We pass item through the user defined custom hook
-      item = custom_hook_each(item, node)
+    raw_items.each do |raw_item|
+      nokogiri_node = raw_item[:node]
+      raw_item.delete(:node)
+      item = shared_attributes.merge(raw_item)
+      item[:objectID] = item[:uuid]
+      item.delete(:uuid)
+      item = custom_hook_each(item, nokogiri_node)
       next if item.nil?
       items << item
     end
     custom_hook_all(items)
   end
 end