RubyGems - jekyll-algolia - Versions diffs - 0.0.0 - Mend

jekyll-algolia 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +7 -0
data/CONTRIBUTING.md +94 -0
data/README.md +99 -0
data/errors/invalid_credentials.txt +10 -0
data/errors/invalid_credentials_for_tmp_index.txt +17 -0
data/errors/invalid_index_name.txt +11 -0
data/errors/missing_api_key.txt +17 -0
data/errors/missing_application_id.txt +12 -0
data/errors/missing_index_name.txt +19 -0
data/errors/no_records_found.txt +20 -0
data/errors/record_too_big.txt +25 -0
data/errors/unknown_application_id.txt +20 -0
data/errors/unknown_settings.txt +15 -0
data/lib/jekyll-algolia.rb +107 -0
data/lib/jekyll/algolia/configurator.rb +202 -0
data/lib/jekyll/algolia/error_handler.rb +270 -0
data/lib/jekyll/algolia/extractor.rb +64 -0
data/lib/jekyll/algolia/file_browser.rb +269 -0
data/lib/jekyll/algolia/hooks.rb +67 -0
data/lib/jekyll/algolia/indexer.rb +258 -0
data/lib/jekyll/algolia/logger.rb +63 -0
data/lib/jekyll/algolia/utils.rb +68 -0
data/lib/jekyll/algolia/version.rb +7 -0
data/lib/jekyll/commands/algolia.rb +49 -0
metadata +304 -0

data/lib/jekyll/algolia/extractor.rb ADDED

@@ -0,0 +1,64 @@
+# frozen_string_literal: true
+require 'algolia_html_extractor'
+module Jekyll
+  module Algolia
+    # Module to extract records from Jekyll files
+    module Extractor
+      include Jekyll::Algolia
+      # Public: Extract records from the file
+      #
+      # file - The Jekyll file to process
+      # TOTEST
+      def self.run(file)
+        # Getting all hierarchical nodes from the HTML input
+        raw_records = extract_raw_records(file.content)
+        # Getting file metadata
+        shared_metadata = FileBrowser.metadata(file)
+        # Building the list of records
+        records = []
+        raw_records.map do |record|
+          # We do not need to pass the HTML node element to the final record
+          node = record[:node]
+          record.delete(:node)
+          # Merging each record info with file info
+          record = Utils.compact_empty(record.merge(shared_metadata))
+          # Apply custom user-defined hooks
+          # Users can return `nil` from the hook to signal we should not index
+          # such a record
+          record = Hooks.apply_each(record, node)
+          next if record.nil?
+          records << record
+        end
+        records
+      end
+      # Public: Adds a unique :objectID field to the hash, representing the
+      # current content of the record
+      def self.add_unique_object_id(record)
+        record[:objectID] = AlgoliaHTMLExtractor.uuid(record)
+        record
+      end
+      # Public: Extract raw records from the file, including content for each
+      # node to index and hierarchy
+      #
+      # content - The HTML content to parse
+      def self.extract_raw_records(content)
+        AlgoliaHTMLExtractor.run(
+          content,
+          options: {
+            css_selector: Configurator.algolia('nodes_to_index')
+          }
+        )
+      end
+    end
+  end
+end

data/lib/jekyll/algolia/file_browser.rb ADDED

@@ -0,0 +1,269 @@
+# frozen_string_literal: true
+require 'algolia_html_extractor'
+require 'time'
+module Jekyll
+  module Algolia
+    # Module to get information about a Jekyll file. Jekyll handles posts,
+    # pages, collections, etc. They each need specific processing, so knowing
+    # which kind of file we're working on will help.
+    #
+    # We also do not index all files. This module will help in defining which
+    # files should be indexed and which should not.
+    module FileBrowser
+      include Jekyll::Algolia
+      # Public: Check if the specified file is a static Jekyll asset
+      #
+      # file - The Jekyll file
+      #
+      # We don't index static assets (js, css, images)
+      def self.static_file?(file)
+        file.is_a?(Jekyll::StaticFile)
+      end
+      # Public: Check if the file is a 404 error page
+      #
+      # file - The Jekyll file
+      #
+      # 404 pages are not Jekyll defaults but a convention adopted by GitHub
+      # pages. We don't want to index those.
+      # Source: https://help.github.com/articles/creating-a-custom-404-page-for-your-github-pages-site/
+      #
+      # rubocop:disable Naming/PredicateName
+      def self.is_404?(file)
+        File.basename(file.path, File.extname(file.path)) == '404'
+      end
+      # rubocop:enable Naming/PredicateName
+      # Public: Check if the page is a pagination page
+      #
+      # file - The Jekyll file
+      #
+      # `jekyll-paginate` automatically creates pages to paginate through posts.
+      # We don't want to index those
+      def self.pagination_page?(file)
+        Utils.match?(file.path, %r{page([0-9]*)/index\.html$})
+      end
+      # Public: Check if the file has one of the allowed extensions
+      #
+      # file - The Jekyll file
+      #
+      # Jekyll can transform markdown files to HTML by default. With plugins, it
+      # can convert many more file formats. By default we'll only index markdown
+      # and raw HTML files but this list can be extended using the
+      # `extensions_to_index` config option.
+      def self.allowed_extension?(file)
+        extensions = Configurator.algolia('extensions_to_index')
+        extname = File.extname(file.path)[1..-1]
+        extensions.include?(extname)
+      end
+      # Public: Check if the file has been excluded by the user
+      #
+      # file - The Jekyll file
+      #
+      # Files can be excluded either by setting the `files_to_exclude` option,
+      # or by defining a custom hook
+      def self.excluded_by_user?(file)
+        excluded_from_config?(file) || excluded_from_hook?(file)
+      end
+      # Public: Check if the file has been excluded by `files_to_exclude`
+      #
+      # file - The Jekyll file
+      def self.excluded_from_config?(file)
+        excluded_patterns = Configurator.algolia('files_to_exclude')
+        excluded_files = []
+        # Transform the glob patterns into a real list of files
+        Dir.chdir(Configurator.get('source')) do
+          excluded_patterns.each do |pattern|
+            excluded_files += Dir.glob(pattern)
+          end
+        end
+        excluded_files.include?(file.path)
+      end
+      # Public: Check if the file has been excluded by running a custom user
+      # hook
+      #
+      # file - The Jekyll file
+      def self.excluded_from_hook?(file)
+        Hooks.should_be_excluded?(file.path)
+      end
+      # Public: Return the path to the original file, relative from the Jekyll
+      # source
+      #
+      # file - The Jekyll file
+      #
+      # Pages have their .path property relative to the source, but collections
+      # (including posts) have an absolute file path.
+      def self.path_from_root(file)
+        source = Configurator.get('source')
+        file.path.gsub(%r{^#{source}/}, '')
+      end
+      # Public: Check if the file should be indexed
+      #
+      # file - The Jekyll file
+      #
+      # There are many reasons a file should not be indexed. We need to exclude
+      # all the static assets, only keep the actual content.
+      def self.indexable?(file)
+        return false if static_file?(file)
+        return false if is_404?(file)
+        return false if pagination_page?(file)
+        return false unless allowed_extension?(file)
+        return false if excluded_by_user?(file)
+        true
+      end
+      # Public: Return a hash of all the file metadata
+      #
+      # file - The Jekyll file
+      #
+      # It contains both the raw metadata extracted from the front-matter, as
+      # well as more specific fields like the collection name, date timestamp,
+      # slug, type and url
+      def self.metadata(file)
+        raw_data = raw_data(file)
+        specific_data = {
+          collection: collection(file),
+          date: date(file),
+          excerpt_html: excerpt_html(file),
+          excerpt_text: excerpt_text(file),
+          slug: slug(file),
+          type: type(file),
+          url: url(file)
+        }
+        metadata = Utils.compact_empty(raw_data.merge(specific_data))
+        metadata
+      end
+      # Public: Return a hash of all the raw data, as defined in the
+      # front-matter and including default values
+      #
+      # file - The Jekyll file
+      #
+      # Any custom data passed to the front-matter will be returned by this
+      # method. It ignores any key where we have a better, custom, getter.
+      # Note that even if you define tags and categories in a collection item,
+      # it will not be included in the data. It's always an empty array.
+      def self.raw_data(file)
+        data = file.data.clone
+        # Remove all keys where we have a specific getter
+        data.each_key do |key|
+          data.delete(key) if respond_to?(key)
+        end
+        # Also delete keys we manually handle
+        data.delete('excerpt')
+        # Convert all keys to symbols
+        data = Utils.keys_to_symbols(data)
+        data
+      end
+      # Public: Get the type of the document (page, post, collection, etc)
+      #
+      # file - The Jekyll file
+      #
+      # Pages are simple html and markdown documents in the tree
+      # Elements from a collection are called Documents
+      # Posts are a custom kind of Documents
+      def self.type(file)
+        type = file.class.name.split('::')[-1].downcase
+        type = 'post' if type == 'document' && file.collection.label == 'posts'
+        type
+      end
+      # Public: Returns the url of the file, starting from the root
+      #
+      # file - The Jekyll file
+      def self.url(file)
+        file.url
+      end
+      # Public: Returns a timestamp of the file date
+      #
+      # file - The Jekyll file
+      #
+      # All collections have a date, either taken from the filename, or the
+      # `date` config set in the front-matter. Even if none is set, the current
+      # date is taken by default.
+      def self.date(file)
+        date = file.data['date']
+        return nil if date.nil?
+        date.to_i
+      end
+      # Public: Returns the HTML version of the excerpt
+      #
+      # file - The Jekyll file
+      #
+      # Only collections (including posts) have an excerpt. Pages don't.
+      def self.excerpt_html(file)
+        excerpt = file.data['excerpt']
+        return nil if excerpt.nil?
+        excerpt.to_s.tr("\n", ' ').strip
+      end
+      # Public: Returns the text version of the excerpt
+      #
+      # file - The Jekyll file
+      #
+      # Only collections (including posts) have an excerpt. Pages don't.
+      def self.excerpt_text(file)
+        html = excerpt_html(file)
+        return nil if html.nil?
+        Utils.html_to_text(html)
+      end
+      # Public: Returns the slug of the file
+      #
+      # file - The Jekyll file
+      #
+      # Slugs can be automatically extracted from collections, but for other
+      # files, we have to create them from the basename
+      def self.slug(file)
+        # We get the real slug from the file data if available
+        return file.data['slug'] if file.data.key?('slug')
+        # We create it ourselves from the filepath otherwise
+        File.basename(file.path, File.extname(file.path)).downcase
+      end
+      # Public: Returns the name of the collection
+      #
+      # file - The Jekyll file
+      #
+      # Only collection documents can have a collection name. Pages don't. Posts
+      # are purposefully excluded from it as well even if they are technically
+      # part of a collection
+      def self.collection(file)
+        return nil unless file.respond_to?(:collection)
+        collection_name = file.collection.label
+        # Posts are a special kind of collection, but it's an implementation
+        # detail from my POV, so I'll exclude them
+        return nil if collection_name == 'posts'
+        collection_name
+      end
+    end
+  end
+end

data/lib/jekyll/algolia/hooks.rb ADDED

@@ -0,0 +1,67 @@
+# frozen_string_literal: true
+module Jekyll
+  module Algolia
+    # Applying user-defined hooks on the processing pipeline
+    module Hooks
+      # Public: Apply the before_indexing_each hook to the record.
+      # This method is a simple wrapper around methods that can be overwritten
+      # by users. Using a wrapper around it makes testing their behavior easier
+      # as they can be mocked in tests.
+      #
+      # record - The hash of the record to be pushed
+      # node - The Nokogiri node of the element
+      def self.apply_each(record, node)
+        before_indexing_each(record, node)
+      end
+      # Public: Apply the before_indexing_all hook to all records.
+      # This method is a simple wrapper around methods that can be overwritten
+      # by users. Using a wrapper around it makes testing their behavior easier
+      # as they can be mocked in tests.
+      #
+      # records - The list of all records to be indexed
+      def self.apply_all(records)
+        before_indexing_all(records)
+      end
+      # Public: Check if the file should be indexed or not
+      #
+      # filepath - The path to the file, before transformation
+      #
+      # This hook allow users to define if a specific file should be indexed or
+      # not. Basic exclusion can be done through the `files_to_exclude` option,
+      # but a custom hook like this one can allow more fine-grained
+      # customisation.
+      def self.should_be_excluded?(_filepath)
+        false
+      end
+      # Public: Custom method to be run on the record before indexing it
+      #
+      # record - The hash of the record to be pushed
+      # node - The Nokogiri node of the element
+      #
+      # Users can modify the record (adding/editing/removing keys) here. It can
+      # be used to remove keys that should not be indexed, or access more
+      # information from the HTML node.
+      #
+      # Users can return nil to signal that the record should not be indexed
+      def self.before_indexing_each(record, _node)
+        record
+      end
+      # Public: Custom method to be run on the list of all records before
+      # indexing them
+      #
+      # records - The list of all records to be indexed
+      #
+      # Users can modify the full list from here. It might provide an easier
+      # interface than `hook_before_indexing_each` when knowing the full context
+      # is necessary
+      def self.before_indexing_all(records)
+        records
+      end
+    end
+  end
+end

data/lib/jekyll/algolia/indexer.rb ADDED

@@ -0,0 +1,258 @@
+# frozen_string_literal: true
+require 'algoliasearch'
+module Jekyll
+  module Algolia
+    # Module to push records to Algolia and configure the index
+    module Indexer
+      include Jekyll::Algolia
+      # Public: Init the module
+      #
+      # This call will instanciate the Algolia API client, set the custom
+      # User Agent and give an easy access to the main index
+      def self.init
+        ::Algolia.init(
+          application_id: Configurator.application_id,
+          api_key: Configurator.api_key
+        )
+        set_user_agent
+      end
+      # Public: Set the User-Agent to send to the API
+      #
+      # Every integrations should follow the "YYY Integration" pattern, and
+      # every API client should follow the "Algolia for YYY" pattern. Even if
+      # each integration version is pinned to a specific API client version, we
+      # are explicit in defining it to help debug from the dashboard.
+      def self.set_user_agent
+        user_agent = [
+          "Jekyll Integration (#{VERSION})",
+          "Algolia for Ruby (#{::Algolia::VERSION})",
+          "Jekyll (#{::Jekyll::VERSION})",
+          "Ruby (#{RUBY_VERSION})"
+        ].join('; ')
+        ::Algolia.set_extra_header('User-Agent', user_agent)
+      end
+      # Public: Returns an Algolia Index object from an index name
+      #
+      # index_name - String name of the index
+      def self.index(index_name)
+        ::Algolia::Index.new(index_name)
+      end
+      # Public: Update records of the specified index
+      #
+      # index - Algolia Index to update
+      # records - Array of records to update
+      #
+      # New records will be automatically added. Technically existing records
+      # should be updated but this case should never happen as changing a record
+      # content will change its objectID as well.
+      #
+      # Does nothing in dry run mode
+      def self.update_records(index, records)
+        batch_size = Configurator.algolia('indexing_batch_size')
+        records.each_slice(batch_size) do |batch|
+          Logger.log("I:Pushing #{batch.size} records")
+          next if Configurator.dry_run?
+          begin
+            index.add_objects!(batch)
+          rescue StandardError => error
+            ErrorHandler.stop(error, records: records)
+          end
+        end
+      end
+      # Public: Delete records whose objectIDs are passed
+      #
+      # index - Algolia Index to target
+      # ids - Array of objectIDs to delete
+      #
+      # Does nothing in dry run mode
+      def self.delete_records_by_id(index, ids)
+        return if ids.empty?
+        Logger.log("I:Deleting #{ids.length} records")
+        return if Configurator.dry_run?
+        begin
+          index.delete_objects!(ids)
+        rescue StandardError => error
+          ErrorHandler.stop(error)
+        end
+      end
+      # Public: Returns an array of all the objectIDs in the index
+      #
+      # index - Algolia Index to target
+      #
+      # The returned array is sorted. It won't have any impact on the way it is
+      # processed, but makes debugging easier when comparing arrays is needed.
+      def self.remote_object_ids(index)
+        list = []
+        begin
+          index.browse(attributesToRetrieve: 'objectID') do |hit|
+            list << hit['objectID']
+          end
+        rescue StandardError
+          # The index might not exist if it's the first time we use the plugin
+          # so we'll consider that it means there are no records there
+          return []
+        end
+        list.sort
+      end
+      # Public: Returns an array of the local objectIDs
+      #
+      # records - Array of all local records
+      def self.local_object_ids(records)
+        records.map { |record| record[:objectID] }.compact.sort
+      end
+      # Public: Update settings of the index
+      #
+      # index - The Algolia Index
+      # settings - The hash of settings to pass to the index
+      #
+      # Does nothing in dry run mode
+      def self.update_settings(index, settings)
+        Logger.verbose('I:Updating settings')
+        return if Configurator.dry_run?
+        begin
+          index.set_settings(settings)
+        rescue StandardError => error
+          ErrorHandler.stop(error, settings: settings)
+        end
+      end
+      # Public: Index content following the `diff` indexing mode
+      #
+      # records - Array of local records
+      #
+      # The `diff` indexing mode will only push new content to the index and
+      # remove old content from it. It won't touch records that haven't been
+      # updated. It will be a bit slower as it will first need to get the list
+      # of all records in the index, but it will consume less operations.
+      def self.run_diff_mode(records)
+        index = index(Configurator.index_name)
+        # Update settings
+        update_settings(index, Configurator.settings)
+        # Getting list of objectID in remote and locally
+        remote_ids = remote_object_ids(index)
+        local_ids = local_object_ids(records)
+        old_records_ids = remote_ids - local_ids
+        new_records_ids = local_ids - remote_ids
+        if old_records_ids.empty? && new_records_ids.empty?
+          Logger.log('I:Nothing to index. Your content is already up to date.')
+          return
+        end
+        Logger.log('I:Pushing records to Algolia...')
+        # Delete remote records that are no longer available locally
+        delete_records_by_id(index, old_records_ids)
+        # Add only records that are not yet already in the remote
+        new_records = records.select do |record|
+          new_records_ids.include?(record[:objectID])
+        end
+        update_records(index, new_records)
+        Logger.log('I:✔ Indexing complete')
+      end
+      # Public: Get the settings of the remote index
+      #
+      # index - The Algolia Index
+      def self.remote_settings(index)
+        index.get_settings
+      rescue StandardError => error
+        ErrorHandler.stop(error)
+      end
+      # Public: Rename an index
+      #
+      # old_name - Current name of the index
+      # new_name - New name of the index
+      #
+      # Does nothing in dry run mode
+      def self.rename_index(old_name, new_name)
+        Logger.verbose("I:Renaming `#{old_name}` to `#{new_name}`")
+        return if Configurator.dry_run?
+        begin
+          ::Algolia.move_index(old_name, new_name)
+        rescue StandardError => error
+          ErrorHandler.stop(error, new_name: new_name)
+        end
+      end
+      # Public: Index content following the `atomic` indexing mode
+      #
+      # records - Array of records to push
+      #
+      # The `atomic` indexing mode will push all records to a brand new index,
+      # configure it, and then overwrite the previous index with this new one.
+      # For the end-user, it will make all the changes in one go, making sure
+      # people are always searching into a fully configured index. It will
+      # consume more operations, but will never leave the index in a transient
+      # state.
+      def self.run_atomic_mode(records)
+        index_name = Configurator.index_name
+        index = index(index_name)
+        index_tmp_name = "#{Configurator.index_name}_tmp"
+        index_tmp = index(index_tmp_name)
+        Logger.verbose("I:Using `#{index_tmp_name}` as temporary index")
+        # Copying original settings to the new index
+        remote_settings = remote_settings(index)
+        new_settings = remote_settings.merge(Configurator.settings)
+        update_settings(index_tmp, new_settings)
+        # Pushing everthing to a brand new index
+        update_records(index_tmp, records)
+        # Renaming the new index in place of the old
+        rename_index(index_tmp_name, index_name)
+        Logger.log('I:✔ Indexing complete')
+      end
+      # Public: Push all records to Algolia and configure the index
+      #
+      # records - Records to push
+      def self.run(records)
+        init
+        record_count = records.length
+        # Indexing zero record is surely a misconfiguration
+        if record_count.zero?
+          files_to_exclude = Configurator.algolia('files_to_exclude').join(', ')
+          Logger.known_message(
+            'no_records_found',
+            'files_to_exclude' => files_to_exclude,
+            'nodes_to_index' => Configurator.algolia('nodes_to_index')
+          )
+          exit 1
+        end
+        indexing_mode = Configurator.indexing_mode
+        Logger.verbose("I:Indexing mode: #{indexing_mode}")
+        case indexing_mode
+        when 'diff'
+          run_diff_mode(records)
+        when 'atomic'
+          run_atomic_mode(records)
+        end
+      end
+    end
+  end
+end