blacklight-spotlight 3.0.0.rc3 → 3.0.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. checksums.yaml +4 -4
  2. data/app/assets/javascripts/spotlight/admin/reindex_monitor.js +1 -0
  3. data/app/assets/stylesheets/spotlight/browse_group_categories_block.scss +23 -0
  4. data/app/controllers/spotlight/catalog_controller.rb +4 -1
  5. data/app/controllers/spotlight/dashboards_controller.rb +1 -1
  6. data/app/controllers/spotlight/exhibits_controller.rb +1 -1
  7. data/app/helpers/spotlight/application_helper.rb +19 -0
  8. data/app/helpers/spotlight/pages_helper.rb +1 -1
  9. data/app/jobs/concerns/spotlight/job_tracking.rb +47 -0
  10. data/app/jobs/concerns/spotlight/limit_concurrency.rb +33 -0
  11. data/app/jobs/spotlight/add_uploads_from_csv.rb +6 -3
  12. data/app/jobs/spotlight/application_job.rb +8 -0
  13. data/app/jobs/spotlight/cleanup_job_trackers_job.rb +13 -0
  14. data/app/jobs/spotlight/default_thumbnail_job.rb +1 -3
  15. data/app/jobs/spotlight/reindex_exhibit_job.rb +36 -0
  16. data/app/jobs/spotlight/reindex_job.rb +49 -41
  17. data/app/jobs/spotlight/rename_sidecar_field_job.rb +2 -2
  18. data/app/jobs/spotlight/update_job_trackers_job.rb +20 -0
  19. data/app/models/concerns/spotlight/user.rb +2 -1
  20. data/app/models/spotlight/event.rb +13 -0
  21. data/app/models/spotlight/exhibit.rb +4 -14
  22. data/app/models/spotlight/job_tracker.rb +105 -0
  23. data/app/models/spotlight/reindex_progress.rb +44 -27
  24. data/app/models/spotlight/resource.rb +24 -58
  25. data/app/models/spotlight/resources/iiif_harvester.rb +10 -1
  26. data/app/models/spotlight/resources/iiif_manifest.rb +3 -1
  27. data/app/models/spotlight/resources/iiif_service.rb +1 -1
  28. data/app/models/spotlight/resources/json_upload.rb +12 -0
  29. data/app/models/spotlight/resources/upload.rb +25 -2
  30. data/app/models/spotlight/solr_document_sidecar.rb +2 -1
  31. data/app/services/spotlight/etl.rb +7 -0
  32. data/app/services/spotlight/etl/context.rb +52 -0
  33. data/app/services/spotlight/etl/executor.rb +194 -0
  34. data/app/services/spotlight/etl/loaders.rb +12 -0
  35. data/app/services/spotlight/etl/pipeline.rb +81 -0
  36. data/app/services/spotlight/etl/solr_loader.rb +96 -0
  37. data/app/services/spotlight/etl/sources.rb +25 -0
  38. data/app/services/spotlight/etl/step.rb +82 -0
  39. data/app/services/spotlight/etl/transforms.rb +64 -0
  40. data/app/services/spotlight/validity_checker.rb +5 -5
  41. data/app/views/spotlight/dashboards/_reindexing_activity.html.erb +6 -6
  42. data/app/views/spotlight/shared/_locale_picker.html.erb +1 -1
  43. data/app/views/spotlight/sir_trevor/blocks/_browse_group_categories_block.html.erb +4 -3
  44. data/config/locales/spotlight.ar.yml +11 -1
  45. data/config/locales/spotlight.en.yml +3 -2
  46. data/db/migrate/20210122082032_create_job_trackers.rb +22 -0
  47. data/db/migrate/20210126123041_create_events.rb +15 -0
  48. data/lib/generators/spotlight/scaffold_resource_generator.rb +5 -13
  49. data/lib/spotlight/engine.rb +8 -1
  50. data/lib/spotlight/version.rb +1 -1
  51. data/spec/controllers/spotlight/catalog_controller_spec.rb +3 -1
  52. data/spec/examples.txt +1448 -1437
  53. data/spec/factories/job_trackers.rb +9 -0
  54. data/spec/features/add_items_spec.rb +9 -4
  55. data/spec/features/javascript/reindex_monitor_spec.rb +1 -1
  56. data/spec/features/site_users_management_spec.rb +4 -4
  57. data/spec/helpers/spotlight/pages_helper_spec.rb +8 -0
  58. data/spec/jobs/spotlight/reindex_exhibit_job_spec.rb +43 -0
  59. data/spec/jobs/spotlight/reindex_job_spec.rb +30 -59
  60. data/spec/models/spotlight/exhibit_spec.rb +3 -57
  61. data/spec/models/spotlight/reindex_progress_spec.rb +89 -87
  62. data/spec/models/spotlight/resource_spec.rb +69 -90
  63. data/spec/models/spotlight/resources/iiif_harvester_spec.rb +9 -10
  64. data/spec/models/spotlight/solr_document_sidecar_spec.rb +1 -0
  65. data/spec/services/spotlight/etl/context_spec.rb +66 -0
  66. data/spec/services/spotlight/etl/executor_spec.rb +149 -0
  67. data/spec/services/spotlight/etl/pipeline_spec.rb +22 -0
  68. data/spec/services/spotlight/etl/solr_loader_spec.rb +76 -0
  69. data/spec/services/spotlight/etl/step_spec.rb +70 -0
  70. data/spec/spec_helper.rb +2 -5
  71. data/spec/views/spotlight/dashboards/_reindexing_activity.html.erb_spec.rb +22 -19
  72. metadata +55 -15
  73. data/app/models/concerns/spotlight/resources/open_graph.rb +0 -36
  74. data/app/models/spotlight/reindexing_log_entry.rb +0 -42
  75. data/app/services/spotlight/resources/iiif_builder.rb +0 -19
  76. data/app/services/spotlight/solr_document_builder.rb +0 -77
  77. data/app/services/spotlight/upload_solr_document_builder.rb +0 -57
  78. data/spec/factories/reindexing_log_entries.rb +0 -54
  79. data/spec/models/spotlight/reindexing_log_entry_spec.rb +0 -129
  80. data/spec/models/spotlight/resources/open_graph_spec.rb +0 -65
  81. data/spec/services/spotlight/solr_document_builder_spec.rb +0 -66
@@ -14,7 +14,9 @@ module Spotlight
14
14
  @solr_hash = {}
15
15
  end
16
16
 
17
- def to_solr
17
+ def to_solr(exhibit: nil)
18
+ @exhibit = exhibit if exhibit
19
+
18
20
  add_document_id
19
21
  add_label
20
22
  add_thumbnail_url
@@ -42,7 +42,7 @@ module Spotlight
42
42
  class << self
43
43
  def iiif_response(url)
44
44
  Faraday.get(url).body
45
- rescue Faraday::Error::ConnectionFailed, Faraday::TimeoutError => e
45
+ rescue Faraday::ConnectionFailed, Faraday::TimeoutError => e
46
46
  Rails.logger.warn("HTTP GET for #{url} failed with #{e}")
47
47
  {}.to_json
48
48
  end
@@ -5,6 +5,18 @@ module Spotlight
5
5
  # Raw solr document uploads
6
6
  class JsonUpload < Spotlight::Resource
7
7
  store :data, accessors: :json
8
+
9
+ # The indexing pipeline for JSON uploads copies the data from the stored
10
+ # `#data` field directly into the indexed document.
11
+ def self.indexing_pipeline
12
+ @indexing_pipeline ||= super.dup.tap do |pipeline|
13
+ pipeline.sources = [Spotlight::Etl::Sources::StoredData]
14
+
15
+ pipeline.transforms = [
16
+ Spotlight::Etl::Transforms::IdentityTransform
17
+ ] + pipeline.transforms
18
+ end
19
+ end
8
20
  end
9
21
  end
10
22
  end
@@ -10,8 +10,6 @@ module Spotlight
10
10
  # we want to do this before reindexing
11
11
  after_create :update_document_sidecar
12
12
 
13
- self.document_builder_class = UploadSolrDocumentBuilder
14
-
15
13
  def self.fields(exhibit)
16
14
  @fields ||= {}
17
15
  @fields[exhibit] ||= begin
@@ -25,6 +23,15 @@ module Spotlight
25
23
  end
26
24
  end
27
25
 
26
+ def self.indexing_pipeline
27
+ @indexing_pipeline ||= super.dup.tap do |pipeline|
28
+ pipeline.transforms = [
29
+ ->(data, p) { data.merge({ p.context.document_model.unique_key.to_sym => p.source.compound_id }) },
30
+ Spotlight::Etl::Transforms::SourceMethodTransform(:to_solr)
31
+ ] + pipeline.transforms
32
+ end
33
+ end
34
+
28
35
  def compound_id
29
36
  "#{exhibit_id}-#{id}"
30
37
  end
@@ -33,6 +40,22 @@ module Spotlight
33
40
  @sidecar ||= document_model.new(id: compound_id).sidecar(exhibit)
34
41
  end
35
42
 
43
+ def to_solr
44
+ return {} unless upload.file_present?
45
+
46
+ spotlight_routes = Spotlight::Engine.routes.url_helpers
47
+ riiif = Riiif::Engine.routes.url_helpers
48
+
49
+ dimensions = Riiif::Image.new(upload_id).info
50
+
51
+ {
52
+ spotlight_full_image_width_ssm: dimensions.width,
53
+ spotlight_full_image_height_ssm: dimensions.height,
54
+ Spotlight::Engine.config.thumbnail_field => riiif.image_path(upload_id, size: '!400,400'),
55
+ Spotlight::Engine.config.iiif_manifest_field => spotlight_routes.manifest_exhibit_solr_document_path(exhibit, compound_id)
56
+ }
57
+ end
58
+
36
59
  private
37
60
 
38
61
  def configured_fields
@@ -74,7 +74,6 @@ module Spotlight
74
74
 
75
75
  upload_fields.each_with_object({}) do |field, solr_hash|
76
76
  field_name = field.field_name.to_s
77
- next unless configured_fields && configured_fields[field_name].present?
78
77
 
79
78
  value = configured_fields[field_name]
80
79
  field_data = field.data_to_solr(convert_stored_value_to_solr(value))
@@ -85,6 +84,8 @@ module Spotlight
85
84
  end
86
85
 
87
86
  def upload_fields
87
+ return [] unless document.uploaded_resource? || resource.is_a?(Spotlight::Resources::Upload)
88
+
88
89
  Spotlight::Resources::Upload.fields(exhibit)
89
90
  end
90
91
 
@@ -0,0 +1,7 @@
1
# frozen_string_literal: true

module Spotlight
  # Namespace for the ETL (extract / transform / load) indexing components.
  module Etl
  end
end
@@ -0,0 +1,52 @@
1
# frozen_string_literal: true

module Spotlight
  module Etl
    # Contextual information for the ETL pipeline
    class Context
      # A hook for downstream applications to report or handle errors using external
      # systems or services.
      class_attribute :error_reporter

      attr_reader :arguments, :additional_metadata, :additional_parameters, :logger

      delegate :document_model, to: :resource

      def initialize(*args, additional_metadata: {}, on_error: :log, logger: Rails.logger, **additional_parameters)
        @arguments = args
        @logger = logger
        @on_error = on_error
        @additional_metadata = additional_metadata
        @additional_parameters = additional_parameters
      end

      # The first pipeline argument; conventionally the resource being indexed.
      # @return [Spotlight::Resource]
      def resource
        arguments.first
      end

      # Look up the document's unique key from transformed data, falling
      # back to :id when no document model is available.
      # @return [String]
      def unique_key(data)
        key = document_model&.unique_key&.to_sym || :id
        data[key]
      end

      ##
      # Receives exceptions raised by pipeline steps and dispatches them
      # according to the configured `on_error` strategy (:log, :exception,
      # or any callable).
      def on_error(pipeline, exception, data)
        error_reporter&.call(pipeline, exception, data)

        handler = @on_error
        if handler == :log
          logger.tagged('ETL') do
            logger.error("Pipeline error processing resource #{resource.id}: #{exception}")
          end
        elsif handler == :exception
          raise exception
        else
          handler&.call(pipeline, exception, data)
        end
      end
    end
  end
end
@@ -0,0 +1,194 @@
1
# frozen_string_literal: true

module Spotlight
  module Etl
    # ETL pipeline executor: runs one pipeline definition against a context,
    # extracting sources, transforming each into a document, and handing the
    # result to the loaders.
    class Executor
      include ActiveSupport::Benchmarkable

      attr_reader :pipeline, :context, :source, :cache, :logger

      delegate :sources, :pre_processes, :transforms, :post_processes, :loaders, to: :pipeline

      # @param [Spotlight::Etl::Pipeline] pipeline
      # @param [Spotlight::Etl::Context] context
      # @param [Hash] cache a shared cache for pipeline steps to store data for the lifetime of the cache
      def initialize(pipeline, context, cache: nil)
        @pipeline = pipeline
        @context = context

        # a caller-provided cache outlives this run and is not cleared by #after_call
        @provided_cache = cache.present?
        @cache = cache || {}
        @step_cache = {}
      end

      ##
      # Execute the ETL pipeline. Any step may `throw :skip` to abandon the
      # current source; other StandardErrors are routed to the context's
      # error handler via #on_error.
      #
      # @param [Hash] data the initial data structure to pass through to the transform steps
      # @yield (optionally..) each transformed document after it is transformed but before
      #   it is sent to the loaders
      def call(data: {}, &block)
        extract.with_index do |source, index|
          with_source(source, index) do
            catch :skip do
              load(transform(data), &block)
            end
          rescue StandardError => e
            on_error(e, data)
          end
        end

        after_call
      end

      ##
      # Estimate the number of documents that will be produced by the pipeline
      #
      # @return [Number]
      def estimated_size
        @estimated_size ||= compile_steps(sources).sum { |source| source.call(context).count }
      end

      ##
      # Tagged logger for benchmarks and data flow logging.
      # NOTE: this is super weird to support Rails 5.2
      # @private
      # @yield Logger
      def with_logger
        logger = (context&.logger || Rails.logger)
        logger.tagged(pipeline.class) do
          logger.tagged("#<#{source.class} id=#{source&.id if source.respond_to?(:id)}>") do
            @logger = logger
            yield logger
          end
        end
      end

      ##
      # @private
      # @param [Hash] data
      # @return [String] a simplified + truncated version of the data hash for debugging
      def transform_data_for_debugging(data, verbose: $VERBOSE, truncate: 100)
        return data.inspect.truncate(truncate) unless data.is_a?(Hash)
        return "id #{context.unique_key(data) || data&.first(5)&.inspect}" unless verbose

        JSON.fast_generate(data).truncate(truncate)
      end

      ##
      # Propagate exceptions up to the context's error handler.
      def on_error(exception, data)
        context.on_error(self, exception, data)
      end

      private

      ##
      # Track the current source for the duration of the block.
      #
      # @param [Object] source
      # @param [Number] index
      def with_source(source, index)
        @source = source

        benchmark "Indexing item #{source.inspect.truncate(50)} in resource #{context.resource.id} (#{index} / #{estimated_size})" do
          yield
        end
      ensure
        # FIX: previously cleared via `yield.tap { @source = nil }`, which
        # leaked the current source whenever the block raised (e.g. when the
        # context's error handler re-raises); `ensure` guarantees the reset.
        @source = nil
      end

      ##
      # Extract data from sources. The defined sources receive the provided context
      # and should return an array or other enumerable of sources to pass through
      # the pipeline.
      #
      # @yield [Object]
      def extract(&block)
        return to_enum(:extract) { estimated_size } unless block_given?

        compile_steps(sources).each do |source|
          source.call(context).each do |data|
            block.call(data)
          end
        end
      end

      ##
      # Transform the source to a document.
      #
      # @param [Hash] from the initial seed data used as the input to the initial transforms
      # @return [Hash] the transformed document
      def transform(from)
        # pre/post process return values are ignored; they may mutate state
        compile_steps(pre_processes).each { |step| step.call(from, self) }

        data = compile_steps(transforms).inject(from) { |input, step| step.call(input, self) }

        compile_steps(post_processes).each { |step| step.call(data, self) }

        with_logger do |logger|
          logger.debug do
            "Transform output: #{transform_data_for_debugging(data, verbose: true, truncate: 1000)}"
          end
        end

        data
      end

      ##
      # Load a document into a data sink.
      #
      # @param [Hash] data the fully transformed data
      # @yield [Hash] the data before it is sent to any loaders
      def load(data, &block)
        return unless data

        catch :skip do
          block&.call(data, self)

          compile_steps(loaders).each do |loader|
            loader.call(data, self)
          end
        end
      end

      ##
      # A callback run after transforming data to do any finalizing or cleanup
      # from the run.
      def after_call
        finalize_loaders
        @cache = {} unless @provided_cache
        @step_cache = {}
      end

      ##
      # Loaders may implement a `#finalize` method if they want to perform any work
      # after all the data is transformed.
      def finalize_loaders
        compile_steps(loaders).each do |step|
          step.finalize(self) if step.respond_to? :finalize
        end
      end

      ##
      # DSL convenience utility for writing compact lists of steps; this unrolls
      # pipeline definitions to contain arrays or hashes, e.g.:
      #   `pipeline.transforms = [step_1: lambda {}, step_2: lambda {}]`
      #
      # @return [Enumerable<Spotlight::Etl::Step>]
      def compile_steps(steps)
        return to_enum(:compile_steps, steps) unless block_given?

        steps.flatten.each do |step|
          if step.is_a? Hash
            step.each do |k, v|
              yield(@step_cache[k] ||= Spotlight::Etl::Step.new(v, label: k, executor: self))
            end
          else
            yield @step_cache[step] ||= Spotlight::Etl::Step.new(step, executor: self)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
# frozen_string_literal: true

module Spotlight
  module Etl
    module Loaders
      # Debugging loader: pretty-prints each transformed document to $stderr.
      WarnLoader = ->(data, _context) { warn(JSON.pretty_generate(data)) }
    end
  end
end
@@ -0,0 +1,81 @@
1
# frozen_string_literal: true

module Spotlight
  module Etl
    # ETL pipeline definition
    class Pipeline
      include ActiveSupport::Benchmarkable

      attr_reader :context, :source

      # Although written generically, this ETL system exists to turn
      # Spotlight::Resource instances into documents in the Solr index. A
      # resource flows through sources, transforms, and loaders in turn.
      #
      # Every step may be given as:
      # - a lambda
      # - a ruby class (instantiated once per pipeline execution)
      # - or a hash (of any length) whose keys label the steps (handy for
      #   naming lambdas in log output) and whose values are lambdas/classes.
      #
      # Any transform or loader step can `throw :skip` to drop the current source.
      #
      # Exceptions raised by steps are routed to the context's error handler
      # via `Context#on_error`.

      # Sources turn the Spotlight::Etl::Context (supplied by the caller at
      # execution time) into an enumerable of items the transforms can handle.
      class_attribute :sources, default: []

      # The transform family (pre-processes, transforms, post-processes) is
      # called with the current data and the pipeline executor. Only the
      # transforms' return values replace the data; pre-/post-process return
      # values are discarded (they may still mutate the data or pipeline).
      #
      # Via the executor argument, steps can reach:
      # - `context`, the caller-provided resource context
      # - `source`, the item currently being processed
      class_attribute :pre_processes, default: []
      class_attribute :transforms, default: []
      class_attribute :post_processes, default: []

      # Loaders consume the transformed data (e.g. by writing it to Solr).
      # A loader may also respond to `#finalize`, invoked after every
      # document has been processed.
      class_attribute :loaders, default: []

      def initialize
        yield(self) if block_given?
      end

      ##
      # Execute the ETL pipeline
      #
      # @param [Spotlight::Etl::Context] context
      # @param [Hash] data the initial data structure to pass through to the transform steps
      # @yield (optionally..) each transformed document after it is transformed but before
      #   it is sent to the loaders
      def call(context, data: {}, cache: nil, &block)
        executor(context, cache: cache).call(data: data, &block)
      end

      ##
      # Estimate the number of documents that will be produced by the pipeline
      #
      # @param [Spotlight::Etl::Context] context
      # @return [Number]
      def estimated_size(context)
        executor(context).estimated_size
      end

      private

      def executor(context, **options)
        Spotlight::Etl::Executor.new(self, context, **options)
      end
    end
  end
end