blacklight-spotlight 3.0.0.rc3 → 3.0.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. checksums.yaml +4 -4
  2. data/app/assets/javascripts/spotlight/admin/reindex_monitor.js +1 -0
  3. data/app/assets/stylesheets/spotlight/browse_group_categories_block.scss +23 -0
  4. data/app/controllers/spotlight/catalog_controller.rb +4 -1
  5. data/app/controllers/spotlight/dashboards_controller.rb +1 -1
  6. data/app/controllers/spotlight/exhibits_controller.rb +1 -1
  7. data/app/helpers/spotlight/application_helper.rb +19 -0
  8. data/app/helpers/spotlight/pages_helper.rb +1 -1
  9. data/app/jobs/concerns/spotlight/job_tracking.rb +47 -0
  10. data/app/jobs/concerns/spotlight/limit_concurrency.rb +33 -0
  11. data/app/jobs/spotlight/add_uploads_from_csv.rb +6 -3
  12. data/app/jobs/spotlight/application_job.rb +8 -0
  13. data/app/jobs/spotlight/cleanup_job_trackers_job.rb +13 -0
  14. data/app/jobs/spotlight/default_thumbnail_job.rb +1 -3
  15. data/app/jobs/spotlight/reindex_exhibit_job.rb +36 -0
  16. data/app/jobs/spotlight/reindex_job.rb +49 -41
  17. data/app/jobs/spotlight/rename_sidecar_field_job.rb +2 -2
  18. data/app/jobs/spotlight/update_job_trackers_job.rb +20 -0
  19. data/app/models/concerns/spotlight/user.rb +2 -1
  20. data/app/models/spotlight/event.rb +13 -0
  21. data/app/models/spotlight/exhibit.rb +4 -14
  22. data/app/models/spotlight/job_tracker.rb +105 -0
  23. data/app/models/spotlight/reindex_progress.rb +44 -27
  24. data/app/models/spotlight/resource.rb +24 -58
  25. data/app/models/spotlight/resources/iiif_harvester.rb +10 -1
  26. data/app/models/spotlight/resources/iiif_manifest.rb +3 -1
  27. data/app/models/spotlight/resources/iiif_service.rb +1 -1
  28. data/app/models/spotlight/resources/json_upload.rb +12 -0
  29. data/app/models/spotlight/resources/upload.rb +25 -2
  30. data/app/models/spotlight/solr_document_sidecar.rb +2 -1
  31. data/app/services/spotlight/etl.rb +7 -0
  32. data/app/services/spotlight/etl/context.rb +52 -0
  33. data/app/services/spotlight/etl/executor.rb +194 -0
  34. data/app/services/spotlight/etl/loaders.rb +12 -0
  35. data/app/services/spotlight/etl/pipeline.rb +81 -0
  36. data/app/services/spotlight/etl/solr_loader.rb +96 -0
  37. data/app/services/spotlight/etl/sources.rb +25 -0
  38. data/app/services/spotlight/etl/step.rb +82 -0
  39. data/app/services/spotlight/etl/transforms.rb +64 -0
  40. data/app/services/spotlight/validity_checker.rb +5 -5
  41. data/app/views/spotlight/dashboards/_reindexing_activity.html.erb +6 -6
  42. data/app/views/spotlight/shared/_locale_picker.html.erb +1 -1
  43. data/app/views/spotlight/sir_trevor/blocks/_browse_group_categories_block.html.erb +4 -3
  44. data/config/locales/spotlight.ar.yml +11 -1
  45. data/config/locales/spotlight.en.yml +3 -2
  46. data/db/migrate/20210122082032_create_job_trackers.rb +22 -0
  47. data/db/migrate/20210126123041_create_events.rb +15 -0
  48. data/lib/generators/spotlight/scaffold_resource_generator.rb +5 -13
  49. data/lib/spotlight/engine.rb +8 -1
  50. data/lib/spotlight/version.rb +1 -1
  51. data/spec/controllers/spotlight/catalog_controller_spec.rb +3 -1
  52. data/spec/examples.txt +1448 -1437
  53. data/spec/factories/job_trackers.rb +9 -0
  54. data/spec/features/add_items_spec.rb +9 -4
  55. data/spec/features/javascript/reindex_monitor_spec.rb +1 -1
  56. data/spec/features/site_users_management_spec.rb +4 -4
  57. data/spec/helpers/spotlight/pages_helper_spec.rb +8 -0
  58. data/spec/jobs/spotlight/reindex_exhibit_job_spec.rb +43 -0
  59. data/spec/jobs/spotlight/reindex_job_spec.rb +30 -59
  60. data/spec/models/spotlight/exhibit_spec.rb +3 -57
  61. data/spec/models/spotlight/reindex_progress_spec.rb +89 -87
  62. data/spec/models/spotlight/resource_spec.rb +69 -90
  63. data/spec/models/spotlight/resources/iiif_harvester_spec.rb +9 -10
  64. data/spec/models/spotlight/solr_document_sidecar_spec.rb +1 -0
  65. data/spec/services/spotlight/etl/context_spec.rb +66 -0
  66. data/spec/services/spotlight/etl/executor_spec.rb +149 -0
  67. data/spec/services/spotlight/etl/pipeline_spec.rb +22 -0
  68. data/spec/services/spotlight/etl/solr_loader_spec.rb +76 -0
  69. data/spec/services/spotlight/etl/step_spec.rb +70 -0
  70. data/spec/spec_helper.rb +2 -5
  71. data/spec/views/spotlight/dashboards/_reindexing_activity.html.erb_spec.rb +22 -19
  72. metadata +55 -15
  73. data/app/models/concerns/spotlight/resources/open_graph.rb +0 -36
  74. data/app/models/spotlight/reindexing_log_entry.rb +0 -42
  75. data/app/services/spotlight/resources/iiif_builder.rb +0 -19
  76. data/app/services/spotlight/solr_document_builder.rb +0 -77
  77. data/app/services/spotlight/upload_solr_document_builder.rb +0 -57
  78. data/spec/factories/reindexing_log_entries.rb +0 -54
  79. data/spec/models/spotlight/reindexing_log_entry_spec.rb +0 -129
  80. data/spec/models/spotlight/resources/open_graph_spec.rb +0 -65
  81. data/spec/services/spotlight/solr_document_builder_spec.rb +0 -66
@@ -14,7 +14,9 @@ module Spotlight
14
14
  @solr_hash = {}
15
15
  end
16
16
 
17
- def to_solr
17
+ def to_solr(exhibit: nil)
18
+ @exhibit = exhibit if exhibit
19
+
18
20
  add_document_id
19
21
  add_label
20
22
  add_thumbnail_url
@@ -42,7 +42,7 @@ module Spotlight
42
42
  class << self
43
43
  def iiif_response(url)
44
44
  Faraday.get(url).body
45
- rescue Faraday::Error::ConnectionFailed, Faraday::TimeoutError => e
45
+ rescue Faraday::ConnectionFailed, Faraday::TimeoutError => e
46
46
  Rails.logger.warn("HTTP GET for #{url} failed with #{e}")
47
47
  {}.to_json
48
48
  end
@@ -5,6 +5,18 @@ module Spotlight
5
5
  # Raw solr document uploads
6
6
  class JsonUpload < Spotlight::Resource
7
7
  store :data, accessors: :json
8
+
9
+ # The indexing pipeline for JSON uploads copies the data from the stored
10
+ # `#data` field directly into the indexed document.
11
+ def self.indexing_pipeline
12
+ @indexing_pipeline ||= super.dup.tap do |pipeline|
13
+ pipeline.sources = [Spotlight::Etl::Sources::StoredData]
14
+
15
+ pipeline.transforms = [
16
+ Spotlight::Etl::Transforms::IdentityTransform
17
+ ] + pipeline.transforms
18
+ end
19
+ end
8
20
  end
9
21
  end
10
22
  end
@@ -10,8 +10,6 @@ module Spotlight
10
10
  # we want to do this before reindexing
11
11
  after_create :update_document_sidecar
12
12
 
13
- self.document_builder_class = UploadSolrDocumentBuilder
14
-
15
13
  def self.fields(exhibit)
16
14
  @fields ||= {}
17
15
  @fields[exhibit] ||= begin
@@ -25,6 +23,15 @@ module Spotlight
25
23
  end
26
24
  end
27
25
 
26
+ def self.indexing_pipeline
27
+ @indexing_pipeline ||= super.dup.tap do |pipeline|
28
+ pipeline.transforms = [
29
+ ->(data, p) { data.merge({ p.context.document_model.unique_key.to_sym => p.source.compound_id }) },
30
+ Spotlight::Etl::Transforms::SourceMethodTransform(:to_solr)
31
+ ] + pipeline.transforms
32
+ end
33
+ end
34
+
28
35
  def compound_id
29
36
  "#{exhibit_id}-#{id}"
30
37
  end
@@ -33,6 +40,22 @@ module Spotlight
33
40
  @sidecar ||= document_model.new(id: compound_id).sidecar(exhibit)
34
41
  end
35
42
 
43
+ def to_solr
44
+ return {} unless upload.file_present?
45
+
46
+ spotlight_routes = Spotlight::Engine.routes.url_helpers
47
+ riiif = Riiif::Engine.routes.url_helpers
48
+
49
+ dimensions = Riiif::Image.new(upload_id).info
50
+
51
+ {
52
+ spotlight_full_image_width_ssm: dimensions.width,
53
+ spotlight_full_image_height_ssm: dimensions.height,
54
+ Spotlight::Engine.config.thumbnail_field => riiif.image_path(upload_id, size: '!400,400'),
55
+ Spotlight::Engine.config.iiif_manifest_field => spotlight_routes.manifest_exhibit_solr_document_path(exhibit, compound_id)
56
+ }
57
+ end
58
+
36
59
  private
37
60
 
38
61
  def configured_fields
@@ -74,7 +74,6 @@ module Spotlight
74
74
 
75
75
  upload_fields.each_with_object({}) do |field, solr_hash|
76
76
  field_name = field.field_name.to_s
77
- next unless configured_fields && configured_fields[field_name].present?
78
77
 
79
78
  value = configured_fields[field_name]
80
79
  field_data = field.data_to_solr(convert_stored_value_to_solr(value))
@@ -85,6 +84,8 @@ module Spotlight
85
84
  end
86
85
 
87
86
  def upload_fields
87
+ return [] unless document.uploaded_resource? || resource.is_a?(Spotlight::Resources::Upload)
88
+
88
89
  Spotlight::Resources::Upload.fields(exhibit)
89
90
  end
90
91
 
@@ -0,0 +1,7 @@
1
# frozen_string_literal: true

module Spotlight
  # Namespace for the ETL (extract / transform / load) indexing components.
  module Etl
  end
end
@@ -0,0 +1,52 @@
1
# frozen_string_literal: true

module Spotlight
  module Etl
    # Contextual information for the ETL pipeline
    class Context
      # A hook for downstream applications to report or handle errors using external
      # systems or services.
      class_attribute :error_reporter

      attr_reader :arguments, :additional_metadata, :additional_parameters, :logger

      delegate :document_model, to: :resource

      def initialize(*args, additional_metadata: {}, on_error: :log, logger: Rails.logger, **additional_parameters)
        @arguments = args
        @logger = logger
        @on_error = on_error
        @additional_metadata = additional_metadata
        @additional_parameters = additional_parameters
      end

      # The first pipeline argument; conventionally the resource being indexed.
      # @return [Spotlight::Resource]
      def resource
        arguments.first
      end

      # Look up the document's unique key from transformed data, falling
      # back to :id when no document model is available.
      # @return [String]
      def unique_key(data)
        key = document_model&.unique_key&.to_sym || :id
        data[key]
      end

      ##
      # Receives exceptions raised by pipeline steps and dispatches them
      # according to the configured `on_error` strategy (:log, :exception,
      # or any callable).
      def on_error(pipeline, exception, data)
        error_reporter&.call(pipeline, exception, data)

        handler = @on_error
        if handler == :log
          logger.tagged('ETL') do
            logger.error("Pipeline error processing resource #{resource.id}: #{exception}")
          end
        elsif handler == :exception
          raise exception
        else
          handler&.call(pipeline, exception, data)
        end
      end
    end
  end
end
@@ -0,0 +1,194 @@
1
# frozen_string_literal: true

module Spotlight
  module Etl
    # ETL pipeline executor: runs one pipeline definition against a context,
    # extracting sources, transforming each into a document, and handing the
    # result to the loaders.
    class Executor
      include ActiveSupport::Benchmarkable

      attr_reader :pipeline, :context, :source, :cache, :logger

      delegate :sources, :pre_processes, :transforms, :post_processes, :loaders, to: :pipeline

      # @param [Spotlight::Etl::Pipeline] pipeline
      # @param [Spotlight::Etl::Context] context
      # @param [Hash] cache a shared cache for pipeline steps to store data for the lifetime of the cache
      def initialize(pipeline, context, cache: nil)
        @pipeline = pipeline
        @context = context

        # a caller-provided cache outlives this run and is not cleared by #after_call
        @provided_cache = cache.present?
        @cache = cache || {}
        @step_cache = {}
      end

      ##
      # Execute the ETL pipeline. Any step may `throw :skip` to abandon the
      # current source; other StandardErrors are routed to the context's
      # error handler via #on_error.
      #
      # @param [Hash] data the initial data structure to pass through to the transform steps
      # @yield (optionally..) each transformed document after it is transformed but before
      #   it is sent to the loaders
      def call(data: {}, &block)
        extract.with_index do |source, index|
          with_source(source, index) do
            catch :skip do
              load(transform(data), &block)
            end
          rescue StandardError => e
            on_error(e, data)
          end
        end

        after_call
      end

      ##
      # Estimate the number of documents that will be produced by the pipeline
      #
      # @return [Number]
      def estimated_size
        @estimated_size ||= compile_steps(sources).sum { |source| source.call(context).count }
      end

      ##
      # Tagged logger for benchmarks and data flow logging.
      # NOTE: this is super weird to support Rails 5.2
      # @private
      # @yield Logger
      def with_logger
        logger = (context&.logger || Rails.logger)
        logger.tagged(pipeline.class) do
          logger.tagged("#<#{source.class} id=#{source&.id if source.respond_to?(:id)}>") do
            @logger = logger
            yield logger
          end
        end
      end

      ##
      # @private
      # @param [Hash] data
      # @return [String] a simplified + truncated version of the data hash for debugging
      def transform_data_for_debugging(data, verbose: $VERBOSE, truncate: 100)
        return data.inspect.truncate(truncate) unless data.is_a?(Hash)
        return "id #{context.unique_key(data) || data&.first(5)&.inspect}" unless verbose

        JSON.fast_generate(data).truncate(truncate)
      end

      ##
      # Propagate exceptions up to the context's error handler.
      def on_error(exception, data)
        context.on_error(self, exception, data)
      end

      private

      ##
      # Track the current source for the duration of the block.
      #
      # @param [Object] source
      # @param [Number] index
      def with_source(source, index)
        @source = source

        benchmark "Indexing item #{source.inspect.truncate(50)} in resource #{context.resource.id} (#{index} / #{estimated_size})" do
          yield
        end
      ensure
        # FIX: previously cleared via `yield.tap { @source = nil }`, which
        # leaked the current source whenever the block raised (e.g. when the
        # context's error handler re-raises); `ensure` guarantees the reset.
        @source = nil
      end

      ##
      # Extract data from sources. The defined sources receive the provided context
      # and should return an array or other enumerable of sources to pass through
      # the pipeline.
      #
      # @yield [Object]
      def extract(&block)
        return to_enum(:extract) { estimated_size } unless block_given?

        compile_steps(sources).each do |source|
          source.call(context).each do |data|
            block.call(data)
          end
        end
      end

      ##
      # Transform the source to a document.
      #
      # @param [Hash] from the initial seed data used as the input to the initial transforms
      # @return [Hash] the transformed document
      def transform(from)
        # pre/post process return values are ignored; they may mutate state
        compile_steps(pre_processes).each { |step| step.call(from, self) }

        data = compile_steps(transforms).inject(from) { |input, step| step.call(input, self) }

        compile_steps(post_processes).each { |step| step.call(data, self) }

        with_logger do |logger|
          logger.debug do
            "Transform output: #{transform_data_for_debugging(data, verbose: true, truncate: 1000)}"
          end
        end

        data
      end

      ##
      # Load a document into a data sink.
      #
      # @param [Hash] data the fully transformed data
      # @yield [Hash] the data before it is sent to any loaders
      def load(data, &block)
        return unless data

        catch :skip do
          block&.call(data, self)

          compile_steps(loaders).each do |loader|
            loader.call(data, self)
          end
        end
      end

      ##
      # A callback run after transforming data to do any finalizing or cleanup
      # from the run.
      def after_call
        finalize_loaders
        @cache = {} unless @provided_cache
        @step_cache = {}
      end

      ##
      # Loaders may implement a `#finalize` method if they want to perform any work
      # after all the data is transformed.
      def finalize_loaders
        compile_steps(loaders).each do |step|
          step.finalize(self) if step.respond_to? :finalize
        end
      end

      ##
      # DSL convenience utility for writing compact lists of steps; this unrolls
      # pipeline definitions to contain arrays or hashes, e.g.:
      #   `pipeline.transforms = [step_1: lambda {}, step_2: lambda {}]`
      #
      # @return [Enumerable<Spotlight::Etl::Step>]
      def compile_steps(steps)
        return to_enum(:compile_steps, steps) unless block_given?

        steps.flatten.each do |step|
          if step.is_a? Hash
            step.each do |k, v|
              yield(@step_cache[k] ||= Spotlight::Etl::Step.new(v, label: k, executor: self))
            end
          else
            yield @step_cache[step] ||= Spotlight::Etl::Step.new(step, executor: self)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
# frozen_string_literal: true

module Spotlight
  module Etl
    module Loaders
      # Debugging loader: pretty-prints each transformed document to $stderr.
      WarnLoader = ->(data, _context) { warn(JSON.pretty_generate(data)) }
    end
  end
end
@@ -0,0 +1,81 @@
1
# frozen_string_literal: true

module Spotlight
  module Etl
    # ETL pipeline definition
    class Pipeline
      include ActiveSupport::Benchmarkable

      attr_reader :context, :source

      # Although written generically, this ETL system exists to turn
      # Spotlight::Resource instances into documents in the Solr index. A
      # resource flows through sources, transforms, and loaders in turn.
      #
      # Every step may be given as:
      # - a lambda
      # - a ruby class (instantiated once per pipeline execution)
      # - or a hash (of any length) whose keys label the steps (handy for
      #   naming lambdas in log output) and whose values are lambdas/classes.
      #
      # Any transform or loader step can `throw :skip` to drop the current source.
      #
      # Exceptions raised by steps are routed to the context's error handler
      # via `Context#on_error`.

      # Sources turn the Spotlight::Etl::Context (supplied by the caller at
      # execution time) into an enumerable of items the transforms can handle.
      class_attribute :sources, default: []

      # The transform family (pre-processes, transforms, post-processes) is
      # called with the current data and the pipeline executor. Only the
      # transforms' return values replace the data; pre-/post-process return
      # values are discarded (they may still mutate the data or pipeline).
      #
      # Via the executor argument, steps can reach:
      # - `context`, the caller-provided resource context
      # - `source`, the item currently being processed
      class_attribute :pre_processes, default: []
      class_attribute :transforms, default: []
      class_attribute :post_processes, default: []

      # Loaders consume the transformed data (e.g. by writing it to Solr).
      # A loader may also respond to `#finalize`, invoked after every
      # document has been processed.
      class_attribute :loaders, default: []

      def initialize
        yield(self) if block_given?
      end

      ##
      # Execute the ETL pipeline
      #
      # @param [Spotlight::Etl::Context] context
      # @param [Hash] data the initial data structure to pass through to the transform steps
      # @yield (optionally..) each transformed document after it is transformed but before
      #   it is sent to the loaders
      def call(context, data: {}, cache: nil, &block)
        executor(context, cache: cache).call(data: data, &block)
      end

      ##
      # Estimate the number of documents that will be produced by the pipeline
      #
      # @param [Spotlight::Etl::Context] context
      # @return [Number]
      def estimated_size(context)
        executor(context).estimated_size
      end

      private

      def executor(context, **options)
        Spotlight::Etl::Executor.new(self, context, **options)
      end
    end
  end
end