iiif_print 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (108) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
  4. data/.github/workflows/build-lint-test-action.yaml +4 -5
  5. data/.gitignore +5 -4
  6. data/.rubocop.yml +1 -0
  7. data/.solargraph.yml +19 -0
  8. data/Gemfile.lock +1025 -0
  9. data/README.md +98 -9
  10. data/Rakefile +6 -0
  11. data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
  12. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
  13. data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
  14. data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
  15. data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
  16. data/app/helpers/iiif_print_helper.rb +0 -20
  17. data/app/indexers/concerns/iiif_print/child_indexer.rb +9 -3
  18. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +17 -4
  19. data/app/models/concerns/iiif_print/set_child_flag.rb +9 -0
  20. data/app/models/concerns/iiif_print/solr/document.rb +14 -0
  21. data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
  22. data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
  23. data/app/models/iiif_print/pending_relationship.rb +3 -0
  24. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
  25. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
  26. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +19 -10
  27. data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
  28. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
  29. data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
  30. data/app/services/iiif_print/manifest_builder_service_behavior.rb +88 -31
  31. data/app/services/iiif_print/pluggable_derivative_service.rb +3 -9
  32. data/app/views/catalog/_index_header_list_default.html.erb +13 -0
  33. data/app/views/hyrax/base/_representative_media.html.erb +4 -3
  34. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
  35. data/app/views/hyrax/file_sets/_actions.html.erb +2 -1
  36. data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
  37. data/config/locales/iiif_print.en.yml +4 -0
  38. data/config/routes.rb +3 -0
  39. data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
  40. data/docker-compose.yml +2 -2
  41. data/iiif_print.gemspec +10 -9
  42. data/lib/generators/iiif_print/install_generator.rb +21 -1
  43. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
  44. data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
  45. data/lib/iiif_print/base_derivative_service.rb +2 -1
  46. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +57 -5
  47. data/lib/iiif_print/catalog_search_builder.rb +5 -1
  48. data/lib/iiif_print/configuration.rb +145 -8
  49. data/lib/iiif_print/data/fileset_helper.rb +1 -1
  50. data/lib/iiif_print/data/work_derivatives.rb +3 -3
  51. data/lib/iiif_print/engine.rb +7 -13
  52. data/lib/iiif_print/errors.rb +18 -0
  53. data/lib/iiif_print/homepage_search_builder.rb +17 -0
  54. data/lib/iiif_print/image_tool.rb +12 -8
  55. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +74 -33
  56. data/lib/iiif_print/jobs/create_relationships_job.rb +80 -31
  57. data/lib/iiif_print/jobs/request_split_pdf_job.rb +31 -0
  58. data/lib/iiif_print/lineage_service.rb +29 -8
  59. data/lib/iiif_print/metadata.rb +67 -48
  60. data/lib/iiif_print/split_pdfs/base_splitter.rb +142 -0
  61. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +68 -32
  62. data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
  63. data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +33 -0
  64. data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
  65. data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
  66. data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
  67. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
  68. data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
  69. data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
  70. data/lib/iiif_print/version.rb +1 -1
  71. data/lib/iiif_print.rb +167 -12
  72. data/lib/samvera/derivatives/configuration.rb +83 -0
  73. data/lib/samvera/derivatives/hyrax.rb +129 -0
  74. data/lib/samvera/derivatives.rb +238 -0
  75. data/spec/factories/newspaper_page_solr_document.rb +9 -1
  76. data/spec/fixtures/authorities/licenses.yml +4 -0
  77. data/spec/fixtures/authorities/rights_statements.yml +4 -0
  78. data/spec/iiif_print/base_derivative_service_spec.rb +20 -3
  79. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +11 -3
  80. data/spec/iiif_print/catalog_search_builder_spec.rb +1 -1
  81. data/spec/iiif_print/configuration_spec.rb +141 -15
  82. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +7 -2
  83. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +110 -9
  84. data/spec/iiif_print/lineage_service_spec.rb +1 -1
  85. data/spec/iiif_print/metadata_spec.rb +157 -23
  86. data/spec/iiif_print/split_pdfs/base_splitter_spec.rb +27 -0
  87. data/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb +80 -0
  88. data/spec/iiif_print/split_pdfs/destroy_pdf_child_works_service_spec.rb +92 -0
  89. data/spec/iiif_print/split_pdfs/pages_to_jpgs_splitter_spec.rb +22 -0
  90. data/spec/iiif_print/split_pdfs/pages_to_pngs_splitter_spec.rb +18 -0
  91. data/spec/iiif_print/split_pdfs/pages_to_tiffs_splitter_spec.rb +19 -0
  92. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +2 -2
  93. data/spec/iiif_print_spec.rb +125 -5
  94. data/spec/models/iiif_print/iiif_search_decorator_spec.rb +27 -0
  95. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +51 -0
  96. data/spec/samvera/derivatives/configuration_spec.rb +41 -0
  97. data/spec/samvera/derivatives/hyrax_spec.rb +62 -0
  98. data/spec/samvera/derivatives_spec.rb +54 -0
  99. data/spec/services/iiif_print/derivative_rodeo_service_spec.rb +103 -0
  100. data/spec/services/iiif_print/manifest_builder_service_behavior_spec.rb +20 -0
  101. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +8 -11
  102. data/spec/test_app_templates/lib/generators/test_app_generator.rb +1 -1
  103. data/tasks/copy_authorities_to_test_app.rake +11 -0
  104. data/tasks/iiif_print_dev.rake +4 -4
  105. metadata +123 -35
  106. data/app/helpers/hyrax/iiif_helper.rb +0 -22
  107. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
  108. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
@@ -3,8 +3,128 @@ module IiifPrint
3
3
  module IiifManifestPresenterBehavior
4
4
  extend ActiveSupport::Concern
5
5
 
6
+ # Extending the presenter to the base url which includes the protocol.
7
+ # We need the base url to render the facet links and normalize the interface.
8
+ attr_accessor :base_url
9
+
10
+ def manifest_metadata
11
+ # ensure we are using a SolrDocument
12
+ @manifest_metadata ||= IiifPrint.manifest_metadata_from(work: model.solr_document, presenter: self)
13
+ end
14
+
6
15
  def search_service
7
16
  Rails.application.routes.url_helpers.solr_document_iiif_search_url(id, host: hostname)
8
17
  end
18
+
19
+ # OVERRIDE: Hyrax 3x, avoid nil returning to IIIF Manifest gem
20
+ # @see https://github.com/samvera/iiif_manifest/blob/c408f90eba11bef908796c7236ba6bcf8d687acc/lib/iiif_manifest/v3/manifest_builder/record_property_builder.rb#L28
21
+ ##
22
+ # @return [Array<Hash{String => String}>]
23
+ def sequence_rendering
24
+ Array(try(:rendering_ids)).map do |file_set_id|
25
+ rendering = file_set_presenters.find { |p| p.id == file_set_id }
26
+ return [] unless rendering
27
+
28
+ { '@id' => Hyrax::Engine.routes.url_helpers.download_url(rendering.id, host: hostname),
29
+ 'format' => rendering.mime_type.presence || I18n.t("hyrax.manifest.unknown_mime_text"),
30
+ 'label' => I18n.t("hyrax.manifest.download_text") + (rendering.label || '') }
31
+ end.flatten
32
+ end
33
+
34
+ # OVERRIDE: Hyrax v3.x
35
+ module DisplayImagePresenterBehavior
36
+ # Extending the presenter to the base url which includes the protocol.
37
+ # We need the base url to render the facet links and normalize the interface.
38
+ attr_accessor :base_url
39
+
40
+ # Extending this class because there is an #ability= but not #ability and this definition
41
+ # mirrors the Hyrax::IiifManifestPresenter#ability.
42
+ def ability
43
+ @ability ||= NullAbility.new
44
+ end
45
+
46
+ def display_image
47
+ return nil unless latest_file_id
48
+ return nil unless model.image?
49
+ return nil unless IiifPrint.config.default_iiif_manifest_version == 2
50
+
51
+ IIIFManifest::DisplayImage
52
+ .new(display_image_url(hostname),
53
+ format: image_format(alpha_channels),
54
+ width: width,
55
+ height: height,
56
+ iiif_endpoint: iiif_endpoint(latest_file_id, base_url: hostname))
57
+ end
58
+
59
+ # OVERRIDE: IIIF Hyrax AV v0.2 #display_content for prez 3 manifests
60
+ def display_content
61
+ return nil unless latest_file_id
62
+ return super unless model.image?
63
+
64
+ IIIFManifest::V3::DisplayContent
65
+ .new(display_image_url(hostname),
66
+ format: image_format(alpha_channels),
67
+ width: width,
68
+ height: height,
69
+ type: 'Image',
70
+ iiif_endpoint: iiif_endpoint(latest_file_id, base_url: hostname))
71
+ end
72
+
73
+ def display_image_url(base_url)
74
+ if ENV['EXTERNAL_IIIF_URL'].present?
75
+ # At the moment we are only concerned about Hyrax's default image url builder
76
+ iiif_image_url_builder(url_builder: Hyrax.config.iiif_image_url_builder)
77
+ else
78
+ super
79
+ end
80
+ end
81
+
82
+ def iiif_endpoint(file_id, base_url: request.base_url)
83
+ if ENV['EXTERNAL_IIIF_URL'].present?
84
+ IIIFManifest::IIIFEndpoint.new(
85
+ File.join(ENV['EXTERNAL_IIIF_URL'], file_id),
86
+ profile: Hyrax.config.iiif_image_compliance_level_uri
87
+ )
88
+ else
89
+ super
90
+ end
91
+ end
92
+
93
+ def hostname
94
+ @hostname || 'localhost'
95
+ end
96
+
97
+ ##
98
+ # @return [Boolean] false
99
+ def work?
100
+ false
101
+ end
102
+
103
+ private
104
+
105
+ def latest_file_id
106
+ if ENV['EXTERNAL_IIIF_URL'].present?
107
+ external_latest_file_id
108
+ else
109
+ super
110
+ end
111
+ end
112
+
113
+ def external_latest_file_id
114
+ @latest_file_id ||= digest_sha1
115
+ end
116
+
117
+ def iiif_image_url_builder(url_builder:)
118
+ args = [
119
+ latest_file_id,
120
+ ENV['EXTERNAL_IIIF_URL'],
121
+ Hyrax.config.iiif_image_size_default
122
+ ]
123
+ # In Hyrax 3, Hyrax.config.iiif_image_url_builder takes an additional argument
124
+ args << image_format(alpha_channels) if url_builder.arity == 4
125
+
126
+ url_builder.call(*args).gsub(%r{images/}, '')
127
+ end
128
+ end
9
129
  end
10
130
  end
@@ -14,7 +14,7 @@ module IiifPrint
14
14
  presenter_class.for(solr_doc)
15
15
  elsif Hyrax.config.curation_concerns.include?(solr_doc.hydra_model)
16
16
  # look up file set ids and loop through those
17
- file_set_docs = load_file_set_docs(solr_doc.file_set_ids)
17
+ file_set_docs = load_file_set_docs(solr_doc.try(:file_set_ids) || solr_doc.try(:[], 'file_set_ids_ssim'))
18
18
  file_set_docs.map { |doc| presenter_class.for(doc) } if file_set_docs.length
19
19
  end
20
20
  end.flatten.compact
@@ -4,26 +4,35 @@ module IiifPrint
4
4
  module WorkShowPresenterDecorator
5
5
  delegate :file_set_ids, to: :solr_document
6
6
 
7
- # OVERRIDE Hyrax 2.9.6 to remove check for representative_presenter.image? and allow
8
- # a fallback to check for images on the child works
7
+ # OVERRIDE Hyrax 2.9.6 to remove check for representative_presenter.image?
9
8
  # @return [Boolean] render a IIIF viewer
10
9
  def iiif_viewer?
11
- parent_work_has_files? || child_work_has_files?
10
+ Hyrax.config.iiif_image_server? &&
11
+ representative_id.present? &&
12
+ representative_presenter.present? &&
13
+ members_include_viewable_image?
12
14
  end
13
15
 
14
16
  alias universal_viewer? iiif_viewer?
15
17
 
16
18
  private
17
19
 
18
- def parent_work_has_files?
19
- Hyrax.config.iiif_image_server? &&
20
- representative_id.present? &&
21
- representative_presenter.present? &&
22
- members_include_viewable_image?
20
+ # overriding Hyrax to include file sets for both work and child works (file set ids include both)
21
+ # process each id, short-circuiting the loop once one true value is found. This speeds up the test
22
+ # by not loading more member_presenters than needed.
23
+ def members_include_viewable_image?
24
+ all_member_ids = (solr_document.try(:file_set_ids) || solr_document.try(:[], 'file_set_ids_ssim'))
25
+ Array.wrap(all_member_ids).each do |id|
26
+ return true if file_type_and_permissions_valid?(member_presenters_for([id]).first)
27
+ end
28
+ false
23
29
  end
24
30
 
25
- def child_work_has_files?
26
- file_set_ids.present?
31
+ # This method allows for overriding to add additional file types to mix in with IiifAv
32
+ # TODO: add configuration setting for file types to loop through so an override is unneeded.
33
+ def file_type_and_permissions_valid?(presenter)
34
+ current_ability.can?(:read, presenter.id) &&
35
+ (presenter.try(:image?) || presenter.try(:solr_document).try(:image?))
27
36
  end
28
37
  end
29
38
  end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module IiifPrint
4
+ module AllinsonFlexFields
5
+ def include_allinson_flex_fields(solr_parameters)
6
+ return unless defined?(AllinsonFlex)
7
+
8
+ query_fields = solr_parameters[:qf].split(' ') + IiifPrint.allinson_flex_fields
9
+ .each_with_object([]) do |field, arr|
10
+ arr << (field.name + '_tesim') if field.is_a?(AllinsonFlex::ProfileProperty)
11
+ end
12
+ solr_parameters[:qf] = query_fields.uniq.join(' ')
13
+ end
14
+ end
15
+ end
@@ -6,9 +6,10 @@ module IiifPrint
6
6
  def highlight_search_params(solr_parameters = {})
7
7
  return unless solr_parameters[:q] || solr_parameters[:all_fields]
8
8
  solr_parameters[:hl] = true
9
- solr_parameters[:'hl.fl'] = 'all_text_tsimv'
9
+ solr_parameters[:'hl.fl'] = '*'
10
10
  solr_parameters[:'hl.fragsize'] = 100
11
11
  solr_parameters[:'hl.snippets'] = 5
12
+ solr_parameters[:'hl.requiredFieldMatch'] = true
12
13
  end
13
14
  end
14
15
  end
@@ -0,0 +1,382 @@
1
+ module IiifPrint
2
+ ##
3
+ # This class implements the interface of a Hyrax::DerivativeService.
4
+ #
5
+ # That means three important methods are:
6
+ #
7
+ # - {#valid?}
8
+ # - {#create_derivatives}
9
+ # - {#cleanup_derivatives}
10
+ #
11
+ # And the object initializes with a FileSet.
12
+ #
13
+ # It is a companion to {IiifPrint::PluggableDerivativeService}.
14
+ #
15
+ # @see https://github.com/samvera/hyrax/blob/main/app/services/hyrax/derivative_service.rb Hyrax::DerivativesService
16
+ # rubocop:disable Metrics/ClassLength
17
+ class DerivativeRodeoService
18
+ ##
19
+ # @!group Class Attributes
20
+ #
21
+ # @!attribute parent_work_identifier_property_name [r|w]
22
+ # @return [String] the property we use to identify the unique identifier of the parent work as
23
+ # it went through the SpaceStone pre-process.
24
+ #
25
+ # @todo The default of :aark_id is a quick hack for adventist. By exposing a configuration
26
+ # value, my hope is that this becomes easier to configure.
27
+ # @api public
28
+ class_attribute :parent_work_identifier_property_name, default: 'aark_id'
29
+
30
+ ##
31
+ # @!attribute preprocessed_location_adapter_name [r|w]
32
+ # @return [String] The name of a derivative rodeo storage location; this must be
33
+ # registered with the DerivativeRodeo::StorageLocations::BaseLocation.
34
+ # @api public
35
+ class_attribute :preprocessed_location_adapter_name, default: 's3'
36
+
37
+ ##
38
+ # @!attribute named_derivatives_and_generators_by_type [r|w]
39
+ # @return [Hash<Symbol, #constantize>] the named derivative and its associated generator.
40
+ # The "name" is important for Hyrax or IIIF Print implementations. The generator is
41
+ # one that exists in the DerivativeRodeo.
42
+ #
43
+ # @example
44
+ # # In this case there are two changes:
45
+ # # 1. Do not use the DerivativeRodeo to process PDFs; instead fallback to another
46
+ # # applicable service.
47
+ # # 2. For Images, we will use the DerivativeRodeo but will only generate the thumbnail.
48
+ # # We will skip the JSON, XML, and TXT for an image.
49
+ # #
50
+ # # NOTE: Changing the behavior in this way may create broken assumptions in Hyrax.
51
+ # IiifPrint::DerivativeRodeoService.named_derivatives_and_generators_by_type =
52
+ # { image: { thumbnail: "DerivativeRodeo::Generators::ThumbnailGenerator" } }
53
+ #
54
+ # @todo Could be nice to have a registry for the DerivativeRodeo::Generators; but that's a
55
+ # tomorrow wish.
56
+ # @api public
57
+ class_attribute(:named_derivatives_and_generators_by_type, default: {
58
+ pdf: {
59
+ thumbnail: "DerivativeRodeo::Generators::ThumbnailGenerator"
60
+ },
61
+ image: {
62
+ thumbnail: "DerivativeRodeo::Generators::ThumbnailGenerator",
63
+ json: "DerivativeRodeo::Generators::WordCoordinatesGenerator",
64
+ xml: "DerivativeRodeo::Generators::AltoGenerator",
65
+ txt: "DerivativeRodeo::Generators::PlainTextGenerator"
66
+ }
67
+ })
68
+
69
+ ##
70
+ # @!attribute named_derivatives_and_generators_filter [r|w]
71
+ # @return [#call] with three named parameters: :file_set, :filename, :named_derivatives_and_generators
72
+ #
73
+ # - :file_set is a {FileSet}
74
+ # - :filename is a String
75
+ # - :named_derivatives_and_generators is an entry from
76
+ # {.named_derivatives_and_generators_by_type} as pulled from
77
+ # {#named_derivatives_and_generators}
78
+ #
79
+ # The lambda is responsible for filtering any named generators that should or should not
80
+ # be run. It should return a data structure similar to the provided
81
+ # :named_derivatives_and_generators
82
+ #
83
+ # @example
84
+ # # The following configured filter will skip thumbnail generation for any files that
85
+ # # end in '.tn.jpg'
86
+ # IiifPrint::DerivativeRodeoService.named_derivatives_and_generators_filter =
87
+ # ->(file_set:, filename:, named_derivatives_and_generators:) do
88
+ # named_derivatives_and_generators.reject do |named_derivative, generators|
89
+ # named_derivative == :thumbnail && filename.downcase.ends_with?('.tn.jpg')
90
+ # end
91
+ # end
92
+ #
93
+ # @see .named_derivatives_and_generators_by_type
94
+ # @see #named_derivatives_and_generators
95
+ # @api public
96
+ # rubocop:disable Lint/UnusedBlockArgument
97
+ class_attribute(:named_derivatives_and_generators_filter,
98
+ default: ->(file_set:, filename:, named_derivatives_and_generators:) { named_derivatives_and_generators })
99
+
100
+ # rubocop:enable Lint/UnusedBlockArgument
101
+ # @!endgroup Class Attributes
102
+ ##
103
+
104
+ ##
105
+ # @see .named_derivatives_and_generators_by_type
106
+ #
107
+ # @return [Hash<Symbol,String>] The named derivative types and their corresponding generators.
108
+ # @raise [IiifPrint::UnexpectedMimeTypeError] when the {#file_set}'s {#mime_type} is not one
109
+ # that is part of {.named_derivatives_and_generators_by_type}
110
+ def named_derivatives_and_generators
111
+ @named_derivatives_and_generators ||=
112
+ if file_set.class.pdf_mime_types.include?(mime_type)
113
+ named_derivatives_and_generators_by_type.fetch(:pdf).deep_dup
114
+ elsif file_set.class.image_mime_types.include?(mime_type)
115
+ named_derivatives_and_generators_by_type.fetch(:image).deep_dup
116
+ else
117
+ raise UnexpectedMimeTypeError.new(file_set: file_set, mime_type: mime_type)
118
+ end
119
+ end
120
+
121
+ ##
122
+ # This method encodes some existing assumptions about the URI based on implementations for
123
+ # Adventist. Those are reasonable assumptions but time will tell how reasonable.
124
+ #
125
+ # By convention, this method is returning output_location of the SpaceStone::Serverless
126
+ # processing. We might know the original location that SpaceStone::Serverless processed, but
127
+ # that seems to be a tenuous assumption.
128
+ #
129
+ # In other words, where would SpaceStone, by convention, have written the original file and by
130
+ # convention written that original file's derivatives.
131
+ #
132
+ # TODO: We also need to account for PDF splitting
133
+ #
134
+ # @param file_set [FileSet]
135
+ # @param filename [String]
136
+ # @param extension [String]
137
+ # @param adapter_name [String] Added as a parameter to make testing just a bit easier. See
138
+ # {.preprocessed_location_adapter_name}
139
+ #
140
+ # @return [String] when we have a possible candidate.
141
+ # @return [NilClass] when we could not derive a candidate.
142
+ # rubocop:disable Metrics/MethodLength
143
+ def self.derivative_rodeo_uri(file_set:, filename: nil, extension: nil, adapter_name: preprocessed_location_adapter_name)
144
+ # TODO: This is a hack that knows about the inner workings of Hydra::Works, but for
145
+ # expediency, I'm using it. See
146
+ # https://github.com/samvera/hydra-works/blob/c9b9dd0cf11de671920ba0a7161db68ccf9b7f6d/lib/hydra/works/services/add_file_to_file_set.rb#L49-L53
147
+ filename ||= Hydra::Works::DetermineOriginalName.call(file_set.original_file)
148
+
149
+ dirname = derivative_rodeo_preprocessed_directory_for(file_set: file_set, filename: filename)
150
+ return nil unless dirname
151
+
152
+ # The aforementioned filename and the following basename and extension are here to allow for
153
+ # us to take an original file and see if we've pre-processed the derivative file. In the
154
+ # pre-processed derivative case, that would mean we have a different extension than the
155
+ # original.
156
+ extension ||= File.extname(filename)
157
+ extension = ".#{extension}" unless extension.start_with?(".")
158
+
159
+ # We want to strip off the extension of the given filename.
160
+ basename = File.basename(filename, File.extname(filename))
161
+
162
+ # TODO: What kinds of exceptions might we raise if the location is not configured? Do we need
163
+ # to "validate" it in another step.
164
+ location = DerivativeRodeo::StorageLocations::BaseLocation.load_location(adapter_name)
165
+
166
+ File.join(location.adapter_prefix, dirname, "#{basename}#{extension}")
167
+ end
168
+ # rubocop:enable Metrics/MethodLength
169
+
170
+ ##
171
+ # @api public
172
+ #
173
+ # Figure out the ancestor type and ancestor
174
+ def self.get_ancestor(filename: nil, file_set:)
175
+ # In the case of a page split from a PDF, we need to know the grandparent's identifier to
176
+ # find the file(s) in the DerivativeRodeo.
177
+ if DerivativeRodeo::Generators::PdfSplitGenerator.filename_for_a_derived_page_from_a_pdf?(filename: filename)
178
+ [IiifPrint.grandparent_for(file_set), :grandparent]
179
+ else
180
+ [IiifPrint.parent_for(file_set), :parent]
181
+ end
182
+ end
183
+
184
+ ##
185
+ # @api public
186
+ #
187
+ # @note You may find yourself wanting to override this method. Please do if you find a better
188
+ # way to do this.
189
+ #
190
+ # By convention, we're putting the files of a work in a "directory" that is based on some
191
+ # identifying value (e.g. an object's AARK ID) of the work.
192
+ #
193
+ # Because we split PDFs (see {IiifPrint::SplitPdfs::DerivativeRodeoSplitter}) we need to consider
194
+ # that we may be working on the PDF (and that FileSet is directly associated with the work) or
195
+ # we are working on one of the pages ripped from the PDF (and the FileSet's work is a to be
196
+ # related child work of the original work).
197
+ #
198
+ # @param file_set [FileSet]
199
+ # @param filename [String]
200
+ # @return [String] the dirname (without any "/" we hope)
201
+ # @return [NilClass] when we cannot infer a URI from the object.
202
+ # rubocop:disable Metrics/MethodLength
203
+ def self.derivative_rodeo_preprocessed_directory_for(file_set:, filename:)
204
+ ancestor, ancestor_type = get_ancestor(filename: filename, file_set: file_set)
205
+
206
+ # Why might we not have an ancestor? In the case of grandparent_for, we may not yet have run
207
+ # the create relationships job. We could sneak a peek in the table to maybe glean some insight.
208
+ # However, read further the `else` clause to see the novel approach.
209
+ # rubocop:disable Style/GuardClause
210
+ if ancestor
211
+ message = "#{self.class}.#{__method__} #{file_set.class} ID=#{file_set.id} and filename: #{filename.inspect}" \
212
+ "has #{ancestor_type} of #{ancestor.class} ID=#{ancestor.id}"
213
+ Rails.logger.info(message)
214
+ parent_work_identifier = ancestor.public_send(parent_work_identifier_property_name)
215
+ return parent_work_identifier if parent_work_identifier.present?
216
+ Rails.logger.warn("Expected #{ancestor.class} ID=#{ancestor.id} (#{ancestor_type} of #{file_set.class} ID=#{file_set.id}) " \
217
+ "to have a present #{parent_work_identifier_property_name.inspect}")
218
+ nil
219
+ else
220
+ # HACK: This makes critical assumptions about how we're creating the title for the file_set;
221
+ # but we don't have much to fall-back on. Consider making this a configurable function. Or
222
+ # perhaps this entire method should be more configurable.
223
+ # TODO: Revisit this implementation.
224
+ candidate = file_set.title.first.split(".").first
225
+ return candidate if candidate.present?
226
+ nil
227
+ end
228
+ # rubocop:enable Style/GuardClause
229
+ end
230
+ # rubocop:enable Metrics/MethodLength
231
+
232
+ def initialize(file_set)
233
+ @file_set = file_set
234
+ end
235
+
236
+ attr_reader :file_set
237
+ delegate :uri, :mime_type, to: :file_set
238
+
239
+ ##
240
+ # @return [Boolean] true when the file set's mime type is one this service supports
241
+ # @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb#L18-L20 Hyrax::FileSetDerivativesService#valid?
242
+ def valid?
243
+ if in_the_rodeo?
244
+ Rails.logger.info("Using the DerivativeRodeo for FileSet ID=#{file_set.id} with mime_type of #{mime_type}")
245
+ true
246
+ else
247
+ Rails.logger.info("Skipping the DerivativeRodeo for FileSet ID=#{file_set.id} with mime_type of #{mime_type}")
248
+ false
249
+ end
250
+ end
251
+
252
+ ##
253
+ # @api public
254
+ #
255
+ # The file_set.class.*_mime_types are carried over from Hyrax.
256
+ #
257
+ # @note We write derivatives to the {#absolute_derivative_path_for} and should likewise clean
258
+ # them up when deleted.
259
+ # @see #cleanup_derivatives
260
+ #
261
+ # @param filename [String]
262
+ #
263
+ # @see .named_derivatives_and_generators_filter
264
+ # @see #named_derivatives_and_generators
265
+ def create_derivatives(filename)
266
+ named_derivatives_and_generators_filter
267
+ .call(file_set: file_set, filename: filename, named_derivatives_and_generators: named_derivatives_and_generators)
268
+ .flat_map do |named_derivative, generator_name|
269
+ lasso_up_some_derivatives(
270
+ named_derivative: named_derivative,
271
+ generator_name: generator_name,
272
+ filename: filename
273
+ )
274
+ end
275
+ end
276
+
277
+ # We need to clean up the derivatives that we created.
278
+ #
279
+ # @see #create_derivatives
280
+ #
281
+ # @note Due to the configurability and plasticity of the named derivatives, it is possible that
282
+ # when we created the derivatives, we had a different configuration (e.g. were we to
283
+ # create derivatives again, we might get a set of different files). So we must ask
284
+ # ourselves, is it important to clean up all derivatives (even ones that may not be in
285
+ # scope for this service) or to clean up only those presently in scope? I am favoring
286
+ # removing all of them. In part because of the nature of the valid derivative service.
287
+ def cleanup_derivatives
288
+ ## Were we to only delete the derivatives that this service presently creates, this would be
289
+ ## that code:
290
+ #
291
+ # named_derivatives_and_generators.keys.each do |named_derivative|
292
+ # path = absolute_derivative_path_for(named_derivative)
293
+ # FileUtils.rm_f(path) if File.exist?(path)
294
+ # end
295
+
296
+ ## Instead, let's clean it all up.
297
+ Hyrax::DerivativePath.derivatives_for_reference(file_set).each do |path|
298
+ FileUtils.rm_f(path) if File.exist?(path)
299
+ end
300
+ end
301
+
302
+ private
303
+
304
+ def absolute_derivative_path_for(named_derivative:)
305
+ Hyrax::DerivativePath.derivative_path_for_reference(file_set, named_derivative.to_s)
306
+ end
307
+
308
+ # rubocop:disable Metrics/MethodLength
309
+ def lasso_up_some_derivatives(filename:, named_derivative:, generator_name:)
310
+ # TODO: Can we use the filename instead of the antics of the original_file on the file_set?
311
+ # We have the filename in create_derivatives.
312
+
313
+ # This is the location that Hyrax expects us to put files that will be added to Fedora.
314
+ output_location_template = "file://#{absolute_derivative_path_for(named_derivative: named_derivative)}"
315
+
316
+ # The generator knows the output extensions.
317
+ generator = generator_name.constantize
318
+
319
+ # This is the location where we hope the derivative rodeo will have generated the derived
320
+ # file (e.g. a PDF page's txt file or an image's thumbnail).
321
+ preprocessed_location_template = self.class.derivative_rodeo_uri(file_set: file_set, filename: filename, extension: generator.output_extension)
322
+
323
+ begin
324
+ generator.new(
325
+ input_uris: [input_uri],
326
+ preprocessed_location_template: preprocessed_location_template,
327
+ output_location_template: output_location_template
328
+ ).generated_files.first.file_path
329
+ rescue => e
330
+ message = "#{generator}#generated_files encountered `#{e.class}' “#{e}” for " \
331
+ "input_uri: #{input_uri.inspect}, " \
332
+ "output_location_template: #{output_location_template.inspect}, and " \
333
+ "preprocessed_location_template: #{preprocessed_location_template.inspect}."
334
+ exception = RuntimeError.new(message)
335
+ exception.set_backtrace(e.backtrace)
336
+ # Why this additional logging? Because you may splice in a different logger for the
337
+ # Rodeo, and having this information might be helpful as you try to debug a very woolly
338
+ # operation.
339
+ DerivativeRodeo.logger.error(message)
340
+ raise exception
341
+ end
342
+ end
343
+ # rubocop:enable Metrics/MethodLength
344
+
345
+ def supported_mime_types
346
+ # If we've configured the rodeo
347
+ named_derivatives_and_generators_by_type.keys.flat_map { |type| file_set.class.public_send("#{type}_mime_types") }
348
+ end
349
+
350
+ # Where can we find the "original" file that we want to operate on?
351
+ #
352
+ # @return [String]
353
+ def input_uri
354
+ return @input_uri if defined?(@input_uri)
355
+
356
+ # TODO: I've built up logic to use the derivative_rodeo_uri, however what if we don't need to
357
+ # look at that location? If not there, then we need to look to the file associated with the
358
+ # file set.
359
+ # QUESTION: Should we skip using the derivative rodeo uri as a candidate for the input_uri?
360
+ input_uri = self.class.derivative_rodeo_uri(file_set: file_set)
361
+ location = DerivativeRodeo::StorageLocations::BaseLocation.from_uri(input_uri)
362
+ @input_uri = if location.exist?
363
+ input_uri
364
+ elsif file_set.import_url.present?
365
+ file_set.import_url
366
+ else
367
+ # TODO: This is the fedora URL representing the file we uploaded; is that adequate? Will we
368
+ # have access to this file?
369
+ file_set.original_file.uri.to_s
370
+ end
371
+ end
372
+
373
+ def in_the_rodeo?
374
+ # We can assume that we are not going to have pre-processed an unsupported mime type. We
375
+ # could check if the original file is in the rodeo, but the way it's designed the rodeo is
376
+ # capable of generating all of the enumerated derivatives (see
377
+ # .named_derivatives_and_generators_by_type) for the supported mime type.
378
+ supported_mime_types.include?(mime_type)
379
+ end
380
+ end
381
+ # rubocop:enable Metrics/ClassLength
382
+ end