iiif_print 1.0.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (181) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
  4. data/.github/workflows/build-lint-test-action.yaml +4 -5
  5. data/.gitignore +5 -4
  6. data/.rubocop.yml +1 -0
  7. data/.solargraph.yml +19 -0
  8. data/Gemfile.lock +1025 -0
  9. data/README.md +102 -9
  10. data/Rakefile +6 -0
  11. data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
  12. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
  13. data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
  14. data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
  15. data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
  16. data/app/helpers/iiif_print_helper.rb +0 -20
  17. data/app/indexers/concerns/iiif_print/child_work_indexer.rb +27 -0
  18. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +45 -17
  19. data/{lib → app/jobs}/iiif_print/jobs/application_job.rb +2 -1
  20. data/app/jobs/iiif_print/jobs/child_works_from_pdf_job.rb +153 -0
  21. data/app/jobs/iiif_print/jobs/create_relationships_job.rb +117 -0
  22. data/app/jobs/iiif_print/jobs/request_split_pdf_job.rb +31 -0
  23. data/app/listeners/iiif_print/listener.rb +31 -0
  24. data/app/models/concerns/iiif_print/set_child_flag.rb +10 -1
  25. data/app/models/concerns/iiif_print/solr/document.rb +19 -3
  26. data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
  27. data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
  28. data/app/models/iiif_print/pending_relationship.rb +3 -0
  29. data/app/presenters/iiif_print/file_set_presenter_decorator.rb +11 -0
  30. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
  31. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
  32. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +23 -11
  33. data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
  34. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
  35. data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
  36. data/app/services/iiif_print/manifest_builder_service_behavior.rb +90 -31
  37. data/app/services/iiif_print/pluggable_derivative_service.rb +8 -10
  38. data/app/services/iiif_print/simple_schema_loader_decorator.rb +11 -0
  39. data/app/transactions/hyrax/transactions/iiif_print_container_decorator.rb +34 -0
  40. data/app/transactions/hyrax/transactions/steps/conditionally_destroy_children_from_split.rb +32 -0
  41. data/app/transactions/hyrax/transactions/steps/delete_all_file_sets_decorator.rb +35 -0
  42. data/app/views/catalog/_index_header_list_default.html.erb +13 -0
  43. data/app/views/hyrax/base/_representative_media.html.erb +4 -3
  44. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
  45. data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
  46. data/config/initializers/simple_schema_loader.rb +1 -0
  47. data/config/locales/iiif_print.en.yml +4 -0
  48. data/config/metadata/child_works_from_pdf_splitting.yaml +21 -0
  49. data/config/routes.rb +3 -0
  50. data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +8 -6
  51. data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +7 -5
  52. data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +8 -6
  53. data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
  54. data/docker-compose.yml +2 -2
  55. data/iiif_print.gemspec +11 -10
  56. data/lib/generators/iiif_print/install_generator.rb +21 -1
  57. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
  58. data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
  59. data/lib/iiif_print/base_derivative_service.rb +14 -2
  60. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +58 -6
  61. data/lib/iiif_print/catalog_search_builder.rb +7 -3
  62. data/lib/iiif_print/configuration.rb +205 -8
  63. data/lib/iiif_print/data/fileset_helper.rb +3 -3
  64. data/lib/iiif_print/data/work_derivatives.rb +4 -4
  65. data/lib/iiif_print/engine.rb +53 -15
  66. data/lib/iiif_print/errors.rb +18 -0
  67. data/lib/iiif_print/homepage_search_builder.rb +17 -0
  68. data/lib/iiif_print/image_tool.rb +12 -8
  69. data/lib/iiif_print/jp2_derivative_service.rb +4 -1
  70. data/lib/iiif_print/lineage_service.rb +47 -13
  71. data/lib/iiif_print/metadata.rb +67 -48
  72. data/lib/iiif_print/pdf_derivative_service.rb +3 -1
  73. data/lib/iiif_print/persistence_layer/active_fedora_adapter.rb +189 -0
  74. data/lib/iiif_print/persistence_layer/valkyrie_adapter.rb +183 -0
  75. data/lib/iiif_print/persistence_layer.rb +118 -0
  76. data/lib/iiif_print/split_pdfs/base_splitter.rb +153 -0
  77. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +83 -37
  78. data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
  79. data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +22 -0
  80. data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
  81. data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
  82. data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
  83. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
  84. data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
  85. data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
  86. data/lib/iiif_print/text_extraction_derivative_service.rb +4 -2
  87. data/lib/iiif_print/text_formats_from_alto_service.rb +3 -1
  88. data/lib/iiif_print/tiff_derivative_service.rb +3 -1
  89. data/lib/iiif_print/version.rb +1 -1
  90. data/lib/iiif_print.rb +210 -20
  91. data/lib/samvera/derivatives/configuration.rb +83 -0
  92. data/lib/samvera/derivatives/hyrax.rb +129 -0
  93. data/lib/samvera/derivatives.rb +238 -0
  94. data/tasks/copy_authorities_to_test_app.rake +11 -0
  95. data/tasks/iiif_print_dev.rake +4 -4
  96. metadata +111 -196
  97. data/app/helpers/hyrax/iiif_helper.rb +0 -22
  98. data/app/indexers/concerns/iiif_print/child_indexer.rb +0 -34
  99. data/app/views/hyrax/file_sets/_actions.html.erb +0 -45
  100. data/bin/rails +0 -13
  101. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +0 -107
  102. data/lib/iiif_print/jobs/create_relationships_job.rb +0 -78
  103. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
  104. data/spec/.keep.txt +0 -1
  105. data/spec/factories/ability.rb +0 -6
  106. data/spec/factories/newspaper_issue.rb +0 -7
  107. data/spec/factories/newspaper_page.rb +0 -7
  108. data/spec/factories/newspaper_page_solr_document.rb +0 -12
  109. data/spec/factories/newspaper_title.rb +0 -8
  110. data/spec/factories/uploaded_pdf_file.rb +0 -9
  111. data/spec/factories/uploaded_txt_file.rb +0 -9
  112. data/spec/factories/user.rb +0 -13
  113. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  114. data/spec/fixtures/files/4.1.07.tiff +0 -0
  115. data/spec/fixtures/files/README.md +0 -7
  116. data/spec/fixtures/files/alto-2-0.xsd +0 -714
  117. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  118. data/spec/fixtures/files/credits.md +0 -16
  119. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  120. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  121. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  122. data/spec/fixtures/files/minimal-alto.xml +0 -31
  123. data/spec/fixtures/files/ndnp-alto-sample.xml +0 -24
  124. data/spec/fixtures/files/ndnp-sample1-json.json +0 -1
  125. data/spec/fixtures/files/ndnp-sample1-txt.txt +0 -1
  126. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  127. data/spec/fixtures/files/ocr_alto.xml +0 -202
  128. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +0 -202
  129. data/spec/fixtures/files/ocr_color.tiff +0 -0
  130. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  131. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  132. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  133. data/spec/fixtures/files/ocr_mono_text_hocr.html +0 -78
  134. data/spec/fixtures/files/page1.tiff +0 -0
  135. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  136. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  137. data/spec/fixtures/files/thumbnail.jpg +0 -0
  138. data/spec/helpers/hyrax/iiif_helper_spec.rb +0 -65
  139. data/spec/helpers/iiif_print_helper_spec.rb +0 -43
  140. data/spec/iiif_print/base_derivative_service_spec.rb +0 -11
  141. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +0 -51
  142. data/spec/iiif_print/catalog_search_builder_spec.rb +0 -60
  143. data/spec/iiif_print/configuration_spec.rb +0 -67
  144. data/spec/iiif_print/data/work_derivatives_spec.rb +0 -245
  145. data/spec/iiif_print/data/work_file_spec.rb +0 -99
  146. data/spec/iiif_print/data/work_files_spec.rb +0 -237
  147. data/spec/iiif_print/image_tool_spec.rb +0 -109
  148. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +0 -30
  149. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +0 -17
  150. data/spec/iiif_print/jp2_image_metadata_spec.rb +0 -37
  151. data/spec/iiif_print/lineage_service_spec.rb +0 -13
  152. data/spec/iiif_print/metadata_spec.rb +0 -115
  153. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
  154. data/spec/iiif_print/text_extraction/alto_reader_spec.rb +0 -49
  155. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +0 -45
  156. data/spec/iiif_print/text_extraction/page_ocr_spec.rb +0 -84
  157. data/spec/iiif_print/text_extraction/render_alto_spec.rb +0 -54
  158. data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +0 -44
  159. data/spec/iiif_print_spec.rb +0 -51
  160. data/spec/misc_shared.rb +0 -111
  161. data/spec/models/iiif_print/derivative_attachment_spec.rb +0 -37
  162. data/spec/models/iiif_print/ingest_file_relation_spec.rb +0 -56
  163. data/spec/models/solr_document_spec.rb +0 -14
  164. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +0 -19
  165. data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +0 -49
  166. data/spec/services/iiif_print/jp2_derivative_service_spec.rb +0 -59
  167. data/spec/services/iiif_print/pdf_derivative_service_spec.rb +0 -66
  168. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +0 -178
  169. data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +0 -82
  170. data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +0 -127
  171. data/spec/services/iiif_print/tiff_derivative_service_spec.rb +0 -65
  172. data/spec/spec_helper.rb +0 -181
  173. data/spec/support/controller_level_helpers.rb +0 -28
  174. data/spec/support/iiif_print_models.rb +0 -127
  175. data/spec/test_app_templates/blacklight.yml +0 -9
  176. data/spec/test_app_templates/fedora.yml +0 -15
  177. data/spec/test_app_templates/lib/generators/test_app_generator.rb +0 -40
  178. data/spec/test_app_templates/redis.yml +0 -9
  179. data/spec/test_app_templates/solr/conf/schema.xml +0 -362
  180. data/spec/test_app_templates/solr/conf/solrconfig.xml +0 -322
  181. data/spec/test_app_templates/solr.yml +0 -7
@@ -0,0 +1,382 @@
1
+ module IiifPrint
2
+ ##
3
+ # This class implements the interface of a Hyrax::DerivativeService.
4
+ #
5
+ # That means three important methods are:
6
+ #
7
+ # - {#valid?}
8
+ # - {#create_derivatives}
9
+ # - {#cleanup_derivatives}
10
+ #
11
+ # And the object initializes with a FileSet.
12
+ #
13
+ # It is a companion to {IiifPrint::PluggableDerivativeService}.
14
+ #
15
+ # @see https://github.com/samvera/hyrax/blob/main/app/services/hyrax/derivative_service.rb Hyrax::DerivativeService
16
+ # rubocop:disable Metrics/ClassLength
17
+ class DerivativeRodeoService
18
+ ##
19
+ # @!group Class Attributes
20
+ #
21
+ # @!attribute parent_work_identifier_property_name [r|w]
22
+ # @return [String] the property we use to identify the unique identifier of the parent work as
23
+ # it went through the SpaceStone pre-process.
24
+ #
25
+ # @todo The default of :aark_id is a quick hack for adventist. By exposing a configuration
26
+ # value, my hope is that this becomes easier to configure.
27
+ # @api public
28
+ class_attribute :parent_work_identifier_property_name, default: 'aark_id'
29
+
30
+ ##
31
+ # @!attribute preprocessed_location_adapter_name [r|w]
32
+ # @return [String] The name of a derivative rodeo storage location; this must be
33
+ # registered with the DerivativeRodeo::StorageLocations::BaseLocation.
34
+ # @api public
35
+ class_attribute :preprocessed_location_adapter_name, default: 's3'
36
+
37
+ ##
38
+ # @!attribute named_derivatives_and_generators_by_type [r|w]
39
+ # @return [Hash<Symbol, #constantize>] the named derivative and its associated generator.
40
+ # The "name" is important for Hyrax or IIIF Print implementations. The generator is
41
+ # one that exists in the DerivativeRodeo.
42
+ #
43
+ # @example
44
+ # # In this case there are two changes:
45
+ # # 1. Do not use the DerivativeRodeo to process PDFs; instead fallback to another
46
+ # # applicable service.
47
+ # # 2. For Images, we will use the DerivativeRodeo but will only generate the thumbnail.
48
+ # # We will skip the JSON, XML, and TXT for an image.
49
+ # #
50
+ # # NOTE: Changing the behavior in this way may create broken assumptions in Hyrax.
51
+ # IiifPrint::DerivativeRodeoService.named_derivatives_and_generators_by_type =
52
+ # { image: { thumbnail: "DerivativeRodeo::Generators::ThumbnailGenerator" } }
53
+ #
54
+ # @todo Could be nice to have a registry for the DerivativeRodeo::Generators; but that's a
55
+ # tomorrow wish.
56
+ # @api public
57
+ class_attribute(:named_derivatives_and_generators_by_type, default: {
58
+ pdf: {
59
+ thumbnail: "DerivativeRodeo::Generators::ThumbnailGenerator"
60
+ },
61
+ image: {
62
+ thumbnail: "DerivativeRodeo::Generators::ThumbnailGenerator",
63
+ json: "DerivativeRodeo::Generators::WordCoordinatesGenerator",
64
+ xml: "DerivativeRodeo::Generators::AltoGenerator",
65
+ txt: "DerivativeRodeo::Generators::PlainTextGenerator"
66
+ }
67
+ })
68
+
69
+ ##
70
+ # @!attribute named_derivatives_and_generators_filter [r|w]
71
+ # @return [#call] with three named parameters: :file_set, :filename, :named_derivatives_and_generators
72
+ #
73
+ # - :file_set is a {FileSet}
74
+ # - :filename is a String
75
+ # - :named_derivatives_and_generators is an entry from
76
+ # {.named_derivatives_and_generators_by_type} as pulled from
77
+ # {#named_derivatives_and_generators}
78
+ #
79
+ # The lambda is responsible for filtering any named generators that should or should not
80
+ # be run. It should return a data structure similar to the provided
81
+ # :named_derivatives_and_generators
82
+ #
83
+ # @example
84
+ # # The following configured filter will skip thumbnail generation for any files that
85
+ # # end in '.tn.jpg'
86
+ # IiifPrint::DerivativeRodeoService.named_derivatives_and_generators_filter =
87
+ # ->(file_set:, filename:, named_derivatives_and_generators:) do
88
+ # named_derivatives_and_generators.reject do |named_derivative, generators|
89
+ # named_derivative == :thumbnail && filename.downcase.ends_with?('.tn.jpg')
90
+ # end
91
+ # end
92
+ #
93
+ # @see .named_derivatives_and_generators_by_type
94
+ # @see #named_derivatives_and_generators
95
+ # @api public
96
+ # rubocop:disable Lint/UnusedBlockArgument
97
+ class_attribute(:named_derivatives_and_generators_filter,
98
+ default: ->(file_set:, filename:, named_derivatives_and_generators:) { named_derivatives_and_generators })
99
+
100
+ # rubocop:enable Lint/UnusedBlockArgument
101
+ # @!endgroup Class Attributes
102
+ ##
103
+
104
+ ##
105
+ # @see .named_derivatives_and_generators_by_type
106
+ #
107
+ # @return [Hash<Symbol, String>] The named derivative types and their corresponding generators.
108
+ # @raise [IiifPrint::UnexpectedMimeTypeError] when the {#file_set}'s {#mime_type} is not one
109
+ # that is part of {.named_derivatives_and_generators_by_type}
110
+ def named_derivatives_and_generators
111
+ @named_derivatives_and_generators ||=
112
+ if file_set.class.pdf_mime_types.include?(mime_type)
113
+ named_derivatives_and_generators_by_type.fetch(:pdf).deep_dup
114
+ elsif file_set.class.image_mime_types.include?(mime_type)
115
+ named_derivatives_and_generators_by_type.fetch(:image).deep_dup
116
+ else
117
+ raise UnexpectedMimeTypeError.new(file_set: file_set, mime_type: mime_type)
118
+ end
119
+ end
120
+
121
+ ##
122
+ # This method encodes some existing assumptions about the URI based on implementations for
123
+ # Adventist. Those are reasonable assumptions but time will tell how reasonable.
124
+ #
125
+ # By convention, this method is returning output_location of the SpaceStone::Serverless
126
+ # processing. We might know the original location that SpaceStone::Serverless processed, but
127
+ # that seems to be a tenuous assumption.
128
+ #
129
+ # In other words, where would SpaceStone, by convention, have written the original file and by
130
+ # convention written that original file's derivatives.
131
+ #
132
+ # TODO: We also need to account for PDF splitting
133
+ #
134
+ # @param file_set [FileSet]
135
+ # @param filename [String]
136
+ # @param extension [String]
137
+ # @param adapter_name [String] Added as a parameter to make testing just a bit easier. See
138
+ # {.preprocessed_location_adapter_name}
139
+ #
140
+ # @return [String] when we have a possible candidate.
141
+ # @return [NilClass] when we could not derive a candidate.
142
+ # rubocop:disable Metrics/MethodLength
143
+ def self.derivative_rodeo_uri(file_set:, filename: nil, extension: nil, adapter_name: preprocessed_location_adapter_name)
144
+ # TODO: This is a hack that knows about the inner workings of Hydra::Works, but for
145
+ # expediency, I'm using it. See
146
+ # https://github.com/samvera/hydra-works/blob/c9b9dd0cf11de671920ba0a7161db68ccf9b7f6d/lib/hydra/works/services/add_file_to_file_set.rb#L49-L53
147
+ filename ||= Hydra::Works::DetermineOriginalName.call(file_set.original_file)
148
+
149
+ dirname = derivative_rodeo_preprocessed_directory_for(file_set: file_set, filename: filename)
150
+ return nil unless dirname
151
+
152
+ # The aforementioned filename and the following basename and extension are here to allow for
153
+ # us to take an original file and see if we've pre-processed the derivative file. In the
154
+ # pre-processed derivative case, that would mean we have a different extension than the
155
+ # original.
156
+ extension ||= File.extname(filename)
157
+ extension = ".#{extension}" unless extension.start_with?(".")
158
+
159
+ # We want to strip off the extension of the given filename.
160
+ basename = File.basename(filename, File.extname(filename))
161
+
162
+ # TODO: What kinds of exceptions might we raise if the location is not configured? Do we need
163
+ # to "validate" it in another step.
164
+ location = DerivativeRodeo::StorageLocations::BaseLocation.load_location(adapter_name)
165
+
166
+ File.join(location.adapter_prefix, dirname, "#{basename}#{extension}")
167
+ end
168
+ # rubocop:enable Metrics/MethodLength
169
+
170
+ ##
171
+ # @api public
172
+ #
173
+ # Figure out the ancestor type and ancestor
174
+ def self.get_ancestor(filename: nil, file_set:)
175
+ # In the case of a page split from a PDF, we need to know the grandparent's identifier to
176
+ # find the file(s) in the DerivativeRodeo.
177
+ if DerivativeRodeo::Generators::PdfSplitGenerator.filename_for_a_derived_page_from_a_pdf?(filename: filename)
178
+ [IiifPrint.grandparent_for(file_set), :grandparent]
179
+ else
180
+ [IiifPrint.parent_for(file_set), :parent]
181
+ end
182
+ end
183
+
184
+ ##
185
+ # @api public
186
+ #
187
+ # @note You may find yourself wanting to override this method. Please do if you find a better
188
+ # way to do this.
189
+ #
190
+ # By convention, we're putting the files of a work in a "directory" that is based on some
191
+ # identifying value (e.g. an object's AARK ID) of the work.
192
+ #
193
+ # Because we split PDFs (see {IiifPrint::SplitPdfs::DerivativeRodeoSplitter}) we need to consider
194
+ # that we may be working on the PDF (and that FileSet is directly associated with the work) or
195
+ # we are working on one of the pages ripped from the PDF (and the FileSet's work is a to be
196
+ # related child work of the original work).
197
+ #
198
+ # @param file_set [FileSet]
199
+ # @param filename [String]
200
+ # @return [String] the dirname (without any "/" we hope)
201
+ # @return [NilClass] when we cannot infer a URI from the object.
202
+ # rubocop:disable Metrics/MethodLength
203
+ def self.derivative_rodeo_preprocessed_directory_for(file_set:, filename:)
204
+ ancestor, ancestor_type = get_ancestor(filename: filename, file_set: file_set)
205
+
206
+ # Why might we not have an ancestor? In the case of grandparent_for, we may not yet have run
207
+ # the create relationships job. We could sneak a peek in the table to maybe glean some insight.
208
+ # However, read further the `else` clause to see the novel approach.
209
+ # rubocop:disable Style/GuardClause
210
+ if ancestor
211
+ message = "#{self.class}.#{__method__} #{file_set.class} ID=#{file_set.id} and filename: #{filename.inspect}" \
212
+ "has #{ancestor_type} of #{ancestor.class} ID=#{ancestor.id}"
213
+ Rails.logger.info(message)
214
+ parent_work_identifier = ancestor.public_send(parent_work_identifier_property_name)
215
+ return parent_work_identifier if parent_work_identifier.present?
216
+ Rails.logger.warn("Expected #{ancestor.class} ID=#{ancestor.id} (#{ancestor_type} of #{file_set.class} ID=#{file_set.id}) " \
217
+ "to have a present #{parent_work_identifier_property_name.inspect}")
218
+ nil
219
+ else
220
+ # HACK: This makes critical assumptions about how we're creating the title for the file_set;
221
+ # but we don't have much to fall-back on. Consider making this a configurable function. Or
222
+ # perhaps this entire method should be more configurable.
223
+ # TODO: Revisit this implementation.
224
+ candidate = file_set.title.first.split(".").first
225
+ return candidate if candidate.present?
226
+ nil
227
+ end
228
+ # rubocop:enable Style/GuardClause
229
+ end
230
+ # rubocop:enable Metrics/MethodLength
231
+
232
+ def initialize(file_set)
233
+ @file_set = file_set
234
+ end
235
+
236
+ attr_reader :file_set
237
+ delegate :uri, :mime_type, to: :file_set
238
+
239
+ ##
240
+ # @return [Boolean] true when the file set's mime type is one this service supports; false otherwise.
241
+ # @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb#L18-L20 Hyrax::FileSetDerivativesService#valid?
242
+ def valid?
243
+ if in_the_rodeo?
244
+ Rails.logger.info("Using the DerivativeRodeo for FileSet ID=#{file_set.id} with mime_type of #{mime_type}")
245
+ true
246
+ else
247
+ Rails.logger.info("Skipping the DerivativeRodeo for FileSet ID=#{file_set.id} with mime_type of #{mime_type}")
248
+ false
249
+ end
250
+ end
251
+
252
+ ##
253
+ # @api public
254
+ #
255
+ # The file_set.class.*_mime_types are carried over from Hyrax.
256
+ #
257
+ # @note We write derivatives to the {#absolute_derivative_path_for} and should likewise clean
258
+ # them up when deleted.
259
+ # @see #cleanup_derivatives
260
+ #
261
+ # @param filename [String]
262
+ #
263
+ # @see .named_derivatives_and_generators_filter
264
+ # @see #named_derivatives_and_generators
265
+ def create_derivatives(filename)
266
+ named_derivatives_and_generators_filter
267
+ .call(file_set: file_set, filename: filename, named_derivatives_and_generators: named_derivatives_and_generators)
268
+ .flat_map do |named_derivative, generator_name|
269
+ lasso_up_some_derivatives(
270
+ named_derivative: named_derivative,
271
+ generator_name: generator_name,
272
+ filename: filename
273
+ )
274
+ end
275
+ end
276
+
277
+ # We need to clean up the derivatives that we created.
278
+ #
279
+ # @see #create_derivatives
280
+ #
281
+ # @note Due to the configurability and plasticity of the named derivatives, it is possible that
282
+ # when we created the derivatives, we had a different configuration (e.g. were we to
283
+ # create derivatives again, we might get a set of different files). So we must ask
284
+ # ourselves, is it important to clean up all derivatives (even ones that may not be in
285
+ # scope for this service) or to clean up only those presently in scope? I am favoring
286
+ # removing all of them. In part because of the nature of the valid derivative service.
287
+ def cleanup_derivatives
288
+ ## Were we to only delete the derivatives that this service presently creates, this would be
289
+ ## that code:
290
+ #
291
+ # named_derivatives_and_generators.keys.each do |named_derivative|
292
+ # path = absolute_derivative_path_for(named_derivative)
293
+ # FileUtils.rm_f(path) if File.exist?(path)
294
+ # end
295
+
296
+ ## Instead, let's clean it all up.
297
+ Hyrax::DerivativePath.derivatives_for_reference(file_set).each do |path|
298
+ FileUtils.rm_f(path) if File.exist?(path)
299
+ end
300
+ end
301
+
302
+ private
303
+
304
+ def absolute_derivative_path_for(named_derivative:)
305
+ Hyrax::DerivativePath.derivative_path_for_reference(file_set, named_derivative.to_s)
306
+ end
307
+
308
+ # rubocop:disable Metrics/MethodLength
309
+ def lasso_up_some_derivatives(filename:, named_derivative:, generator_name:)
310
+ # TODO: Can we use the filename instead of the antics of the original_file on the file_set?
311
+ # We have the filename in create_derivatives.
312
+
313
+ # This is the location that Hyrax expects us to put files that will be added to Fedora.
314
+ output_location_template = "file://#{absolute_derivative_path_for(named_derivative: named_derivative)}"
315
+
316
+ # The generator knows the output extensions.
317
+ generator = generator_name.constantize
318
+
319
+ # This is the location where we hope the derivative rodeo will have generated the derived
320
+ # file (e.g. a PDF page's txt file or an image's thumbnail).
321
+ preprocessed_location_template = self.class.derivative_rodeo_uri(file_set: file_set, filename: filename, extension: generator.output_extension)
322
+
323
+ begin
324
+ generator.new(
325
+ input_uris: [input_uri],
326
+ preprocessed_location_template: preprocessed_location_template,
327
+ output_location_template: output_location_template
328
+ ).generated_files.first.file_path
329
+ rescue => e
330
+ message = "#{generator}#generated_files encountered `#{e.class}' “#{e}” for " \
331
+ "input_uri: #{input_uri.inspect}, " \
332
+ "output_location_template: #{output_location_template.inspect}, and " \
333
+ "preprocessed_location_template: #{preprocessed_location_template.inspect}."
334
+ exception = RuntimeError.new(message)
335
+ exception.set_backtrace(e.backtrace)
336
+ # Why this additional logging? Because you may splice in a different logger for the
337
+ # Rodeo, and having this information might be helpful as you try to debug a very woolly
338
+ # operation.
339
+ DerivativeRodeo.logger.error(message)
340
+ raise exception
341
+ end
342
+ end
343
+ # rubocop:enable Metrics/MethodLength
344
+
345
+ def supported_mime_types
346
+ # If we've configured the rodeo
347
+ named_derivatives_and_generators_by_type.keys.flat_map { |type| file_set.class.public_send("#{type}_mime_types") }
348
+ end
349
+
350
+ # Where can we find the "original" file that we want to operate on?
351
+ #
352
+ # @return [String]
353
+ def input_uri
354
+ return @input_uri if defined?(@input_uri)
355
+
356
+ # TODO: I've built up logic to use the derivative_rodeo_uri, however what if we don't need to
357
+ # look at that location? If not there, then we need to look to the file associated with the
358
+ # file set.
359
+ # QUESTION: Should we skip using the derivative rodeo uri as a candidate for the input_uri?
360
+ input_uri = self.class.derivative_rodeo_uri(file_set: file_set)
361
+ location = DerivativeRodeo::StorageLocations::BaseLocation.from_uri(input_uri)
362
+ @input_uri = if location.exist?
363
+ input_uri
364
+ elsif file_set.import_url.present?
365
+ file_set.import_url
366
+ else
367
+ # TODO: This is the fedora URL representing the file we uploaded; is that adequate? Will we
368
+ # have access to this file?
369
+ file_set.original_file.uri.to_s
370
+ end
371
+ end
372
+
373
+ def in_the_rodeo?
374
+ # We can assume that we are not going to have pre-processed an unsupported mime type. We
375
+ # could check if the original file is in the rodeo, but the way it's designed the rodeo is
376
+ # capable of generating all of the enumerated derivatives (see
377
+ # .named_derivatives_and_generators_by_type) for the supported mime type.
378
+ supported_mime_types.include?(mime_type)
379
+ end
380
+ end
381
+ # rubocop:enable Metrics/ClassLength
382
+ end
@@ -1,14 +1,20 @@
1
1
  module IiifPrint
2
+ # rubocop:disable Metrics/ModuleLength
2
3
  module ManifestBuilderServiceBehavior
3
4
  def initialize(*args,
4
5
  version: IiifPrint.config.default_iiif_manifest_version,
5
6
  iiif_manifest_factory: iiif_manifest_factory_for(version),
6
7
  &block)
7
- super(*args, iiif_manifest_factory: iiif_manifest_factory, &block)
8
+ # Ensure we're setting the version before we go any further.
8
9
  @version = version.to_i
10
+ @child_works = nil
11
+ super(*args, iiif_manifest_factory: iiif_manifest_factory, &block)
9
12
  end
10
13
 
14
+ attr_reader :child_works, :version
15
+
11
16
  def manifest_for(presenter:)
17
+ @child_works = get_solr_hits(member_ids_for(presenter))
12
18
  build_manifest(presenter: presenter)
13
19
  end
14
20
 
@@ -36,62 +42,115 @@ module IiifPrint
36
42
  # ManifestFactory interface?
37
43
  manifest = manifest_factory.new(presenter).to_h
38
44
  hash = JSON.parse(manifest.to_json)
39
- hash = send("sanitize_v#{@version}", hash: hash, presenter: presenter)
40
- send("sorted_canvases_v#{@version}", hash: hash, sort_field: IiifPrint.config.sort_iiif_manifest_canvases_by)
45
+ parent_and_child_solr_hits = parent_and_child_solr_hits(presenter) if @child_works.present?
46
+ hash = send("sanitize_v#{@version}", hash: hash, presenter: presenter, solr_doc_hits: parent_and_child_solr_hits)
47
+ if @child_works.present? && IiifPrint.config.sort_iiif_manifest_canvases_by
48
+ send("sort_canvases_v#{@version}",
49
+ hash: hash,
50
+ sort_field: IiifPrint.config.sort_iiif_manifest_canvases_by)
51
+ end
52
+ hash
41
53
  end
42
54
 
43
- def sanitize_v2(hash:, presenter:)
55
##
# Cleans up a version 2 manifest hash in place.
#
# The manifest label and every canvas label are passed through sanitize_value and then
# HTML-unescaped, the 'description' entry is removed, and each canvas is handed to
# apply_metadata_to_canvas.
#
# @param hash [Hash] the manifest; mutated and returned
# @param presenter [Object] forwarded to apply_metadata_to_canvas
# @param solr_doc_hits [Object] forwarded to apply_metadata_to_canvas
# @return [Hash] the same hash that was passed in
def sanitize_v2(hash:, presenter:, solr_doc_hits:)
  scrub = ->(text) { CGI.unescapeHTML(sanitize_value(text)) }

  hash['label'] = scrub.call(hash['label']) if hash.key?('label')
  # The default description is dropped because it is already part of the metadata fields.
  hash.delete('description')

  hash['sequences']&.each do |sequence|
    sequence['canvases']&.each do |canvas|
      canvas['label'] = scrub.call(canvas['label'])
      apply_metadata_to_canvas(canvas: canvas, presenter: presenter, solr_doc_hits: solr_doc_hits)
    end
  end

  hash
end
54
66
 
55
- def sanitize_v3(hash:, **)
56
- # TODO: flesh out metadata for v3
67
##
# Cleans up a version 3 manifest hash in place.
#
# Every string stored under the 'none' key of the manifest label and of each item's label is
# passed through sanitize_value and then HTML-unescaped; each item is then handed to
# apply_metadata_to_canvas.
#
# @param hash [Hash] the manifest; mutated and returned
# @param presenter [Object] forwarded to apply_metadata_to_canvas
# @param solr_doc_hits [Object] forwarded to apply_metadata_to_canvas
# @return [Hash] the same hash that was passed in
def sanitize_v3(hash:, presenter:, solr_doc_hits:)
  scrub = ->(text) { CGI.unescapeHTML(sanitize_value(text)) }

  # Hash#key looks an entry up by *value*, so guarding with it skipped the manifest label every
  # time. Digging for the label texts covers both a missing 'label' and a missing 'none' entry.
  hash.dig('label', 'none')&.map!(&scrub)

  # Tolerate a manifest without items, as sanitize_v2 tolerates one without sequences.
  hash['items']&.each do |canvas|
    canvas.dig('label', 'none')&.map!(&scrub)
    apply_metadata_to_canvas(canvas: canvas, presenter: presenter, solr_doc_hits: solr_doc_hits)
  end

  hash
end
59
75
 
60
- def apply_v2_metadata_to_canvas(canvas:, presenter:)
61
- solr_docs = get_solr_docs(presenter)
62
- # uses the '@id' property which is a URL that contains the FileSet id
63
- file_set_id = canvas['@id'].split('/').last
76
##
# Sets canvas['metadata'] from the work whose member_ids_ssim contains the canvas's FileSet.
#
# Nothing is changed when no child works are known, when no hit lists the FileSet, or when
# the matching hit is the presenter's own work.
#
# @param canvas [Hash] a manifest canvas; may gain a 'metadata' entry
# @param presenter [#id] the presenter the manifest is being built for
# @param solr_doc_hits [#find] the hits to search; not touched when no child works are known
# @return [Object, nil] the assigned metadata, or nil when the canvas was left alone
def apply_metadata_to_canvas(canvas:, presenter:, solr_doc_hits:)
  # @child_works stays nil until #manifest_for has run, so guard against nil as well as empty.
  return if @child_works.nil? || @child_works.empty?

  # uses the 'id' property for v3 manifest and '@id' for v2, which is a URL that contains the FileSet id
  file_set_id = (canvas['id'] || canvas['@id']).split('/').last
  # finds the image that the FileSet is attached to and creates metadata on that canvas
  image = solr_doc_hits.find { |hit| hit[:member_ids_ssim]&.include?(file_set_id) }
  return unless image
  # prevents duplicating the child and parent metadata
  return if image.id == presenter.id

  canvas['metadata'] = IiifPrint.manifest_metadata_from(work: image, presenter: presenter)
end
89
+
90
+ LARGEST_SORT_ORDER_CHAR = '~'.freeze
91
+
92
##
# Sorts the canvases of a version 2 manifest's first sequence in place.
#
# With :label as the sort field the work is delegated to sort_by_label_v2. Otherwise canvases
# are ordered by the 'value' of the metadata entry whose 'label' equals the rendered label of
# the sort field; canvases without such an entry (or without any metadata) sort by
# LARGEST_SORT_ORDER_CHAR.
#
# @param hash [Hash] the manifest; mutated and returned
# @param sort_field [Symbol] :label, or a field name given to Hyrax::Renderers::AttributeRenderer
# @return [Hash] the same hash that was passed in
def sort_canvases_v2(hash:, sort_field:)
  return sort_by_label_v2(hash) if sort_field == :label

  sort_label = Hyrax::Renderers::AttributeRenderer.new(sort_field, nil).label
  hash.dig('sequences', 0, 'canvases')&.sort_by! do |canvas|
    # Not every canvas is given metadata, so treat a missing list as an empty one.
    entry = Array(canvas['metadata']).find { |metadata| metadata['label'] == sort_label }
    # Always sort on an array so that real values and the fallback stay comparable. The
    # previous fallback used symbol keys and therefore produced a nil sort key.
    values = Array(entry && entry['value'])
    values.empty? ? [LARGEST_SORT_ORDER_CHAR] : values
  end
  hash
end
71
105
 
72
- def sorted_canvases_v2(hash:, sort_field:)
106
##
# Sorts the items of a version 3 manifest in place.
#
# Items are ordered by the values stored under 'value' => 'none' of the metadata entry whose
# label, for the current I18n locale, equals the rendered label of the sort field. Items
# without such an entry (or without any metadata) sort by LARGEST_SORT_ORDER_CHAR.
#
# @param hash [Hash] the manifest; mutated and returned
# @param sort_field [Symbol] a field name given to Hyrax::Renderers::AttributeRenderer
# @return [Hash] the same hash that was passed in
def sort_canvases_v3(hash:, sort_field:)
  sort_label = Hyrax::Renderers::AttributeRenderer.new(sort_field, nil).label
  locale = I18n.locale.to_s # looked up once rather than for every item

  hash['items']&.sort_by! do |item|
    # Not every item is given metadata, so treat a missing list as an empty one.
    entry = Array(item['metadata']).find { |metadata| metadata.dig('label', locale) == [sort_label] }
    # The previous fallback was keyed with symbols but read with strings, which raised for
    # any item lacking the sort field; fall back to the sentinel directly instead.
    values = Array(entry&.dig('value', 'none'))
    values.empty? ? [LARGEST_SORT_ORDER_CHAR] : values
  end
  hash
end
82
117
 
83
- def sorted_canvases_v3(hash:, **)
84
- # TODO: flesh out metadata for v3
118
# TODO: implement this for v3
##
# Sorts the canvases of a version 2 manifest's first sequence in place by their 'label'.
#
# @param hash [Hash] the manifest; mutated and returned
# @return [Hash] the same hash that was passed in
def sort_by_label_v2(hash)
  canvases = hash.dig('sequences', 0, 'canvases')
  canvases&.sort_by! { |canvas| canvas['label'] }
  hash
end
125
 
88
- def get_solr_docs(presenter)
89
- parent_id = [presenter._source['id']]
90
- child_ids = presenter._source['member_ids_ssim']
91
- parent_id_and_child_ids = parent_id + child_ids
92
- query = ActiveFedora::SolrQueryBuilder.construct_query_for_ids(parent_id_and_child_ids)
93
- solr_hits = ActiveFedora::SolrService.query(query, fq: "-has_model_ssim:FileSet", rows: 100_000)
94
- solr_hits.map { |solr_hit| ::SolrDocument.new(solr_hit) }
126
##
# Picks the member ids to look up for a presenter: its ordered_ids when that yields a truthy
# value, otherwise its member_ids.
#
# @param presenter [#try] asked for :ordered_ids and then :member_ids
# @return [Array, Object] the ids found, or an empty array when neither lookup yields any
def member_ids_for(presenter)
  ids = presenter.try(:ordered_ids) || presenter.try(:member_ids)
  return [] if ids.nil?

  ids
end
130
+
131
##
# Combines the hits found for the presenter's own id with the already-loaded @child_works.
#
# @param presenter [#id]
# @return [Array] the presenter's hits followed by the child works
def parent_and_child_solr_hits(presenter)
  parent_hits = get_solr_hits([presenter.id])
  parent_hits + @child_works
end
134
+
135
+ SOLR_QUERY_PAGE_SIZE = 512
136
##
# Gathers the Solr hits for the given ids, excluding FileSets.
#
# The ids are queried in slices of SOLR_QUERY_PAGE_SIZE so that no single query grows past
# the limit of 1024 that Solr imposes (presumably its boolean-clause limit; confirm against
# the Solr configuration).
#
# @param ids [Array]
# @return [Array<ActiveFedora::SolrHit>] the hits of every slice, in slice order
def get_solr_hits(ids)
  ids.each_slice(SOLR_QUERY_PAGE_SIZE).each_with_object([]) do |paged_ids, hits|
    page = IiifPrint.solr_query(
      "id:(#{paged_ids.join(' OR ')})",
      fq: "-has_model_ssim:FileSet",
      rows: paged_ids.size,
      method: :post
    )
    hits.concat(page)
  end
end
96
154
  end
155
+ # rubocop:enable Metrics/ModuleLength
97
156
  end
@@ -27,7 +27,11 @@ class IiifPrint::PluggableDerivativeService
27
27
  class_attribute :derivative_path_factory, default: Hyrax::DerivativePath
28
28
 
29
29
##
# @param file_set [Object] a file set, or a Hyrax::FileMetadata whose file set is then looked
#   up through Hyrax.query_service
# @param plugins [Class, Array<Class>] the plugin class(es) to consider; defaults to
#   plugins_for(file_set)
def initialize(file_set, plugins: plugins_for(file_set))
  @file_set = if file_set.is_a?(Hyrax::FileMetadata)
                Hyrax.query_service.find_by(id: file_set.file_set_id)
              else
                file_set
              end
  @plugins = Array.wrap(plugins)
  # Build from the wrapped list so that a single plugin class is accepted as well as an array.
  # Each plugin is instantiated with the argument exactly as it was given to this method.
  @valid_plugins = @plugins.map { |plugin| plugin.new(file_set) }.select(&:valid?)
end
@@ -39,7 +43,7 @@ class IiifPrint::PluggableDerivativeService
39
43
  # multiple plugins, some of which may or may not be valid, so
40
44
  # validity checks happen within as well.
41
45
##
# @return [Boolean] true when at least one plugin is in valid_plugins
def valid?
  return false if valid_plugins.empty?

  true
end
44
48
 
45
49
  # get derivative services relevant to method name and file_set context
@@ -105,16 +109,10 @@ class IiifPrint::PluggableDerivativeService
105
109
  # set would use. That "possibility" is based on the work. Later, we will check the plugin's
106
110
  # "valid?" which would now look at the specific file_set for validity.
107
111
##
# Works out which plugins could apply to a file set, based on its parent.
#
# @param file_set [Object] handed to IiifPrint.parent_for
# @return [Array] the default plugins alone when there is no parent or the parent does not
#   respond to iiif_print_config; otherwise the parent's configured plugins followed by the
#   defaults, flattened, with nils and duplicates removed
def plugins_for(file_set)
  parent = IiifPrint.parent_for(file_set)
  configurable = !parent.nil? && parent.respond_to?(:iiif_print_config)
  return Array(default_plugins) unless configurable

  configured = parent.iiif_print_config.derivative_service_plugins
  (configured + Array(default_plugins)).flatten.compact.uniq
end
120
118
  end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ # OVERRIDE Hyrax v5.0.0rc2 to add schemas that are located in config/metadata/*.yaml
4
+
5
module IiifPrint
  ##
  # Extends the configuration search paths with the root of the IiifPrint engine.
  module SimpleSchemaLoaderDecorator
    ##
    # @return [Array] the inherited search paths followed by IiifPrint::Engine.root
    def config_search_paths
      inherited_paths = super
      inherited_paths + [IiifPrint::Engine.root]
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
module Hyrax
  module Transactions
    ##
    # Registers IiifPrint's entries in the "file_set" namespace of a transaction container:
    #
    # - "file_set.iiif_print_conditionally_destroy_spawned_children", which builds a
    #   {Steps::ConditionallyDestroyChildrenFromSplit}; and
    # - "file_set.destroy", a {Hyrax::Transactions::FileSetDestroy} that runs that step ahead
    #   of {Hyrax::Transactions::FileSetDestroy::DEFAULT_STEPS}.
    #
    # Running the extra step first corresponds to the behavior of
    # {IiifPrint::Actors::FileSetActorDecorator#destroy}.
    #
    # For more information about adjusting transactions, see
    # [Transitioning workshop solution for adding transaction](https://github.com/samvera-labs/transitioning-to-valkyrie-workshop/commit/bcab2bb8f65078e88395c68f72be00e7ffad57ec)
    #
    # @see https://github.com/samvera/hyrax/blob/f875d61dc87229cf1f05eb2bb6d414b5ef314616/lib/hyrax/transactions/container.rb
    class IiifPrintContainerDecorator
      extend Dry::Container::Mixin

      namespace 'file_set' do |ops|
        ops.register('iiif_print_conditionally_destroy_spawned_children') do
          Steps::ConditionallyDestroyChildrenFromSplit.new
        end

        ops.register('destroy') do
          destroy_steps = ['file_set.iiif_print_conditionally_destroy_spawned_children'] +
                          Hyrax::Transactions::FileSetDestroy::DEFAULT_STEPS
          Hyrax::Transactions::FileSetDestroy.new(steps: destroy_steps)
        end
      end
    end
  end
end
33
+
34
+ "Hyrax::Transactions::Container".safe_constantize&.merge(Hyrax::Transactions::IiifPrintContainerDecorator)
@@ -0,0 +1,32 @@
1
module Hyrax
  module Transactions
    module Steps
      ##
      # For a FileSet that is a PDF, we need to delete any works and file_sets that are the
      # result of splitting that PDF into constituent images of each page of the PDF. This step
      # is responsible for that work.
      class ConditionallyDestroyChildrenFromSplit
        include Dry::Monads[:result]

        ##
        # @param resource [Hyrax::FileSet]
        # @param user [Object, nil] passed on to the destroy service
        # @return [Dry::Monads::Result] Failure(:resource_not_persisted) when the resource is
        #   not persisted; otherwise Success wrapping the resource
        def call(resource, user: nil)
          return Failure(:resource_not_persisted) unless resource.persisted?

          parent = IiifPrint.persistence_adapter.parent_for(resource)
          # Without a parent there is nothing to clean up. Still succeed with the resource (not
          # `true`), matching the success value at the end of this method.
          # NOTE(review): a transaction is expected to hand this value to its next step; confirm.
          return Success(resource) unless parent

          # We do not care about the results of this call, as it is conditionally looking for
          # things to destroy.
          IiifPrint::SplitPdfs::DestroyPdfChildWorksService.conditionally_destroy_spawned_children_of(
            file_set: resource,
            work: parent,
            user: user
          )

          Success(resource)
        end
      end
    end
  end
end