iiif_print 1.0.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (181) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
  4. data/.github/workflows/build-lint-test-action.yaml +4 -5
  5. data/.gitignore +5 -4
  6. data/.rubocop.yml +1 -0
  7. data/.solargraph.yml +19 -0
  8. data/Gemfile.lock +1025 -0
  9. data/README.md +102 -9
  10. data/Rakefile +6 -0
  11. data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
  12. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
  13. data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
  14. data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
  15. data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
  16. data/app/helpers/iiif_print_helper.rb +0 -20
  17. data/app/indexers/concerns/iiif_print/child_work_indexer.rb +27 -0
  18. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +45 -17
  19. data/{lib → app/jobs}/iiif_print/jobs/application_job.rb +2 -1
  20. data/app/jobs/iiif_print/jobs/child_works_from_pdf_job.rb +153 -0
  21. data/app/jobs/iiif_print/jobs/create_relationships_job.rb +117 -0
  22. data/app/jobs/iiif_print/jobs/request_split_pdf_job.rb +31 -0
  23. data/app/listeners/iiif_print/listener.rb +31 -0
  24. data/app/models/concerns/iiif_print/set_child_flag.rb +10 -1
  25. data/app/models/concerns/iiif_print/solr/document.rb +19 -3
  26. data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
  27. data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
  28. data/app/models/iiif_print/pending_relationship.rb +3 -0
  29. data/app/presenters/iiif_print/file_set_presenter_decorator.rb +11 -0
  30. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
  31. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
  32. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +23 -11
  33. data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
  34. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
  35. data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
  36. data/app/services/iiif_print/manifest_builder_service_behavior.rb +90 -31
  37. data/app/services/iiif_print/pluggable_derivative_service.rb +8 -10
  38. data/app/services/iiif_print/simple_schema_loader_decorator.rb +11 -0
  39. data/app/transactions/hyrax/transactions/iiif_print_container_decorator.rb +34 -0
  40. data/app/transactions/hyrax/transactions/steps/conditionally_destroy_children_from_split.rb +32 -0
  41. data/app/transactions/hyrax/transactions/steps/delete_all_file_sets_decorator.rb +35 -0
  42. data/app/views/catalog/_index_header_list_default.html.erb +13 -0
  43. data/app/views/hyrax/base/_representative_media.html.erb +4 -3
  44. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
  45. data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
  46. data/config/initializers/simple_schema_loader.rb +1 -0
  47. data/config/locales/iiif_print.en.yml +4 -0
  48. data/config/metadata/child_works_from_pdf_splitting.yaml +21 -0
  49. data/config/routes.rb +3 -0
  50. data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +8 -6
  51. data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +7 -5
  52. data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +8 -6
  53. data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
  54. data/docker-compose.yml +2 -2
  55. data/iiif_print.gemspec +11 -10
  56. data/lib/generators/iiif_print/install_generator.rb +21 -1
  57. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
  58. data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
  59. data/lib/iiif_print/base_derivative_service.rb +14 -2
  60. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +58 -6
  61. data/lib/iiif_print/catalog_search_builder.rb +7 -3
  62. data/lib/iiif_print/configuration.rb +205 -8
  63. data/lib/iiif_print/data/fileset_helper.rb +3 -3
  64. data/lib/iiif_print/data/work_derivatives.rb +4 -4
  65. data/lib/iiif_print/engine.rb +53 -15
  66. data/lib/iiif_print/errors.rb +18 -0
  67. data/lib/iiif_print/homepage_search_builder.rb +17 -0
  68. data/lib/iiif_print/image_tool.rb +12 -8
  69. data/lib/iiif_print/jp2_derivative_service.rb +4 -1
  70. data/lib/iiif_print/lineage_service.rb +47 -13
  71. data/lib/iiif_print/metadata.rb +67 -48
  72. data/lib/iiif_print/pdf_derivative_service.rb +3 -1
  73. data/lib/iiif_print/persistence_layer/active_fedora_adapter.rb +189 -0
  74. data/lib/iiif_print/persistence_layer/valkyrie_adapter.rb +183 -0
  75. data/lib/iiif_print/persistence_layer.rb +118 -0
  76. data/lib/iiif_print/split_pdfs/base_splitter.rb +153 -0
  77. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +83 -37
  78. data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
  79. data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +22 -0
  80. data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
  81. data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
  82. data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
  83. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
  84. data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
  85. data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
  86. data/lib/iiif_print/text_extraction_derivative_service.rb +4 -2
  87. data/lib/iiif_print/text_formats_from_alto_service.rb +3 -1
  88. data/lib/iiif_print/tiff_derivative_service.rb +3 -1
  89. data/lib/iiif_print/version.rb +1 -1
  90. data/lib/iiif_print.rb +210 -20
  91. data/lib/samvera/derivatives/configuration.rb +83 -0
  92. data/lib/samvera/derivatives/hyrax.rb +129 -0
  93. data/lib/samvera/derivatives.rb +238 -0
  94. data/tasks/copy_authorities_to_test_app.rake +11 -0
  95. data/tasks/iiif_print_dev.rake +4 -4
  96. metadata +111 -196
  97. data/app/helpers/hyrax/iiif_helper.rb +0 -22
  98. data/app/indexers/concerns/iiif_print/child_indexer.rb +0 -34
  99. data/app/views/hyrax/file_sets/_actions.html.erb +0 -45
  100. data/bin/rails +0 -13
  101. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +0 -107
  102. data/lib/iiif_print/jobs/create_relationships_job.rb +0 -78
  103. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
  104. data/spec/.keep.txt +0 -1
  105. data/spec/factories/ability.rb +0 -6
  106. data/spec/factories/newspaper_issue.rb +0 -7
  107. data/spec/factories/newspaper_page.rb +0 -7
  108. data/spec/factories/newspaper_page_solr_document.rb +0 -12
  109. data/spec/factories/newspaper_title.rb +0 -8
  110. data/spec/factories/uploaded_pdf_file.rb +0 -9
  111. data/spec/factories/uploaded_txt_file.rb +0 -9
  112. data/spec/factories/user.rb +0 -13
  113. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  114. data/spec/fixtures/files/4.1.07.tiff +0 -0
  115. data/spec/fixtures/files/README.md +0 -7
  116. data/spec/fixtures/files/alto-2-0.xsd +0 -714
  117. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  118. data/spec/fixtures/files/credits.md +0 -16
  119. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  120. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  121. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  122. data/spec/fixtures/files/minimal-alto.xml +0 -31
  123. data/spec/fixtures/files/ndnp-alto-sample.xml +0 -24
  124. data/spec/fixtures/files/ndnp-sample1-json.json +0 -1
  125. data/spec/fixtures/files/ndnp-sample1-txt.txt +0 -1
  126. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  127. data/spec/fixtures/files/ocr_alto.xml +0 -202
  128. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +0 -202
  129. data/spec/fixtures/files/ocr_color.tiff +0 -0
  130. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  131. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  132. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  133. data/spec/fixtures/files/ocr_mono_text_hocr.html +0 -78
  134. data/spec/fixtures/files/page1.tiff +0 -0
  135. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  136. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  137. data/spec/fixtures/files/thumbnail.jpg +0 -0
  138. data/spec/helpers/hyrax/iiif_helper_spec.rb +0 -65
  139. data/spec/helpers/iiif_print_helper_spec.rb +0 -43
  140. data/spec/iiif_print/base_derivative_service_spec.rb +0 -11
  141. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +0 -51
  142. data/spec/iiif_print/catalog_search_builder_spec.rb +0 -60
  143. data/spec/iiif_print/configuration_spec.rb +0 -67
  144. data/spec/iiif_print/data/work_derivatives_spec.rb +0 -245
  145. data/spec/iiif_print/data/work_file_spec.rb +0 -99
  146. data/spec/iiif_print/data/work_files_spec.rb +0 -237
  147. data/spec/iiif_print/image_tool_spec.rb +0 -109
  148. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +0 -30
  149. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +0 -17
  150. data/spec/iiif_print/jp2_image_metadata_spec.rb +0 -37
  151. data/spec/iiif_print/lineage_service_spec.rb +0 -13
  152. data/spec/iiif_print/metadata_spec.rb +0 -115
  153. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
  154. data/spec/iiif_print/text_extraction/alto_reader_spec.rb +0 -49
  155. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +0 -45
  156. data/spec/iiif_print/text_extraction/page_ocr_spec.rb +0 -84
  157. data/spec/iiif_print/text_extraction/render_alto_spec.rb +0 -54
  158. data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +0 -44
  159. data/spec/iiif_print_spec.rb +0 -51
  160. data/spec/misc_shared.rb +0 -111
  161. data/spec/models/iiif_print/derivative_attachment_spec.rb +0 -37
  162. data/spec/models/iiif_print/ingest_file_relation_spec.rb +0 -56
  163. data/spec/models/solr_document_spec.rb +0 -14
  164. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +0 -19
  165. data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +0 -49
  166. data/spec/services/iiif_print/jp2_derivative_service_spec.rb +0 -59
  167. data/spec/services/iiif_print/pdf_derivative_service_spec.rb +0 -66
  168. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +0 -178
  169. data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +0 -82
  170. data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +0 -127
  171. data/spec/services/iiif_print/tiff_derivative_service_spec.rb +0 -65
  172. data/spec/spec_helper.rb +0 -181
  173. data/spec/support/controller_level_helpers.rb +0 -28
  174. data/spec/support/iiif_print_models.rb +0 -127
  175. data/spec/test_app_templates/blacklight.yml +0 -9
  176. data/spec/test_app_templates/fedora.yml +0 -15
  177. data/spec/test_app_templates/lib/generators/test_app_generator.rb +0 -40
  178. data/spec/test_app_templates/redis.yml +0 -9
  179. data/spec/test_app_templates/solr/conf/schema.xml +0 -362
  180. data/spec/test_app_templates/solr/conf/solrconfig.xml +0 -322
  181. data/spec/test_app_templates/solr.yml +0 -7
data/README.md CHANGED
@@ -35,9 +35,9 @@ IiifPrint supports:
35
35
  * OCR keyword match highlighting
36
36
  * viewer with page navigation and deep zooming
37
37
  * splitting of PDFs to LZW compressed TIFFs for viewing
38
- * configuring how the manifest canvases are sorted in the viewer
39
38
  * adding metadata fields to the manifest with faceted search links and external links
40
39
  * excluding specified work types to be found in the catalog search
40
+ * external IIIF image urls that work with services such as serverless-iiif or Cantaloupe
41
41
 
42
42
  A complete list of features can be found [here](https://github.com/scientist-softserv/iiif_print/wiki/Features-List).
43
43
 
@@ -86,15 +86,43 @@ IiifPrint easily integrates with your Hyrax 2.x applications.
86
86
  * In `config/routes.rb`, it adds `concerns :iiif_search` in the `resources :solr_documents` block
87
87
  * Adds `config/initializers/iiif_print.rb`
88
88
  * Adds three migrations, `CreateIiifPrintDerivativeAttachments`, `CreateIiifPrintIngestFileRelations`, and `CreateIiifPrintPendingRelationships`
89
- * In `solr/conf/schema.xml`, it adds Blacklight IIIF Search autocomplete config
90
- * In `solr/conf/solrconfig.xml`, it adds Blacklight IIIF Search autocomplete config
91
- * Adds `solr/lib/solr-tokenizing_suggester-7.x.jar`
92
89
 
93
90
  (It may be helpful to run `git diff` after installation to see all the changes made by the installer.)
94
91
 
92
+ ## Catalog to Universal Viewer search:
93
+ To enable a feature where the UV automatically picks up the search from the catalog, do the following:
94
+ * Add `highlight: urlDataProvider.get('q'),` into your uv.html in the `<script>` section.
95
+ ```js
96
+ uv = createUV('#uv', {
97
+ root: '.',
98
+ iiifResourceUri: urlDataProvider.get('manifest'),
99
+ configUri: 'uv-config.json',
100
+ collectionIndex: Number(urlDataProvider.get('c', 0)),
101
+ manifestIndex: Number(urlDataProvider.get('m', 0)),
102
+ sequenceIndex: Number(urlDataProvider.get('s', 0)),
103
+ canvasIndex: Number(urlDataProvider.get('cv', 0)),
104
+ rangeId: urlDataProvider.get('rid', 0),
105
+ rotation: Number(urlDataProvider.get('r', 0)),
106
+ xywh: urlDataProvider.get('xywh', ''),
107
+ embedded: true,
108
+ highlight: urlDataProvider.get('q'), // <-- here's a good spot
109
+ locales: formattedLocales
110
+ }, urlDataProvider);
111
+ ```
112
+
113
+ * Make sure to remove your application's `app/helpers/hyrax/iiif_helper.rb` and `app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb` (if they exist)
114
+
95
115
  ## Configuration to enable IiifPrint features
96
116
  **NOTE: WorkTypes and models are used synonymously here.**
97
117
 
118
+ ### Persistence Layer Adapter
119
+
120
+ We created IiifPrint with an assumption of ActiveFedora. However, as Hyrax now supports Valkyrie, we need an alternate approach. We introduced `IiifPrint::Configuration#persistence_layer` as a configuration option. By default it will use `ActiveFedora` methods, but you can switch adapters to use Valkyrie instead. (See `IiifPrint::PersistenceLayer` for more details.)
121
+
122
+ ### IIIF URL configuration
123
+
124
+ If you set EXTERNAL_IIIF_URL in your environment, then IiifPrint will use that URL as the root for your IIIF URLs. It will also switch from using the file set ID to using the SHA1 of the file as the identifier. This enables using serverless-iiif or Cantaloupe (referred to as the service) by pointing the service to the same S3 bucket that FCREPO writes the uploaded files to. By setting it up that way you do not need the service to connect to FCREPO or Hyrax at all; both natively support connecting to an S3 bucket to get their data.
125
+
98
126
  ### Model level configurations
99
127
 
100
128
  In `app/models/{work_type}.rb` add `include IiifPrint.model_configuration` to any work types which require IiifPrint processing features (such as PDF splitting or OCR derivatives). See [lib/iiif_print.rb](./lib/iiif_print.rb) for details on configuration options.
@@ -126,10 +154,6 @@ IiifPrint.config do |config|
126
154
  # Add configurable solr field key for searching, default key is: 'human_readable_type_sim' if
127
155
  # another key is used, make sure to adjust the config.excluded_model_name_solr_field_values to match
128
156
  config.excluded_model_name_solr_field_key = 'some_solr_field_key'
129
-
130
- # Configure how the manifest sorts the canvases, by default it sorts by `:title`, but a different
131
- # model property may be desired such as :date_published
132
- config.sort_iiif_manifest_canvases_by = :date_published
133
157
  end
134
158
  ```
135
159
 
@@ -146,7 +170,7 @@ TO ENABLE OCR Search (from the UV and catalog search)
146
170
  }
147
171
  end
148
172
  ```
149
- * Set `config.search_builder_class = IiifPrint::CatalogSearchBuilder` to remove works from the catalog search results if `is_child_bsi: true`
173
+ * Set `config.search_builder_class = IiifPrint::CatalogSearchBuilder` to remove works from the catalog search results if `is_child_bsi: true`
150
174
  * Ensure that all text search is configured in default_solr_params config block:
151
175
  ```rb
152
176
  config.default_solr_params = {
@@ -156,6 +180,75 @@ TO ENABLE OCR Search (from the UV and catalog search)
156
180
  }
157
181
  ```
158
182
 
183
+ To remove child works from recent works on homepage
184
+ ### homepage_controller.rb
185
+ * In the HomepageController, change the search_builder_class to remove works from recent_documents if `is_child_bsi: true`
186
+ ```rb
187
+ require "iiif_print/homepage_search_builder"
188
+
189
+ def search_builder_class
190
+ IiifPrint::HomepageSearchBuilder
191
+ end
192
+ ```
193
+
194
+ ### Skipping Certain File Suffixes for PDF Splitting
195
+
196
+ By default when a work is configured for splitting PDFs, we will split all PDFs. However, in some cases you don't want to split based on the file name's suffix. In that case, configure code as follows:
197
+
198
+ ```ruby
199
+ IiifPrint.config do |config|
200
+ config.skip_splitting_pdf_files_that_end_with_these_texts = ['.reader.pdf']
201
+ end
202
+ ```
203
+
204
+ ### Derivative Rodeo Configuration
205
+
206
+ The Derivative Rodeo is used in two ways:
207
+
208
+ - Configuring the `Hyrax::DerivativeService` by adding `IiifPrint::DerivativeRodeoService`
209
+ - Enabling the Derivative Rodeo PDF Splitting service via `IiifPrint.model_configuration`
210
+
211
+ #### Configuring Hyrax::Derivative
212
+
213
+ In the application initializer:
214
+
215
+ ```ruby
216
+ Hyrax::DerivativeService.services = [
217
+ IiifPrint::DerivativeRodeoService,
218
+ Hyrax::FileSetDerivativesService]
219
+ ```
220
+
221
+ #### Enabling Derivative Rodeo PDF Splitting
222
+
223
+ The [IiifPrint.model\_configuration method](./lib/iiif_print.rb) allows for specifying the `pdf_splitter_service` as below:
224
+
225
+ ```ruby
226
+ class Book < ActiveFedora::Base
227
+ include IiifPrint.model_configuration(
228
+ pdf_splitter_service: IiifPrint::SplitPdfs::DerivativeRodeoSplitter
229
+ )
230
+ end
231
+ ```
232
+
233
+ #### Pre-Process Location
234
+
235
+ The [DerivativeRodeo](https://github.com/scientist-softserv/derivative_rodeo) allows for specifying a location where you've done pre-processing (e.g. you ran splitting and derivative generation in AWS's Lambda).
236
+
237
+ By default the preprocess location is S3, as that is where SoftServ has been running pre-processing. However that default may not be adequate for local development.
238
+
239
+ #### Conditional Derivative Generation
240
+
241
+ The [IiifPrint::DerivativeRodeoService](./app/services/iiif_print/derivative_rodeo_service.rb) provides a means of specifying the derivatives to generate via two configuration points:
242
+
243
+ - `IiifPrint::DerivativeRodeoService.named_derivatives_and_generators_by_type`
244
+ - `IiifPrint::DerivativeRodeoService.named_derivatives_and_generators_filter`
245
+
246
+ In the case of `named_derivatives_and_generators_by_type`, we're saying all mime categories will generate these derivatives.
247
+
248
+ In the case of `named_derivatives_and_generators_filter`, we're providing a point where we can specify for each file_set and filename the specific derivatives to accept/reject/append to the named derivative generation.
249
+
250
+ See their examples for further configuration guidance.
251
+
159
252
  # Ingesting Content
160
253
 
161
254
  IiifPrint supports a range of different ingest workflows:
data/Rakefile CHANGED
@@ -35,4 +35,10 @@ end
35
35
  Dir.glob('tasks/*.rake').each { |r| import r }
36
36
  Dir.glob('lib/tasks/*.rake').each { |r| import r }
37
37
 
38
+ # Adding the copy_authorities here so it runs the same in CI
39
+ desc "Generate the engine_cart, copy authorities, and run tests"
40
+ task prepare_and_run_tests: ['engine_cart:generate', 'engine_cart:copy_authorities'] do
41
+ puts "Running CI tests"
42
+ end
43
+
38
44
  task default: :ci
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ # override Hyrax to remove splitting upon work delete
4
+ module IiifPrint
5
+ module Actors
6
+ # Responsible for removing FileSets related to the given curation concern.
7
+ module CleanupFileSetsActorDecorator
8
+ # @param [Hyrax::Actors::Environment] env
9
+ # @return [Boolean] true if destroy was successful
10
+ def destroy(env)
11
+ file_sets = env.curation_concern.file_sets
12
+ file_sets.each do |file_set|
13
+ # we destroy the children before the file_set, because we need the parent relationship
14
+ IiifPrint::SplitPdfs::DestroyPdfChildWorksService.conditionally_destroy_spawned_children_of(
15
+ file_set: file_set,
16
+ work: env.curation_concern
17
+ )
18
+ end
19
+ # and now back to your regularly scheduled programming
20
+ super
21
+ end
22
+ end
23
+ end
24
+ end
@@ -1,6 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # override to add PDF splitting for file sets
3
+ # override to add PDF splitting for file sets and remove splitting upon fileset delete
4
+
5
+ # Depending on whether we have an uploaded file or a remote url, the sequence of calling
6
+ # attach_to_work and create_content will switch.
4
7
  module IiifPrint
5
8
  module Actors
6
9
  module FileSetActorDecorator
@@ -9,48 +12,47 @@ module IiifPrint
9
12
  super
10
13
 
11
14
  if from_url
12
- # we have everything we need... queue the job
13
- parent = parent_for(file_set: @file_set)
14
-
15
- if service.iiif_print_split?(work: parent) && service.pdfs?(paths: [file_set.import_url])
16
- service.queue_job(
17
- work: parent,
18
- file_locations: [file.path],
19
- user: @user,
20
- admin_set_id: parent.admin_set_id
21
- )
22
- end
15
+ # in this case, the file that came in is a temp file, and we need to use the actual file.
16
+ # the file was attached to the file_set in Hyrax::ImportUrlJob so we can just access it.
17
+ args = { file_set: file_set, file: file_set.files.first, import_url: file_set.import_url, user: @user }
18
+ returned_value = service.conditionally_enqueue(**args)
19
+ Rails.logger.info("Result of #{returned_value} for conditional enqueueing of #{args.inspect}")
20
+ true
23
21
  else
24
22
  # we don't have the parent yet... save the paths for later use
25
- @pdf_paths = service.pdf_paths(files: [file.id.to_s])
23
+ @file = file
26
24
  end
27
25
  end
28
26
 
29
- # Prior to Hyrax v3.1.0, this method did not exist
30
- # @param file_set [FileSet]
31
- # @return [ActiveFedora::Base]
32
- def parent_for(file_set:)
33
- file_set.parent
34
- end
35
-
36
27
  # Override to add PDF splitting
37
28
  def attach_to_work(work, file_set_params = {})
38
29
  # Locks to ensure that only one process is operating on the list at a time.
39
30
  super
40
31
 
41
- return if @pdf_paths.blank?
42
- return unless service.iiif_print_split?(work: work)
43
- service.queue_job(
44
- work: work,
45
- file_locations: @pdf_paths,
46
- user: @user,
47
- admin_set_id: work.admin_set_id
48
- )
32
+ # when we are importing a remote_url, this method is called before the file is attached.
33
+ # We want to short-circuit the process and prevent unnecessarily confusing logging.
34
+ return unless @file
35
+
36
+ args = { file_set: file_set, work: work, file: @file, user: @user }
37
+ returned_value = service.conditionally_enqueue(**args)
38
+ Rails.logger.info("Result of #{returned_value} for conditional enqueueing of #{args.inspect}")
39
+ true
49
40
  end
50
41
 
51
42
  def service
52
43
  IiifPrint::SplitPdfs::ChildWorkCreationFromPdfService
53
44
  end
45
+
46
+ # Clean up children when removing the fileset
47
+ def destroy
48
+ # we destroy the children before the file_set, because we need the parent relationship
49
+ IiifPrint::SplitPdfs::DestroyPdfChildWorksService.conditionally_destroy_spawned_children_of(
50
+ file_set: file_set,
51
+ work: IiifPrint.parent_for(file_set)
52
+ )
53
+ # and now back to your regularly scheduled programming
54
+ super
55
+ end
54
56
  end
55
57
  end
56
58
  end
@@ -0,0 +1,38 @@
1
+ module IiifPrint
2
+ # Responsible for coordinating the request to resplit a PDF.
3
+ class SplitPdfsController < ApplicationController
4
+ before_action :authenticate_user!
5
+
6
+ def create
7
+ @file_set = FileSet.where(id: params[:file_set_id]).first
8
+ authorize_create_split_request!(@file_set)
9
+ IiifPrint::Jobs::RequestSplitPdfJob.perform_later(file_set: @file_set, user: current_user)
10
+ respond_to do |wants|
11
+ wants.html { redirect_to polymorphic_path([main_app, @file_set]), notice: t("iiif_print.file_set.split_submitted", id: @file_set.id) }
12
+ wants.json { render json: { id: @file_set.id, to_param: @file_set.to_param }, status: :ok }
13
+ end
14
+ end
15
+
16
+ private
17
+
18
+ ##
19
+ # @param file_set [FileSet]
20
+ def authorize_create_split_request!(file_set)
21
+ # NOTE: Duplicates logic of Hyrax: https://github.com/samvera/hyrax/blob/b334e186e77691d7da8ed59ff27f091be1c2a700/app/controllers/hyrax/file_sets_controller.rb#L234-L241
22
+ #
23
+ # Namely if we don't have a file_set we need not proceed.
24
+ raise CanCan::AccessDenied unless file_set
25
+
26
+ ##
27
+ # Rely on CanCan's authorize! method. We could add the :split_pdf action to the ability
28
+ # class. But we're piggybacking on the idea that you can do this if you can edit the work.
29
+ authorize!(:edit, file_set)
30
+ raise "Expected #{file_set.class} ID=#{file_set.id} #to_param=#{file_set.to_param} to be a PDF. Instead found mime_type of #{file_set.mime_type}." unless file_set.pdf?
31
+
32
+ work = IiifPrint.parent_for(file_set)
33
+ raise WorkNotConfiguredToSplitFileSetError.new(file_set: file_set, work: work) unless work&.iiif_print_config&.pdf_splitter_job&.presence
34
+
35
+ true
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ # OVERRIDE Hyrax v2.9.6 add #uv_search_param
4
+
5
+ module IiifPrint
6
+ module IiifHelperDecorator
7
+ def iiif_viewer_display(work_presenter, locals = {})
8
+ render iiif_viewer_display_partial(work_presenter),
9
+ locals.merge(presenter: work_presenter)
10
+ end
11
+
12
+ def iiif_viewer_display_partial(work_presenter)
13
+ 'hyrax/base/iiif_viewers/' + work_presenter.iiif_viewer.to_s
14
+ end
15
+
16
+ def universal_viewer_base_url
17
+ "#{request&.base_url}#{IiifPrint.config.uv_base_path}"
18
+ end
19
+
20
+ def universal_viewer_config_url
21
+ "#{request&.base_url}#{IiifPrint.config.uv_config_path}"
22
+ end
23
+
24
+ # Extract query param from search
25
+ def uv_search_param
26
+ search_params = current_search_session.try(:query_params) || {}
27
+ q = search_params['q'].presence || ''
28
+
29
+ "&q=#{url_encode(q)}" if q.present?
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,23 @@
1
+ module IiifPrint::IiifPrintHelperBehavior
2
+ ##
3
+ # print the ocr snippets. if more than one, separate with <br/>
4
+ #
5
+ # @param options [Hash] options hash provided by Blacklight
6
+ # @return [String] snippets HTML to be rendered
7
+ # rubocop:disable Rails/OutputSafety
8
+ def render_ocr_snippets(options = {})
9
+ snippets = options[:value]
10
+ return if snippets.blank?
11
+
12
+ snippets_content = [content_tag('div',
13
+ "... #{snippets.first} ...".html_safe,
14
+ class: 'ocr_snippet first_snippet')]
15
+ if snippets.length > 1
16
+ snippets_content << render(partial: 'catalog/snippets_more',
17
+ locals: { snippets: snippets.drop(1),
18
+ options: options })
19
+ end
20
+ snippets_content.join("\n").html_safe
21
+ end
22
+ # rubocop:enable Rails/OutputSafety
23
+ end
@@ -41,24 +41,4 @@ module IiifPrintHelper
41
41
  end
42
42
  hl_matches.uniq.sort.join(' ')
43
43
  end
44
-
45
- ##
46
- # print the ocr snippets. if more than one, separate with <br/>
47
- #
48
- # @param options [Hash] options hash provided by Blacklight
49
- # @return [String] snippets HTML to be rendered
50
- # rubocop:disable Rails/OutputSafety
51
- def render_ocr_snippets(options = {})
52
- snippets = options[:value]
53
- snippets_content = [content_tag('div',
54
- "... #{snippets.first} ...".html_safe,
55
- class: 'ocr_snippet first_snippet')]
56
- if snippets.length > 1
57
- snippets_content << render(partial: 'catalog/snippets_more',
58
- locals: { snippets: snippets.drop(1),
59
- options: options })
60
- end
61
- snippets_content.join("\n").html_safe
62
- end
63
- # rubocop:enable Rails/OutputSafety
64
44
  end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module IiifPrint
4
+ module ChildWorkIndexer
5
+ def to_solr
6
+ super.tap do |index_document|
7
+ index_solr_doc(index_document)
8
+ end
9
+ end
10
+
11
+ def generate_solr_document
12
+ super.tap do |solr_doc|
13
+ index_solr_doc(solr_doc)
14
+ end
15
+ end
16
+
17
+ private
18
+
19
+ def index_solr_doc(solr_doc)
20
+ object ||= @object || resource
21
+ solr_doc['is_child_bsi'] ||= object.try(:is_child)
22
+ solr_doc['split_from_pdf_id_ssi'] ||= object.try(:split_from_pdf_id)
23
+ solr_doc['is_page_of_ssim'] = iiif_print_lineage_service.ancestor_ids_for(object)
24
+ solr_doc['member_ids_ssim'] = iiif_print_lineage_service.descendent_member_ids_for(object)
25
+ end
26
+ end
27
+ end
@@ -2,28 +2,56 @@
2
2
 
3
3
  module IiifPrint
4
4
  module FileSetIndexer
5
- # Why `.decorate`? In my tests for Rails 5.2, I'm not able to use the prepended nor included
6
- # blocks to assign a class_attribute when I "prepend" a module to the base class. This method
7
- # allows me to handle that behavior.
8
- #
9
- # @param base [Class]
10
- # @return [Class] the given base, now decorated in all of it's glory
11
- def self.decorate(base)
12
- base.prepend(self)
13
- base.class_attribute :iiif_print_lineage_service, default: IiifPrint::LineageService
14
- base
5
+ def to_solr
6
+ super.tap do |index_document|
7
+ index_solr_doc(index_document)
8
+ end
15
9
  end
16
10
 
17
11
  def generate_solr_document
18
12
  super.tap do |solr_doc|
19
- # only UV viewable images should have is_page_of, it is only used for iiif search
20
- solr_doc['is_page_of_ssim'] = iiif_print_lineage_service.ancestor_ids_for(object) if object.mime_type&.match(/image/)
21
- # index for full text search
22
- text = IiifPrint::Data::WorkDerivatives.data(from: object, of_type: 'txt')
23
- text = text.tr("\n", ' ').squeeze(' ')
24
- solr_doc['all_text_timv'] = text
25
- solr_doc['all_text_tsimv'] = text
13
+ index_solr_doc(solr_doc)
26
14
  end
27
15
  end
16
+
17
+ private
18
+
19
+ def index_solr_doc(solr_doc)
20
+ object ||= @object || resource
21
+ # only UV viewable images should have is_page_of, it is only used for iiif search
22
+ solr_doc['is_page_of_ssim'] = IiifPrint::LineageService.ancestor_ids_for(object) if image?(object)
23
+ # index for full text search
24
+ solr_doc['all_text_tsimv'] = solr_doc['all_text_timv'] = all_text(object)
25
+ solr_doc['digest_ssim'] = find_checksum(object)
26
+ end
27
+
28
+ def image?(object)
29
+ mime_type = object.try(:mime_type) || object.original_file.try(:mime_type)
30
+ mime_type&.match(/image/)
31
+ end
32
+
33
+ def find_checksum(object)
34
+ file = object.original_file
35
+ return unless file
36
+
37
+ digest ||= if file.is_a?(Hyrax::FileMetadata)
38
+ Array.wrap(file.checksum).first
39
+ else # file is a Hydra::PCDM::File (ActiveFedora)
40
+ file.digest.first
41
+ end
42
+ return unless digest
43
+
44
+ digest.to_s
45
+ end
46
+
47
+ def all_text(object)
48
+ file = object.original_file
49
+ return unless file
50
+
51
+ text = IiifPrint.extract_text_for(file_set: object)
52
+ return text if text.blank?
53
+
54
+ text.tr("\n", ' ').squeeze(' ')
55
+ end
28
56
  end
29
57
  end
@@ -2,7 +2,8 @@ module IiifPrint
2
2
  module Jobs
3
3
  # TODO: Consider inheriting from ::ApplicationJob. That means we would have the upstream's
4
4
  # base job behavior.
5
- class ApplicationJob < ActiveJob::Base
5
+ class ApplicationJob < ::ApplicationJob
6
+ queue_as ::IiifPrint.config.ingest_queue_name
6
7
  end
7
8
  end
8
9
  end
@@ -0,0 +1,153 @@
1
+ require 'iiif_print/jobs/application_job'
2
+
3
+ module IiifPrint
4
+ module Jobs
5
+ # @deprecated
6
+ class ChildWorksFromPdfJob < IiifPrint::Jobs::ApplicationJob
7
+ ##
8
+ # Break a pdf into individual pages
9
+ #
10
+ # @param id id of the candidate for parentage (a FileSet or Hydra::PCDM::Work), looked up via IiifPrint.find_by
11
+ # @param pdf_paths [Array<String>] paths to pdfs
12
+ # @param user [User, String] a User, or a user key that is resolved with User.find_by_user_key
13
+ # @param admin_set_id [String] admin set id handed to the child work attributes function
14
+ # rubocop:disable Metrics/MethodLength
15
+ def perform(id, pdf_paths, user, admin_set_id, *)
16
+ candidate_for_parency = IiifPrint.find_by(id: id)
17
+
18
+ ##
19
+ # We know that there are cases where parent_work is nil. In those cases the code below will
19
+ # raise an exception, which is fine because we would have raised one later anyway.
21
+ @parent_work = if candidate_for_parency.work?
22
+ pdf_file_set = nil
23
+ candidate_for_parency
24
+ else
25
+ # We likely have a file set
26
+ pdf_file_set = candidate_for_parency
27
+ IiifPrint.parent_for(candidate_for_parency)
28
+ end
29
+ @child_admin_set_id = admin_set_id
30
+ child_model = @parent_work.iiif_print_config.pdf_split_child_model
31
+
32
+ # When working with remote files, we have put the PDF file into the correct path before submitting this job.
33
+ # However, there seem to be cases where we still don't have the file when we get here, so to be sure, we
34
+ # re-do the same command that was previously used to prepare the file path. If the file is already here, it
35
+ # simply returns the path, but if not it will copy the file there, giving us one more chance to have what we need.
36
+ pdf_paths = [Hyrax::WorkingDirectory.find_or_retrieve(pdf_file_set.original_file.id, pdf_file_set.id, pdf_paths.first)] if pdf_file_set
37
+ # handle each input pdf (when input is a file set, we will only have one).
38
+ pdf_paths.each do |original_pdf_path|
39
+ split_pdf(original_pdf_path, user, child_model, pdf_file_set)
40
+ end
41
+
42
+ # Link newly created child works to the parent
43
+ # @param user: [User] user
44
+ # @param parent_id: [<String>] parent work id
45
+ # @param parent_model: [<String>] parent model
46
+ # @param child_model: [<String>] child model
47
+ IiifPrint::Jobs::CreateRelationshipsJob.set(wait: 10.minutes).perform_later(
48
+ user: user,
49
+ parent_id: @parent_work.id.to_s,
50
+ parent_model: @parent_work.class.to_s,
51
+ child_model: child_model.to_s
52
+ )
53
+
54
+ # TODO: clean up image_files and pdf_paths
55
+ end
56
+ # rubocop:enable Metrics/MethodLength
57
+
58
+ private
59
+
60
+ # rubocop:disable Metrics/ParameterLists
61
+ # rubocop:disable Metrics/MethodLength
62
+ def split_pdf(original_pdf_path, user, child_model, pdf_file_set)
63
+ user = User.find_by_user_key(user) unless user.is_a?(User)
64
+ image_files = @parent_work.iiif_print_config.pdf_splitter_service.call(original_pdf_path)
65
+
66
+ # give as much info as possible if we don't have image files to work with.
67
+ if image_files.blank?
68
+ raise "#{@parent_work.class} (ID=#{@parent_work.id} " \
69
+ "to_param:#{@parent_work.to_param}) " \
70
+ "original_pdf_path #{original_pdf_path.inspect} " \
71
+ "pdf_file_set #{pdf_file_set.inspect}"
72
+ end
73
+
74
+ @split_from_pdf_id = pdf_file_set&.id.to_s
75
+ prepare_import_data(original_pdf_path, image_files, user)
76
+
77
+ # submit the job to create all the child works for one PDF
78
+ # @param [User] user
79
+ # @param [Hash<String => String>] titles
80
+ # @param [Hash<String => String>] resource_types (optional)
81
+ # @param [Array<String>] uploaded_files Hyrax::UploadedFile IDs
82
+ # @param [Hash] attributes attributes to apply to all works, including :model
83
+ # @param [Hyrax::BatchCreateOperation] operation
84
+ operation = Hyrax::BatchCreateOperation.create!(
85
+ user: user,
86
+ operation_type: "PDF Batch Create"
87
+ )
88
+ BatchCreateJob.perform_later(user,
89
+ @child_work_titles,
90
+ {},
91
+ @uploaded_files,
92
+ attributes.merge!(model: child_model.to_s, split_from_pdf_id: @split_from_pdf_id).with_indifferent_access,
93
+ operation)
94
+ end
95
+ # rubocop:enable Metrics/MethodLength
96
+ # rubocop:enable Metrics/ParameterLists
97
+
98
+ # rubocop:disable Metrics/MethodLength
99
+ def prepare_import_data(original_pdf_path, image_files, user)
100
+ @uploaded_files = []
101
+ @child_work_titles = {}
102
+ number_of_pages_in_pdf = image_files.size
103
+ image_files.each_with_index do |image_path, page_number|
104
+ file_id = create_uploaded_file(user, image_path).to_s
105
+
106
+ child_title = IiifPrint.config.unique_child_title_generator_function.call(
107
+ original_pdf_path: original_pdf_path,
108
+ image_path: image_path,
109
+ parent_work: @parent_work,
110
+ page_number: page_number,
111
+ page_padding: number_of_digits(nbr: number_of_pages_in_pdf)
112
+ )
113
+
114
+ @uploaded_files << file_id
115
+ @child_work_titles[file_id] = child_title
116
+ # save child work info to create the member relationships
117
+ PendingRelationship.create!(child_title: child_title,
118
+ parent_id: @parent_work.id,
119
+ child_order: child_title,
120
+ parent_model: @parent_work.class,
121
+ child_model: @parent_work.iiif_print_config.pdf_split_child_model,
122
+ file_id: @split_from_pdf_id)
123
+
124
+ begin
125
+ # Clean up the temporary image path.
126
+ FileUtils.rm_f(image_path) if File.exist?(image_path)
127
+ rescue
128
+ # If we can't delete, let's move on. Maybe it was already cleaned up.
129
+ end
130
+ end
131
+ end
132
+ # rubocop:enable Metrics/MethodLength
133
+
134
+ def number_of_digits(nbr:)
135
+ nbr.to_s.size
136
+ end
137
+
138
+ def create_uploaded_file(user, path)
139
+ # TODO: Could we create a remote path?
140
+ uf = Hyrax::UploadedFile.new
141
+ uf.user_id = user.try(:id) || user
142
+ uf.file = CarrierWave::SanitizedFile.new(path)
143
+ uf.save!
144
+ uf.id
145
+ end
146
+
147
+ # TODO: what attributes do we need to fill in from the parent work? What about AllinsonFlex?
148
+ def attributes
149
+ IiifPrint.config.child_work_attributes_function.call(parent_work: @parent_work, admin_set_id: @child_admin_set_id)
150
+ end
151
+ end
152
+ end
153
+ end