iiif_print 1.0.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
- data/.github/workflows/build-lint-test-action.yaml +4 -5
- data/.gitignore +5 -4
- data/.rubocop.yml +1 -0
- data/.solargraph.yml +19 -0
- data/Gemfile.lock +1025 -0
- data/README.md +102 -9
- data/Rakefile +6 -0
- data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
- data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
- data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
- data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
- data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
- data/app/helpers/iiif_print_helper.rb +0 -20
- data/app/indexers/concerns/iiif_print/child_work_indexer.rb +27 -0
- data/app/indexers/concerns/iiif_print/file_set_indexer.rb +45 -17
- data/{lib → app/jobs}/iiif_print/jobs/application_job.rb +2 -1
- data/app/jobs/iiif_print/jobs/child_works_from_pdf_job.rb +153 -0
- data/app/jobs/iiif_print/jobs/create_relationships_job.rb +117 -0
- data/app/jobs/iiif_print/jobs/request_split_pdf_job.rb +31 -0
- data/app/listeners/iiif_print/listener.rb +31 -0
- data/app/models/concerns/iiif_print/set_child_flag.rb +10 -1
- data/app/models/concerns/iiif_print/solr/document.rb +19 -3
- data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
- data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
- data/app/models/iiif_print/pending_relationship.rb +3 -0
- data/app/presenters/iiif_print/file_set_presenter_decorator.rb +11 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
- data/app/presenters/iiif_print/work_show_presenter_decorator.rb +23 -11
- data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
- data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
- data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
- data/app/services/iiif_print/manifest_builder_service_behavior.rb +90 -31
- data/app/services/iiif_print/pluggable_derivative_service.rb +8 -10
- data/app/services/iiif_print/simple_schema_loader_decorator.rb +11 -0
- data/app/transactions/hyrax/transactions/iiif_print_container_decorator.rb +34 -0
- data/app/transactions/hyrax/transactions/steps/conditionally_destroy_children_from_split.rb +32 -0
- data/app/transactions/hyrax/transactions/steps/delete_all_file_sets_decorator.rb +35 -0
- data/app/views/catalog/_index_header_list_default.html.erb +13 -0
- data/app/views/hyrax/base/_representative_media.html.erb +4 -3
- data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
- data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
- data/config/initializers/simple_schema_loader.rb +1 -0
- data/config/locales/iiif_print.en.yml +4 -0
- data/config/metadata/child_works_from_pdf_splitting.yaml +21 -0
- data/config/routes.rb +3 -0
- data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +8 -6
- data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +7 -5
- data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +8 -6
- data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
- data/docker-compose.yml +2 -2
- data/iiif_print.gemspec +11 -10
- data/lib/generators/iiif_print/install_generator.rb +21 -1
- data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
- data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
- data/lib/iiif_print/base_derivative_service.rb +14 -2
- data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +58 -6
- data/lib/iiif_print/catalog_search_builder.rb +7 -3
- data/lib/iiif_print/configuration.rb +205 -8
- data/lib/iiif_print/data/fileset_helper.rb +3 -3
- data/lib/iiif_print/data/work_derivatives.rb +4 -4
- data/lib/iiif_print/engine.rb +53 -15
- data/lib/iiif_print/errors.rb +18 -0
- data/lib/iiif_print/homepage_search_builder.rb +17 -0
- data/lib/iiif_print/image_tool.rb +12 -8
- data/lib/iiif_print/jp2_derivative_service.rb +4 -1
- data/lib/iiif_print/lineage_service.rb +47 -13
- data/lib/iiif_print/metadata.rb +67 -48
- data/lib/iiif_print/pdf_derivative_service.rb +3 -1
- data/lib/iiif_print/persistence_layer/active_fedora_adapter.rb +189 -0
- data/lib/iiif_print/persistence_layer/valkyrie_adapter.rb +183 -0
- data/lib/iiif_print/persistence_layer.rb +118 -0
- data/lib/iiif_print/split_pdfs/base_splitter.rb +153 -0
- data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +83 -37
- data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
- data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +22 -0
- data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
- data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
- data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
- data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
- data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
- data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
- data/lib/iiif_print/text_extraction_derivative_service.rb +4 -2
- data/lib/iiif_print/text_formats_from_alto_service.rb +3 -1
- data/lib/iiif_print/tiff_derivative_service.rb +3 -1
- data/lib/iiif_print/version.rb +1 -1
- data/lib/iiif_print.rb +210 -20
- data/lib/samvera/derivatives/configuration.rb +83 -0
- data/lib/samvera/derivatives/hyrax.rb +129 -0
- data/lib/samvera/derivatives.rb +238 -0
- data/tasks/copy_authorities_to_test_app.rake +11 -0
- data/tasks/iiif_print_dev.rake +4 -4
- metadata +111 -196
- data/app/helpers/hyrax/iiif_helper.rb +0 -22
- data/app/indexers/concerns/iiif_print/child_indexer.rb +0 -34
- data/app/views/hyrax/file_sets/_actions.html.erb +0 -45
- data/bin/rails +0 -13
- data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +0 -107
- data/lib/iiif_print/jobs/create_relationships_job.rb +0 -78
- data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
- data/spec/.keep.txt +0 -1
- data/spec/factories/ability.rb +0 -6
- data/spec/factories/newspaper_issue.rb +0 -7
- data/spec/factories/newspaper_page.rb +0 -7
- data/spec/factories/newspaper_page_solr_document.rb +0 -12
- data/spec/factories/newspaper_title.rb +0 -8
- data/spec/factories/uploaded_pdf_file.rb +0 -9
- data/spec/factories/uploaded_txt_file.rb +0 -9
- data/spec/factories/user.rb +0 -13
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +0 -7
- data/spec/fixtures/files/alto-2-0.xsd +0 -714
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +0 -16
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +0 -31
- data/spec/fixtures/files/ndnp-alto-sample.xml +0 -24
- data/spec/fixtures/files/ndnp-sample1-json.json +0 -1
- data/spec/fixtures/files/ndnp-sample1-txt.txt +0 -1
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +0 -202
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +0 -202
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/ocr_mono_text_hocr.html +0 -78
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/helpers/hyrax/iiif_helper_spec.rb +0 -65
- data/spec/helpers/iiif_print_helper_spec.rb +0 -43
- data/spec/iiif_print/base_derivative_service_spec.rb +0 -11
- data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +0 -51
- data/spec/iiif_print/catalog_search_builder_spec.rb +0 -60
- data/spec/iiif_print/configuration_spec.rb +0 -67
- data/spec/iiif_print/data/work_derivatives_spec.rb +0 -245
- data/spec/iiif_print/data/work_file_spec.rb +0 -99
- data/spec/iiif_print/data/work_files_spec.rb +0 -237
- data/spec/iiif_print/image_tool_spec.rb +0 -109
- data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +0 -30
- data/spec/iiif_print/jobs/create_relationships_job_spec.rb +0 -17
- data/spec/iiif_print/jp2_image_metadata_spec.rb +0 -37
- data/spec/iiif_print/lineage_service_spec.rb +0 -13
- data/spec/iiif_print/metadata_spec.rb +0 -115
- data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
- data/spec/iiif_print/text_extraction/alto_reader_spec.rb +0 -49
- data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +0 -45
- data/spec/iiif_print/text_extraction/page_ocr_spec.rb +0 -84
- data/spec/iiif_print/text_extraction/render_alto_spec.rb +0 -54
- data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +0 -44
- data/spec/iiif_print_spec.rb +0 -51
- data/spec/misc_shared.rb +0 -111
- data/spec/models/iiif_print/derivative_attachment_spec.rb +0 -37
- data/spec/models/iiif_print/ingest_file_relation_spec.rb +0 -56
- data/spec/models/solr_document_spec.rb +0 -14
- data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +0 -19
- data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +0 -49
- data/spec/services/iiif_print/jp2_derivative_service_spec.rb +0 -59
- data/spec/services/iiif_print/pdf_derivative_service_spec.rb +0 -66
- data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +0 -178
- data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +0 -82
- data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +0 -127
- data/spec/services/iiif_print/tiff_derivative_service_spec.rb +0 -65
- data/spec/spec_helper.rb +0 -181
- data/spec/support/controller_level_helpers.rb +0 -28
- data/spec/support/iiif_print_models.rb +0 -127
- data/spec/test_app_templates/blacklight.yml +0 -9
- data/spec/test_app_templates/fedora.yml +0 -15
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +0 -40
- data/spec/test_app_templates/redis.yml +0 -9
- data/spec/test_app_templates/solr/conf/schema.xml +0 -362
- data/spec/test_app_templates/solr/conf/solrconfig.xml +0 -322
- data/spec/test_app_templates/solr.yml +0 -7
data/README.md
CHANGED
@@ -35,9 +35,9 @@ IiifPrint supports:
|
|
35
35
|
* OCR keyword match highlighting
|
36
36
|
* viewer with page navigation and deep zooming
|
37
37
|
* splitting of PDFs to LZW compressed TIFFs for viewing
|
38
|
-
* configuring how the manifest canvases are sorted in the viewer
|
39
38
|
* adding metadata fields to the manifest with faceted search links and external links
|
40
39
|
* excluding specified work types to be found in the catalog search
|
40
|
+
* external IIIF image URLs that work with services such as serverless-iiif or Cantaloupe
|
41
41
|
|
42
42
|
A complete list of features can be found [here](https://github.com/scientist-softserv/iiif_print/wiki/Features-List).
|
43
43
|
|
@@ -86,15 +86,43 @@ IiifPrint easily integrates with your Hyrax 2.x applications.
|
|
86
86
|
* In `config/routes.rb`, it adds `concerns :iiif_search` in the `resources :solr_documents` block
|
87
87
|
* Adds `config/initializers/iiif_print.rb`
|
88
88
|
* Adds three migrations, `CreateIiifPrintDerivativeAttachments`, `CreateIiifPrintIngestFileRelations`, and `CreateIiifPrintPendingRelationships`
|
89
|
-
* In `solr/conf/schema.xml`, it adds Blacklight IIIF Search autocomplete config
|
90
|
-
* In `solr/conf/solrconfig.xml`, it adds Blacklight IIIF Search autocomplete config
|
91
|
-
* Adds `solr/lib/solr-tokenizing_suggester-7.x.jar`
|
92
89
|
|
93
90
|
(It may be helpful to run `git diff` after installation to see all the changes made by the installer.)
|
94
91
|
|
92
|
+
## Catalog to Universal Viewer search:
|
93
|
+
To enable a feature where the UV automatically picks up the search from the catalog, do the following:
|
94
|
+
* Add `highlight: urlDataProvider.get('q'),` into your uv.html in the `<script>` section.
|
95
|
+
```js
|
96
|
+
uv = createUV('#uv', {
|
97
|
+
root: '.',
|
98
|
+
iiifResourceUri: urlDataProvider.get('manifest'),
|
99
|
+
configUri: 'uv-config.json',
|
100
|
+
collectionIndex: Number(urlDataProvider.get('c', 0)),
|
101
|
+
manifestIndex: Number(urlDataProvider.get('m', 0)),
|
102
|
+
sequenceIndex: Number(urlDataProvider.get('s', 0)),
|
103
|
+
canvasIndex: Number(urlDataProvider.get('cv', 0)),
|
104
|
+
rangeId: urlDataProvider.get('rid', 0),
|
105
|
+
rotation: Number(urlDataProvider.get('r', 0)),
|
106
|
+
xywh: urlDataProvider.get('xywh', ''),
|
107
|
+
embedded: true,
|
108
|
+
highlight: urlDataProvider.get('q'), // <-- here's a good spot
|
109
|
+
locales: formattedLocales
|
110
|
+
}, urlDataProvider);
|
111
|
+
```
|
112
|
+
|
113
|
+
* Make sure to remove your application's `app/helpers/hyrax/iiif_helper.rb` and `app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb` (if they exist)
|
114
|
+
|
95
115
|
## Configuration to enable IiifPrint features
|
96
116
|
**NOTE: WorkTypes and models are used synonymously here.**
|
97
117
|
|
118
|
+
### Persistence Layer Adapter
|
119
|
+
|
120
|
+
We created IiifPrint with an assumption of ActiveFedora. However, as Hyrax now supports Valkyrie, we need an alternate approach. We introduced `IiifPrint::Configuration#persistence_layer` as a configuration option. By default it will use `ActiveFedora` methods, but you can switch adapters to use Valkyrie instead. (See `IiifPrint::PersistenceLayer` for more details.)
|
121
|
+
|
122
|
+
### IIIF URL configuration
|
123
|
+
|
124
|
+
If you set EXTERNAL_IIIF_URL in your environment, then IiifPrint will use that URL as the root for your IIIF URLs. It will also switch from using the file set ID to using the SHA1 of the file as the identifier. This enables using serverless-iiif or Cantaloupe (referred to as "the service") by pointing the service to the same S3 bucket that FCREPO writes the uploaded files to. By setting it up that way, you do not need the service to connect to FCREPO or Hyrax at all, because both serverless-iiif and Cantaloupe natively support connecting to an S3 bucket to get their data.
|
125
|
+
|
98
126
|
### Model level configurations
|
99
127
|
|
100
128
|
In `app/models/{work_type}.rb` add `include IiifPrint.model_configuration` to any work types which require IiifPrint processing features (such as PDF splitting or OCR derivatives). See [lib/iiif_print.rb](./lib/iiif_print.rb) for details on configuration options.
|
@@ -126,10 +154,6 @@ IiifPrint.config do |config|
|
|
126
154
|
# Add configurable solr field key for searching, default key is: 'human_readable_type_sim' if
|
127
155
|
# another key is used, make sure to adjust the config.excluded_model_name_solr_field_values to match
|
128
156
|
config.excluded_model_name_solr_field_key = 'some_solr_field_key'
|
129
|
-
|
130
|
-
# Configure how the manifest sorts the canvases, by default it sorts by `:title`, but a different
|
131
|
-
# model property may be desired such as :date_published
|
132
|
-
config.sort_iiif_manifest_canvases_by = :date_published
|
133
157
|
end
|
134
158
|
```
|
135
159
|
|
@@ -146,7 +170,7 @@ TO ENABLE OCR Search (from the UV and catalog search)
|
|
146
170
|
}
|
147
171
|
end
|
148
172
|
```
|
149
|
-
* Set `config.search_builder_class = IiifPrint::CatalogSearchBuilder` to remove works from the catalog search results if `is_child_bsi: true`
|
173
|
+
* Set `config.search_builder_class = IiifPrint::CatalogSearchBuilder` to remove works from the catalog search results if `is_child_bsi: true`
|
150
174
|
* Ensure that all text search is configured in default_solr_params config block:
|
151
175
|
```rb
|
152
176
|
config.default_solr_params = {
|
@@ -156,6 +180,75 @@ TO ENABLE OCR Search (from the UV and catalog search)
|
|
156
180
|
}
|
157
181
|
```
|
158
182
|
|
183
|
+
To remove child works from recent works on homepage
|
184
|
+
### homepage_controller.rb
|
185
|
+
* In the HomepageController, change the search_builder_class to remove works from recent_documents if `is_child_bsi: true`
|
186
|
+
```rb
|
187
|
+
require "iiif_print/homepage_search_builder"
|
188
|
+
|
189
|
+
def search_builder_class
|
190
|
+
IiifPrint::HomepageSearchBuilder
|
191
|
+
end
|
192
|
+
```
|
193
|
+
|
194
|
+
### Skipping Certain File Suffixes for PDF Splitting
|
195
|
+
|
196
|
+
By default, when a work is configured for splitting PDFs, we will split all PDFs. However, in some cases you may want to skip splitting PDFs whose file names end with a particular suffix. In that case, configure the suffixes to skip as follows:
|
197
|
+
|
198
|
+
```ruby
|
199
|
+
IiifPrint.config do |config|
|
200
|
+
config.skip_splitting_pdf_files_that_end_with_these_texts = ['.reader.pdf']
|
201
|
+
end
|
202
|
+
```
|
203
|
+
|
204
|
+
### Derivative Rodeo Configuration
|
205
|
+
|
206
|
+
The Derivative Rodeo is used in two ways:
|
207
|
+
|
208
|
+
- Configuring the `Hyrax::DerivativeService` by adding `IiifPrint::DerivativeRodeoService`
|
209
|
+
- Enable Derivative Rodeo PDF Splitting service by `IiifPrint.model_configuration`
|
210
|
+
|
211
|
+
#### Configuring Hyrax::Derivative
|
212
|
+
|
213
|
+
In the application initializer:
|
214
|
+
|
215
|
+
```ruby
|
216
|
+
Hyrax::DerivativeService.services = [
|
217
|
+
IiifPrint::DerivativeRodeoService,
|
218
|
+
Hyrax::FileSetDerivativesService]
|
219
|
+
```
|
220
|
+
|
221
|
+
#### Enabling Derivative Rodeo PDF Splitting
|
222
|
+
|
223
|
+
The [IiifPrint.model\_configuration method](./lib/iiif_print.rb) allows for specifying the `pdf_splitter_service` as below:
|
224
|
+
|
225
|
+
```ruby
|
226
|
+
class Book < ActiveFedora::Base
|
227
|
+
include IiifPrint.model_configuration(
|
228
|
+
pdf_splitter_service: IiifPrint::SplitPdfs::DerivativeRodeoSplitter
|
229
|
+
)
|
230
|
+
end
|
231
|
+
```
|
232
|
+
|
233
|
+
#### Pre-Process Location
|
234
|
+
|
235
|
+
The [DerivativeRodeo](https://github.com/scientist-softserv/derivative_rodeo) allows for specifying a location where you've done pre-processing (e.g. you ran splitting and derivative generation in AWS's Lambda).
|
236
|
+
|
237
|
+
By default the preprocess location is S3, as that is where SoftServ has been running pre-processing. However that default may not be adequate for local development.
|
238
|
+
|
239
|
+
#### Conditional Derivative Generation
|
240
|
+
|
241
|
+
The [IiifPrint::DerivativeRodeoService](./app/services/iiif_print/derivative_rodeo_service.rb) provides a means of specifying the derivatives to generate via two configuration points:
|
242
|
+
|
243
|
+
- `IiifPrint::DerivativeRodeoService.named_derivatives_and_generators_by_type`
|
244
|
+
- `IiifPrint::DerivativeRodeoService.named_derivatives_and_generators_filter`
|
245
|
+
|
246
|
+
In the case of `named_derivatives_and_generators_by_type`, we're saying all mime categories will generate these derivatives.
|
247
|
+
|
248
|
+
In the case of `named_derivatives_and_generators_filter`, we're providing a point where we can specify for each file_set and filename the specific derivatives to accept/reject/append to the named derivative generation.
|
249
|
+
|
250
|
+
See their examples for further configuration guidance.
|
251
|
+
|
159
252
|
# Ingesting Content
|
160
253
|
|
161
254
|
IiifPrint supports a range of different ingest workflows:
|
data/Rakefile
CHANGED
@@ -35,4 +35,10 @@ end
|
|
35
35
|
Dir.glob('tasks/*.rake').each { |r| import r }
|
36
36
|
Dir.glob('lib/tasks/*.rake').each { |r| import r }
|
37
37
|
|
38
|
+
# Adding the copy_authorities here so it runs the same in CI
|
39
|
+
desc "Generate the engine_cart, copy authorities, and run tests"
|
40
|
+
task prepare_and_run_tests: ['engine_cart:generate', 'engine_cart:copy_authorities'] do
|
41
|
+
puts "Running CI tests"
|
42
|
+
end
|
43
|
+
|
38
44
|
task default: :ci
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# override Hyrax to remove splitting upon work delete
|
4
|
+
module IiifPrint
|
5
|
+
module Actors
|
6
|
+
# Responsible for removing FileSets related to the given curation concern.
|
7
|
+
module CleanupFileSetsActorDecorator
|
8
|
+
# @param [Hyrax::Actors::Environment] env
|
9
|
+
# @return [Boolean] true if destroy was successful
|
10
|
+
def destroy(env)
|
11
|
+
file_sets = env.curation_concern.file_sets
|
12
|
+
file_sets.each do |file_set|
|
13
|
+
# we destroy the children before the file_set, because we need the parent relationship
|
14
|
+
IiifPrint::SplitPdfs::DestroyPdfChildWorksService.conditionally_destroy_spawned_children_of(
|
15
|
+
file_set: file_set,
|
16
|
+
work: env.curation_concern
|
17
|
+
)
|
18
|
+
end
|
19
|
+
# and now back to your regularly scheduled programming
|
20
|
+
super
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -1,6 +1,9 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
# override to add PDF splitting for file sets
|
3
|
+
# override to add PDF splitting for file sets and remove splitting upon fileset delete
|
4
|
+
|
5
|
+
# Depending on whether we have an uploaded file or a remote url, the sequence of calling
|
6
|
+
# attach_to_work and create_content will switch.
|
4
7
|
module IiifPrint
|
5
8
|
module Actors
|
6
9
|
module FileSetActorDecorator
|
@@ -9,48 +12,47 @@ module IiifPrint
|
|
9
12
|
super
|
10
13
|
|
11
14
|
if from_url
|
12
|
-
#
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
file_locations: [file.path],
|
19
|
-
user: @user,
|
20
|
-
admin_set_id: parent.admin_set_id
|
21
|
-
)
|
22
|
-
end
|
15
|
+
# in this case, the file that came in is a temp file, and we need to use the actual file.
|
16
|
+
# the file was attached to the file_set in Hyrax::ImportUrlJob so we can just access it.
|
17
|
+
args = { file_set: file_set, file: file_set.files.first, import_url: file_set.import_url, user: @user }
|
18
|
+
returned_value = service.conditionally_enqueue(**args)
|
19
|
+
Rails.logger.info("Result of #{returned_value} for conditional enqueueing of #{args.inspect}")
|
20
|
+
true
|
23
21
|
else
|
24
22
|
# we don't have the parent yet... save the paths for later use
|
25
|
-
@
|
23
|
+
@file = file
|
26
24
|
end
|
27
25
|
end
|
28
26
|
|
29
|
-
# Prior to Hyrax v3.1.0, this method did not exist
|
30
|
-
# @param file_set [FileSet]
|
31
|
-
# @return [ActiveFedora::Base]
|
32
|
-
def parent_for(file_set:)
|
33
|
-
file_set.parent
|
34
|
-
end
|
35
|
-
|
36
27
|
# Override to add PDF splitting
|
37
28
|
def attach_to_work(work, file_set_params = {})
|
38
29
|
# Locks to ensure that only one process is operating on the list at a time.
|
39
30
|
super
|
40
31
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
32
|
+
# when we are importing a remote_url, this method is called before the file is attached.
|
33
|
+
# We want to short-circuit the process and prevent unnecessarily confusing logging.
|
34
|
+
return unless @file
|
35
|
+
|
36
|
+
args = { file_set: file_set, work: work, file: @file, user: @user }
|
37
|
+
returned_value = service.conditionally_enqueue(**args)
|
38
|
+
Rails.logger.info("Result of #{returned_value} for conditional enqueueing of #{args.inspect}")
|
39
|
+
true
|
49
40
|
end
|
50
41
|
|
51
42
|
def service
|
52
43
|
IiifPrint::SplitPdfs::ChildWorkCreationFromPdfService
|
53
44
|
end
|
45
|
+
|
46
|
+
# Clean up children when removing the fileset
|
47
|
+
def destroy
|
48
|
+
# we destroy the children before the file_set, because we need the parent relationship
|
49
|
+
IiifPrint::SplitPdfs::DestroyPdfChildWorksService.conditionally_destroy_spawned_children_of(
|
50
|
+
file_set: file_set,
|
51
|
+
work: IiifPrint.parent_for(file_set)
|
52
|
+
)
|
53
|
+
# and now back to your regularly scheduled programming
|
54
|
+
super
|
55
|
+
end
|
54
56
|
end
|
55
57
|
end
|
56
58
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module IiifPrint
|
2
|
+
# Responsible for coordinating the request to resplit a PDF.
|
3
|
+
class SplitPdfsController < ApplicationController
|
4
|
+
before_action :authenticate_user!
|
5
|
+
|
6
|
+
def create
|
7
|
+
@file_set = FileSet.where(id: params[:file_set_id]).first
|
8
|
+
authorize_create_split_request!(@file_set)
|
9
|
+
IiifPrint::Jobs::RequestSplitPdfJob.perform_later(file_set: @file_set, user: current_user)
|
10
|
+
respond_to do |wants|
|
11
|
+
wants.html { redirect_to polymorphic_path([main_app, @file_set]), notice: t("iiif_print.file_set.split_submitted", id: @file_set.id) }
|
12
|
+
wants.json { render json: { id: @file_set.id, to_param: @file_set.to_param }, status: :ok }
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
private
|
17
|
+
|
18
|
+
##
|
19
|
+
# @param file_set [FileSet]
|
20
|
+
def authorize_create_split_request!(file_set)
|
21
|
+
# NOTE: Duplicates logic of Hyrax: https://github.com/samvera/hyrax/blob/b334e186e77691d7da8ed59ff27f091be1c2a700/app/controllers/hyrax/file_sets_controller.rb#L234-L241
|
22
|
+
#
|
23
|
+
# Namely if we don't have a file_set we need not proceed.
|
24
|
+
raise CanCan::AccessDenied unless file_set
|
25
|
+
|
26
|
+
##
|
27
|
+
# Rely on CanCan's authorize! method. We could add the :split_pdf action to the ability
|
28
|
+
# class. But we're piggybacking on the idea that you can do this if you can edit the work.
|
29
|
+
authorize!(:edit, file_set)
|
30
|
+
raise "Expected #{file_set.class} ID=#{file_set.id} #to_param=#{file_set.to_param} to be a PDF. Instead found mime_type of #{file_set.mime_type}." unless file_set.pdf?
|
31
|
+
|
32
|
+
work = IiifPrint.parent_for(file_set)
|
33
|
+
raise WorkNotConfiguredToSplitFileSetError.new(file_set: file_set, work: work) unless work&.iiif_print_config&.pdf_splitter_job&.presence
|
34
|
+
|
35
|
+
true
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# OVERRIDE Hyrax v2.9.6 add #uv_search_param
|
4
|
+
|
5
|
+
module IiifPrint
|
6
|
+
module IiifHelperDecorator
|
7
|
+
def iiif_viewer_display(work_presenter, locals = {})
|
8
|
+
render iiif_viewer_display_partial(work_presenter),
|
9
|
+
locals.merge(presenter: work_presenter)
|
10
|
+
end
|
11
|
+
|
12
|
+
def iiif_viewer_display_partial(work_presenter)
|
13
|
+
'hyrax/base/iiif_viewers/' + work_presenter.iiif_viewer.to_s
|
14
|
+
end
|
15
|
+
|
16
|
+
def universal_viewer_base_url
|
17
|
+
"#{request&.base_url}#{IiifPrint.config.uv_base_path}"
|
18
|
+
end
|
19
|
+
|
20
|
+
def universal_viewer_config_url
|
21
|
+
"#{request&.base_url}#{IiifPrint.config.uv_config_path}"
|
22
|
+
end
|
23
|
+
|
24
|
+
# Extract query param from search
|
25
|
+
def uv_search_param
|
26
|
+
search_params = current_search_session.try(:query_params) || {}
|
27
|
+
q = search_params['q'].presence || ''
|
28
|
+
|
29
|
+
"&q=#{url_encode(q)}" if q.present?
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module IiifPrint::IiifPrintHelperBehavior
|
2
|
+
##
|
3
|
+
# print the ocr snippets. if more than one, separate with <br/>
|
4
|
+
#
|
5
|
+
# @param options [Hash] options hash provided by Blacklight
|
6
|
+
# @return [String] snippets HTML to be rendered
|
7
|
+
# rubocop:disable Rails/OutputSafety
|
8
|
+
def render_ocr_snippets(options = {})
|
9
|
+
snippets = options[:value]
|
10
|
+
return if snippets.blank?
|
11
|
+
|
12
|
+
snippets_content = [content_tag('div',
|
13
|
+
"... #{snippets.first} ...".html_safe,
|
14
|
+
class: 'ocr_snippet first_snippet')]
|
15
|
+
if snippets.length > 1
|
16
|
+
snippets_content << render(partial: 'catalog/snippets_more',
|
17
|
+
locals: { snippets: snippets.drop(1),
|
18
|
+
options: options })
|
19
|
+
end
|
20
|
+
snippets_content.join("\n").html_safe
|
21
|
+
end
|
22
|
+
# rubocop:enable Rails/OutputSafety
|
23
|
+
end
|
@@ -41,24 +41,4 @@ module IiifPrintHelper
|
|
41
41
|
end
|
42
42
|
hl_matches.uniq.sort.join(' ')
|
43
43
|
end
|
44
|
-
|
45
|
-
##
|
46
|
-
# print the ocr snippets. if more than one, separate with <br/>
|
47
|
-
#
|
48
|
-
# @param options [Hash] options hash provided by Blacklight
|
49
|
-
# @return [String] snippets HTML to be rendered
|
50
|
-
# rubocop:disable Rails/OutputSafety
|
51
|
-
def render_ocr_snippets(options = {})
|
52
|
-
snippets = options[:value]
|
53
|
-
snippets_content = [content_tag('div',
|
54
|
-
"... #{snippets.first} ...".html_safe,
|
55
|
-
class: 'ocr_snippet first_snippet')]
|
56
|
-
if snippets.length > 1
|
57
|
-
snippets_content << render(partial: 'catalog/snippets_more',
|
58
|
-
locals: { snippets: snippets.drop(1),
|
59
|
-
options: options })
|
60
|
-
end
|
61
|
-
snippets_content.join("\n").html_safe
|
62
|
-
end
|
63
|
-
# rubocop:enable Rails/OutputSafety
|
64
44
|
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module IiifPrint
|
4
|
+
module ChildWorkIndexer
|
5
|
+
def to_solr
|
6
|
+
super.tap do |index_document|
|
7
|
+
index_solr_doc(index_document)
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
def generate_solr_document
|
12
|
+
super.tap do |solr_doc|
|
13
|
+
index_solr_doc(solr_doc)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
def index_solr_doc(solr_doc)
|
20
|
+
object ||= @object || resource
|
21
|
+
solr_doc['is_child_bsi'] ||= object.try(:is_child)
|
22
|
+
solr_doc['split_from_pdf_id_ssi'] ||= object.try(:split_from_pdf_id)
|
23
|
+
solr_doc['is_page_of_ssim'] = iiif_print_lineage_service.ancestor_ids_for(object)
|
24
|
+
solr_doc['member_ids_ssim'] = iiif_print_lineage_service.descendent_member_ids_for(object)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -2,28 +2,56 @@
|
|
2
2
|
|
3
3
|
module IiifPrint
|
4
4
|
module FileSetIndexer
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
# @param base [Class]
|
10
|
-
# @return [Class] the given base, now decorated in all of it's glory
|
11
|
-
def self.decorate(base)
|
12
|
-
base.prepend(self)
|
13
|
-
base.class_attribute :iiif_print_lineage_service, default: IiifPrint::LineageService
|
14
|
-
base
|
5
|
+
def to_solr
|
6
|
+
super.tap do |index_document|
|
7
|
+
index_solr_doc(index_document)
|
8
|
+
end
|
15
9
|
end
|
16
10
|
|
17
11
|
def generate_solr_document
|
18
12
|
super.tap do |solr_doc|
|
19
|
-
|
20
|
-
solr_doc['is_page_of_ssim'] = iiif_print_lineage_service.ancestor_ids_for(object) if object.mime_type&.match(/image/)
|
21
|
-
# index for full text search
|
22
|
-
text = IiifPrint::Data::WorkDerivatives.data(from: object, of_type: 'txt')
|
23
|
-
text = text.tr("\n", ' ').squeeze(' ')
|
24
|
-
solr_doc['all_text_timv'] = text
|
25
|
-
solr_doc['all_text_tsimv'] = text
|
13
|
+
index_solr_doc(solr_doc)
|
26
14
|
end
|
27
15
|
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
def index_solr_doc(solr_doc)
|
20
|
+
object ||= @object || resource
|
21
|
+
# only UV viewable images should have is_page_of, it is only used for iiif search
|
22
|
+
solr_doc['is_page_of_ssim'] = IiifPrint::LineageService.ancestor_ids_for(object) if image?(object)
|
23
|
+
# index for full text search
|
24
|
+
solr_doc['all_text_tsimv'] = solr_doc['all_text_timv'] = all_text(object)
|
25
|
+
solr_doc['digest_ssim'] = find_checksum(object)
|
26
|
+
end
|
27
|
+
|
28
|
+
def image?(object)
|
29
|
+
mime_type = object.try(:mime_type) || object.original_file.try(:mime_type)
|
30
|
+
mime_type&.match(/image/)
|
31
|
+
end
|
32
|
+
|
33
|
+
def find_checksum(object)
|
34
|
+
file = object.original_file
|
35
|
+
return unless file
|
36
|
+
|
37
|
+
digest ||= if file.is_a?(Hyrax::FileMetadata)
|
38
|
+
Array.wrap(file.checksum).first
|
39
|
+
else # file is a Hydra::PCDM::File (ActiveFedora)
|
40
|
+
file.digest.first
|
41
|
+
end
|
42
|
+
return unless digest
|
43
|
+
|
44
|
+
digest.to_s
|
45
|
+
end
|
46
|
+
|
47
|
+
# Produces the single-line full text for the given record.
#
# Nothing is extracted when the record has no original file. Blank extraction
# results are handed back untouched; anything else has its newlines turned
# into spaces and runs of spaces squeezed down to one.
#
# @param object [#original_file] the record being indexed
# @return [String, nil] the flattened text, or the blank/nil extraction result
def all_text(object)
  return unless object.original_file

  extracted = IiifPrint.extract_text_for(file_set: object)
  extracted.blank? ? extracted : extracted.tr("\n", ' ').squeeze(' ')
end
|
28
56
|
end
|
29
57
|
end
|
@@ -2,7 +2,8 @@ module IiifPrint
|
|
2
2
|
module Jobs
|
3
3
|
# TODO: Consider inheriting from ::ApplicationJob. That means we would have the upstream's
|
4
4
|
# base job behavior.
|
5
|
-
class ApplicationJob <
|
5
|
+
class ApplicationJob < ::ApplicationJob
|
6
|
+
queue_as ::IiifPrint.config.ingest_queue_name
|
6
7
|
end
|
7
8
|
end
|
8
9
|
end
|
@@ -0,0 +1,153 @@
|
|
1
|
+
require 'iiif_print/jobs/application_job'
|
2
|
+
|
3
|
+
module IiifPrint
  module Jobs
    ##
    # Splits PDFs into one image per page, records a PendingRelationship for every page and
    # enqueues the jobs that create the child works and attach them to the parent work.
    #
    # @deprecated
    class ChildWorksFromPdfJob < IiifPrint::Jobs::ApplicationJob
      ##
      # Break a pdf into individual pages
      #
      # @param id [String] (presumably) id of the candidate for parency; it is resolved with
      #   IiifPrint.find_by and the result is treated as a work when it answers true to #work?,
      #   otherwise as the file set holding the PDF
      # @param pdf_paths [Array<String>] paths to pdfs
      # @param user [User, Object] a User, or a user key that User.find_by_user_key can resolve
      # @param admin_set_id [String] handed to the configured child work attributes function
      # rubocop:disable Metrics/MethodLength
      def perform(id, pdf_paths, user, admin_set_id, *)
        candidate_for_parency = IiifPrint.find_by(id: id)

        # Anything that is not a work is most likely a file set; keep hold of it so the PDF can be
        # re-fetched below and recorded as the source of the split pages.
        pdf_file_set = candidate_for_parency.work? ? nil : candidate_for_parency

        ##
        # We know that we have cases where parent_work is nil, this will definitely raise an
        # exception; which is fine because we were going to do it later anyway.
        @parent_work = pdf_file_set ? IiifPrint.parent_for(pdf_file_set) : candidate_for_parency
        @child_admin_set_id = admin_set_id
        child_model = @parent_work.iiif_print_config.pdf_split_child_model

        # When working with remote files, we have put the PDF file into the correct path before
        # submitting this job. However, there seem to be cases where we still don't have the file
        # when we get here, so to be sure, we re-do the same command that was previously used to
        # prepare the file path. If the file is already here, it simply returns the path, but if
        # not it will copy the file there, giving us one more chance to have what we need.
        if pdf_file_set
          pdf_paths = [
            Hyrax::WorkingDirectory.find_or_retrieve(pdf_file_set.original_file.id, pdf_file_set.id, pdf_paths.first)
          ]
        end

        # handle each input pdf (when input is a file set, we will only have one).
        pdf_paths.each do |original_pdf_path|
          split_pdf(original_pdf_path, user, child_model, pdf_file_set)
        end

        # Link newly created child works to the parent. The job is given the user, the parent
        # work id, the parent model and the child model, and is delayed by ten minutes.
        IiifPrint::Jobs::CreateRelationshipsJob.set(wait: 10.minutes).perform_later(
          user: user,
          parent_id: @parent_work.id.to_s,
          parent_model: @parent_work.class.to_s,
          child_model: child_model.to_s
        )

        # TODO: clean up image_files and pdf_paths
      end
      # rubocop:enable Metrics/MethodLength

      private

      # Splits a single PDF into page images and enqueues the batch job that creates one child
      # work per page.
      #
      # @param original_pdf_path [String] path of the PDF to split
      # @param user [User, Object] a User, or a user key resolved with User.find_by_user_key
      # @param child_model [Class, String] model of the child works; sent to the batch job as a string
      # @param pdf_file_set [Object, nil] the file set the PDF came from, nil when the job was
      #   started from a work
      # @raise [RuntimeError] when the splitter service returns no image files
      # rubocop:disable Metrics/ParameterLists
      # rubocop:disable Metrics/MethodLength
      def split_pdf(original_pdf_path, user, child_model, pdf_file_set)
        user = User.find_by_user_key(user) unless user.is_a?(User)
        image_files = @parent_work.iiif_print_config.pdf_splitter_service.call(original_pdf_path)

        # give as much info as possible if we don't have image files to work with.
        if image_files.blank?
          raise "#{@parent_work.class} (ID=#{@parent_work.id} " \
                "to_param:#{@parent_work.to_param}) " \
                "original_pdf_path #{original_pdf_path.inspect} " \
                "pdf_file_set #{pdf_file_set.inspect}"
        end

        # An empty string when there is no file set: `&.` yields nil and nil.to_s is "".
        @split_from_pdf_id = pdf_file_set&.id.to_s
        prepare_import_data(original_pdf_path, image_files, user)

        # submit the job to create all the child works for one PDF. BatchCreateJob receives, in
        # order: the user, the titles keyed by uploaded file id, the resource types (none here),
        # the Hyrax::UploadedFile ids, the attributes to apply to all works (including :model)
        # and the batch create operation.
        operation = Hyrax::BatchCreateOperation.create!(
          user: user,
          operation_type: "PDF Batch Create"
        )
        # merge (not merge!) so the hash handed back by the configured attributes function is
        # never modified, in case that function returns a shared object.
        child_attributes = attributes.merge(model: child_model.to_s, split_from_pdf_id: @split_from_pdf_id)
        BatchCreateJob.perform_later(user,
                                     @child_work_titles,
                                     {},
                                     @uploaded_files,
                                     child_attributes.with_indifferent_access,
                                     operation)
      end
      # rubocop:enable Metrics/MethodLength
      # rubocop:enable Metrics/ParameterLists

      # Creates an uploaded file and a PendingRelationship for every page image, collecting the
      # uploaded file ids in @uploaded_files and the titles (keyed by file id) in
      # @child_work_titles. Each temporary page image is removed once it has been processed.
      #
      # @param original_pdf_path [String] path of the PDF the images were split from
      # @param image_files [Array<String>] paths of the page images, in page order
      # @param user [User] owner of the uploaded files
      # rubocop:disable Metrics/MethodLength
      def prepare_import_data(original_pdf_path, image_files, user)
        @uploaded_files = []
        @child_work_titles = {}
        # The padding depends only on the page count, so work it out once for all pages.
        page_padding = number_of_digits(nbr: image_files.size)

        # page_number is the zero-based index supplied by each_with_index.
        image_files.each_with_index do |image_path, page_number|
          file_id = create_uploaded_file(user, image_path).to_s

          child_title = IiifPrint.config.unique_child_title_generator_function.call(
            original_pdf_path: original_pdf_path,
            image_path: image_path,
            parent_work: @parent_work,
            page_number: page_number,
            page_padding: page_padding
          )

          @uploaded_files << file_id
          @child_work_titles[file_id] = child_title
          # save child work info to create the member relationships
          PendingRelationship.create!(child_title: child_title,
                                      parent_id: @parent_work.id,
                                      child_order: child_title,
                                      parent_model: @parent_work.class,
                                      child_model: @parent_work.iiif_print_config.pdf_split_child_model,
                                      file_id: @split_from_pdf_id)

          begin
            # Clean up the temporary image path. rm_f already tolerates a missing file, so no
            # existence check is needed first.
            FileUtils.rm_f(image_path)
          rescue
            # If we can't delete, let's move on. Maybe it was already cleaned-up.
          end
        end
      end
      # rubocop:enable Metrics/MethodLength

      # @param nbr [Integer] a number, here the page count
      # @return [Integer] how many characters the number takes when written out
      def number_of_digits(nbr:)
        nbr.to_s.size
      end

      # Stores the file at the given path as a Hyrax::UploadedFile.
      #
      # @param user [User, Object] the owner; its id is used when it has one, otherwise the value
      #   itself is used as the user id
      # @param path [String] path of the file to upload
      # @return [Object] id of the saved Hyrax::UploadedFile
      def create_uploaded_file(user, path)
        # TODO: Could we create a remote path?
        uf = Hyrax::UploadedFile.new
        uf.user_id = user.try(:id) || user
        uf.file = CarrierWave::SanitizedFile.new(path)
        uf.save!
        uf.id
      end

      # TODO: what attributes do we need to fill in from the parent work? What about AllinsonFlex?
      #
      # @return [Hash] (presumably) attributes for the child works, as produced by the configured
      #   child work attributes function for the parent work and admin set
      def attributes
        IiifPrint.config.child_work_attributes_function.call(parent_work: @parent_work, admin_set_id: @child_admin_set_id)
      end
    end
  end
end
|