iiif_print 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
- data/.github/workflows/build-lint-test-action.yaml +4 -5
- data/.gitignore +5 -4
- data/.rubocop.yml +1 -0
- data/.solargraph.yml +19 -0
- data/Gemfile.lock +1025 -0
- data/README.md +98 -9
- data/Rakefile +6 -0
- data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
- data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
- data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
- data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
- data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
- data/app/helpers/iiif_print_helper.rb +0 -20
- data/app/indexers/concerns/iiif_print/child_indexer.rb +9 -3
- data/app/indexers/concerns/iiif_print/file_set_indexer.rb +17 -4
- data/app/models/concerns/iiif_print/set_child_flag.rb +9 -0
- data/app/models/concerns/iiif_print/solr/document.rb +14 -0
- data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
- data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
- data/app/models/iiif_print/pending_relationship.rb +3 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
- data/app/presenters/iiif_print/work_show_presenter_decorator.rb +19 -10
- data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
- data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
- data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
- data/app/services/iiif_print/manifest_builder_service_behavior.rb +88 -31
- data/app/services/iiif_print/pluggable_derivative_service.rb +3 -9
- data/app/views/catalog/_index_header_list_default.html.erb +13 -0
- data/app/views/hyrax/base/_representative_media.html.erb +4 -3
- data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
- data/app/views/hyrax/file_sets/_actions.html.erb +2 -1
- data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
- data/config/locales/iiif_print.en.yml +4 -0
- data/config/routes.rb +3 -0
- data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
- data/docker-compose.yml +2 -2
- data/iiif_print.gemspec +10 -9
- data/lib/generators/iiif_print/install_generator.rb +21 -1
- data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
- data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
- data/lib/iiif_print/base_derivative_service.rb +2 -1
- data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +57 -5
- data/lib/iiif_print/catalog_search_builder.rb +5 -1
- data/lib/iiif_print/configuration.rb +145 -8
- data/lib/iiif_print/data/fileset_helper.rb +1 -1
- data/lib/iiif_print/data/work_derivatives.rb +3 -3
- data/lib/iiif_print/engine.rb +7 -13
- data/lib/iiif_print/errors.rb +18 -0
- data/lib/iiif_print/homepage_search_builder.rb +17 -0
- data/lib/iiif_print/image_tool.rb +12 -8
- data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +74 -33
- data/lib/iiif_print/jobs/create_relationships_job.rb +80 -31
- data/lib/iiif_print/jobs/request_split_pdf_job.rb +31 -0
- data/lib/iiif_print/lineage_service.rb +29 -8
- data/lib/iiif_print/metadata.rb +67 -48
- data/lib/iiif_print/split_pdfs/base_splitter.rb +142 -0
- data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +68 -32
- data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
- data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +33 -0
- data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
- data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
- data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
- data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
- data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
- data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
- data/lib/iiif_print/version.rb +1 -1
- data/lib/iiif_print.rb +167 -12
- data/lib/samvera/derivatives/configuration.rb +83 -0
- data/lib/samvera/derivatives/hyrax.rb +129 -0
- data/lib/samvera/derivatives.rb +238 -0
- data/spec/factories/newspaper_page_solr_document.rb +9 -1
- data/spec/fixtures/authorities/licenses.yml +4 -0
- data/spec/fixtures/authorities/rights_statements.yml +4 -0
- data/spec/iiif_print/base_derivative_service_spec.rb +20 -3
- data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +11 -3
- data/spec/iiif_print/catalog_search_builder_spec.rb +1 -1
- data/spec/iiif_print/configuration_spec.rb +141 -15
- data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +7 -2
- data/spec/iiif_print/jobs/create_relationships_job_spec.rb +110 -9
- data/spec/iiif_print/lineage_service_spec.rb +1 -1
- data/spec/iiif_print/metadata_spec.rb +157 -23
- data/spec/iiif_print/split_pdfs/base_splitter_spec.rb +27 -0
- data/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb +80 -0
- data/spec/iiif_print/split_pdfs/destroy_pdf_child_works_service_spec.rb +92 -0
- data/spec/iiif_print/split_pdfs/pages_to_jpgs_splitter_spec.rb +22 -0
- data/spec/iiif_print/split_pdfs/pages_to_pngs_splitter_spec.rb +18 -0
- data/spec/iiif_print/split_pdfs/pages_to_tiffs_splitter_spec.rb +19 -0
- data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +2 -2
- data/spec/iiif_print_spec.rb +125 -5
- data/spec/models/iiif_print/iiif_search_decorator_spec.rb +27 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +51 -0
- data/spec/samvera/derivatives/configuration_spec.rb +41 -0
- data/spec/samvera/derivatives/hyrax_spec.rb +62 -0
- data/spec/samvera/derivatives_spec.rb +54 -0
- data/spec/services/iiif_print/derivative_rodeo_service_spec.rb +103 -0
- data/spec/services/iiif_print/manifest_builder_service_behavior_spec.rb +20 -0
- data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +8 -11
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +1 -1
- data/tasks/copy_authorities_to_test_app.rake +11 -0
- data/tasks/iiif_print_dev.rake +4 -4
- metadata +123 -35
- data/app/helpers/hyrax/iiif_helper.rb +0 -22
- data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
- data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
@@ -1,4 +1,5 @@
|
|
1
1
|
module IiifPrint
|
2
|
+
# rubocop:disable Metrics/ClassLength
|
2
3
|
class Configuration
|
3
4
|
attr_writer :after_create_fileset_handler
|
4
5
|
|
@@ -12,6 +13,14 @@ module IiifPrint
|
|
12
13
|
end
|
13
14
|
end
|
14
15
|
|
16
|
+
attr_writer :ancestory_identifier_function
|
17
|
+
# The function, with arity 1, that receives a work and returns its identifier for the purposes
|
18
|
+
# of object ancestry.
|
19
|
+
# @return [Proc]
|
20
|
+
def ancestory_identifier_function
|
21
|
+
@ancestory_identifier_function ||= ->(work) { work.id }
|
22
|
+
end
|
23
|
+
|
15
24
|
attr_writer :excluded_model_name_solr_field_values
|
16
25
|
# By default, this uses an array of human readable types
|
17
26
|
# ex: ['Generic Work', 'Image']
|
@@ -21,6 +30,45 @@ module IiifPrint
|
|
21
30
|
@excluded_model_name_solr_field_values = []
|
22
31
|
end
|
23
32
|
|
33
|
+
def skip_splitting_pdf_files_that_end_with_these_texts=(values)
|
34
|
+
@skip_splitting_pdf_files_that_end_with_these_texts = Array.wrap(values).map(&:downcase)
|
35
|
+
end
|
36
|
+
|
37
|
+
##
|
38
|
+
# @return [Array<String>] the file suffixes (e.g. [".reader.pdf"]) that we will skip. Per
|
39
|
+
# the implementation of {.split_for_path_suffix?}, these values are cast to
|
40
|
+
# downcase.
|
41
|
+
def skip_splitting_pdf_files_that_end_with_these_texts
|
42
|
+
@skip_splitting_pdf_files_that_end_with_these_texts || []
|
43
|
+
end
|
44
|
+
|
45
|
+
attr_writer :unique_child_title_generator_function
|
46
|
+
|
47
|
+
# The function, with keywords (though maybe you'll want to splat ignore a few), is responsible
|
48
|
+
# for generating the child work file title.
|
49
|
+
#
|
50
|
+
# The keyword parameters that will be passed to this function are:
|
51
|
+
#
|
52
|
+
# :original_pdf_path - The fully qualified pathname to the original PDF from which the images
|
53
|
+
# were split.
|
54
|
+
# :image_path - The fully qualified pathname for an image of the single page from the PDF.
|
55
|
+
# :parent_work - The object in which we're "attaching" the image.
|
56
|
+
# :page_number - The image is of the N-th page_number of the original PDF
|
57
|
+
# :page_padding - A helper number that indicates the number of significant digits of pages
|
58
|
+
# (e.g. 150 pages would have a padding of 3).
|
59
|
+
#
|
60
|
+
# @return [Proc]
|
61
|
+
# rubocop:disable Lint/UnusedBlockArgument
|
62
|
+
def unique_child_title_generator_function
|
63
|
+
@unique_child_title_generator_function ||= lambda { |original_pdf_path:, image_path:, parent_work:, page_number:, page_padding:|
|
64
|
+
identifier = parent_work.id
|
65
|
+
filename = File.basename(original_pdf_path)
|
66
|
+
page_suffix = "Page #{(page_number.to_i + 1).to_s.rjust(page_padding.to_i, '0')}"
|
67
|
+
"#{identifier} - #{filename} #{page_suffix}"
|
68
|
+
}
|
69
|
+
end
|
70
|
+
# rubocop:enable Lint/UnusedBlockArgument
|
71
|
+
|
24
72
|
# This method wraps Hyrax's configuration so we can sniff out the correct method to use. The
|
25
73
|
# {Hyrax::Configuration#whitelisted_ingest_dirs} is deprecated in favor of
|
26
74
|
# {Hyrax::Configuration#registered_ingest_dirs}.
|
@@ -44,7 +92,7 @@ module IiifPrint
|
|
44
92
|
|
45
93
|
attr_writer :default_iiif_manifest_version
|
46
94
|
def default_iiif_manifest_version
|
47
|
-
@default_iiif_manifest_version || 2
|
95
|
+
@default_iiif_manifest_version.presence || 2
|
48
96
|
end
|
49
97
|
|
50
98
|
attr_writer :metadata_fields
|
@@ -81,19 +129,108 @@ module IiifPrint
|
|
81
129
|
end
|
82
130
|
# rubocop:enable Metrics/MethodLength
|
83
131
|
|
132
|
+
attr_writer :additional_tesseract_options
|
133
|
+
##
|
134
|
+
# The additional options to pass to the Tesseract configuration
|
135
|
+
#
|
136
|
+
# @see https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html
|
137
|
+
# @return [String]
|
138
|
+
def additional_tesseract_options
|
139
|
+
@additional_tesseract_options || ""
|
140
|
+
end
|
141
|
+
|
142
|
+
attr_writer :uv_config_path
|
143
|
+
##
|
144
|
+
# According to https://github.com/samvera/hyrax/wiki/Hyrax-Management-Guide#universal-viewer-config
|
145
|
+
# the name of the UV config file should be /uv/uv_config.json (with an _)
|
146
|
+
# However, in most applications, it is /uv/uv-config.json (with a -)
|
147
|
+
def uv_config_path
|
148
|
+
@uv_config_path || "/uv/uv-config.json"
|
149
|
+
end
|
150
|
+
|
151
|
+
attr_writer :uv_base_path
|
152
|
+
##
|
153
|
+
# While we're at it, we're going to go ahead and make the base path configurable as well
|
154
|
+
def uv_base_path
|
155
|
+
@uv_base_path || "/uv/uv.html"
|
156
|
+
end
|
157
|
+
|
158
|
+
attr_writer :child_work_attributes_function
|
159
|
+
##
|
160
|
+
# Here we allow for customization of the child work attributes
|
161
|
+
def child_work_attributes_function
|
162
|
+
@child_work_attributes_function ||= lambda do |parent_work:, admin_set_id:|
|
163
|
+
{
|
164
|
+
admin_set_id: admin_set_id.to_s,
|
165
|
+
creator: parent_work.creator.to_a,
|
166
|
+
rights_statement: parent_work.rights_statement.to_a,
|
167
|
+
visibility: parent_work.visibility.to_s,
|
168
|
+
is_child: true
|
169
|
+
}
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
84
173
|
attr_writer :sort_iiif_manifest_canvases_by
|
174
|
+
##
|
175
|
+
# Normally, the canvases are sorted by the `ordered_members` association.
|
176
|
+
# However, if you want it to be sorted by another property, you can set this
|
177
|
+
# configuration. Change `nil` to something like `:title` or `:identifier`.
|
178
|
+
#
|
179
|
+
# Should you want to sort by the filename of the image, you
|
180
|
+
# set `nil` to `:label`. This looks at the canvas label, which is typically set
|
181
|
+
# to the filename of the image.
|
85
182
|
def sort_iiif_manifest_canvases_by
|
86
|
-
@sort_iiif_manifest_canvases_by ||
|
183
|
+
@sort_iiif_manifest_canvases_by || nil
|
87
184
|
end
|
88
185
|
|
89
|
-
attr_writer :
|
186
|
+
attr_writer :ocr_coords_from_json_function
|
90
187
|
##
|
91
|
-
#
|
188
|
+
# This is used to determine where to pull the OCR coordinates from. By default, it will
|
189
|
+
# pull from the JSON file that is generated by the OCR engine. However, if you have a
|
190
|
+
# different source, you can set this configuration. Current implementation has access to
|
191
|
+
# the `file_set_id` and the `document` [SolrDocument].
|
92
192
|
#
|
93
|
-
# @see
|
94
|
-
|
95
|
-
|
96
|
-
|
193
|
+
# @see IiifPrint::BlacklightIiifSearch::AnnotationDecorator#fetch_and_parse_coords
|
194
|
+
def ocr_coords_from_json_function
|
195
|
+
@ocr_coords_from_json_function ||= lambda do |file_set_id:, **|
|
196
|
+
IiifPrint::Data::WorkDerivatives.data(from: file_set_id, of_type: 'json')
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
attr_writer :all_text_generator_function
|
201
|
+
##
|
202
|
+
# This configuration determines where to pull the full text from. By default, it will
|
203
|
+
# pull from the TXT file that is generated by the OCR engine. However, if your
|
204
|
+
# application has its own implementation of generating the full text, then you can
|
205
|
+
# set your own configuration here.
|
206
|
+
def all_text_generator_function
|
207
|
+
@all_text_generator_function ||= lambda do |object:|
|
208
|
+
IiifPrint::Data::WorkDerivatives.data(from: object, of_type: 'txt')
|
209
|
+
end
|
210
|
+
end
|
211
|
+
|
212
|
+
attr_writer :iiif_metadata_field_presentation_order
|
213
|
+
##
|
214
|
+
# This is the default sorter for the metadata. It will sort by the order of the keys specified.
|
215
|
+
# By default, this is turned off as it returns nil. If you want to turn it on, you can set this
|
216
|
+
# to an array of symbols naming the properties on the work.
|
217
|
+
#
|
218
|
+
# @example [:title, :description, :date_created]
|
219
|
+
# @return [Array<Symbol>]
|
220
|
+
def iiif_metadata_field_presentation_order
|
221
|
+
@iiif_metadata_field_presentation_order || nil
|
222
|
+
end
|
223
|
+
|
224
|
+
def questioning_authority_fields=(fields)
|
225
|
+
@questioning_authority_fields = Array.wrap(fields).map(&:to_s)
|
226
|
+
end
|
227
|
+
|
228
|
+
##
|
229
|
+
# This is used to explicitly set which fields should be rendered as a Questioning Authority in the UV.
|
230
|
+
# By default, we render `rights_statement` and `license` as QA fields.
|
231
|
+
def questioning_authority_fields
|
232
|
+
@questioning_authority_fields ||= ['rights_statement', 'license']
|
97
233
|
end
|
98
234
|
end
|
235
|
+
# rubocop:enable Metrics/ModuleLength
|
99
236
|
end
|
@@ -42,16 +42,16 @@ module IiifPrint
|
|
42
42
|
#
|
43
43
|
# @return [String]
|
44
44
|
def self.data(from:, of_type:)
|
45
|
-
new(from).data(of_type)
|
45
|
+
new(work: from).data(of_type)
|
46
46
|
end
|
47
47
|
|
48
48
|
# alternate constructor spelling:
|
49
49
|
def self.of(work, fileset = nil, parent = nil)
|
50
|
-
new(work, fileset, parent)
|
50
|
+
new(work: work, fileset: fileset, parent: parent)
|
51
51
|
end
|
52
52
|
|
53
53
|
# Adapt work and either specific or first fileset
|
54
|
-
def initialize(work, fileset
|
54
|
+
def initialize(work: nil, fileset: nil, parent: nil)
|
55
55
|
# adapted context usually work, may be string id of FileSet
|
56
56
|
@work = work
|
57
57
|
@fileset = fileset.nil? ? first_fileset : fileset
|
data/lib/iiif_print/engine.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'active_fedora'
|
2
2
|
require 'hyrax'
|
3
3
|
require 'blacklight_iiif_search'
|
4
|
+
require 'derivative_rodeo'
|
4
5
|
|
5
6
|
module IiifPrint
|
6
7
|
# module constants:
|
@@ -12,6 +13,7 @@ module IiifPrint
|
|
12
13
|
|
13
14
|
# rubocop:disable Metrics/BlockLength
|
14
15
|
config.to_prepare do
|
16
|
+
require "iiif_print/jobs/create_relationships_job"
|
15
17
|
# We don't have a hard requirement of Bulkrax but in our experience, lingering on earlier
|
16
18
|
# versions can introduce bugs of both Bulkrax and some of the assumptions that we've resolved.
|
17
19
|
# Very early versions of Bulkrax do not have VERSION defined
|
@@ -41,26 +43,16 @@ module IiifPrint
|
|
41
43
|
Hyrax::Renderers::FacetedAttributeRenderer.prepend(Hyrax::Renderers::FacetedAttributeRendererDecorator)
|
42
44
|
Hyrax::WorksControllerBehavior.prepend(IiifPrint::WorksControllerBehaviorDecorator)
|
43
45
|
Hyrax::WorkShowPresenter.prepend(IiifPrint::WorkShowPresenterDecorator)
|
46
|
+
Hyrax::IiifHelper.prepend(IiifPrint::IiifHelperDecorator)
|
44
47
|
|
45
48
|
IiifPrint::ChildIndexer.decorate_work_types!
|
46
49
|
IiifPrint::FileSetIndexer.decorate(Hyrax::FileSetIndexer)
|
47
50
|
|
48
51
|
::BlacklightIiifSearch::IiifSearchResponse.prepend(IiifPrint::IiifSearchResponseDecorator)
|
49
52
|
::BlacklightIiifSearch::IiifSearchAnnotation.prepend(IiifPrint::BlacklightIiifSearch::AnnotationDecorator)
|
53
|
+
::BlacklightIiifSearch::IiifSearch.prepend(IiifPrint::IiifSearchDecorator)
|
50
54
|
Hyrax::Actors::FileSetActor.prepend(IiifPrint::Actors::FileSetActorDecorator)
|
51
|
-
|
52
|
-
# Extending the presenter to the base url which includes the protocol.
|
53
|
-
# We need the base url to render the facet links and normalize the interface.
|
54
|
-
Hyrax::IiifManifestPresenter.send(:attr_accessor, :base_url)
|
55
|
-
Hyrax::IiifManifestPresenter::DisplayImagePresenter.send(:attr_accessor, :base_url)
|
56
|
-
# Extending this class because there is an #ability= but not #ability and this definition
|
57
|
-
# mirrors the Hyrax::IiifManifestPresenter#ability.
|
58
|
-
module Hyrax::IiifManifestPresenter::DisplayImagePresenterDecorator
|
59
|
-
def ability
|
60
|
-
@ability ||= NullAbility.new
|
61
|
-
end
|
62
|
-
end
|
63
|
-
Hyrax::IiifManifestPresenter::DisplayImagePresenter.prepend(Hyrax::IiifManifestPresenter::DisplayImagePresenterDecorator)
|
55
|
+
Hyrax::Actors::CleanupFileSetsActor.prepend(IiifPrint::Actors::CleanupFileSetsActorDecorator)
|
64
56
|
|
65
57
|
Hyrax.config do |config|
|
66
58
|
config.callback.set(:after_create_fileset) do |file_set, user|
|
@@ -71,6 +63,8 @@ module IiifPrint
|
|
71
63
|
|
72
64
|
config.after_initialize do
|
73
65
|
IiifPrint::Solr::Document.decorate(SolrDocument)
|
66
|
+
Hyrax::IiifManifestPresenter::DisplayImagePresenter
|
67
|
+
.prepend(IiifPrint::IiifManifestPresenterBehavior::DisplayImagePresenterBehavior)
|
74
68
|
end
|
75
69
|
# rubocop:enable Metrics/BlockLength
|
76
70
|
end
|
data/lib/iiif_print/errors.rb
CHANGED
@@ -6,4 +6,22 @@ module IiifPrint
|
|
6
6
|
# Data transformation or read-error:
|
7
7
|
class DataError < IiifPrintError
|
8
8
|
end
|
9
|
+
|
10
|
+
class MissingFileError < IiifPrintError
|
11
|
+
end
|
12
|
+
|
13
|
+
class WorkNotConfiguredToSplitFileSetError < IiifPrintError
|
14
|
+
def initialize(file_set:, work:)
|
15
|
+
message = "Expected that we would be splitting #{file_set.class} ID=#{file_set&.id} #to_param=#{file_set&.to_param} " \
|
16
|
+
"for work #{work.class} ID=#{work&.id} #to_param=#{work&.to_param}. " \
|
17
|
+
"However it was not configured for PDF splitting."
|
18
|
+
super(message)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
class UnexpectedMimeTypeError < IiifPrintError
|
23
|
+
def initialize(file_set:, mime_type:)
|
24
|
+
super "Unexpected mime_type #{mime_type} for #{file_set.class} ID=#{file_set.id.inspect}"
|
25
|
+
end
|
26
|
+
end
|
9
27
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Overrides Hyrax to add show_parents_only to processor chain
|
4
|
+
module IiifPrint
|
5
|
+
class HomepageSearchBuilder < Hyrax::HomepageSearchBuilder
|
6
|
+
self.default_processor_chain += [:show_parents_only]
|
7
|
+
|
8
|
+
def show_parents_only(solr_parameters)
|
9
|
+
query = if blacklight_params["include_child_works"] == 'true'
|
10
|
+
ActiveFedora::SolrQueryBuilder.construct_query(is_child_bsi: 'true')
|
11
|
+
else
|
12
|
+
ActiveFedora::SolrQueryBuilder.construct_query(is_child_bsi: nil)
|
13
|
+
end
|
14
|
+
solr_parameters[:fq] += [query]
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -3,11 +3,10 @@ require 'tmpdir'
|
|
3
3
|
|
4
4
|
module IiifPrint
|
5
5
|
class ImageTool
|
6
|
-
attr_accessor :path
|
6
|
+
attr_accessor :path
|
7
7
|
|
8
8
|
def initialize(path)
|
9
9
|
@path = path
|
10
|
-
@ftype = magic
|
11
10
|
@metadata = nil
|
12
11
|
end
|
13
12
|
|
@@ -60,7 +59,7 @@ module IiifPrint
|
|
60
59
|
end
|
61
60
|
|
62
61
|
def im_line_select(lines, key)
|
63
|
-
line = lines.find { |l| l.scrub.downcase.strip.start_with?(key) }
|
62
|
+
line = lines.find { |l| l.scrub.downcase.strip.start_with?(key.downcase) }
|
64
63
|
# Given "key: value" line, return the value as String stripped of
|
65
64
|
# leading and trailing whitespace
|
66
65
|
return line if line.nil?
|
@@ -75,20 +74,25 @@ module IiifPrint
|
|
75
74
|
|
76
75
|
# @return [Array<String>] lines of output from imagemagick `identify`
|
77
76
|
def im_identify
|
78
|
-
cmd = "identify -
|
77
|
+
cmd = "identify -format 'Geometry: %G\nDepth: %[bit-depth]\nColorspace: %[colorspace]\nAlpha: %A\nMIME type: %m\n' #{path}"
|
79
78
|
`#{cmd}`.lines
|
80
79
|
end
|
81
80
|
|
82
81
|
def im_mime(lines)
|
83
82
|
return 'application/pdf' if pdf? # workaround older imagemagick bug
|
84
|
-
|
83
|
+
|
84
|
+
format = im_line_select(lines, 'mime type')
|
85
|
+
return if format.blank?
|
86
|
+
|
87
|
+
# `identify -format` with the `%m` switch only gives the format, we are coercing it into an image mime type
|
88
|
+
Mime::Type.lookup_by_extension(format.downcase).to_s
|
85
89
|
end
|
86
90
|
|
87
91
|
def populate_im_color!(lines, result)
|
88
92
|
bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
|
89
93
|
colorspace = im_line_select(lines, 'colorspace')
|
90
94
|
color = colorspace == 'Gray' ? 'gray' : 'color'
|
91
|
-
has_alpha = !im_line_select(lines, '
|
95
|
+
has_alpha = !im_line_select(lines, 'alpha') == 'Undefined'
|
92
96
|
result[:num_components] = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
|
93
97
|
result[:color] = bpc == 1 ? 'monochrome' : color
|
94
98
|
result[:bits_per_component] = bpc
|
@@ -105,11 +109,11 @@ module IiifPrint
|
|
105
109
|
end
|
106
110
|
|
107
111
|
def magic
|
108
|
-
File.read(@path, 23, 0)
|
112
|
+
@magic ||= File.read(@path, 23, 0)
|
109
113
|
end
|
110
114
|
|
111
115
|
def jp2?
|
112
|
-
|
116
|
+
magic.end_with?('ftypjp2')
|
113
117
|
end
|
114
118
|
|
115
119
|
def pdf?
|
@@ -1,20 +1,38 @@
|
|
1
1
|
module IiifPrint
|
2
2
|
module Jobs
|
3
|
+
# @deprecated
|
3
4
|
class ChildWorksFromPdfJob < IiifPrint::Jobs::ApplicationJob
|
5
|
+
##
|
4
6
|
# Break a pdf into individual pages
|
5
|
-
#
|
7
|
+
#
|
8
|
+
# @param candidate_for_parency [FileSet, Hydra::PCDM::Work]
|
6
9
|
# @param pdf_paths: [<Array => String>] paths to pdfs
|
7
10
|
# @param user: [User]
|
8
11
|
# @param admin_set_id: [<String>]
|
9
|
-
#
|
10
|
-
def perform(
|
11
|
-
|
12
|
+
# rubocop:disable Metrics/MethodLength
|
13
|
+
def perform(candidate_for_parency, pdf_paths, user, admin_set_id, *)
|
14
|
+
##
|
15
|
+
# We know that we have cases where parent_work is nil, this will definitely raise an
|
16
|
+
# exception; which is fine because we were going to do it later anyway.
|
17
|
+
@parent_work = if candidate_for_parency.work?
|
18
|
+
pdf_file_set = nil
|
19
|
+
candidate_for_parency
|
20
|
+
else
|
21
|
+
# We likely have a file set
|
22
|
+
pdf_file_set = candidate_for_parency
|
23
|
+
IiifPrint.parent_for(candidate_for_parency)
|
24
|
+
end
|
12
25
|
@child_admin_set_id = admin_set_id
|
13
26
|
child_model = @parent_work.iiif_print_config.pdf_split_child_model
|
14
27
|
|
15
|
-
#
|
16
|
-
|
17
|
-
|
28
|
+
# When working with remote files, we have put the PDF file into the correct path before submitting this job.
|
29
|
+
# However, there seem to be cases where we still don't have the file when we get here, so to be sure, we
|
30
|
+
# re-do the same command that was previously used to prepare the file path. If the file is already here, it
|
31
|
+
# simply returns the path, but if not it will copy the file there, giving us one more chance to have what we need.
|
32
|
+
pdf_paths = [Hyrax::WorkingDirectory.find_or_retrieve(pdf_file_set.files.first.id, pdf_file_set.id, pdf_paths.first)] if pdf_file_set
|
33
|
+
# handle each input pdf (when input is a file set, we will only have one).
|
34
|
+
pdf_paths.each do |original_pdf_path|
|
35
|
+
split_pdf(original_pdf_path, user, child_model, pdf_file_set)
|
18
36
|
end
|
19
37
|
|
20
38
|
# Link newly created child works to the parent
|
@@ -31,15 +49,25 @@ module IiifPrint
|
|
31
49
|
|
32
50
|
# TODO: clean up image_files and pdf_paths
|
33
51
|
end
|
52
|
+
# rubocop:enable Metrics/MethodLength
|
34
53
|
|
35
54
|
private
|
36
55
|
|
37
|
-
|
38
|
-
|
39
|
-
|
56
|
+
# rubocop:disable Metrics/ParameterLists
|
57
|
+
# rubocop:disable Metrics/MethodLength
|
58
|
+
def split_pdf(original_pdf_path, user, child_model, pdf_file_set)
|
59
|
+
image_files = @parent_work.iiif_print_config.pdf_splitter_service.call(original_pdf_path, file_set: pdf_file_set)
|
40
60
|
|
41
|
-
|
42
|
-
|
61
|
+
# give as much info as possible if we don't have image files to work with.
|
62
|
+
if image_files.blank?
|
63
|
+
raise "#{@parent_work.class} (ID=#{@parent_work.id} " /
|
64
|
+
"to_param:#{@parent_work.to_param}) " /
|
65
|
+
"original_pdf_path #{original_pdf_path.inspect} " /
|
66
|
+
"pdf_file_set #{pdf_file_set.inspect}"
|
67
|
+
end
|
68
|
+
|
69
|
+
@split_from_pdf_id = pdf_file_set&.id
|
70
|
+
prepare_import_data(original_pdf_path, image_files, user)
|
43
71
|
|
44
72
|
# submit the job to create all the child works for one PDF
|
45
73
|
# @param [User] user
|
@@ -56,30 +84,54 @@ module IiifPrint
|
|
56
84
|
@child_work_titles,
|
57
85
|
{},
|
58
86
|
@uploaded_files,
|
59
|
-
attributes.merge!(model: child_model.to_s).with_indifferent_access,
|
87
|
+
attributes.merge!(model: child_model.to_s, split_from_pdf_id: @split_from_pdf_id).with_indifferent_access,
|
60
88
|
operation)
|
61
89
|
end
|
90
|
+
# rubocop:enable Metrics/MethodLength
|
91
|
+
# rubocop:enable Metrics/ParameterLists
|
62
92
|
|
63
|
-
|
93
|
+
# rubocop:disable Metrics/MethodLength
|
94
|
+
def prepare_import_data(original_pdf_path, image_files, user)
|
64
95
|
@uploaded_files = []
|
65
96
|
@child_work_titles = {}
|
66
|
-
image_files.
|
97
|
+
number_of_pages_in_pdf = image_files.size
|
98
|
+
image_files.each_with_index do |image_path, page_number|
|
67
99
|
file_id = create_uploaded_file(user, image_path).to_s
|
68
|
-
|
100
|
+
|
101
|
+
child_title = IiifPrint.config.unique_child_title_generator_function.call(
|
102
|
+
original_pdf_path: original_pdf_path,
|
103
|
+
image_path: image_path,
|
104
|
+
parent_work: @parent_work,
|
105
|
+
page_number: page_number,
|
106
|
+
page_padding: number_of_digits(nbr: number_of_pages_in_pdf)
|
107
|
+
)
|
108
|
+
|
69
109
|
@uploaded_files << file_id
|
70
|
-
@child_work_titles[file_id] =
|
110
|
+
@child_work_titles[file_id] = child_title
|
71
111
|
# save child work info to create the member relationships
|
72
|
-
PendingRelationship.create!(child_title:
|
112
|
+
PendingRelationship.create!(child_title: child_title,
|
73
113
|
parent_id: @parent_work.id,
|
74
|
-
child_order:
|
114
|
+
child_order: child_title,
|
115
|
+
parent_model: @parent_work.class,
|
116
|
+
child_model: @parent_work.iiif_print_config.pdf_split_child_model,
|
117
|
+
file_id: @split_from_pdf_id)
|
118
|
+
|
119
|
+
begin
|
120
|
+
# Clean up the temporary image path.
|
121
|
+
FileUtils.rm_f(image_path) if File.exist?(image_path)
|
122
|
+
rescue
|
123
|
+
# If we can't delete, let's move on. Maybe it was already cleaned-up.
|
124
|
+
end
|
75
125
|
end
|
76
126
|
end
|
127
|
+
# rubocop:enable Metrics/MethodLength
|
77
128
|
|
78
|
-
def
|
79
|
-
|
129
|
+
def number_of_digits(nbr:)
|
130
|
+
nbr.to_s.size
|
80
131
|
end
|
81
132
|
|
82
133
|
def create_uploaded_file(user, path)
|
134
|
+
# TODO: Could we create a remote path?
|
83
135
|
uf = Hyrax::UploadedFile.new
|
84
136
|
uf.user_id = user.id
|
85
137
|
uf.file = CarrierWave::SanitizedFile.new(path)
|
@@ -87,20 +139,9 @@ module IiifPrint
|
|
87
139
|
uf.id
|
88
140
|
end
|
89
141
|
|
90
|
-
def set_title(title, pdf_sequence, idx)
|
91
|
-
pdf_index = "Pdf Nbr #{pdf_sequence + 1}"
|
92
|
-
page_number = "Page #{idx + 1}"
|
93
|
-
"#{title}: #{pdf_index}, #{page_number}"
|
94
|
-
end
|
95
|
-
|
96
142
|
# TODO: what attributes do we need to fill in from the parent work? What about AllinsonFlex?
|
97
143
|
def attributes
|
98
|
-
|
99
|
-
admin_set_id: @child_admin_set_id.to_s,
|
100
|
-
creator: @parent_work.creator.to_a,
|
101
|
-
rights_statement: @parent_work.rights_statement.to_a,
|
102
|
-
visibility: @parent_work.visibility.to_s
|
103
|
-
}
|
144
|
+
IiifPrint.config.child_work_attributes_function.call(parent_work: @parent_work, admin_set_id: @child_admin_set_id)
|
104
145
|
end
|
105
146
|
end
|
106
147
|
end
|