iiif_print 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (108) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
  4. data/.github/workflows/build-lint-test-action.yaml +4 -5
  5. data/.gitignore +5 -4
  6. data/.rubocop.yml +1 -0
  7. data/.solargraph.yml +19 -0
  8. data/Gemfile.lock +1025 -0
  9. data/README.md +98 -9
  10. data/Rakefile +6 -0
  11. data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
  12. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
  13. data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
  14. data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
  15. data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
  16. data/app/helpers/iiif_print_helper.rb +0 -20
  17. data/app/indexers/concerns/iiif_print/child_indexer.rb +9 -3
  18. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +17 -4
  19. data/app/models/concerns/iiif_print/set_child_flag.rb +9 -0
  20. data/app/models/concerns/iiif_print/solr/document.rb +14 -0
  21. data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
  22. data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
  23. data/app/models/iiif_print/pending_relationship.rb +3 -0
  24. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
  25. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
  26. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +19 -10
  27. data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
  28. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
  29. data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
  30. data/app/services/iiif_print/manifest_builder_service_behavior.rb +88 -31
  31. data/app/services/iiif_print/pluggable_derivative_service.rb +3 -9
  32. data/app/views/catalog/_index_header_list_default.html.erb +13 -0
  33. data/app/views/hyrax/base/_representative_media.html.erb +4 -3
  34. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
  35. data/app/views/hyrax/file_sets/_actions.html.erb +2 -1
  36. data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
  37. data/config/locales/iiif_print.en.yml +4 -0
  38. data/config/routes.rb +3 -0
  39. data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
  40. data/docker-compose.yml +2 -2
  41. data/iiif_print.gemspec +10 -9
  42. data/lib/generators/iiif_print/install_generator.rb +21 -1
  43. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
  44. data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
  45. data/lib/iiif_print/base_derivative_service.rb +2 -1
  46. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +57 -5
  47. data/lib/iiif_print/catalog_search_builder.rb +5 -1
  48. data/lib/iiif_print/configuration.rb +145 -8
  49. data/lib/iiif_print/data/fileset_helper.rb +1 -1
  50. data/lib/iiif_print/data/work_derivatives.rb +3 -3
  51. data/lib/iiif_print/engine.rb +7 -13
  52. data/lib/iiif_print/errors.rb +18 -0
  53. data/lib/iiif_print/homepage_search_builder.rb +17 -0
  54. data/lib/iiif_print/image_tool.rb +12 -8
  55. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +74 -33
  56. data/lib/iiif_print/jobs/create_relationships_job.rb +80 -31
  57. data/lib/iiif_print/jobs/request_split_pdf_job.rb +31 -0
  58. data/lib/iiif_print/lineage_service.rb +29 -8
  59. data/lib/iiif_print/metadata.rb +67 -48
  60. data/lib/iiif_print/split_pdfs/base_splitter.rb +142 -0
  61. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +68 -32
  62. data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
  63. data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +33 -0
  64. data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
  65. data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
  66. data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
  67. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
  68. data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
  69. data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
  70. data/lib/iiif_print/version.rb +1 -1
  71. data/lib/iiif_print.rb +167 -12
  72. data/lib/samvera/derivatives/configuration.rb +83 -0
  73. data/lib/samvera/derivatives/hyrax.rb +129 -0
  74. data/lib/samvera/derivatives.rb +238 -0
  75. data/spec/factories/newspaper_page_solr_document.rb +9 -1
  76. data/spec/fixtures/authorities/licenses.yml +4 -0
  77. data/spec/fixtures/authorities/rights_statements.yml +4 -0
  78. data/spec/iiif_print/base_derivative_service_spec.rb +20 -3
  79. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +11 -3
  80. data/spec/iiif_print/catalog_search_builder_spec.rb +1 -1
  81. data/spec/iiif_print/configuration_spec.rb +141 -15
  82. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +7 -2
  83. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +110 -9
  84. data/spec/iiif_print/lineage_service_spec.rb +1 -1
  85. data/spec/iiif_print/metadata_spec.rb +157 -23
  86. data/spec/iiif_print/split_pdfs/base_splitter_spec.rb +27 -0
  87. data/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb +80 -0
  88. data/spec/iiif_print/split_pdfs/destroy_pdf_child_works_service_spec.rb +92 -0
  89. data/spec/iiif_print/split_pdfs/pages_to_jpgs_splitter_spec.rb +22 -0
  90. data/spec/iiif_print/split_pdfs/pages_to_pngs_splitter_spec.rb +18 -0
  91. data/spec/iiif_print/split_pdfs/pages_to_tiffs_splitter_spec.rb +19 -0
  92. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +2 -2
  93. data/spec/iiif_print_spec.rb +125 -5
  94. data/spec/models/iiif_print/iiif_search_decorator_spec.rb +27 -0
  95. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +51 -0
  96. data/spec/samvera/derivatives/configuration_spec.rb +41 -0
  97. data/spec/samvera/derivatives/hyrax_spec.rb +62 -0
  98. data/spec/samvera/derivatives_spec.rb +54 -0
  99. data/spec/services/iiif_print/derivative_rodeo_service_spec.rb +103 -0
  100. data/spec/services/iiif_print/manifest_builder_service_behavior_spec.rb +20 -0
  101. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +8 -11
  102. data/spec/test_app_templates/lib/generators/test_app_generator.rb +1 -1
  103. data/tasks/copy_authorities_to_test_app.rake +11 -0
  104. data/tasks/iiif_print_dev.rake +4 -4
  105. metadata +123 -35
  106. data/app/helpers/hyrax/iiif_helper.rb +0 -22
  107. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
  108. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
@@ -1,4 +1,5 @@
1
1
  module IiifPrint
2
+ # rubocop:disable Metrics/ClassLength
2
3
  class Configuration
3
4
  attr_writer :after_create_fileset_handler
4
5
 
@@ -12,6 +13,14 @@ module IiifPrint
12
13
  end
13
14
  end
14
15
 
16
+ attr_writer :ancestory_identifier_function
17
+ # The function, with arity 1, that receives a work and returns its identifier for the purposes
18
+ # of object ancestry.
19
+ # @return [Proc]
20
+ def ancestory_identifier_function
21
+ @ancestory_identifier_function ||= ->(work) { work.id }
22
+ end
23
+
15
24
  attr_writer :excluded_model_name_solr_field_values
16
25
  # By default, this uses an array of human readable types
17
26
  # ex: ['Generic Work', 'Image']
@@ -21,6 +30,45 @@ module IiifPrint
21
30
  @excluded_model_name_solr_field_values = []
22
31
  end
23
32
 
33
+ def skip_splitting_pdf_files_that_end_with_these_texts=(values)
34
+ @skip_splitting_pdf_files_that_end_with_these_texts = Array.wrap(values).map(&:downcase)
35
+ end
36
+
37
+ ##
38
+ # @return [Array<String>] the file suffixes (e.g. [".reader.pdf"]) that we will skip. Per
39
+ # the implementation of {.split_for_path_suffix?}, these values are cast to
40
+ # downcase.
41
+ def skip_splitting_pdf_files_that_end_with_these_texts
42
+ @skip_splitting_pdf_files_that_end_with_these_texts || []
43
+ end
44
+
45
+ attr_writer :unique_child_title_generator_function
46
+
47
+ # The function, with keywords (though maybe you'll want to splat ignore a few), is responsible
48
+ # for generating the child work file title.
49
+ #
50
+ # The keyword parameters that will be passed to this function are:
51
+ #
52
+ # :original_pdf_path - The fully qualified pathname to the original PDF from which the images
53
+ # were split.
54
+ # :image_path - The fully qualified pathname for an image of the single page from the PDF.
55
+ # :parent_work - The object in which we're "attaching" the image.
56
+ # :page_number - The image is of the N-th page_number of the original PDF
57
+ # :page_padding - A helper number that indicates the number of significant digits of pages
58
+ # (e.g. 150 pages would have a padding of 3).
59
+ #
60
+ # @return [Proc]
61
+ # rubocop:disable Lint/UnusedBlockArgument
62
+ def unique_child_title_generator_function
63
+ @unique_child_title_generator_function ||= lambda { |original_pdf_path:, image_path:, parent_work:, page_number:, page_padding:|
64
+ identifier = parent_work.id
65
+ filename = File.basename(original_pdf_path)
66
+ page_suffix = "Page #{(page_number.to_i + 1).to_s.rjust(page_padding.to_i, '0')}"
67
+ "#{identifier} - #{filename} #{page_suffix}"
68
+ }
69
+ end
70
+ # rubocop:enable Lint/UnusedBlockArgument
71
+
24
72
  # This method wraps Hyrax's configuration so we can sniff out the correct method to use. The
25
73
  # {Hyrax::Configuration#whitelisted_ingest_dirs} is deprecated in favor of
26
74
  # {Hyrax::Configuration#registered_ingest_dirs}.
@@ -44,7 +92,7 @@ module IiifPrint
44
92
 
45
93
  attr_writer :default_iiif_manifest_version
46
94
  def default_iiif_manifest_version
47
- @default_iiif_manifest_version || 2
95
+ @default_iiif_manifest_version.presence || 2
48
96
  end
49
97
 
50
98
  attr_writer :metadata_fields
@@ -81,19 +129,108 @@ module IiifPrint
81
129
  end
82
130
  # rubocop:enable Metrics/MethodLength
83
131
 
132
+ attr_writer :additional_tesseract_options
133
+ ##
134
+ # The additional options to pass to the Tesseract configuration
135
+ #
136
+ # @see https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html
137
+ # @return [String]
138
+ def additional_tesseract_options
139
+ @additional_tesseract_options || ""
140
+ end
141
+
142
+ attr_writer :uv_config_path
143
+ ##
144
+ # According to https://github.com/samvera/hyrax/wiki/Hyrax-Management-Guide#universal-viewer-config
145
+ # the name of the UV config file should be /uv/uv_config.json (with an _)
146
+ # However, in most applications, it is /uv/uv-config.json (with a -)
147
+ def uv_config_path
148
+ @uv_config_path || "/uv/uv-config.json"
149
+ end
150
+
151
+ attr_writer :uv_base_path
152
+ ##
153
+ # While we're at it, we're going to go ahead and make the base path configurable as well
154
+ def uv_base_path
155
+ @uv_base_path || "/uv/uv.html"
156
+ end
157
+
158
+ attr_writer :child_work_attributes_function
159
+ ##
160
+ # Here we allow for customization of the child work attributes
161
+ def child_work_attributes_function
162
+ @child_work_attributes_function ||= lambda do |parent_work:, admin_set_id:|
163
+ {
164
+ admin_set_id: admin_set_id.to_s,
165
+ creator: parent_work.creator.to_a,
166
+ rights_statement: parent_work.rights_statement.to_a,
167
+ visibility: parent_work.visibility.to_s,
168
+ is_child: true
169
+ }
170
+ end
171
+ end
172
+
84
173
  attr_writer :sort_iiif_manifest_canvases_by
174
+ ##
175
+ # Normally, the canvases are sorted by the `ordered_members` association.
176
+ # However, if you want it to be sorted by another property, you can set this
177
+ # configuration. Change `nil` to something like `:title` or `:identifier`.
178
+ #
179
+ # Should you want to sort by the filename of the image, you
180
+ # set `nil` to `:label`. This looks at the canvas label, which is typically set
181
+ # to the filename of the image.
85
182
  def sort_iiif_manifest_canvases_by
86
- @sort_iiif_manifest_canvases_by || :title
183
+ @sort_iiif_manifest_canvases_by || nil
87
184
  end
88
185
 
89
- attr_writer :additional_tessearct_options
186
+ attr_writer :ocr_coords_from_json_function
90
187
  ##
91
- # The additional options to pass to the Tesseract configuration
188
+ # This is used to determine where to pull the OCR coordinates from. By default, it will
189
+ # pull from the JSON file that is generated by the OCR engine. However, if you have a
190
+ # different source, you can set this configuration. Current implementation has access to
191
+ # the `file_set_id` and the `document` [SolrDocument].
92
192
  #
93
- # @see https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html
94
- # @return [String]
95
- def additional_tessearct_options
96
- @additional_tessearct_options || ""
193
+ # @see IiifPrint::BlacklightIiifSearch::AnnotationDecorator#fetch_and_parse_coords
194
+ def ocr_coords_from_json_function
195
+ @ocr_coords_from_json_function ||= lambda do |file_set_id:, **|
196
+ IiifPrint::Data::WorkDerivatives.data(from: file_set_id, of_type: 'json')
197
+ end
198
+ end
199
+
200
+ attr_writer :all_text_generator_function
201
+ ##
202
+ # This configuration determines where to pull the full text from. By default, it will
203
+ # pull from the TXT file that is generated by the OCR engine. However, if your
204
+ # application has its own implementation of generating the full text, then you can
205
+ # set your own configuration here.
206
+ def all_text_generator_function
207
+ @all_text_generator_function ||= lambda do |object:|
208
+ IiifPrint::Data::WorkDerivatives.data(from: object, of_type: 'txt')
209
+ end
210
+ end
211
+
212
+ attr_writer :iiif_metadata_field_presentation_order
213
+ ##
214
+ # This is the default sorter for the metadata. It will sort by the order of the keys specified.
215
+ # By default, this is turned off as it returns nil. If you want to turn it on, you can set this
216
+ # to an array of symbols naming the properties on the work.
217
+ #
218
+ # @example [:title, :description, :date_created]
219
+ # @return [Array<Symbol>]
220
+ def iiif_metadata_field_presentation_order
221
+ @iiif_metadata_field_presentation_order || nil
222
+ end
223
+
224
+ def questioning_authority_fields=(fields)
225
+ @questioning_authority_fields = Array.wrap(fields).map(&:to_s)
226
+ end
227
+
228
+ ##
229
+ # This is used to explicitly set which fields should be rendered as a Questioning Authority in the UV.
230
+ # By default, we render `rights_statement` and `license` as QA fields.
231
+ def questioning_authority_fields
232
+ @questioning_authority_fields ||= ['rights_statement', 'license']
97
233
  end
98
234
  end
235
+ # rubocop:enable Metrics/ClassLength
99
236
  end
@@ -7,7 +7,7 @@ module IiifPrint
7
7
  # if context is itself a string, presume it is a file set id
8
8
  return @work if @work.is_a? String
9
9
  # if context is not a String, presume a work or fileset context:
10
- fileset.nil? ? nil : fileset.id
10
+ fileset&.id
11
11
  end
12
12
 
13
13
  def first_fileset
@@ -42,16 +42,16 @@ module IiifPrint
42
42
  #
43
43
  # @return [String]
44
44
  def self.data(from:, of_type:)
45
- new(from).data(of_type)
45
+ new(work: from).data(of_type)
46
46
  end
47
47
 
48
48
  # alternate constructor spelling:
49
49
  def self.of(work, fileset = nil, parent = nil)
50
- new(work, fileset, parent)
50
+ new(work: work, fileset: fileset, parent: parent)
51
51
  end
52
52
 
53
53
  # Adapt work and either specific or first fileset
54
- def initialize(work, fileset = nil, parent = nil)
54
+ def initialize(work: nil, fileset: nil, parent: nil)
55
55
  # adapted context usually work, may be string id of FileSet
56
56
  @work = work
57
57
  @fileset = fileset.nil? ? first_fileset : fileset
@@ -1,6 +1,7 @@
1
1
  require 'active_fedora'
2
2
  require 'hyrax'
3
3
  require 'blacklight_iiif_search'
4
+ require 'derivative_rodeo'
4
5
 
5
6
  module IiifPrint
6
7
  # module constants:
@@ -12,6 +13,7 @@ module IiifPrint
12
13
 
13
14
  # rubocop:disable Metrics/BlockLength
14
15
  config.to_prepare do
16
+ require "iiif_print/jobs/create_relationships_job"
15
17
  # We don't have a hard requirement of Bulkrax but in our experience, lingering on earlier
16
18
  # versions can introduce bugs of both Bulkrax and some of the assumptions that we've resolved.
17
19
  # Very early versions of Bulkrax do not have VERSION defined
@@ -41,26 +43,16 @@ module IiifPrint
41
43
  Hyrax::Renderers::FacetedAttributeRenderer.prepend(Hyrax::Renderers::FacetedAttributeRendererDecorator)
42
44
  Hyrax::WorksControllerBehavior.prepend(IiifPrint::WorksControllerBehaviorDecorator)
43
45
  Hyrax::WorkShowPresenter.prepend(IiifPrint::WorkShowPresenterDecorator)
46
+ Hyrax::IiifHelper.prepend(IiifPrint::IiifHelperDecorator)
44
47
 
45
48
  IiifPrint::ChildIndexer.decorate_work_types!
46
49
  IiifPrint::FileSetIndexer.decorate(Hyrax::FileSetIndexer)
47
50
 
48
51
  ::BlacklightIiifSearch::IiifSearchResponse.prepend(IiifPrint::IiifSearchResponseDecorator)
49
52
  ::BlacklightIiifSearch::IiifSearchAnnotation.prepend(IiifPrint::BlacklightIiifSearch::AnnotationDecorator)
53
+ ::BlacklightIiifSearch::IiifSearch.prepend(IiifPrint::IiifSearchDecorator)
50
54
  Hyrax::Actors::FileSetActor.prepend(IiifPrint::Actors::FileSetActorDecorator)
51
-
52
- # Extending the presenter to the base url which includes the protocol.
53
- # We need the base url to render the facet links and normalize the interface.
54
- Hyrax::IiifManifestPresenter.send(:attr_accessor, :base_url)
55
- Hyrax::IiifManifestPresenter::DisplayImagePresenter.send(:attr_accessor, :base_url)
56
- # Extending this class because there is an #ability= but not #ability and this definition
57
- # mirrors the Hyrax::IiifManifestPresenter#ability.
58
- module Hyrax::IiifManifestPresenter::DisplayImagePresenterDecorator
59
- def ability
60
- @ability ||= NullAbility.new
61
- end
62
- end
63
- Hyrax::IiifManifestPresenter::DisplayImagePresenter.prepend(Hyrax::IiifManifestPresenter::DisplayImagePresenterDecorator)
55
+ Hyrax::Actors::CleanupFileSetsActor.prepend(IiifPrint::Actors::CleanupFileSetsActorDecorator)
64
56
 
65
57
  Hyrax.config do |config|
66
58
  config.callback.set(:after_create_fileset) do |file_set, user|
@@ -71,6 +63,8 @@ module IiifPrint
71
63
 
72
64
  config.after_initialize do
73
65
  IiifPrint::Solr::Document.decorate(SolrDocument)
66
+ Hyrax::IiifManifestPresenter::DisplayImagePresenter
67
+ .prepend(IiifPrint::IiifManifestPresenterBehavior::DisplayImagePresenterBehavior)
74
68
  end
75
69
  # rubocop:enable Metrics/BlockLength
76
70
  end
@@ -6,4 +6,22 @@ module IiifPrint
6
6
  # Data transformation or read-error:
7
7
  class DataError < IiifPrintError
8
8
  end
9
+
10
+ class MissingFileError < IiifPrintError
11
+ end
12
+
13
+ class WorkNotConfiguredToSplitFileSetError < IiifPrintError
14
+ def initialize(file_set:, work:)
15
+ message = "Expected that we would be splitting #{file_set.class} ID=#{file_set&.id} #to_param=#{file_set&.to_param} " \
16
+ "for work #{work.class} ID=#{work&.id} #to_param=#{work&.to_param}. " \
17
+ "However it was not configured for PDF splitting."
18
+ super(message)
19
+ end
20
+ end
21
+
22
+ class UnexpectedMimeTypeError < IiifPrintError
23
+ def initialize(file_set:, mime_type:)
24
+ super "Unexpected mime_type #{mime_type} for #{file_set.class} ID=#{file_set.id.inspect}"
25
+ end
26
+ end
9
27
  end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Overrides Hyrax to add show_parents_only to processor chain
4
+ module IiifPrint
5
+ class HomepageSearchBuilder < Hyrax::HomepageSearchBuilder
6
+ self.default_processor_chain += [:show_parents_only]
7
+
8
+ def show_parents_only(solr_parameters)
9
+ query = if blacklight_params["include_child_works"] == 'true'
10
+ ActiveFedora::SolrQueryBuilder.construct_query(is_child_bsi: 'true')
11
+ else
12
+ ActiveFedora::SolrQueryBuilder.construct_query(is_child_bsi: nil)
13
+ end
14
+ solr_parameters[:fq] += [query]
15
+ end
16
+ end
17
+ end
@@ -3,11 +3,10 @@ require 'tmpdir'
3
3
 
4
4
  module IiifPrint
5
5
  class ImageTool
6
- attr_accessor :path, :ftype
6
+ attr_accessor :path
7
7
 
8
8
  def initialize(path)
9
9
  @path = path
10
- @ftype = magic
11
10
  @metadata = nil
12
11
  end
13
12
 
@@ -60,7 +59,7 @@ module IiifPrint
60
59
  end
61
60
 
62
61
  def im_line_select(lines, key)
63
- line = lines.find { |l| l.scrub.downcase.strip.start_with?(key) }
62
+ line = lines.find { |l| l.scrub.downcase.strip.start_with?(key.downcase) }
64
63
  # Given "key: value" line, return the value as String stripped of
65
64
  # leading and trailing whitespace
66
65
  return line if line.nil?
@@ -75,20 +74,25 @@ module IiifPrint
75
74
 
76
75
  # @return [Array<String>] lines of output from imagemagick `identify`
77
76
  def im_identify
78
- cmd = "identify -verbose #{path}"
77
+ cmd = "identify -format 'Geometry: %G\nDepth: %[bit-depth]\nColorspace: %[colorspace]\nAlpha: %A\nMIME type: %m\n' #{path}"
79
78
  `#{cmd}`.lines
80
79
  end
81
80
 
82
81
  def im_mime(lines)
83
82
  return 'application/pdf' if pdf? # workaround older imagemagick bug
84
- im_line_select(lines, 'mime type')
83
+
84
+ format = im_line_select(lines, 'mime type')
85
+ return if format.blank?
86
+
87
+ # `identify -format` with the `%m` switch only gives the format, we are coercing it into an image mime type
88
+ Mime::Type.lookup_by_extension(format.downcase).to_s
85
89
  end
86
90
 
87
91
  def populate_im_color!(lines, result)
88
92
  bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
89
93
  colorspace = im_line_select(lines, 'colorspace')
90
94
  color = colorspace == 'Gray' ? 'gray' : 'color'
91
- has_alpha = !im_line_select(lines, 'Alpha').nil?
95
+ has_alpha = !im_line_select(lines, 'alpha') == 'Undefined'
92
96
  result[:num_components] = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
93
97
  result[:color] = bpc == 1 ? 'monochrome' : color
94
98
  result[:bits_per_component] = bpc
@@ -105,11 +109,11 @@ module IiifPrint
105
109
  end
106
110
 
107
111
  def magic
108
- File.read(@path, 23, 0)
112
+ @magic ||= File.read(@path, 23, 0)
109
113
  end
110
114
 
111
115
  def jp2?
112
- @ftype.end_with?('ftypjp2')
116
+ magic.end_with?('ftypjp2')
113
117
  end
114
118
 
115
119
  def pdf?
@@ -1,20 +1,38 @@
1
1
  module IiifPrint
2
2
  module Jobs
3
+ # @deprecated
3
4
  class ChildWorksFromPdfJob < IiifPrint::Jobs::ApplicationJob
5
+ ##
4
6
  # Break a pdf into individual pages
5
- # @param parent_work
7
+ #
8
+ # @param candidate_for_parency [FileSet, Hydra::PCDM::Work]
6
9
  # @param pdf_paths: [<Array => String>] paths to pdfs
7
10
  # @param user: [User]
8
11
  # @param admin_set_id: [<String>]
9
- # @param prior_pdfs: [<Integer>] count of pdfs already on parent work
10
- def perform(parent_work, pdf_paths, user, admin_set_id, prior_pdfs)
11
- @parent_work = parent_work
12
+ # rubocop:disable Metrics/MethodLength
13
+ def perform(candidate_for_parency, pdf_paths, user, admin_set_id, *)
14
+ ##
15
+ # We know that we have cases where parent_work is nil, this will definitely raise an
16
+ # exception; which is fine because we were going to do it later anyway.
17
+ @parent_work = if candidate_for_parency.work?
18
+ pdf_file_set = nil
19
+ candidate_for_parency
20
+ else
21
+ # We likely have a file set
22
+ pdf_file_set = candidate_for_parency
23
+ IiifPrint.parent_for(candidate_for_parency)
24
+ end
12
25
  @child_admin_set_id = admin_set_id
13
26
  child_model = @parent_work.iiif_print_config.pdf_split_child_model
14
27
 
15
- # handle each input pdf
16
- pdf_paths.each_with_index do |path, pdf_idx|
17
- split_pdf(path, pdf_idx, user, prior_pdfs, child_model)
28
+ # When working with remote files, we have put the PDF file into the correct path before submitting this job.
29
+ # However, there seem to be cases where we still don't have the file when we get here, so to be sure, we
30
+ # re-do the same command that was previously used to prepare the file path. If the file is already here, it
31
+ # simply returns the path, but if not it will copy the file there, giving us one more chance to have what we need.
32
+ pdf_paths = [Hyrax::WorkingDirectory.find_or_retrieve(pdf_file_set.files.first.id, pdf_file_set.id, pdf_paths.first)] if pdf_file_set
33
+ # handle each input pdf (when input is a file set, we will only have one).
34
+ pdf_paths.each do |original_pdf_path|
35
+ split_pdf(original_pdf_path, user, child_model, pdf_file_set)
18
36
  end
19
37
 
20
38
  # Link newly created child works to the parent
@@ -31,15 +49,25 @@ module IiifPrint
31
49
 
32
50
  # TODO: clean up image_files and pdf_paths
33
51
  end
52
+ # rubocop:enable Metrics/MethodLength
34
53
 
35
54
  private
36
55
 
37
- def split_pdf(path, pdf_idx, user, prior_pdfs_count, child_model)
38
- image_files = @parent_work.iiif_print_config.pdf_splitter_service.new(path).to_a
39
- return if image_files.blank?
56
+ # rubocop:disable Metrics/ParameterLists
57
+ # rubocop:disable Metrics/MethodLength
58
+ def split_pdf(original_pdf_path, user, child_model, pdf_file_set)
59
+ image_files = @parent_work.iiif_print_config.pdf_splitter_service.call(original_pdf_path, file_set: pdf_file_set)
40
60
 
41
- pdf_sequence = pdf_idx + prior_pdfs_count
42
- prepare_import_data(pdf_sequence, image_files, user)
61
+ # give as much info as possible if we don't have image files to work with.
62
+ if image_files.blank?
63
+ raise "#{@parent_work.class} (ID=#{@parent_work.id} " /
64
+ "to_param:#{@parent_work.to_param}) " /
65
+ "original_pdf_path #{original_pdf_path.inspect} " /
66
+ "pdf_file_set #{pdf_file_set.inspect}"
67
+ end
68
+
69
+ @split_from_pdf_id = pdf_file_set&.id
70
+ prepare_import_data(original_pdf_path, image_files, user)
43
71
 
44
72
  # submit the job to create all the child works for one PDF
45
73
  # @param [User] user
@@ -56,30 +84,54 @@ module IiifPrint
56
84
  @child_work_titles,
57
85
  {},
58
86
  @uploaded_files,
59
- attributes.merge!(model: child_model.to_s).with_indifferent_access,
87
+ attributes.merge!(model: child_model.to_s, split_from_pdf_id: @split_from_pdf_id).with_indifferent_access,
60
88
  operation)
61
89
  end
90
+ # rubocop:enable Metrics/MethodLength
91
+ # rubocop:enable Metrics/ParameterLists
62
92
 
63
- def prepare_import_data(pdf_sequence, image_files, user)
93
+ # rubocop:disable Metrics/MethodLength
94
+ def prepare_import_data(original_pdf_path, image_files, user)
64
95
  @uploaded_files = []
65
96
  @child_work_titles = {}
66
- image_files.each_with_index do |image_path, idx|
97
+ number_of_pages_in_pdf = image_files.size
98
+ image_files.each_with_index do |image_path, page_number|
67
99
  file_id = create_uploaded_file(user, image_path).to_s
68
- file_title = set_title(@parent_work.title.first, pdf_sequence, idx)
100
+
101
+ child_title = IiifPrint.config.unique_child_title_generator_function.call(
102
+ original_pdf_path: original_pdf_path,
103
+ image_path: image_path,
104
+ parent_work: @parent_work,
105
+ page_number: page_number,
106
+ page_padding: number_of_digits(nbr: number_of_pages_in_pdf)
107
+ )
108
+
69
109
  @uploaded_files << file_id
70
- @child_work_titles[file_id] = file_title
110
+ @child_work_titles[file_id] = child_title
71
111
  # save child work info to create the member relationships
72
- PendingRelationship.create!(child_title: file_title,
112
+ PendingRelationship.create!(child_title: child_title,
73
113
  parent_id: @parent_work.id,
74
- child_order: sort_order(pdf_sequence, idx))
114
+ child_order: child_title,
115
+ parent_model: @parent_work.class,
116
+ child_model: @parent_work.iiif_print_config.pdf_split_child_model,
117
+ file_id: @split_from_pdf_id)
118
+
119
+ begin
120
+ # Clean up the temporary image path.
121
+ FileUtils.rm_f(image_path) if File.exist?(image_path)
122
+ rescue
123
+ # If we can't delete, let's move on. Maybe it was already cleaned-up.
124
+ end
75
125
  end
76
126
  end
127
+ # rubocop:enable Metrics/MethodLength
77
128
 
78
- def sort_order(pdf_sequence, idx)
79
- "#{pdf_sequence} #{idx}"
129
+ def number_of_digits(nbr:)
130
+ nbr.to_s.size
80
131
  end
81
132
 
82
133
  def create_uploaded_file(user, path)
134
+ # TODO: Could we create a remote path?
83
135
  uf = Hyrax::UploadedFile.new
84
136
  uf.user_id = user.id
85
137
  uf.file = CarrierWave::SanitizedFile.new(path)
@@ -87,20 +139,9 @@ module IiifPrint
87
139
  uf.id
88
140
  end
89
141
 
90
- def set_title(title, pdf_sequence, idx)
91
- pdf_index = "Pdf Nbr #{pdf_sequence + 1}"
92
- page_number = "Page #{idx + 1}"
93
- "#{title}: #{pdf_index}, #{page_number}"
94
- end
95
-
96
142
  # TODO: what attributes do we need to fill in from the parent work? What about AllinsonFlex?
97
143
  def attributes
98
- {
99
- admin_set_id: @child_admin_set_id.to_s,
100
- creator: @parent_work.creator.to_a,
101
- rights_statement: @parent_work.rights_statement.to_a,
102
- visibility: @parent_work.visibility.to_s
103
- }
144
+ IiifPrint.config.child_work_attributes_function.call(parent_work: @parent_work, admin_set_id: @child_admin_set_id)
104
145
  end
105
146
  end
106
147
  end