iiif_print 1.0.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (181) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
  4. data/.github/workflows/build-lint-test-action.yaml +4 -5
  5. data/.gitignore +5 -4
  6. data/.rubocop.yml +1 -0
  7. data/.solargraph.yml +19 -0
  8. data/Gemfile.lock +1025 -0
  9. data/README.md +102 -9
  10. data/Rakefile +6 -0
  11. data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
  12. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
  13. data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
  14. data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
  15. data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
  16. data/app/helpers/iiif_print_helper.rb +0 -20
  17. data/app/indexers/concerns/iiif_print/child_work_indexer.rb +27 -0
  18. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +45 -17
  19. data/{lib → app/jobs}/iiif_print/jobs/application_job.rb +2 -1
  20. data/app/jobs/iiif_print/jobs/child_works_from_pdf_job.rb +153 -0
  21. data/app/jobs/iiif_print/jobs/create_relationships_job.rb +117 -0
  22. data/app/jobs/iiif_print/jobs/request_split_pdf_job.rb +31 -0
  23. data/app/listeners/iiif_print/listener.rb +31 -0
  24. data/app/models/concerns/iiif_print/set_child_flag.rb +10 -1
  25. data/app/models/concerns/iiif_print/solr/document.rb +19 -3
  26. data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
  27. data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
  28. data/app/models/iiif_print/pending_relationship.rb +3 -0
  29. data/app/presenters/iiif_print/file_set_presenter_decorator.rb +11 -0
  30. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
  31. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
  32. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +23 -11
  33. data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
  34. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
  35. data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
  36. data/app/services/iiif_print/manifest_builder_service_behavior.rb +90 -31
  37. data/app/services/iiif_print/pluggable_derivative_service.rb +8 -10
  38. data/app/services/iiif_print/simple_schema_loader_decorator.rb +11 -0
  39. data/app/transactions/hyrax/transactions/iiif_print_container_decorator.rb +34 -0
  40. data/app/transactions/hyrax/transactions/steps/conditionally_destroy_children_from_split.rb +32 -0
  41. data/app/transactions/hyrax/transactions/steps/delete_all_file_sets_decorator.rb +35 -0
  42. data/app/views/catalog/_index_header_list_default.html.erb +13 -0
  43. data/app/views/hyrax/base/_representative_media.html.erb +4 -3
  44. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
  45. data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
  46. data/config/initializers/simple_schema_loader.rb +1 -0
  47. data/config/locales/iiif_print.en.yml +4 -0
  48. data/config/metadata/child_works_from_pdf_splitting.yaml +21 -0
  49. data/config/routes.rb +3 -0
  50. data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +8 -6
  51. data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +7 -5
  52. data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +8 -6
  53. data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
  54. data/docker-compose.yml +2 -2
  55. data/iiif_print.gemspec +11 -10
  56. data/lib/generators/iiif_print/install_generator.rb +21 -1
  57. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
  58. data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
  59. data/lib/iiif_print/base_derivative_service.rb +14 -2
  60. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +58 -6
  61. data/lib/iiif_print/catalog_search_builder.rb +7 -3
  62. data/lib/iiif_print/configuration.rb +205 -8
  63. data/lib/iiif_print/data/fileset_helper.rb +3 -3
  64. data/lib/iiif_print/data/work_derivatives.rb +4 -4
  65. data/lib/iiif_print/engine.rb +53 -15
  66. data/lib/iiif_print/errors.rb +18 -0
  67. data/lib/iiif_print/homepage_search_builder.rb +17 -0
  68. data/lib/iiif_print/image_tool.rb +12 -8
  69. data/lib/iiif_print/jp2_derivative_service.rb +4 -1
  70. data/lib/iiif_print/lineage_service.rb +47 -13
  71. data/lib/iiif_print/metadata.rb +67 -48
  72. data/lib/iiif_print/pdf_derivative_service.rb +3 -1
  73. data/lib/iiif_print/persistence_layer/active_fedora_adapter.rb +189 -0
  74. data/lib/iiif_print/persistence_layer/valkyrie_adapter.rb +183 -0
  75. data/lib/iiif_print/persistence_layer.rb +118 -0
  76. data/lib/iiif_print/split_pdfs/base_splitter.rb +153 -0
  77. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +83 -37
  78. data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
  79. data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +22 -0
  80. data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
  81. data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
  82. data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
  83. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
  84. data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
  85. data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
  86. data/lib/iiif_print/text_extraction_derivative_service.rb +4 -2
  87. data/lib/iiif_print/text_formats_from_alto_service.rb +3 -1
  88. data/lib/iiif_print/tiff_derivative_service.rb +3 -1
  89. data/lib/iiif_print/version.rb +1 -1
  90. data/lib/iiif_print.rb +210 -20
  91. data/lib/samvera/derivatives/configuration.rb +83 -0
  92. data/lib/samvera/derivatives/hyrax.rb +129 -0
  93. data/lib/samvera/derivatives.rb +238 -0
  94. data/tasks/copy_authorities_to_test_app.rake +11 -0
  95. data/tasks/iiif_print_dev.rake +4 -4
  96. metadata +111 -196
  97. data/app/helpers/hyrax/iiif_helper.rb +0 -22
  98. data/app/indexers/concerns/iiif_print/child_indexer.rb +0 -34
  99. data/app/views/hyrax/file_sets/_actions.html.erb +0 -45
  100. data/bin/rails +0 -13
  101. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +0 -107
  102. data/lib/iiif_print/jobs/create_relationships_job.rb +0 -78
  103. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
  104. data/spec/.keep.txt +0 -1
  105. data/spec/factories/ability.rb +0 -6
  106. data/spec/factories/newspaper_issue.rb +0 -7
  107. data/spec/factories/newspaper_page.rb +0 -7
  108. data/spec/factories/newspaper_page_solr_document.rb +0 -12
  109. data/spec/factories/newspaper_title.rb +0 -8
  110. data/spec/factories/uploaded_pdf_file.rb +0 -9
  111. data/spec/factories/uploaded_txt_file.rb +0 -9
  112. data/spec/factories/user.rb +0 -13
  113. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  114. data/spec/fixtures/files/4.1.07.tiff +0 -0
  115. data/spec/fixtures/files/README.md +0 -7
  116. data/spec/fixtures/files/alto-2-0.xsd +0 -714
  117. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  118. data/spec/fixtures/files/credits.md +0 -16
  119. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  120. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  121. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  122. data/spec/fixtures/files/minimal-alto.xml +0 -31
  123. data/spec/fixtures/files/ndnp-alto-sample.xml +0 -24
  124. data/spec/fixtures/files/ndnp-sample1-json.json +0 -1
  125. data/spec/fixtures/files/ndnp-sample1-txt.txt +0 -1
  126. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  127. data/spec/fixtures/files/ocr_alto.xml +0 -202
  128. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +0 -202
  129. data/spec/fixtures/files/ocr_color.tiff +0 -0
  130. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  131. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  132. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  133. data/spec/fixtures/files/ocr_mono_text_hocr.html +0 -78
  134. data/spec/fixtures/files/page1.tiff +0 -0
  135. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  136. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  137. data/spec/fixtures/files/thumbnail.jpg +0 -0
  138. data/spec/helpers/hyrax/iiif_helper_spec.rb +0 -65
  139. data/spec/helpers/iiif_print_helper_spec.rb +0 -43
  140. data/spec/iiif_print/base_derivative_service_spec.rb +0 -11
  141. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +0 -51
  142. data/spec/iiif_print/catalog_search_builder_spec.rb +0 -60
  143. data/spec/iiif_print/configuration_spec.rb +0 -67
  144. data/spec/iiif_print/data/work_derivatives_spec.rb +0 -245
  145. data/spec/iiif_print/data/work_file_spec.rb +0 -99
  146. data/spec/iiif_print/data/work_files_spec.rb +0 -237
  147. data/spec/iiif_print/image_tool_spec.rb +0 -109
  148. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +0 -30
  149. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +0 -17
  150. data/spec/iiif_print/jp2_image_metadata_spec.rb +0 -37
  151. data/spec/iiif_print/lineage_service_spec.rb +0 -13
  152. data/spec/iiif_print/metadata_spec.rb +0 -115
  153. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
  154. data/spec/iiif_print/text_extraction/alto_reader_spec.rb +0 -49
  155. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +0 -45
  156. data/spec/iiif_print/text_extraction/page_ocr_spec.rb +0 -84
  157. data/spec/iiif_print/text_extraction/render_alto_spec.rb +0 -54
  158. data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +0 -44
  159. data/spec/iiif_print_spec.rb +0 -51
  160. data/spec/misc_shared.rb +0 -111
  161. data/spec/models/iiif_print/derivative_attachment_spec.rb +0 -37
  162. data/spec/models/iiif_print/ingest_file_relation_spec.rb +0 -56
  163. data/spec/models/solr_document_spec.rb +0 -14
  164. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +0 -19
  165. data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +0 -49
  166. data/spec/services/iiif_print/jp2_derivative_service_spec.rb +0 -59
  167. data/spec/services/iiif_print/pdf_derivative_service_spec.rb +0 -66
  168. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +0 -178
  169. data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +0 -82
  170. data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +0 -127
  171. data/spec/services/iiif_print/tiff_derivative_service_spec.rb +0 -65
  172. data/spec/spec_helper.rb +0 -181
  173. data/spec/support/controller_level_helpers.rb +0 -28
  174. data/spec/support/iiif_print_models.rb +0 -127
  175. data/spec/test_app_templates/blacklight.yml +0 -9
  176. data/spec/test_app_templates/fedora.yml +0 -15
  177. data/spec/test_app_templates/lib/generators/test_app_generator.rb +0 -40
  178. data/spec/test_app_templates/redis.yml +0 -9
  179. data/spec/test_app_templates/solr/conf/schema.xml +0 -362
  180. data/spec/test_app_templates/solr/conf/solrconfig.xml +0 -322
  181. data/spec/test_app_templates/solr.yml +0 -7
@@ -0,0 +1,153 @@
1
+ require 'open3'
2
+ require 'securerandom'
3
+ require 'tmpdir'
4
+ require 'iiif_print/split_pdfs/pdf_image_extraction_service'
5
+
6
+ module IiifPrint
7
+ module SplitPdfs
8
+ # @abstract
9
+ #
10
+ # The purpose of this class is to split the PDF into constituent image files.
11
+ #
12
+ # @see .call
13
+ class BaseSplitter
14
+ ##
15
+ # @api public
16
+ #
17
+ # @param path [String] local path to the PDF that we will split.
18
+ # @return [Enumerable]
19
+ #
20
+ # @see #each
21
+ #
22
+ # @note We're including the ** args to provide method conformity; other services require
23
+ # additional information (such as the FileSet)
24
+ #
25
+ # @see IiifPrint::SplitPdfs::DerivativeRodeoSplitter
26
+ def self.call(path, **)
27
+ new(path).to_a
28
+ end
29
+
30
+ ##
31
+ # @api public
32
+ #
33
+ # Added to allow for fine-tuning of splitting decision such as tenant-based omission
34
+ # @see https://github.com/samvera/hyku/blob/main/app/services/iiif_print/tenant_config.rb
35
+ #
36
+ # @return [Boolean] returns false to not limit the splitting of PDFs
37
+ def self.never_split_pdfs?
38
+ false
39
+ end
40
+
41
+ class_attribute :image_extension
42
+ class_attribute :compression, default: nil
43
+ class_attribute :quality, default: nil
44
+
45
+ def initialize(path, tmpdir: Dir.mktmpdir, default_dpi: 400)
46
+ @baseid = SecureRandom.uuid
47
+ @pdfpath = path
48
+ @pdfinfo = IiifPrint::SplitPdfs::PdfImageExtractionService.new(pdfpath)
49
+ @tmpdir = tmpdir
50
+ @default_dpi = default_dpi
51
+ end
52
+
53
+ # In creating {#each} we get many of the methods of array operation (e.g. #to_a).
54
+ include Enumerable
55
+
56
+ # @api public
57
+ #
58
+ # @yieldparam [String] the path to the page's image file (format depends on image_extension).
59
+ def each
60
+ entries.each do |e|
61
+ yield(e)
62
+ end
63
+ end
64
+
65
+ # @api private
66
+ #
67
+ # TODO: put this test somewhere to prevent invalid pdfs from crashing the image service.
68
+ def invalid_pdf?
69
+ return true if pdfinfo.color.include?(nil) || pdfinfo.width.nil? || pdfinfo.height.nil? || pdfinfo.page_count.zero?
70
+ false
71
+ end
72
+
73
+ attr_reader :pdfinfo, :tmpdir, :baseid, :default_dpi, :pdfpath
74
+ private :pdfinfo, :tmpdir, :baseid, :default_dpi, :pdfpath
75
+
76
+ private
77
+
78
+ # entries for each page
79
+ def entries
80
+ return @entries if defined? @entries
81
+
82
+ @entries = Array.wrap(gsconvert)
83
+ end
84
+
85
+ # rubocop:disable Metrics/MethodLength
86
+ # ghostscript convert all pages to image files (format determined by image_extension and gsdevice)
87
+ def gsconvert
88
+ output_base = File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")
89
+ # NOTE: you must call gsdevice before compression, as compression is
90
+ # updated during the gsdevice call.
91
+ cmd = "gs -dNOPAUSE -dBATCH -sDEVICE=#{gsdevice} -dTextAlphaBits=4"
92
+ cmd += " -sCompression=#{compression}" if compression?
93
+ cmd += " -dJPEGQ=#{quality}" if quality?
94
+ cmd += " -sOutputFile=#{output_base} -r#{ppi} -f #{pdfpath}"
95
+ filenames = []
96
+
97
+ Open3.popen3(cmd) do |_stdin, stdout, _stderr, _wait_thr|
98
+ page_number = 0
99
+ stdout.read.split("\n").each do |line|
100
+ next unless line.start_with?('Page ')
101
+
102
+ page_number += 1
103
+ filenames << File.join(tmpdir, "#{baseid}-page#{page_number}.#{image_extension}")
104
+ end
105
+ end
106
+
107
+ filenames
108
+ end
109
+ # rubocop:enable Metrics/MethodLength
110
+
111
+ def gsdevice
112
+ raise NotImplementedError
113
+ end
114
+
115
+ PAGE_COUNT_REGEXP = %r{^Pages: +(\d+)$}.freeze
116
+
117
+ def pagecount
118
+ return @pagecount if defined? @pagecount
119
+
120
+ cmd = "pdfinfo #{pdfpath}"
121
+ Open3.popen3(cmd) do |_stdin, stdout, _stderr, _wait_thr|
122
+ match = PAGE_COUNT_REGEXP.match(stdout.read)
123
+ @pagecount = match[1].to_i
124
+ end
125
+ @pagecount
126
+ end
127
+
128
+ def ppi
129
+ if looks_scanned?
130
+ # For scanned media, defer to detected image PPI:
131
+ pdfinfo.ppi
132
+ else
133
+ # 400 dpi for something that does not look like scanned media:
134
+ default_dpi
135
+ end
136
+ end
137
+
138
+ def looks_scanned?
139
+ max_image_px = pdfinfo.width * pdfinfo.height
140
+ # single 10mp+ image per page?
141
+ single_image_per_page? && max_image_px > 1024 * 1024 * 10
142
+ end
143
+
144
+ def single_image_per_page?
145
+ pdfinfo.page_count == pagecount
146
+ end
147
+ end
148
+ end
149
+ end
150
+
151
+ require "iiif_print/split_pdfs/pages_to_jpgs_splitter"
152
+ require "iiif_print/split_pdfs/pages_to_pngs_splitter"
153
+ require "iiif_print/split_pdfs/pages_to_tiffs_splitter"
@@ -1,29 +1,49 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # Encapsulates methods used for pdf splitting into child works
4
3
  module IiifPrint
5
4
  module SplitPdfs
5
+ ##
6
+ # Encapsulates methods used for pdf splitting into child works.
7
+ #
8
+ # The primary point of entry is {.conditionally_enqueue}.
6
9
  class ChildWorkCreationFromPdfService
7
- # Load an array of paths to pdf files
8
- # @param [Array > Hyrax::Upload file ids]
9
- # @return [Array > String] file paths to temp directory
10
- def self.pdf_paths(files:)
11
- upload_ids = filter_file_ids(files)
12
- return [] if upload_ids.empty?
13
- uploads = Hyrax::UploadedFile.find(upload_ids)
14
- paths = uploads.map(&method(:upload_path))
15
- pdfs_only_for(paths)
16
- end
10
+ ##
11
+ # Responsible for conditionally enqueueing the PDF splitting job. The conditions attempt to
12
+ # sniff out whether the given file was a PDF.
13
+ #
14
+ # @param file_set [FileSet] What is the containing file set for the provided file.
15
+ # @param file [#path, #id]
16
+ # @param user [User] Who did the upload?
17
+ # @param import_url [NilClass, String] Provided when we're dealing with a file provided via a
18
+ # URL.
19
+ # @param work [Hydra::PCDM::Work] An optional parameter that saves us a bit of time in not
20
+ # needing to query for the parent of the given :file_set (see {.parent_for})
21
+ #
22
+ # @return [Symbol] when we don't enqueue the job
23
+ # @return [Symbol] :enqueued when we actually enqueue the underlying job.
24
+ # rubocop:disable Metrics/MethodLength
25
+ def self.conditionally_enqueue(file_set:, file:, user:, import_url: nil, work: nil)
26
+ work ||= IiifPrint.parent_for(file_set)
17
27
 
18
- # Is child work splitting defined for model?
19
- # @param [GenericWork, etc] A valid type of hyrax work
20
- # @return [Boolean]
21
- def self.iiif_print_split?(work:)
22
- # defined only if work has include IiifPrint.model_configuration with pdf_split_child_model
23
- return true if work.try(:iiif_print_config)&.pdf_split_child_model
24
- false
28
+ return :no_split_for_parent unless iiif_print_split?(work: work)
29
+ return :no_pdfs_to_split_for_import_url if import_url && !pdfs?(paths: [import_url])
30
+
31
+ file_locations = if import_url
32
+ # TODO: Fix this logic, currently unsupported in Bulkrax
33
+ [Hyrax::WorkingDirectory.find_or_retrieve(file.id, file_set.id)]
34
+ else
35
+ pdf_paths(file: file)
36
+ end
37
+ return :no_pdfs_to_split if file_locations.empty?
38
+
39
+ IiifPrint.conditionally_submit_split_for(work: work, file_set: file_set, locations: file_locations, user: user)
40
+ :enqueued
25
41
  end
42
+ # rubocop:enable Metrics/MethodLength
26
43
 
44
+ ##
45
+ # @api private
46
+ #
27
47
  # Are there any PDF files?
28
48
  # @param [Array > String] paths to PDFs
29
49
  # @return [Boolean]
@@ -33,42 +53,68 @@ module IiifPrint
33
53
  true
34
54
  end
35
55
 
36
- # Submit the job to split PDF into child works
56
+ ##
57
+ # @api private
58
+ # Load an array of paths to pdf files
59
+ # @param file [#id, Valkyrie::Resource, nil] the uploaded file to inspect; nil yields an empty array
60
+ # @return [Array > String] file paths to temp directory
61
+ def self.pdf_paths(file:)
62
+ return [] unless file
63
+
64
+ if file.class < Valkyrie::Resource
65
+ # assuming that if one PDF is uploaded to a Valkyrie resource then all of them should be
66
+ paths = [Hyrax.storage_adapter.file_path(file.file_identifier)]
67
+ pdfs_only_for(paths)
68
+ else
69
+ upload_ids = filter_file_ids(file.id.to_s)
70
+ return [] if upload_ids.empty?
71
+
72
+ uploads = Hyrax::UploadedFile.find(upload_ids)
73
+ paths = uploads.map(&method(:upload_path))
74
+ pdfs_only_for(paths)
75
+ end
76
+ end
77
+
78
+ ##
79
+ # @api private
80
+ #
81
+ # Is child work splitting defined for model?
37
82
  # @param [GenericWork, etc] A valid type of hyrax work
38
- # @param [Array<String>] paths to PDF attachments
39
- # @param [User] user
40
- # @param [Integer] number of pdfs already on existing work's filesets (not yet implemented)
41
- def self.queue_job(work:, file_locations:, user:, admin_set_id:)
42
- work.iiif_print_config.pdf_splitter_job.perform_later(
43
- work,
44
- file_locations,
45
- user,
46
- admin_set_id,
47
- count_existing_pdfs(work)
48
- )
83
+ # @return [Boolean]
84
+ def self.iiif_print_split?(work:)
85
+ config = work.try(:iiif_print_config)
86
+ return false unless config
87
+ return false if config.pdf_splitter_service.try(:never_split_pdfs?)
88
+ # defined only if work has include IiifPrint.model_configuration with pdf_split_child_model
89
+ return true if config&.pdf_split_child_model
90
+ false
49
91
  end
50
92
 
93
+ ##
94
+ # @api private
51
95
  def self.filter_file_ids(input)
52
96
  Array.wrap(input).select(&:present?)
53
97
  end
54
98
 
99
+ ##
100
+ # @api private
101
+ #
55
102
  # Given Hyrax::Upload object, return path to file on local filesystem
56
103
  def self.upload_path(upload)
57
104
  # so many layers to this onion:
105
+ # TODO: Write a recursive function to keep calling file until
106
+ # the file doesn't respond to file then return that file.
58
107
  upload.file.file.file
59
108
  end
60
109
 
61
- # TODO: implement a method to count existing PDFs on a work to support
62
- # adding more PDFs to an existing work.
63
- def self.count_existing_pdfs(_work)
64
- 0
65
- end
66
-
110
+ ##
111
+ # @api private
112
+ #
67
113
  # TODO: Consider other methods to identify a PDF file.
68
114
  # This sub-selection may need to be moved to use mimetype if there
69
115
  # is a need to support paths not ending in .pdf (i.e. remote_urls)
70
116
  def self.pdfs_only_for(paths)
71
- paths.select { |path| path.end_with?('.pdf', '.PDF') }
117
+ paths.select { |path| IiifPrint.split_for_path_suffix?(path) }
72
118
  end
73
119
  end
74
120
  end
@@ -0,0 +1,166 @@
1
+ module IiifPrint
2
+ module SplitPdfs
3
+ ##
4
+ # This class wraps the DerivativeRodeo::Generators::PdfSplitGenerator to find preprocessed
5
+ # images, or split a PDF if there are no preprocessed images.
6
+ #
7
+ # We have already attached the original file to the file_set. We want to convert that original
8
+ # file that's attached to a input_uri (e.g. "file://path/to/original-file" as in what we have
9
+ # written to Fedora as the PDF)
10
+ #
11
+ # @see .call
12
+ class DerivativeRodeoSplitter
13
+ ##
14
+ # @param filename [String] the local path to the PDF file
15
+ # @param file_set [FileSet] file set containing the PDF file to split
16
+ #
17
+ # @return [Array<String>] paths to images split from each page of PDF file
18
+ #
19
+ # @see IiifPrint::SplitPdfs::BaseSplitter
20
+ def self.call(filename, file_set:)
21
+ new(filename, file_set: file_set).split_files
22
+ end
23
+
24
+ ##
25
+ # @param filename [String] path to the original file. Note that we use {#filename} to
26
+ # derivate {#input_uri}
27
+ # @param file_set [FileSet] the container for the original file and its derivatives.
28
+ #
29
+ # @param output_tmp_dir [String] where we will be writing things. In using `Dir.mktmpdir`
30
+ # we're creating a subdirectory on `Dir.tmpdir`
31
+ def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
32
+ @filename = filename
33
+ @file_set = file_set
34
+
35
+ @input_uri = "file://#{filename}"
36
+
37
+ # We are writing the images to a local location that CarrierWave can upload. This is a
38
+ # local file, internal to IiifPrint; it looks like SpaceStone/DerivativeRodeo lingo, but
39
+ # that's just a convenience.
40
+ output_template_path = File.join(output_tmp_dir, '{{ dir_parts[-1..-1] }}', '{{ filename }}')
41
+
42
+ @output_location_template = "file://#{output_template_path}"
43
+ end
44
+
45
+ attr_reader :filename, :file_set
46
+
47
+ ##
48
+ # This is where, in "Fedora" we have the original file. This is not the original file in the
49
+ # pre-processing location but instead the long-term location of the file in the application
50
+ # that mounts IIIF Print.
51
+ #
52
+ # @return [String]
53
+ attr_reader :input_uri
54
+
55
+ ##
56
+ # This is the location where we're going to write the derivatives that will "go into Fedora";
57
+ # it is a local location, one that IIIF Print's mounting application can directly do
58
+ # "File.read"
59
+ #
60
+ # @return [String]
61
+ attr_reader :output_location_template
62
+
63
+ ##
64
+ # Where can we find the file that represents the pre-processing template. In this case, the
65
+ # original PDF file.
66
+ #
67
+ # The logic handles a case where SpaceStone successfully fetched the file to then perform
68
+ # processing.
69
+ #
70
+ # For example, SpaceStone::Serverless will pre-process derivatives and write them into an S3
71
+ # bucket that we then use for IIIF Print.
72
+ #
73
+ # @note The preprocessed_location_template should end in `.pdf`. The
74
+ # DerivativeRodeo::Generators::PdfSplitGenerator#derive_preprocessed_template_from
75
+ # will coerce the template into one that represents the split pages.
76
+ #
77
+ # @return [String]
78
+ #
79
+ # @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63
80
+ # rubocop:disable Metrics/MethodLength
81
+ # rubocop:disable Metrics/AbcSize
82
+ def preprocessed_location_template
83
+ return @preprocessed_location_template if defined?(@preprocessed_location_template)
84
+
85
+ derivative_rodeo_candidate = IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename)
86
+
87
+ @preprocessed_location_template =
88
+ if derivative_rodeo_candidate.blank?
89
+ message = "#{self.class}##{__method__} could not establish derivative_rodeo_candidate for " \
90
+ "#{file_set.class} ID=#{file_set&.id} #to_param=#{file_set&.to_param} with filename #{filename.inspect}. " \
91
+ "Move along little buddy."
92
+ Rails.logger.debug(message)
93
+ nil
94
+ elsif rodeo_conformant_uri_exists?(derivative_rodeo_candidate)
95
+ Rails.logger.debug("#{self.class}##{__method__} found existing file at location #{derivative_rodeo_candidate}. High five partner!")
96
+ derivative_rodeo_candidate
97
+ elsif file_set.import_url
98
+ message = "#{self.class}##{__method__} did not find #{derivative_rodeo_candidate.inspect} to exist. " \
99
+ "Moving on to check the #{file_set.class}#import_url of #{file_set.import_url.inspect}"
100
+ Rails.logger.warn(message)
101
+ handle_original_file_not_in_derivative_rodeo
102
+ else
103
+ message = "#{self.class}##{__method__} could not find an existing file at #{derivative_rodeo_candidate} " \
104
+ "nor a remote_url for #{file_set.class} ID=#{file_set.id} #to_param=#{file_set&.to_param}. " \
105
+ "Returning `nil' as we have no possible preprocess. " \
106
+ "Maybe the input_uri #{input_uri.inspect} will be adequate."
107
+ Rails.logger.warn(message)
108
+ nil
109
+ end
110
+ end
111
+ # rubocop:enable Metrics/AbcSize
112
+ # rubocop:enable Metrics/MethodLength
113
+
114
+ ##
115
+ # @api private
116
+ #
117
+ # When the file does not exist in the pre-processed location (e.g. "SpaceStone") we need to
118
+ # ensure that we have something locally. We copy the {FileSet#import_url} to the {#input_uri}
119
+ # location.
120
+ #
121
+ # @return [String] should be the {#input_uri}
122
+ # @raise [DerivativeRodeo::Errors::FileMissingError] when the input_uri does not exist
123
+ def handle_original_file_not_in_derivative_rodeo
124
+ # A quick short-circuit. Don't attempt to copy. Likely already covered by the DerivativeRodeo::Generators::CopyGenerator
125
+ return input_uri if rodeo_conformant_uri_exists?(input_uri)
126
+
127
+ message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist. " \
128
+ "Perhaps there was a problem in SpaceStone downloading the file? " \
129
+ "Regardless, we'll use DerivativeRodeo::Generators::CopyGenerator to ensure #{input_uri.inspect} exists. " \
130
+ "However, we'll almost certainly be generating child pages locally."
131
+ Rails.logger.info(message)
132
+
133
+ # This ensures that we have a copy of the file_set.import_url at the input_uri location;
134
+ # we likely have this.
135
+ DerivativeRodeo::Generators::CopyGenerator.new(
136
+ input_uris: [file_set.import_url],
137
+ output_location_template: input_uri
138
+ ).generated_uris.first
139
+ end
140
+ # private :handle_original_file_not_in_derivative_rodeo
141
+
142
+ def rodeo_conformant_uri_exists?(uri)
143
+ DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri).exist?
144
+ end
145
+ private :rodeo_conformant_uri_exists?
146
+
147
+ ##
148
+ # @return [Array<String>] the paths to each of the images split off from the PDF.
149
+ def split_files
150
+ DerivativeRodeo::Generators::PdfSplitGenerator.new(
151
+ input_uris: [input_uri],
152
+ output_location_template: output_location_template,
153
+ preprocessed_location_template: preprocessed_location_template
154
+ ).generated_files.map(&:file_path)
155
+ rescue => e
156
+ message = "#{self.class}##{__method__} encountered `#{e.class}' “#{e}” for " \
157
+ "input_uri: #{input_uri.inspect}, " \
158
+ "output_location_template: #{output_location_template.inspect}, and " \
159
+ "preprocessed_location_template: #{preprocessed_location_template.inspect}."
160
+ exception = RuntimeError.new(message)
161
+ exception.set_backtrace(e.backtrace)
162
+ raise exception
163
+ end
164
+ end
165
+ end
166
+ end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ module IiifPrint
4
+ module SplitPdfs
5
+ ## Encapsulates logic for cleanup when the PDF is destroyed after pdf splitting into child works
6
+ class DestroyPdfChildWorksService
7
+ ## @api public
8
+ # @param file_set [FileSet] What is the containing file set for the provided file.
9
+ # @param work [Hydra::PCDM::Work] Parent of the fileset being deleted
10
+ def self.conditionally_destroy_spawned_children_of(file_set:, work:, user: nil)
11
+ child_model = work.try(:iiif_print_config)&.pdf_split_child_model
12
+ return unless child_model
13
+ return unless IiifPrint.pdf?(file_set)
14
+
15
+ # NOTE: The IiifPrint::PendingRelationship is an ActiveRecord object; hence we don't need to
16
+ # leverage an adapter.
17
+ IiifPrint::PendingRelationship.where(parent_id: work.id, file_id: file_set.id).find_each(&:destroy)
18
+ IiifPrint.destroy_children_split_from(file_set: file_set, work: work, model: child_model, user: user)
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,19 @@
1
+ module IiifPrint
2
+ module SplitPdfs
3
+ # @abstract
4
+ #
5
+ # The purpose of this class is to split the PDF into constituent jpg files.
6
+ #
7
+ # @see #each
8
+ class PagesToJpgsSplitter < BaseSplitter
9
+ self.image_extension = 'jpg'
10
+ self.quality = '50'
11
+
12
+ private
13
+
14
+ def gsdevice
15
+ 'jpeg'
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,26 @@
1
+ module IiifPrint
2
+ module SplitPdfs
3
+ # @abstract
4
+ #
5
+ # The purpose of this class is to split the PDF into constituent png files.
6
+ #
7
+ # @see #each
8
+ class PagesToPngsSplitter < BaseSplitter
9
+ self.image_extension = 'png'
10
+
11
+ private
12
+
13
+ def gsdevice
14
+ color, _channels, bpc = pdfinfo.color
15
+ device = nil
16
+ # 1 Bit Grayscale, if applicable:
17
+ device = 'pngmonod' if color == 'gray' && bpc == 1
18
+ # 8 Bit Grayscale, if applicable:
19
+ device = 'pnggray' if color == 'gray' && bpc > 1
20
+ # otherwise 24 Bit RGB:
21
+ device = 'png16m' if device.nil?
22
+ device
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,41 @@
1
+ module IiifPrint
2
+ module SplitPdfs
3
+ # The purpose of this class is to split the PDF into constituent TIFF files.
4
+ #
5
+ # @see #each
6
+ class PagesToTiffsSplitter < BaseSplitter
7
+ self.image_extension = 'tiff'
8
+ DEFAULT_COMPRESSION = 'lzw'.freeze
9
+ self.compression = DEFAULT_COMPRESSION
10
+
11
+ private
12
+
13
+ def gsdevice
14
+ color, channels, bpc = pdfinfo.color
15
+ device = nil
16
+ if color == 'gray'
17
+ # CCITT Group 4 Black and White, if applicable:
18
+ if bpc == 1
19
+ device = 'tiffg4'
20
+ self.compression = 'g4'
21
+ elsif bpc > 1
22
+ # 8 Bit Grayscale, if applicable:
23
+ device = 'tiffgray'
24
+ end
25
+ end
26
+
27
+ # otherwise color:
28
+ device = colordevice(channels, bpc) if device.nil?
29
+ device
30
+ end
31
+
32
+ def colordevice(channels, bpc)
33
+ bits = bpc * channels
34
+ # will be either 8bpc/16bpc color TIFF,
35
+ # with any CMYK source transformed to 8bpc RGB
36
+ bits = 24 unless [24, 48].include? bits
37
+ "tiff#{bits}nc"
38
+ end
39
+ end
40
+ end
41
+ end