iiif_print 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (108) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
  4. data/.github/workflows/build-lint-test-action.yaml +4 -5
  5. data/.gitignore +5 -4
  6. data/.rubocop.yml +1 -0
  7. data/.solargraph.yml +19 -0
  8. data/Gemfile.lock +1025 -0
  9. data/README.md +98 -9
  10. data/Rakefile +6 -0
  11. data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
  12. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
  13. data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
  14. data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
  15. data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
  16. data/app/helpers/iiif_print_helper.rb +0 -20
  17. data/app/indexers/concerns/iiif_print/child_indexer.rb +9 -3
  18. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +17 -4
  19. data/app/models/concerns/iiif_print/set_child_flag.rb +9 -0
  20. data/app/models/concerns/iiif_print/solr/document.rb +14 -0
  21. data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
  22. data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
  23. data/app/models/iiif_print/pending_relationship.rb +3 -0
  24. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
  25. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
  26. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +19 -10
  27. data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
  28. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
  29. data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
  30. data/app/services/iiif_print/manifest_builder_service_behavior.rb +88 -31
  31. data/app/services/iiif_print/pluggable_derivative_service.rb +3 -9
  32. data/app/views/catalog/_index_header_list_default.html.erb +13 -0
  33. data/app/views/hyrax/base/_representative_media.html.erb +4 -3
  34. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
  35. data/app/views/hyrax/file_sets/_actions.html.erb +2 -1
  36. data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
  37. data/config/locales/iiif_print.en.yml +4 -0
  38. data/config/routes.rb +3 -0
  39. data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
  40. data/docker-compose.yml +2 -2
  41. data/iiif_print.gemspec +10 -9
  42. data/lib/generators/iiif_print/install_generator.rb +21 -1
  43. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
  44. data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
  45. data/lib/iiif_print/base_derivative_service.rb +2 -1
  46. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +57 -5
  47. data/lib/iiif_print/catalog_search_builder.rb +5 -1
  48. data/lib/iiif_print/configuration.rb +145 -8
  49. data/lib/iiif_print/data/fileset_helper.rb +1 -1
  50. data/lib/iiif_print/data/work_derivatives.rb +3 -3
  51. data/lib/iiif_print/engine.rb +7 -13
  52. data/lib/iiif_print/errors.rb +18 -0
  53. data/lib/iiif_print/homepage_search_builder.rb +17 -0
  54. data/lib/iiif_print/image_tool.rb +12 -8
  55. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +74 -33
  56. data/lib/iiif_print/jobs/create_relationships_job.rb +80 -31
  57. data/lib/iiif_print/jobs/request_split_pdf_job.rb +31 -0
  58. data/lib/iiif_print/lineage_service.rb +29 -8
  59. data/lib/iiif_print/metadata.rb +67 -48
  60. data/lib/iiif_print/split_pdfs/base_splitter.rb +142 -0
  61. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +68 -32
  62. data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
  63. data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +33 -0
  64. data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
  65. data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
  66. data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
  67. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
  68. data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
  69. data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
  70. data/lib/iiif_print/version.rb +1 -1
  71. data/lib/iiif_print.rb +167 -12
  72. data/lib/samvera/derivatives/configuration.rb +83 -0
  73. data/lib/samvera/derivatives/hyrax.rb +129 -0
  74. data/lib/samvera/derivatives.rb +238 -0
  75. data/spec/factories/newspaper_page_solr_document.rb +9 -1
  76. data/spec/fixtures/authorities/licenses.yml +4 -0
  77. data/spec/fixtures/authorities/rights_statements.yml +4 -0
  78. data/spec/iiif_print/base_derivative_service_spec.rb +20 -3
  79. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +11 -3
  80. data/spec/iiif_print/catalog_search_builder_spec.rb +1 -1
  81. data/spec/iiif_print/configuration_spec.rb +141 -15
  82. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +7 -2
  83. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +110 -9
  84. data/spec/iiif_print/lineage_service_spec.rb +1 -1
  85. data/spec/iiif_print/metadata_spec.rb +157 -23
  86. data/spec/iiif_print/split_pdfs/base_splitter_spec.rb +27 -0
  87. data/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb +80 -0
  88. data/spec/iiif_print/split_pdfs/destroy_pdf_child_works_service_spec.rb +92 -0
  89. data/spec/iiif_print/split_pdfs/pages_to_jpgs_splitter_spec.rb +22 -0
  90. data/spec/iiif_print/split_pdfs/pages_to_pngs_splitter_spec.rb +18 -0
  91. data/spec/iiif_print/split_pdfs/pages_to_tiffs_splitter_spec.rb +19 -0
  92. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +2 -2
  93. data/spec/iiif_print_spec.rb +125 -5
  94. data/spec/models/iiif_print/iiif_search_decorator_spec.rb +27 -0
  95. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +51 -0
  96. data/spec/samvera/derivatives/configuration_spec.rb +41 -0
  97. data/spec/samvera/derivatives/hyrax_spec.rb +62 -0
  98. data/spec/samvera/derivatives_spec.rb +54 -0
  99. data/spec/services/iiif_print/derivative_rodeo_service_spec.rb +103 -0
  100. data/spec/services/iiif_print/manifest_builder_service_behavior_spec.rb +20 -0
  101. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +8 -11
  102. data/spec/test_app_templates/lib/generators/test_app_generator.rb +1 -1
  103. data/tasks/copy_authorities_to_test_app.rake +11 -0
  104. data/tasks/iiif_print_dev.rake +4 -4
  105. metadata +123 -35
  106. data/app/helpers/hyrax/iiif_helper.rb +0 -22
  107. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
  108. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
@@ -1,20 +1,76 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # Encapsulates methods used for pdf splitting into child works
4
3
  module IiifPrint
5
4
  module SplitPdfs
5
+ ##
6
+ # Encapsulates methods used for pdf splitting into child works.
7
+ #
8
+ # The primary point of entry is {.conditionally_enqueue}.
6
9
  class ChildWorkCreationFromPdfService
10
+ ##
11
+ # Responsible for conditionally enqueueing the PDF splitting job. The conditions attempt to
12
+ # sniff out whether the given file was a PDF.
13
+ #
14
+ # @param file_set [FileSet] What is the containing file set for the provided file.
15
+ # @param file [#path, #id]
16
+ # @param user [User] Who did the upload?
17
+ # @param import_url [NilClass, String] Provided when we're dealing with a file provided via a
18
+ # URL.
19
+ # @param work [Hydra::PCDM::Work] An optional parameter that saves us a bit of time in not
20
+ # needing to query for the parent of the given :file_set (see {.parent_for})
21
+ #
22
+ # @return [Symbol] when we don't enqueue the job
23
+ # @return [Symbol] :enqueued when we actually enqueue the underlying job.
24
+ # rubocop:disable Metrics/MethodLength
25
+ def self.conditionally_enqueue(file_set:, file:, user:, import_url: nil, work: nil)
26
+ work ||= IiifPrint.parent_for(file_set)
27
+
28
+ return :no_split_for_parent unless iiif_print_split?(work: work)
29
+ return :no_pdfs_to_split_for_import_url if import_url && !pdfs?(paths: [import_url])
30
+
31
+ file_locations = if import_url
32
+ [Hyrax::WorkingDirectory.find_or_retrieve(file.id, file_set.id)]
33
+ else
34
+ pdf_paths(files: [file.try(:id)&.to_s].compact)
35
+ end
36
+ return :no_pdfs_to_split if file_locations.empty?
37
+
38
+ IiifPrint.conditionally_submit_split_for(work: work, file_set: file_set, locations: file_locations, user: user)
39
+ :enqueued
40
+ end
41
+ # rubocop:enable Metrics/MethodLength
42
+
43
+ ##
44
+ # @api private
45
+ #
46
+ # Are there any PDF files?
47
+ # @param [Array > String] paths to PDFs
48
+ # @return [Boolean]
49
+ def self.pdfs?(paths:)
50
+ pdf_paths = pdfs_only_for(paths)
51
+ return false unless pdf_paths.count.positive?
52
+ true
53
+ end
54
+
55
+ ##
56
+ # @api private
7
57
  # Load an array of paths to pdf files
8
58
  # @param [Array > Hyrax::Upload file ids]
9
59
  # @return [Array > String] file paths to temp directory
10
60
  def self.pdf_paths(files:)
61
+ return [] if files.all?(&:empty?) # assumes an array
62
+
11
63
  upload_ids = filter_file_ids(files)
12
64
  return [] if upload_ids.empty?
65
+
13
66
  uploads = Hyrax::UploadedFile.find(upload_ids)
14
67
  paths = uploads.map(&method(:upload_path))
15
68
  pdfs_only_for(paths)
16
69
  end
17
70
 
71
+ ##
72
+ # @api private
73
+ #
18
74
  # Is child work splitting defined for model?
19
75
  # @param [GenericWork, etc] A valid type of hyrax work
20
76
  # @return [Boolean]
@@ -24,51 +80,31 @@ module IiifPrint
24
80
  false
25
81
  end
26
82
 
27
- # Are there any PDF files?
28
- # @param [Array > String] paths to PDFs
29
- # @return [Boolean]
30
- def self.pdfs?(paths:)
31
- pdf_paths = pdfs_only_for(paths)
32
- return false unless pdf_paths.count.positive?
33
- true
34
- end
35
-
36
- # Submit the job to split PDF into child works
37
- # @param [GenericWork, etc] A valid type of hyrax work
38
- # @param [Array<String>] paths to PDF attachments
39
- # @param [User] user
40
- # @param [Integer] number of pdfs already on existing work's filesets (not yet implemented)
41
- def self.queue_job(work:, file_locations:, user:, admin_set_id:)
42
- work.iiif_print_config.pdf_splitter_job.perform_later(
43
- work,
44
- file_locations,
45
- user,
46
- admin_set_id,
47
- count_existing_pdfs(work)
48
- )
49
- end
50
-
83
+ ##
84
+ # @api private
51
85
  def self.filter_file_ids(input)
52
86
  Array.wrap(input).select(&:present?)
53
87
  end
54
88
 
89
+ ##
90
+ # @api private
91
+ #
55
92
  # Given Hyrax::Upload object, return path to file on local filesystem
56
93
  def self.upload_path(upload)
57
94
  # so many layers to this onion:
95
+ # TODO: Write a recursive function to keep calling file until
96
+ # the file doesn't respond to file then return that file.
58
97
  upload.file.file.file
59
98
  end
60
99
 
61
- # TODO: implement a method to count existing PDFs on a work to support
62
- # adding more PDFs to an existing work.
63
- def self.count_existing_pdfs(_work)
64
- 0
65
- end
66
-
100
+ ##
101
+ # @api private
102
+ #
67
103
  # TODO: Consider other methods to identify a PDF file.
68
104
  # This sub-selection may need to be moved to use mimetype if there
69
105
  # is a need to support paths not ending in .pdf (i.e. remote_urls)
70
106
  def self.pdfs_only_for(paths)
71
- paths.select { |path| path.end_with?('.pdf', '.PDF') }
107
+ paths.select { |path| IiifPrint.split_for_path_suffix?(path) }
72
108
  end
73
109
  end
74
110
  end
@@ -0,0 +1,166 @@
1
+ module IiifPrint
2
+ module SplitPdfs
3
+ ##
4
+ # This class wraps the DerivativeRodeo::Generators::PdfSplitGenerator to find preprocessed
5
+ # images, or split a PDF if there are no preprocessed images.
6
+ #
7
+ # We have already attached the original file to the file_set. We want to convert that original
8
+ # file that's attached to a input_uri (e.g. "file://path/to/original-file" as in what we have
9
+ # written to Fedora as the PDF)
10
+ #
11
+ # @see .call
12
+ class DerivativeRodeoSplitter
13
+ ##
14
+ # @param filename [String] the local path to the PDF file
15
+ # @param file_set [FileSet] file set containing the PDF file to split
16
+ #
17
+ # @return [Array<String>] paths to images split from each page of PDF file
18
+ #
19
+ # @see IiifPrint::SplitPdfs::BaseSplitter
20
+ def self.call(filename, file_set:)
21
+ new(filename, file_set: file_set).split_files
22
+ end
23
+
24
+ ##
25
+ # @param filename [String] path to the original file. Note that we use {#filename} to
26
+ # derive {#input_uri}
27
+ # @param file_set [FileSet] the container for the original file and its derivatives.
28
+ #
29
+ # @param output_tmp_dir [String] where we will be writing things. In using `Dir.mktmpdir`
30
+ # we're creating a subdirectory on `Dir.tmpdir`
31
+ def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
32
+ @filename = filename
33
+ @file_set = file_set
34
+
35
+ @input_uri = "file://#{filename}"
36
+
37
+ # We are writing the images to a local location that CarrierWave can upload. This is a
38
+ # local file, internal to IiifPrint; it looks like SpaceStone/DerivativeRodeo lingo, but
39
+ # that's just a convenience.
40
+ output_template_path = File.join(output_tmp_dir, '{{ dir_parts[-1..-1] }}', '{{ filename }}')
41
+
42
+ @output_location_template = "file://#{output_template_path}"
43
+ end
44
+
45
+ attr_reader :filename, :file_set
46
+
47
+ ##
48
+ # This is where, in "Fedora" we have the original file. This is not the original file in the
49
+ # pre-processing location but instead the long-term location of the file in the application
50
+ # that mounts IIIF Print.
51
+ #
52
+ # @return [String]
53
+ attr_reader :input_uri
54
+
55
+ ##
56
+ # This is the location where we're going to write the derivatives that will "go into Fedora";
57
+ # it is a local location, one that IIIF Print's mounting application can directly do
58
+ # "File.read"
59
+ #
60
+ # @return [String]
61
+ attr_reader :output_location_template
62
+
63
+ ##
64
+ # Where can we find the file that represents the pre-processing template. In this case, the
65
+ # original PDF file.
66
+ #
67
+ # The logic handles a case where SpaceStone successfully fetched the file to then perform
68
+ # processing.
69
+ #
70
+ # For example, SpaceStone::Serverless will pre-process derivatives and write them into an S3
71
+ # bucket that we then use for IIIF Print.
72
+ #
73
+ # @note The preprocessed_location_template should end in `.pdf`. The
74
+ # DerivativeRodeo::BaseGenerator::PdfSplitGenerator#derive_preprocessed_template_from
75
+ # will coerce the template into one that represents the split pages.
76
+ #
77
+ # @return [String]
78
+ #
79
+ # @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63
80
+ # rubocop:disable Metrics/MethodLength
81
+ # rubocop:disable Metrics/AbcSize
82
+ def preprocessed_location_template
83
+ return @preprocessed_location_template if defined?(@preprocessed_location_template)
84
+
85
+ derivative_rodeo_candidate = IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename)
86
+
87
+ @preprocessed_location_template =
88
+ if derivative_rodeo_candidate.blank?
89
+ message = "#{self.class}##{__method__} could not establish derivative_rodeo_candidate for " \
90
+ "#{file_set.class} ID=#{file_set&.id} #to_param=#{file_set&.to_param} with filename #{filename.inspect}. " \
91
+ "Move along little buddy."
92
+ Rails.logger.debug(message)
93
+ nil
94
+ elsif rodeo_conformant_uri_exists?(derivative_rodeo_candidate)
95
+ Rails.logger.debug("#{self.class}##{__method__} found existing file at location #{derivative_rodeo_candidate}. High five partner!")
96
+ derivative_rodeo_candidate
97
+ elsif file_set.import_url
98
+ message = "#{self.class}##{__method__} did not find #{derivative_rodeo_candidate.inspect} to exist. " \
99
+ "Moving on to check the #{file_set.class}#import_url of #{file_set.import_url.inspect}"
100
+ Rails.logger.warn(message)
101
+ handle_original_file_not_in_derivative_rodeo
102
+ else
103
+ message = "#{self.class}##{__method__} could not find an existing file at #{derivative_rodeo_candidate} " \
104
+ "nor a remote_url for #{file_set.class} ID=#{file_set.id} #to_param=#{file_set&.to_param}. " \
105
+ "Returning `nil' as we have no possible preprocess. " \
106
+ "Maybe the input_uri #{input_uri.inspect} will be adequate."
107
+ Rails.logger.warn(message)
108
+ nil
109
+ end
110
+ end
111
+ # rubocop:enable Metrics/AbcSize
112
+ # rubocop:enable Metrics/MethodLength
113
+
114
+ ##
115
+ # @api private
116
+ #
117
+ # When the file does not exist in the pre-processed location (e.g. "SpaceStone") we need to
118
+ # ensure that we have something locally. We copy the {FileSet#import_url} to the {#input_uri}
119
+ # location.
120
+ #
121
+ # @return [String] should be the {#input_uri}
122
+ # @raise [DerivativeRodeo::Errors::FileMissingError] when the input_uri does not exist
123
+ def handle_original_file_not_in_derivative_rodeo
124
+ # A quick short-circuit. Don't attempt to copy. Likely already covered by the DerivativeRodeo::Generators::CopyGenerator
125
+ return input_uri if rodeo_conformant_uri_exists?(input_uri)
126
+
127
+ message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist. " \
128
+ "Perhaps there was a problem in SpaceStone downloading the file? " \
129
+ "Regardless, we'll use DerivativeRodeo::Generators::CopyGenerator to ensure #{input_uri.inspect} exists. " \
130
+ "However, we'll almost certainly be generating child pages locally."
131
+ Rails.logger.info(message)
132
+
133
+ # This ensures that we have a copy of the file_set.import_url at the input_uri location;
134
+ # we likely have this.
135
+ DerivativeRodeo::Generators::CopyGenerator.new(
136
+ input_uris: [file_set.import_url],
137
+ output_location_template: input_uri
138
+ ).generated_uris.first
139
+ end
140
+ # private :handle_original_file_not_in_derivative_rodeo
141
+
142
+ def rodeo_conformant_uri_exists?(uri)
143
+ DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri).exist?
144
+ end
145
+ private :rodeo_conformant_uri_exists?
146
+
147
+ ##
148
+ # @return [Array<String>] the paths to each of the images split off from the PDF.
149
+ def split_files
150
+ DerivativeRodeo::Generators::PdfSplitGenerator.new(
151
+ input_uris: [input_uri],
152
+ output_location_template: output_location_template,
153
+ preprocessed_location_template: preprocessed_location_template
154
+ ).generated_files.map(&:file_path)
155
+ rescue => e
156
+ message = "#{self.class}##{__method__} encountered `#{e.class}' “#{e}” for " \
157
+ "input_uri: #{input_uri.inspect}, " \
158
+ "output_location_template: #{output_location_template.inspect}, and " \
159
+ "preprocessed_location_template: #{preprocessed_location_template.inspect}."
160
+ exception = RuntimeError.new(message)
161
+ exception.set_backtrace(e.backtrace)
162
+ raise exception
163
+ end
164
+ end
165
+ end
166
+ end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ module IiifPrint
4
+ module SplitPdfs
5
+ ## Encapsulates logic for cleanup when the PDF is destroyed after pdf splitting into child works
6
+ class DestroyPdfChildWorksService
7
+ ## @api public
8
+ # @param file_set [FileSet] What is the containing file set for the provided file.
9
+ # @param work [Hydra::PCDM::Work] Parent of the fileset being deleted
10
+ def self.conditionally_destroy_spawned_children_of(file_set:, work:)
11
+ child_model = work.try(:iiif_print_config)&.pdf_split_child_model
12
+ return unless child_model
13
+ return unless file_set.class.pdf_mime_types.include?(file_set.mime_type)
14
+
15
+ IiifPrint::PendingRelationship.where(parent_id: work.id, file_id: file_set.id).find_each(&:destroy)
16
+ destroy_spawned_children(model: child_model, file_set: file_set, work: work)
17
+ end
18
+
19
+ private_class_method def self.destroy_spawned_children(model:, file_set:, work:)
20
+ # look first for children by the file set id they were split from
21
+ children = model.where(split_from_pdf_id: file_set.id)
22
+ if children.blank?
23
+ # find works where file name and work `to_param` are both in the title
24
+ children = model.where(title: file_set.label).where(title: work.to_param)
25
+ end
26
+ return if children.blank?
27
+ children.each do |rcd|
28
+ rcd.destroy(eradicate: true)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,19 @@
1
+ module IiifPrint
2
+ module SplitPdfs
3
+ # @abstract
4
+ #
5
+ # The purpose of this class is to split the PDF into constituent jpg files.
6
+ #
7
+ # @see #each
8
+ class PagesToJpgsSplitter < BaseSplitter
9
+ self.image_extension = 'jpg'
10
+ self.quality = '50'
11
+
12
+ private
13
+
14
+ def gsdevice
15
+ 'jpeg'
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,26 @@
1
+ module IiifPrint
2
+ module SplitPdfs
3
+ # @abstract
4
+ #
5
+ # The purpose of this class is to split the PDF into constituent png files.
6
+ #
7
+ # @see #each
8
+ class PagesToPngsSplitter < BaseSplitter
9
+ self.image_extension = 'png'
10
+
11
+ private
12
+
13
+ def gsdevice
14
+ color, _channels, bpc = pdfinfo.color
15
+ device = nil
16
+ # 1 Bit Grayscale, if applicable:
17
+ device = 'pngmonod' if color == 'gray' && bpc == 1
18
+ # 8 Bit Grayscale, if applicable:
19
+ device = 'pnggray' if color == 'gray' && bpc > 1
20
+ # otherwise 24 Bit RGB:
21
+ device = 'png16m' if device.nil?
22
+ device
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,41 @@
1
+ module IiifPrint
2
+ module SplitPdfs
3
+ # The purpose of this class is to split the PDF into constituent TIFF files.
4
+ #
5
+ # @see #each
6
+ class PagesToTiffsSplitter < BaseSplitter
7
+ self.image_extension = 'tiff'
8
+ DEFAULT_COMPRESSION = 'lzw'.freeze
9
+ self.compression = DEFAULT_COMPRESSION
10
+
11
+ private
12
+
13
+ def gsdevice
14
+ color, channels, bpc = pdfinfo.color
15
+ device = nil
16
+ if color == 'gray'
17
+ # CCITT Group 4 Black and White, if applicable:
18
+ if bpc == 1
19
+ device = 'tiffg4'
20
+ self.compression = 'g4'
21
+ elsif bpc > 1
22
+ # 8 Bit Grayscale, if applicable:
23
+ device = 'tiffgray'
24
+ end
25
+ end
26
+
27
+ # otherwise color:
28
+ device = colordevice(channels, bpc) if device.nil?
29
+ device
30
+ end
31
+
32
+ def colordevice(channels, bpc)
33
+ bits = bpc * channels
34
+ # will be either 8bpc/16bpc color TIFF,
35
+ # with any CMYK source transformed to 8bpc RBG
36
+ bits = 24 unless [24, 48].include? bits
37
+ "tiff#{bits}nc"
38
+ end
39
+ end
40
+ end
41
+ end
@@ -8,78 +8,83 @@ module IiifPrint
8
8
  # For dpi extraction, falls back to calculating using MiniMagick,
9
9
  # if necessary.
10
10
  class PdfImageExtractionService
11
- # class constant column numbers
12
- COL_WIDTH = 3
13
- COL_HEIGHT = 4
14
- COL_COLOR = 5
15
- COL_CHANNELS = 6
16
- COL_BITS = 7
17
- # only poppler 0.25+ has this column in output:
18
- COL_XPPI = 12
19
-
20
11
  def initialize(path)
21
12
  @path = path
22
- @cmd = format('pdfimages -list %<path>s', path: path)
23
- @output = nil
24
- @entries = nil
13
+ process(command: format('pdfimages -list %<path>s 2>/dev/null', path: path))
25
14
  end
26
15
 
27
- def process
28
- # call just once
29
- if @output.nil?
30
- Open3.popen3(@cmd) do |_stdin, stdout, _stderr, _wait_thr|
31
- @output = stdout.read.split("\n")
32
- end
33
- end
34
- @output.slice(2, @output.size - 1)
35
- end
16
+ attr_reader :path, :page_count, :width, :height, :pixels_per_inch
17
+ alias ppi pixels_per_inch
36
18
 
37
- def entries
38
- if @entries.nil?
39
- @entries = []
40
- output = process
41
- (0..output.size - 1).each do |i|
42
- @entries.push(output[i].gsub(/\s+/m, ' ').strip.split(" "))
43
- end
44
- end
45
- @entries
19
+ # @return [Array<String, Integer, Integer>]
20
+ def color
21
+ [@color_description, @channels, @bits]
46
22
  end
47
23
 
48
- def selectcolumn(i, &block)
49
- result = entries.map { |e| e[i] }
50
- return result.map!(&block) if block_given?
51
- result
52
- end
24
+ private
53
25
 
54
- def width
55
- selectcolumn(COL_WIDTH, &:to_i).max
56
- end
26
+ # class constant column numbers
27
+ COL_WIDTH = 3
28
+ COL_HEIGHT = 4
29
+ COL_COLOR_DESC = 5
30
+ COL_CHANNELS = 6
31
+ COL_BITS = 7
32
+ # only poppler 0.25+ has this column in output:
33
+ COL_XPPI = 12
57
34
 
58
- def height
59
- selectcolumn(COL_HEIGHT, &:to_i).max
60
- end
35
+ # rubocop:disable Metrics/AbcSize - Because this helps us process the results in one loop.
36
+ # rubocop:disable Metrics/MethodLength - Again, to help speed up the processing loop.
37
+ # rubocop:disable Metrics/CyclomaticComplexity
38
+ # rubocop:disable Metrics/PerceivedComplexity
39
+ #
40
+ # The first two lines are tabular header information:
41
+ #
42
+ # Example:
43
+ #
44
+ # bash-5.1$ pdfimages -list fmc_color.pdf | head -5
45
+ # page num type width height color comp bpc enc interp object ID x-ppi y-ppi size ratio
46
+ # --------------------------------------------------------------------------------------------
47
+ # 1 0 image 2475 413 rgb 3 8 jpeg no 10 0 300 300 21.8K 0.7%
48
+ def process(command:)
49
+ @page_count = 0
50
+ @color_description = 'gray'
51
+ @width = 0
52
+ @height = 0
53
+ @channels = 0
54
+ @bits = 0
55
+ @pixels_per_inch = 0
56
+ Open3.popen3(command) do |_stdin, stdout, _stderr, _wait_thr|
57
+ stdout.read.split("\n").each_with_index do |line, index|
58
+ # Skip the two header lines
59
+ next if index <= 1
60
+ @page_count += 1
61
+ cells = line.gsub(/\s+/m, ' ').strip.split(" ")
61
62
 
62
- def color
63
- # desc is either 'gray', 'cmyk', 'rgb', but 1-bit gray is black/white
64
- # so caller may want all of this information, and in case of
65
- # mixed color spaces across images, this returns maximum
66
- desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
67
- channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
68
- bits = entries.map { |e| e[COL_BITS].to_i }.max
69
- [desc, channels, bits]
70
- end
63
+ @color_description = 'rgb' if cells[COL_COLOR_DESC] != 'gray'
64
+ @width = cells[COL_WIDTH].to_i if cells[COL_WIDTH].to_i > @width
65
+ @height = cells[COL_HEIGHT].to_i if cells[COL_HEIGHT].to_i > @height
66
+ @channels = cells[COL_CHANNELS].to_i if cells[COL_CHANNELS].to_i > @channels
67
+ @bits = cells[COL_BITS].to_i if cells[COL_BITS].to_i > @bits
71
68
 
72
- def ppi
73
- if entries[0].size <= 12
74
- # poppler < 0.25
75
- pdf = MiniMagick::Image.open(@path)
76
- width_points = pdf.width
77
- width_px = width
78
- return (72 * width_px / width_points).to_i
69
+ # In the case of poppler version < 0.25, we will have no more than 12 columns. As such,
70
+ # we need to do some alternative magic to calculate this.
71
+ if @page_count == 1 && cells.size <= 12
72
+ pdf = MiniMagick::Image.open(@path)
73
+ width_points = pdf.width
74
+ width_px = width
75
+ @pixels_per_inch = (72 * width_px / width_points).to_i
76
+ elsif cells[COL_XPPI].to_i > @pixels_per_inch
77
+ # By the magic of nil#to_i if we don't have more than 12 columns, we've already set
78
+ # the @pixels_per_inch and this line won't do much of anything.
79
+ @pixels_per_inch = cells[COL_XPPI].to_i
80
+ end
81
+ end
79
82
  end
80
- # with poppler 0.25+, pdfimages just gives us this:
81
- selectcolumn(COL_XPPI, &:to_i).max
82
83
  end
84
+ # rubocop:enable Metrics/AbcSize
85
+ # rubocop:enable Metrics/MethodLength
86
+ # rubocop:enable Metrics/CyclomaticComplexity
87
+ # rubocop:enable Metrics/PerceivedComplexity
83
88
  end
84
89
  end
85
90
  end
@@ -84,6 +84,7 @@ module IiifPrint
84
84
  # add trailing space to plaintext buffer for between words:
85
85
  @text += ' '
86
86
  @words.push(@current) if word_complete?
87
+ @current = nil # clear the current word
87
88
  end
88
89
 
89
90
  def end_line
@@ -120,9 +121,12 @@ module IiifPrint
120
121
  # for current word, and append line endings to plain text:
121
122
  #
122
123
  # @param name [String] element name.
123
- def end_element(_name)
124
- end_line if @element_class_name == 'ocr_line'
125
- end_word if @element_class_name == 'ocrx_word'
124
+ def end_element(name)
125
+ if name == 'span'
126
+ end_word if @element_class_name == 'ocrx_word'
127
+ @text += "\n" if @element_class_name.nil?
128
+ end
129
+ @element_class_name = nil
126
130
  end
127
131
 
128
132
  # Callback for completion of parsing hOCR, used to normalize generated
@@ -9,7 +9,7 @@ module IiifPrint
9
9
  class PageOCR
10
10
  attr_accessor :html, :path
11
11
 
12
- def initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options)
12
+ def initialize(path, additional_tesseract_options: IiifPrint.config.additional_tesseract_options)
13
13
  @path = path
14
14
  # hOCR html:
15
15
  @html = nil
@@ -17,13 +17,14 @@ module IiifPrint
17
17
  @source_meta = nil
18
18
  @box = nil
19
19
  @plain = nil
20
- @additional_tessearct_options = additional_tessearct_options
20
+ @additional_tesseract_options = additional_tesseract_options
21
21
  end
22
22
 
23
23
  def run_ocr
24
24
  outfile = File.join(Dir.mktmpdir, 'output_html')
25
- cmd = "tesseract #{path} #{outfile} hocr"
26
- cmd += " #{@additional_tessearct_options}" if @additional_tessearct_options.present?
25
+ cmd = "OMP_THREAD_LIMIT=1 tesseract #{path} #{outfile}"
26
+ cmd += " #{@additional_tesseract_options}" if @additional_tesseract_options.present?
27
+ cmd += " hocr"
27
28
  `#{cmd}`
28
29
  outfile + '.hocr'
29
30
  end
@@ -1,3 +1,3 @@
1
1
  module IiifPrint
2
- VERSION = '1.0.0'.freeze
2
+ VERSION = '1.1.0'.freeze
3
3
  end