iiif_print 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
- data/.github/workflows/build-lint-test-action.yaml +4 -5
- data/.gitignore +5 -4
- data/.rubocop.yml +1 -0
- data/.solargraph.yml +19 -0
- data/Gemfile.lock +1025 -0
- data/README.md +98 -9
- data/Rakefile +6 -0
- data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
- data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
- data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
- data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
- data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
- data/app/helpers/iiif_print_helper.rb +0 -20
- data/app/indexers/concerns/iiif_print/child_indexer.rb +9 -3
- data/app/indexers/concerns/iiif_print/file_set_indexer.rb +17 -4
- data/app/models/concerns/iiif_print/set_child_flag.rb +9 -0
- data/app/models/concerns/iiif_print/solr/document.rb +14 -0
- data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
- data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
- data/app/models/iiif_print/pending_relationship.rb +3 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
- data/app/presenters/iiif_print/work_show_presenter_decorator.rb +19 -10
- data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
- data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
- data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
- data/app/services/iiif_print/manifest_builder_service_behavior.rb +88 -31
- data/app/services/iiif_print/pluggable_derivative_service.rb +3 -9
- data/app/views/catalog/_index_header_list_default.html.erb +13 -0
- data/app/views/hyrax/base/_representative_media.html.erb +4 -3
- data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
- data/app/views/hyrax/file_sets/_actions.html.erb +2 -1
- data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
- data/config/locales/iiif_print.en.yml +4 -0
- data/config/routes.rb +3 -0
- data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
- data/docker-compose.yml +2 -2
- data/iiif_print.gemspec +10 -9
- data/lib/generators/iiif_print/install_generator.rb +21 -1
- data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
- data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
- data/lib/iiif_print/base_derivative_service.rb +2 -1
- data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +57 -5
- data/lib/iiif_print/catalog_search_builder.rb +5 -1
- data/lib/iiif_print/configuration.rb +145 -8
- data/lib/iiif_print/data/fileset_helper.rb +1 -1
- data/lib/iiif_print/data/work_derivatives.rb +3 -3
- data/lib/iiif_print/engine.rb +7 -13
- data/lib/iiif_print/errors.rb +18 -0
- data/lib/iiif_print/homepage_search_builder.rb +17 -0
- data/lib/iiif_print/image_tool.rb +12 -8
- data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +74 -33
- data/lib/iiif_print/jobs/create_relationships_job.rb +80 -31
- data/lib/iiif_print/jobs/request_split_pdf_job.rb +31 -0
- data/lib/iiif_print/lineage_service.rb +29 -8
- data/lib/iiif_print/metadata.rb +67 -48
- data/lib/iiif_print/split_pdfs/base_splitter.rb +142 -0
- data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +68 -32
- data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
- data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +33 -0
- data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
- data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
- data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
- data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
- data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
- data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
- data/lib/iiif_print/version.rb +1 -1
- data/lib/iiif_print.rb +167 -12
- data/lib/samvera/derivatives/configuration.rb +83 -0
- data/lib/samvera/derivatives/hyrax.rb +129 -0
- data/lib/samvera/derivatives.rb +238 -0
- data/spec/factories/newspaper_page_solr_document.rb +9 -1
- data/spec/fixtures/authorities/licenses.yml +4 -0
- data/spec/fixtures/authorities/rights_statements.yml +4 -0
- data/spec/iiif_print/base_derivative_service_spec.rb +20 -3
- data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +11 -3
- data/spec/iiif_print/catalog_search_builder_spec.rb +1 -1
- data/spec/iiif_print/configuration_spec.rb +141 -15
- data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +7 -2
- data/spec/iiif_print/jobs/create_relationships_job_spec.rb +110 -9
- data/spec/iiif_print/lineage_service_spec.rb +1 -1
- data/spec/iiif_print/metadata_spec.rb +157 -23
- data/spec/iiif_print/split_pdfs/base_splitter_spec.rb +27 -0
- data/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb +80 -0
- data/spec/iiif_print/split_pdfs/destroy_pdf_child_works_service_spec.rb +92 -0
- data/spec/iiif_print/split_pdfs/pages_to_jpgs_splitter_spec.rb +22 -0
- data/spec/iiif_print/split_pdfs/pages_to_pngs_splitter_spec.rb +18 -0
- data/spec/iiif_print/split_pdfs/pages_to_tiffs_splitter_spec.rb +19 -0
- data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +2 -2
- data/spec/iiif_print_spec.rb +125 -5
- data/spec/models/iiif_print/iiif_search_decorator_spec.rb +27 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +51 -0
- data/spec/samvera/derivatives/configuration_spec.rb +41 -0
- data/spec/samvera/derivatives/hyrax_spec.rb +62 -0
- data/spec/samvera/derivatives_spec.rb +54 -0
- data/spec/services/iiif_print/derivative_rodeo_service_spec.rb +103 -0
- data/spec/services/iiif_print/manifest_builder_service_behavior_spec.rb +20 -0
- data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +8 -11
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +1 -1
- data/tasks/copy_authorities_to_test_app.rake +11 -0
- data/tasks/iiif_print_dev.rake +4 -4
- metadata +123 -35
- data/app/helpers/hyrax/iiif_helper.rb +0 -22
- data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
- data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
@@ -1,20 +1,76 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
# Encapsulates methods used for pdf splitting into child works
|
4
3
|
module IiifPrint
|
5
4
|
module SplitPdfs
|
5
|
+
##
|
6
|
+
# Encapsulates methods used for pdf splitting into child works.
|
7
|
+
#
|
8
|
+
# The primary point of entry is {.conditionally_enqueue}.
|
6
9
|
class ChildWorkCreationFromPdfService
|
10
|
+
##
|
11
|
+
# Responsible for conditionally enqueueing the PDF splitting job. The conditions attempt to
|
12
|
+
# sniff out whether the given file was a PDF.
|
13
|
+
#
|
14
|
+
# @param file_set [FileSet] What is the containing file set for the provided file.
|
15
|
+
# @param file [#path, #id]
|
16
|
+
# @param user [User] Who did the upload?
|
17
|
+
# @param import_url [NilClass, String] Provided when we're dealing with a file provided via a
|
18
|
+
# URL.
|
19
|
+
# @param work [Hydra::PCDM::Work] An optional parameter that saves us a bit of time in not
|
20
|
+
# needing to query for the parent of the given :file_set (see {.parent_for})
|
21
|
+
#
|
22
|
+
# @return [Symbol] when we don't enqueue the job
|
23
|
+
# @return [Symbol] :enqueued when we actually enqueue the underlying job.
|
24
|
+
# rubocop:disable Metrics/MethodLength
|
25
|
+
def self.conditionally_enqueue(file_set:, file:, user:, import_url: nil, work: nil)
  # Callers may hand us the parent work; otherwise look it up from the file set.
  parent = work || IiifPrint.parent_for(file_set)
  return :no_split_for_parent unless iiif_print_split?(work: parent)

  locations =
    if import_url
      # A URL-provided file is only split when the URL itself looks like a PDF.
      return :no_pdfs_to_split_for_import_url unless pdfs?(paths: [import_url])

      [Hyrax::WorkingDirectory.find_or_retrieve(file.id, file_set.id)]
    else
      # `file` may not respond to #id; in that case there is nothing to look up.
      upload_id = file.try(:id)&.to_s
      pdf_paths(files: [upload_id].compact)
    end
  return :no_pdfs_to_split if locations.empty?

  IiifPrint.conditionally_submit_split_for(work: parent, file_set: file_set, locations: locations, user: user)
  :enqueued
end
|
41
|
+
# rubocop:enable Metrics/MethodLength
|
42
|
+
|
43
|
+
##
|
44
|
+
# @api private
|
45
|
+
#
|
46
|
+
# Are there any PDF files?
|
47
|
+
# @param paths [Array<String>] candidate paths (or URLs) that may point at PDFs
|
48
|
+
# @return [Boolean]
|
49
|
+
# Are any of the given paths PDFs (as decided by {.pdfs_only_for})?
#
# @param paths [Array<String>] candidate paths
# @return [Boolean] true when at least one path is selected by {.pdfs_only_for}
def self.pdfs?(paths:)
  # `empty?` (rather than `any?`) keeps the original count-based semantics and avoids a
  # local variable that shadowed the sibling class method `pdf_paths`.
  !pdfs_only_for(paths).empty?
end
|
54
|
+
|
55
|
+
##
|
56
|
+
# @api private
|
7
57
|
# Load an array of paths to pdf files
|
8
58
|
# @param [Array > Hyrax::Upload file ids]
|
9
59
|
# @return [Array > String] file paths to temp directory
|
10
60
|
# Resolve uploaded-file ids to local paths, keeping only the PDFs.
#
# @param files [Array<String>, String, nil] Hyrax::UploadedFile ids; blank entries are ignored
# @return [Array<String>] paths selected by {.pdfs_only_for}; empty when no usable ids are given
def self.pdf_paths(files:)
  # filter_file_ids already drops blank entries, so no separate "all empty" guard is needed.
  # The previous `files.all?(&:empty?)` guard raised NoMethodError for nil or non-string
  # entries and for non-array input.
  upload_ids = filter_file_ids(files)
  return [] if upload_ids.empty?

  uploads = Hyrax::UploadedFile.find(upload_ids)
  pdfs_only_for(uploads.map { |upload| upload_path(upload) })
end
|
17
70
|
|
71
|
+
##
|
72
|
+
# @api private
|
73
|
+
#
|
18
74
|
# Is child work splitting defined for model?
|
19
75
|
# @param [GenericWork, etc] A valid type of hyrax work
|
20
76
|
# @return [Boolean]
|
@@ -24,51 +80,31 @@ module IiifPrint
|
|
24
80
|
false
|
25
81
|
end
|
26
82
|
|
27
|
-
|
28
|
-
# @
|
29
|
-
# @return [Boolean]
|
30
|
-
def self.pdfs?(paths:)
|
31
|
-
pdf_paths = pdfs_only_for(paths)
|
32
|
-
return false unless pdf_paths.count.positive?
|
33
|
-
true
|
34
|
-
end
|
35
|
-
|
36
|
-
# Submit the job to split PDF into child works
|
37
|
-
# @param [GenericWork, etc] A valid type of hyrax work
|
38
|
-
# @param [Array<String>] paths to PDF attachments
|
39
|
-
# @param [User] user
|
40
|
-
# @param [Integer] number of pdfs already on existing work's filesets (not yet implemented)
|
41
|
-
def self.queue_job(work:, file_locations:, user:, admin_set_id:)
|
42
|
-
work.iiif_print_config.pdf_splitter_job.perform_later(
|
43
|
-
work,
|
44
|
-
file_locations,
|
45
|
-
user,
|
46
|
-
admin_set_id,
|
47
|
-
count_existing_pdfs(work)
|
48
|
-
)
|
49
|
-
end
|
50
|
-
|
83
|
+
##
|
84
|
+
# @api private
|
51
85
|
# Normalise the input to an array and drop every blank entry.
def self.filter_file_ids(input)
  Array.wrap(input).reject(&:blank?)
end
|
54
88
|
|
89
|
+
##
|
90
|
+
# @api private
|
91
|
+
#
|
55
92
|
# Given Hyrax::Upload object, return path to file on local filesystem
|
56
93
|
def self.upload_path(upload)
  # so many layers to this onion: the value we want sits three `#file` calls deep.
  # TODO: Write a recursive function to keep calling file until
  # the file doesn't respond to file then return that file.
  outer_layer = upload.file
  inner_layer = outer_layer.file
  inner_layer.file
end
|
60
99
|
|
61
|
-
|
62
|
-
#
|
63
|
-
|
64
|
-
0
|
65
|
-
end
|
66
|
-
|
100
|
+
##
|
101
|
+
# @api private
|
102
|
+
#
|
67
103
|
# TODO: Consider other methods to identify a PDF file.
|
68
104
|
# This sub-selection may need to be moved to use mimetype if there
|
69
105
|
# is a need to support paths not ending in .pdf (i.e. remote_urls)
|
70
106
|
def self.pdfs_only_for(paths)
  # Selection is delegated to IiifPrint.split_for_path_suffix?, one path at a time.
  paths.select do |candidate|
    IiifPrint.split_for_path_suffix?(candidate)
  end
end
|
73
109
|
end
|
74
110
|
end
|
@@ -0,0 +1,166 @@
|
|
1
|
+
module IiifPrint
  module SplitPdfs
    ##
    # This class wraps the DerivativeRodeo::Generators::PdfSplitGenerator to find preprocessed
    # images, or split a PDF if there are no preprocessed images.
    #
    # We have already attached the original file to the file_set.  We want to convert that original
    # file that's attached to a input_uri (e.g. "file://path/to/original-file" as in what we have
    # written to Fedora as the PDF)
    #
    # @see .call
    class DerivativeRodeoSplitter
      ##
      # @param filename [String] the local path to the PDF file
      # @param file_set [FileSet] file set containing the PDF file to split
      #
      # @return [Array<String>] paths to images split from each page of PDF file
      #
      # @see IiifPrint::SplitPdfs::BaseSplitter
      def self.call(filename, file_set:)
        new(filename, file_set: file_set).split_files
      end

      ##
      # @param filename [String] path to the original file.  Note that we use {#filename} to
      #        derive {#input_uri}
      # @param file_set [FileSet] the container for the original file and its derivatives.
      #
      # @param output_tmp_dir [String] the directory under which the split images are written;
      #        defaults to `Dir.tmpdir`.
      def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
        @filename = filename
        @file_set = file_set

        @input_uri = "file://#{filename}"

        # We are writing the images to a local location that CarrierWave can upload.  This is a
        # local file, internal to IiifPrint; it looks like SpaceStone/DerivativeRodeo lingo, but
        # that's just a convenience.
        output_template_path = File.join(output_tmp_dir, '{{ dir_parts[-1..-1] }}', '{{ filename }}')

        @output_location_template = "file://#{output_template_path}"
      end

      attr_reader :filename, :file_set

      ##
      # This is where, in "Fedora" we have the original file.  This is not the original file in the
      # pre-processing location but instead the long-term location of the file in the application
      # that mounts IIIF Print.
      #
      # @return [String]
      attr_reader :input_uri

      ##
      # This is the location where we're going to write the derivatives that will "go into Fedora";
      # it is a local location, one that IIIF Print's mounting application can directly do
      # "File.read"
      #
      # @return [String]
      attr_reader :output_location_template

      ##
      # Where can we find the file that represents the pre-processing template.  In this case, the
      # original PDF file.
      #
      # The logic handles a case where SpaceStone successfully fetched the file to then perform
      # processing.
      #
      # For example, SpaceStone::Serverless will pre-process derivatives and write them into an S3
      # bucket that we then use for IIIF Print.
      #
      # The result (including `nil`) is memoized, so the lookup and any copy happen at most once
      # per instance when the method completes without raising.
      #
      # @note The preprocessed_location_template should end in `.pdf`.  The
      #       DerivativeRodeo::BaseGenerator::PdfSplitGenerator#derive_preprocessed_template_from
      #       will coerce the template into one that represents the split pages.
      #
      # @return [String, NilClass] nil when no preprocessed location could be established
      #
      # @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63
      # rubocop:disable Metrics/MethodLength
      # rubocop:disable Metrics/AbcSize
      def preprocessed_location_template
        return @preprocessed_location_template if defined?(@preprocessed_location_template)

        derivative_rodeo_candidate = IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename)

        @preprocessed_location_template =
          if derivative_rodeo_candidate.blank?
            message = "#{self.class}##{__method__} could not establish derivative_rodeo_candidate for " \
                      "#{file_set.class} ID=#{file_set&.id} #to_param=#{file_set&.to_param} with filename #{filename.inspect}.  " \
                      "Move along little buddy."
            Rails.logger.debug(message)
            nil
          elsif rodeo_conformant_uri_exists?(derivative_rodeo_candidate)
            Rails.logger.debug("#{self.class}##{__method__} found existing file at location #{derivative_rodeo_candidate}.  High five partner!")
            derivative_rodeo_candidate
          elsif file_set.import_url
            message = "#{self.class}##{__method__} did not find #{derivative_rodeo_candidate.inspect} to exist.  " \
                      "Moving on to check the #{file_set.class}#import_url of #{file_set.import_url.inspect}"
            Rails.logger.warn(message)
            handle_original_file_not_in_derivative_rodeo
          else
            message = "#{self.class}##{__method__} could not find an existing file at #{derivative_rodeo_candidate} " \
                      "nor a remote_url for #{file_set.class} ID=#{file_set.id} #to_param=#{file_set&.to_param}.  " \
                      "Returning `nil' as we have no possible preprocess.  " \
                      "Maybe the input_uri #{input_uri.inspect} will be adequate."
            Rails.logger.warn(message)
            nil
          end
      end
      # rubocop:enable Metrics/AbcSize
      # rubocop:enable Metrics/MethodLength

      ##
      # @api private
      #
      # When the file does not exist in the pre-processed location (e.g. "SpaceStone") we need to
      # ensure that we have something locally.  We copy the {FileSet#import_url} to the {#input_uri}
      # location.
      #
      # @return [String] should be the {#input_uri}
      # @raise [DerivativeRodeo::Errors::FileMissingError] when the input_uri does not exist
      def handle_original_file_not_in_derivative_rodeo
        # A quick short-circuit.  Don't attempt to copy.  Likely already covered by the DerivativeRodeo::Generators::CopyGenerator
        return input_uri if rodeo_conformant_uri_exists?(input_uri)

        message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist.  " \
                  "Perhaps there was a problem in SpaceStone downloading the file?  " \
                  "Regardless, we'll use DerivativeRodeo::Generators::CopyGenerator to ensure #{input_uri.inspect} exists.  " \
                  "However, we'll almost certainly be generating child pages locally."
        Rails.logger.info(message)

        # This ensures that we have a copy of the file_set.import_url at the input_uri location;
        # we likely have this.
        DerivativeRodeo::Generators::CopyGenerator.new(
          input_uris: [file_set.import_url],
          output_location_template: input_uri
        ).generated_uris.first
      end
      # NOTE: deliberately left public (the `private` declaration below is commented out).
      # private :handle_original_file_not_in_derivative_rodeo

      # @param uri [String] a URI that DerivativeRodeo can resolve to a storage location
      # @return [Boolean] whether DerivativeRodeo reports the location as existing
      def rodeo_conformant_uri_exists?(uri)
        DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri).exist?
      end
      private :rodeo_conformant_uri_exists?

      ##
      # @return [Array<String>] the paths to each of the images split off from the PDF.
      # @raise [RuntimeError] wrapping any StandardError raised while splitting; the message
      #        carries the URIs involved and the backtrace is that of the original error.
      def split_files
        DerivativeRodeo::Generators::PdfSplitGenerator.new(
          input_uris: [input_uri],
          output_location_template: output_location_template,
          preprocessed_location_template: preprocessed_location_template
        ).generated_files.map(&:file_path)
      rescue => e
        # Read the memoized value directly rather than calling #preprocessed_location_template:
        # if that method is what raised, calling it again here would raise from inside this
        # handler (skipping the contextual message below) and could repeat the copy.
        resolved_template =
          if defined?(@preprocessed_location_template)
            @preprocessed_location_template.inspect
          else
            "(not established)"
          end
        message = "#{self.class}##{__method__} encountered `#{e.class}' “#{e}” for " \
                  "input_uri: #{input_uri.inspect}, " \
                  "output_location_template: #{output_location_template.inspect}, and " \
                  "preprocessed_location_template: #{resolved_template}."
        exception = RuntimeError.new(message)
        exception.set_backtrace(e.backtrace)
        raise exception
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module IiifPrint
  module SplitPdfs
    # Cleans up after a split PDF's file set is removed: pending relationships and the child
    # works spawned from that PDF.
    class DestroyPdfChildWorksService
      # @api public
      #
      # Does nothing unless the work's iiif_print_config names a PDF child model and the file
      # set's mime type is one of its class's PDF mime types.
      #
      # @param file_set [FileSet] What is the containing file set for the provided file.
      # @param work [Hydra::PCDM::Work] Parent of the fileset being deleted
      def self.conditionally_destroy_spawned_children_of(file_set:, work:)
        spawned_model = work.try(:iiif_print_config)&.pdf_split_child_model
        pdf_file_set = spawned_model && file_set.class.pdf_mime_types.include?(file_set.mime_type)
        return unless pdf_file_set

        pending = IiifPrint::PendingRelationship.where(parent_id: work.id, file_id: file_set.id)
        pending.find_each { |relationship| relationship.destroy }
        destroy_spawned_children(model: spawned_model, file_set: file_set, work: work)
      end

      # Destroys the child works spawned from the file set.
      def self.destroy_spawned_children(model:, file_set:, work:)
        # look first for children by the file set id they were split from
        spawned = model.where(split_from_pdf_id: file_set.id)
        # otherwise find works where file name and work `to_param` are both in the title
        spawned = model.where(title: file_set.label).where(title: work.to_param) if spawned.blank?
        return if spawned.blank?

        spawned.each { |child| child.destroy(eradicate: true) }
      end
      private_class_method :destroy_spawned_children
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module IiifPrint
  module SplitPdfs
    # The purpose of this class is to split the PDF into constituent jpg files.
    #
    # It sets the `image_extension` and `quality` attributes and supplies the `jpeg` device
    # name; the splitting itself is presumably implemented in BaseSplitter (not shown here).
    #
    # @see IiifPrint::SplitPdfs::BaseSplitter
    class PagesToJpgsSplitter < BaseSplitter
      self.image_extension = 'jpg'
      self.quality = '50'

      # @return [String] the device name used for JPEG output
      def gsdevice
        'jpeg'
      end
      private :gsdevice
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module IiifPrint
  module SplitPdfs
    # The purpose of this class is to split the PDF into constituent png files.
    #
    # @see IiifPrint::SplitPdfs::BaseSplitter
    class PagesToPngsSplitter < BaseSplitter
      self.image_extension = 'png'

      # Picks the PNG device from the colour information reported by `pdfinfo.color`.
      #
      # @return [String] `pngmonod`, `pnggray` or `png16m`
      def gsdevice
        color, _channels, bpc = pdfinfo.color
        # anything that is not gray is 24 Bit RGB:
        return 'png16m' unless color == 'gray'
        # 1 Bit Grayscale:
        return 'pngmonod' if bpc == 1
        # 8 Bit Grayscale:
        return 'pnggray' if bpc > 1

        # gray with no usable bit depth also falls back to 24 Bit RGB:
        'png16m'
      end
      private :gsdevice
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module IiifPrint
  module SplitPdfs
    # The purpose of this class is to split the PDF into constituent TIFF files.
    #
    # @see IiifPrint::SplitPdfs::BaseSplitter
    class PagesToTiffsSplitter < BaseSplitter
      self.image_extension = 'tiff'
      DEFAULT_COMPRESSION = 'lzw'.freeze
      self.compression = DEFAULT_COMPRESSION

      # Picks the TIFF device from the colour information reported by `pdfinfo.color`.
      # For 1-bit gray sources it also switches this instance's compression to 'g4'.
      #
      # @return [String] the device name
      def gsdevice
        color, channels, bpc = pdfinfo.color
        if color == 'gray'
          if bpc == 1
            # CCITT Group 4 Black and White:
            self.compression = 'g4'
            return 'tiffg4'
          end
          # 8 Bit Grayscale:
          return 'tiffgray' if bpc > 1
        end

        # otherwise color:
        colordevice(channels, bpc)
      end

      # @param channels [Integer] number of colour components
      # @param bpc [Integer] bits per component
      # @return [String] `tiff24nc` or `tiff48nc`
      def colordevice(channels, bpc)
        total_bits = bpc * channels
        # will be either 8bpc/16bpc color TIFF,
        # with any CMYK source transformed to 8bpc RGB
        depth = [24, 48].include?(total_bits) ? total_bits : 24
        "tiff#{depth}nc"
      end
      private :gsdevice, :colordevice
    end
  end
end
|
@@ -8,78 +8,83 @@ module IiifPrint
|
|
8
8
|
# For dpi extraction, falls back to calculating using MiniMagick,
|
9
9
|
# if necessary.
|
10
10
|
class PdfImageExtractionService
  # Needed for Shellwords.escape; required here because the file's top-level requires are
  # outside this block.
  require 'shellwords'

  # Runs `pdfimages -list` for the given path and records the aggregated image details.
  #
  # @param path [String] local path to the PDF
  def initialize(path)
    @path = path
    # The command goes through a shell (for the stderr redirect), so the path must be escaped:
    # unescaped, a path with spaces or shell metacharacters breaks or alters the command.
    process(command: format('pdfimages -list %<path>s 2>/dev/null', path: Shellwords.escape(path.to_s)))
  end

  attr_reader :path, :page_count, :width, :height, :pixels_per_inch
  alias ppi pixels_per_inch

  # @return [Array<String, Integer, Integer>] colour description ('gray' or 'rgb'), the
  #   largest channel count and the largest bits-per-component seen across the listed rows
  def color
    [@color_description, @channels, @bits]
  end

  private

  # class constant column numbers
  # NOTE: `private` does not apply to constants, so these remain publicly readable.
  COL_WIDTH = 3
  COL_HEIGHT = 4
  COL_COLOR_DESC = 5
  COL_CHANNELS = 6
  COL_BITS = 7
  # only poppler 0.25+ has this column in output:
  COL_XPPI = 12

  # rubocop:disable Metrics/AbcSize - Because this helps us process the results in one loop.
  # rubocop:disable Metrics/MethodLength - Again, to help speed up the processing loop.
  # rubocop:disable Metrics/CyclomaticComplexity
  # rubocop:disable Metrics/PerceivedComplexity
  #
  # Runs the given shell command and folds each data row of its output into the instance
  # state: row count, maximum width/height/channels/bits/ppi, and 'rgb' as soon as any row is
  # not 'gray'.
  #
  # The first two lines are tabular header information:
  #
  # Example:
  #
  #   bash-5.1$ pdfimages -list fmc_color.pdf | head -5
  #   page   num  type   width height color comp bpc  enc interp  object ID x-ppi y-ppi size ratio
  #   --------------------------------------------------------------------------------------------
  #   1     0 image    2475   413  rgb     3   8  jpeg   no        10  0   300   300 21.8K 0.7%
  #
  # @param command [String] shell command whose stdout is in `pdfimages -list` format
  def process(command:)
    @page_count = 0
    @color_description = 'gray'
    @width = 0
    @height = 0
    @channels = 0
    @bits = 0
    @pixels_per_inch = 0
    Open3.popen3(command) do |_stdin, stdout, _stderr, _wait_thr|
      stdout.read.split("\n").each_with_index do |line, index|
        # Skip the two header lines
        next if index <= 1

        # String#split with no arguments splits on runs of whitespace and ignores
        # leading/trailing whitespace.
        cells = line.split
        # A blank row carries no image data; counting it would also flip the colour to 'rgb'.
        next if cells.empty?

        @page_count += 1

        @color_description = 'rgb' if cells[COL_COLOR_DESC] != 'gray'
        @width = cells[COL_WIDTH].to_i if cells[COL_WIDTH].to_i > @width
        @height = cells[COL_HEIGHT].to_i if cells[COL_HEIGHT].to_i > @height
        @channels = cells[COL_CHANNELS].to_i if cells[COL_CHANNELS].to_i > @channels
        @bits = cells[COL_BITS].to_i if cells[COL_BITS].to_i > @bits

        # In the case of poppler version < 0.25, we will have no more than 12 columns.  As such,
        # we need to do some alternative magic to calculate this.
        if @page_count == 1 && cells.size <= 12
          pdf = MiniMagick::Image.open(@path)
          width_points = pdf.width
          width_px = width
          @pixels_per_inch = (72 * width_px / width_points).to_i
        elsif cells[COL_XPPI].to_i > @pixels_per_inch
          # By the magic of nil#to_i if we don't have more than 12 columns, we've already set
          # the @pixels_per_inch and this line won't do much of anything.
          @pixels_per_inch = cells[COL_XPPI].to_i
        end
      end
    end
  end
  # rubocop:enable Metrics/AbcSize
  # rubocop:enable Metrics/MethodLength
  # rubocop:enable Metrics/CyclomaticComplexity
  # rubocop:enable Metrics/PerceivedComplexity
end
|
84
89
|
end
|
85
90
|
end
|
@@ -84,6 +84,7 @@ module IiifPrint
|
|
84
84
|
# add trailing space to plaintext buffer for between words:
|
85
85
|
@text += ' '
|
86
86
|
@words.push(@current) if word_complete?
|
87
|
+
@current = nil # clear the current word
|
87
88
|
end
|
88
89
|
|
89
90
|
def end_line
|
@@ -120,9 +121,12 @@ module IiifPrint
|
|
120
121
|
# for current word, and append line endings to plain text:
|
121
122
|
#
|
122
123
|
# @param name [String] element name.
|
123
|
-
def end_element(
|
124
|
-
|
125
|
-
|
124
|
+
def end_element(name)
  closing_span = (name == 'span')
  # A closing word span completes the current word ...
  end_word if closing_span && @element_class_name == 'ocrx_word'
  # ... while a closing span with no recorded class name appends a line ending to the text.
  @text += "\n" if closing_span && @element_class_name.nil?
  # Always forget the recorded class name once the element is closed.
  @element_class_name = nil
end
|
127
131
|
|
128
132
|
# Callback for completion of parsing hOCR, used to normalize generated
|
@@ -9,7 +9,7 @@ module IiifPrint
|
|
9
9
|
class PageOCR
|
10
10
|
attr_accessor :html, :path
|
11
11
|
|
12
|
-
def initialize(path,
|
12
|
+
def initialize(path, additional_tesseract_options: IiifPrint.config.additional_tesseract_options)
|
13
13
|
@path = path
|
14
14
|
# hOCR html:
|
15
15
|
@html = nil
|
@@ -17,13 +17,14 @@ module IiifPrint
|
|
17
17
|
@source_meta = nil
|
18
18
|
@box = nil
|
19
19
|
@plain = nil
|
20
|
-
@
|
20
|
+
@additional_tesseract_options = additional_tesseract_options
|
21
21
|
end
|
22
22
|
|
23
23
|
# Runs tesseract on {#path}, asking for hOCR output in a fresh temporary directory.
#
# @return [String] the path of the `.hocr` file tesseract was asked to write
def run_ocr
  # Local require: the file's top-level requires are outside this block.
  require 'shellwords'

  outfile = File.join(Dir.mktmpdir, 'output_html')
  # The command is run through a shell, so both file arguments are escaped; otherwise a path
  # with spaces or shell metacharacters would break (or alter) the command.
  cmd = "OMP_THREAD_LIMIT=1 tesseract #{Shellwords.escape(path.to_s)} #{Shellwords.escape(outfile)}"
  # The additional options are appended verbatim, as before.
  cmd += " #{@additional_tesseract_options}" if @additional_tesseract_options.present?
  cmd += " hocr"
  `#{cmd}`
  outfile + '.hocr'
end
|
data/lib/iiif_print/version.rb
CHANGED