iiif_print 1.0.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
- data/.github/workflows/build-lint-test-action.yaml +4 -5
- data/.gitignore +5 -4
- data/.rubocop.yml +1 -0
- data/.solargraph.yml +19 -0
- data/Gemfile.lock +1025 -0
- data/README.md +102 -9
- data/Rakefile +6 -0
- data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
- data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
- data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
- data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
- data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
- data/app/helpers/iiif_print_helper.rb +0 -20
- data/app/indexers/concerns/iiif_print/child_work_indexer.rb +27 -0
- data/app/indexers/concerns/iiif_print/file_set_indexer.rb +45 -17
- data/{lib → app/jobs}/iiif_print/jobs/application_job.rb +2 -1
- data/app/jobs/iiif_print/jobs/child_works_from_pdf_job.rb +153 -0
- data/app/jobs/iiif_print/jobs/create_relationships_job.rb +117 -0
- data/app/jobs/iiif_print/jobs/request_split_pdf_job.rb +31 -0
- data/app/listeners/iiif_print/listener.rb +31 -0
- data/app/models/concerns/iiif_print/set_child_flag.rb +10 -1
- data/app/models/concerns/iiif_print/solr/document.rb +19 -3
- data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
- data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
- data/app/models/iiif_print/pending_relationship.rb +3 -0
- data/app/presenters/iiif_print/file_set_presenter_decorator.rb +11 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
- data/app/presenters/iiif_print/work_show_presenter_decorator.rb +23 -11
- data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
- data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
- data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
- data/app/services/iiif_print/manifest_builder_service_behavior.rb +90 -31
- data/app/services/iiif_print/pluggable_derivative_service.rb +8 -10
- data/app/services/iiif_print/simple_schema_loader_decorator.rb +11 -0
- data/app/transactions/hyrax/transactions/iiif_print_container_decorator.rb +34 -0
- data/app/transactions/hyrax/transactions/steps/conditionally_destroy_children_from_split.rb +32 -0
- data/app/transactions/hyrax/transactions/steps/delete_all_file_sets_decorator.rb +35 -0
- data/app/views/catalog/_index_header_list_default.html.erb +13 -0
- data/app/views/hyrax/base/_representative_media.html.erb +4 -3
- data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
- data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
- data/config/initializers/simple_schema_loader.rb +1 -0
- data/config/locales/iiif_print.en.yml +4 -0
- data/config/metadata/child_works_from_pdf_splitting.yaml +21 -0
- data/config/routes.rb +3 -0
- data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +8 -6
- data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +7 -5
- data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +8 -6
- data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
- data/docker-compose.yml +2 -2
- data/iiif_print.gemspec +11 -10
- data/lib/generators/iiif_print/install_generator.rb +21 -1
- data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
- data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
- data/lib/iiif_print/base_derivative_service.rb +14 -2
- data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +58 -6
- data/lib/iiif_print/catalog_search_builder.rb +7 -3
- data/lib/iiif_print/configuration.rb +205 -8
- data/lib/iiif_print/data/fileset_helper.rb +3 -3
- data/lib/iiif_print/data/work_derivatives.rb +4 -4
- data/lib/iiif_print/engine.rb +53 -15
- data/lib/iiif_print/errors.rb +18 -0
- data/lib/iiif_print/homepage_search_builder.rb +17 -0
- data/lib/iiif_print/image_tool.rb +12 -8
- data/lib/iiif_print/jp2_derivative_service.rb +4 -1
- data/lib/iiif_print/lineage_service.rb +47 -13
- data/lib/iiif_print/metadata.rb +67 -48
- data/lib/iiif_print/pdf_derivative_service.rb +3 -1
- data/lib/iiif_print/persistence_layer/active_fedora_adapter.rb +189 -0
- data/lib/iiif_print/persistence_layer/valkyrie_adapter.rb +183 -0
- data/lib/iiif_print/persistence_layer.rb +118 -0
- data/lib/iiif_print/split_pdfs/base_splitter.rb +153 -0
- data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +83 -37
- data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
- data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +22 -0
- data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
- data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
- data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
- data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
- data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
- data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
- data/lib/iiif_print/text_extraction_derivative_service.rb +4 -2
- data/lib/iiif_print/text_formats_from_alto_service.rb +3 -1
- data/lib/iiif_print/tiff_derivative_service.rb +3 -1
- data/lib/iiif_print/version.rb +1 -1
- data/lib/iiif_print.rb +210 -20
- data/lib/samvera/derivatives/configuration.rb +83 -0
- data/lib/samvera/derivatives/hyrax.rb +129 -0
- data/lib/samvera/derivatives.rb +238 -0
- data/tasks/copy_authorities_to_test_app.rake +11 -0
- data/tasks/iiif_print_dev.rake +4 -4
- metadata +111 -196
- data/app/helpers/hyrax/iiif_helper.rb +0 -22
- data/app/indexers/concerns/iiif_print/child_indexer.rb +0 -34
- data/app/views/hyrax/file_sets/_actions.html.erb +0 -45
- data/bin/rails +0 -13
- data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +0 -107
- data/lib/iiif_print/jobs/create_relationships_job.rb +0 -78
- data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
- data/spec/.keep.txt +0 -1
- data/spec/factories/ability.rb +0 -6
- data/spec/factories/newspaper_issue.rb +0 -7
- data/spec/factories/newspaper_page.rb +0 -7
- data/spec/factories/newspaper_page_solr_document.rb +0 -12
- data/spec/factories/newspaper_title.rb +0 -8
- data/spec/factories/uploaded_pdf_file.rb +0 -9
- data/spec/factories/uploaded_txt_file.rb +0 -9
- data/spec/factories/user.rb +0 -13
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +0 -7
- data/spec/fixtures/files/alto-2-0.xsd +0 -714
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +0 -16
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +0 -31
- data/spec/fixtures/files/ndnp-alto-sample.xml +0 -24
- data/spec/fixtures/files/ndnp-sample1-json.json +0 -1
- data/spec/fixtures/files/ndnp-sample1-txt.txt +0 -1
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +0 -202
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +0 -202
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/ocr_mono_text_hocr.html +0 -78
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/helpers/hyrax/iiif_helper_spec.rb +0 -65
- data/spec/helpers/iiif_print_helper_spec.rb +0 -43
- data/spec/iiif_print/base_derivative_service_spec.rb +0 -11
- data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +0 -51
- data/spec/iiif_print/catalog_search_builder_spec.rb +0 -60
- data/spec/iiif_print/configuration_spec.rb +0 -67
- data/spec/iiif_print/data/work_derivatives_spec.rb +0 -245
- data/spec/iiif_print/data/work_file_spec.rb +0 -99
- data/spec/iiif_print/data/work_files_spec.rb +0 -237
- data/spec/iiif_print/image_tool_spec.rb +0 -109
- data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +0 -30
- data/spec/iiif_print/jobs/create_relationships_job_spec.rb +0 -17
- data/spec/iiif_print/jp2_image_metadata_spec.rb +0 -37
- data/spec/iiif_print/lineage_service_spec.rb +0 -13
- data/spec/iiif_print/metadata_spec.rb +0 -115
- data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
- data/spec/iiif_print/text_extraction/alto_reader_spec.rb +0 -49
- data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +0 -45
- data/spec/iiif_print/text_extraction/page_ocr_spec.rb +0 -84
- data/spec/iiif_print/text_extraction/render_alto_spec.rb +0 -54
- data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +0 -44
- data/spec/iiif_print_spec.rb +0 -51
- data/spec/misc_shared.rb +0 -111
- data/spec/models/iiif_print/derivative_attachment_spec.rb +0 -37
- data/spec/models/iiif_print/ingest_file_relation_spec.rb +0 -56
- data/spec/models/solr_document_spec.rb +0 -14
- data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +0 -19
- data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +0 -49
- data/spec/services/iiif_print/jp2_derivative_service_spec.rb +0 -59
- data/spec/services/iiif_print/pdf_derivative_service_spec.rb +0 -66
- data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +0 -178
- data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +0 -82
- data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +0 -127
- data/spec/services/iiif_print/tiff_derivative_service_spec.rb +0 -65
- data/spec/spec_helper.rb +0 -181
- data/spec/support/controller_level_helpers.rb +0 -28
- data/spec/support/iiif_print_models.rb +0 -127
- data/spec/test_app_templates/blacklight.yml +0 -9
- data/spec/test_app_templates/fedora.yml +0 -15
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +0 -40
- data/spec/test_app_templates/redis.yml +0 -9
- data/spec/test_app_templates/solr/conf/schema.xml +0 -362
- data/spec/test_app_templates/solr/conf/solrconfig.xml +0 -322
- data/spec/test_app_templates/solr.yml +0 -7
@@ -0,0 +1,153 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'securerandom'
|
3
|
+
require 'tmpdir'
|
4
|
+
require 'iiif_print/split_pdfs/pdf_image_extraction_service'
|
5
|
+
|
6
|
+
module IiifPrint
|
7
|
+
module SplitPdfs
|
8
|
+
# @abstract
|
9
|
+
#
|
10
|
+
# The purpose of this class is to split the PDF into constituent image files.
|
11
|
+
#
|
12
|
+
# @see .call
|
13
|
+
class BaseSplitter
  ##
  # @api public
  #
  # @param path [String] local path to the PDF that we will split.
  # @return [Enumerable]
  #
  # @see #each
  #
  # @note We're including the ** args to provide method conformity; other services require
  #       additional information (such as the FileSet)
  #
  # @see IiifPrint::SplitPdfs::DerivativeRodeoSplitter
  def self.call(path, **)
    new(path).to_a
  end

  ##
  # @api public
  #
  # Added to allow for fine-tuning of splitting decision such as tenant-based omission
  # @see https://github.com/samvera/hyku/blob/main/app/services/iiif_print/tenant_config.rb
  #
  # @return [Boolean] returns false to not limit the splitting of PDFs
  def self.never_split_pdfs?
    false
  end

  # Subclasses configure the output format through these attributes:
  # - image_extension: file extension used for the generated page images.
  # - compression: when set, passed to ghostscript as -sCompression.
  # - quality: when set, passed to ghostscript as -dJPEGQ.
  class_attribute :image_extension
  class_attribute :compression, default: nil
  class_attribute :quality, default: nil

  # @param path [String] local path to the PDF that we will split.
  # @param tmpdir [String] directory into which the page images are written.
  # @param default_dpi [Integer] resolution used when the PDF does not look like scanned media.
  def initialize(path, tmpdir: Dir.mktmpdir, default_dpi: 400)
    @baseid = SecureRandom.uuid
    @pdfpath = path
    @pdfinfo = IiifPrint::SplitPdfs::PdfImageExtractionService.new(pdfpath)
    @tmpdir = tmpdir
    @default_dpi = default_dpi
  end

  # In creating {#each} we get many of the methods of array operation (e.g. #to_a).
  include Enumerable

  # @api public
  #
  # @yieldparam [String] the path to the page's image.
  # @return [Enumerator] when no block is given.
  def each(&block)
    return enum_for(:each) unless block_given?

    entries.each(&block)
  end

  # @api private
  #
  # TODO: put this test somewhere to prevent invalid pdfs from crashing the image service.
  #
  # @return [Boolean] true when the extracted image information is incomplete or empty.
  def invalid_pdf?
    pdfinfo.color.include?(nil) || pdfinfo.width.nil? || pdfinfo.height.nil? || pdfinfo.page_count.zero?
  end

  attr_reader :pdfinfo, :tmpdir, :baseid, :default_dpi, :pdfpath
  private :pdfinfo, :tmpdir, :baseid, :default_dpi, :pdfpath

  private

  # entries for each page
  #
  # NOTE: this shadows Enumerable#entries; the result of the (expensive) ghostscript
  # run is memoized so repeated iteration does not re-split the PDF.
  def entries
    return @entries if defined? @entries

    @entries = Array.wrap(gsconvert)
  end

  # ghostscript convert all pages to images of type {.image_extension}
  #
  # The command is passed to Open3 as an argument list (no shell), so a PDF path or
  # tmpdir containing spaces or shell metacharacters is handled safely. Both stdout
  # and stderr are drained so a chatty ghostscript cannot block on a full pipe.
  #
  # @return [Array<String>] expected path of each page image, in page order.
  def gsconvert
    output_base = File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")
    # NOTE: you must call gsdevice before compression, as compression is
    # updated during the gsdevice call.
    args = ["gs", "-dNOPAUSE", "-dBATCH", "-sDEVICE=#{gsdevice}", "-dTextAlphaBits=4"]
    args << "-sCompression=#{compression}" if compression?
    args << "-dJPEGQ=#{quality}" if quality?
    args.push("-sOutputFile=#{output_base}", "-r#{ppi}", "-f", pdfpath.to_s)

    stdout, _stderr, _status = Open3.capture3(*args)
    # ghostscript reports one "Page N" line per rendered page.
    rendered_pages = stdout.each_line.count { |line| line.start_with?('Page ') }
    (1..rendered_pages).map do |page_number|
      File.join(tmpdir, "#{baseid}-page#{page_number}.#{image_extension}")
    end
  end

  # @abstract Subclasses return the name of the ghostscript output device to use.
  # @return [String]
  def gsdevice
    raise NotImplementedError
  end

  PAGE_COUNT_REGEXP = %r{^Pages: +(\d+)$}.freeze

  # Number of pages as reported by the `pdfinfo` command line tool.
  #
  # @return [Integer] the page count; 0 when `pdfinfo` printed no "Pages:" line
  #   (e.g. it could not read the file).
  def pagecount
    return @pagecount if defined? @pagecount

    stdout, _stderr, _status = Open3.capture3("pdfinfo", pdfpath.to_s)
    match = PAGE_COUNT_REGEXP.match(stdout)
    @pagecount = match ? match[1].to_i : 0
  end

  # @return [Integer] resolution handed to ghostscript via -r.
  def ppi
    if looks_scanned?
      # For scanned media, defer to detected image PPI:
      pdfinfo.ppi
    else
      # 400 dpi for something that does not look like scanned media:
      default_dpi
    end
  end

  def looks_scanned?
    max_image_px = pdfinfo.width * pdfinfo.height
    # single 10mp+ image per page?
    single_image_per_page? && max_image_px > 1024 * 1024 * 10
  end

  def single_image_per_page?
    pdfinfo.page_count == pagecount
  end
end
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
151
|
+
require "iiif_print/split_pdfs/pages_to_jpgs_splitter"
|
152
|
+
require "iiif_print/split_pdfs/pages_to_pngs_splitter"
|
153
|
+
require "iiif_print/split_pdfs/pages_to_tiffs_splitter"
|
@@ -1,29 +1,49 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
# Encapsulates methods used for pdf splitting into child works
|
4
3
|
module IiifPrint
|
5
4
|
module SplitPdfs
|
5
|
+
##
|
6
|
+
# Encapsulates methods used for pdf splitting into child works.
|
7
|
+
#
|
8
|
+
# The primary point of entry is {.conditionally_enqueue}.
|
6
9
|
class ChildWorkCreationFromPdfService
|
7
|
-
|
8
|
-
#
|
9
|
-
#
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
10
|
+
##
|
11
|
+
# Responsible for conditionally enqueueing the PDF splitting job. The conditions attempt to
|
12
|
+
# sniff out whether the given file was a PDF.
|
13
|
+
#
|
14
|
+
# @param file_set [FileSet] What is the containing file set for the provided file.
|
15
|
+
# @param file [#path, #id]
|
16
|
+
# @param user [User] Who did the upload?
|
17
|
+
# @param import_url [NilClass, String] Provided when we're dealing with a file provided via a
|
18
|
+
# URL.
|
19
|
+
# @param work [Hydra::PCDM::Work] An optional parameter that saves us a bit of time in not
|
20
|
+
# needing to query for the parent of the given :file_set (see {.parent_for})
|
21
|
+
#
|
22
|
+
# @return [Symbol] when we don't enqueue the job
|
23
|
+
# @return [Symbol] `:enqueued` when we actually enqueue the underlying job.
|
24
|
+
# rubocop:disable Metrics/MethodLength
|
25
|
+
def self.conditionally_enqueue(file_set:, file:, user:, import_url: nil, work: nil)
  # Only query for the parent when the caller did not already provide it.
  work ||= IiifPrint.parent_for(file_set)
  return :no_split_for_parent unless iiif_print_split?(work: work)

  if import_url
    return :no_pdfs_to_split_for_import_url unless pdfs?(paths: [import_url])

    # TODO: Fix this logic, currently unsupported in Bulkrax
    file_locations = [Hyrax::WorkingDirectory.find_or_retrieve(file.id, file_set.id)]
  else
    file_locations = pdf_paths(file: file)
  end
  return :no_pdfs_to_split if file_locations.empty?

  IiifPrint.conditionally_submit_split_for(work: work, file_set: file_set, locations: file_locations, user: user)
  :enqueued
end
|
42
|
+
# rubocop:enable Metrics/MethodLength
|
26
43
|
|
44
|
+
##
|
45
|
+
# @api private
|
46
|
+
#
|
27
47
|
# Are there any PDF files?
|
28
48
|
# @param [Array > String] paths to PDFs
|
29
49
|
# @return [Boolean]
|
@@ -33,42 +53,68 @@ module IiifPrint
|
|
33
53
|
true
|
34
54
|
end
|
35
55
|
|
36
|
-
|
56
|
+
##
|
57
|
+
# @api private
|
58
|
+
# Load an array of paths to pdf files
|
59
|
+
# @param [Array > Hyrax::Upload file ids]
|
60
|
+
# @return [Array > String] file paths to temp directory
|
61
|
+
def self.pdf_paths(file:)
  return [] unless file

  candidate_paths =
    if file.class < Valkyrie::Resource
      # assuming that if one PDF is uploaded to a Valkyrie resource then all of them should be
      [Hyrax.storage_adapter.file_path(file.file_identifier)]
    else
      ids = filter_file_ids(file.id.to_s)
      return [] if ids.empty?

      # Resolve each uploaded file record to its path on the local filesystem.
      Hyrax::UploadedFile.find(ids).map { |uploaded| upload_path(uploaded) }
    end

  pdfs_only_for(candidate_paths)
end
|
77
|
+
|
78
|
+
##
|
79
|
+
# @api private
|
80
|
+
#
|
81
|
+
# Is child work splitting defined for model?
|
37
82
|
# @param [GenericWork, etc] A valid type of hyrax work
|
38
|
-
# @
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
admin_set_id,
|
47
|
-
count_existing_pdfs(work)
|
48
|
-
)
|
83
|
+
# @return [Boolean]
|
84
|
+
def self.iiif_print_split?(work:)
  splitting_config = work.try(:iiif_print_config)

  if !splitting_config || splitting_config.pdf_splitter_service.try(:never_split_pdfs?)
    # Either the work is not configured for IIIF Print or its splitter opts out entirely.
    false
  else
    # defined only if work has include IiifPrint.model_configuration with pdf_split_child_model
    splitting_config.pdf_split_child_model ? true : false
  end
end
|
50
92
|
|
93
|
+
##
|
94
|
+
# @api private
|
51
95
|
def self.filter_file_ids(input)
  # Normalize to an array, then drop anything blank (nil, empty strings, ...).
  Array.wrap(input).reject(&:blank?)
end
|
54
98
|
|
99
|
+
##
|
100
|
+
# @api private
|
101
|
+
#
|
55
102
|
# Given Hyrax::Upload object, return path to file on local filesystem
|
56
103
|
def self.upload_path(upload)
  # so many layers to this onion:
  # TODO: Write a recursive function to keep calling file until
  # the file doesn't respond to file then return that file.
  outer_layer = upload.file
  inner_layer = outer_layer.file
  inner_layer.file
end
|
60
109
|
|
61
|
-
|
62
|
-
#
|
63
|
-
|
64
|
-
0
|
65
|
-
end
|
66
|
-
|
110
|
+
##
|
111
|
+
# @api private
|
112
|
+
#
|
67
113
|
# TODO: Consider other methods to identify a PDF file.
|
68
114
|
# This sub-selection may need to be moved to use mimetype if there
|
69
115
|
# is a need to support paths not ending in .pdf (i.e. remote_urls)
|
70
116
|
def self.pdfs_only_for(paths)
  # Keep only the paths whose suffix IiifPrint considers splittable.
  paths.select do |candidate|
    IiifPrint.split_for_path_suffix?(candidate)
  end
end
|
73
119
|
end
|
74
120
|
end
|
@@ -0,0 +1,166 @@
|
|
1
|
+
module IiifPrint
|
2
|
+
module SplitPdfs
|
3
|
+
##
|
4
|
+
# This class wraps the DerivativeRodeo::Generators::PdfSplitGenerator to find preprocessed
|
5
|
+
# images, or split a PDF if there are no preprocessed images.
|
6
|
+
#
|
7
|
+
# We have already attached the original file to the file_set. We want to convert that original
|
8
|
+
# file that's attached to a input_uri (e.g. "file://path/to/original-file" as in what we have
|
9
|
+
# written to Fedora as the PDF)
|
10
|
+
#
|
11
|
+
# @see .call
|
12
|
+
class DerivativeRodeoSplitter
  ##
  # @param filename [String] the local path to the PDF file
  # @param file_set [FileSet] file set containing the PDF file to split
  #
  # @return [Array<String>] paths to images split from each page of PDF file
  #
  # @see IiifPrint::SplitPdfs::BaseSplitter
  def self.call(filename, file_set:)
    new(filename, file_set: file_set).split_files
  end

  ##
  # @param filename [String] path to the original file.  Note that we use {#filename} to
  #        derive {#input_uri}
  # @param file_set [FileSet] the container for the original file and its derivatives.
  #
  # @param output_tmp_dir [String] where we will be writing things.  Defaults to `Dir.tmpdir`;
  #        pass the result of `Dir.mktmpdir` to write into a dedicated subdirectory instead.
  def initialize(filename, file_set:, output_tmp_dir: Dir.tmpdir)
    @filename = filename
    @file_set = file_set

    @input_uri = "file://#{filename}"

    # We are writing the images to a local location that CarrierWave can upload.  This is a
    # local file, internal to IiifPrint; it looks like SpaceStone/DerivativeRodeo lingo, but
    # that's just a convenience.
    output_template_path = File.join(output_tmp_dir, '{{ dir_parts[-1..-1] }}', '{{ filename }}')

    @output_location_template = "file://#{output_template_path}"
  end

  attr_reader :filename, :file_set

  ##
  # This is where, in "Fedora" we have the original file.  This is not the original file in the
  # pre-processing location but instead the long-term location of the file in the application
  # that mounts IIIF Print.
  #
  # @return [String]
  attr_reader :input_uri

  ##
  # This is the location where we're going to write the derivatives that will "go into Fedora";
  # it is a local location, one that IIIF Print's mounting application can directly do
  # "File.read"
  #
  # @return [String]
  attr_reader :output_location_template

  ##
  # Where can we find the file that represents the pre-processing template.  In this case, the
  # original PDF file.
  #
  # The logic handles a case where SpaceStone successfully fetched the file to then perform
  # processing.
  #
  # For example, SpaceStone::Serverless will pre-process derivatives and write them into an S3
  # bucket that we then use for IIIF Print.
  #
  # @note The preprocessed_location_template should end in `.pdf`.  The
  #       DerivativeRodeo::BaseGenerator::PdfSplitGenerator#derive_preprocessed_template_from
  #       will coerce the template into one that represents the split pages.
  #
  # @return [String] when a preprocessed (or locally copied) source could be established
  # @return [NilClass] when there is no candidate, or it does not exist and the file set has
  #         no import_url
  #
  # @see https://github.com/scientist-softserv/space_stone-serverless/blob/7f46dd5b218381739cd1c771183f95408a4e0752/awslambda/handler.rb#L58-L63
  # rubocop:disable Metrics/MethodLength
  # rubocop:disable Metrics/AbcSize
  def preprocessed_location_template
    return @preprocessed_location_template if defined?(@preprocessed_location_template)

    derivative_rodeo_candidate = IiifPrint::DerivativeRodeoService.derivative_rodeo_uri(file_set: file_set, filename: filename)

    @preprocessed_location_template =
      if derivative_rodeo_candidate.blank?
        message = "#{self.class}##{__method__} could not establish derivative_rodeo_candidate for " \
                  "#{file_set.class} ID=#{file_set&.id} #to_param=#{file_set&.to_param} with filename #{filename.inspect}.  " \
                  "Move along little buddy."
        Rails.logger.debug(message)
        nil
      elsif rodeo_conformant_uri_exists?(derivative_rodeo_candidate)
        Rails.logger.debug("#{self.class}##{__method__} found existing file at location #{derivative_rodeo_candidate}.  High five partner!")
        derivative_rodeo_candidate
      elsif file_set.import_url
        message = "#{self.class}##{__method__} did not find #{derivative_rodeo_candidate.inspect} to exist.  " \
                  "Moving on to check the #{file_set.class}#import_url of #{file_set.import_url.inspect}"
        Rails.logger.warn(message)
        handle_original_file_not_in_derivative_rodeo
      else
        message = "#{self.class}##{__method__} could not find an existing file at #{derivative_rodeo_candidate} " \
                  "nor a remote_url for #{file_set.class} ID=#{file_set.id} #to_param=#{file_set&.to_param}.  " \
                  "Returning `nil' as we have no possible preprocess.  " \
                  "Maybe the input_uri #{input_uri.inspect} will be adequate."
        Rails.logger.warn(message)
        nil
      end
  end
  # rubocop:enable Metrics/AbcSize
  # rubocop:enable Metrics/MethodLength

  ##
  # @api private
  #
  # When the file does not exist in the pre-processed location (e.g. "SpaceStone") we need to
  # ensure that we have something locally.  We copy the {FileSet#import_url} to the {#input_uri}
  # location.
  #
  # @return [String] should be the {#input_uri}
  # @raise [DerivativeRodeo::Errors::FileMissingError] when the input_uri does not exist
  def handle_original_file_not_in_derivative_rodeo
    # A quick short-circuit.  Don't attempt to copy.  Likely already covered by the DerivativeRodeo::Generators::CopyGenerator
    return input_uri if rodeo_conformant_uri_exists?(input_uri)

    message = "#{self.class}##{__method__} found #{file_set.class}#import_url of #{file_set.import_url.inspect} to exist.  " \
              "Perhaps there was a problem in SpaceStone downloading the file?  " \
              "Regardless, we'll use DerivativeRodeo::Generators::CopyGenerator to ensure #{input_uri.inspect} exists.  " \
              "However, we'll almost certainly be generating child pages locally."
    Rails.logger.info(message)

    # This ensures that we have a copy of the file_set.import_uri at the input_uri location;
    # we likely have this.
    DerivativeRodeo::Generators::CopyGenerator.new(
      input_uris: [file_set.import_url],
      output_location_template: input_uri
    ).generated_uris.first
  end
  # private :handle_original_file_not_in_derivative_rodeo

  # @param uri [String] a URI that DerivativeRodeo knows how to resolve to a storage location
  # @return [Boolean] whether something exists at that location
  def rodeo_conformant_uri_exists?(uri)
    DerivativeRodeo::StorageLocations::BaseLocation.from_uri(uri).exist?
  end
  private :rodeo_conformant_uri_exists?

  ##
  # @return [Array<Strings>] the paths to each of the images split off from the PDF.
  # @raise [RuntimeError] wrapping any StandardError raised while splitting; the message
  #        carries the URIs/templates involved and the original backtrace is preserved.
  def split_files
    DerivativeRodeo::Generators::PdfSplitGenerator.new(
      input_uris: [input_uri],
      output_location_template: output_location_template,
      preprocessed_location_template: preprocessed_location_template
    ).generated_files.map(&:file_path)
  rescue => e
    # Read the memoized value rather than calling #preprocessed_location_template again:
    # when resolving the template is what failed, calling it here would raise a second
    # time from inside this handler and the diagnostic message below would be lost.
    preprocessed = defined?(@preprocessed_location_template) ? @preprocessed_location_template.inspect : "(unresolved)"
    message = "#{self.class}##{__method__} encountered `#{e.class}' “#{e}” for " \
              "input_uri: #{input_uri.inspect}, " \
              "output_location_template: #{output_location_template.inspect}, and " \
              "preprocessed_location_template: #{preprocessed}."
    exception = RuntimeError.new(message)
    exception.set_backtrace(e.backtrace)
    raise exception
  end
end
|
165
|
+
end
|
166
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module IiifPrint
|
4
|
+
module SplitPdfs
|
5
|
+
## Encapsulates logic for cleanup when the PDF is destroyed after pdf splitting into child works
|
6
|
+
class DestroyPdfChildWorksService
  ##
  # @api public
  #
  # Removes the pending relationships and the child works associated with a PDF file set.
  # Does nothing unless the work is configured with a pdf_split_child_model and the
  # file set is a PDF (per IiifPrint.pdf?).
  #
  # @param file_set [FileSet] What is the containing file set for the provided file.
  # @param work [Hydra::PCDM::Work] Parent of the fileset being deleted
  # @param user [User, NilClass] forwarded to IiifPrint.destroy_children_split_from
  def self.conditionally_destroy_spawned_children_of(file_set:, work:, user: nil)
    split_child_model = work.try(:iiif_print_config)&.pdf_split_child_model
    return if !split_child_model || !IiifPrint.pdf?(file_set)

    # NOTE: The IiifPrint::PendingRelationship is an ActiveRecord object; hence we don't need to
    # leverage an adapter.
    pending = IiifPrint::PendingRelationship.where(parent_id: work.id, file_id: file_set.id)
    pending.find_each(&:destroy)

    IiifPrint.destroy_children_split_from(file_set: file_set, work: work, model: split_child_model, user: user)
  end
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module IiifPrint
|
2
|
+
module SplitPdfs
|
3
|
+
# The purpose of this class is to split the PDF into constituent jpg files.
#
# This is a concrete splitter: it supplies the image extension, the JPEG quality and
# the ghostscript device that {BaseSplitter} needs.
#
# @see #each
class PagesToJpgsSplitter < BaseSplitter
  self.image_extension = 'jpg'
  # Passed to ghostscript as -dJPEGQ by BaseSplitter#gsconvert.
  self.quality = '50'

  private

  # @return [String] the ghostscript output device used for JPEG pages.
  def gsdevice
    'jpeg'
  end
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module IiifPrint
|
2
|
+
module SplitPdfs
|
3
|
+
# The purpose of this class is to split the PDF into constituent png files.
#
# This is a concrete splitter: it supplies the image extension and picks the
# ghostscript device from the color information detected in the PDF.
#
# @see #each
class PagesToPngsSplitter < BaseSplitter
  self.image_extension = 'png'

  private

  # Choose the ghostscript PNG device matching the PDF's detected color information.
  #
  # A missing (nil) bit depth no longer raises; it falls through to 24 bit RGB.
  #
  # @return [String] 'pngmonod', 'pnggray' or 'png16m'
  def gsdevice
    color, _channels, bpc = pdfinfo.color
    # otherwise 24 Bit RGB:
    return 'png16m' unless color == 'gray'

    depth = bpc.to_i
    # 1 Bit Grayscale, if applicable:
    return 'pngmonod' if depth == 1
    # 8 Bit Grayscale, if applicable:
    return 'pnggray' if depth > 1

    'png16m'
  end
end
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module IiifPrint
|
2
|
+
module SplitPdfs
|
3
|
+
# The purpose of this class is to split the PDF into constituent TIFF files.
|
4
|
+
#
|
5
|
+
# @see #each
|
6
|
+
class PagesToTiffsSplitter < BaseSplitter
  self.image_extension = 'tiff'
  DEFAULT_COMPRESSION = 'lzw'.freeze
  self.compression = DEFAULT_COMPRESSION

  private

  # Choose the ghostscript TIFF device matching the PDF's detected color information.
  #
  # NOTE: for 1 bit gray sources this also switches {#compression} to 'g4'; callers
  # must therefore invoke this before reading {#compression}.
  #
  # A missing (nil) bit depth no longer raises; it falls through to the color device.
  #
  # @return [String] the ghostscript device name
  def gsdevice
    color, channels, bpc = pdfinfo.color

    if color == 'gray'
      depth = bpc.to_i
      if depth == 1
        # CCITT Group 4 Black and White, if applicable:
        self.compression = 'g4'
        return 'tiffg4'
      elsif depth > 1
        # 8 Bit Grayscale, if applicable:
        return 'tiffgray'
      end
    end

    # otherwise color:
    colordevice(channels, bpc)
  end

  # @param channels [Integer, nil] number of color channels detected in the PDF
  # @param bpc [Integer, nil] bits per channel detected in the PDF
  # @return [String] 'tiff24nc' or 'tiff48nc'
  def colordevice(channels, bpc)
    # nil inputs coerce to 0 bits, which falls back to 24 below instead of raising.
    bits = bpc.to_i * channels.to_i
    # will be either 8bpc/16bpc color TIFF,
    # with any CMYK source transformed to 8bpc RGB
    bits = 24 unless [24, 48].include? bits
    "tiff#{bits}nc"
  end
end
|
40
|
+
end
|
41
|
+
end
|