iiif_print 1.0.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.github/PULL_REQUEST_TEMPLATE.md +16 -0
- data/.github/workflows/build-lint-test-action.yaml +4 -5
- data/.gitignore +5 -4
- data/.rubocop.yml +1 -0
- data/.solargraph.yml +19 -0
- data/Gemfile.lock +1025 -0
- data/README.md +102 -9
- data/Rakefile +6 -0
- data/app/actors/iiif_print/actors/cleanup_file_sets_actor_decorator.rb +24 -0
- data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +30 -28
- data/app/controllers/iiif_print/split_pdfs_controller.rb +38 -0
- data/app/helpers/iiif_print/iiif_helper_decorator.rb +32 -0
- data/app/helpers/iiif_print/iiif_print_helper_behavior.rb +23 -0
- data/app/helpers/iiif_print_helper.rb +0 -20
- data/app/indexers/concerns/iiif_print/child_work_indexer.rb +27 -0
- data/app/indexers/concerns/iiif_print/file_set_indexer.rb +45 -17
- data/{lib → app/jobs}/iiif_print/jobs/application_job.rb +2 -1
- data/app/jobs/iiif_print/jobs/child_works_from_pdf_job.rb +153 -0
- data/app/jobs/iiif_print/jobs/create_relationships_job.rb +117 -0
- data/app/jobs/iiif_print/jobs/request_split_pdf_job.rb +31 -0
- data/app/listeners/iiif_print/listener.rb +31 -0
- data/app/models/concerns/iiif_print/set_child_flag.rb +10 -1
- data/app/models/concerns/iiif_print/solr/document.rb +19 -3
- data/app/models/iiif_print/iiif_search_decorator.rb +35 -0
- data/app/models/iiif_print/iiif_search_response_decorator.rb +25 -2
- data/app/models/iiif_print/pending_relationship.rb +3 -0
- data/app/presenters/iiif_print/file_set_presenter_decorator.rb +11 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +120 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +1 -1
- data/app/presenters/iiif_print/work_show_presenter_decorator.rb +23 -11
- data/app/search_builders/concerns/iiif_print/allinson_flex_fields.rb +15 -0
- data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +2 -1
- data/app/services/iiif_print/derivative_rodeo_service.rb +382 -0
- data/app/services/iiif_print/manifest_builder_service_behavior.rb +90 -31
- data/app/services/iiif_print/pluggable_derivative_service.rb +8 -10
- data/app/services/iiif_print/simple_schema_loader_decorator.rb +11 -0
- data/app/transactions/hyrax/transactions/iiif_print_container_decorator.rb +34 -0
- data/app/transactions/hyrax/transactions/steps/conditionally_destroy_children_from_split.rb +32 -0
- data/app/transactions/hyrax/transactions/steps/delete_all_file_sets_decorator.rb +35 -0
- data/app/views/catalog/_index_header_list_default.html.erb +13 -0
- data/app/views/hyrax/base/_representative_media.html.erb +4 -3
- data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +1 -1
- data/app/views/hyrax/file_sets/_show_actions.html.erb +24 -0
- data/config/initializers/simple_schema_loader.rb +1 -0
- data/config/locales/iiif_print.en.yml +4 -0
- data/config/metadata/child_works_from_pdf_splitting.yaml +21 -0
- data/config/routes.rb +3 -0
- data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +8 -6
- data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +7 -5
- data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +8 -6
- data/db/migrate/20231110163052_add_model_details_to_iiif_print_pending_relationships.rb +7 -0
- data/docker-compose.yml +2 -2
- data/iiif_print.gemspec +11 -10
- data/lib/generators/iiif_print/install_generator.rb +21 -1
- data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +11 -4
- data/lib/generators/iiif_print/templates/helpers/iiif_print_helper.rb +5 -0
- data/lib/iiif_print/base_derivative_service.rb +14 -2
- data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +58 -6
- data/lib/iiif_print/catalog_search_builder.rb +7 -3
- data/lib/iiif_print/configuration.rb +205 -8
- data/lib/iiif_print/data/fileset_helper.rb +3 -3
- data/lib/iiif_print/data/work_derivatives.rb +4 -4
- data/lib/iiif_print/engine.rb +53 -15
- data/lib/iiif_print/errors.rb +18 -0
- data/lib/iiif_print/homepage_search_builder.rb +17 -0
- data/lib/iiif_print/image_tool.rb +12 -8
- data/lib/iiif_print/jp2_derivative_service.rb +4 -1
- data/lib/iiif_print/lineage_service.rb +47 -13
- data/lib/iiif_print/metadata.rb +67 -48
- data/lib/iiif_print/pdf_derivative_service.rb +3 -1
- data/lib/iiif_print/persistence_layer/active_fedora_adapter.rb +189 -0
- data/lib/iiif_print/persistence_layer/valkyrie_adapter.rb +183 -0
- data/lib/iiif_print/persistence_layer.rb +118 -0
- data/lib/iiif_print/split_pdfs/base_splitter.rb +153 -0
- data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +83 -37
- data/lib/iiif_print/split_pdfs/derivative_rodeo_splitter.rb +166 -0
- data/lib/iiif_print/split_pdfs/destroy_pdf_child_works_service.rb +22 -0
- data/lib/iiif_print/split_pdfs/pages_to_jpgs_splitter.rb +19 -0
- data/lib/iiif_print/split_pdfs/pages_to_pngs_splitter.rb +26 -0
- data/lib/iiif_print/split_pdfs/pages_to_tiffs_splitter.rb +41 -0
- data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +64 -59
- data/lib/iiif_print/text_extraction/hocr_reader.rb +7 -3
- data/lib/iiif_print/text_extraction/page_ocr.rb +5 -4
- data/lib/iiif_print/text_extraction_derivative_service.rb +4 -2
- data/lib/iiif_print/text_formats_from_alto_service.rb +3 -1
- data/lib/iiif_print/tiff_derivative_service.rb +3 -1
- data/lib/iiif_print/version.rb +1 -1
- data/lib/iiif_print.rb +210 -20
- data/lib/samvera/derivatives/configuration.rb +83 -0
- data/lib/samvera/derivatives/hyrax.rb +129 -0
- data/lib/samvera/derivatives.rb +238 -0
- data/tasks/copy_authorities_to_test_app.rake +11 -0
- data/tasks/iiif_print_dev.rake +4 -4
- metadata +111 -196
- data/app/helpers/hyrax/iiif_helper.rb +0 -22
- data/app/indexers/concerns/iiif_print/child_indexer.rb +0 -34
- data/app/views/hyrax/file_sets/_actions.html.erb +0 -45
- data/bin/rails +0 -13
- data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +0 -107
- data/lib/iiif_print/jobs/create_relationships_job.rb +0 -78
- data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +0 -130
- data/spec/.keep.txt +0 -1
- data/spec/factories/ability.rb +0 -6
- data/spec/factories/newspaper_issue.rb +0 -7
- data/spec/factories/newspaper_page.rb +0 -7
- data/spec/factories/newspaper_page_solr_document.rb +0 -12
- data/spec/factories/newspaper_title.rb +0 -8
- data/spec/factories/uploaded_pdf_file.rb +0 -9
- data/spec/factories/uploaded_txt_file.rb +0 -9
- data/spec/factories/user.rb +0 -13
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +0 -7
- data/spec/fixtures/files/alto-2-0.xsd +0 -714
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +0 -16
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +0 -31
- data/spec/fixtures/files/ndnp-alto-sample.xml +0 -24
- data/spec/fixtures/files/ndnp-sample1-json.json +0 -1
- data/spec/fixtures/files/ndnp-sample1-txt.txt +0 -1
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +0 -202
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +0 -202
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/ocr_mono_text_hocr.html +0 -78
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/helpers/hyrax/iiif_helper_spec.rb +0 -65
- data/spec/helpers/iiif_print_helper_spec.rb +0 -43
- data/spec/iiif_print/base_derivative_service_spec.rb +0 -11
- data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +0 -51
- data/spec/iiif_print/catalog_search_builder_spec.rb +0 -60
- data/spec/iiif_print/configuration_spec.rb +0 -67
- data/spec/iiif_print/data/work_derivatives_spec.rb +0 -245
- data/spec/iiif_print/data/work_file_spec.rb +0 -99
- data/spec/iiif_print/data/work_files_spec.rb +0 -237
- data/spec/iiif_print/image_tool_spec.rb +0 -109
- data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +0 -30
- data/spec/iiif_print/jobs/create_relationships_job_spec.rb +0 -17
- data/spec/iiif_print/jp2_image_metadata_spec.rb +0 -37
- data/spec/iiif_print/lineage_service_spec.rb +0 -13
- data/spec/iiif_print/metadata_spec.rb +0 -115
- data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +0 -6
- data/spec/iiif_print/text_extraction/alto_reader_spec.rb +0 -49
- data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +0 -45
- data/spec/iiif_print/text_extraction/page_ocr_spec.rb +0 -84
- data/spec/iiif_print/text_extraction/render_alto_spec.rb +0 -54
- data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +0 -44
- data/spec/iiif_print_spec.rb +0 -51
- data/spec/misc_shared.rb +0 -111
- data/spec/models/iiif_print/derivative_attachment_spec.rb +0 -37
- data/spec/models/iiif_print/ingest_file_relation_spec.rb +0 -56
- data/spec/models/solr_document_spec.rb +0 -14
- data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +0 -19
- data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +0 -49
- data/spec/services/iiif_print/jp2_derivative_service_spec.rb +0 -59
- data/spec/services/iiif_print/pdf_derivative_service_spec.rb +0 -66
- data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +0 -178
- data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +0 -82
- data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +0 -127
- data/spec/services/iiif_print/tiff_derivative_service_spec.rb +0 -65
- data/spec/spec_helper.rb +0 -181
- data/spec/support/controller_level_helpers.rb +0 -28
- data/spec/support/iiif_print_models.rb +0 -127
- data/spec/test_app_templates/blacklight.yml +0 -9
- data/spec/test_app_templates/fedora.yml +0 -15
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +0 -40
- data/spec/test_app_templates/redis.yml +0 -9
- data/spec/test_app_templates/solr/conf/schema.xml +0 -362
- data/spec/test_app_templates/solr/conf/solrconfig.xml +0 -322
- data/spec/test_app_templates/solr.yml +0 -7
@@ -8,78 +8,83 @@ module IiifPrint
|
|
8
8
|
# For dpi extraction, falls back to calculating using MiniMagick,
|
9
9
|
# if necessary.
|
10
10
|
class PdfImageExtractionService
|
11
|
-
# class constant column numbers
|
12
|
-
COL_WIDTH = 3
|
13
|
-
COL_HEIGHT = 4
|
14
|
-
COL_COLOR = 5
|
15
|
-
COL_CHANNELS = 6
|
16
|
-
COL_BITS = 7
|
17
|
-
# only poppler 0.25+ has this column in output:
|
18
|
-
COL_XPPI = 12
|
19
|
-
|
20
11
|
def initialize(path)
|
21
12
|
@path = path
|
22
|
-
|
23
|
-
@output = nil
|
24
|
-
@entries = nil
|
13
|
+
process(command: format('pdfimages -list %<path>s 2>/dev/null', path: path))
|
25
14
|
end
|
26
15
|
|
27
|
-
|
28
|
-
|
29
|
-
if @output.nil?
|
30
|
-
Open3.popen3(@cmd) do |_stdin, stdout, _stderr, _wait_thr|
|
31
|
-
@output = stdout.read.split("\n")
|
32
|
-
end
|
33
|
-
end
|
34
|
-
@output.slice(2, @output.size - 1)
|
35
|
-
end
|
16
|
+
attr_reader :path, :page_count, :width, :height, :pixels_per_inch
|
17
|
+
alias ppi pixels_per_inch
|
36
18
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
output = process
|
41
|
-
(0..output.size - 1).each do |i|
|
42
|
-
@entries.push(output[i].gsub(/\s+/m, ' ').strip.split(" "))
|
43
|
-
end
|
44
|
-
end
|
45
|
-
@entries
|
19
|
+
# @return [Array<String, Integer, Integer>]
|
20
|
+
def color
|
21
|
+
[@color_description, @channels, @bits]
|
46
22
|
end
|
47
23
|
|
48
|
-
|
49
|
-
result = entries.map { |e| e[i] }
|
50
|
-
return result.map!(&block) if block_given?
|
51
|
-
result
|
52
|
-
end
|
24
|
+
private
|
53
25
|
|
54
|
-
|
55
|
-
|
56
|
-
|
26
|
+
# class constant column numbers
|
27
|
+
COL_WIDTH = 3
|
28
|
+
COL_HEIGHT = 4
|
29
|
+
COL_COLOR_DESC = 5
|
30
|
+
COL_CHANNELS = 6
|
31
|
+
COL_BITS = 7
|
32
|
+
# only poppler 0.25+ has this column in output:
|
33
|
+
COL_XPPI = 12
|
57
34
|
|
58
|
-
|
59
|
-
|
60
|
-
|
35
|
+
# rubocop:disable Metrics/AbcSize - Because this helps us process the results in one loop.
|
36
|
+
# rubocop:disable Metrics/MethodLength - Again, to help speed up the processing loop.
|
37
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
38
|
+
# rubocop:disable Metrics/PerceivedComplexity
|
39
|
+
#
|
40
|
+
# The first two lines are tabular header information:
|
41
|
+
#
|
42
|
+
# Example:
|
43
|
+
#
|
44
|
+
# bash-5.1$ pdfimages -list fmc_color.pdf | head -5
|
45
|
+
# page num type width height color comp bpc enc interp object ID x-ppi y-ppi size ratio
|
46
|
+
# --------------------------------------------------------------------------------------------
|
47
|
+
# 1 0 image 2475 413 rgb 3 8 jpeg no 10 0 300 300 21.8K 0.7%
|
48
|
+
def process(command:)
|
49
|
+
@page_count = 0
|
50
|
+
@color_description = 'gray'
|
51
|
+
@width = 0
|
52
|
+
@height = 0
|
53
|
+
@channels = 0
|
54
|
+
@bits = 0
|
55
|
+
@pixels_per_inch = 0
|
56
|
+
Open3.popen3(command) do |_stdin, stdout, _stderr, _wait_thr|
|
57
|
+
stdout.read.split("\n").each_with_index do |line, index|
|
58
|
+
# Skip the two header lines
|
59
|
+
next if index <= 1
|
60
|
+
@page_count += 1
|
61
|
+
cells = line.gsub(/\s+/m, ' ').strip.split(" ")
|
61
62
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
|
68
|
-
bits = entries.map { |e| e[COL_BITS].to_i }.max
|
69
|
-
[desc, channels, bits]
|
70
|
-
end
|
63
|
+
@color_description = 'rgb' if cells[COL_COLOR_DESC] != 'gray'
|
64
|
+
@width = cells[COL_WIDTH].to_i if cells[COL_WIDTH].to_i > @width
|
65
|
+
@height = cells[COL_HEIGHT].to_i if cells[COL_HEIGHT].to_i > @height
|
66
|
+
@channels = cells[COL_CHANNELS].to_i if cells[COL_CHANNELS].to_i > @channels
|
67
|
+
@bits = cells[COL_BITS].to_i if cells[COL_BITS].to_i > @bits
|
71
68
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
69
|
+
# In the case of poppler version < 0.25, we will have no more than 12 columns. As such,
|
70
|
+
# we need to do some alternative magic to calculate this.
|
71
|
+
if @page_count == 1 && cells.size <= 12
|
72
|
+
pdf = MiniMagick::Image.open(@path)
|
73
|
+
width_points = pdf.width
|
74
|
+
width_px = width
|
75
|
+
@pixels_per_inch = (72 * width_px / width_points).to_i
|
76
|
+
elsif cells[COL_XPPI].to_i > @pixels_per_inch
|
77
|
+
# By the magic of nil#to_i if we don't have more than 12 columns, we've already set
|
78
|
+
# the @pixels_per_inch and this line won't do much of anything.
|
79
|
+
@pixels_per_inch = cells[COL_XPPI].to_i
|
80
|
+
end
|
81
|
+
end
|
79
82
|
end
|
80
|
-
# with poppler 0.25+, pdfimages just gives us this:
|
81
|
-
selectcolumn(COL_XPPI, &:to_i).max
|
82
83
|
end
|
84
|
+
# rubocop:enable Metrics/AbcSize
|
85
|
+
# rubocop:enable Metrics/MethodLength
|
86
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
87
|
+
# rubocop:enable Metrics/PerceivedComplexity
|
83
88
|
end
|
84
89
|
end
|
85
90
|
end
|
@@ -84,6 +84,7 @@ module IiifPrint
|
|
84
84
|
# add trailing space to plaintext buffer for between words:
|
85
85
|
@text += ' '
|
86
86
|
@words.push(@current) if word_complete?
|
87
|
+
@current = nil # clear the current word
|
87
88
|
end
|
88
89
|
|
89
90
|
def end_line
|
@@ -120,9 +121,12 @@ module IiifPrint
|
|
120
121
|
# for current word, and append line endings to plain text:
|
121
122
|
#
|
122
123
|
# @param name [String] element name.
|
123
|
-
def end_element(
|
124
|
-
|
125
|
-
|
124
|
+
def end_element(name)
|
125
|
+
if name == 'span'
|
126
|
+
end_word if @element_class_name == 'ocrx_word'
|
127
|
+
@text += "\n" if @element_class_name.nil?
|
128
|
+
end
|
129
|
+
@element_class_name = nil
|
126
130
|
end
|
127
131
|
|
128
132
|
# Callback for completion of parsing hOCR, used to normalize generated
|
@@ -9,7 +9,7 @@ module IiifPrint
|
|
9
9
|
class PageOCR
|
10
10
|
attr_accessor :html, :path
|
11
11
|
|
12
|
-
def initialize(path,
|
12
|
+
def initialize(path, additional_tesseract_options: IiifPrint.config.additional_tesseract_options)
|
13
13
|
@path = path
|
14
14
|
# hOCR html:
|
15
15
|
@html = nil
|
@@ -17,13 +17,14 @@ module IiifPrint
|
|
17
17
|
@source_meta = nil
|
18
18
|
@box = nil
|
19
19
|
@plain = nil
|
20
|
-
@
|
20
|
+
@additional_tesseract_options = additional_tesseract_options
|
21
21
|
end
|
22
22
|
|
23
23
|
def run_ocr
|
24
24
|
outfile = File.join(Dir.mktmpdir, 'output_html')
|
25
|
-
cmd = "tesseract #{path} #{outfile}
|
26
|
-
cmd += " #{@
|
25
|
+
cmd = "OMP_THREAD_LIMIT=1 tesseract #{path} #{outfile}"
|
26
|
+
cmd += " #{@additional_tesseract_options}" if @additional_tesseract_options.present?
|
27
|
+
cmd += " hocr"
|
27
28
|
`#{cmd}`
|
28
29
|
outfile + '.hocr'
|
29
30
|
end
|
@@ -28,13 +28,15 @@ module IiifPrint
|
|
28
28
|
|
29
29
|
ocr_derivatives.each do |extension, method_name|
|
30
30
|
path = prepare_path(extension.to_s)
|
31
|
-
write(content: ocr.public_send(method_name), path: path)
|
31
|
+
write(content: ocr.public_send(method_name), path: path, extension: extension)
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
-
def write(content:, path:)
|
35
|
+
def write(content:, path:, extension:)
|
36
|
+
mime_type = mime_type_for(extension)
|
36
37
|
File.open(path, 'w') do |outfile|
|
37
38
|
outfile.write(content)
|
39
|
+
IiifPrint.copy_derivatives_from_data_store(stream: content, directives: { url: path, container: 'extracted_text', mime_type: mime_type })
|
38
40
|
end
|
39
41
|
end
|
40
42
|
|
@@ -4,9 +4,10 @@ module IiifPrint
|
|
4
4
|
# NOTE: to keep this from conflicting with TextExtractionDerivativeService,
|
5
5
|
# this class should be invoked by it, not PluggableDerivativeService.
|
6
6
|
class TextFormatsFromALTOService < BaseDerivativeService
|
7
|
-
self.target_extension = '
|
7
|
+
self.target_extension = 'txt'.freeze
|
8
8
|
|
9
9
|
def save_derivative(destination, data)
|
10
|
+
mime_type = mime_type_for(destination)
|
10
11
|
# Load/prepare base of "pairtree" dir structure for extension, fileset
|
11
12
|
prepare_path(destination)
|
12
13
|
#
|
@@ -17,6 +18,7 @@ module IiifPrint
|
|
17
18
|
# Write data as UTF-8 encoded text
|
18
19
|
File.open(save_path, "w:UTF-8") do |f|
|
19
20
|
f.write(data)
|
21
|
+
IiifPrint.copy_derivatives_from_data_store(stream: data, directives: { url: file_set.id.to_s, container: 'extracted_text', mime_type: mime_type })
|
20
22
|
end
|
21
23
|
end
|
22
24
|
|
@@ -32,7 +32,9 @@ module IiifPrint
|
|
32
32
|
source_path += '[0]' if @source_path.ends_with?('pdf')
|
33
33
|
template = use_color? ? COLOR_CMD : GRAY_CMD
|
34
34
|
template = MONO_CMD if one_bit?
|
35
|
-
format(template, source_file: source_path, out_file: @dest_path)
|
35
|
+
data = format(template, source_file: source_path, out_file: @dest_path)
|
36
|
+
IiifPrint.copy_derivatives_from_data_store(stream: data, directives: { url: file_set.id.to_s, container: 'service_file', mime_type: mime_type_for(target_extension) })
|
37
|
+
data
|
36
38
|
end
|
37
39
|
|
38
40
|
def create_derivatives(filename)
|
data/lib/iiif_print/version.rb
CHANGED
data/lib/iiif_print.rb
CHANGED
@@ -14,13 +14,16 @@ require "iiif_print/tiff_derivative_service"
|
|
14
14
|
require "iiif_print/lineage_service"
|
15
15
|
require "iiif_print/metadata"
|
16
16
|
require "iiif_print/works_controller_behavior"
|
17
|
-
require "iiif_print/jobs/application_job"
|
18
17
|
require "iiif_print/blacklight_iiif_search/annotation_decorator"
|
19
|
-
require "iiif_print/
|
20
|
-
require "iiif_print/jobs/create_relationships_job"
|
21
|
-
require "iiif_print/split_pdfs/pages_into_images_service"
|
18
|
+
require "iiif_print/split_pdfs/base_splitter"
|
22
19
|
require "iiif_print/split_pdfs/child_work_creation_from_pdf_service"
|
20
|
+
require "iiif_print/split_pdfs/derivative_rodeo_splitter"
|
21
|
+
require "iiif_print/split_pdfs/destroy_pdf_child_works_service"
|
22
|
+
require "iiif_print/persistence_layer"
|
23
|
+
require "iiif_print/persistence_layer/active_fedora_adapter"
|
24
|
+
require "iiif_print/persistence_layer/valkyrie_adapter"
|
23
25
|
|
26
|
+
# rubocop:disable Metrics/ModuleLength
|
24
27
|
module IiifPrint
|
25
28
|
extend ActiveSupport::Autoload
|
26
29
|
autoload :Configuration
|
@@ -28,9 +31,10 @@ module IiifPrint
|
|
28
31
|
|
29
32
|
##
|
30
33
|
# @api public
|
34
|
+
#
|
31
35
|
# Exposes the IiifPrint configuration.
|
32
36
|
#
|
33
|
-
# @
|
37
|
+
# @yieldparam [IiifPrint::Configuration] config if a block is passed
|
34
38
|
# @return [IiifPrint::Configuration]
|
35
39
|
# @see IiifPrint::Configuration for configuration options
|
36
40
|
def self.config(&block)
|
@@ -39,28 +43,64 @@ module IiifPrint
|
|
39
43
|
@config
|
40
44
|
end
|
41
45
|
|
46
|
+
class << self
|
47
|
+
delegate(
|
48
|
+
:persistence_adapter,
|
49
|
+
:skip_splitting_pdf_files_that_end_with_these_texts,
|
50
|
+
to: :config
|
51
|
+
)
|
52
|
+
|
53
|
+
delegate(
|
54
|
+
:clean_for_tests!,
|
55
|
+
:copy_derivatives_from_data_store,
|
56
|
+
:create_relationship_between,
|
57
|
+
:destroy_children_split_from,
|
58
|
+
:extract_text_for,
|
59
|
+
:find_by,
|
60
|
+
:find_by_title_for,
|
61
|
+
:grandparent_for,
|
62
|
+
:index_works,
|
63
|
+
:object_in_works,
|
64
|
+
:object_ordered_works,
|
65
|
+
:parent_for,
|
66
|
+
:pdf?,
|
67
|
+
:save,
|
68
|
+
:solr_construct_query,
|
69
|
+
:solr_name,
|
70
|
+
:solr_query,
|
71
|
+
to: :persistence_adapter
|
72
|
+
)
|
73
|
+
end
|
74
|
+
|
75
|
+
# NOTE: We use lambdas so we can have default values but also provide a lazy configuration.
|
76
|
+
# There are certainly better ways but this is the least intrusive refactor from prior state.
|
42
77
|
DEFAULT_MODEL_CONFIGURATION = {
|
43
78
|
# Split a PDF into individual page images and create a new child work for each image.
|
44
|
-
pdf_splitter_job: IiifPrint::Jobs::ChildWorksFromPdfJob,
|
45
|
-
pdf_splitter_service: IiifPrint::SplitPdfs::
|
46
|
-
derivative_service_plugins:
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
]
|
79
|
+
pdf_splitter_job: -> { IiifPrint::Jobs::ChildWorksFromPdfJob },
|
80
|
+
pdf_splitter_service: -> { IiifPrint::SplitPdfs::PagesToJpgsSplitter },
|
81
|
+
derivative_service_plugins: lambda {
|
82
|
+
[
|
83
|
+
IiifPrint::TextExtractionDerivativeService
|
84
|
+
]
|
85
|
+
}
|
52
86
|
}.freeze
|
53
87
|
|
54
88
|
# This is the record level configuration for PDF split handling.
|
55
89
|
ModelConfig = Struct.new(:pdf_split_child_model, *DEFAULT_MODEL_CONFIGURATION.keys, keyword_init: true)
|
90
|
+
private_constant :ModelConfig
|
56
91
|
|
57
|
-
|
92
|
+
##
|
93
|
+
# @api public
|
94
|
+
# This method is responsible for configuring a model for additional derivative generation.
|
58
95
|
#
|
59
96
|
# @example
|
60
97
|
# class Book < ActiveFedora::Base
|
61
98
|
# include IiifPrint.model_configuration(
|
62
99
|
# pdf_split_child_model: Page,
|
63
100
|
# derivative_service_plugins: [
|
101
|
+
# IiifPrint::JP2DerivativeService,
|
102
|
+
# IiifPrint::PDFDerivativeService,
|
103
|
+
# IiifPrint::TextExtractionDerivativeService,
|
64
104
|
# IiifPrint::TIFFDerivativeService
|
65
105
|
# ]
|
66
106
|
# )
|
@@ -68,29 +108,73 @@ module IiifPrint
|
|
68
108
|
#
|
69
109
|
# @param kwargs [Hash<Symbol,Object>] the configuration values that overrides the
|
70
110
|
# DEFAULT_MODEL_CONFIGURATION.
|
111
|
+
# @option kwargs [Array<Class>] derivative_service_plugins the various derivatives to run on the
|
112
|
+
# "original" files associated with this work. Options include:
|
113
|
+
# {IiifPrint::JP2DerivativeService}, {IiifPrint::PDFDerivativeService},
|
114
|
+
# {IiifPrint::TextExtractionDerivativeService}, {IiifPrint::TIFFDerivativeService}
|
115
|
+
# @option kwargs [Class] pdf_splitter_job responsible for handling the splitting of the original file
|
116
|
+
# @option kwargs [Class] pdf_split_child_model when we split the file into pages, what's the child model
|
117
|
+
# we want for those pages? Often times this is likely the same model as the parent.
|
118
|
+
# @option kwargs [Class] pdf_splitter_service the specific service that splits the PDF. Options are:
|
119
|
+
# {IiifPrint::SplitPdfs::PagesToJpgsSplitter},
|
120
|
+
# {IiifPrint::SplitPdfs::PagesToTiffsSplitter},
|
121
|
+
# {IiifPrint::SplitPdfs::PagesToPngsSplitter},
|
122
|
+
# {IiifPrint::SplitPdfs::DerivativeRodeoSplitter}
|
71
123
|
#
|
72
124
|
# @return [Module]
|
73
125
|
#
|
74
126
|
# @see IiifPrint::DEFAULT_MODEL_CONFIGURATION
|
75
127
|
# @todo Because not every job will split PDFs and write to a child model. May want to introduce
|
76
128
|
# an alternative splitting method to create new filesets on the existing work instead of new child works.
|
129
|
+
# rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
77
130
|
def self.model_configuration(**kwargs)
|
78
131
|
Module.new do
|
79
|
-
|
80
|
-
|
132
|
+
extend ActiveSupport::Concern
|
133
|
+
|
134
|
+
included do
|
135
|
+
work_type = self # In this case self is the class we're mixing the new module into.
|
136
|
+
|
137
|
+
# Ensure that the work_type and corresponding indexer are properly decorated for IiifPrint
|
138
|
+
indexer = if defined?(Valkyrie::Resource) && work_type < Valkyrie::Resource
|
139
|
+
IiifPrint::PersistenceLayer::ValkyrieAdapter.decorate_with_adapter_logic(work_type: work_type)
|
140
|
+
elsif work_type < ActiveFedora::Base
|
141
|
+
IiifPrint::PersistenceLayer::ActiveFedoraAdapter.decorate_with_adapter_logic(work_type: work_type)
|
142
|
+
else
|
143
|
+
raise "Unable to mix '.model_configuration' into #{work_type}"
|
144
|
+
end
|
145
|
+
|
146
|
+
# Ensure that the work_type and corresponding indexer are properly decorated for IiifPrint
|
147
|
+
if defined?(Valkyrie::Resource) && work_type < Valkyrie::Resource
|
148
|
+
IiifPrint::PersistenceLayer::ValkyrieAdapter.decorate_form_with_adapter_logic(work_type: work_type)
|
149
|
+
elsif work_type < ActiveFedora::Base
|
150
|
+
IiifPrint::PersistenceLayer::ActiveFedoraAdapter.decorate_form_with_adapter_logic(work_type: work_type)
|
151
|
+
else
|
152
|
+
raise "Unable to mix '.model_configuration' into #{work_type}"
|
153
|
+
end
|
154
|
+
|
155
|
+
# Deriving lineage of objects is a potentially complicated thing. We provide a default
|
156
|
+
# service but each work_type's indexer can be configured by amending its
|
157
|
+
# {.iiif_print_lineage_service}.
|
158
|
+
indexer.class_attribute(:iiif_print_lineage_service, default: IiifPrint::LineageService) unless indexer.respond_to?(:iiif_print_lineage_service)
|
159
|
+
work_type::GeneratedResourceSchema.send(:include, IiifPrint::SetChildFlag) if work_type.const_defined?(:GeneratedResourceSchema)
|
81
160
|
end
|
82
161
|
|
83
162
|
# We don't know what you may want in your configuration, but from this gem's implementation,
|
84
163
|
# we're going to provide the defaults to ensure that it works.
|
85
164
|
DEFAULT_MODEL_CONFIGURATION.each_pair do |key, default_value|
|
86
|
-
kwargs[key] ||= default_value
|
165
|
+
kwargs[key] ||= default_value.call
|
87
166
|
end
|
88
167
|
|
89
168
|
define_method(:iiif_print_config) do
|
90
169
|
@iiif_print_config ||= ModelConfig.new(**kwargs)
|
91
170
|
end
|
171
|
+
|
172
|
+
def iiif_print_config?
|
173
|
+
true
|
174
|
+
end
|
92
175
|
end
|
93
176
|
end
|
177
|
+
# rubocop:enable Metrics/MethodLength
|
94
178
|
|
95
179
|
# @api public
|
96
180
|
#
|
@@ -107,7 +191,7 @@ module IiifPrint
|
|
107
191
|
# @see Hyrax::IiifManifestPresenter#manifest_metadata
|
108
192
|
def self.manifest_metadata_for(work:,
|
109
193
|
version: config.default_iiif_manifest_version,
|
110
|
-
fields:
|
194
|
+
fields: defined?(AllinsonFlex) ? fields_for_allinson_flex : default_fields,
|
111
195
|
current_ability:,
|
112
196
|
base_url:)
|
113
197
|
Metadata.build_metadata_for(work: work,
|
@@ -117,6 +201,11 @@ module IiifPrint
|
|
117
201
|
base_url: base_url)
|
118
202
|
end
|
119
203
|
|
204
|
+
def self.manifest_metadata_from(work:, presenter:)
|
205
|
+
current_ability = presenter.try(:ability) || presenter.try(:current_ability)
|
206
|
+
base_url = presenter.try(:base_url) || presenter.try(:request)&.base_url
|
207
|
+
IiifPrint.manifest_metadata_for(work: work, current_ability: current_ability, base_url: base_url)
|
208
|
+
end
|
120
209
|
# Hash is an arbitrary attribute key/value pairs
|
121
210
|
# Struct is a defined set of attribute "keys". When we favor defined values,
|
122
211
|
# then we are naming the concept and defining the range of potential values.
|
@@ -124,13 +213,114 @@ module IiifPrint
|
|
124
213
|
|
125
214
|
# @api private
|
126
215
|
# @todo Figure out a way to use a custom label; right now it is rendered from the title.
|
127
|
-
def self.
|
216
|
+
def self.default_fields(fields: config.metadata_fields)
|
128
217
|
fields.map do |field|
|
129
218
|
Field.new(
|
130
219
|
name: field.first,
|
131
|
-
label: Hyrax::Renderers::AttributeRenderer.new(field, nil).label,
|
220
|
+
label: Hyrax::Renderers::AttributeRenderer.new(field.first, nil).label,
|
132
221
|
options: field.last
|
133
222
|
)
|
134
223
|
end
|
135
224
|
end
|
225
|
+
|
226
|
+
##
|
227
|
+
# @param fields [Array<IiifPrint::Field>]
|
228
|
+
def self.fields_for_allinson_flex(fields: allinson_flex_fields, sort_order: IiifPrint.config.iiif_metadata_field_presentation_order)
|
229
|
+
fields = sort_af_fields!(fields, sort_order: sort_order)
|
230
|
+
fields.each_with_object({}) do |field, hash|
|
231
|
+
# filters out admin_only fields
|
232
|
+
next if field.indexing&.include?('admin_only')
|
233
|
+
|
234
|
+
# WARNING: This is assuming A LOT
|
235
|
+
# This is taking the Allinson Flex fields that have the same name and only
|
236
|
+
# using the first one while discarding the rest. There is currently no way to
|
237
|
+
# control which one(s) are discarded but this fits for the moment.
|
238
|
+
next if hash.key?(field.name)
|
239
|
+
|
240
|
+
# currently only supports the faceted option
|
241
|
+
# Why the `render_as:`? This was originally derived from Hyku default attributes
|
242
|
+
# @see https://github.com/samvera/hyku/blob/c702844de4c003eaa88eb5a7514c7a1eae1b289e/app/views/hyrax/base/_attribute_rows.html.erb#L3
|
243
|
+
hash[field.name] = Field.new(
|
244
|
+
name: field.name,
|
245
|
+
label: field.value,
|
246
|
+
options: field.indexing&.include?('facetable') ? { render_as: :faceted } : nil
|
247
|
+
)
|
248
|
+
end.values
|
249
|
+
end
|
250
|
+
|
251
|
+
CollectionFieldShim = Struct.new(:name, :value, :indexing, keyword_init: true)
|
252
|
+
|
253
|
+
##
|
254
|
+
# @return [Array<IiifPrint::Field>]
|
255
|
+
def self.allinson_flex_fields
|
256
|
+
return @allinson_flex_fields if defined?(@allinson_flex_fields)
|
257
|
+
|
258
|
+
allinson_flex_relation = AllinsonFlex::ProfileProperty
|
259
|
+
.joins(:texts)
|
260
|
+
.where(allinson_flex_profile_texts: { name: 'display_label' })
|
261
|
+
.distinct
|
262
|
+
.select(:name, :value, :indexing)
|
263
|
+
flex_fields = allinson_flex_relation.to_a
|
264
|
+
unless allinson_flex_relation.exists?(name: 'collection')
|
265
|
+
collection_field = CollectionFieldShim.new(name: :collection, value: 'Collection', indexing: [])
|
266
|
+
flex_fields << collection_field
|
267
|
+
end
|
268
|
+
@allinson_flex_fields = flex_fields
|
269
|
+
end
|
270
|
+
|
271
|
+
##
|
272
|
+
# @param fields [Array<IiifPrint::Field>]
|
273
|
+
# @param sort_order [Array<Symbol>]
|
274
|
+
def self.sort_af_fields!(fields, sort_order:)
  # Nothing to order by: hand back the given collection untouched.
  return fields if sort_order.blank?

  # NOTE: despite the bang in the name, `sort_by` builds and returns a NEW array; the
  # `fields` argument itself is not reordered, so callers must use the return value.
  #
  # Fields whose name is absent from sort_order all share the rank "one past the last
  # listed position", which places them after every listed field.
  unlisted_rank = sort_order.length
  fields.sort_by { |entry| sort_order.index(entry.name.to_sym) || unlisted_rank }
end
|
282
|
+
|
283
|
+
##
|
284
|
+
# @api public
|
285
|
+
#
|
286
|
+
# @param work [ActiveFedora::Base]
|
287
|
+
# @param file_set [FileSet]
|
288
|
+
# @param locations [Array<String>]
|
289
|
+
# @param user [User]
|
290
|
+
#
|
291
|
+
# @return [Symbol] when none of the locations are to be split.
|
292
|
+
def self.conditionally_submit_split_for(work:, file_set:, locations:, user:, skip_these_endings: skip_splitting_pdf_files_that_end_with_these_texts)
  # Keep only the locations that qualify for splitting given the endings to skip.
  splittable_locations = locations.select do |candidate|
    split_for_path_suffix?(candidate, skip_these_endings: skip_these_endings)
  end
  return :no_pdfs_for_splitting if splittable_locations.empty?

  job_arguments = [
    # Hyrax::FileSet ids are Valkyrie::ID's which can't be passed to the job, so unwrap
    # to the inner id when there is one and otherwise use the id as given.
    file_set.id.try(:id) || file_set.id,
    splittable_locations,
    user,
    # Same unwrapping for the admin set id.
    work.admin_set_id.try(:id) || work.admin_set_id,
    0 # A no longer used parameter; but we need to preserve the method signature (for now)
  ]

  # Enqueue only when the work exposes an iiif_print_config that has a pdf_splitter_job.
  work.try(:iiif_print_config)&.pdf_splitter_job&.perform_later(*job_arguments)
end
|
308
|
+
|
309
|
+
##
|
310
|
+
# @api public
|
311
|
+
#
|
312
|
+
# @param path [String] the path, hopefully with an extension, to the file we're considering
|
313
|
+
# splitting.
|
314
|
+
# @param skip_these_endings [Array<#downcase>] the endings that we should skip for splitting
|
315
|
+
# purposes.
|
316
|
+
# @return [TrueClass] when the path is one we should split
|
317
|
+
# @return [FalseClass] when the path is one we should not split
|
318
|
+
#
|
319
|
+
# @see .skip_splitting_pdf_files_that_end_with_these_texts
|
320
|
+
def self.split_for_path_suffix?(path, skip_these_endings: skip_splitting_pdf_files_that_end_with_these_texts)
  # All suffix comparisons are case-insensitive, so normalize the path once.
  normalized_path = path.downcase

  # Only PDFs are candidates for splitting.
  return false unless normalized_path.end_with?('.pdf')
  # With nothing to skip, every PDF is split.
  return true if skip_these_endings.empty?

  # Split unless the path ends with one of the (case-normalized) endings to skip.
  skip_these_endings.map(&:downcase).none? { |ending| normalized_path.end_with?(ending) }
end
|
136
325
|
end
|
326
|
+
# rubocop:enable Metrics/ModuleLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Samvera
  module Derivatives
    ##
    # Contains the explicit derivative generation directives for the upstream application.
    #
    # @note The implicit derivative types for Hyrax are as follows:
    #       - type :extracted_text with sources [:pdf, :office_document]
    #       - type :thumbnail with sources [:pdf, :office_document, :thumbnail, :image]
    #       - type :mp3 with sources [:audio]
    #       - type :ogg with sources [:audio]
    #       - type :webm with sources [:video]
    #       - type :mp4 with sources [:video]
    #
    # @note A long-standing practice of Samvera's Hyrax has been to have assumptive and implicit
    #       derivative generation (see Hyrax::FileSetDerivativesService). In being implicit, a
    #       challenge arises, namely overriding and configuring. There exists a crease in the
    #       code to allow for a different derivative approach (see Hyrax::DerivativeService).
    #       Yet that approach continues the tradition of implicit work.
    class Configuration
      # One registration: the derivative type, the strategies for locating and applying it,
      # and a callable that decides whether it applies to a given file set.
      #
      # TODO: Consider the appropriate extension
      RegisteredType = Struct.new(:type, :locators, :applicators, :applicability, keyword_init: true) do
        # @param file_set [Object] handed unchanged to the applicability callable
        # @return [Object] whatever the applicability callable returns
        def applicable_for?(file_set:)
          applicability.call(file_set)
        end
      end

      def initialize
        # A Hash keeps lookup simple and guarantees at most one entry per type.
        @registered_types = {}
      end

      ##
      # @api public
      #
      # Registers the given type, replacing any earlier registration under the same type.
      #
      # @param type [Symbol] The named type of derivative
      # @param locators [Array<Samvera::Derivatives::FileLocator::Strategy>] The strategies that
      #        we'll attempt in finding the derivative that we will later apply.
      # @param applicators [Array<Samvera::Derivatives::FileApplicator::Strategy>] The strategies
      #        that we'll use to apply the found derivative to the {FileSet}
      #
      # @yield [file_set] optional applicability check; when no block is given, the type is
      #        treated as applicable to every file set.
      #
      # @return [RegisteredType]
      #
      # @note What is the best mechanism for naming the sources? At present we're doing a lot of
      #       assumption on the types.
      def register(type:, locators:, applicators:, &applicability)
        # Should the validator be required?
        key = type.to_sym
        registration = RegisteredType.new(
          type: key,
          locators: Array(locators),
          applicators: Array(applicators),
          applicability: applicability || default_applicability
        )
        @registered_types[key] = registration
      end

      ##
      # @api public
      #
      # @param type [Symbol]
      #
      # @return [RegisteredType] the registration for the type, or an empty, never-applicable
      #         one when the type has not been registered.
      def registry_for(type:)
        key = type.to_sym
        @registered_types.fetch(key) { empty_registry_for(type: key) }
      end

      private

      # Placeholder for unregistered types: no locators, no applicators, never applicable.
      def empty_registry_for(type:)
        never_applicable = lambda { |_file_set| false }
        RegisteredType.new(type: type, locators: [], applicators: [], applicability: never_applicable)
      end

      # We're going to assume a type is applicable unless configured otherwise.
      def default_applicability
        lambda { |_file_set| true }
      end
    end
  end
end
|