iiif_print 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.coveralls.yml +2 -0
- data/.env +5 -0
- data/.fcrepo_wrapper +4 -0
- data/.github/release.yml +20 -0
- data/.github/workflows/branches.yml +24 -0
- data/.github/workflows/build-lint-test-action.yaml +33 -0
- data/.github/workflows/release_labels.yml +25 -0
- data/.gitignore +52 -0
- data/.rubocop.yml +177 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +49 -0
- data/CONTRIBUTING.md +181 -0
- data/Dockerfile +15 -0
- data/Gemfile +52 -0
- data/LICENSE +203 -0
- data/README.md +203 -0
- data/Rakefile +38 -0
- data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +56 -0
- data/app/assets/config/iiif_print_manifest.js +2 -0
- data/app/assets/images/iiif_print/.keep +0 -0
- data/app/assets/javascripts/iiif_print/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/iiif_print/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/iiif_print.js +3 -0
- data/app/assets/stylesheets/iiif_print/_iiif_print.scss +4 -0
- data/app/assets/stylesheets/iiif_print/_issue_search.scss +13 -0
- data/app/assets/stylesheets/iiif_print/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/iiif_print/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/iiif_print/_search_results.scss +6 -0
- data/app/helpers/hyrax/iiif_helper.rb +22 -0
- data/app/helpers/iiif_print/application_helper.rb +5 -0
- data/app/helpers/iiif_print_helper.rb +64 -0
- data/app/indexers/concerns/iiif_print/child_indexer.rb +34 -0
- data/app/indexers/concerns/iiif_print/file_set_indexer.rb +29 -0
- data/app/mailers/iiif_print/application_mailer.rb +8 -0
- data/app/models/concerns/iiif_print/set_child_flag.rb +29 -0
- data/app/models/concerns/iiif_print/solr/document.rb +47 -0
- data/app/models/iiif_print/application_record.rb +6 -0
- data/app/models/iiif_print/derivative_attachment.rb +8 -0
- data/app/models/iiif_print/iiif_search_response_decorator.rb +17 -0
- data/app/models/iiif_print/ingest_file_relation.rb +14 -0
- data/app/models/iiif_print/pending_relationship.rb +7 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +10 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +33 -0
- data/app/presenters/iiif_print/work_show_presenter_decorator.rb +29 -0
- data/app/renderers/hyrax/renderers/faceted_attribute_renderer_decorator.rb +18 -0
- data/app/search_builders/concerns/iiif_print/exclude_models.rb +17 -0
- data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +14 -0
- data/app/services/iiif_print/manifest_builder_service_behavior.rb +97 -0
- data/app/services/iiif_print/pluggable_derivative_service.rb +120 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/hyrax/base/_representative_media.html.erb +9 -0
- data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +8 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/iiif_print.de.yml +148 -0
- data/config/locales/iiif_print.en.yml +119 -0
- data/config/locales/iiif_print.es.yml +148 -0
- data/config/locales/iiif_print.fr.yml +149 -0
- data/config/locales/iiif_print.it.yml +142 -0
- data/config/locales/iiif_print.pt-BR.yml +148 -0
- data/config/locales/iiif_print.zh.yml +142 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/fits.xml +55 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +11 -0
- data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +11 -0
- data/docker-compose.yml +129 -0
- data/iiif_print.gemspec +43 -0
- data/lib/generators/iiif_print/assets_generator.rb +29 -0
- data/lib/generators/iiif_print/catalog_controller_generator.rb +32 -0
- data/lib/generators/iiif_print/install_generator.rb +52 -0
- data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +22 -0
- data/lib/generators/iiif_print/templates/iiif_print.scss +1 -0
- data/lib/iiif_print/base_derivative_service.rb +113 -0
- data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +84 -0
- data/lib/iiif_print/catalog_search_builder.rb +31 -0
- data/lib/iiif_print/configuration.rb +99 -0
- data/lib/iiif_print/data/fileset_helper.rb +25 -0
- data/lib/iiif_print/data/path_helper.rb +40 -0
- data/lib/iiif_print/data/work_derivatives.rb +323 -0
- data/lib/iiif_print/data/work_file.rb +92 -0
- data/lib/iiif_print/data/work_files.rb +199 -0
- data/lib/iiif_print/data.rb +35 -0
- data/lib/iiif_print/engine.rb +77 -0
- data/lib/iiif_print/errors.rb +9 -0
- data/lib/iiif_print/image_tool.rb +119 -0
- data/lib/iiif_print/jobs/application_job.rb +8 -0
- data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +107 -0
- data/lib/iiif_print/jobs/create_relationships_job.rb +78 -0
- data/lib/iiif_print/jp2_derivative_service.rb +118 -0
- data/lib/iiif_print/jp2_image_metadata.rb +81 -0
- data/lib/iiif_print/lineage_service.rb +41 -0
- data/lib/iiif_print/metadata.rb +125 -0
- data/lib/iiif_print/pdf_derivative_service.rb +42 -0
- data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +75 -0
- data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +130 -0
- data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +85 -0
- data/lib/iiif_print/text_extraction/alto_reader.rb +123 -0
- data/lib/iiif_print/text_extraction/hocr_reader.rb +172 -0
- data/lib/iiif_print/text_extraction/page_ocr.rb +87 -0
- data/lib/iiif_print/text_extraction/render_alto.rb +84 -0
- data/lib/iiif_print/text_extraction/word_coords_builder.rb +38 -0
- data/lib/iiif_print/text_extraction.rb +11 -0
- data/lib/iiif_print/text_extraction_derivative_service.rb +47 -0
- data/lib/iiif_print/text_formats_from_alto_service.rb +77 -0
- data/lib/iiif_print/tiff_derivative_service.rb +50 -0
- data/lib/iiif_print/version.rb +3 -0
- data/lib/iiif_print/works_controller_behavior.rb +9 -0
- data/lib/iiif_print.rb +136 -0
- data/lib/tasks/set_child_works.rake +22 -0
- data/spec/.keep.txt +1 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/uploaded_txt_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/helpers/hyrax/iiif_helper_spec.rb +65 -0
- data/spec/helpers/iiif_print_helper_spec.rb +43 -0
- data/spec/iiif_print/base_derivative_service_spec.rb +11 -0
- data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +51 -0
- data/spec/iiif_print/catalog_search_builder_spec.rb +60 -0
- data/spec/iiif_print/configuration_spec.rb +67 -0
- data/spec/iiif_print/data/work_derivatives_spec.rb +245 -0
- data/spec/iiif_print/data/work_file_spec.rb +99 -0
- data/spec/iiif_print/data/work_files_spec.rb +237 -0
- data/spec/iiif_print/image_tool_spec.rb +109 -0
- data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +30 -0
- data/spec/iiif_print/jobs/create_relationships_job_spec.rb +17 -0
- data/spec/iiif_print/jp2_image_metadata_spec.rb +37 -0
- data/spec/iiif_print/lineage_service_spec.rb +13 -0
- data/spec/iiif_print/metadata_spec.rb +115 -0
- data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +6 -0
- data/spec/iiif_print/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +45 -0
- data/spec/iiif_print/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/iiif_print/text_extraction/render_alto_spec.rb +54 -0
- data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +44 -0
- data/spec/iiif_print_spec.rb +51 -0
- data/spec/misc_shared.rb +111 -0
- data/spec/models/iiif_print/derivative_attachment_spec.rb +37 -0
- data/spec/models/iiif_print/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +19 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +49 -0
- data/spec/services/iiif_print/jp2_derivative_service_spec.rb +59 -0
- data/spec/services/iiif_print/pdf_derivative_service_spec.rb +66 -0
- data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +178 -0
- data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +127 -0
- data/spec/services/iiif_print/tiff_derivative_service_spec.rb +65 -0
- data/spec/spec_helper.rb +181 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/support/iiif_print_models.rb +127 -0
- data/spec/test_app_templates/blacklight.yml +9 -0
- data/spec/test_app_templates/fedora.yml +15 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +40 -0
- data/spec/test_app_templates/redis.yml +9 -0
- data/spec/test_app_templates/solr/conf/schema.xml +362 -0
- data/spec/test_app_templates/solr/conf/solrconfig.xml +322 -0
- data/spec/test_app_templates/solr.yml +7 -0
- data/tasks/iiif_print_dev.rake +34 -0
- data/tmp/.keep +0 -0
- metadata +605 -0
@@ -0,0 +1,125 @@
|
|
1
|
+
module IiifPrint
  # rubocop:disable Metrics/ClassLength

  # Builds the `metadata` section of a IIIF manifest for a work, in either the
  # Presentation API v2 shape (label/value pairs with HTML links) or the v3
  # shape (language-keyed label/value maps).
  class Metadata
    # Pattern used by {#make_link} to find "link looking" substrings: a
    # scheme followed by a colon, a `www.` prefix, or a bare `domain.tld/` path.
    MAKE_LINK_REGEX = %r{
      \b
      (
        (?: [a-z][\w-]+:
         (?: /{1,3} | [a-z0-9%] ) |
          www\d{0,3}[.] |
          [a-z0-9.\-]+[.][a-z]{2,4}/
        )
        (?:
         [^\s()<>]+ | \(([^\s()<>]+|(\([^\s()<>]+\)))*\)
        )+
        (?:
          \(([^\s()<>]+|(\([^\s()<>]+\)))*\) |
          [^\s`!()\[\]{};:'".,<>?«»〝〞‘‛]
        )
      )
    }ix.freeze

    # Convenience entry point: instantiate and build in one call.
    #
    # @param work [#try, #[]] the work (or solr document) whose fields are read
    # @param version [#to_i] IIIF Presentation API version (2 or 3)
    # @param fields [Array<#name, #options>] field definitions to render
    # @param current_ability [Object] ability passed to the collection lookup
    # @param base_url [String] host prefix used when building links
    # @return [Array<Hash>] manifest metadata entries
    # @raise [ArgumentError] when the version has no builder
    def self.build_metadata_for(work:, version:, fields:, current_ability:, base_url:)
      new(work: work,
          version: version,
          fields: fields,
          current_ability: current_ability,
          base_url: base_url).build_metadata
    end

    def initialize(work:, version:, fields:, current_ability:, base_url:)
      @work = work
      @version = version.to_i
      @fields = fields
      @current_ability = current_ability
      @base_url = base_url
    end

    attr_reader :work, :version, :fields

    # Dispatches to the private `build_metadata_for_v<version>` builder.
    #
    # @return [Array<Hash>] manifest metadata entries
    # @raise [ArgumentError] when no builder exists for the requested version
    def build_metadata
      builder = "build_metadata_for_v#{version}"
      # Fail with a meaningful error rather than a NoMethodError naming an
      # internal method when the version is nil, 0 or otherwise unsupported.
      raise ArgumentError, "unsupported IIIF Presentation API version: #{version.inspect}" unless respond_to?(builder, true)

      send(builder)
    end

    private

    # v2 entries: `{ 'label' => String, 'value' => Array<String> }`, skipping
    # fields with no values and collections the ability cannot view.
    def build_metadata_for_v2
      fields.map do |field|
        if field.name == :collection && member_of_collection?
          viewable_collections = Hyrax::CollectionMemberService.run(work, @current_ability)
          next if viewable_collections.empty?

          { 'label' => label_for(field),
            'value' => make_collection_link(viewable_collections) }
        else
          next if field_is_empty?(field)

          { 'label' => label_for(field),
            'value' => cast_to_value(field_name: field.name, options: field.options) }
        end
      end.compact
    end

    # v3 entries: labels keyed by the current I18n locale, values keyed by
    # 'none'; every value is passed through {#scrub}.
    def build_metadata_for_v3
      fields.map do |field|
        values = Array(work.try(field.name)).map { |value| scrub(value.to_s) }
        next if values.empty?

        {
          'label' => {
            # Since we're using I18n to translate the field, we're setting the locale used in the translation.
            I18n.locale.to_s => [label_for(field)]
          },
          'value' => {
            'none' => values
          }
        }
      end.compact
    end

    # @return [String] the label the Hyrax attribute renderer gives the field
    def label_for(field)
      Hyrax::Renderers::AttributeRenderer.new(field.name, nil).label
    end

    def field_is_empty?(field)
      Array(work.try(field.name)).empty?
    end

    # @return [Boolean] whether the work lists any collection ids
    def member_of_collection?
      work[:member_of_collection_ids_ssim].present?
    end

    def scrub(value)
      Loofah.fragment(value).scrub!(:whitewash).to_s
    end

    # Renders a field's values for v2: faceted fields become catalog search
    # links, everything else goes through {#make_link}.
    #
    # NOTE(review): values are interpolated into HTML without escaping here,
    # whereas the v3 builder scrubs them — confirm upstream sanitisation.
    def cast_to_value(field_name:, options:)
      return make_link(values_for(field_name: field_name)) unless options&.[](:render_as) == :faceted

      search_field = "#{field_name}_sim"
      values_for(field_name: field_name).map do |value|
        path = Rails.application.routes.url_helpers.search_catalog_path(
          "f[#{search_field}][]": value, locale: I18n.locale
        )
        path += '&include_child_works=true' if work["is_child_bsi"] == true
        "<a href='#{File.join(@base_url, path)}'>#{value}</a>"
      end
    end

    def values_for(field_name:)
      Array(work.send(field_name))
    end

    def make_collection_link(collection_documents)
      collection_documents.map do |collection|
        "<a href='#{File.join(@base_url, 'collections', collection.id)}'>#{collection.title.first}</a>"
      end
    end

    # @note This method turns link looking strings into links
    #
    # NOTE(review): the matched text is placed in a single-quoted href without
    # escaping, and the pattern permits `'` inside a match — confirm callers
    # only pass trusted values.
    def make_link(texts)
      texts.map do |t|
        t.to_s.gsub(MAKE_LINK_REGEX) do |url|
          "<a href='#{url}' target='_blank'>#{url}</a>"
        end
      end
    end
  end
  # rubocop:enable Metrics/ClassLength
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'open3'
require 'shellwords'
|
2
|
+
|
3
|
+
module IiifPrint
  # Derivative service that produces a JPEG-compressed PDF from an image
  # source, in color or grayscale depending on `use_color?`.
  class PDFDerivativeService < BaseDerivativeService
    self.target_extension = 'pdf'.freeze

    # PDF (JPEG, 8 bit grayscale), 150ppi
    GRAY_PDF_CMD = 'convert %<source_file>s ' \
                   '-resize 1800 -density 150 ' \
                   '-depth 8 -colorspace Gray ' \
                   '-compress jpeg %<out_file>s'.freeze

    # sRGB color PDF (JPEG, 8 bits per channel), 150ppi
    COLOR_PDF_CMD = 'convert %<source_file>s ' \
                    '-resize 1800 -density 150 ' \
                    '-depth 8 ' \
                    '-compress jpeg %<out_file>s'.freeze

    def initialize(file_set)
      super(file_set)
    end

    # Get conversion command; the template varies on whether we have color
    # or grayscale material.
    #
    # Both paths are shell-escaped so that file names containing spaces or
    # shell metacharacters cannot split or alter the command. Paths made only
    # of ordinary path characters are emitted unchanged.
    #
    # @return [String] the `convert` command line
    def convert_cmd
      template = use_color? ? COLOR_PDF_CMD : GRAY_PDF_CMD
      format(template,
             source_file: Shellwords.escape(@source_path.to_s),
             out_file: Shellwords.escape(@dest_path.to_s))
    end

    # Creates the PDF derivative unless the source is itself a PDF.
    #
    # @param filename [String] source file, handed to the base class
    def create_derivatives(filename)
      # Base class takes care of loading @source_path, @dest_path
      super(filename)

      # no creation if pdf master
      return if mime_type == 'application/pdf'

      # Get and run conversion command; JP2 sources take a separate path
      # (jp2_convert / im_convert are presumably inherited — not shown here).
      return jp2_convert if mime_type == 'image/jp2'
      im_convert
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Encapsulates methods used for pdf splitting into child works
|
4
|
+
module IiifPrint
  module SplitPdfs
    # Class-level helpers used when splitting uploaded PDFs into child works.
    class ChildWorkCreationFromPdfService
      # Load an array of paths to pdf files
      # @param files [Array] Hyrax::UploadedFile ids (blank entries are ignored)
      # @return [Array<String>] file paths of the uploads that are PDFs
      def self.pdf_paths(files:)
        upload_ids = filter_file_ids(files)
        return [] if upload_ids.empty?

        uploads = Hyrax::UploadedFile.find(upload_ids)
        pdfs_only_for(uploads.map { |upload| upload_path(upload) })
      end

      # Is child work splitting defined for model?
      # @param work [GenericWork, etc] A valid type of hyrax work
      # @return [Boolean]
      def self.iiif_print_split?(work:)
        # defined only if work has include IiifPrint.model_configuration with pdf_split_child_model
        work.try(:iiif_print_config)&.pdf_split_child_model ? true : false
      end

      # Are there any PDF files?
      # @param paths [Array<String>] candidate file paths
      # @return [Boolean]
      def self.pdfs?(paths:)
        pdfs_only_for(paths).any?
      end

      # Submit the job to split PDF into child works.
      #
      # The job additionally receives the number of PDFs already on the work
      # (see {.count_existing_pdfs}; currently always 0).
      #
      # @param work [GenericWork, etc] A valid type of hyrax work
      # @param file_locations [Array<String>] paths to PDF attachments
      # @param user [User] user
      # @param admin_set_id [Object] passed through to the job unchanged
      def self.queue_job(work:, file_locations:, user:, admin_set_id:)
        work.iiif_print_config.pdf_splitter_job.perform_later(
          work,
          file_locations,
          user,
          admin_set_id,
          count_existing_pdfs(work)
        )
      end

      # @return [Array] the given ids with blank entries removed
      def self.filter_file_ids(input)
        Array.wrap(input).select(&:present?)
      end

      # Given Hyrax::Upload object, return path to file on local filesystem
      def self.upload_path(upload)
        # so many layers to this onion:
        upload.file.file.file
      end

      # TODO: implement a method to count existing PDFs on a work to support
      # adding more PDFs to an existing work.
      def self.count_existing_pdfs(_work)
        0
      end

      # Selects the paths whose name ends in `.pdf`, compared
      # case-insensitively so that `.Pdf` and similar are not dropped.
      #
      # TODO: Consider other methods to identify a PDF file.
      # This sub-selection may need to be moved to use mimetype if there
      # is a need to support paths not ending in .pdf (i.e. remote_urls)
      def self.pdfs_only_for(paths)
        paths.select { |path| path.to_s.downcase.end_with?('.pdf') }
      end
    end
  end
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'securerandom'
|
3
|
+
require 'tmpdir'
|
4
|
+
require 'iiif_print/split_pdfs/pdf_image_extraction_service'
|
5
|
+
|
6
|
+
module IiifPrint
  module SplitPdfs
    # Converts every page of a PDF into a TIFF using ghostscript and exposes
    # the resulting file paths as an Enumerable.
    class PagesIntoImagesService
      include Enumerable

      # @param path [String] path to the source PDF
      def initialize(path)
        @baseid = SecureRandom.uuid
        @pdfpath = path
        @info = nil
        @entries = nil
        @tmpdir = nil
        @size = nil
        @pagecount = nil
        @pdftext = nil
        @compression = 'lzw'
      end

      # @return [IiifPrint::SplitPdfs::PdfImageExtractionService] memoized
      #   image-listing metadata for the PDF
      def pdfinfo
        @info ||= IiifPrint::SplitPdfs::PdfImageExtractionService.new(@pdfpath)
      end

      # TODO: put this test somewhere to prevent invalid pdfs from crashing the image service.
      #
      # @return [Boolean] true when color, width, height or the image listing
      #   is missing
      def invalid_pdf?
        pdfinfo.color.include?(nil) || pdfinfo.width.nil? || pdfinfo.height.nil? || pdfinfo.entries.empty?
      end

      # @return [String] memoized temporary directory for the page images
      def tmpdir
        @tmpdir ||= Dir.mktmpdir
      end

      # @param channels [Integer] number of color channels
      # @param bpc [Integer] bits per channel
      # @return [String] ghostscript color TIFF device name
      def colordevice(channels, bpc)
        bits = bpc * channels
        # will be either 8bpc/16bpc color TIFF,
        # with any CMYK source transformed to 8bpc RGB
        bits = 24 unless [24, 48].include? bits
        "tiff#{bits}nc"
      end

      # Picks the ghostscript output device from the detected color data.
      # Side effect: switches @compression to 'g4' for 1-bit gray sources.
      #
      # @return [String] ghostscript device name
      def gsdevice
        color, channels, bpc = pdfinfo.color
        if color == 'gray'
          # CCITT Group 4 Black and White, if applicable:
          if bpc == 1
            @compression = 'g4'
            return 'tiffg4'
          end
          # 8 Bit Grayscale, if applicable:
          return 'tiffgray' if bpc > 1
        end
        # otherwise color:
        colordevice(channels, bpc)
      end

      # TODO: this method came from newspaper gem but appears to be unused. Is it needed anywhere?
      # def gstext
      #   cmd = "gs -q -dNOPAUSE -dBATCH -sDEVICE=txtwrite " \
      #     "-sOutputFile=- -f #{@pdfpath}"
      #   Open3.popen3(cmd) do |_stdin, stdout, _stderr, _wait_thr|
      #     @pdftext = stdout.read
      #   end
      #   @pdftext
      # end

      # Page count as reported by the `pdfinfo` command, memoized so the
      # external command runs at most once per instance.
      #
      # The path is passed as a separate argument (no shell), so file names
      # with spaces or metacharacters are handled. capture3 drains stderr,
      # which an unread popen3 pipe would not.
      #
      # @return [Integer] number of pages; 0 when no `Pages:` line is reported
      def pagecount
        return @pagecount unless @pagecount.nil?

        stdout, _stderr, _status = Open3.capture3('pdfinfo', @pdfpath.to_s)
        pages_line = stdout.each_line.find { |line| line.start_with?('Pages:') }
        @pagecount = pages_line ? pages_line.split.last.to_i : 0
      end

      # @return [Boolean] true when there is a single image of more than
      #   10 * 1024 * 1024 pixels per page
      def looks_scanned
        max_image_px = pdfinfo.width * pdfinfo.height
        single_image_per_page = pdfinfo.entries.length == pagecount
        # single 10mp+ image per page?
        single_image_per_page && max_image_px > 1024 * 1024 * 10
      end

      # @return [Integer] resolution used for rendering
      def ppi
        # 400 dpi for something that does not look like scanned media:
        return 400 unless looks_scanned

        # For scanned media, defer to detected image PPI:
        pdfinfo.ppi
      end

      # ghostscript convert all pages to TIFF
      #
      # @return [Array<String>] expected output file names, one per line of
      #   ghostscript output beginning with "Page "
      def gsconvert
        output_base = File.join(tmpdir, "#{@baseid}-page%d.tiff")
        # gsdevice must be evaluated before @compression is read, because it
        # may switch the compression to 'g4'.
        device = gsdevice
        cmd = ['gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{device}",
               '-dTextAlphaBits=4', "-sCompression=#{@compression}",
               "-sOutputFile=#{output_base}", "-r#{ppi}", '-f', @pdfpath.to_s]
        stdout, _stderr, _status = Open3.capture3(*cmd)
        @size = stdout.each_line.count { |line| line.start_with?('Page ') }
        # Return an array of expected filenames
        (1..@size).map { |n| File.join(tmpdir, "#{@baseid}-page#{n}.tiff") }
      end

      # entries for each page
      #
      # @return [Array<String>] memoized result of {#gsconvert}
      def entries
        @entries ||= gsconvert
      end

      # Yields each page image path; returns an Enumerator without a block.
      def each(&block)
        return enum_for(:each) unless block_given?

        entries.each(&block)
      end
    end
  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'mini_magick'
|
3
|
+
|
4
|
+
module IiifPrint
  module SplitPdfs
    # Uses poppler 0.19+ pdfimages command to extract image
    # listing metadata from PDF files.
    # For dpi extraction, falls back to calculating using MiniMagick,
    # if necessary.
    class PdfImageExtractionService
      # class constant column numbers
      COL_WIDTH = 3
      COL_HEIGHT = 4
      COL_COLOR = 5
      COL_CHANNELS = 6
      COL_BITS = 7
      # only poppler 0.25+ has this column in output:
      COL_XPPI = 12

      # @param path [String] path to the PDF
      def initialize(path)
        @path = path
        # Kept as an argument list so the path is never shell-interpreted.
        @cmd = ['pdfimages', '-list', path.to_s]
        @output = nil
        @entries = nil
      end

      # Runs `pdfimages -list` (once) and returns its data rows.
      #
      # @return [Array<String>] output lines after the two header lines;
      #   empty when the command printed fewer than three lines
      def process
        # call just once
        @output ||= begin
          stdout, _stderr, _status = Open3.capture3(*@cmd)
          stdout.split("\n")
        end
        # drop never returns nil, unlike slice with an out-of-range start
        @output.drop(2)
      end

      # @return [Array<Array<String>>] one whitespace-split row per image
      def entries
        @entries ||= process.map(&:split)
      end

      # @param i [Integer] column index
      # @yield [String, nil] optional transformation applied to each cell
      # @return [Array] column `i` of every entry
      def selectcolumn(i, &block)
        result = entries.map { |e| e[i] }
        return result.map!(&block) if block_given?
        result
      end

      # @return [Integer, nil] largest image width, nil when no entries
      def width
        selectcolumn(COL_WIDTH, &:to_i).max
      end

      # @return [Integer, nil] largest image height, nil when no entries
      def height
        selectcolumn(COL_HEIGHT, &:to_i).max
      end

      # @return [Array] `[desc, channels, bits]`
      def color
        # desc is either 'gray' or 'rgb' (anything not gray is reported as
        # 'rgb'), but 1-bit gray is black/white, so caller may want all of
        # this information, and in case of mixed color spaces across images,
        # this returns maximum
        desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
        channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
        bits = entries.map { |e| e[COL_BITS].to_i }.max
        [desc, channels, bits]
      end

      # @return [Integer, nil] largest horizontal ppi; nil when the PDF lists
      #   no images (consistent with {#width} and {#height})
      def ppi
        first_entry = entries.first
        return nil if first_entry.nil?

        # with poppler 0.25+, pdfimages just gives us this:
        return selectcolumn(COL_XPPI, &:to_i).max if first_entry.size > COL_XPPI

        # poppler < 0.25: derive from the widest image and the width
        # MiniMagick reports (treated as points by the original code).
        pdf = MiniMagick::Image.open(@path)
        (72 * width / pdf.width).to_i
      end
    end
  end
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'active_support/core_ext/module/delegation'
|
2
|
+
require 'json'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
module IiifPrint
  # Module for text extraction
  module TextExtraction
    # Class to obtain plain text and JSON word-coordinates from ALTO source
    class AltoReader
      attr_accessor :source, :doc_stream
      delegate :text, to: :doc_stream

      # SAX Document Stream class to gather text and word tokens from ALTO
      class AltoDocStream < Nokogiri::XML::SAX::Document
        attr_accessor :text, :words

        # @param image_width [Integer, nil] pixel width of the page image;
        #   when given, coordinates are rescaled against the ALTO page width
        def initialize(image_width = nil)
          super()
          # scaling matters:
          @image_width = image_width
          @scaling = 1.0 # pt to px, if ALTO using points
          # plain text buffer (unfrozen, it is appended to in place):
          @text = +''
          # list of word hash, containing word+coord:
          @words = []
        end

        # Return coordinates from String element attribute hash
        #
        # @param attrs [Hash] hash containing ALTO `String` element attributes.
        # @return [Array] Array of position x, y, width, height in px.
        def s_coords(attrs)
          hpos, vpos, width, height = %w[HPOS VPOS WIDTH HEIGHT].map do |key|
            scale_value((attrs[key] || 0).to_i)
          end
          [hpos, vpos, width, height]
        end

        # Sets @scaling from the `Page` element's WIDTH attribute relative to
        # the image width. Leaves the scaling untouched when the image width
        # is unknown, the attribute is absent, or either width is not
        # positive (a zero scaling would make scale_value divide by zero).
        #
        # @param attrs [Array] Array of key, value pair Arrays.
        def compute_scaling(attrs)
          return if @image_width.nil?
          match = attrs.find { |e| e[0].casecmp?('WIDTH') }
          # find returns nil (not an empty collection) when nothing matches
          return if match.nil?
          page_width = match[1].to_i
          return unless page_width.positive? && @image_width.to_f.positive?
          return if @image_width == page_width
          @scaling = page_width / @image_width.to_f
        end

        def scale_value(v)
          (v / @scaling).to_i
        end

        # Callback for element start, implementation of which ignores
        # non-String elements (apart from reading scaling off `Page`).
        #
        # @param name [String] element name.
        # @param attrs [Array] Array of key, value pair Arrays.
        def start_element(name, attrs = [])
          compute_scaling(attrs) if name == 'Page'
          return if name != 'String'
          values = attrs.to_h
          token = values['CONTENT']
          # A String element without CONTENT carries no word; appending nil
          # to the text buffer would raise TypeError.
          return if token.nil?
          @text << token
          @words << {
            word: token,
            coordinates: s_coords(values)
          }
        end

        # Callback for element end, used here to manage endings of lines and
        # blocks.
        #
        # @param name [String] element name.
        def end_element(name)
          case name
          when 'String' then @text << " "
          when 'TextBlock', 'TextLine' then @text << "\n"
          end
        end

        # Callback for completion of parsing ALTO, used to normalize generated
        # text content (strip unneeded whitespace incidental to output).
        def end_document
          # postprocess @text to remove trailing spaces on lines
          @text = @text.split("\n").map(&:strip).join("\n")
          # remove trailing whitespace at end of buffer
          @text.strip!
        end
      end

      # Construct with either a path to an ALTO file or the ALTO XML itself,
      # and parse the document immediately.
      #
      # @param xml [String] path to an ALTO XML file, or XML source
      # @param image_width [Integer, nil] pixel width of the page image
      # @param image_height [Integer, nil] pixel height of the page image
      def initialize(xml, image_width = nil, image_height = nil)
        @source = isxml?(xml) ? xml : File.read(xml)
        @image_width = image_width
        @image_height = image_height
        @doc_stream = AltoDocStream.new(image_width)
        parser = Nokogiri::XML::SAX::Parser.new(doc_stream)
        parser.parse(@source)
      end

      # Determine if source parameter is path or xml
      #
      # @param xml [String] either path to xml file or xml source
      # @return [true, false] true if string appears to be XML source, not path
      def isxml?(xml)
        xml.lstrip.start_with?('<')
      end

      # Output JSON flattened word coordinates
      #
      # @return [String] JSON serialization of flattened word coordinates
      def json
        IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
          words: @doc_stream.words,
          width: @image_width,
          height: @image_height
        )
      end
    end
  end
end
|