iiif_print 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.coveralls.yml +2 -0
- data/.env +5 -0
- data/.fcrepo_wrapper +4 -0
- data/.github/release.yml +20 -0
- data/.github/workflows/branches.yml +24 -0
- data/.github/workflows/build-lint-test-action.yaml +33 -0
- data/.github/workflows/release_labels.yml +25 -0
- data/.gitignore +52 -0
- data/.rubocop.yml +177 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +49 -0
- data/CONTRIBUTING.md +181 -0
- data/Dockerfile +15 -0
- data/Gemfile +52 -0
- data/LICENSE +203 -0
- data/README.md +203 -0
- data/Rakefile +38 -0
- data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +56 -0
- data/app/assets/config/iiif_print_manifest.js +2 -0
- data/app/assets/images/iiif_print/.keep +0 -0
- data/app/assets/javascripts/iiif_print/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/iiif_print/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/iiif_print.js +3 -0
- data/app/assets/stylesheets/iiif_print/_iiif_print.scss +4 -0
- data/app/assets/stylesheets/iiif_print/_issue_search.scss +13 -0
- data/app/assets/stylesheets/iiif_print/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/iiif_print/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/iiif_print/_search_results.scss +6 -0
- data/app/helpers/hyrax/iiif_helper.rb +22 -0
- data/app/helpers/iiif_print/application_helper.rb +5 -0
- data/app/helpers/iiif_print_helper.rb +64 -0
- data/app/indexers/concerns/iiif_print/child_indexer.rb +34 -0
- data/app/indexers/concerns/iiif_print/file_set_indexer.rb +29 -0
- data/app/mailers/iiif_print/application_mailer.rb +8 -0
- data/app/models/concerns/iiif_print/set_child_flag.rb +29 -0
- data/app/models/concerns/iiif_print/solr/document.rb +47 -0
- data/app/models/iiif_print/application_record.rb +6 -0
- data/app/models/iiif_print/derivative_attachment.rb +8 -0
- data/app/models/iiif_print/iiif_search_response_decorator.rb +17 -0
- data/app/models/iiif_print/ingest_file_relation.rb +14 -0
- data/app/models/iiif_print/pending_relationship.rb +7 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +10 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +33 -0
- data/app/presenters/iiif_print/work_show_presenter_decorator.rb +29 -0
- data/app/renderers/hyrax/renderers/faceted_attribute_renderer_decorator.rb +18 -0
- data/app/search_builders/concerns/iiif_print/exclude_models.rb +17 -0
- data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +14 -0
- data/app/services/iiif_print/manifest_builder_service_behavior.rb +97 -0
- data/app/services/iiif_print/pluggable_derivative_service.rb +120 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/hyrax/base/_representative_media.html.erb +9 -0
- data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +8 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/iiif_print.de.yml +148 -0
- data/config/locales/iiif_print.en.yml +119 -0
- data/config/locales/iiif_print.es.yml +148 -0
- data/config/locales/iiif_print.fr.yml +149 -0
- data/config/locales/iiif_print.it.yml +142 -0
- data/config/locales/iiif_print.pt-BR.yml +148 -0
- data/config/locales/iiif_print.zh.yml +142 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/fits.xml +55 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +11 -0
- data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +11 -0
- data/docker-compose.yml +129 -0
- data/iiif_print.gemspec +43 -0
- data/lib/generators/iiif_print/assets_generator.rb +29 -0
- data/lib/generators/iiif_print/catalog_controller_generator.rb +32 -0
- data/lib/generators/iiif_print/install_generator.rb +52 -0
- data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +22 -0
- data/lib/generators/iiif_print/templates/iiif_print.scss +1 -0
- data/lib/iiif_print/base_derivative_service.rb +113 -0
- data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +84 -0
- data/lib/iiif_print/catalog_search_builder.rb +31 -0
- data/lib/iiif_print/configuration.rb +99 -0
- data/lib/iiif_print/data/fileset_helper.rb +25 -0
- data/lib/iiif_print/data/path_helper.rb +40 -0
- data/lib/iiif_print/data/work_derivatives.rb +323 -0
- data/lib/iiif_print/data/work_file.rb +92 -0
- data/lib/iiif_print/data/work_files.rb +199 -0
- data/lib/iiif_print/data.rb +35 -0
- data/lib/iiif_print/engine.rb +77 -0
- data/lib/iiif_print/errors.rb +9 -0
- data/lib/iiif_print/image_tool.rb +119 -0
- data/lib/iiif_print/jobs/application_job.rb +8 -0
- data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +107 -0
- data/lib/iiif_print/jobs/create_relationships_job.rb +78 -0
- data/lib/iiif_print/jp2_derivative_service.rb +118 -0
- data/lib/iiif_print/jp2_image_metadata.rb +81 -0
- data/lib/iiif_print/lineage_service.rb +41 -0
- data/lib/iiif_print/metadata.rb +125 -0
- data/lib/iiif_print/pdf_derivative_service.rb +42 -0
- data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +75 -0
- data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +130 -0
- data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +85 -0
- data/lib/iiif_print/text_extraction/alto_reader.rb +123 -0
- data/lib/iiif_print/text_extraction/hocr_reader.rb +172 -0
- data/lib/iiif_print/text_extraction/page_ocr.rb +87 -0
- data/lib/iiif_print/text_extraction/render_alto.rb +84 -0
- data/lib/iiif_print/text_extraction/word_coords_builder.rb +38 -0
- data/lib/iiif_print/text_extraction.rb +11 -0
- data/lib/iiif_print/text_extraction_derivative_service.rb +47 -0
- data/lib/iiif_print/text_formats_from_alto_service.rb +77 -0
- data/lib/iiif_print/tiff_derivative_service.rb +50 -0
- data/lib/iiif_print/version.rb +3 -0
- data/lib/iiif_print/works_controller_behavior.rb +9 -0
- data/lib/iiif_print.rb +136 -0
- data/lib/tasks/set_child_works.rake +22 -0
- data/spec/.keep.txt +1 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/uploaded_txt_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/helpers/hyrax/iiif_helper_spec.rb +65 -0
- data/spec/helpers/iiif_print_helper_spec.rb +43 -0
- data/spec/iiif_print/base_derivative_service_spec.rb +11 -0
- data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +51 -0
- data/spec/iiif_print/catalog_search_builder_spec.rb +60 -0
- data/spec/iiif_print/configuration_spec.rb +67 -0
- data/spec/iiif_print/data/work_derivatives_spec.rb +245 -0
- data/spec/iiif_print/data/work_file_spec.rb +99 -0
- data/spec/iiif_print/data/work_files_spec.rb +237 -0
- data/spec/iiif_print/image_tool_spec.rb +109 -0
- data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +30 -0
- data/spec/iiif_print/jobs/create_relationships_job_spec.rb +17 -0
- data/spec/iiif_print/jp2_image_metadata_spec.rb +37 -0
- data/spec/iiif_print/lineage_service_spec.rb +13 -0
- data/spec/iiif_print/metadata_spec.rb +115 -0
- data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +6 -0
- data/spec/iiif_print/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +45 -0
- data/spec/iiif_print/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/iiif_print/text_extraction/render_alto_spec.rb +54 -0
- data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +44 -0
- data/spec/iiif_print_spec.rb +51 -0
- data/spec/misc_shared.rb +111 -0
- data/spec/models/iiif_print/derivative_attachment_spec.rb +37 -0
- data/spec/models/iiif_print/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +19 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +49 -0
- data/spec/services/iiif_print/jp2_derivative_service_spec.rb +59 -0
- data/spec/services/iiif_print/pdf_derivative_service_spec.rb +66 -0
- data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +178 -0
- data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +127 -0
- data/spec/services/iiif_print/tiff_derivative_service_spec.rb +65 -0
- data/spec/spec_helper.rb +181 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/support/iiif_print_models.rb +127 -0
- data/spec/test_app_templates/blacklight.yml +9 -0
- data/spec/test_app_templates/fedora.yml +15 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +40 -0
- data/spec/test_app_templates/redis.yml +9 -0
- data/spec/test_app_templates/solr/conf/schema.xml +362 -0
- data/spec/test_app_templates/solr/conf/solrconfig.xml +322 -0
- data/spec/test_app_templates/solr.yml +7 -0
- data/tasks/iiif_print_dev.rake +34 -0
- data/tmp/.keep +0 -0
- metadata +605 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
require 'erb'

module IiifPrint
  # Builds the list of metadata entries (label/value pairs) for a work, in the
  # shape selected by +version+ (dispatches to +build_metadata_for_v2+ or
  # +build_metadata_for_v3+).
  #
  # rubocop:disable Metrics/ClassLength
  class Metadata
    # Matches "link looking" substrings (scheme-prefixed, +www.+ prefixed, or
    # +domain.tld/+ prefixed) so {#make_link} can wrap them in anchor tags.
    MAKE_LINK_REGEX = %r{
      \b
      (
        (?: [a-z][\w-]+:
         (?: /{1,3} | [a-z0-9%] ) |
          www\d{0,3}[.] |
          [a-z0-9.\-]+[.][a-z]{2,4}/
        )
        (?:
          [^\s()<>]+ | \(([^\s()<>]+|(\([^\s()<>]+\)))*\)
        )+
        (?:
          \(([^\s()<>]+|(\([^\s()<>]+\)))*\) |
          [^\s`!()\[\]{};:'".,<>?«»“”‘’]
        )
      )
    }ix.freeze

    # Convenience entry point: instantiate and build in one call.
    #
    # @param work [#try, #[]] the work (or its document) to describe
    # @param version [#to_i] metadata shape to build (2 or 3)
    # @param fields [Array<#name, #options>] the fields to render
    # @param current_ability [Object] passed to Hyrax::CollectionMemberService
    # @param base_url [String] prefix used when building absolute links
    # @return [Array<Hash>]
    def self.build_metadata_for(work:, version:, fields:, current_ability:, base_url:)
      new(work: work,
          version: version,
          fields: fields,
          current_ability: current_ability,
          base_url: base_url).build_metadata
    end

    def initialize(work:, version:, fields:, current_ability:, base_url:)
      @work = work
      # Coerced to Integer so the dynamic dispatch in #build_metadata can only
      # ever interpolate digits into the method name.
      @version = version.to_i
      @fields = fields
      @current_ability = current_ability
      @base_url = base_url
    end

    attr_reader :work, :version, :fields

    # @return [Array<Hash>] metadata entries for the configured version
    # @raise [NoMethodError] when no builder exists for the given version
    def build_metadata
      send("build_metadata_for_v#{version}")
    end

    private

    # Builds entries of the form { 'label' => String, 'value' => Array<String> }.
    # Fields without values (or without viewable collections) are omitted.
    def build_metadata_for_v2
      fields.map do |field|
        label = Hyrax::Renderers::AttributeRenderer.new(field.name, nil).label
        if field.name == :collection && member_of_collection?
          viewable_collections = Hyrax::CollectionMemberService.run(work, @current_ability)
          next if viewable_collections.empty?
          { 'label' => label,
            'value' => make_collection_link(viewable_collections) }
        else
          next if field_is_empty?(field)
          { 'label' => label,
            'value' => cast_to_value(field_name: field.name, options: field.options) }
        end
      end.compact
    end

    # Builds entries whose label is keyed by the current I18n locale and whose
    # (scrubbed) values are keyed by 'none'. Fields without values are omitted.
    def build_metadata_for_v3
      fields.map do |field|
        values = Array(work.try(field.name)).map { |value| scrub(value.to_s) }
        next if values.empty?
        {
          'label' => {
            # Since we're using I18n to translate the field, we're setting the locale used in the translation.
            I18n.locale.to_s => [Hyrax::Renderers::AttributeRenderer.new(field.name, nil).label]
          },
          'value' => {
            'none' => values
          }
        }
      end.compact
    end

    def field_is_empty?(field)
      Array(work.try(field.name)).empty?
    end

    def member_of_collection?
      work[:member_of_collection_ids_ssim]&.present?
    end

    def scrub(value)
      Loofah.fragment(value).scrub!(:whitewash).to_s
    end

    # HTML-escapes text before it is interpolated into generated markup, so a
    # value can neither close the single-quoted attribute nor open a new tag.
    def escape(value)
      ERB::Util.html_escape(value.to_s)
    end

    # Faceted fields become links to a catalog search on that facet value;
    # every other field has link looking text turned into anchors.
    def cast_to_value(field_name:, options:)
      if options&.[](:render_as) == :faceted
        search_field = "#{field_name}_sim"
        values_for(field_name: field_name).map do |value|
          path = Rails.application.routes.url_helpers.search_catalog_path(
            "f[#{search_field}][]": value, locale: I18n.locale
          )
          path += '&include_child_works=true' if work["is_child_bsi"] == true
          # The href comes from the route helper and is left as generated;
          # only the display text is escaped.
          "<a href='#{File.join(@base_url, path)}'>#{escape(value)}</a>"
        end
      else
        make_link(values_for(field_name: field_name))
      end
    end

    def values_for(field_name:)
      Array(work.send(field_name))
    end

    # @param collection_documents [Enumerable<#id, #title>]
    # @return [Array<String>] one anchor per collection
    def make_collection_link(collection_documents)
      collection_documents.map do |collection|
        "<a href='#{File.join(@base_url, 'collections', collection.id)}'>#{escape(collection.title.first)}</a>"
      end
    end

    # @note This method turns link looking strings into links. The matched URL
    #   is HTML-escaped in both the href and the link text; text outside the
    #   match is passed through untouched.
    def make_link(texts)
      texts.map do |t|
        t.to_s.gsub(MAKE_LINK_REGEX) do |url|
          safe_url = escape(url)
          "<a href='#{safe_url}' target='_blank'>#{safe_url}</a>"
        end
      end
    end
  end
  # rubocop:enable Metrics/ClassLength
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
require 'open3'
|
|
2
|
+
|
|
3
|
+
module IiifPrint
  # Derivative service that builds an ImageMagick `convert` command producing a
  # PDF derivative; sources that are already PDFs are skipped.
  #
  # Construction is inherited unchanged from BaseDerivativeService.
  class PDFDerivativeService < BaseDerivativeService
    self.target_extension = 'pdf'.freeze

    # PDF (JPEG, 8 bit grayscale), 150ppi
    GRAY_PDF_CMD = 'convert %<source_file>s ' \
                   '-resize 1800 -density 150 ' \
                   '-depth 8 -colorspace Gray ' \
                   '-compress jpeg %<out_file>s'.freeze

    # sRGB color PDF (JPEG, 8 bits per channel), 150ppi
    COLOR_PDF_CMD = 'convert %<source_file>s ' \
                    '-resize 1800 -density 150 ' \
                    '-depth 8 ' \
                    '-compress jpeg %<out_file>s'.freeze

    # Get conversion command; the template varies on whether we have color or
    # grayscale material.
    #
    # NOTE(review): @source_path and @dest_path are interpolated unquoted;
    # confirm how the base class executes this string before relying on paths
    # that contain spaces.
    #
    # @return [String] the command line to run
    def convert_cmd
      template = use_color? ? COLOR_PDF_CMD : GRAY_PDF_CMD
      format(template, source_file: @source_path, out_file: @dest_path)
    end

    # @param filename [String] passed through to the base class
    def create_derivatives(filename)
      # Base class takes care of loading @source_path, @dest_path
      super(filename)

      # no creation if pdf master
      return if mime_type == 'application/pdf'

      # Get and run conversion command
      return jp2_convert if mime_type == 'image/jp2'
      im_convert
    end
  end
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Encapsulates methods used for pdf splitting into child works
|
|
4
|
+
module IiifPrint
  module SplitPdfs
    # Class-level helpers used when splitting uploaded PDFs into child works:
    # locating the uploaded PDF files and queueing the splitter job.
    class ChildWorkCreationFromPdfService
      # Load an array of paths to pdf files
      # @param files [Array<String, Integer>, String, Integer, nil] Hyrax::UploadedFile ids
      # @return [Array<String>] paths of the uploads that look like PDFs
      def self.pdf_paths(files:)
        upload_ids = filter_file_ids(files)
        return [] if upload_ids.empty?
        uploads = Hyrax::UploadedFile.find(upload_ids)
        paths = uploads.map { |upload| upload_path(upload) }
        pdfs_only_for(paths)
      end

      # Is child work splitting defined for model?
      # @param work [GenericWork, etc] A valid type of hyrax work
      # @return [Boolean]
      def self.iiif_print_split?(work:)
        # defined only if work has include IiifPrint.model_configuration with pdf_split_child_model
        work.try(:iiif_print_config)&.pdf_split_child_model ? true : false
      end

      # Are there any PDF files?
      # @param paths [Array<String>] candidate file paths
      # @return [Boolean]
      def self.pdfs?(paths:)
        !pdfs_only_for(paths).empty?
      end

      # Submit the job to split PDF into child works. The job also receives
      # the result of {.count_existing_pdfs} as its last argument.
      # @param work [GenericWork, etc] A valid type of hyrax work
      # @param file_locations [Array<String>] paths to PDF attachments
      # @param user [User]
      # @param admin_set_id [String] forwarded to the job unchanged
      def self.queue_job(work:, file_locations:, user:, admin_set_id:)
        work.iiif_print_config.pdf_splitter_job.perform_later(
          work,
          file_locations,
          user,
          admin_set_id,
          count_existing_pdfs(work)
        )
      end

      # @return [Array] the given id(s) as an array, without blank entries
      def self.filter_file_ids(input)
        Array.wrap(input).select(&:present?)
      end

      # Given Hyrax::Upload object, return path to file on local filesystem
      def self.upload_path(upload)
        # so many layers to this onion:
        upload.file.file.file
      end

      # TODO: implement a method to count existing PDFs on a work to support
      # adding more PDFs to an existing work.
      # @return [Integer] currently always 0
      def self.count_existing_pdfs(_work)
        0
      end

      # TODO: Consider other methods to identify a PDF file.
      # This sub-selection may need to be moved to use mimetype if there
      # is a need to support paths not ending in .pdf (i.e. remote_urls)
      #
      # The extension is compared case-insensitively so that mixed-case names
      # such as "Scan.Pdf" are recognized as well.
      # @param paths [Array<String>]
      # @return [Array<String>] the paths ending in ".pdf" (any letter case)
      def self.pdfs_only_for(paths)
        paths.select { |path| path.downcase.end_with?('.pdf') }
      end
    end
  end
end
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
require 'open3'
|
|
2
|
+
require 'securerandom'
|
|
3
|
+
require 'tmpdir'
|
|
4
|
+
require 'iiif_print/split_pdfs/pdf_image_extraction_service'
|
|
5
|
+
|
|
6
|
+
module IiifPrint
  module SplitPdfs
    # Converts every page of a PDF into a TIFF file (via Ghostscript) inside a
    # temporary directory, and enumerates the resulting file paths.
    #
    # External commands are invoked with an argument vector (no shell), so PDF
    # paths containing spaces or shell metacharacters are passed through intact.
    class PagesIntoImagesService
      include Enumerable

      # @param path [String, #to_s] path to the source PDF
      def initialize(path)
        @baseid = SecureRandom.uuid
        @pdfpath = path
        @info = nil
        @entries = nil
        @tmpdir = nil
        @size = nil
        @pagecount = nil
        @pdftext = nil
        @compression = 'lzw'
      end

      # @return [IiifPrint::SplitPdfs::PdfImageExtractionService] memoized
      #   image-listing helper for the PDF
      def pdfinfo
        @info ||= IiifPrint::SplitPdfs::PdfImageExtractionService.new(@pdfpath)
      end

      # TODO: put this test somewhere to prevent invalid pdfs from crashing the image service.
      # @return [Boolean] true when any of the image details could not be read
      def invalid_pdf?
        pdfinfo.color.include?(nil) ||
          pdfinfo.width.nil? ||
          pdfinfo.height.nil? ||
          pdfinfo.entries.length.zero?
      end

      # @return [String] memoized temporary directory holding the page images
      def tmpdir
        @tmpdir ||= Dir.mktmpdir
      end

      # @return [String] Ghostscript color TIFF device name
      def colordevice(channels, bpc)
        bits = bpc * channels
        # will be either 8bpc/16bpd color TIFF,
        # with any CMYK source transformed to 8bpc RGB
        bits = 24 unless [24, 48].include? bits
        "tiff#{bits}nc"
      end

      # Picks the Ghostscript output device from the PDF's image color details.
      # Side effect: switches @compression to 'g4' for 1-bit gray sources.
      # @return [String]
      def gsdevice
        color, channels, bpc = pdfinfo.color
        device = nil
        # CCITT Group 4 Black and White, if applicable:
        if color == 'gray' && bpc == 1
          device = 'tiffg4'
          @compression = 'g4'
        end
        # 8 Bit Grayscale, if applicable:
        device = 'tiffgray' if color == 'gray' && bpc > 1
        # otherwise color:
        device = colordevice(channels, bpc) if device.nil?
        device
      end

      # TODO: this method came from newspaper gem but appears to be unused. Is it needed anywhere?
      # def gstext
      #   cmd = "gs -q -dNOPAUSE -dBATCH -sDEVICE=txtwrite " \
      #     "-sOutputFile=- -f #{@pdfpath}"
      #   Open3.popen3(cmd) do |_stdin, stdout, _stderr, _wait_thr|
      #     @pdftext = stdout.read
      #   end
      #   @pdftext
      # end

      # Page count as reported on the "Pages:" line of `pdfinfo`; the command
      # is run once and the result memoized.
      # @return [Integer]
      # @raise [NoMethodError] when the output has no "Pages:" line
      def pagecount
        @pagecount ||= begin
          # capture3 drains stderr as well, so a chatty child cannot block.
          stdout, _stderr, _status = Open3.capture3('pdfinfo', @pdfpath.to_s)
          pages_line = stdout.each_line.find { |line| line.start_with?('Pages:') }
          pages_line.split.last.to_i
        end
      end

      # @return [Boolean] true for a single 10mp+ image per page
      def looks_scanned
        max_image_px = pdfinfo.width * pdfinfo.height
        single_image_per_page = pdfinfo.entries.length == pagecount
        # single 10mp+ image per page?
        single_image_per_page && max_image_px > 1024 * 1024 * 10
      end

      # @return [Integer] resolution used for rendering
      def ppi
        # 400 dpi for something that does not look like scanned media:
        return 400 unless looks_scanned
        # For scanned media, defer to detected image PPI:
        pdfinfo.ppi
      end

      # ghostscript convert all pages to TIFF
      # @return [Array<String>] expected page image paths, one per "Page " line
      #   printed by Ghostscript
      def gsconvert
        output_base = File.join(tmpdir, "#{@baseid}-page%d.tiff")
        stdout, _stderr, _status = Open3.capture3(*gs_command(output_base))
        @size = stdout.each_line.count { |line| line.start_with?('Page ') }
        # Return an array of expected filenames
        (1..@size).map { |n| File.join(tmpdir, "#{@baseid}-page#{n}.tiff") }
      end

      # entries for each page
      # @return [Array<String>] memoized result of {#gsconvert}
      def entries
        @entries ||= gsconvert
      end

      # Yields each page image path; returns an Enumerator without a block.
      def each(&block)
        return enum_for(:each) unless block_given?
        entries.each(&block)
      end

      private

      # Argument vector for the Ghostscript conversion.
      # @param output_base [String] output path template containing %d
      # @return [Array<String>]
      def gs_command(output_base)
        # gsdevice must run before @compression is read, since it may change it.
        device = gsdevice
        [
          'gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{device}",
          '-dTextAlphaBits=4', "-sCompression=#{@compression}",
          "-sOutputFile=#{output_base}", "-r#{ppi}", '-f', @pdfpath.to_s
        ]
      end
    end
  end
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
require 'open3'
|
|
2
|
+
require 'mini_magick'
|
|
3
|
+
|
|
4
|
+
module IiifPrint
  module SplitPdfs
    # Uses poppler 0.19+ pdfimages command to extract image
    # listing metadata from PDF files.
    # For dpi extraction, falls back to calculating using MiniMagick,
    # if necessary.
    class PdfImageExtractionService
      # class constant column numbers
      COL_WIDTH = 3
      COL_HEIGHT = 4
      COL_COLOR = 5
      COL_CHANNELS = 6
      COL_BITS = 7
      # only poppler 0.25+ has this column in output:
      COL_XPPI = 12

      # @param path [String, #to_s] path to the PDF to inspect
      def initialize(path)
        @path = path
        # Argument vector rather than a shell string, so paths with spaces or
        # shell metacharacters are handed to pdfimages intact.
        @cmd = ['pdfimages', '-list', path.to_s]
        @output = nil
        @entries = nil
      end

      # Runs `pdfimages -list` (once) and returns its output lines without the
      # two leading header lines.
      # @return [Array<String>] data rows; empty when the command printed
      #   fewer than three lines
      def process
        # call just once
        if @output.nil?
          stdout, _stderr, _status = Open3.capture3(*@cmd)
          @output = stdout.split("\n")
        end
        # drop (unlike slice with a computed length) yields [] for short output
        @output.drop(2)
      end

      # @return [Array<Array<String>>] each data row split into its
      #   whitespace-separated columns (memoized)
      def entries
        @entries ||= process.map(&:split)
      end

      # @param i [Integer] column index
      # @yield [String, nil] optional transformation applied to each cell
      # @return [Array] column +i+ of every entry
      def selectcolumn(i, &block)
        column = entries.map { |e| e[i] }
        block ? column.map(&block) : column
      end

      # @return [Integer, nil] largest image width, nil without entries
      def width
        selectcolumn(COL_WIDTH, &:to_i).max
      end

      # @return [Integer, nil] largest image height, nil without entries
      def height
        selectcolumn(COL_HEIGHT, &:to_i).max
      end

      # @return [Array(String, Integer, Integer)] description, channels, bits
      def color
        # desc is either 'gray', 'cmyk', 'rgb', but 1-bit gray is black/white
        #   so caller may want all of this information, and in case of
        #   mixed color spaces across images, this returns maximum
        desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
        channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
        bits = entries.map { |e| e[COL_BITS].to_i }.max
        [desc, channels, bits]
      end

      # @return [Integer] pixels per inch
      def ppi
        if entries[0].size <= 12
          # poppler < 0.25
          pdf = MiniMagick::Image.open(@path)
          width_points = pdf.width
          width_px = width
          return (72 * width_px / width_points).to_i
        end
        # with poppler 0.25+, pdfimages just gives us this:
        selectcolumn(COL_XPPI, &:to_i).max
      end
    end
  end
end
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
require 'active_support/core_ext/module/delegation'
|
|
2
|
+
require 'json'
|
|
3
|
+
require 'nokogiri'
|
|
4
|
+
|
|
5
|
+
module IiifPrint
  # Module for text extraction
  module TextExtraction
    # Class to obtain plain text and JSON word-coordinates from ALTO source
    class AltoReader
      attr_accessor :source, :doc_stream
      delegate :text, to: :doc_stream

      # SAX Document Stream class to gather text and word tokens from ALTO
      class AltoDocStream < Nokogiri::XML::SAX::Document
        attr_accessor :text, :words

        # @param image_width [Integer, nil] pixel width of the page image;
        #   when nil, coordinates are left unscaled.
        def initialize(image_width = nil)
          super()
          # scaling matters:
          @image_width = image_width
          @scaling = 1.0 # pt to px, if ALTO using points
          # plain text buffer:
          @text = ''
          # list of word hash, containing word+coord:
          @words = []
        end

        # Return coordinates from String element attribute hash
        #
        # @param attrs [Hash] hash containing ALTO `String` element attributes.
        # @return [Array] Array of position x, y, width, height in px.
        def s_coords(attrs)
          height = scale_value((attrs['HEIGHT'] || 0).to_i)
          width = scale_value((attrs['WIDTH'] || 0).to_i)
          hpos = scale_value((attrs['HPOS'] || 0).to_i)
          vpos = scale_value((attrs['VPOS'] || 0).to_i)
          [hpos, vpos, width, height]
        end

        # Derive the page-to-image scaling factor from the element's WIDTH
        # attribute. Scaling is left untouched when no image width was given,
        # when the element has no WIDTH attribute, when that width is not a
        # positive number, or when it already equals the image width.
        #
        # @param attrs [Array] Array of key, value pair Arrays.
        def compute_scaling(attrs)
          return if @image_width.nil?
          match = attrs.find { |e| e[0].casecmp?('WIDTH') }
          # find returns nil (not an empty Array) when nothing matches
          return if match.nil?
          page_width = match[1].to_i
          # a zero width would yield a 0.0 factor and break scale_value
          return unless page_width.positive?
          return if @image_width == page_width
          @scaling = page_width / @image_width.to_f
        end

        # @param v [Numeric] value in page units
        # @return [Integer] value divided by the scaling factor, truncated
        def scale_value(v)
          (v / @scaling).to_i
        end

        # Callback for element start, implementation of which ignores
        # non-String elements (apart from reading scaling off `Page`).
        #
        # @param name [String] element name.
        # @param attrs [Array] Array of key, value pair Arrays.
        def start_element(name, attrs = [])
          values = attrs.to_h
          compute_scaling(attrs) if name == 'Page'
          return if name != 'String'
          token = values['CONTENT']
          @text << token
          @words << {
            word: token,
            coordinates: s_coords(values)
          }
        end

        # Callback for element end, used here to manage endings of lines and
        # blocks.
        #
        # @param name [String] element name.
        def end_element(name)
          @text << " " if name == 'String'
          @text << "\n" if name == 'TextBlock'
          @text << "\n" if name == 'TextLine'
        end

        # Callback for completion of parsing ALTO, used to normalize generated
        # text content (strip unneeded whitespace incidental to output).
        def end_document
          # postprocess @text to remove trailing spaces on lines
          @text = @text.split("\n").map(&:strip).join("\n")
          # remove trailing whitespace at end of buffer
          @text.strip!
        end
      end

      # Construct with either a path to an ALTO file or the ALTO XML itself,
      # and parse the document immediately.
      #
      # @param xml [String] path to an XML file, or XML source
      # @param image_width [Integer, nil] pixel width of the page image
      # @param image_height [Integer, nil] pixel height of the page image
      def initialize(xml, image_width = nil, image_height = nil)
        @source = isxml?(xml) ? xml : File.read(xml)
        @image_width = image_width
        @image_height = image_height
        @doc_stream = AltoDocStream.new(image_width)
        parser = Nokogiri::XML::SAX::Parser.new(doc_stream)
        parser.parse(@source)
      end

      # Determine if source parameter is path or xml
      #
      # @param xml [String] either path to xml file or xml source
      # @return [true, false] true if string appears to be XML source, not path
      def isxml?(xml)
        xml.lstrip.start_with?('<')
      end

      # Output JSON flattened word coordinates
      #
      # @return [String] JSON serialization of flattened word coordinates
      def json
        words = @doc_stream.words
        IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
          words: words,
          width: @image_width,
          height: @image_height
        )
      end
    end
  end
end
|