iiif_print 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.coveralls.yml +2 -0
- data/.env +5 -0
- data/.fcrepo_wrapper +4 -0
- data/.github/release.yml +20 -0
- data/.github/workflows/branches.yml +24 -0
- data/.github/workflows/build-lint-test-action.yaml +33 -0
- data/.github/workflows/release_labels.yml +25 -0
- data/.gitignore +52 -0
- data/.rubocop.yml +177 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +49 -0
- data/CONTRIBUTING.md +181 -0
- data/Dockerfile +15 -0
- data/Gemfile +52 -0
- data/LICENSE +203 -0
- data/README.md +203 -0
- data/Rakefile +38 -0
- data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +56 -0
- data/app/assets/config/iiif_print_manifest.js +2 -0
- data/app/assets/images/iiif_print/.keep +0 -0
- data/app/assets/javascripts/iiif_print/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/iiif_print/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/iiif_print.js +3 -0
- data/app/assets/stylesheets/iiif_print/_iiif_print.scss +4 -0
- data/app/assets/stylesheets/iiif_print/_issue_search.scss +13 -0
- data/app/assets/stylesheets/iiif_print/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/iiif_print/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/iiif_print/_search_results.scss +6 -0
- data/app/helpers/hyrax/iiif_helper.rb +22 -0
- data/app/helpers/iiif_print/application_helper.rb +5 -0
- data/app/helpers/iiif_print_helper.rb +64 -0
- data/app/indexers/concerns/iiif_print/child_indexer.rb +34 -0
- data/app/indexers/concerns/iiif_print/file_set_indexer.rb +29 -0
- data/app/mailers/iiif_print/application_mailer.rb +8 -0
- data/app/models/concerns/iiif_print/set_child_flag.rb +29 -0
- data/app/models/concerns/iiif_print/solr/document.rb +47 -0
- data/app/models/iiif_print/application_record.rb +6 -0
- data/app/models/iiif_print/derivative_attachment.rb +8 -0
- data/app/models/iiif_print/iiif_search_response_decorator.rb +17 -0
- data/app/models/iiif_print/ingest_file_relation.rb +14 -0
- data/app/models/iiif_print/pending_relationship.rb +7 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +10 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +33 -0
- data/app/presenters/iiif_print/work_show_presenter_decorator.rb +29 -0
- data/app/renderers/hyrax/renderers/faceted_attribute_renderer_decorator.rb +18 -0
- data/app/search_builders/concerns/iiif_print/exclude_models.rb +17 -0
- data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +14 -0
- data/app/services/iiif_print/manifest_builder_service_behavior.rb +97 -0
- data/app/services/iiif_print/pluggable_derivative_service.rb +120 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/hyrax/base/_representative_media.html.erb +9 -0
- data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +8 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/iiif_print.de.yml +148 -0
- data/config/locales/iiif_print.en.yml +119 -0
- data/config/locales/iiif_print.es.yml +148 -0
- data/config/locales/iiif_print.fr.yml +149 -0
- data/config/locales/iiif_print.it.yml +142 -0
- data/config/locales/iiif_print.pt-BR.yml +148 -0
- data/config/locales/iiif_print.zh.yml +142 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/fits.xml +55 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +11 -0
- data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +11 -0
- data/docker-compose.yml +129 -0
- data/iiif_print.gemspec +43 -0
- data/lib/generators/iiif_print/assets_generator.rb +29 -0
- data/lib/generators/iiif_print/catalog_controller_generator.rb +32 -0
- data/lib/generators/iiif_print/install_generator.rb +52 -0
- data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +22 -0
- data/lib/generators/iiif_print/templates/iiif_print.scss +1 -0
- data/lib/iiif_print/base_derivative_service.rb +113 -0
- data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +84 -0
- data/lib/iiif_print/catalog_search_builder.rb +31 -0
- data/lib/iiif_print/configuration.rb +99 -0
- data/lib/iiif_print/data/fileset_helper.rb +25 -0
- data/lib/iiif_print/data/path_helper.rb +40 -0
- data/lib/iiif_print/data/work_derivatives.rb +323 -0
- data/lib/iiif_print/data/work_file.rb +92 -0
- data/lib/iiif_print/data/work_files.rb +199 -0
- data/lib/iiif_print/data.rb +35 -0
- data/lib/iiif_print/engine.rb +77 -0
- data/lib/iiif_print/errors.rb +9 -0
- data/lib/iiif_print/image_tool.rb +119 -0
- data/lib/iiif_print/jobs/application_job.rb +8 -0
- data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +107 -0
- data/lib/iiif_print/jobs/create_relationships_job.rb +78 -0
- data/lib/iiif_print/jp2_derivative_service.rb +118 -0
- data/lib/iiif_print/jp2_image_metadata.rb +81 -0
- data/lib/iiif_print/lineage_service.rb +41 -0
- data/lib/iiif_print/metadata.rb +125 -0
- data/lib/iiif_print/pdf_derivative_service.rb +42 -0
- data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +75 -0
- data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +130 -0
- data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +85 -0
- data/lib/iiif_print/text_extraction/alto_reader.rb +123 -0
- data/lib/iiif_print/text_extraction/hocr_reader.rb +172 -0
- data/lib/iiif_print/text_extraction/page_ocr.rb +87 -0
- data/lib/iiif_print/text_extraction/render_alto.rb +84 -0
- data/lib/iiif_print/text_extraction/word_coords_builder.rb +38 -0
- data/lib/iiif_print/text_extraction.rb +11 -0
- data/lib/iiif_print/text_extraction_derivative_service.rb +47 -0
- data/lib/iiif_print/text_formats_from_alto_service.rb +77 -0
- data/lib/iiif_print/tiff_derivative_service.rb +50 -0
- data/lib/iiif_print/version.rb +3 -0
- data/lib/iiif_print/works_controller_behavior.rb +9 -0
- data/lib/iiif_print.rb +136 -0
- data/lib/tasks/set_child_works.rake +22 -0
- data/spec/.keep.txt +1 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/uploaded_txt_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/helpers/hyrax/iiif_helper_spec.rb +65 -0
- data/spec/helpers/iiif_print_helper_spec.rb +43 -0
- data/spec/iiif_print/base_derivative_service_spec.rb +11 -0
- data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +51 -0
- data/spec/iiif_print/catalog_search_builder_spec.rb +60 -0
- data/spec/iiif_print/configuration_spec.rb +67 -0
- data/spec/iiif_print/data/work_derivatives_spec.rb +245 -0
- data/spec/iiif_print/data/work_file_spec.rb +99 -0
- data/spec/iiif_print/data/work_files_spec.rb +237 -0
- data/spec/iiif_print/image_tool_spec.rb +109 -0
- data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +30 -0
- data/spec/iiif_print/jobs/create_relationships_job_spec.rb +17 -0
- data/spec/iiif_print/jp2_image_metadata_spec.rb +37 -0
- data/spec/iiif_print/lineage_service_spec.rb +13 -0
- data/spec/iiif_print/metadata_spec.rb +115 -0
- data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +6 -0
- data/spec/iiif_print/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +45 -0
- data/spec/iiif_print/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/iiif_print/text_extraction/render_alto_spec.rb +54 -0
- data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +44 -0
- data/spec/iiif_print_spec.rb +51 -0
- data/spec/misc_shared.rb +111 -0
- data/spec/models/iiif_print/derivative_attachment_spec.rb +37 -0
- data/spec/models/iiif_print/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +19 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +49 -0
- data/spec/services/iiif_print/jp2_derivative_service_spec.rb +59 -0
- data/spec/services/iiif_print/pdf_derivative_service_spec.rb +66 -0
- data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +178 -0
- data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +127 -0
- data/spec/services/iiif_print/tiff_derivative_service_spec.rb +65 -0
- data/spec/spec_helper.rb +181 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/support/iiif_print_models.rb +127 -0
- data/spec/test_app_templates/blacklight.yml +9 -0
- data/spec/test_app_templates/fedora.yml +15 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +40 -0
- data/spec/test_app_templates/redis.yml +9 -0
- data/spec/test_app_templates/solr/conf/schema.xml +362 -0
- data/spec/test_app_templates/solr/conf/solrconfig.xml +322 -0
- data/spec/test_app_templates/solr.yml +7 -0
- data/tasks/iiif_print_dev.rake +34 -0
- data/tmp/.keep +0 -0
- metadata +605 -0
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'active_fedora'
|
2
|
+
require 'hyrax'
|
3
|
+
require 'blacklight_iiif_search'
|
4
|
+
|
5
|
+
module IiifPrint
|
6
|
+
# module constants:
|
7
|
+
GEM_PATH = Gem::Specification.find_by_name("iiif_print").gem_dir
|
8
|
+
|
9
|
+
# Engine Class
|
10
|
+
class Engine < ::Rails::Engine
|
11
|
+
isolate_namespace IiifPrint
|
12
|
+
|
13
|
+
# rubocop:disable Metrics/BlockLength
|
14
|
+
config.to_prepare do
|
15
|
+
# We don't have a hard requirement of Bulkrax but in our experience, lingering on earlier
|
16
|
+
# versions can reintroduce Bulkrax bugs, and break assumptions of ours, that have since been resolved.
|
17
|
+
# Very early versions of Bulkrax do not have VERSION defined
|
18
|
+
if defined?(Bulkrax) && !ENV.fetch("SKIP_IIIF_PRINT_BULKRAX_VERSION_REQUIREMENT", false)
|
19
|
+
if !defined?(Bulkrax::VERSION) || (Bulkrax::VERSION.to_i < 5)
|
20
|
+
raise "IiifPrint does not have a hard dependency on Bulkrax, " \
|
21
|
+
"but if you have Bulkrax installed we recommend at least version 5.0.0. " \
|
22
|
+
"To ignore this recommendation please add SKIP_IIIF_PRINT_BULKRAX_VERSION_REQUIREMENT " \
|
23
|
+
"to your ENV variables."
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# Inject PluggableDerivativeService ahead of Hyrax default.
|
28
|
+
# This wraps Hyrax default, but allows multiple valid services
|
29
|
+
# to be configured, instead of just the _first_ valid service.
|
30
|
+
#
|
31
|
+
# To configure specific services, inject each service, in desired order
|
32
|
+
# to IiifPrint::PluggableDerivativeService.plugins array.
|
33
|
+
|
34
|
+
Hyrax::DerivativeService.services.unshift(
|
35
|
+
IiifPrint::PluggableDerivativeService
|
36
|
+
)
|
37
|
+
|
38
|
+
Hyrax::IiifManifestPresenter.prepend(IiifPrint::IiifManifestPresenterBehavior)
|
39
|
+
Hyrax::IiifManifestPresenter::Factory.prepend(IiifPrint::IiifManifestPresenterFactoryBehavior)
|
40
|
+
Hyrax::ManifestBuilderService.prepend(IiifPrint::ManifestBuilderServiceBehavior)
|
41
|
+
Hyrax::Renderers::FacetedAttributeRenderer.prepend(Hyrax::Renderers::FacetedAttributeRendererDecorator)
|
42
|
+
Hyrax::WorksControllerBehavior.prepend(IiifPrint::WorksControllerBehaviorDecorator)
|
43
|
+
Hyrax::WorkShowPresenter.prepend(IiifPrint::WorkShowPresenterDecorator)
|
44
|
+
|
45
|
+
IiifPrint::ChildIndexer.decorate_work_types!
|
46
|
+
IiifPrint::FileSetIndexer.decorate(Hyrax::FileSetIndexer)
|
47
|
+
|
48
|
+
::BlacklightIiifSearch::IiifSearchResponse.prepend(IiifPrint::IiifSearchResponseDecorator)
|
49
|
+
::BlacklightIiifSearch::IiifSearchAnnotation.prepend(IiifPrint::BlacklightIiifSearch::AnnotationDecorator)
|
50
|
+
Hyrax::Actors::FileSetActor.prepend(IiifPrint::Actors::FileSetActorDecorator)
|
51
|
+
|
52
|
+
# Extending the presenter to the base url which includes the protocol.
|
53
|
+
# We need the base url to render the facet links and normalize the interface.
|
54
|
+
Hyrax::IiifManifestPresenter.send(:attr_accessor, :base_url)
|
55
|
+
Hyrax::IiifManifestPresenter::DisplayImagePresenter.send(:attr_accessor, :base_url)
|
56
|
+
# Extending this class because there is an #ability= but not #ability and this definition
|
57
|
+
# mirrors the Hyrax::IiifManifestPresenter#ability.
|
58
|
+
module Hyrax::IiifManifestPresenter::DisplayImagePresenterDecorator
  # Reader counterpart to the presenter's existing `ability=` writer.
  #
  # The constant is fully qualified on purpose: this module is opened with the
  # compact `A::B::C` form from inside IiifPrint::Engine, so a bare
  # `NullAbility` would be looked up in this decorator, IiifPrint::Engine,
  # IiifPrint and Object only -- never in Hyrax::IiifManifestPresenter.
  # NOTE(review): assumes NullAbility is nested in Hyrax::IiifManifestPresenter,
  # as used by Hyrax::IiifManifestPresenter#ability -- confirm against Hyrax.
  #
  # @return [#can?] the assigned ability, or a null ability when none was set
  def ability
    @ability ||= Hyrax::IiifManifestPresenter::NullAbility.new
  end
end
|
63
|
+
Hyrax::IiifManifestPresenter::DisplayImagePresenter.prepend(Hyrax::IiifManifestPresenter::DisplayImagePresenterDecorator)
|
64
|
+
|
65
|
+
Hyrax.config do |config|
|
66
|
+
config.callback.set(:after_create_fileset) do |file_set, user|
|
67
|
+
IiifPrint.config.handle_after_create_fileset(file_set, user)
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
config.after_initialize do
|
73
|
+
IiifPrint::Solr::Document.decorate(SolrDocument)
|
74
|
+
end
|
75
|
+
# rubocop:enable Metrics/BlockLength
|
76
|
+
end
|
77
|
+
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
module IiifPrint
|
5
|
+
class ImageTool
|
6
|
+
attr_accessor :path, :ftype
|
7
|
+
|
8
|
+
# @param path [String] path of the image file this tool inspects/converts
def initialize(path)
  @metadata = nil # filled in lazily by #metadata
  @path = path
  # leading bytes of the file, consulted by the file-type predicates
  @ftype = magic
end
|
13
|
+
|
14
|
+
# @return [Hash] hash with following symbol keys, and respectively
|
15
|
+
# typed String and/or Integer values.
|
16
|
+
# :width, :height — both in Integer px units
|
17
|
+
# :color — (String enumerated from 'gray', 'monochrome', 'color')
|
18
|
+
# :num_components - Integer, number of channels
|
19
|
+
# :bits_per_component — Integer, bits per channel (e.g. 8 vs. 1)
|
20
|
+
# :content_type — RFC 2045 MIME type
|
21
|
+
def metadata
  # Memoized: the file is only inspected the first time this is called.
  # JP2 sources use the dedicated JP2 reader, everything else goes through
  # ImageMagick `identify`.
  @metadata ||= (jp2? ? jp2_metadata : identify_metadata)
end
|
25
|
+
|
26
|
+
# Convert source image to image at destination path, inferring file type
|
27
|
+
# from destination file extension. In case of JP2 files, create
|
28
|
+
# intermediate file using OpenJPEG 2000 that ImageMagick can use.
|
29
|
+
# Only outputs monochrome output if monochrome is true, destination
|
30
|
+
# format is TIFF.
|
31
|
+
# @param destination [String] Path to output / destination file
|
32
|
+
# @param monochrome [Boolean] true if monochrome output, otherwise false
|
33
|
+
def convert(destination, monochrome = false)
  raise 'JP2 output not yet supported' if destination.end_with?('jp2')
  return convert_image(@path, destination, monochrome) unless jp2?

  # ImageMagick cannot read the JP2 directly here, so decode to a temporary
  # TIFF first and convert from that.
  intermediate = jp2_to_tiff(@path)
  begin
    convert_image(intermediate, destination, monochrome)
  ensure
    # The intermediate is only needed for the conversion above; remove it (and
    # its scratch directory, when that is left empty) so repeated conversions
    # do not accumulate temp files.
    File.delete(intermediate) if File.exist?(intermediate)
    scratch_dir = File.dirname(intermediate)
    Dir.rmdir(scratch_dir) if Dir.exist?(scratch_dir) && Dir.empty?(scratch_dir)
  end
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
# Run ImageMagick `convert` from source to destination.
#
# The command is executed as an argument vector (no shell), so paths
# containing spaces or shell metacharacters are passed through verbatim.
#
# @param source [String] path to the input image
# @param destination [String] path to the output image
# @param monochrome [Boolean] request 1-bit Group4 output; only honored
#   when the last four characters of destination contain 'tif'
# @return [String] standard output of the `convert` command
def convert_image(source, destination, monochrome)
  # `slice` is nil for destinations shorter than four characters
  tiff_destination = destination.slice(-4, 4).to_s.include?('tif')
  options = []
  options = %w[-depth 1 -monochrome -compress Group4 -type bilevel] if monochrome && tiff_destination
  stdout, _status = Open3.capture2('convert', source, *options, destination)
  stdout
end
|
48
|
+
|
49
|
+
# Decode a JP2 file into a TIFF inside a freshly created temp directory,
# using the OpenJPEG `opj_decompress` binary.
#
# The command is executed as an argument vector (no shell), so the source
# path is never re-parsed by a shell.
#
# @param source [String] path to the JP2 file
# @return [String] path of the intermediate TIFF; the caller owns the file
#   and its directory
def jp2_to_tiff(source)
  intermediate_path = File.join(Dir.mktmpdir, 'intermediate.tif')
  Open3.capture2('opj_decompress', '-i', source, '-o', intermediate_path)
  intermediate_path
end
|
55
|
+
|
56
|
+
# Technical metadata for a JP2 source, read via IiifPrint::JP2ImageMetadata
# and tagged with the JP2 content type.
#
# @return [Hash] the reader's technical metadata plus :content_type
def jp2_metadata
  IiifPrint::JP2ImageMetadata.new(path).technical_metadata.tap do |technical|
    technical[:content_type] = 'image/jp2'
  end
end
|
61
|
+
|
62
|
+
# Find the first "key: value" line of `identify` output whose (downcased,
# stripped) text starts with key, and return its value.
#
# Each line is scrubbed once and the scrubbed text is used for both matching
# and splitting, so invalid byte sequences in the tool output are replaced
# rather than handled inconsistently between the two steps.
#
# @param lines [Array<String>] lines of `identify -verbose` output
# @param key [String] lower-case prefix to look for
# @return [String, nil] text after the last ':' on the matching line,
#   stripped of surrounding whitespace; nil when no line matches
def im_line_select(lines, key)
  lines.each do |raw|
    line = raw.scrub.strip
    next unless line.downcase.start_with?(key)
    return line.split(':').last.to_s.strip
  end
  nil
end
|
69
|
+
|
70
|
+
# @return [Array(Integer, Integer)] width, height in Integer px units
|
71
|
+
def im_identify_geometry(lines)
  # The geometry value looks like "WxH+X+Y"; keep only the "WxH" part.
  dimensions, = im_line_select(lines, 'geometry').split('+')
  dimensions.split('x').map(&:to_i)
end
|
75
|
+
|
76
|
+
# @return [Array<String>] lines of output from imagemagick `identify`
|
77
|
+
def im_identify
  # Executed as an argument vector (no shell) so that a path containing
  # spaces or shell metacharacters is passed to `identify` verbatim.
  stdout, _status = Open3.capture2('identify', '-verbose', path)
  stdout.lines
end
|
81
|
+
|
82
|
+
# MIME type of the source.
#
# @param lines [Array<String>] lines of `identify -verbose` output
# @return [String, nil] 'application/pdf' for PDF sources, otherwise the
#   value of the 'mime type' line (nil when that line is absent)
def im_mime(lines)
  # workaround older imagemagick bug
  pdf? ? 'application/pdf' : im_line_select(lines, 'mime type')
end
|
86
|
+
|
87
|
+
# Fill in the color-related metadata keys from `identify` output.
#
# @param lines [Array<String>] lines of `identify -verbose` output
# @param result [Hash] metadata hash to update in place; receives
#   :num_components, :color and :bits_per_component
def populate_im_color!(lines, result)
  bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
  color = im_line_select(lines, 'colorspace') == 'Gray' ? 'gray' : 'color'
  # im_line_select compares against the *downcased* line, so the key has to be
  # lower case (a capitalized key can never match). The trailing colon keeps
  # unrelated lines that merely begin with "alpha" from being counted.
  has_alpha = !im_line_select(lines, 'alpha:').nil?
  result[:num_components] = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
  result[:color] = bpc == 1 ? 'monochrome' : color
  result[:bits_per_component] = bpc
end
|
96
|
+
|
97
|
+
# Return metadata by means of imagemagick identify
|
98
|
+
def identify_metadata
  # One `identify` run feeds every derived value below.
  lines = im_identify
  width, height = im_identify_geometry(lines)
  result = { width: width, height: height, content_type: im_mime(lines) }
  populate_im_color!(lines, result)
  result
end
|
106
|
+
|
107
|
+
# Read the leading bytes of the file for file-type sniffing.
#
# Read in binary mode so the bytes are never newline- or encoding-translated,
# and coerce the nil that a length-limited read yields for an empty file to
# '' so the String predicates applied to the result do not raise.
#
# @return [String] up to the first 23 bytes of the file at @path
def magic
  File.binread(@path, 23, 0).to_s
end
|
110
|
+
|
111
|
+
# @return [Boolean] true when the sniffed leading bytes end with the
#   'ftypjp2' marker
def jp2?
  @ftype.end_with?('ftypjp2')
end
|
114
|
+
|
115
|
+
# @return [Boolean] true when the file's leading bytes (re-read via #magic)
#   start with the '%PDF-' signature
def pdf?
  magic.start_with?('%PDF-')
end
|
118
|
+
end
|
119
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
module IiifPrint
|
2
|
+
module Jobs
|
3
|
+
class ChildWorksFromPdfJob < IiifPrint::Jobs::ApplicationJob
|
4
|
+
# Break a pdf into individual pages
|
5
|
+
# @param parent_work
|
6
|
+
# @param pdf_paths: [<Array => String>] paths to pdfs
|
7
|
+
# @param user: [User]
|
8
|
+
# @param admin_set_id: [<String>]
|
9
|
+
# @param prior_pdfs: [<Integer>] count of pdfs already on parent work
|
10
|
+
def perform(parent_work, pdf_paths, user, admin_set_id, prior_pdfs)
  @parent_work = parent_work
  @child_admin_set_id = admin_set_id
  child_model = @parent_work.iiif_print_config.pdf_split_child_model

  # Split every PDF; the position in pdf_paths (offset by prior_pdfs inside
  # split_pdf) determines each PDF's sequence number.
  pdf_paths.each.with_index do |pdf_path, position|
    split_pdf(pdf_path, position, user, prior_pdfs, child_model)
  end

  # Schedule the job that links the child works to the parent. It is delayed
  # by ten minutes; the child works themselves are created by separately
  # enqueued batch jobs.
  relationship_args = {
    user: user,
    parent_id: @parent_work.id,
    parent_model: @parent_work.class.to_s,
    child_model: child_model.to_s
  }
  IiifPrint::Jobs::CreateRelationshipsJob.set(wait: 10.minutes).perform_later(**relationship_args)

  # TODO: clean up image_files and pdf_paths
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
# Split one PDF into page images and enqueue creation of one child work per
# page.
#
# @param path [String] path to the PDF
# @param pdf_idx [Integer] position of this PDF within the current job
# @param user [User]
# @param prior_pdfs_count [Integer] count of pdfs already on the parent work
# @param child_model [Class, String] model used for the child works
def split_pdf(path, pdf_idx, user, prior_pdfs_count, child_model)
  splitter = @parent_work.iiif_print_config.pdf_splitter_service
  image_files = splitter.new(path).to_a
  return if image_files.blank?

  # Fills @uploaded_files and @child_work_titles for the batch job below.
  prepare_import_data(pdf_idx + prior_pdfs_count, image_files, user)

  # One batch operation creates all the child works for this PDF.
  operation = Hyrax::BatchCreateOperation.create!(user: user, operation_type: "PDF Batch Create")
  work_attributes = attributes.merge(model: child_model.to_s).with_indifferent_access
  # Positional arguments: user, titles, resource_types (none), uploaded file
  # ids, attributes applied to every work (including :model), operation.
  BatchCreateJob.perform_later(user, @child_work_titles, {}, @uploaded_files, work_attributes, operation)
end
|
62
|
+
|
63
|
+
# Build the per-page import data for one PDF: uploads each page image,
# records its title, and stores a PendingRelationship row per page.
#
# Populates @uploaded_files (uploaded file ids as Strings) and
# @child_work_titles (file id => title).
#
# @param pdf_sequence [Integer] sequence number of the PDF on the parent
# @param image_files [Array<String>] paths of the page images, in page order
# @param user [User]
def prepare_import_data(pdf_sequence, image_files, user)
  @uploaded_files = []
  @child_work_titles = {}
  parent_title = @parent_work.title.first
  image_files.each_with_index do |image_path, page_idx|
    upload_id = create_uploaded_file(user, image_path).to_s
    page_title = set_title(parent_title, pdf_sequence, page_idx)
    @uploaded_files.push(upload_id)
    @child_work_titles[upload_id] = page_title
    # save child work info to create the member relationships
    PendingRelationship.create!(
      child_title: page_title,
      parent_id: @parent_work.id,
      child_order: sort_order(pdf_sequence, page_idx)
    )
  end
end
|
77
|
+
|
78
|
+
# Sort key stored as a pending relationship's child_order.
#
# Both numbers are zero-padded to a fixed width so that comparing the keys
# as strings gives the same order as comparing the numbers: without padding
# "0 10" sorts before "0 2", which would place page 11 ahead of page 3.
# NOTE(review): assumes child_order is a string column that is ordered
# textually -- confirm against the migration.
#
# @param pdf_sequence [Integer] sequence number of the PDF on the parent
# @param idx [Integer] zero-based page index within that PDF
# @return [String] e.g. "000003 000007"
def sort_order(pdf_sequence, idx)
  format('%06d %06d', pdf_sequence, idx)
end
|
81
|
+
|
82
|
+
# Persist a Hyrax::UploadedFile for the file at path, owned by user.
#
# @param user [User]
# @param path [String] path to the file to upload
# @return the id of the saved uploaded-file record
def create_uploaded_file(user, path)
  upload = Hyrax::UploadedFile.new.tap do |record|
    record.user_id = user.id
    record.file = CarrierWave::SanitizedFile.new(path)
    record.save!
  end
  upload.id
end
|
89
|
+
|
90
|
+
# Title for a child work: parent title plus one-based PDF and page numbers.
#
# @param title [String] title of the parent work
# @param pdf_sequence [Integer] zero-based sequence number of the PDF
# @param idx [Integer] zero-based page index within the PDF
# @return [String] e.g. "My Work: Pdf Nbr 1, Page 3"
def set_title(title, pdf_sequence, idx)
  "#{title}: Pdf Nbr #{pdf_sequence + 1}, Page #{idx + 1}"
end
|
95
|
+
|
96
|
+
# TODO: what attributes do we need to fill in from the parent work? What about AllinsonFlex?
|
97
|
+
# Attributes copied onto every child work: the target admin set plus the
# parent's creator, rights statement and visibility.
#
# @return [Hash] a new hash on every call
def attributes
  parent = @parent_work
  {
    admin_set_id: @child_admin_set_id.to_s,
    creator: parent.creator.to_a,
    rights_statement: parent.rights_statement.to_a,
    visibility: parent.visibility.to_s
  }
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module IiifPrint
|
2
|
+
module Jobs
|
3
|
+
# Links child works to their parent work, rescheduling itself until every pending child has been created
|
4
|
+
class CreateRelationshipsJob < IiifPrint::Jobs::ApplicationJob
|
5
|
+
# Link newly created child works to the parent
|
6
|
+
# @param user: [User] user
|
7
|
+
# @param parent_id: [<String>] parent work id
|
8
|
+
# @param parent_model: [<String>] parent model
|
9
|
+
# @param child_model: [<String>] child model
|
10
|
+
def perform(user:, parent_id:, parent_model:, child_model:)
  unless completed_child_data_for(parent_id, child_model)
    # Not every child exists yet: reschedule the job and end this one normally.
    #
    # TODO: Depending on how things shake out, we could be infinitely rescheduling this job.
    # Consider a time to live parameter.
    return reschedule(user: user, parent_id: parent_id, parent_model: parent_model, child_model: child_model)
  end

  # All children were found (completed_child_data_for loaded @child_works and
  # @pending_children): add the members, then drop the bookkeeping rows.
  parent_work = parent_model.constantize.find(parent_id)
  create_relationships(user: user, parent: parent_work, ordered_children: @child_works)
  @pending_children.each(&:destroy)
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
# load @child_works, and return true or false
|
28
|
+
def completed_child_data_for(parent_id, child_model)
  @child_works = []

  # find and sequence all pending children
  @pending_children = IiifPrint::PendingRelationship.where(parent_id: parent_id).order('child_order asc')

  @pending_children.each do |pending|
    # find by title... if any aren't found, the child works are not yet ready,
    # so stop at the first miss (works found before it stay in @child_works)
    matches = find_children_by_title_for(pending.child_title, child_model)
    return false if matches.empty?

    @child_works += matches
  end

  true
end
|
46
|
+
|
47
|
+
# Look up works of the given model by title.
#
# @param title [String] title to search for
# @param model [String] name of the model class to query
# @return the result of the model's `where(title: ...)` query
def find_children_by_title_for(title, model)
  # We should only find one, but there is no guarantee of that and `:where` returns an array.
  child_class = model.constantize
  child_class.where(title: title)
end
|
51
|
+
|
52
|
+
# Enqueue this same job again, ten minutes from now, with unchanged arguments.
def reschedule(user:, parent_id:, parent_model:, child_model:)
  job_args = { user: user, parent_id: parent_id, parent_model: parent_model, child_model: child_model }
  CreateRelationshipsJob.set(wait: 10.minutes).perform_later(**job_args)
end
|
60
|
+
|
61
|
+
# Attach the children to the parent as work members, in the given order,
# through the Hyrax actor stack.
#
# @param user [User] user whose Ability the update runs under
# @param parent the parent work
# @param ordered_children [Array] child works, in member order
def create_relationships(user:, parent:, ordered_children:)
  # work_members_attributes is keyed by position: "0" => { id: ... }, ...
  members = ordered_children.map(&:id).each_with_index.map do |child_id, position|
    [position.to_s, { id: child_id }]
  end.to_h
  attrs = { work_members_attributes: members }

  parent.try(:reindex_extent=, Hyrax::Adapters::NestingIndexAdapter::LIMITED_REINDEX)
  env = Hyrax::Actors::Environment.new(parent, Ability.new(user), attrs)
  Hyrax::CurationConcern.actor.update(env)

  # need to reindex all file_sets to make sure all ancestors are indexed
  ordered_children.each do |child_work|
    next unless child_work.respond_to?(:file_sets)

    child_work.file_sets.each(&:update_index)
  end
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'open3'
require 'shellwords'
|
2
|
+
|
3
|
+
module IiifPrint
|
4
|
+
class JP2DerivativeService < BaseDerivativeService
|
5
|
+
# OpenJPEG 2000 Command to make NDNP-compliant grayscale JP2:
|
6
|
+
CMD_GRAY = 'opj_compress -i %<source_file>s -o %<out_file>s ' \
|
7
|
+
'-d 0,0 -b 64,64 -n 6 -p RLCP -t 1024,1024 -I -M 1 ' \
|
8
|
+
'-r 64,53.821,45.249,40,32,26.911,22.630,20,16,14.286,' \
|
9
|
+
'11.364,10,8,6.667,5.556,4.762,4,3.333,2.857,2.500,2,' \
|
10
|
+
'1.667,1.429,1.190,1'.freeze
|
11
|
+
|
12
|
+
# OpenJPEG 2000 Command to make RGB JP2:
|
13
|
+
CMD_COLOR = 'opj_compress -i %<source_file>s -o %<out_file>s ' \
|
14
|
+
'-d 0,0 -b 64,64 -n 6 -p RPCL -t 1024,1024 -I -M 1 '\
|
15
|
+
'-r 2.4,1.48331273,.91673033,.56657224,.35016049,.21641118,' \
|
16
|
+
'.13374944,.0944,.08266171'.freeze
|
17
|
+
|
18
|
+
# OpenJPEG 1.x command replacement for 2.x opj_compress, takes same options;
|
19
|
+
# this is necessary on Ubuntu Trusty (e.g. Travis CI)
|
20
|
+
CMD_1X = 'image_to_j2k'.freeze
|
21
|
+
|
22
|
+
# Target file extension of this service plugin:
|
23
|
+
self.target_extension = 'jp2'.freeze
|
24
|
+
|
25
|
+
attr_reader :file_set
|
26
|
+
delegate :uri, :mime_type, to: :file_set
|
27
|
+
|
28
|
+
# @param file_set the file set whose derivatives this service creates
def initialize(file_set)
  # temp files / symlinks to delete once the derivative has been written
  @unlink_after_creation = []
  # NOTE(review): the original comment calls this a cached `identify` result
  # string, but nothing visible here assigns it -- confirm it is still used.
  @command = nil
  super(file_set)
end
|
34
|
+
|
35
|
+
# Create the JP2 derivative for the given source file.
#
# @param filename [String] path to the source file
def create_derivatives(filename)
  # Base class takes care of loading @source_path, @dest_path
  super(filename)

  # no creation if jp2 master => deemed unnecessary/duplicative
  return if mime_type == 'image/jp2'

  begin
    # if we have a non-TIFF source, or a 1-bit monochrome source, we need
    # to make a NetPBM-based intermediate (temporary) file for OpenJPEG
    # to consume; otherwise a temp symlink works around an OpenJPEG file
    # naming quirk.
    needs_intermediate = !tiff_source? || one_bit?
    needs_intermediate ? make_intermediate_source : make_symlink

    # Run the OpenJPEG command (rendered with source, destination, and
    # color/grayscale options) to make the derivative file at @dest_path
    `#{opj_command}`
  ensure
    # Clean up any intermediate files or symlinks even when the command (or
    # building it) raises, so failed runs do not leave temp files behind.
    cleanup_intermediate
  end

  # Same value the method returned before cleanup moved into `ensure`.
  @unlink_after_creation
end
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
# source introspection:
|
65
|
+
|
66
|
+
# @return [Boolean] true when the identified content type of the source is
#   'image/tiff'
def tiff_source?
  content_type = identify[:content_type]
  content_type == 'image/tiff'
end
|
69
|
+
|
70
|
+
# Point @source_path at a temporary `.tif` symlink to the real source.
#
# OpenJPEG binaries only accept TIFF input whose name ends in .TIF or .tif
# (three letters), so for all non-monochrome TIFF sources we simply assume a
# link with such a name is needed. The link is queued for removal.
def make_symlink
  link_path = File.join(Dir.tmpdir, "#{SecureRandom.uuid}.tif")
  FileUtils.ln_s(@source_path, link_path)
  @unlink_after_creation << link_path
  # finally, point @source_path for command at intermediate link:
  @source_path = link_path
end
|
81
|
+
|
82
|
+
# Convert the source into a temporary NetPBM bitmap (.ppm for color, .pgm
# otherwise) with ImageMagick, and point @source_path at it. The temp file
# is queued for removal.
#
# `convert` is executed as an argument vector (no shell): the source path is
# passed verbatim, and the "[0]" page selector is never treated as a shell
# glob pattern.
def make_intermediate_source
  # random filename, with appropriate extension, inside the temp dir
  extension = use_color? ? 'ppm' : 'pgm'
  tmpname = File.join(Dir.tmpdir, "#{SecureRandom.uuid}.#{extension}")
  # if pdf source, get only first page
  source = @source_path.end_with?('pdf') ? "#{@source_path}[0]" : @source_path
  # Use ImageMagick `convert` to create intermediate bitmap:
  Open3.capture2('convert', source, tmpname)
  @unlink_after_creation.push(tmpname)
  # finally, point @source_path for command at intermediate file:
  @source_path = tmpname
end
|
101
|
+
|
102
|
+
# Build the OpenJPEG command line used to write the derivative.
#
# Starts from the color or grayscale command template (per use_color?),
# swaps the executable name to `image_to_j2k` when `opj_compress` is not
# found on the PATH (OpenJPEG 1.x), and fills in the source and
# destination file names.
#
# @return [String] the rendered command
def opj_command
  # an empty `which` result means the 2.x binary is not installed
  opj_compress_missing = `which opj_compress`.empty?
  template = use_color? ? CMD_COLOR : CMD_GRAY
  template = template.sub('opj_compress', 'image_to_j2k') if opj_compress_missing
  # inject source and destination file names into the template
  format(template, source_file: @source_path, out_file: @dest_path)
end
|
110
|
+
|
111
|
+
# Remove every temporary symlink or intermediate file recorded in
# @unlink_after_creation, then empty the list.
#
# Removal is forced (rm_f) so a path that was never created, or has
# already been deleted, does not raise and mask the outcome of the
# derivative run; clearing the list makes a repeated call a no-op.
#
# @return [Array<String>] the (now empty) @unlink_after_creation list
def cleanup_intermediate
  @unlink_after_creation.each do |path|
    FileUtils.rm_f(path)
  end
  @unlink_after_creation.clear
end
|
117
|
+
end
|
118
|
+
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module IiifPrint
  # Extracts technical metadata (pixel dimensions, number of components,
  # bits per component, and a derived color label) from a JP2 file by
  # parsing its bytes directly.
  class JP2ImageMetadata
    # First byte of a codestream marker (0xFF). `String#b` yields a binary
    # copy, so this also works when string literals are frozen.
    TOKEN_MARKER_START = "\xFF".b.freeze
    # Second byte of the SIZ marker (0xFF51).
    TOKEN_MARKER_SIZ = "\x51".b.freeze
    # Type tag of the image header box.
    TOKEN_IHDR = 'ihdr'.freeze

    attr_accessor :path

    # @param path [String] path to the JP2 file to inspect
    def initialize(path)
      @path = path
    end

    # Locate the SIZ marker and read the image dimensions that follow it.
    #
    # Informed by ISO/IEC 15444-1:2000, pp. 26-27, via:
    # http://hosting.astro.cornell.edu/~carcich/LRO/jp2/ISO_JPEG200_Standard/INCITS+ISO+IEC+15444-1-2000.pdf
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] X size, Y size, in Integer-typed px
    # @raise [IOError] if the stream is not in binary mode, no SIZ marker is
    #   found before end of file, or the data after the marker is truncated
    def extract_jp2_dim(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      # first 23 bytes are file-magic, we can skip
      io.seek(23, IO::SEEK_SET)
      # read one byte at a time until 0xFF is immediately followed by 0x51;
      # `io.read(1)` returns nil at end of file, which ends the scan instead
      # of looping forever on a file with no SIZ marker
      previous = nil
      while (current = io.read(1))
        break if previous == TOKEN_MARKER_START && current == TOKEN_MARKER_SIZ
        previous = current
      end
      raise IOError, 'SIZ marker not found' if current.nil?
      # on 0x51, read next 12 bytes
      siz = io.read(12)
      raise IOError, 'truncated SIZ marker segment' if siz.nil? || siz.bytesize < 12
      # discard first 4 bytes; next 4 bytes are XSiz; last 4 bytes are YSiz
      siz.byteslice(4, 8).unpack('NN')
    end

    # Read the component count and bit depth from the image header box.
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] number components, bits-per-component
    # @raise [IOError] if the stream is not in binary mode, or no complete
    #   'ihdr' box is present in the first 64 bytes
    def extract_jp2_components(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      io.seek(0, IO::SEEK_SET)
      # IHDR should be in first 64 bytes
      header = io.read(64)
      token_at = header&.rindex(TOKEN_IHDR)
      raise IOError, 'ihdr box not found' if token_at.nil?
      # the bytes after the tag: 8 bytes skipped here, then a 2-byte
      # component count and a 1-byte bit-depth field (11 bytes needed)
      ihdr_data = header.byteslice(token_at + TOKEN_IHDR.bytesize, 11)
      raise IOError, 'truncated ihdr box' if ihdr_data.nil? || ihdr_data.bytesize < 11
      num_components, depth_byte = ihdr_data.byteslice(8, 3).unpack('nC')
      # stored as "bit depth of the components in the codestream, minus 1",
      # so add 1; the high bit is the signedness flag, not part of the depth
      bits_per_component = (depth_byte & 0x7F) + 1
      [num_components, bits_per_component]
    end

    # Verify that the stream starts with the JP2 file magic.
    #
    # Reads 23 bytes from the current position of the stream.
    #
    # @param io [IO] IO stream positioned at the start of the file
    # @return [nil]
    # @raise [IOError] if the bytes read do not end with 'ftypjp2' (including
    #   when the stream is empty)
    def validate_jp2(io)
      # verify file is jp2
      magic = io.read(23)
      raise IOError, 'Not JP2 file' unless magic&.end_with?('ftypjp2')
    end

    # Read technical metadata from the JP2 file at #path.
    #
    # The file is opened with the block form of File.open so the handle is
    # closed even when validation or parsing raises.
    #
    # @return [Hash] with keys :color ('monochrome' when bits per component
    #   is 1, otherwise 'color' for 3 or more components and 'gray' for
    #   fewer), :num_components, :bits_per_component, :width, :height
    # @raise [IOError] if the file is not a JP2 file or its header cannot be
    #   parsed
    def technical_metadata
      File.open(path, 'rb') do |io|
        validate_jp2(io)
        x_siz, y_siz = extract_jp2_dim(io)
        nc, bpc = extract_jp2_components(io)
        color =
          if bpc == 1
            'monochrome'
          elsif nc >= 3
            'color'
          else
            'gray'
          end
        {
          color: color,
          num_components: nc,
          bits_per_component: bpc,
          width: x_siz,
          height: y_siz
        }
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module IiifPrint
  # The purpose of this module is to encode lineage related services:
  #
  # - {.ancestor_ids_for}
  # - {.descendent_file_set_ids_for}
  #
  # The ancestor ids and descendent file set ids are useful for ensuring we index together related
  # items. For example, when I have a work that is a book, and one file set per page of that book,
  # when I search the book I want to find the text within the given book's pages.
  #
  # The methods of this module should be considered as defining an interface.
  module LineageService
    ##
    # Collect the ids of the works that contain the given object, walking further up through each
    # containing work that is itself flagged as a child.
    #
    # @api public
    #
    # @param object [#in_works] An object that responds to #in_works; each work it yields must
    #   respond to #id, #is_child and #in_works
    # @return [Array<String>] unique, non-nil ancestor ids, nearest ancestors first
    def self.ancestor_ids_for(object)
      ancestor_ids = []
      object.in_works.each do |work|
        ancestor_ids << work.id
        # only climb further when the containing work is itself a child
        ancestor_ids.concat(ancestor_ids_for(work)) if work.is_child
      end
      ancestor_ids.flatten.compact.uniq
    end

    ##
    # Collect the ids of the object's file sets, the ids gathered recursively from its ordered
    # works, and the object's own member ids.
    #
    # @param object [#ordered_works, #file_sets, #member_ids] #ordered_works may be nil
    # @return [Array<String>] the unique, non-nil ids of associated file sets and members
    def self.descendent_file_set_ids_for(object)
      # enables us to return parents when searching for child OCR
      file_set_ids = object.file_sets.map(&:id)
      object.ordered_works&.each do |child|
        file_set_ids.concat(descendent_file_set_ids_for(child))
      end
      # enables us to return parents when searching for child metadata
      file_set_ids.concat(object.member_ids)
      file_set_ids.flatten.uniq.compact
    end
  end
end
|