iiif_print 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (211) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +2 -0
  3. data/.env +5 -0
  4. data/.fcrepo_wrapper +4 -0
  5. data/.github/release.yml +20 -0
  6. data/.github/workflows/branches.yml +24 -0
  7. data/.github/workflows/build-lint-test-action.yaml +33 -0
  8. data/.github/workflows/release_labels.yml +25 -0
  9. data/.gitignore +52 -0
  10. data/.rubocop.yml +177 -0
  11. data/.solr_wrapper +8 -0
  12. data/.travis.yml +49 -0
  13. data/CONTRIBUTING.md +181 -0
  14. data/Dockerfile +15 -0
  15. data/Gemfile +52 -0
  16. data/LICENSE +203 -0
  17. data/README.md +203 -0
  18. data/Rakefile +38 -0
  19. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +56 -0
  20. data/app/assets/config/iiif_print_manifest.js +2 -0
  21. data/app/assets/images/iiif_print/.keep +0 -0
  22. data/app/assets/javascripts/iiif_print/autocomplete_fix.js +33 -0
  23. data/app/assets/javascripts/iiif_print/ocr_search.js.erb +6 -0
  24. data/app/assets/javascripts/iiif_print.js +3 -0
  25. data/app/assets/stylesheets/iiif_print/_iiif_print.scss +4 -0
  26. data/app/assets/stylesheets/iiif_print/_issue_search.scss +13 -0
  27. data/app/assets/stylesheets/iiif_print/_issues_calendar.scss +18 -0
  28. data/app/assets/stylesheets/iiif_print/_newspapers_search.scss +38 -0
  29. data/app/assets/stylesheets/iiif_print/_search_results.scss +6 -0
  30. data/app/helpers/hyrax/iiif_helper.rb +22 -0
  31. data/app/helpers/iiif_print/application_helper.rb +5 -0
  32. data/app/helpers/iiif_print_helper.rb +64 -0
  33. data/app/indexers/concerns/iiif_print/child_indexer.rb +34 -0
  34. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +29 -0
  35. data/app/mailers/iiif_print/application_mailer.rb +8 -0
  36. data/app/models/concerns/iiif_print/set_child_flag.rb +29 -0
  37. data/app/models/concerns/iiif_print/solr/document.rb +47 -0
  38. data/app/models/iiif_print/application_record.rb +6 -0
  39. data/app/models/iiif_print/derivative_attachment.rb +8 -0
  40. data/app/models/iiif_print/iiif_search_response_decorator.rb +17 -0
  41. data/app/models/iiif_print/ingest_file_relation.rb +14 -0
  42. data/app/models/iiif_print/pending_relationship.rb +7 -0
  43. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +10 -0
  44. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +33 -0
  45. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +29 -0
  46. data/app/renderers/hyrax/renderers/faceted_attribute_renderer_decorator.rb +18 -0
  47. data/app/search_builders/concerns/iiif_print/exclude_models.rb +17 -0
  48. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +14 -0
  49. data/app/services/iiif_print/manifest_builder_service_behavior.rb +97 -0
  50. data/app/services/iiif_print/pluggable_derivative_service.rb +120 -0
  51. data/app/views/catalog/_snippets_more.html.erb +16 -0
  52. data/app/views/hyrax/base/_representative_media.html.erb +9 -0
  53. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +8 -0
  54. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  55. data/bin/rails +13 -0
  56. data/config/fcrepo_wrapper_test.yml +5 -0
  57. data/config/initializers/assets.rb +2 -0
  58. data/config/locales/iiif_print.de.yml +148 -0
  59. data/config/locales/iiif_print.en.yml +119 -0
  60. data/config/locales/iiif_print.es.yml +148 -0
  61. data/config/locales/iiif_print.fr.yml +149 -0
  62. data/config/locales/iiif_print.it.yml +142 -0
  63. data/config/locales/iiif_print.pt-BR.yml +148 -0
  64. data/config/locales/iiif_print.zh.yml +142 -0
  65. data/config/solr_wrapper_test.yml +9 -0
  66. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  67. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  68. data/config/test-fixture/solr-config/elevate.xml +36 -0
  69. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  70. data/config/test-fixture/solr-config/protwords.txt +21 -0
  71. data/config/test-fixture/solr-config/schema.xml +366 -0
  72. data/config/test-fixture/solr-config/scripts.conf +24 -0
  73. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  74. data/config/test-fixture/solr-config/spellings.txt +2 -0
  75. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  76. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  77. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  78. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  79. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  80. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  81. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  82. data/config/vendor/fits.xml +55 -0
  83. data/config/vendor/imagemagick-6-policy.xml +76 -0
  84. data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +12 -0
  85. data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +11 -0
  86. data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +11 -0
  87. data/docker-compose.yml +129 -0
  88. data/iiif_print.gemspec +43 -0
  89. data/lib/generators/iiif_print/assets_generator.rb +29 -0
  90. data/lib/generators/iiif_print/catalog_controller_generator.rb +32 -0
  91. data/lib/generators/iiif_print/install_generator.rb +52 -0
  92. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +22 -0
  93. data/lib/generators/iiif_print/templates/iiif_print.scss +1 -0
  94. data/lib/iiif_print/base_derivative_service.rb +113 -0
  95. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +84 -0
  96. data/lib/iiif_print/catalog_search_builder.rb +31 -0
  97. data/lib/iiif_print/configuration.rb +99 -0
  98. data/lib/iiif_print/data/fileset_helper.rb +25 -0
  99. data/lib/iiif_print/data/path_helper.rb +40 -0
  100. data/lib/iiif_print/data/work_derivatives.rb +323 -0
  101. data/lib/iiif_print/data/work_file.rb +92 -0
  102. data/lib/iiif_print/data/work_files.rb +199 -0
  103. data/lib/iiif_print/data.rb +35 -0
  104. data/lib/iiif_print/engine.rb +77 -0
  105. data/lib/iiif_print/errors.rb +9 -0
  106. data/lib/iiif_print/image_tool.rb +119 -0
  107. data/lib/iiif_print/jobs/application_job.rb +8 -0
  108. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +107 -0
  109. data/lib/iiif_print/jobs/create_relationships_job.rb +78 -0
  110. data/lib/iiif_print/jp2_derivative_service.rb +118 -0
  111. data/lib/iiif_print/jp2_image_metadata.rb +81 -0
  112. data/lib/iiif_print/lineage_service.rb +41 -0
  113. data/lib/iiif_print/metadata.rb +125 -0
  114. data/lib/iiif_print/pdf_derivative_service.rb +42 -0
  115. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +75 -0
  116. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +130 -0
  117. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +85 -0
  118. data/lib/iiif_print/text_extraction/alto_reader.rb +123 -0
  119. data/lib/iiif_print/text_extraction/hocr_reader.rb +172 -0
  120. data/lib/iiif_print/text_extraction/page_ocr.rb +87 -0
  121. data/lib/iiif_print/text_extraction/render_alto.rb +84 -0
  122. data/lib/iiif_print/text_extraction/word_coords_builder.rb +38 -0
  123. data/lib/iiif_print/text_extraction.rb +11 -0
  124. data/lib/iiif_print/text_extraction_derivative_service.rb +47 -0
  125. data/lib/iiif_print/text_formats_from_alto_service.rb +77 -0
  126. data/lib/iiif_print/tiff_derivative_service.rb +50 -0
  127. data/lib/iiif_print/version.rb +3 -0
  128. data/lib/iiif_print/works_controller_behavior.rb +9 -0
  129. data/lib/iiif_print.rb +136 -0
  130. data/lib/tasks/set_child_works.rake +22 -0
  131. data/spec/.keep.txt +1 -0
  132. data/spec/factories/ability.rb +6 -0
  133. data/spec/factories/newspaper_issue.rb +7 -0
  134. data/spec/factories/newspaper_page.rb +7 -0
  135. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  136. data/spec/factories/newspaper_title.rb +8 -0
  137. data/spec/factories/uploaded_pdf_file.rb +9 -0
  138. data/spec/factories/uploaded_txt_file.rb +9 -0
  139. data/spec/factories/user.rb +13 -0
  140. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  141. data/spec/fixtures/files/4.1.07.tiff +0 -0
  142. data/spec/fixtures/files/README.md +7 -0
  143. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  144. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  145. data/spec/fixtures/files/credits.md +16 -0
  146. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  147. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  148. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  149. data/spec/fixtures/files/minimal-alto.xml +31 -0
  150. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  151. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  152. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  153. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  154. data/spec/fixtures/files/ocr_alto.xml +202 -0
  155. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  156. data/spec/fixtures/files/ocr_color.tiff +0 -0
  157. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  158. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  159. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  160. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  161. data/spec/fixtures/files/page1.tiff +0 -0
  162. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  163. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  164. data/spec/fixtures/files/thumbnail.jpg +0 -0
  165. data/spec/helpers/hyrax/iiif_helper_spec.rb +65 -0
  166. data/spec/helpers/iiif_print_helper_spec.rb +43 -0
  167. data/spec/iiif_print/base_derivative_service_spec.rb +11 -0
  168. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +51 -0
  169. data/spec/iiif_print/catalog_search_builder_spec.rb +60 -0
  170. data/spec/iiif_print/configuration_spec.rb +67 -0
  171. data/spec/iiif_print/data/work_derivatives_spec.rb +245 -0
  172. data/spec/iiif_print/data/work_file_spec.rb +99 -0
  173. data/spec/iiif_print/data/work_files_spec.rb +237 -0
  174. data/spec/iiif_print/image_tool_spec.rb +109 -0
  175. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +30 -0
  176. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +17 -0
  177. data/spec/iiif_print/jp2_image_metadata_spec.rb +37 -0
  178. data/spec/iiif_print/lineage_service_spec.rb +13 -0
  179. data/spec/iiif_print/metadata_spec.rb +115 -0
  180. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +6 -0
  181. data/spec/iiif_print/text_extraction/alto_reader_spec.rb +49 -0
  182. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +45 -0
  183. data/spec/iiif_print/text_extraction/page_ocr_spec.rb +84 -0
  184. data/spec/iiif_print/text_extraction/render_alto_spec.rb +54 -0
  185. data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +44 -0
  186. data/spec/iiif_print_spec.rb +51 -0
  187. data/spec/misc_shared.rb +111 -0
  188. data/spec/models/iiif_print/derivative_attachment_spec.rb +37 -0
  189. data/spec/models/iiif_print/ingest_file_relation_spec.rb +56 -0
  190. data/spec/models/solr_document_spec.rb +14 -0
  191. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +19 -0
  192. data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +49 -0
  193. data/spec/services/iiif_print/jp2_derivative_service_spec.rb +59 -0
  194. data/spec/services/iiif_print/pdf_derivative_service_spec.rb +66 -0
  195. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +178 -0
  196. data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +82 -0
  197. data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +127 -0
  198. data/spec/services/iiif_print/tiff_derivative_service_spec.rb +65 -0
  199. data/spec/spec_helper.rb +181 -0
  200. data/spec/support/controller_level_helpers.rb +28 -0
  201. data/spec/support/iiif_print_models.rb +127 -0
  202. data/spec/test_app_templates/blacklight.yml +9 -0
  203. data/spec/test_app_templates/fedora.yml +15 -0
  204. data/spec/test_app_templates/lib/generators/test_app_generator.rb +40 -0
  205. data/spec/test_app_templates/redis.yml +9 -0
  206. data/spec/test_app_templates/solr/conf/schema.xml +362 -0
  207. data/spec/test_app_templates/solr/conf/solrconfig.xml +322 -0
  208. data/spec/test_app_templates/solr.yml +7 -0
  209. data/tasks/iiif_print_dev.rake +34 -0
  210. data/tmp/.keep +0 -0
  211. metadata +605 -0
@@ -0,0 +1,77 @@
1
+ require 'active_fedora'
2
+ require 'hyrax'
3
+ require 'blacklight_iiif_search'
4
+
5
+ module IiifPrint
6
+ # module constants:
7
+ GEM_PATH = Gem::Specification.find_by_name("iiif_print").gem_dir
8
+
9
+ # Engine Class
10
+ class Engine < ::Rails::Engine
11
+ isolate_namespace IiifPrint
12
+
13
+ # rubocop:disable Metrics/BlockLength
14
+ config.to_prepare do
15
+ # We don't have a hard requirement on Bulkrax but, in our experience, lingering on earlier
16
+ # versions can introduce bugs in both Bulkrax and some of the assumptions that we've resolved.
17
+ # Very early versions of Bulkrax do not have VERSION defined
18
+ if defined?(Bulkrax) && !ENV.fetch("SKIP_IIIF_PRINT_BULKRAX_VERSION_REQUIREMENT", false)
19
+ if !defined?(Bulkrax::VERSION) || (Bulkrax::VERSION.to_i < 5)
20
+ raise "IiifPrint does not have a hard dependency on Bulkrax, " \
21
+ "but if you have Bulkrax installed we recommend at least version 5.0.0. " \
22
+ "To ignore this recommendation please add SKIP_IIIF_PRINT_BULKRAX_VERSION_REQUIREMENT " \
23
+ "to your ENV variables."
24
+ end
25
+ end
26
+
27
+ # Inject PluggableDerivativeService ahead of Hyrax default.
28
+ # This wraps Hyrax default, but allows multiple valid services
29
+ # to be configured, instead of just the _first_ valid service.
30
+ #
31
+ # To configure specific services, inject each service, in desired order
32
+ # to IiifPrint::PluggableDerivativeService.plugins array.
33
+
34
+ Hyrax::DerivativeService.services.unshift(
35
+ IiifPrint::PluggableDerivativeService
36
+ )
37
+
38
+ Hyrax::IiifManifestPresenter.prepend(IiifPrint::IiifManifestPresenterBehavior)
39
+ Hyrax::IiifManifestPresenter::Factory.prepend(IiifPrint::IiifManifestPresenterFactoryBehavior)
40
+ Hyrax::ManifestBuilderService.prepend(IiifPrint::ManifestBuilderServiceBehavior)
41
+ Hyrax::Renderers::FacetedAttributeRenderer.prepend(Hyrax::Renderers::FacetedAttributeRendererDecorator)
42
+ Hyrax::WorksControllerBehavior.prepend(IiifPrint::WorksControllerBehaviorDecorator)
43
+ Hyrax::WorkShowPresenter.prepend(IiifPrint::WorkShowPresenterDecorator)
44
+
45
+ IiifPrint::ChildIndexer.decorate_work_types!
46
+ IiifPrint::FileSetIndexer.decorate(Hyrax::FileSetIndexer)
47
+
48
+ ::BlacklightIiifSearch::IiifSearchResponse.prepend(IiifPrint::IiifSearchResponseDecorator)
49
+ ::BlacklightIiifSearch::IiifSearchAnnotation.prepend(IiifPrint::BlacklightIiifSearch::AnnotationDecorator)
50
+ Hyrax::Actors::FileSetActor.prepend(IiifPrint::Actors::FileSetActorDecorator)
51
+
52
+ # Extending the presenter with a base url accessor; the base url includes the protocol.
53
+ # We need the base url to render the facet links and normalize the interface.
54
+ Hyrax::IiifManifestPresenter.send(:attr_accessor, :base_url)
55
+ Hyrax::IiifManifestPresenter::DisplayImagePresenter.send(:attr_accessor, :base_url)
56
+ # Extending this class because there is an #ability= but not #ability and this definition
57
+ # mirrors the Hyrax::IiifManifestPresenter#ability.
58
+ module Hyrax::IiifManifestPresenter::DisplayImagePresenterDecorator
59
+ def ability
60
+ @ability ||= NullAbility.new
61
+ end
62
+ end
63
+ Hyrax::IiifManifestPresenter::DisplayImagePresenter.prepend(Hyrax::IiifManifestPresenter::DisplayImagePresenterDecorator)
64
+
65
+ Hyrax.config do |config|
66
+ config.callback.set(:after_create_fileset) do |file_set, user|
67
+ IiifPrint.config.handle_after_create_fileset(file_set, user)
68
+ end
69
+ end
70
+ end
71
+
72
+ config.after_initialize do
73
+ IiifPrint::Solr::Document.decorate(SolrDocument)
74
+ end
75
+ # rubocop:enable Metrics/BlockLength
76
+ end
77
+ end
@@ -0,0 +1,9 @@
1
+ module IiifPrint
2
+ # generic/base IiifPrint-specific exception:
3
+ class IiifPrintError < StandardError
4
+ end
5
+
6
+ # Data transformation or read-error:
7
+ class DataError < IiifPrintError
8
+ end
9
+ end
@@ -0,0 +1,119 @@
1
+ require 'open3'
2
+ require 'tmpdir'
3
+
4
+ module IiifPrint
5
+ class ImageTool
6
+ attr_accessor :path, :ftype
7
+
8
+ def initialize(path)
9
+ @path = path
10
+ @ftype = magic
11
+ @metadata = nil
12
+ end
13
+
14
+ # @return [Hash] hash with following symbol keys, and respectively
15
+ # typed String and/or Integer values.
16
+ # :width, :height — both in Integer px units
17
+ # :color — (String enumerated from 'gray', 'monochrome', 'color')
18
+ # :num_components - Integer, number of channels
19
+ # :bits_per_component — Integer, bits per channel (e.g. 8 vs. 1)
20
+ # :content_type — RFC 2045 MIME type
21
+ def metadata
22
+ return @metadata unless @metadata.nil?
23
+ @metadata = jp2? ? jp2_metadata : identify_metadata
24
+ end
25
+
26
+ # Convert source image to image at destination path, inferring file type
27
+ # from destination file extension. In case of JP2 files, create
28
+ # intermediate file using OpenJPEG 2000 that ImageMagick can use.
29
+ # Only produces monochrome output if monochrome is true and the destination
30
+ # format is TIFF.
31
+ # @param destination [String] Path to output / destination file
32
+ # @param monochrome [Boolean] true if monochrome output, otherwise false
33
+ def convert(destination, monochrome = false)
34
+ raise 'JP2 output not yet supported' if destination.end_with?('jp2')
35
+ return convert_image(jp2_to_tiff(@path), destination, monochrome) if jp2?
36
+ convert_image(@path, destination, monochrome)
37
+ end
38
+
39
+ private
40
+
41
+ def convert_image(source, destination, monochrome)
42
+ monochrome &&= destination.slice(-4, 4).index('tif')
43
+ mono_opts = "-depth 1 -monochrome -compress Group4 -type bilevel "
44
+ opts = monochrome ? mono_opts : ''
45
+ cmd = "convert #{source} #{opts}#{destination}"
46
+ `#{cmd}`
47
+ end
48
+
49
+ def jp2_to_tiff(source)
50
+ intermediate_path = File.join(Dir.mktmpdir, 'intermediate.tif')
51
+ jp2_cmd = "opj_decompress -i #{source} -o #{intermediate_path}"
52
+ `#{jp2_cmd}`
53
+ intermediate_path
54
+ end
55
+
56
+ def jp2_metadata
57
+ result = IiifPrint::JP2ImageMetadata.new(path).technical_metadata
58
+ result[:content_type] = 'image/jp2'
59
+ result
60
+ end
61
+
62
+ def im_line_select(lines, key)
63
+ line = lines.find { |l| l.scrub.downcase.strip.start_with?(key) }
64
+ # Given "key: value" line, return the value as String stripped of
65
+ # leading and trailing whitespace
66
+ return line if line.nil?
67
+ line.strip.split(':')[-1].strip
68
+ end
69
+
70
+ # @return [Array(Integer, Integer)] width, height in Integer px units
71
+ def im_identify_geometry(lines)
72
+ img_geo = im_line_select(lines, 'geometry').split('+')[0]
73
+ img_geo.split('x').map(&:to_i)
74
+ end
75
+
76
+ # @return [Array<String>] lines of output from imagemagick `identify`
77
+ def im_identify
78
+ cmd = "identify -verbose #{path}"
79
+ `#{cmd}`.lines
80
+ end
81
+
82
+ def im_mime(lines)
83
+ return 'application/pdf' if pdf? # workaround older imagemagick bug
84
+ im_line_select(lines, 'mime type')
85
+ end
86
+
87
+ def populate_im_color!(lines, result)
88
+ bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
89
+ colorspace = im_line_select(lines, 'colorspace')
90
+ color = colorspace == 'Gray' ? 'gray' : 'color'
91
+ has_alpha = !im_line_select(lines, 'Alpha').nil?
92
+ result[:num_components] = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
93
+ result[:color] = bpc == 1 ? 'monochrome' : color
94
+ result[:bits_per_component] = bpc
95
+ end
96
+
97
+ # Return metadata by means of imagemagick identify
98
+ def identify_metadata
99
+ result = {}
100
+ lines = im_identify
101
+ result[:width], result[:height] = im_identify_geometry(lines)
102
+ result[:content_type] = im_mime(lines)
103
+ populate_im_color!(lines, result)
104
+ result
105
+ end
106
+
107
+ def magic
108
+ File.read(@path, 23, 0)
109
+ end
110
+
111
+ def jp2?
112
+ @ftype.end_with?('ftypjp2')
113
+ end
114
+
115
+ def pdf?
116
+ magic.start_with?('%PDF-')
117
+ end
118
+ end
119
+ end
@@ -0,0 +1,8 @@
1
+ module IiifPrint
2
+ module Jobs
3
+ # TODO: Consider inheriting from ::ApplicationJob. That means we would have the upstream's
4
+ # base job behavior.
5
+ class ApplicationJob < ActiveJob::Base
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,107 @@
1
+ module IiifPrint
2
+ module Jobs
3
+ class ChildWorksFromPdfJob < IiifPrint::Jobs::ApplicationJob
4
+ # Break a pdf into individual pages
5
+ # @param parent_work
6
+ # @param pdf_paths: [Array<String>] paths to pdfs
7
+ # @param user: [User]
8
+ # @param admin_set_id: [<String>]
9
+ # @param prior_pdfs: [<Integer>] count of pdfs already on parent work
10
+ def perform(parent_work, pdf_paths, user, admin_set_id, prior_pdfs)
11
+ @parent_work = parent_work
12
+ @child_admin_set_id = admin_set_id
13
+ child_model = @parent_work.iiif_print_config.pdf_split_child_model
14
+
15
+ # handle each input pdf
16
+ pdf_paths.each_with_index do |path, pdf_idx|
17
+ split_pdf(path, pdf_idx, user, prior_pdfs, child_model)
18
+ end
19
+
20
+ # Link newly created child works to the parent
21
+ # @param user: [User] user
22
+ # @param parent_id: [<String>] parent work id
23
+ # @param parent_model: [<String>] parent model
24
+ # @param child_model: [<String>] child model
25
+ IiifPrint::Jobs::CreateRelationshipsJob.set(wait: 10.minutes).perform_later(
26
+ user: user,
27
+ parent_id: @parent_work.id,
28
+ parent_model: @parent_work.class.to_s,
29
+ child_model: child_model.to_s
30
+ )
31
+
32
+ # TODO: clean up image_files and pdf_paths
33
+ end
34
+
35
+ private
36
+
37
+ def split_pdf(path, pdf_idx, user, prior_pdfs_count, child_model)
38
+ image_files = @parent_work.iiif_print_config.pdf_splitter_service.new(path).to_a
39
+ return if image_files.blank?
40
+
41
+ pdf_sequence = pdf_idx + prior_pdfs_count
42
+ prepare_import_data(pdf_sequence, image_files, user)
43
+
44
+ # submit the job to create all the child works for one PDF
45
+ # @param [User] user
46
+ # @param [Hash<String => String>] titles
47
+ # @param [Hash<String => String>] resource_types (optional)
48
+ # @param [Array<String>] uploaded_files Hyrax::UploadedFile IDs
49
+ # @param [Hash] attributes attributes to apply to all works, including :model
50
+ # @param [Hyrax::BatchCreateOperation] operation
51
+ operation = Hyrax::BatchCreateOperation.create!(
52
+ user: user,
53
+ operation_type: "PDF Batch Create"
54
+ )
55
+ BatchCreateJob.perform_later(user,
56
+ @child_work_titles,
57
+ {},
58
+ @uploaded_files,
59
+ attributes.merge!(model: child_model.to_s).with_indifferent_access,
60
+ operation)
61
+ end
62
+
63
+ def prepare_import_data(pdf_sequence, image_files, user)
64
+ @uploaded_files = []
65
+ @child_work_titles = {}
66
+ image_files.each_with_index do |image_path, idx|
67
+ file_id = create_uploaded_file(user, image_path).to_s
68
+ file_title = set_title(@parent_work.title.first, pdf_sequence, idx)
69
+ @uploaded_files << file_id
70
+ @child_work_titles[file_id] = file_title
71
+ # save child work info to create the member relationships
72
+ PendingRelationship.create!(child_title: file_title,
73
+ parent_id: @parent_work.id,
74
+ child_order: sort_order(pdf_sequence, idx))
75
+ end
76
+ end
77
+
78
+ def sort_order(pdf_sequence, idx)
79
+ "#{pdf_sequence} #{idx}"
80
+ end
81
+
82
+ def create_uploaded_file(user, path)
83
+ uf = Hyrax::UploadedFile.new
84
+ uf.user_id = user.id
85
+ uf.file = CarrierWave::SanitizedFile.new(path)
86
+ uf.save!
87
+ uf.id
88
+ end
89
+
90
+ def set_title(title, pdf_sequence, idx)
91
+ pdf_index = "Pdf Nbr #{pdf_sequence + 1}"
92
+ page_number = "Page #{idx + 1}"
93
+ "#{title}: #{pdf_index}, #{page_number}"
94
+ end
95
+
96
+ # TODO: what attributes do we need to fill in from the parent work? What about AllinsonFlex?
97
+ def attributes
98
+ {
99
+ admin_set_id: @child_admin_set_id.to_s,
100
+ creator: @parent_work.creator.to_a,
101
+ rights_statement: @parent_work.rights_statement.to_a,
102
+ visibility: @parent_work.visibility.to_s
103
+ }
104
+ end
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,78 @@
1
+ module IiifPrint
2
+ module Jobs
3
+ # Link newly created child works to their parent work
4
+ class CreateRelationshipsJob < IiifPrint::Jobs::ApplicationJob
5
+ # Link newly created child works to the parent
6
+ # @param user: [User] user
7
+ # @param parent_id: [<String>] parent work id
8
+ # @param parent_model: [<String>] parent model
9
+ # @param child_model: [<String>] child model
10
+ def perform(user:, parent_id:, parent_model:, child_model:)
11
+ if completed_child_data_for(parent_id, child_model)
12
+ # add the members
13
+ parent_work = parent_model.constantize.find(parent_id)
14
+ create_relationships(user: user, parent: parent_work, ordered_children: @child_works)
15
+ @pending_children.each(&:destroy)
16
+ else
17
+ # reschedule the job and end this one normally
18
+ #
19
+ # TODO: Depending on how things shake out, we could be infinitely rescheduling this job.
20
+ # Consider a time to live parameter.
21
+ reschedule(user: user, parent_id: parent_id, parent_model: parent_model, child_model: child_model)
22
+ end
23
+ end
24
+
25
+ private
26
+
27
+ # load @child_works, and return true or false
28
+ def completed_child_data_for(parent_id, child_model)
29
+ @child_works = []
30
+ found_all_children = true
31
+
32
+ # find and sequence all pending children
33
+ @pending_children = IiifPrint::PendingRelationship.where(parent_id: parent_id).order('child_order asc')
34
+
35
+ # find child works (skip out if any haven't yet been created)
36
+ @pending_children.each do |child|
37
+ # find by title... if any aren't found, the child works are not yet ready
38
+ found_children = find_children_by_title_for(child.child_title, child_model)
39
+ found_all_children = false if found_children.empty?
40
+ break unless found_all_children == true
41
+ @child_works += found_children
42
+ end
43
+ # return boolean
44
+ found_all_children
45
+ end
46
+
47
+ def find_children_by_title_for(title, model)
48
+ # We should only find one, but there is no guarantee of that and `:where` returns an array.
49
+ model.constantize.where(title: title)
50
+ end
51
+
52
+ def reschedule(user:, parent_id:, parent_model:, child_model:)
53
+ CreateRelationshipsJob.set(wait: 10.minutes).perform_later(
54
+ user: user,
55
+ parent_id: parent_id,
56
+ parent_model: parent_model,
57
+ child_model: child_model
58
+ )
59
+ end
60
+
61
+ def create_relationships(user:, parent:, ordered_children:)
62
+ records_hash = {}
63
+ ordered_children.map(&:id).each_with_index do |child_id, i|
64
+ records_hash[i.to_s] = { id: child_id }
65
+ end
66
+ attrs = { work_members_attributes: records_hash }
67
+ parent.try(:reindex_extent=, Hyrax::Adapters::NestingIndexAdapter::LIMITED_REINDEX)
68
+ env = Hyrax::Actors::Environment.new(parent, Ability.new(user), attrs)
69
+
70
+ Hyrax::CurationConcern.actor.update(env)
71
+ # need to reindex all file_sets to make sure all ancestors are indexed
72
+ ordered_children.each do |child_work|
73
+ child_work.file_sets.each(&:update_index) if child_work.respond_to?(:file_sets)
74
+ end
75
+ end
76
+ end
77
+ end
78
+ end
@@ -0,0 +1,118 @@
1
+ require 'open3'
2
+
3
+ module IiifPrint
4
+ class JP2DerivativeService < BaseDerivativeService
5
+ # OpenJPEG 2000 Command to make NDNP-compliant grayscale JP2:
6
+ CMD_GRAY = 'opj_compress -i %<source_file>s -o %<out_file>s ' \
7
+ '-d 0,0 -b 64,64 -n 6 -p RLCP -t 1024,1024 -I -M 1 ' \
8
+ '-r 64,53.821,45.249,40,32,26.911,22.630,20,16,14.286,' \
9
+ '11.364,10,8,6.667,5.556,4.762,4,3.333,2.857,2.500,2,' \
10
+ '1.667,1.429,1.190,1'.freeze
11
+
12
+ # OpenJPEG 2000 Command to make RGB JP2:
13
+ CMD_COLOR = 'opj_compress -i %<source_file>s -o %<out_file>s ' \
14
+ '-d 0,0 -b 64,64 -n 6 -p RPCL -t 1024,1024 -I -M 1 '\
15
+ '-r 2.4,1.48331273,.91673033,.56657224,.35016049,.21641118,' \
16
+ '.13374944,.0944,.08266171'.freeze
17
+
18
+ # OpenJPEG 1.x command replacement for 2.x opj_compress, takes same options;
19
+ # this is necessary on Ubuntu Trusty (e.g. Travis CI)
20
+ CMD_1X = 'image_to_j2k'.freeze
21
+
22
+ # Target file extension of this service plugin:
23
+ self.target_extension = 'jp2'.freeze
24
+
25
+ attr_reader :file_set
26
+ delegate :uri, :mime_type, to: :file_set
27
+
28
+ def initialize(file_set)
29
+ # cached result string for imagemagick `identify` command
30
+ @command = nil
31
+ @unlink_after_creation = []
32
+ super(file_set)
33
+ end
34
+
35
+ def create_derivatives(filename)
36
+ # Base class takes care of loading @source_path, @dest_path
37
+ super(filename)
38
+
39
+ # no creation if jp2 master => deemed unnecessary/duplicative
40
+ return if mime_type == 'image/jp2'
41
+
42
+ # if we have a non-TIFF source, or a 1-bit monochrome source, we need
43
+ # to make a NetPBM-based intermediate (temporary) file for OpenJPEG
44
+ # to consume.
45
+ needs_intermediate = !tiff_source? || one_bit?
46
+
47
+ # We use either intermediate temp file, or temp symlink (to work
48
+ # around OpenJPEG 2000 file naming quirk).
49
+ needs_intermediate ? make_intermediate_source : make_symlink
50
+
51
+ # Get OpenJPEG command, rendered with source, destination, appropriate
52
+ # to either color or grayscale source
53
+ render_cmd = opj_command
54
+
55
+ # Run the generated command to make derivative file at @dest_path
56
+ `#{render_cmd}`
57
+
58
+ # Clean up any intermediate files or symlinks used during creation
59
+ cleanup_intermediate
60
+ end
61
+
62
+ private
63
+
64
+ # source introspection:
65
+
66
+ def tiff_source?
67
+ identify[:content_type] == 'image/tiff'
68
+ end
69
+
70
+ def make_symlink
71
+ # OpenJPEG binaries have annoying quirk of only using TIFF input
72
+ # files whose name ends in .TIF or .tif (three letter); for all
73
+ # non-monochrome TIFF files, we just assume we need to symlink
74
+ # to such a filename.
75
+ tmpname = File.join(Dir.tmpdir, "#{SecureRandom.uuid}.tif")
76
+ FileUtils.ln_s(@source_path, tmpname)
77
+ @unlink_after_creation.push(tmpname)
78
+ # finally, point @source_path for command at intermediate link:
79
+ @source_path = tmpname
80
+ end
81
+
82
+ def make_intermediate_source
83
+ # generate a random filename to be made, with appropriate extension,
84
+ # inside /tmp dir:
85
+ tmpname = File.join(
86
+ Dir.tmpdir,
87
+ format(
88
+ "#{SecureRandom.uuid}.%<ext>s",
89
+ ext: use_color? ? 'ppm' : 'pgm'
90
+ )
91
+ )
92
+ # if pdf source, get only first page
93
+ source_path = @source_path
94
+ source_path += '[0]' if @source_path.ends_with?('pdf')
95
+ # Use ImageMagick `convert` to create intermediate bitmap:
96
+ `convert #{source_path} #{tmpname}`
97
+ @unlink_after_creation.push(tmpname)
98
+ # finally, point @source_path for command at intermediate file:
99
+ @source_path = tmpname
100
+ end
101
+
102
+ def opj_command
103
+ # Get a command template appropriate to OpenJPEG 1.x or 2.x
104
+ use_openjpeg_1x = `which opj_compress`.empty?
105
+ cmd = use_color? ? CMD_COLOR : CMD_GRAY
106
+ cmd = cmd.sub('opj_compress', 'image_to_j2k') if use_openjpeg_1x
107
+ # return command with source and destination file names injected
108
+ format(cmd, source_file: @source_path, out_file: @dest_path)
109
+ end
110
+
111
+ def cleanup_intermediate
112
+ # remove symlink or intermediate file once we no longer need
113
+ @unlink_after_creation.each do |path|
114
+ FileUtils.rm(path)
115
+ end
116
+ end
117
+ end
118
+ end
@@ -0,0 +1,81 @@
1
module IiifPrint
  # Extracts basic technical metadata (pixel dimensions, component count and
  # bit depth) from a JP2 file by reading its header bytes directly.
  class JP2ImageMetadata
    TOKEN_MARKER_START = "\xFF".force_encoding("BINARY").freeze
    TOKEN_MARKER_SIZ = "\x51".force_encoding("BINARY").freeze
    TOKEN_IHDR = 'ihdr'.freeze

    # Number of bytes read after the SIZ marker: 4 skipped + XSiz + YSiz
    SIZ_READ_LENGTH = 12
    # Bytes needed after the 'ihdr' token to reach the bit-depth byte
    IHDR_READ_LENGTH = 11

    attr_accessor :path

    # @param path [String] path to jp2, for reading
    def initialize(path)
      @path = path
    end

    # Scans for the SIZ marker (0xFF 0x51) and reads the image size after it.
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] X size, Y size, in Integer-typed px
    # @raise [IOError] if io is not in binary mode, if no SIZ marker is found
    #   before end of file, or if the data after the marker is truncated
    def extract_jp2_dim(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      # Informed by ISO/IEC 15444-1:2000, pp. 26-27
      #   via:
      #   http://hosting.astro.cornell.edu/~carcich/LRO/jp2/ISO_JPEG200_Standard/INCITS+ISO+IEC+15444-1-2000.pdf
      #
      # first 23 bytes are file-magic, we can skip
      io.seek(23, IO::SEEK_SET)
      byte = io.read(1)
      loop do
        # nil means end of file: stop instead of scanning forever
        raise IOError, 'JP2 SIZ marker not found' if byte.nil?
        if byte == TOKEN_MARKER_START
          # on 0xFF read subsequent byte; 0x51 completes the SIZ marker.
          # Otherwise that byte is re-examined on the next pass, since it
          # may itself be another 0xFF.
          byte = io.read(1)
          break if byte == TOKEN_MARKER_SIZ
        else
          byte = io.read(1)
        end
      end
      siz = io.read(SIZ_READ_LENGTH)
      raise IOError, 'truncated JP2 SIZ segment' if siz.nil? || siz.bytesize < SIZ_READ_LENGTH
      # discard first 4 bytes; next 4 bytes are XSiz; last 4 bytes are YSiz
      siz.byteslice(4, 8).unpack('NN')
    end

    # Reads the component count and bit depth that follow the 'ihdr' token.
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] number components, bits-per-component
    # @raise [IOError] if io is not in binary mode, or if the 'ihdr' token and
    #   the fields after it are not present in the first 64 bytes
    def extract_jp2_components(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      io.seek(0, IO::SEEK_SET)
      # IHDR should be in first 64 bytes
      header = io.read(64)
      # rindex: use the data following the last occurrence of the token
      token_at = header&.rindex(TOKEN_IHDR)
      raise IOError, 'JP2 ihdr box not found' if token_at.nil?
      ihdr_data = header.byteslice(token_at + TOKEN_IHDR.bytesize, IHDR_READ_LENGTH)
      raise IOError, 'truncated JP2 ihdr box' if ihdr_data.nil? || ihdr_data.bytesize < IHDR_READ_LENGTH
      # bytes 8-9: number of components; byte 10: bit depth field
      num_components, depth_minus_one = ihdr_data.byteslice(8, 3).unpack('nc')
      # stored as "bit depth of the components in the codestream, minus 1", so add 1
      [num_components, depth_minus_one + 1]
    end

    # Verifies that the first 23 bytes of the stream end with 'ftypjp2'.
    #
    # @param io [IO] IO stream positioned at the start of the file
    # @return [nil]
    # @raise [IOError] if the stream is empty or does not look like a JP2 file
    def validate_jp2(io)
      # verify file is jp2
      magic = io.read(23)
      raise IOError, 'Not JP2 file' unless magic&.end_with?('ftypjp2')
    end

    # Opens the file at {#path} and reads its technical metadata.
    #
    # @return [Hash] with keys :color ('monochrome', 'gray' or 'color'),
    #   :num_components, :bits_per_component, :width and :height
    # @raise [IOError] if the file is not a readable JP2
    def technical_metadata
      # block form closes the file even when validation or parsing raises
      File.open(path, 'rb') do |io|
        validate_jp2(io)
        x_siz, y_siz = extract_jp2_dim(io)
        nc, bpc = extract_jp2_components(io)
        color = nc >= 3 ? 'color' : 'gray'
        {
          color: bpc == 1 ? 'monochrome' : color,
          num_components: nc,
          bits_per_component: bpc,
          width: x_siz,
          height: y_siz
        }
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
module IiifPrint
  # The purpose of this module is to encode lineage related services:
  #
  # - {.ancestor_ids_for}
  # - {.descendent_file_set_ids_for}
  #
  # Ancestors and descendent file sets are useful for ensuring that related items are indexed
  # together. For example, given a work that is a book with one file set per page, searching the
  # book should find the text within that book's pages.
  #
  # The methods of this module should be considered as defining an interface.
  module LineageService
    class << self
      ##
      # Collects the ids of the works the object is in, recursing into each work that is itself
      # flagged as a child.
      #
      # @api public
      #
      # @param object [#in_works] An object that responds to #in_works
      # @return [Array<String>] unique, non-nil ancestor ids
      def ancestor_ids_for(object)
        collected = []
        object.in_works.each do |work|
          collected << work.id
          # only works flagged as children have their own parents walked
          collected.concat(ancestor_ids_for(work)) if work.is_child
        end
        collected.flatten.compact.uniq
      end

      ##
      # Collects the object's own file set ids, the ids gathered recursively from its ordered
      # works (when it has any), and finally its member ids.
      #
      # @param object [#ordered_works, #file_sets, #member_ids]
      # @return [Array<String>] the ids of associated file sets
      def descendent_file_set_ids_for(object)
        # enables us to return parents when searching for child OCR
        collected = object.file_sets.map(&:id)
        children = object.ordered_works
        children&.each do |child|
          collected.concat(descendent_file_set_ids_for(child))
        end
        # enables us to return parents when searching for child metadata
        collected.concat(object.member_ids)
        collected.flatten.uniq.compact
      end
    end
  end
end