iiif_print 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +2 -0
  3. data/.env +5 -0
  4. data/.fcrepo_wrapper +4 -0
  5. data/.github/release.yml +20 -0
  6. data/.github/workflows/branches.yml +24 -0
  7. data/.github/workflows/build-lint-test-action.yaml +33 -0
  8. data/.github/workflows/release_labels.yml +25 -0
  9. data/.gitignore +52 -0
  10. data/.rubocop.yml +177 -0
  11. data/.solr_wrapper +8 -0
  12. data/.travis.yml +49 -0
  13. data/CONTRIBUTING.md +181 -0
  14. data/Dockerfile +15 -0
  15. data/Gemfile +52 -0
  16. data/LICENSE +203 -0
  17. data/README.md +203 -0
  18. data/Rakefile +38 -0
  19. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +56 -0
  20. data/app/assets/config/iiif_print_manifest.js +2 -0
  21. data/app/assets/images/iiif_print/.keep +0 -0
  22. data/app/assets/javascripts/iiif_print/autocomplete_fix.js +33 -0
  23. data/app/assets/javascripts/iiif_print/ocr_search.js.erb +6 -0
  24. data/app/assets/javascripts/iiif_print.js +3 -0
  25. data/app/assets/stylesheets/iiif_print/_iiif_print.scss +4 -0
  26. data/app/assets/stylesheets/iiif_print/_issue_search.scss +13 -0
  27. data/app/assets/stylesheets/iiif_print/_issues_calendar.scss +18 -0
  28. data/app/assets/stylesheets/iiif_print/_newspapers_search.scss +38 -0
  29. data/app/assets/stylesheets/iiif_print/_search_results.scss +6 -0
  30. data/app/helpers/hyrax/iiif_helper.rb +22 -0
  31. data/app/helpers/iiif_print/application_helper.rb +5 -0
  32. data/app/helpers/iiif_print_helper.rb +64 -0
  33. data/app/indexers/concerns/iiif_print/child_indexer.rb +34 -0
  34. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +29 -0
  35. data/app/mailers/iiif_print/application_mailer.rb +8 -0
  36. data/app/models/concerns/iiif_print/set_child_flag.rb +29 -0
  37. data/app/models/concerns/iiif_print/solr/document.rb +47 -0
  38. data/app/models/iiif_print/application_record.rb +6 -0
  39. data/app/models/iiif_print/derivative_attachment.rb +8 -0
  40. data/app/models/iiif_print/iiif_search_response_decorator.rb +17 -0
  41. data/app/models/iiif_print/ingest_file_relation.rb +14 -0
  42. data/app/models/iiif_print/pending_relationship.rb +7 -0
  43. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +10 -0
  44. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +33 -0
  45. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +29 -0
  46. data/app/renderers/hyrax/renderers/faceted_attribute_renderer_decorator.rb +18 -0
  47. data/app/search_builders/concerns/iiif_print/exclude_models.rb +17 -0
  48. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +14 -0
  49. data/app/services/iiif_print/manifest_builder_service_behavior.rb +97 -0
  50. data/app/services/iiif_print/pluggable_derivative_service.rb +120 -0
  51. data/app/views/catalog/_snippets_more.html.erb +16 -0
  52. data/app/views/hyrax/base/_representative_media.html.erb +9 -0
  53. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +8 -0
  54. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  55. data/bin/rails +13 -0
  56. data/config/fcrepo_wrapper_test.yml +5 -0
  57. data/config/initializers/assets.rb +2 -0
  58. data/config/locales/iiif_print.de.yml +148 -0
  59. data/config/locales/iiif_print.en.yml +119 -0
  60. data/config/locales/iiif_print.es.yml +148 -0
  61. data/config/locales/iiif_print.fr.yml +149 -0
  62. data/config/locales/iiif_print.it.yml +142 -0
  63. data/config/locales/iiif_print.pt-BR.yml +148 -0
  64. data/config/locales/iiif_print.zh.yml +142 -0
  65. data/config/solr_wrapper_test.yml +9 -0
  66. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  67. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  68. data/config/test-fixture/solr-config/elevate.xml +36 -0
  69. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  70. data/config/test-fixture/solr-config/protwords.txt +21 -0
  71. data/config/test-fixture/solr-config/schema.xml +366 -0
  72. data/config/test-fixture/solr-config/scripts.conf +24 -0
  73. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  74. data/config/test-fixture/solr-config/spellings.txt +2 -0
  75. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  76. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  77. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  78. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  79. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  80. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  81. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  82. data/config/vendor/fits.xml +55 -0
  83. data/config/vendor/imagemagick-6-policy.xml +76 -0
  84. data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +12 -0
  85. data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +11 -0
  86. data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +11 -0
  87. data/docker-compose.yml +129 -0
  88. data/iiif_print.gemspec +43 -0
  89. data/lib/generators/iiif_print/assets_generator.rb +29 -0
  90. data/lib/generators/iiif_print/catalog_controller_generator.rb +32 -0
  91. data/lib/generators/iiif_print/install_generator.rb +52 -0
  92. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +22 -0
  93. data/lib/generators/iiif_print/templates/iiif_print.scss +1 -0
  94. data/lib/iiif_print/base_derivative_service.rb +113 -0
  95. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +84 -0
  96. data/lib/iiif_print/catalog_search_builder.rb +31 -0
  97. data/lib/iiif_print/configuration.rb +99 -0
  98. data/lib/iiif_print/data/fileset_helper.rb +25 -0
  99. data/lib/iiif_print/data/path_helper.rb +40 -0
  100. data/lib/iiif_print/data/work_derivatives.rb +323 -0
  101. data/lib/iiif_print/data/work_file.rb +92 -0
  102. data/lib/iiif_print/data/work_files.rb +199 -0
  103. data/lib/iiif_print/data.rb +35 -0
  104. data/lib/iiif_print/engine.rb +77 -0
  105. data/lib/iiif_print/errors.rb +9 -0
  106. data/lib/iiif_print/image_tool.rb +119 -0
  107. data/lib/iiif_print/jobs/application_job.rb +8 -0
  108. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +107 -0
  109. data/lib/iiif_print/jobs/create_relationships_job.rb +78 -0
  110. data/lib/iiif_print/jp2_derivative_service.rb +118 -0
  111. data/lib/iiif_print/jp2_image_metadata.rb +81 -0
  112. data/lib/iiif_print/lineage_service.rb +41 -0
  113. data/lib/iiif_print/metadata.rb +125 -0
  114. data/lib/iiif_print/pdf_derivative_service.rb +42 -0
  115. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +75 -0
  116. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +130 -0
  117. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +85 -0
  118. data/lib/iiif_print/text_extraction/alto_reader.rb +123 -0
  119. data/lib/iiif_print/text_extraction/hocr_reader.rb +172 -0
  120. data/lib/iiif_print/text_extraction/page_ocr.rb +87 -0
  121. data/lib/iiif_print/text_extraction/render_alto.rb +84 -0
  122. data/lib/iiif_print/text_extraction/word_coords_builder.rb +38 -0
  123. data/lib/iiif_print/text_extraction.rb +11 -0
  124. data/lib/iiif_print/text_extraction_derivative_service.rb +47 -0
  125. data/lib/iiif_print/text_formats_from_alto_service.rb +77 -0
  126. data/lib/iiif_print/tiff_derivative_service.rb +50 -0
  127. data/lib/iiif_print/version.rb +3 -0
  128. data/lib/iiif_print/works_controller_behavior.rb +9 -0
  129. data/lib/iiif_print.rb +136 -0
  130. data/lib/tasks/set_child_works.rake +22 -0
  131. data/spec/.keep.txt +1 -0
  132. data/spec/factories/ability.rb +6 -0
  133. data/spec/factories/newspaper_issue.rb +7 -0
  134. data/spec/factories/newspaper_page.rb +7 -0
  135. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  136. data/spec/factories/newspaper_title.rb +8 -0
  137. data/spec/factories/uploaded_pdf_file.rb +9 -0
  138. data/spec/factories/uploaded_txt_file.rb +9 -0
  139. data/spec/factories/user.rb +13 -0
  140. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  141. data/spec/fixtures/files/4.1.07.tiff +0 -0
  142. data/spec/fixtures/files/README.md +7 -0
  143. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  144. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  145. data/spec/fixtures/files/credits.md +16 -0
  146. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  147. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  148. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  149. data/spec/fixtures/files/minimal-alto.xml +31 -0
  150. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  151. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  152. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  153. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  154. data/spec/fixtures/files/ocr_alto.xml +202 -0
  155. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  156. data/spec/fixtures/files/ocr_color.tiff +0 -0
  157. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  158. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  159. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  160. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  161. data/spec/fixtures/files/page1.tiff +0 -0
  162. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  163. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  164. data/spec/fixtures/files/thumbnail.jpg +0 -0
  165. data/spec/helpers/hyrax/iiif_helper_spec.rb +65 -0
  166. data/spec/helpers/iiif_print_helper_spec.rb +43 -0
  167. data/spec/iiif_print/base_derivative_service_spec.rb +11 -0
  168. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +51 -0
  169. data/spec/iiif_print/catalog_search_builder_spec.rb +60 -0
  170. data/spec/iiif_print/configuration_spec.rb +67 -0
  171. data/spec/iiif_print/data/work_derivatives_spec.rb +245 -0
  172. data/spec/iiif_print/data/work_file_spec.rb +99 -0
  173. data/spec/iiif_print/data/work_files_spec.rb +237 -0
  174. data/spec/iiif_print/image_tool_spec.rb +109 -0
  175. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +30 -0
  176. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +17 -0
  177. data/spec/iiif_print/jp2_image_metadata_spec.rb +37 -0
  178. data/spec/iiif_print/lineage_service_spec.rb +13 -0
  179. data/spec/iiif_print/metadata_spec.rb +115 -0
  180. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +6 -0
  181. data/spec/iiif_print/text_extraction/alto_reader_spec.rb +49 -0
  182. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +45 -0
  183. data/spec/iiif_print/text_extraction/page_ocr_spec.rb +84 -0
  184. data/spec/iiif_print/text_extraction/render_alto_spec.rb +54 -0
  185. data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +44 -0
  186. data/spec/iiif_print_spec.rb +51 -0
  187. data/spec/misc_shared.rb +111 -0
  188. data/spec/models/iiif_print/derivative_attachment_spec.rb +37 -0
  189. data/spec/models/iiif_print/ingest_file_relation_spec.rb +56 -0
  190. data/spec/models/solr_document_spec.rb +14 -0
  191. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +19 -0
  192. data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +49 -0
  193. data/spec/services/iiif_print/jp2_derivative_service_spec.rb +59 -0
  194. data/spec/services/iiif_print/pdf_derivative_service_spec.rb +66 -0
  195. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +178 -0
  196. data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +82 -0
  197. data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +127 -0
  198. data/spec/services/iiif_print/tiff_derivative_service_spec.rb +65 -0
  199. data/spec/spec_helper.rb +181 -0
  200. data/spec/support/controller_level_helpers.rb +28 -0
  201. data/spec/support/iiif_print_models.rb +127 -0
  202. data/spec/test_app_templates/blacklight.yml +9 -0
  203. data/spec/test_app_templates/fedora.yml +15 -0
  204. data/spec/test_app_templates/lib/generators/test_app_generator.rb +40 -0
  205. data/spec/test_app_templates/redis.yml +9 -0
  206. data/spec/test_app_templates/solr/conf/schema.xml +362 -0
  207. data/spec/test_app_templates/solr/conf/solrconfig.xml +322 -0
  208. data/spec/test_app_templates/solr.yml +7 -0
  209. data/tasks/iiif_print_dev.rake +34 -0
  210. data/tmp/.keep +0 -0
  211. metadata +605 -0
@@ -0,0 +1,77 @@
1
+ require 'active_fedora'
2
+ require 'hyrax'
3
+ require 'blacklight_iiif_search'
4
+
5
+ module IiifPrint
6
+ # Absolute path to the directory of the installed iiif_print gem:
7
+ GEM_PATH = Gem::Specification.find_by_name("iiif_print").gem_dir
8
+
9
+ # Rails engine that wires IiifPrint's decorators, services and callbacks into Hyrax.
10
+ class Engine < ::Rails::Engine
11
+ isolate_namespace IiifPrint
12
+
13
+ # rubocop:disable Metrics/BlockLength
14
+ config.to_prepare do
15
+ # We don't have a hard requirement on Bulkrax but, in our experience, lingering on earlier
16
+ # versions can introduce bugs, both in Bulkrax itself and in assumptions that we have since resolved.
17
+ # Very early versions of Bulkrax do not have VERSION defined, so a missing constant also fails the check.
18
+ if defined?(Bulkrax) && !ENV.fetch("SKIP_IIIF_PRINT_BULKRAX_VERSION_REQUIREMENT", false)
19
+ if !defined?(Bulkrax::VERSION) || (Bulkrax::VERSION.to_i < 5)
20
+ raise "IiifPrint does not have a hard dependency on Bulkrax, " \
21
+ "but if you have Bulkrax installed we recommend at least version 5.0.0. " \
22
+ "To ignore this recommendation please add SKIP_IIIF_PRINT_BULKRAX_VERSION_REQUIREMENT " \
23
+ "to your ENV variables."
24
+ end
25
+ end
26
+
27
+ # Inject PluggableDerivativeService ahead of the Hyrax default.
28
+ # This wraps the Hyrax default, but allows multiple valid services
29
+ # to be configured, instead of just the _first_ valid service.
30
+ #
31
+ # To configure specific services, add each service, in the desired order,
32
+ # to the IiifPrint::PluggableDerivativeService.plugins array.
33
+
34
+ Hyrax::DerivativeService.services.unshift(
35
+ IiifPrint::PluggableDerivativeService
36
+ )
37
+
38
+ Hyrax::IiifManifestPresenter.prepend(IiifPrint::IiifManifestPresenterBehavior)
39
+ Hyrax::IiifManifestPresenter::Factory.prepend(IiifPrint::IiifManifestPresenterFactoryBehavior)
40
+ Hyrax::ManifestBuilderService.prepend(IiifPrint::ManifestBuilderServiceBehavior)
41
+ Hyrax::Renderers::FacetedAttributeRenderer.prepend(Hyrax::Renderers::FacetedAttributeRendererDecorator)
42
+ Hyrax::WorksControllerBehavior.prepend(IiifPrint::WorksControllerBehaviorDecorator)
43
+ Hyrax::WorkShowPresenter.prepend(IiifPrint::WorkShowPresenterDecorator)
44
+
45
+ IiifPrint::ChildIndexer.decorate_work_types!
46
+ IiifPrint::FileSetIndexer.decorate(Hyrax::FileSetIndexer)
47
+
48
+ ::BlacklightIiifSearch::IiifSearchResponse.prepend(IiifPrint::IiifSearchResponseDecorator)
49
+ ::BlacklightIiifSearch::IiifSearchAnnotation.prepend(IiifPrint::BlacklightIiifSearch::AnnotationDecorator)
50
+ Hyrax::Actors::FileSetActor.prepend(IiifPrint::Actors::FileSetActorDecorator)
51
+
52
+ # Give both presenters a base_url accessor; the base URL includes the protocol.
53
+ # We need the base URL to render the facet links and to normalize the interface.
54
+ Hyrax::IiifManifestPresenter.send(:attr_accessor, :base_url)
55
+ Hyrax::IiifManifestPresenter::DisplayImagePresenter.send(:attr_accessor, :base_url)
56
+ # Decorating this class because it has an #ability= but no #ability; this definition
57
+ # mirrors Hyrax::IiifManifestPresenter#ability. NOTE(review): confirm NullAbility resolves from this lexical scope.
58
+ module Hyrax::IiifManifestPresenter::DisplayImagePresenterDecorator
59
+ def ability
60
+ @ability ||= NullAbility.new
61
+ end
62
+ end
63
+ Hyrax::IiifManifestPresenter::DisplayImagePresenter.prepend(Hyrax::IiifManifestPresenter::DisplayImagePresenterDecorator)
64
+
65
+ Hyrax.config do |config|
66
+ config.callback.set(:after_create_fileset) do |file_set, user|
67
+ IiifPrint.config.handle_after_create_fileset(file_set, user)
68
+ end
69
+ end
70
+ end
71
+
72
+ config.after_initialize do
73
+ IiifPrint::Solr::Document.decorate(SolrDocument)
74
+ end
75
+ # rubocop:enable Metrics/BlockLength
76
+ end
77
+ end
@@ -0,0 +1,9 @@
1
+ module IiifPrint
2
+ # Generic base class for all IiifPrint-specific exceptions:
3
+ class IiifPrintError < StandardError
4
+ end
5
+
6
+ # Represents a data transformation error or a read error:
7
+ class DataError < IiifPrintError
8
+ end
9
+ end
@@ -0,0 +1,119 @@
1
+ require 'open3'
2
+ require 'tmpdir'
3
+
4
module IiifPrint
  # Inspects and converts a single image file by running the external
  # ImageMagick (`identify`, `convert`) and OpenJPEG (`opj_decompress`) tools.
  #
  # External commands are run with an argument vector (no shell), so paths
  # containing spaces or shell metacharacters are passed through verbatim.
  class ImageTool
    # ImageMagick options used to produce 1-bit, Group4-compressed output.
    MONOCHROME_OPTIONS = %w[-depth 1 -monochrome -compress Group4 -type bilevel].freeze

    attr_accessor :path, :ftype

    # @param path [String] path to the source image file
    def initialize(path)
      @path = path
      @ftype = magic
      @metadata = nil
    end

    # @return [Hash] hash with following symbol keys, and respectively
    #   typed String and/or Integer values.
    #   :width, :height — both in Integer px units
    #   :color — (String enumerated from 'gray', 'monochrome', 'color')
    #   :num_components - Integer, number of channels
    #   :bits_per_component — Integer, bits per channel (e.g. 8 vs. 1)
    #   :content_type — RFC 2045 MIME type
    def metadata
      return @metadata unless @metadata.nil?
      @metadata = jp2? ? jp2_metadata : identify_metadata
    end

    # Convert source image to image at destination path, inferring file type
    #   from destination file extension.  In case of JP2 files, create
    #   intermediate file using OpenJPEG 2000 that ImageMagick can use.
    #   Only outputs monochrome output if monochrome is true, destination
    #   format is TIFF.
    # @param destination [String] Path to output / destination file
    # @param monochrome [Boolean] true if monochrome output, otherwise false
    # @return [String] standard output of the `convert` command
    # @raise [RuntimeError] if the destination is a JP2 file
    def convert(destination, monochrome = false)
      raise 'JP2 output not yet supported' if destination.end_with?('jp2')
      return convert_image(@path, destination, monochrome) unless jp2?

      # Block form of mktmpdir removes the directory, and the intermediate
      # TIFF inside it, once the conversion is done.
      Dir.mktmpdir do |tmpdir|
        convert_image(jp2_to_tiff(@path, tmpdir), destination, monochrome)
      end
    end

    private

    # Run an external command without a shell.
    # @param argv [Array<String>] program name followed by its arguments
    # @return [String] standard output of the command
    def run_command(*argv)
      stdout, _status = Open3.capture2(*argv)
      stdout
    end

    # Monochrome options are only honoured when the last four characters of
    # the destination contain 'tif' (covers both '.tif' and '.tiff').
    def convert_image(source, destination, monochrome)
      monochrome &&= destination[-4, 4].to_s.include?('tif')
      opts = monochrome ? MONOCHROME_OPTIONS : []
      run_command('convert', source, *opts, destination)
    end

    # Decode a JP2 into an intermediate TIFF that ImageMagick can read.
    # @param source [String] path to JP2 file
    # @param tmpdir [String] directory in which to write the intermediate
    # @return [String] path to the intermediate TIFF
    def jp2_to_tiff(source, tmpdir = Dir.mktmpdir)
      intermediate_path = File.join(tmpdir, 'intermediate.tif')
      run_command('opj_decompress', '-i', source, '-o', intermediate_path)
      intermediate_path
    end

    def jp2_metadata
      result = IiifPrint::JP2ImageMetadata.new(path).technical_metadata
      result[:content_type] = 'image/jp2'
      result
    end

    # Given lines of "key: value" output, find the first line whose stripped
    # text starts with key (compared case-insensitively) and return the text
    # after its last colon, stripped of leading and trailing whitespace.
    # @return [String, nil] nil when no line matches
    def im_line_select(lines, key)
      prefix = key.downcase
      line = lines.find { |l| l.scrub.downcase.strip.start_with?(prefix) }
      return line if line.nil?
      line.scrub.strip.split(':')[-1].strip
    end

    # @return [Array(Integer, Integer)] width, height in Integer px units
    def im_identify_geometry(lines)
      img_geo = im_line_select(lines, 'geometry').split('+')[0]
      img_geo.split('x').map(&:to_i)
    end

    # @return [Array<String>] lines of output from imagemagick `identify`
    def im_identify
      run_command('identify', '-verbose', path).lines
    end

    def im_mime(lines)
      return 'application/pdf' if pdf? # workaround older imagemagick bug
      im_line_select(lines, 'mime type')
    end

    # Fill in :num_components, :color and :bits_per_component on result.
    def populate_im_color!(lines, result)
      bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
      colorspace = im_line_select(lines, 'colorspace')
      color = colorspace == 'Gray' ? 'gray' : 'color'
      # Match 'alpha:' exactly (with the colon) so that unrelated keys that
      # merely begin with the word "alpha" are not mistaken for a channel.
      has_alpha = !im_line_select(lines, 'alpha:').nil?
      result[:num_components] = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
      result[:color] = bpc == 1 ? 'monochrome' : color
      result[:bits_per_component] = bpc
    end

    # Return metadata by means of imagemagick identify
    def identify_metadata
      result = {}
      lines = im_identify
      result[:width], result[:height] = im_identify_geometry(lines)
      result[:content_type] = im_mime(lines)
      populate_im_color!(lines, result)
      result
    end

    # @return [String] up to the first 23 bytes of the file; empty String
    #   for an empty file (a length-limited read returns nil at EOF)
    def magic
      File.binread(@path, 23, 0).to_s
    end

    def jp2?
      @ftype.end_with?('ftypjp2')
    end

    def pdf?
      magic.start_with?('%PDF-')
    end
  end
end
@@ -0,0 +1,8 @@
1
+ module IiifPrint
2
+ module Jobs
3
+ # TODO: Consider inheriting from ::ApplicationJob. That way we would pick up the host
4
+ # application's base job behavior.
5
+ class ApplicationJob < ActiveJob::Base
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,107 @@
1
module IiifPrint
  module Jobs
    # Splits each given PDF into page images, queues a batch-create job that
    # makes one child work per page, records a PendingRelationship per page,
    # and finally schedules CreateRelationshipsJob to attach the children to
    # the parent work.
    class ChildWorksFromPdfJob < IiifPrint::Jobs::ApplicationJob
      # Break a pdf into individual pages
      # @param parent_work
      # @param pdf_paths: [<Array => String>] paths to pdfs
      # @param user: [User]
      # @param admin_set_id: [<String>]
      # @param prior_pdfs: [<Integer>] count of pdfs already on parent work
      def perform(parent_work, pdf_paths, user, admin_set_id, prior_pdfs)
        @parent_work = parent_work
        @child_admin_set_id = admin_set_id
        child_model = @parent_work.iiif_print_config.pdf_split_child_model

        # handle each input pdf
        pdf_paths.each_with_index do |path, pdf_idx|
          split_pdf(path, pdf_idx, user, prior_pdfs, child_model)
        end

        # Link newly created child works to the parent; delayed so that the
        # batch-create jobs queued above have a chance to run first.
        IiifPrint::Jobs::CreateRelationshipsJob.set(wait: 10.minutes).perform_later(
          user: user,
          parent_id: @parent_work.id,
          parent_model: @parent_work.class.to_s,
          child_model: child_model.to_s
        )

        # TODO: clean up image_files and pdf_paths
      end

      private

      # Split one PDF into images and queue creation of one child work per image.
      # Does nothing when the splitter yields no images.
      def split_pdf(path, pdf_idx, user, prior_pdfs_count, child_model)
        image_files = @parent_work.iiif_print_config.pdf_splitter_service.new(path).to_a
        return if image_files.blank?

        pdf_sequence = pdf_idx + prior_pdfs_count
        prepare_import_data(pdf_sequence, image_files, user)

        # submit the job to create all the child works for one PDF
        # @param [User] user
        # @param [Hash<String => String>] titles
        # @param [Hash<String => String>] resource_types (optional)
        # @param [Array<String>] uploaded_files Hyrax::UploadedFile IDs
        # @param [Hash] attributes attributes to apply to all works, including :model
        # @param [Hyrax::BatchCreateOperation] operation
        operation = Hyrax::BatchCreateOperation.create!(
          user: user,
          operation_type: "PDF Batch Create"
        )
        BatchCreateJob.perform_later(user,
                                     @child_work_titles,
                                     {},
                                     @uploaded_files,
                                     attributes.merge(model: child_model.to_s).with_indifferent_access,
                                     operation)
      end

      # Populate @uploaded_files and @child_work_titles for one PDF and record
      # a PendingRelationship for each page image.
      def prepare_import_data(pdf_sequence, image_files, user)
        @uploaded_files = []
        @child_work_titles = {}
        image_files.each_with_index do |image_path, idx|
          file_id = create_uploaded_file(user, image_path).to_s
          file_title = set_title(@parent_work.title.first, pdf_sequence, idx)
          @uploaded_files << file_id
          @child_work_titles[file_id] = file_title
          # save child work info to create the member relationships
          PendingRelationship.create!(child_title: file_title,
                                      parent_id: @parent_work.id,
                                      child_order: sort_order(pdf_sequence, idx))
        end
      end

      # Sort key for a page: PDF sequence, then page index.
      #
      # Both numbers are zero-padded because the key is a String that
      # CreateRelationshipsJob sorts with `order('child_order asc')`; without
      # padding "0 10" would sort ahead of "0 2".
      # @return [String] e.g. "000000 000010"
      def sort_order(pdf_sequence, idx)
        format('%<pdf>06d %<page>06d', pdf: pdf_sequence, page: idx)
      end

      # @return the id of a newly saved Hyrax::UploadedFile wrapping path
      def create_uploaded_file(user, path)
        uf = Hyrax::UploadedFile.new
        uf.user_id = user.id
        uf.file = CarrierWave::SanitizedFile.new(path)
        uf.save!
        uf.id
      end

      # @return [String] child title, using 1-based PDF and page numbers
      def set_title(title, pdf_sequence, idx)
        pdf_index = "Pdf Nbr #{pdf_sequence + 1}"
        page_number = "Page #{idx + 1}"
        "#{title}: #{pdf_index}, #{page_number}"
      end

      # TODO: what attributes do we need to fill in from the parent work? What about AllinsonFlex?
      # @return [Hash] attributes copied from the parent work onto each child
      def attributes
        {
          admin_set_id: @child_admin_set_id.to_s,
          creator: @parent_work.creator.to_a,
          rights_statement: @parent_work.rights_statement.to_a,
          visibility: @parent_work.visibility.to_s
        }
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
module IiifPrint
  module Jobs
    # Attaches child works to their parent work once every child recorded in
    # IiifPrint::PendingRelationship for that parent can be found by title.
    # Until then the job reschedules itself, up to MAX_RESCHEDULES times.
    class CreateRelationshipsJob < IiifPrint::Jobs::ApplicationJob
      # Upper bound on self-rescheduling. At the 10 minute wait used below
      # this is roughly 24 hours; it keeps a job whose children never appear
      # from rescheduling itself forever.
      MAX_RESCHEDULES = 144

      # Link newly created child works to the parent
      # @param user: [User] user
      # @param parent_id: [<String>] parent work id
      # @param parent_model: [<String>] parent model
      # @param child_model: [<String>] child model
      # @param reschedules: [Integer] how many times this job has already been
      #   rescheduled; callers normally omit it
      # @raise [RuntimeError] when children are still missing after
      #   MAX_RESCHEDULES reschedules
      def perform(user:, parent_id:, parent_model:, child_model:, reschedules: 0)
        if completed_child_data_for(parent_id, child_model)
          # add the members
          parent_work = parent_model.constantize.find(parent_id)
          create_relationships(user: user, parent: parent_work, ordered_children: @child_works)
          @pending_children.each(&:destroy)
        elsif reschedules >= MAX_RESCHEDULES
          raise "CreateRelationshipsJob gave up on parent #{parent_id} after " \
                "#{reschedules} reschedules; child works were never all found"
        else
          # reschedule the job and end this one normally
          reschedule(user: user,
                     parent_id: parent_id,
                     parent_model: parent_model,
                     child_model: child_model,
                     reschedules: reschedules + 1)
        end
      end

      private

      # Loads @pending_children (ordered by child_order) and @child_works.
      # @return [Boolean] true when every pending child title matched at
      #   least one work; false as soon as one is missing
      def completed_child_data_for(parent_id, child_model)
        @child_works = []

        # find and sequence all pending children
        @pending_children = IiifPrint::PendingRelationship.where(parent_id: parent_id).order('child_order asc')

        # find child works (skip out if any haven't yet been created)
        @pending_children.each do |child|
          # find by title... if any aren't found, the child works are not yet ready
          found_children = find_children_by_title_for(child.child_title, child_model)
          return false if found_children.empty?
          @child_works += found_children
        end
        true
      end

      def find_children_by_title_for(title, model)
        # We should only find one, but there is no guarantee of that and `:where` returns an array.
        model.constantize.where(title: title)
      end

      # Enqueue this job again in 10 minutes, carrying the reschedule count.
      def reschedule(user:, parent_id:, parent_model:, child_model:, reschedules: 0)
        CreateRelationshipsJob.set(wait: 10.minutes).perform_later(
          user: user,
          parent_id: parent_id,
          parent_model: parent_model,
          child_model: child_model,
          reschedules: reschedules
        )
      end

      # Update the parent through the Hyrax actor stack with the children as
      # work members, keyed by their position in ordered_children.
      def create_relationships(user:, parent:, ordered_children:)
        records_hash = {}
        ordered_children.map(&:id).each_with_index do |child_id, i|
          records_hash[i.to_s] = { id: child_id }
        end
        attrs = { work_members_attributes: records_hash }
        parent.try(:reindex_extent=, Hyrax::Adapters::NestingIndexAdapter::LIMITED_REINDEX)
        env = Hyrax::Actors::Environment.new(parent, Ability.new(user), attrs)

        Hyrax::CurationConcern.actor.update(env)
        # need to reindex all file_sets to make sure all ancestors are indexed
        ordered_children.each do |child_work|
          child_work.file_sets.each(&:update_index) if child_work.respond_to?(:file_sets)
        end
      end
    end
  end
end
@@ -0,0 +1,118 @@
1
+ require 'open3'
2
+
3
module IiifPrint
  # Derivative service that creates a JP2 derivative with OpenJPEG, using
  # ImageMagick to build an intermediate bitmap when the source is not a
  # TIFF that OpenJPEG can read directly.
  #
  # External commands are run with an argument vector (no shell), so source
  # and destination paths are passed through verbatim.
  class JP2DerivativeService < BaseDerivativeService
    # OpenJPEG 2000 Command to make NDNP-compliant grayscale JP2:
    CMD_GRAY = 'opj_compress -i %<source_file>s -o %<out_file>s ' \
      '-d 0,0 -b 64,64 -n 6 -p RLCP -t 1024,1024 -I -M 1 ' \
      '-r 64,53.821,45.249,40,32,26.911,22.630,20,16,14.286,' \
      '11.364,10,8,6.667,5.556,4.762,4,3.333,2.857,2.500,2,' \
      '1.667,1.429,1.190,1'.freeze

    # OpenJPEG 2000 Command to make RGB JP2:
    CMD_COLOR = 'opj_compress -i %<source_file>s -o %<out_file>s ' \
      '-d 0,0 -b 64,64 -n 6 -p RPCL -t 1024,1024 -I -M 1 '\
      '-r 2.4,1.48331273,.91673033,.56657224,.35016049,.21641118,' \
      '.13374944,.0944,.08266171'.freeze

    # OpenJPEG 1.x command replacement for 2.x opj_compress, takes same options;
    #   this is necessary on Ubuntu Trusty (e.g. Travis CI)
    CMD_1X = 'image_to_j2k'.freeze

    # Target file extension of this service plugin:
    self.target_extension = 'jp2'.freeze

    attr_reader :file_set
    delegate :uri, :mime_type, to: :file_set

    def initialize(file_set)
      # Not read anywhere in this class.
      # NOTE(review): presumably a cache slot used by the base class — confirm.
      @command = nil
      # Temp files / symlinks to remove once the derivative has been made
      @unlink_after_creation = []
      super(file_set)
    end

    # Create the JP2 derivative at @dest_path. The return value is not
    # meaningful. Nothing is created when the source is already a JP2.
    # @param filename [String] passed to the base class, which takes care of
    #   loading @source_path, @dest_path
    def create_derivatives(filename)
      super(filename)

      # no creation if jp2 master => deemed unnecessary/duplicative
      return if mime_type == 'image/jp2'

      # if we have a non-TIFF source, or a 1-bit monochrome source, we need
      #   to make a NetPBM-based intermediate (temporary) file for OpenJPEG
      #   to consume.
      needs_intermediate = !tiff_source? || one_bit?

      begin
        # We use either intermediate temp file, or temp symlink (to work
        #   around OpenJPEG 2000 file naming quirk).
        needs_intermediate ? make_intermediate_source : make_symlink

        # Run the OpenJPEG command to make derivative file at @dest_path
        run_command(*opj_command)
      ensure
        # Clean up intermediate files or symlinks even when a step raised
        cleanup_intermediate
      end
    end

    private

    # Run an external command without a shell.
    # @param argv [Array<String>] program name followed by its arguments
    # @return [String] standard output of the command
    def run_command(*argv)
      stdout, _status = Open3.capture2(*argv)
      stdout
    end

    # source introspection:

    def tiff_source?
      identify[:content_type] == 'image/tiff'
    end

    def make_symlink
      # OpenJPEG binaries have annoying quirk of only using TIFF input
      #   files whose name ends in .TIF or .tif (three letter); for all
      #   non-monochrome TIFF files, we just assume we need to symlink
      #   to such a filename.
      tmpname = File.join(Dir.tmpdir, "#{SecureRandom.uuid}.tif")
      FileUtils.ln_s(@source_path, tmpname)
      @unlink_after_creation.push(tmpname)
      # finally, point @source_path for command at intermediate link:
      @source_path = tmpname
    end

    def make_intermediate_source
      # generate a random filename to be made, with appropriate extension,
      #   inside /tmp dir:
      tmpname = File.join(
        Dir.tmpdir,
        format(
          "#{SecureRandom.uuid}.%<ext>s",
          ext: use_color? ? 'ppm' : 'pgm'
        )
      )
      # if pdf source, get only first page
      source_path = @source_path
      source_path += '[0]' if @source_path.end_with?('pdf')
      # register before converting so a partial file is still cleaned up
      @unlink_after_creation.push(tmpname)
      # Use ImageMagick `convert` to create intermediate bitmap:
      run_command('convert', source_path, tmpname)
      # finally, point @source_path for command at intermediate file:
      @source_path = tmpname
    end

    # Build the OpenJPEG command for a color or grayscale source.
    # The template is split into words before the paths are substituted, so
    # each path remains a single argument.
    # @return [Array<String>] program name followed by its arguments
    def opj_command
      template = use_color? ? CMD_COLOR : CMD_GRAY
      argv = template.split(' ').map do |word|
        format(word, source_file: @source_path, out_file: @dest_path)
      end
      # Fall back to the OpenJPEG 1.x binary when opj_compress is not found
      argv[0] = CMD_1X if `which opj_compress`.empty?
      argv
    end

    def cleanup_intermediate
      # remove symlink or intermediate file once we no longer need it;
      #   rm_f tolerates a path that was never created or is already gone
      @unlink_after_creation.each do |path|
        FileUtils.rm_f(path)
      end
      @unlink_after_creation.clear
    end
  end
end
@@ -0,0 +1,81 @@
1
module IiifPrint
  # Reads basic technical metadata (pixel dimensions, number of
  # components, bits per component) straight from the bytes of a JP2 file.
  #
  # Malformed, truncated, or non-JP2 input is reported with IOError.
  class JP2ImageMetadata
    # `String#b` returns a binary copy, so these definitions stay valid
    # even when string literals are frozen.
    TOKEN_MARKER_START = "\xFF".b.freeze
    TOKEN_MARKER_SIZ = "\x51".b.freeze
    TOKEN_IHDR = 'ihdr'.freeze

    attr_accessor :path

    # @param path [String] path to jp2, for reading
    def initialize(path)
      @path = path
    end

    # Scans for the SIZ marker (0xFF 0x51) and reads the image size that
    # follows it.
    #
    # Informed by ISO/IEC 15444-1:2000, pp. 26-27
    # via:
    # http://hosting.astro.cornell.edu/~carcich/LRO/jp2/ISO_JPEG200_Standard/INCITS+ISO+IEC+15444-1-2000.pdf
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] X size, Y size, in Integer-typed px
    # @raise [IOError] if io is not in binary mode, the stream ends before
    #   a SIZ marker is seen, or fewer than 12 bytes follow the marker
    def extract_jp2_dim(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      # first 23 bytes are file-magic, we can skip
      io.seek(23, IO::SEEK_SET)
      # read one byte at a time until 0xFF is immediately followed by 0x51;
      # stop with an error at end of file rather than reading nil forever
      previous = nil
      loop do
        current = io.read(1)
        raise IOError, 'JP2 SIZ marker not found' if current.nil?
        break if previous == TOKEN_MARKER_START && current == TOKEN_MARKER_SIZ
        previous = current
      end
      # on 0x51, read next 12 bytes
      siz = io.read(12)
      raise IOError, 'JP2 SIZ marker segment is truncated' if siz.nil? || siz.bytesize < 12
      # discard first 4 bytes; next 4 bytes are XSiz; last 4 bytes are YSiz
      siz.unpack('x4NN')
    end

    # Reads the component count and bit depth that follow the 'ihdr'
    # token near the start of the file.
    #
    # @param io [IO] IO stream opened in binary mode, for reading
    # @return [Array(Integer, Integer)] number components, bits-per-component
    # @raise [IOError] if io is not in binary mode, or the 'ihdr' token and
    #   the 11 bytes needed after it are not within the first 64 bytes
    def extract_jp2_components(io)
      raise IOError, 'file not open in binary mode' unless io.binmode?
      io.seek(0, IO::SEEK_SET)
      # IHDR should be in first 64 bytes
      header = io.read(64)
      ihdr_at = header&.index(TOKEN_IHDR)
      raise IOError, 'JP2 ihdr box not found' if ihdr_at.nil?
      ihdr_data = header.byteslice((ihdr_at + TOKEN_IHDR.bytesize)..-1)
      raise IOError, 'JP2 ihdr box is truncated' if ihdr_data.bytesize < 11
      # the first 8 bytes after the token are skipped; the next 2 are read
      # as the big-endian component count
      num_components = ihdr_data.byteslice(8, 2).unpack('n').first
      # stored as "bit depth of the components in the codestream, minus 1", so add 1
      bits_per_component = ihdr_data.byteslice(10, 1).unpack('c').first + 1
      [num_components, bits_per_component]
    end

    # Verifies the file is a JP2 by checking that the 23 bytes read from
    # the current position end with 'ftypjp2'.
    #
    # @param io [IO] IO stream positioned at the start of the file
    # @return [nil]
    # @raise [IOError] if fewer bytes are available or the magic is absent
    def validate_jp2(io)
      magic = io.read(23)
      raise IOError, 'Not JP2 file' unless magic&.end_with?('ftypjp2')
    end

    # Opens the file at {#path} and gathers its technical metadata. The
    # file handle is closed even when validation or parsing raises.
    #
    # @return [Hash] with keys :color ('monochrome', 'gray' or 'color'),
    #   :num_components, :bits_per_component, :width and :height
    # @raise [IOError] if the file is not a readable, well-formed JP2
    def technical_metadata
      File.open(path, 'rb') do |io|
        validate_jp2(io)
        x_siz, y_siz = extract_jp2_dim(io)
        nc, bpc = extract_jp2_components(io)
        color = nc >= 3 ? 'color' : 'gray'
        {
          color: bpc == 1 ? 'monochrome' : color,
          num_components: nc,
          bits_per_component: bpc,
          width: x_siz,
          height: y_siz
        }
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
module IiifPrint
  # This module encodes the lineage related services:
  #
  # - {.ancestor_ids_for}
  # - {.descendent_file_set_ids_for}
  #
  # Ancestors and descendent file sets are useful for making sure related
  # items are indexed together. For example, given a work that is a book
  # with one file set per page, searching the book should find the text
  # within that book's pages.
  #
  # The methods of this module should be considered as defining an interface.
  module LineageService
    ##
    # @api public
    #
    # Collects the ids of the works the object is in, recursing into each
    # work that reports itself as a child.
    #
    # @param object [#in_works] An object that responds to #in_works
    # @return [Array<String>] unique, non-nil ancestor ids
    def self.ancestor_ids_for(object)
      collected = object.in_works.flat_map do |work|
        own_id = [work.id]
        work.is_child ? own_id + ancestor_ids_for(work) : own_id
      end
      collected.flatten.compact.uniq
    end

    ##
    # Collects the ids of the object's file sets, those of its ordered
    # works (recursively), and the object's member ids.
    #
    # @param object [#ordered_works, #file_sets, #member_ids]
    # @return [Array<String>] the ids of associated file sets
    def self.descendent_file_set_ids_for(object)
      # enables us to return parents when searching for child OCR
      ids = object.file_sets.map(&:id)
      object.ordered_works&.each { |child| ids.concat(descendent_file_set_ids_for(child)) }
      # enables us to return parents when searching for child metadata
      ids.concat(object.member_ids)
      ids.flatten.uniq.compact
    end
  end
end