iiif_print 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +2 -0
  3. data/.env +5 -0
  4. data/.fcrepo_wrapper +4 -0
  5. data/.github/release.yml +20 -0
  6. data/.github/workflows/branches.yml +24 -0
  7. data/.github/workflows/build-lint-test-action.yaml +33 -0
  8. data/.github/workflows/release_labels.yml +25 -0
  9. data/.gitignore +52 -0
  10. data/.rubocop.yml +177 -0
  11. data/.solr_wrapper +8 -0
  12. data/.travis.yml +49 -0
  13. data/CONTRIBUTING.md +181 -0
  14. data/Dockerfile +15 -0
  15. data/Gemfile +52 -0
  16. data/LICENSE +203 -0
  17. data/README.md +203 -0
  18. data/Rakefile +38 -0
  19. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +56 -0
  20. data/app/assets/config/iiif_print_manifest.js +2 -0
  21. data/app/assets/images/iiif_print/.keep +0 -0
  22. data/app/assets/javascripts/iiif_print/autocomplete_fix.js +33 -0
  23. data/app/assets/javascripts/iiif_print/ocr_search.js.erb +6 -0
  24. data/app/assets/javascripts/iiif_print.js +3 -0
  25. data/app/assets/stylesheets/iiif_print/_iiif_print.scss +4 -0
  26. data/app/assets/stylesheets/iiif_print/_issue_search.scss +13 -0
  27. data/app/assets/stylesheets/iiif_print/_issues_calendar.scss +18 -0
  28. data/app/assets/stylesheets/iiif_print/_newspapers_search.scss +38 -0
  29. data/app/assets/stylesheets/iiif_print/_search_results.scss +6 -0
  30. data/app/helpers/hyrax/iiif_helper.rb +22 -0
  31. data/app/helpers/iiif_print/application_helper.rb +5 -0
  32. data/app/helpers/iiif_print_helper.rb +64 -0
  33. data/app/indexers/concerns/iiif_print/child_indexer.rb +34 -0
  34. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +29 -0
  35. data/app/mailers/iiif_print/application_mailer.rb +8 -0
  36. data/app/models/concerns/iiif_print/set_child_flag.rb +29 -0
  37. data/app/models/concerns/iiif_print/solr/document.rb +47 -0
  38. data/app/models/iiif_print/application_record.rb +6 -0
  39. data/app/models/iiif_print/derivative_attachment.rb +8 -0
  40. data/app/models/iiif_print/iiif_search_response_decorator.rb +17 -0
  41. data/app/models/iiif_print/ingest_file_relation.rb +14 -0
  42. data/app/models/iiif_print/pending_relationship.rb +7 -0
  43. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +10 -0
  44. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +33 -0
  45. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +29 -0
  46. data/app/renderers/hyrax/renderers/faceted_attribute_renderer_decorator.rb +18 -0
  47. data/app/search_builders/concerns/iiif_print/exclude_models.rb +17 -0
  48. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +14 -0
  49. data/app/services/iiif_print/manifest_builder_service_behavior.rb +97 -0
  50. data/app/services/iiif_print/pluggable_derivative_service.rb +120 -0
  51. data/app/views/catalog/_snippets_more.html.erb +16 -0
  52. data/app/views/hyrax/base/_representative_media.html.erb +9 -0
  53. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +8 -0
  54. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  55. data/bin/rails +13 -0
  56. data/config/fcrepo_wrapper_test.yml +5 -0
  57. data/config/initializers/assets.rb +2 -0
  58. data/config/locales/iiif_print.de.yml +148 -0
  59. data/config/locales/iiif_print.en.yml +119 -0
  60. data/config/locales/iiif_print.es.yml +148 -0
  61. data/config/locales/iiif_print.fr.yml +149 -0
  62. data/config/locales/iiif_print.it.yml +142 -0
  63. data/config/locales/iiif_print.pt-BR.yml +148 -0
  64. data/config/locales/iiif_print.zh.yml +142 -0
  65. data/config/solr_wrapper_test.yml +9 -0
  66. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  67. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  68. data/config/test-fixture/solr-config/elevate.xml +36 -0
  69. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  70. data/config/test-fixture/solr-config/protwords.txt +21 -0
  71. data/config/test-fixture/solr-config/schema.xml +366 -0
  72. data/config/test-fixture/solr-config/scripts.conf +24 -0
  73. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  74. data/config/test-fixture/solr-config/spellings.txt +2 -0
  75. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  76. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  77. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  78. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  79. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  80. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  81. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  82. data/config/vendor/fits.xml +55 -0
  83. data/config/vendor/imagemagick-6-policy.xml +76 -0
  84. data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +12 -0
  85. data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +11 -0
  86. data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +11 -0
  87. data/docker-compose.yml +129 -0
  88. data/iiif_print.gemspec +43 -0
  89. data/lib/generators/iiif_print/assets_generator.rb +29 -0
  90. data/lib/generators/iiif_print/catalog_controller_generator.rb +32 -0
  91. data/lib/generators/iiif_print/install_generator.rb +52 -0
  92. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +22 -0
  93. data/lib/generators/iiif_print/templates/iiif_print.scss +1 -0
  94. data/lib/iiif_print/base_derivative_service.rb +113 -0
  95. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +84 -0
  96. data/lib/iiif_print/catalog_search_builder.rb +31 -0
  97. data/lib/iiif_print/configuration.rb +99 -0
  98. data/lib/iiif_print/data/fileset_helper.rb +25 -0
  99. data/lib/iiif_print/data/path_helper.rb +40 -0
  100. data/lib/iiif_print/data/work_derivatives.rb +323 -0
  101. data/lib/iiif_print/data/work_file.rb +92 -0
  102. data/lib/iiif_print/data/work_files.rb +199 -0
  103. data/lib/iiif_print/data.rb +35 -0
  104. data/lib/iiif_print/engine.rb +77 -0
  105. data/lib/iiif_print/errors.rb +9 -0
  106. data/lib/iiif_print/image_tool.rb +119 -0
  107. data/lib/iiif_print/jobs/application_job.rb +8 -0
  108. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +107 -0
  109. data/lib/iiif_print/jobs/create_relationships_job.rb +78 -0
  110. data/lib/iiif_print/jp2_derivative_service.rb +118 -0
  111. data/lib/iiif_print/jp2_image_metadata.rb +81 -0
  112. data/lib/iiif_print/lineage_service.rb +41 -0
  113. data/lib/iiif_print/metadata.rb +125 -0
  114. data/lib/iiif_print/pdf_derivative_service.rb +42 -0
  115. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +75 -0
  116. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +130 -0
  117. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +85 -0
  118. data/lib/iiif_print/text_extraction/alto_reader.rb +123 -0
  119. data/lib/iiif_print/text_extraction/hocr_reader.rb +172 -0
  120. data/lib/iiif_print/text_extraction/page_ocr.rb +87 -0
  121. data/lib/iiif_print/text_extraction/render_alto.rb +84 -0
  122. data/lib/iiif_print/text_extraction/word_coords_builder.rb +38 -0
  123. data/lib/iiif_print/text_extraction.rb +11 -0
  124. data/lib/iiif_print/text_extraction_derivative_service.rb +47 -0
  125. data/lib/iiif_print/text_formats_from_alto_service.rb +77 -0
  126. data/lib/iiif_print/tiff_derivative_service.rb +50 -0
  127. data/lib/iiif_print/version.rb +3 -0
  128. data/lib/iiif_print/works_controller_behavior.rb +9 -0
  129. data/lib/iiif_print.rb +136 -0
  130. data/lib/tasks/set_child_works.rake +22 -0
  131. data/spec/.keep.txt +1 -0
  132. data/spec/factories/ability.rb +6 -0
  133. data/spec/factories/newspaper_issue.rb +7 -0
  134. data/spec/factories/newspaper_page.rb +7 -0
  135. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  136. data/spec/factories/newspaper_title.rb +8 -0
  137. data/spec/factories/uploaded_pdf_file.rb +9 -0
  138. data/spec/factories/uploaded_txt_file.rb +9 -0
  139. data/spec/factories/user.rb +13 -0
  140. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  141. data/spec/fixtures/files/4.1.07.tiff +0 -0
  142. data/spec/fixtures/files/README.md +7 -0
  143. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  144. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  145. data/spec/fixtures/files/credits.md +16 -0
  146. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  147. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  148. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  149. data/spec/fixtures/files/minimal-alto.xml +31 -0
  150. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  151. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  152. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  153. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  154. data/spec/fixtures/files/ocr_alto.xml +202 -0
  155. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  156. data/spec/fixtures/files/ocr_color.tiff +0 -0
  157. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  158. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  159. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  160. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  161. data/spec/fixtures/files/page1.tiff +0 -0
  162. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  163. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  164. data/spec/fixtures/files/thumbnail.jpg +0 -0
  165. data/spec/helpers/hyrax/iiif_helper_spec.rb +65 -0
  166. data/spec/helpers/iiif_print_helper_spec.rb +43 -0
  167. data/spec/iiif_print/base_derivative_service_spec.rb +11 -0
  168. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +51 -0
  169. data/spec/iiif_print/catalog_search_builder_spec.rb +60 -0
  170. data/spec/iiif_print/configuration_spec.rb +67 -0
  171. data/spec/iiif_print/data/work_derivatives_spec.rb +245 -0
  172. data/spec/iiif_print/data/work_file_spec.rb +99 -0
  173. data/spec/iiif_print/data/work_files_spec.rb +237 -0
  174. data/spec/iiif_print/image_tool_spec.rb +109 -0
  175. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +30 -0
  176. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +17 -0
  177. data/spec/iiif_print/jp2_image_metadata_spec.rb +37 -0
  178. data/spec/iiif_print/lineage_service_spec.rb +13 -0
  179. data/spec/iiif_print/metadata_spec.rb +115 -0
  180. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +6 -0
  181. data/spec/iiif_print/text_extraction/alto_reader_spec.rb +49 -0
  182. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +45 -0
  183. data/spec/iiif_print/text_extraction/page_ocr_spec.rb +84 -0
  184. data/spec/iiif_print/text_extraction/render_alto_spec.rb +54 -0
  185. data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +44 -0
  186. data/spec/iiif_print_spec.rb +51 -0
  187. data/spec/misc_shared.rb +111 -0
  188. data/spec/models/iiif_print/derivative_attachment_spec.rb +37 -0
  189. data/spec/models/iiif_print/ingest_file_relation_spec.rb +56 -0
  190. data/spec/models/solr_document_spec.rb +14 -0
  191. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +19 -0
  192. data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +49 -0
  193. data/spec/services/iiif_print/jp2_derivative_service_spec.rb +59 -0
  194. data/spec/services/iiif_print/pdf_derivative_service_spec.rb +66 -0
  195. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +178 -0
  196. data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +82 -0
  197. data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +127 -0
  198. data/spec/services/iiif_print/tiff_derivative_service_spec.rb +65 -0
  199. data/spec/spec_helper.rb +181 -0
  200. data/spec/support/controller_level_helpers.rb +28 -0
  201. data/spec/support/iiif_print_models.rb +127 -0
  202. data/spec/test_app_templates/blacklight.yml +9 -0
  203. data/spec/test_app_templates/fedora.yml +15 -0
  204. data/spec/test_app_templates/lib/generators/test_app_generator.rb +40 -0
  205. data/spec/test_app_templates/redis.yml +9 -0
  206. data/spec/test_app_templates/solr/conf/schema.xml +362 -0
  207. data/spec/test_app_templates/solr/conf/solrconfig.xml +322 -0
  208. data/spec/test_app_templates/solr.yml +7 -0
  209. data/tasks/iiif_print_dev.rake +34 -0
  210. data/tmp/.keep +0 -0
  211. metadata +605 -0
@@ -0,0 +1,125 @@
1
+ module IiifPrint
2
+ # rubocop:disable Metrics/ClassLength
3
# Builds the `metadata` section of a IIIF Presentation manifest for a work.
#
# The output shape is selected by +version+:
# * 2 - an Array of `{ 'label' => String, 'value' => Array<String> }` hashes
#   whose values may contain HTML anchors (collection links, facet search
#   links and auto-linked URLs).
# * 3 - an Array of
#   `{ 'label' => { locale => [String] }, 'value' => { 'none' => [String] } }`
#   hashes whose values are sanitized with Loofah.
class Metadata
  # Matches "link looking" substrings: text starting with a URI scheme,
  # with `www.`, or with a bare `domain.tld/`, up to a sensible final
  # character (trailing punctuation is left out of the match).
  MAKE_LINK_REGEX = %r{
    \b
    (
      (?: [a-z][\w-]+:
        (?: /{1,3} | [a-z0-9%] ) |
        www\d{0,3}[.] |
        [a-z0-9.\-]+[.][a-z]{2,4}/
      )
      (?:
        [^\s()<>]+ | \(([^\s()<>]+|(\([^\s()<>]+\)))*\)
      )+
      (?:
        \(([^\s()<>]+|(\([^\s()<>]+\)))*\) |
        [^\s`!()\[\]{};:'".,<>?«»〝〞‘‛]
      )
    )
  }ix.freeze

  # Detects a leading URI scheme (e.g. `http:`, `mailto:`) on a matched link.
  SCHEME_PREFIX_REGEX = /\A[a-z][\w-]+:/i.freeze

  # Convenience wrapper: instantiate and build in one call.
  #
  # @param work [#try, #[]] the work (or solr document) being described
  # @param version [#to_i] IIIF Presentation API version (2 or 3)
  # @param fields [Array<#name, #options>] field definitions to render
  # @param current_ability [Object] ability handed to Hyrax::CollectionMemberService
  # @param base_url [String] host prefix used when building absolute links
  # @return [Array<Hash>] the manifest metadata entries
  def self.build_metadata_for(work:, version:, fields:, current_ability:, base_url:)
    new(work: work,
        version: version,
        fields: fields,
        current_ability: current_ability,
        base_url: base_url).build_metadata
  end

  def initialize(work:, version:, fields:, current_ability:, base_url:)
    @work = work
    # Coerced to an Integer so that "2"/"3" work and so that only a number
    # can ever be interpolated into the dispatched method name.
    @version = version.to_i
    @fields = fields
    @current_ability = current_ability
    @base_url = base_url
  end

  attr_reader :work, :version, :fields

  # Dispatches to the builder matching +version+.
  #
  # @return [Array<Hash>]
  # @raise [NoMethodError] when no builder exists for the given version
  def build_metadata
    send("build_metadata_for_v#{version}")
  end

  private

  # IIIF Presentation 2 metadata: plain label, Array of (possibly HTML) values.
  # Fields without a value, and a collection field without viewable
  # collections, are omitted.
  def build_metadata_for_v2
    fields.map do |field|
      label = Hyrax::Renderers::AttributeRenderer.new(field.name, nil).label
      if field.name == :collection && member_of_collection?
        viewable_collections = Hyrax::CollectionMemberService.run(work, @current_ability)
        next if viewable_collections.empty?
        { 'label' => label,
          'value' => make_collection_link(viewable_collections) }
      else
        next if field_is_empty?(field)
        { 'label' => label,
          'value' => cast_to_value(field_name: field.name, options: field.options) }
      end
    end.compact
  end

  # IIIF Presentation 3 metadata: language-mapped label and sanitized values.
  def build_metadata_for_v3
    fields.map do |field|
      values = Array(work.try(field.name)).map { |value| scrub(value.to_s) }
      next if values.empty?
      {
        'label' => {
          # The label is translated through I18n, so it is keyed by the
          # locale that was active for that translation.
          I18n.locale.to_s => [Hyrax::Renderers::AttributeRenderer.new(field.name, nil).label]
        },
        'value' => {
          'none' => values
        }
      }
    end.compact
  end

  # @return [Boolean] true when the work has no value for the field
  def field_is_empty?(field)
    Array(work.try(field.name)).empty?
  end

  # @return [Boolean, nil] whether the work lists any collection ids
  def member_of_collection?
    work[:member_of_collection_ids_ssim]&.present?
  end

  # Sanitizes a value with Loofah's :whitewash scrubber.
  def scrub(value)
    Loofah.fragment(value).scrub!(:whitewash).to_s
  end

  # Renders the values of a field for a v2 manifest. Faceted fields become
  # links to a catalog search on that facet; everything else is auto-linked.
  #
  # @return [Array<String>]
  def cast_to_value(field_name:, options:)
    return make_link(values_for(field_name: field_name)) unless options&.[](:render_as) == :faceted

    search_field = "#{field_name}_sim"
    values_for(field_name: field_name).map do |value|
      path = Rails.application.routes.url_helpers.search_catalog_path(
        "f[#{search_field}][]": value, locale: I18n.locale
      )
      path += '&include_child_works=true' if work["is_child_bsi"] == true
      "<a href='#{File.join(@base_url, path)}'>#{value}</a>"
    end
  end

  # @return [Array] the work's values for the field, always as an Array
  def values_for(field_name:)
    Array(work.send(field_name))
  end

  # @return [Array<String>] one anchor per collection, pointing at its show page
  def make_collection_link(collection_documents)
    collection_documents.map do |collection|
      "<a href='#{File.join(@base_url, 'collections', collection.id)}'>#{collection.title.first}</a>"
    end
  end

  # Turns link looking substrings into anchors that open in a new tab.
  #
  # Matches without a URI scheme (`www.example.com`, `example.com/path`)
  # get an `http://` prefix in the href; otherwise a browser would resolve
  # them relative to the page showing the manifest. The visible link text
  # is left exactly as it appeared in the value.
  #
  # @param texts [Array<#to_s>]
  # @return [Array<String>]
  def make_link(texts)
    texts.map do |t|
      t.to_s.gsub(MAKE_LINK_REGEX) do |url|
        href = url.match?(SCHEME_PREFIX_REGEX) ? url : "http://#{url}"
        "<a href='#{href}' target='_blank'>#{url}</a>"
      end
    end
  end
end
124
+ # rubocop:enable Metrics/ClassLength
125
+ end
@@ -0,0 +1,42 @@
1
+ require 'open3'
2
+
3
+ module IiifPrint
4
# Derivative service producing a PDF derivative (target extension 'pdf')
# from a non-PDF source file by running a `convert` command.
class PDFDerivativeService < BaseDerivativeService
  self.target_extension = 'pdf'.freeze

  # Grayscale PDF (JPEG compressed, 8 bit), 150ppi
  GRAY_PDF_CMD = 'convert %<source_file>s ' \
                 '-resize 1800 -density 150 ' \
                 '-depth 8 -colorspace Gray ' \
                 '-compress jpeg %<out_file>s'.freeze

  # sRGB color PDF (JPEG compressed, 8 bits per channel), 150ppi
  COLOR_PDF_CMD = 'convert %<source_file>s ' \
                  '-resize 1800 -density 150 ' \
                  '-depth 8 ' \
                  '-compress jpeg %<out_file>s'.freeze

  def initialize(file_set)
    super
  end

  # Conversion command for the current source/destination paths; the
  # template is chosen by whether the material is color or grayscale.
  # NOTE(review): the paths are interpolated unquoted, so a path with
  # spaces or shell metacharacters yields a broken command; confirm how
  # the base class executes this string before changing it.
  #
  # @return [String]
  def convert_cmd
    template = if use_color?
                 COLOR_PDF_CMD
               else
                 GRAY_PDF_CMD
               end
    format(template, source_file: @source_path, out_file: @dest_path)
  end

  # Creates the PDF derivative for +filename+, unless the source is itself
  # a PDF. JP2 sources go through `jp2_convert`; everything else goes
  # through `im_convert`.
  def create_derivatives(filename)
    # The base class populates @source_path and @dest_path
    super(filename)

    source_type = mime_type
    # nothing to create when the primary file already is a PDF
    return if source_type == 'application/pdf'

    source_type == 'image/jp2' ? jp2_convert : im_convert
  end
end
42
+ end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Encapsulates methods used for pdf splitting into child works
4
+ module IiifPrint
5
+ module SplitPdfs
6
# Class-level helpers used when deciding whether, and how, to split
# uploaded PDFs into child works.
class ChildWorkCreationFromPdfService
  # File extension (lower case) identifying a PDF path.
  PDF_EXTENSION = '.pdf'

  # Resolve uploaded file ids to the local paths of the PDFs among them.
  #
  # @param files [Array<String, Integer>, String, nil] Hyrax::UploadedFile ids;
  #   blank entries are ignored
  # @return [Array<String>] paths of the uploads that look like PDFs
  def self.pdf_paths(files:)
    upload_ids = filter_file_ids(files)
    return [] if upload_ids.empty?
    uploads = Hyrax::UploadedFile.find(upload_ids)
    paths = uploads.map(&method(:upload_path))
    pdfs_only_for(paths)
  end

  # Is child work splitting configured for the work's model?
  #
  # @param work [Object] a work; splitting applies only when it responds to
  #   `iiif_print_config` and that config has a `pdf_split_child_model`
  # @return [Boolean]
  def self.iiif_print_split?(work:)
    work.try(:iiif_print_config)&.pdf_split_child_model ? true : false
  end

  # Are there any PDF files among the given paths?
  #
  # @param paths [Array<String>] file paths
  # @return [Boolean]
  def self.pdfs?(paths:)
    pdfs_only_for(paths).any?
  end

  # Enqueue the configured splitter job for the work.
  #
  # The job receives, in order: the work, the file locations, the user,
  # the admin set id, and the count of PDFs already on the work (see
  # {.count_existing_pdfs}).
  #
  # @param work [Object] a work whose `iiif_print_config` names a `pdf_splitter_job`
  # @param file_locations [Array<String>] paths to PDF attachments
  # @param user [User]
  # @param admin_set_id [String]
  def self.queue_job(work:, file_locations:, user:, admin_set_id:)
    work.iiif_print_config.pdf_splitter_job.perform_later(
      work,
      file_locations,
      user,
      admin_set_id,
      count_existing_pdfs(work)
    )
  end

  # @param input [Object] a single id, a list of ids, or nil
  # @return [Array] the non-blank ids
  def self.filter_file_ids(input)
    Array.wrap(input).select(&:present?)
  end

  # Given a Hyrax::UploadedFile, return the path to its file on the local
  # filesystem.
  def self.upload_path(upload)
    # so many layers to this onion:
    upload.file.file.file
  end

  # TODO: implement a method to count existing PDFs on a work to support
  # adding more PDFs to an existing work.
  #
  # @return [Integer] currently always 0
  def self.count_existing_pdfs(_work)
    0
  end

  # Select the paths that end in a `.pdf` extension, in any letter case
  # (`.pdf`, `.PDF`, `.Pdf`, ...).
  #
  # TODO: Consider other methods to identify a PDF file.
  # This sub-selection may need to be moved to use mimetype if there
  # is a need to support paths not ending in .pdf (i.e. remote_urls)
  #
  # @param paths [Array<#to_s>]
  # @return [Array] the matching entries, in their original order
  def self.pdfs_only_for(paths)
    paths.select { |path| path.to_s.downcase.end_with?(PDF_EXTENSION) }
  end
end
74
+ end
75
+ end
@@ -0,0 +1,130 @@
1
+ require 'open3'
2
+ require 'securerandom'
3
+ require 'tmpdir'
4
+ require 'iiif_print/split_pdfs/pdf_image_extraction_service'
5
+
6
+ module IiifPrint
7
+ module SplitPdfs
8
# Splits a PDF into one TIFF per page using Ghostscript, choosing the
# output device, compression and resolution from the images embedded in
# the PDF. Enumerating the service yields the path of each page image;
# the conversion runs lazily, once, on first enumeration.
class PagesIntoImagesService
  include Enumerable

  # @param path [String] path to the source PDF
  def initialize(path)
    @baseid = SecureRandom.uuid
    @pdfpath = path
    @info = nil
    @entries = nil
    @tmpdir = nil
    @size = nil
    @pagecount = nil
    @pdftext = nil
    @compression = 'lzw'
  end

  # @return [IiifPrint::SplitPdfs::PdfImageExtractionService] memoized
  #   image listing for the PDF
  def pdfinfo
    @info ||= IiifPrint::SplitPdfs::PdfImageExtractionService.new(@pdfpath)
  end

  # TODO: put this test somewhere to prevent invalid pdfs from crashing the image service.
  #
  # @return [Boolean] true when the image listing is missing any of the
  #   color, width or height information, or has no entries at all
  def invalid_pdf?
    pdfinfo.color.include?(nil) || pdfinfo.width.nil? || pdfinfo.height.nil? || pdfinfo.entries.length.zero?
  end

  # @return [String] memoized temporary directory receiving the page images.
  #   The directory is not removed by this class.
  def tmpdir
    @tmpdir ||= Dir.mktmpdir
  end

  # Ghostscript color TIFF device for the given channel count and bits per
  # channel: 24 or 48 bit color. Anything else (including CMYK sources and
  # unknown/nil values) is rendered as 24 bit RGB.
  #
  # @return [String] 'tiff24nc' or 'tiff48nc'
  def colordevice(channels, bpc)
    bits = bpc.to_i * channels.to_i
    bits = 24 unless [24, 48].include? bits
    "tiff#{bits}nc"
  end

  # Choose the Ghostscript output device from the PDF's image listing.
  # Selecting the 1-bit device also switches compression to CCITT Group 4.
  # When the listing has no bit depth (e.g. a PDF without embedded
  # images), the 24 bit color device is used.
  #
  # @return [String] Ghostscript device name
  def gsdevice
    color, channels, bpc = pdfinfo.color
    depth = bpc.to_i
    # CCITT Group 4 Black and White, if applicable:
    if color == 'gray' && depth == 1
      @compression = 'g4'
      return 'tiffg4'
    end
    # 8 Bit Grayscale, if applicable:
    return 'tiffgray' if color == 'gray' && depth > 1
    # otherwise color:
    colordevice(channels, bpc)
  end

  # TODO: this method came from newspaper gem but appears to be unused. Is it needed anywhere?
  # def gstext
  #   cmd = "gs -q -dNOPAUSE -dBATCH -sDEVICE=txtwrite " \
  #         "-sOutputFile=- -f #{@pdfpath}"
  #   Open3.popen3(cmd) do |_stdin, stdout, _stderr, _wait_thr|
  #     @pdftext = stdout.read
  #   end
  #   @pdftext
  # end

  # Number of pages reported by the `pdfinfo` command, memoized.
  #
  # The command is run with an argument vector (no shell), so paths with
  # spaces or shell metacharacters are safe, and stderr is drained so a
  # noisy failure cannot block the read of stdout.
  #
  # @return [Integer] page count; 0 when the output has no `Pages:` line
  def pagecount
    @pagecount ||= begin
      stdout, _stderr, _status = Open3.capture3('pdfinfo', @pdfpath.to_s)
      pages_line = stdout.each_line.find { |line| line.start_with?('Pages:') }
      pages_line ? pages_line.split.last.to_i : 0
    end
  end

  # Heuristic: a PDF with exactly one image per page whose largest image
  # dimensions exceed 10 megapixels is treated as scanned media.
  #
  # @return [Boolean] false when the PDF has no image dimensions at all
  def looks_scanned
    width = pdfinfo.width
    height = pdfinfo.height
    return false if width.nil? || height.nil?
    single_image_per_page = pdfinfo.entries.length == pagecount
    # single 10mp+ image per page?
    single_image_per_page && width * height > 1024 * 1024 * 10
  end

  # @return [Integer] rendering resolution: the detected image PPI for
  #   scanned media, otherwise 400
  def ppi
    return 400 unless looks_scanned
    pdfinfo.ppi
  end

  # Convert all pages to TIFF with Ghostscript.
  #
  # The number of pages is taken from the `Page N` lines Ghostscript
  # prints on stdout. The command is run without a shell and with stderr
  # drained (see #pagecount).
  #
  # @return [Array<String>] expected page image paths, in page order
  def gsconvert
    output_base = File.join(tmpdir, "#{@baseid}-page%d.tiff")
    # gsdevice may change @compression, so it must be evaluated first.
    device = gsdevice
    cmd = [
      'gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{device}",
      '-dTextAlphaBits=4', "-sCompression=#{@compression}",
      "-sOutputFile=#{output_base}", "-r#{ppi}", '-f', @pdfpath.to_s
    ]
    stdout, _stderr, _status = Open3.capture3(*cmd)
    @size = stdout.each_line.count { |line| line.start_with?('Page ') }
    (1..@size).map { |n| File.join(tmpdir, "#{@baseid}-page#{n}.tiff") }
  end

  # @return [Array<String>] page image paths; runs the conversion on first use
  def entries
    @entries ||= gsconvert
  end

  # Yields the path of each page image.
  #
  # @return [Enumerator] when called without a block
  def each(&block)
    return enum_for(:each) unless block_given?
    entries.each(&block)
  end
end
129
+ end
130
+ end
@@ -0,0 +1,85 @@
1
+ require 'open3'
2
+ require 'mini_magick'
3
+
4
+ module IiifPrint
5
+ module SplitPdfs
6
+ # Uses poppler 0.19+ pdfimages command to extract image
7
+ # listing metadata from PDF files.
8
+ # For dpi extraction, falls back to calculating using MiniMagick,
9
+ # if necessary.
10
# Reads the image listing of a PDF (`pdfimages -list`) and exposes
# aggregate facts about the embedded images: maximum dimensions, color
# information and resolution.
class PdfImageExtractionService
  # class constant column numbers
  COL_WIDTH = 3
  COL_HEIGHT = 4
  COL_COLOR = 5
  COL_CHANNELS = 6
  COL_BITS = 7
  # only poppler 0.25+ has this column in output:
  COL_XPPI = 12
  # number of leading lines of the listing (column titles and a rule)
  # that are skipped before the image rows
  HEADER_LINES = 2

  # @param path [String] path to the PDF
  def initialize(path)
    @path = path
    # Argument vector rather than a command string: the path is never
    # parsed by a shell, so spaces and metacharacters are safe.
    @cmd = ['pdfimages', '-list', path.to_s]
    @output = nil
    @entries = nil
  end

  # Runs the listing command (once) and returns its rows without the
  # header lines. stderr is drained so the command cannot block on it.
  #
  # @return [Array<String>] one line per image; empty when the command
  #   printed nothing beyond the header
  def process
    if @output.nil?
      stdout, _stderr, _status = Open3.capture3(*@cmd)
      @output = stdout.split("\n")
    end
    @output.drop(HEADER_LINES)
  end

  # @return [Array<Array<String>>] memoized rows, each split on whitespace
  def entries
    @entries ||= process.map(&:split)
  end

  # Values of column +i+ across all rows, optionally transformed by a block.
  #
  # @param i [Integer] zero-based column index
  # @return [Array]
  def selectcolumn(i, &block)
    result = entries.map { |e| e[i] }
    return result.map!(&block) if block_given?
    result
  end

  # @return [Integer, nil] largest image width in px; nil without images
  def width
    selectcolumn(COL_WIDTH, &:to_i).max
  end

  # @return [Integer, nil] largest image height in px; nil without images
  def height
    selectcolumn(COL_HEIGHT, &:to_i).max
  end

  # Color description across all images. Any image whose color column is
  # not 'gray' makes the description 'rgb'; channels and bits are the
  # maxima over all images (nil without images). 1-bit gray is
  # black/white, so callers may want all three values.
  #
  # @return [Array(String, Integer, Integer)] description, channels, bits
  def color
    desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
    channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
    bits = entries.map { |e| e[COL_BITS].to_i }.max
    [desc, channels, bits]
  end

  # Resolution of the embedded images.
  #
  # @return [Integer, nil] maximum horizontal PPI; nil without images
  def ppi
    return nil if entries.empty?
    if entries.first.size <= COL_XPPI
      # poppler < 0.25 has no ppi column: derive it from the largest image
      # width and the width MiniMagick reports for the PDF
      pdf = MiniMagick::Image.open(@path)
      width_points = pdf.width
      return (72 * width / width_points).to_i
    end
    # with poppler 0.25+, pdfimages just gives us this:
    selectcolumn(COL_XPPI, &:to_i).max
  end
end
84
+ end
85
+ end
@@ -0,0 +1,123 @@
1
+ require 'active_support/core_ext/module/delegation'
2
+ require 'json'
3
+ require 'nokogiri'
4
+
5
+ module IiifPrint
6
+ # Module for text extraction
7
+ module TextExtraction
8
+ # Class to obtain plain text and JSON word-coordinates from ALTO source
9
# Class to obtain plain text and JSON word-coordinates from ALTO source
class AltoReader
  attr_accessor :source, :doc_stream
  delegate :text, to: :doc_stream

  # SAX Document Stream class to gather text and word tokens from ALTO
  class AltoDocStream < Nokogiri::XML::SAX::Document
    attr_accessor :text, :words

    # @param image_width [Integer, nil] width of the page image in px; when
    #   given, coordinates are rescaled by page width / image width
    def initialize(image_width = nil)
      super()
      # scaling matters:
      @image_width = image_width
      @scaling = 1.0 # pt to px, if ALTO using points
      # plain text buffer (explicitly unfrozen, it is appended to):
      @text = +''
      # list of word hash, containing word+coord:
      @words = []
    end

    # Return coordinates from String element attribute hash
    #
    # @param attrs [Hash] hash containing ALTO `String` element attributes.
    # @return [Array] Array of position x, y, width, height in px.
    def s_coords(attrs)
      height = scale_value((attrs['HEIGHT'] || 0).to_i)
      width = scale_value((attrs['WIDTH'] || 0).to_i)
      hpos = scale_value((attrs['HPOS'] || 0).to_i)
      vpos = scale_value((attrs['VPOS'] || 0).to_i)
      [hpos, vpos, width, height]
    end

    # Derive the scaling factor from a `Page` element's WIDTH attribute.
    # Scaling is left untouched when no image width was given, when the
    # element has no WIDTH attribute, when that width is not positive
    # (dividing by a zero factor would fail in #scale_value), or when
    # page and image width already agree.
    #
    # @param attrs [Array] Array of key, value pair Arrays.
    def compute_scaling(attrs)
      return if @image_width.nil?
      match = attrs.find { |e| e[0].casecmp?('WIDTH') }
      return if match.nil?
      page_width = match[1].to_i
      return unless page_width.positive?
      return if @image_width == page_width
      @scaling = page_width / @image_width.to_f
    end

    # @param v [Numeric] a coordinate or dimension from the ALTO source
    # @return [Integer] the value divided by the scaling factor, truncated
    def scale_value(v)
      (v / @scaling).to_i
    end

    # Callback for element start, implementation of which ignores
    # non-String elements (apart from reading scaling off `Page`).
    # `String` elements lacking a CONTENT attribute are skipped.
    #
    # @param name [String] element name.
    # @param attrs [Array] Array of key, value pair Arrays.
    def start_element(name, attrs = [])
      compute_scaling(attrs) if name == 'Page'
      return if name != 'String'
      values = attrs.to_h
      token = values['CONTENT']
      return if token.nil?
      @text << token
      @words << {
        word: token,
        coordinates: s_coords(values)
      }
    end

    # Callback for element end, used here to manage endings of lines and
    # blocks.
    #
    # @param name [String] element name.
    def end_element(name)
      @text << " " if name == 'String'
      @text << "\n" if name == 'TextBlock'
      @text << "\n" if name == 'TextLine'
    end

    # Callback for completion of parsing ALTO, used to normalize generated
    # text content (strip unneeded whitespace incidental to output).
    def end_document
      # postprocess @text to remove trailing spaces on lines
      @text = @text.split("\n").map(&:strip).join("\n")
      # remove trailing whitespace at end of buffer
      @text.strip!
    end
  end

  # Construct with either a path to an ALTO file or ALTO XML source, and
  # parse the document immediately.
  #
  # @param xml [String] path to an XML file, or XML source
  # @param image_width [Integer, nil] page image width in px
  # @param image_height [Integer, nil] page image height in px
  def initialize(xml, image_width = nil, image_height = nil)
    @source = isxml?(xml) ? xml : File.read(xml)
    @image_width = image_width
    @image_height = image_height
    @doc_stream = AltoDocStream.new(image_width)
    parser = Nokogiri::XML::SAX::Parser.new(doc_stream)
    parser.parse(@source)
  end

  # Determine if source parameter is path or xml
  #
  # @param xml [String] either path to xml file or xml source
  # @return [true, false] true if string appears to be XML source, not path
  def isxml?(xml)
    xml.lstrip.start_with?('<')
  end

  # Output JSON flattened word coordinates
  #
  # @return [String] JSON serialization of flattened word coordinates
  def json
    words = @doc_stream.words
    IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
      words: words,
      width: @image_width,
      height: @image_height
    )
  end
end
122
+ end
123
+ end