iiif_print 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (211) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +2 -0
  3. data/.env +5 -0
  4. data/.fcrepo_wrapper +4 -0
  5. data/.github/release.yml +20 -0
  6. data/.github/workflows/branches.yml +24 -0
  7. data/.github/workflows/build-lint-test-action.yaml +33 -0
  8. data/.github/workflows/release_labels.yml +25 -0
  9. data/.gitignore +52 -0
  10. data/.rubocop.yml +177 -0
  11. data/.solr_wrapper +8 -0
  12. data/.travis.yml +49 -0
  13. data/CONTRIBUTING.md +181 -0
  14. data/Dockerfile +15 -0
  15. data/Gemfile +52 -0
  16. data/LICENSE +203 -0
  17. data/README.md +203 -0
  18. data/Rakefile +38 -0
  19. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +56 -0
  20. data/app/assets/config/iiif_print_manifest.js +2 -0
  21. data/app/assets/images/iiif_print/.keep +0 -0
  22. data/app/assets/javascripts/iiif_print/autocomplete_fix.js +33 -0
  23. data/app/assets/javascripts/iiif_print/ocr_search.js.erb +6 -0
  24. data/app/assets/javascripts/iiif_print.js +3 -0
  25. data/app/assets/stylesheets/iiif_print/_iiif_print.scss +4 -0
  26. data/app/assets/stylesheets/iiif_print/_issue_search.scss +13 -0
  27. data/app/assets/stylesheets/iiif_print/_issues_calendar.scss +18 -0
  28. data/app/assets/stylesheets/iiif_print/_newspapers_search.scss +38 -0
  29. data/app/assets/stylesheets/iiif_print/_search_results.scss +6 -0
  30. data/app/helpers/hyrax/iiif_helper.rb +22 -0
  31. data/app/helpers/iiif_print/application_helper.rb +5 -0
  32. data/app/helpers/iiif_print_helper.rb +64 -0
  33. data/app/indexers/concerns/iiif_print/child_indexer.rb +34 -0
  34. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +29 -0
  35. data/app/mailers/iiif_print/application_mailer.rb +8 -0
  36. data/app/models/concerns/iiif_print/set_child_flag.rb +29 -0
  37. data/app/models/concerns/iiif_print/solr/document.rb +47 -0
  38. data/app/models/iiif_print/application_record.rb +6 -0
  39. data/app/models/iiif_print/derivative_attachment.rb +8 -0
  40. data/app/models/iiif_print/iiif_search_response_decorator.rb +17 -0
  41. data/app/models/iiif_print/ingest_file_relation.rb +14 -0
  42. data/app/models/iiif_print/pending_relationship.rb +7 -0
  43. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +10 -0
  44. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +33 -0
  45. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +29 -0
  46. data/app/renderers/hyrax/renderers/faceted_attribute_renderer_decorator.rb +18 -0
  47. data/app/search_builders/concerns/iiif_print/exclude_models.rb +17 -0
  48. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +14 -0
  49. data/app/services/iiif_print/manifest_builder_service_behavior.rb +97 -0
  50. data/app/services/iiif_print/pluggable_derivative_service.rb +120 -0
  51. data/app/views/catalog/_snippets_more.html.erb +16 -0
  52. data/app/views/hyrax/base/_representative_media.html.erb +9 -0
  53. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +8 -0
  54. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  55. data/bin/rails +13 -0
  56. data/config/fcrepo_wrapper_test.yml +5 -0
  57. data/config/initializers/assets.rb +2 -0
  58. data/config/locales/iiif_print.de.yml +148 -0
  59. data/config/locales/iiif_print.en.yml +119 -0
  60. data/config/locales/iiif_print.es.yml +148 -0
  61. data/config/locales/iiif_print.fr.yml +149 -0
  62. data/config/locales/iiif_print.it.yml +142 -0
  63. data/config/locales/iiif_print.pt-BR.yml +148 -0
  64. data/config/locales/iiif_print.zh.yml +142 -0
  65. data/config/solr_wrapper_test.yml +9 -0
  66. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  67. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  68. data/config/test-fixture/solr-config/elevate.xml +36 -0
  69. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  70. data/config/test-fixture/solr-config/protwords.txt +21 -0
  71. data/config/test-fixture/solr-config/schema.xml +366 -0
  72. data/config/test-fixture/solr-config/scripts.conf +24 -0
  73. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  74. data/config/test-fixture/solr-config/spellings.txt +2 -0
  75. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  76. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  77. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  78. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  79. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  80. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  81. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  82. data/config/vendor/fits.xml +55 -0
  83. data/config/vendor/imagemagick-6-policy.xml +76 -0
  84. data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +12 -0
  85. data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +11 -0
  86. data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +11 -0
  87. data/docker-compose.yml +129 -0
  88. data/iiif_print.gemspec +43 -0
  89. data/lib/generators/iiif_print/assets_generator.rb +29 -0
  90. data/lib/generators/iiif_print/catalog_controller_generator.rb +32 -0
  91. data/lib/generators/iiif_print/install_generator.rb +52 -0
  92. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +22 -0
  93. data/lib/generators/iiif_print/templates/iiif_print.scss +1 -0
  94. data/lib/iiif_print/base_derivative_service.rb +113 -0
  95. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +84 -0
  96. data/lib/iiif_print/catalog_search_builder.rb +31 -0
  97. data/lib/iiif_print/configuration.rb +99 -0
  98. data/lib/iiif_print/data/fileset_helper.rb +25 -0
  99. data/lib/iiif_print/data/path_helper.rb +40 -0
  100. data/lib/iiif_print/data/work_derivatives.rb +323 -0
  101. data/lib/iiif_print/data/work_file.rb +92 -0
  102. data/lib/iiif_print/data/work_files.rb +199 -0
  103. data/lib/iiif_print/data.rb +35 -0
  104. data/lib/iiif_print/engine.rb +77 -0
  105. data/lib/iiif_print/errors.rb +9 -0
  106. data/lib/iiif_print/image_tool.rb +119 -0
  107. data/lib/iiif_print/jobs/application_job.rb +8 -0
  108. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +107 -0
  109. data/lib/iiif_print/jobs/create_relationships_job.rb +78 -0
  110. data/lib/iiif_print/jp2_derivative_service.rb +118 -0
  111. data/lib/iiif_print/jp2_image_metadata.rb +81 -0
  112. data/lib/iiif_print/lineage_service.rb +41 -0
  113. data/lib/iiif_print/metadata.rb +125 -0
  114. data/lib/iiif_print/pdf_derivative_service.rb +42 -0
  115. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +75 -0
  116. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +130 -0
  117. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +85 -0
  118. data/lib/iiif_print/text_extraction/alto_reader.rb +123 -0
  119. data/lib/iiif_print/text_extraction/hocr_reader.rb +172 -0
  120. data/lib/iiif_print/text_extraction/page_ocr.rb +87 -0
  121. data/lib/iiif_print/text_extraction/render_alto.rb +84 -0
  122. data/lib/iiif_print/text_extraction/word_coords_builder.rb +38 -0
  123. data/lib/iiif_print/text_extraction.rb +11 -0
  124. data/lib/iiif_print/text_extraction_derivative_service.rb +47 -0
  125. data/lib/iiif_print/text_formats_from_alto_service.rb +77 -0
  126. data/lib/iiif_print/tiff_derivative_service.rb +50 -0
  127. data/lib/iiif_print/version.rb +3 -0
  128. data/lib/iiif_print/works_controller_behavior.rb +9 -0
  129. data/lib/iiif_print.rb +136 -0
  130. data/lib/tasks/set_child_works.rake +22 -0
  131. data/spec/.keep.txt +1 -0
  132. data/spec/factories/ability.rb +6 -0
  133. data/spec/factories/newspaper_issue.rb +7 -0
  134. data/spec/factories/newspaper_page.rb +7 -0
  135. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  136. data/spec/factories/newspaper_title.rb +8 -0
  137. data/spec/factories/uploaded_pdf_file.rb +9 -0
  138. data/spec/factories/uploaded_txt_file.rb +9 -0
  139. data/spec/factories/user.rb +13 -0
  140. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  141. data/spec/fixtures/files/4.1.07.tiff +0 -0
  142. data/spec/fixtures/files/README.md +7 -0
  143. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  144. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  145. data/spec/fixtures/files/credits.md +16 -0
  146. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  147. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  148. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  149. data/spec/fixtures/files/minimal-alto.xml +31 -0
  150. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  151. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  152. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  153. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  154. data/spec/fixtures/files/ocr_alto.xml +202 -0
  155. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  156. data/spec/fixtures/files/ocr_color.tiff +0 -0
  157. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  158. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  159. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  160. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  161. data/spec/fixtures/files/page1.tiff +0 -0
  162. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  163. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  164. data/spec/fixtures/files/thumbnail.jpg +0 -0
  165. data/spec/helpers/hyrax/iiif_helper_spec.rb +65 -0
  166. data/spec/helpers/iiif_print_helper_spec.rb +43 -0
  167. data/spec/iiif_print/base_derivative_service_spec.rb +11 -0
  168. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +51 -0
  169. data/spec/iiif_print/catalog_search_builder_spec.rb +60 -0
  170. data/spec/iiif_print/configuration_spec.rb +67 -0
  171. data/spec/iiif_print/data/work_derivatives_spec.rb +245 -0
  172. data/spec/iiif_print/data/work_file_spec.rb +99 -0
  173. data/spec/iiif_print/data/work_files_spec.rb +237 -0
  174. data/spec/iiif_print/image_tool_spec.rb +109 -0
  175. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +30 -0
  176. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +17 -0
  177. data/spec/iiif_print/jp2_image_metadata_spec.rb +37 -0
  178. data/spec/iiif_print/lineage_service_spec.rb +13 -0
  179. data/spec/iiif_print/metadata_spec.rb +115 -0
  180. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +6 -0
  181. data/spec/iiif_print/text_extraction/alto_reader_spec.rb +49 -0
  182. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +45 -0
  183. data/spec/iiif_print/text_extraction/page_ocr_spec.rb +84 -0
  184. data/spec/iiif_print/text_extraction/render_alto_spec.rb +54 -0
  185. data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +44 -0
  186. data/spec/iiif_print_spec.rb +51 -0
  187. data/spec/misc_shared.rb +111 -0
  188. data/spec/models/iiif_print/derivative_attachment_spec.rb +37 -0
  189. data/spec/models/iiif_print/ingest_file_relation_spec.rb +56 -0
  190. data/spec/models/solr_document_spec.rb +14 -0
  191. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +19 -0
  192. data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +49 -0
  193. data/spec/services/iiif_print/jp2_derivative_service_spec.rb +59 -0
  194. data/spec/services/iiif_print/pdf_derivative_service_spec.rb +66 -0
  195. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +178 -0
  196. data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +82 -0
  197. data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +127 -0
  198. data/spec/services/iiif_print/tiff_derivative_service_spec.rb +65 -0
  199. data/spec/spec_helper.rb +181 -0
  200. data/spec/support/controller_level_helpers.rb +28 -0
  201. data/spec/support/iiif_print_models.rb +127 -0
  202. data/spec/test_app_templates/blacklight.yml +9 -0
  203. data/spec/test_app_templates/fedora.yml +15 -0
  204. data/spec/test_app_templates/lib/generators/test_app_generator.rb +40 -0
  205. data/spec/test_app_templates/redis.yml +9 -0
  206. data/spec/test_app_templates/solr/conf/schema.xml +362 -0
  207. data/spec/test_app_templates/solr/conf/solrconfig.xml +322 -0
  208. data/spec/test_app_templates/solr.yml +7 -0
  209. data/tasks/iiif_print_dev.rake +34 -0
  210. data/tmp/.keep +0 -0
  211. metadata +605 -0
@@ -0,0 +1,125 @@
1
module IiifPrint
  # Builds the list of label/value metadata entries for a work, in one of two
  # output shapes selected by +version+ (2 or 3; presumably the IIIF
  # Presentation API version of the manifest being built -- confirm with callers).
  #
  # rubocop:disable Metrics/ClassLength
  class Metadata
    # Pattern used by #make_link to find URL-looking substrings inside free text.
    MAKE_LINK_REGEX = %r{
      \b
      (
        (?: [a-z][\w-]+:
         (?: /{1,3} | [a-z0-9%] ) |
          www\d{0,3}[.] |
          [a-z0-9.\-]+[.][a-z]{2,4}/
        )
        (?:
         [^\s()<>]+ | \(([^\s()<>]+|(\([^\s()<>]+\)))*\)
        )+
        (?:
          \(([^\s()<>]+|(\([^\s()<>]+\)))*\) |
          [^\s`!()\[\]{};:'".,<>?«»〝〞‘‛]
        )
      )
    }ix.freeze

    # Convenience entry point: instantiate and build in one call.
    #
    # @param work [#try, #[]] the document/work whose fields are read
    # @param version [#to_i] selects build_metadata_for_v<version>
    # @param fields [Array<#name, #options>] field definitions to render
    # @param current_ability [Object] passed to Hyrax::CollectionMemberService
    # @param base_url [String] prefix used when building absolute links
    # @return [Array<Hash>] metadata entries
    def self.build_metadata_for(work:, version:, fields:, current_ability:, base_url:)
      new(work: work,
          version: version,
          fields: fields,
          current_ability: current_ability,
          base_url: base_url).build_metadata
    end

    # @param (see .build_metadata_for)
    def initialize(work:, version:, fields:, current_ability:, base_url:)
      @work = work
      @version = version.to_i
      @fields = fields
      @current_ability = current_ability
      @base_url = base_url
    end

    attr_reader :work, :version, :fields

    # Dispatches to the private builder matching +version+.
    #
    # @return [Array<Hash>] metadata entries
    # @raise [NoMethodError] when no builder exists for +version+
    def build_metadata
      send("build_metadata_for_v#{version}")
    end

    private

    # Version 2 shape: [{ 'label' => String, 'value' => Array<String> }, ...].
    # Fields with no values (or no viewable collections) are omitted.
    def build_metadata_for_v2
      fields.map do |field|
        label = Hyrax::Renderers::AttributeRenderer.new(field.name, nil).label
        if field.name == :collection && member_of_collection?
          viewable_collections = Hyrax::CollectionMemberService.run(work, @current_ability)
          next if viewable_collections.empty?
          { 'label' => label,
            'value' => make_collection_link(viewable_collections) }
        else
          next if field_is_empty?(field)
          { 'label' => label,
            'value' => cast_to_value(field_name: field.name, options: field.options) }
        end
      end.compact
    end

    # Version 3 shape: label keyed by the current I18n locale, values keyed by
    # 'none'. Values are passed through #scrub; empty fields are omitted.
    def build_metadata_for_v3
      fields.map do |field|
        values = Array(work.try(field.name)).map { |value| scrub(value.to_s) }
        next if values.empty?
        {
          'label' => {
            # Since we're using I18n to translate the field, we're setting the locale used in the translation.
            I18n.locale.to_s => [Hyrax::Renderers::AttributeRenderer.new(field.name, nil).label]
          },
          'value' => {
            'none' => values
          }
        }
      end.compact
    end

    # @return [Boolean] true when the work yields no values for the field
    def field_is_empty?(field)
      Array(work.try(field.name)).empty?
    end

    # @return [Boolean, nil] whether the work lists any collection ids
    def member_of_collection?
      work[:member_of_collection_ids_ssim]&.present?
    end

    # Runs the value through Loofah's :whitewash scrubber.
    def scrub(value)
      Loofah.fragment(value).scrub!(:whitewash).to_s
    end

    # Renders the values of a field either as catalog facet links (when the
    # field options request `render_as: :faceted`) or as text with
    # URL-looking substrings turned into links.
    #
    # NOTE(review): the link text of faceted values is interpolated as-is, the
    # same way non-link text is returned as-is by #make_link; confirm that
    # consumers sanitise these values before rendering.
    #
    # @return [Array<String>]
    def cast_to_value(field_name:, options:)
      return make_link(values_for(field_name: field_name)) unless options&.[](:render_as) == :faceted

      # Invariant for every value of the field, so computed once.
      search_field = field_name.to_s + "_sim"
      values_for(field_name: field_name).map do |value|
        path = Rails.application.routes.url_helpers.search_catalog_path(
          "f[#{search_field}][]": value, locale: I18n.locale
        )
        path += '&include_child_works=true' if work["is_child_bsi"] == true
        "<a href='#{File.join(@base_url, path)}'>#{value}</a>"
      end
    end

    # @return [Array] the field's values, always as an Array
    def values_for(field_name:)
      Array(work.send(field_name))
    end

    # @return [Array<String>] one anchor per collection, pointing at
    #   <base_url>/collections/<id> and labelled with the first title
    def make_collection_link(collection_documents)
      collection_documents.map do |collection|
        "<a href='#{File.join(@base_url, 'collections', collection.id)}'>#{collection.title.first}</a>"
      end
    end

    # @note This method turns link looking strings into links
    #
    # The href attribute is delimited by single quotes and MAKE_LINK_REGEX
    # allows a single quote inside a match, so quotes are entity-encoded in the
    # attribute value to keep the URL from terminating the attribute early.
    # The visible link text is left untouched.
    #
    # @param texts [Array<#to_s>]
    # @return [Array<String>]
    def make_link(texts)
      texts.map do |text|
        text.to_s.gsub(MAKE_LINK_REGEX) do |url|
          "<a href='#{url.gsub("'", '&#39;')}' target='_blank'>#{url}</a>"
        end
      end
    end
  end
  # rubocop:enable Metrics/ClassLength
end
@@ -0,0 +1,42 @@
1
require 'open3'
require 'shellwords'

module IiifPrint
  # Derivative service whose target extension is PDF. For non-PDF sources it
  # delegates the actual conversion to the base class helpers (`jp2_convert`
  # or `im_convert`), supplying the command line through #convert_cmd.
  class PDFDerivativeService < BaseDerivativeService
    self.target_extension = 'pdf'.freeze

    # PDF (JPEG, 8 bit grayscale), 150ppi
    GRAY_PDF_CMD = 'convert %<source_file>s ' \
                   '-resize 1800 -density 150 ' \
                   '-depth 8 -colorspace Gray ' \
                   '-compress jpeg %<out_file>s'.freeze

    # sRGB color PDF (JPEG, 8 bits per channel), 150ppi
    COLOR_PDF_CMD = 'convert %<source_file>s ' \
                    '-resize 1800 -density 150 ' \
                    '-depth 8 ' \
                    '-compress jpeg %<out_file>s'.freeze

    # Get conversion command; the template varies on whether we have color
    # or grayscale material.
    #
    # Both paths are shell-escaped so that file names containing spaces or
    # shell metacharacters are passed to `convert` as single arguments.
    # NOTE(review): assumes the base class runs this string through a shell
    # (backticks / Open3 with a single String) -- confirm in BaseDerivativeService.
    #
    # @return [String] command line
    def convert_cmd
      template = use_color? ? COLOR_PDF_CMD : GRAY_PDF_CMD
      format(template,
             source_file: Shellwords.escape(@source_path),
             out_file: Shellwords.escape(@dest_path))
    end

    # @param filename [String] path of the source file
    # @return [Object, nil] nil when the source is already a PDF; otherwise
    #   whatever the base-class conversion helper returns
    def create_derivatives(filename)
      # Base class takes care of loading @source_path, @dest_path
      super(filename)

      # no creation if pdf master
      return if mime_type == 'application/pdf'

      # Get and run conversion command
      return jp2_convert if mime_type == 'image/jp2'
      im_convert
    end
  end
end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
# Encapsulates methods used for pdf splitting into child works
module IiifPrint
  module SplitPdfs
    class ChildWorkCreationFromPdfService
      # Load an array of paths to pdf files
      # @param files [Array, Object] Hyrax::UploadedFile ids (blank entries are ignored)
      # @return [Array<String>] local file paths of the uploads that are PDFs
      def self.pdf_paths(files:)
        upload_ids = filter_file_ids(files)
        return [] if upload_ids.empty?

        uploads = Hyrax::UploadedFile.find(upload_ids)
        pdfs_only_for(uploads.map { |upload| upload_path(upload) })
      end

      # Is child work splitting defined for model?
      # @param work [Object] a work; splitting is enabled when it exposes an
      #   `iiif_print_config` with a truthy `pdf_split_child_model`
      # @return [Boolean]
      def self.iiif_print_split?(work:)
        # defined only if work has include IiifPrint.model_configuration with pdf_split_child_model
        work.try(:iiif_print_config)&.pdf_split_child_model ? true : false
      end

      # Are there any PDF files?
      # @param paths [Array<String>] candidate file paths
      # @return [Boolean]
      def self.pdfs?(paths:)
        pdfs_only_for(paths).any?
      end

      # Submit the job to split PDF into child works. The job class comes from
      # the work's `iiif_print_config.pdf_splitter_job`.
      # @param work [Object] the parent work
      # @param file_locations [Array<String>] paths to PDF attachments
      # @param user [Object] user on whose behalf the job runs
      # @param admin_set_id [Object] forwarded to the job unchanged
      # @note the job additionally receives the number of PDFs already on the
      #   work (see .count_existing_pdfs; not yet implemented, always 0)
      def self.queue_job(work:, file_locations:, user:, admin_set_id:)
        work.iiif_print_config.pdf_splitter_job.perform_later(
          work,
          file_locations,
          user,
          admin_set_id,
          count_existing_pdfs(work)
        )
      end

      # @return [Array] the input wrapped in an Array with blank entries removed
      def self.filter_file_ids(input)
        Array.wrap(input).select(&:present?)
      end

      # Given Hyrax::Upload object, return path to file on local filesystem
      def self.upload_path(upload)
        # so many layers to this onion:
        upload.file.file.file
      end

      # TODO: implement a method to count existing PDFs on a work to support
      # adding more PDFs to an existing work.
      def self.count_existing_pdfs(_work)
        0
      end

      # TODO: Consider other methods to identify a PDF file.
      # This sub-selection may need to be moved to use mimetype if there
      # is a need to support paths not ending in .pdf (i.e. remote_urls)
      #
      # The extension check is case-insensitive so that ".Pdf" and similar
      # spellings are recognised alongside ".pdf" and ".PDF".
      # @param paths [Array<String>]
      # @return [Array<String>] the paths ending in a pdf extension
      def self.pdfs_only_for(paths)
        paths.select { |path| path.downcase.end_with?('.pdf') }
      end
    end
  end
end
@@ -0,0 +1,130 @@
1
+ require 'open3'
2
+ require 'securerandom'
3
+ require 'tmpdir'
4
+ require 'iiif_print/split_pdfs/pdf_image_extraction_service'
5
+
6
module IiifPrint
  module SplitPdfs
    # Enumerable over the per-page TIFF files produced by rendering a PDF with
    # ghostscript. Conversion happens lazily on first access to #entries and
    # the files are written into a temporary directory created by Dir.mktmpdir.
    # NOTE(review): nothing here removes that directory; cleanup is presumably
    # the caller's responsibility -- confirm.
    class PagesIntoImagesService
      include Enumerable

      # @param path [String] path to the source PDF
      def initialize(path)
        @baseid = SecureRandom.uuid
        @pdfpath = path
        @info = nil
        @entries = nil
        @tmpdir = nil
        @size = nil
        @pagecount = nil
        @pdftext = nil
        @compression = 'lzw'
      end

      # @return [IiifPrint::SplitPdfs::PdfImageExtractionService] memoized
      #   image-listing metadata for the PDF
      def pdfinfo
        @info = IiifPrint::SplitPdfs::PdfImageExtractionService.new(@pdfpath) if @info.nil?
        @info
      end

      # TODO: put this test somewhere to prevent invalid pdfs from crashing the image service.
      # @return [Boolean] true when colour, size or image entries are missing
      def invalid_pdf?
        pdfinfo.color.include?(nil) ||
          pdfinfo.width.nil? ||
          pdfinfo.height.nil? ||
          pdfinfo.entries.empty?
      end

      # @return [String] memoized temporary directory holding the output
      def tmpdir
        @tmpdir = Dir.mktmpdir if @tmpdir.nil?
        @tmpdir
      end

      # Name of the ghostscript colour TIFF device for the given image depth.
      #
      # @param channels [Integer, nil]
      # @param bpc [Integer, nil] bits per channel
      # @return [String] "tiff24nc" or "tiff48nc"
      def colordevice(channels, bpc)
        # nil (no image information available) counts as zero bits and so
        # falls through to the 24-bit default below.
        bits = bpc.to_i * channels.to_i
        # will be either 8bpc/16bpc color TIFF,
        # with any CMYK source transformed to 8bpc RGB
        bits = 24 unless [24, 48].include? bits
        "tiff#{bits}nc"
      end

      # Chooses the ghostscript output device from the PDF's image metadata.
      # Side effect: switches @compression to 'g4' for 1-bit gray sources.
      #
      # @return [String] ghostscript device name
      def gsdevice
        color, channels, bpc = pdfinfo.color
        # CCITT Group 4 Black and White, if applicable:
        if color == 'gray' && bpc == 1
          @compression = 'g4'
          return 'tiffg4'
        end
        # 8 Bit Grayscale, if applicable (bpc is nil when the PDF lists no images):
        return 'tiffgray' if color == 'gray' && bpc.to_i > 1
        # otherwise color:
        colordevice(channels, bpc)
      end

      # TODO: this method came from newspaper gem but appears to be unused. Is it needed anywhere?
      # def gstext
      #   cmd = "gs -q -dNOPAUSE -dBATCH -sDEVICE=txtwrite " \
      #     "-sOutputFile=- -f #{@pdfpath}"
      #   Open3.popen3(cmd) do |_stdin, stdout, _stderr, _wait_thr|
      #     @pdftext = stdout.read
      #   end
      #   @pdftext
      # end

      # Page count as reported by the `pdfinfo` command line tool. The tool is
      # run once; the result is memoized.
      #
      # @return [Integer] number of pages, 0 when no "Pages:" line is reported
      def pagecount
        return @pagecount unless @pagecount.nil?

        # Argument-array form: no shell, so paths with spaces are safe; and
        # capture3 drains stderr so a chatty tool cannot block on a full pipe.
        stdout, _stderr, _status = Open3.capture3('pdfinfo', @pdfpath.to_s)
        pages_line = stdout.each_line.find { |line| line.start_with?('Pages:') }
        @pagecount = pages_line.to_s.split.last.to_i
      end

      # Heuristic: a single 10mp+ image per page looks like scanned media.
      #
      # @return [Boolean]
      def looks_scanned
        width = pdfinfo.width
        height = pdfinfo.height
        # No embedded images at all: cannot be a scan.
        return false if width.nil? || height.nil?
        return false unless width * height > 1024 * 1024 * 10

        # single image per page?
        pdfinfo.entries.length == pagecount
      end

      # @return [Integer] resolution handed to ghostscript
      def ppi
        # 400 dpi for something that does not look like scanned media:
        return 400 unless looks_scanned
        # For scanned media, defer to detected image PPI:
        pdfinfo.ppi
      end

      # ghostscript convert all pages to TIFF
      #
      # @return [Array<String>] expected output file names, one per "Page "
      #   line ghostscript printed on stdout
      def gsconvert
        output_base = File.join(tmpdir, "#{@baseid}-page%d.tiff")
        # gsdevice may change @compression, so it must run before it is read.
        device = gsdevice
        cmd = ['gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{device}",
               '-dTextAlphaBits=4', "-sCompression=#{@compression}",
               "-sOutputFile=#{output_base}", "-r#{ppi}", '-f', @pdfpath.to_s]
        stdout, _stderr, _status = Open3.capture3(*cmd)
        @size = stdout.each_line.count { |line| line.start_with?('Page ') }
        # Return an array of expected filenames
        (1..@size).map { |n| File.join(tmpdir, "#{@baseid}-page#{n}.tiff") }
      end

      # entries for each page
      # @return [Array<String>] memoized result of #gsconvert
      def entries
        @entries = gsconvert if @entries.nil?
        @entries
      end

      # Yields each page image path.
      # @return [Enumerator] when called without a block
      def each(&block)
        return enum_for(:each) unless block_given?

        entries.each(&block)
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ require 'open3'
2
+ require 'mini_magick'
3
+
4
module IiifPrint
  module SplitPdfs
    # Uses poppler 0.19+ pdfimages command to extract image
    # listing metadata from PDF files.
    # For dpi extraction, falls back to calculating using MiniMagick,
    # if necessary.
    class PdfImageExtractionService
      # class constant column numbers
      COL_WIDTH = 3
      COL_HEIGHT = 4
      COL_COLOR = 5
      COL_CHANNELS = 6
      COL_BITS = 7
      # only poppler 0.25+ has this column in output:
      COL_XPPI = 12

      # @param path [String] path to the PDF to inspect
      def initialize(path)
        @path = path
        # Argument array rather than a shell string, so that paths containing
        # spaces or shell metacharacters reach pdfimages as a single argument.
        @cmd = ['pdfimages', '-list', path.to_s]
        @output = nil
        @entries = nil
      end

      # Runs `pdfimages -list` (once; output is memoized) and returns the rows
      # after the first two lines, which the code treats as header lines.
      #
      # @return [Array<String>] data rows; empty when there are fewer than
      #   three lines of output
      def process
        # call just once
        if @output.nil?
          # capture3 drains stderr as well, so the child cannot block on it.
          stdout, _stderr, _status = Open3.capture3(*@cmd)
          @output = stdout.split("\n")
        end
        @output.drop(2)
      end

      # @return [Array<Array<String>>] memoized rows split into
      #   whitespace-separated columns; blank rows are skipped
      def entries
        @entries = process.map(&:split).reject(&:empty?) if @entries.nil?
        @entries
      end

      # @param i [Integer] column index
      # @yield [String, nil] optional transformation applied to every cell
      # @return [Array] the column, one element per entry
      def selectcolumn(i, &block)
        result = entries.map { |e| e[i] }
        return result.map!(&block) if block_given?
        result
      end

      # @return [Integer, nil] largest image width, nil without entries
      def width
        selectcolumn(COL_WIDTH, &:to_i).max
      end

      # @return [Integer, nil] largest image height, nil without entries
      def height
        selectcolumn(COL_HEIGHT, &:to_i).max
      end

      # @return [Array(String, Integer, Integer)] description, channel count
      #   and bits per channel; the two numbers are nil without entries
      def color
        # desc is either 'gray', 'cmyk', 'rgb', but 1-bit gray is black/white
        #   so caller may want all of this information, and in case of
        #   mixed color spaces across images, this returns maximum
        desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
        channels = entries.map { |e| e[COL_CHANNELS].to_i }.max
        bits = entries.map { |e| e[COL_BITS].to_i }.max
        [desc, channels, bits]
      end

      # @return [Integer, nil] pixels per inch; nil when the PDF lists no
      #   images (consistent with #width and #height)
      def ppi
        return nil if entries.empty?

        if entries.first.size <= COL_XPPI
          # poppler < 0.25
          pdf = MiniMagick::Image.open(@path)
          width_points = pdf.width
          width_px = width
          return (72 * width_px / width_points).to_i
        end
        # with poppler 0.25+, pdfimages just gives us this:
        selectcolumn(COL_XPPI, &:to_i).max
      end
    end
  end
end
@@ -0,0 +1,123 @@
1
+ require 'active_support/core_ext/module/delegation'
2
+ require 'json'
3
+ require 'nokogiri'
4
+
5
module IiifPrint
  # Module for text extraction
  module TextExtraction
    # Class to obtain plain text and JSON word-coordinates from ALTO source
    class AltoReader
      attr_accessor :source, :doc_stream
      delegate :text, to: :doc_stream

      # SAX Document Stream class to gather text and word tokens from ALTO
      class AltoDocStream < Nokogiri::XML::SAX::Document
        attr_accessor :text, :words

        # @param image_width [Integer, nil] pixel width of the page image;
        #   when nil, coordinates are never rescaled
        def initialize(image_width = nil)
          super()
          # scaling matters:
          @image_width = image_width
          @scaling = 1.0 # pt to px, if ALTO using points
          # plain text buffer (explicitly unfrozen; it is appended to in place):
          @text = +''
          # list of word hash, containing word+coord:
          @words = []
        end

        # Return coordinates from String element attribute hash
        #
        # @param attrs [Hash] hash containing ALTO `String` element attributes.
        # @return [Array] Array of position x, y, width, height in px.
        def s_coords(attrs)
          hpos, vpos, width, height = %w[HPOS VPOS WIDTH HEIGHT].map do |key|
            scale_value((attrs[key] || 0).to_i)
          end
          [hpos, vpos, width, height]
        end

        # Derives the ALTO-unit to pixel scaling factor from the `Page`
        # element's width attribute and the known image width. Leaves the
        # factor untouched when either width is unknown, when the page width
        # is zero (which would make #scale_value divide by zero), or when
        # both widths already agree.
        #
        # @param attrs [Array] Array of key, value pair Arrays.
        def compute_scaling(attrs)
          return if @image_width.nil?
          # find returns nil (not an empty Array) when nothing matches
          match = attrs.find { |name, _value| name.casecmp?('WIDTH') }
          return if match.nil?
          page_width = match[1].to_i
          return if page_width.zero? || @image_width == page_width
          @scaling = page_width / @image_width.to_f
        end

        # @param v [Numeric] value in ALTO units
        # @return [Integer] value divided by the scaling factor, truncated
        def scale_value(v)
          (v / @scaling).to_i
        end

        # Callback for element start, implementation of which ignores
        #   non-String elements.
        #
        # @param name [String] element name.
        # @param attrs [Array] Array of key, value pair Arrays.
        def start_element(name, attrs = [])
          compute_scaling(attrs) if name == 'Page'
          return if name != 'String'
          values = attrs.to_h
          token = values['CONTENT']
          # A String element without CONTENT carries no word to record.
          return if token.nil?
          @text << token
          @words << {
            word: token,
            coordinates: s_coords(values)
          }
        end

        # Callback for element end, used here to manage endings of lines and
        #   blocks.
        #
        # @param name [String] element name.
        def end_element(name)
          case name
          when 'String' then @text << " "
          when 'TextBlock', 'TextLine' then @text << "\n"
          end
        end

        # Callback for completion of parsing ALTO, used to normalize generated
        #   text content (strip unneeded whitespace incidental to output).
        def end_document
          # postprocess @text to remove trailing spaces on lines
          @text = @text.split("\n").map(&:strip).join("\n")
          # remove trailing whitespace at end of buffer
          @text.strip!
        end
      end

      # Construct with either a path to an ALTO file or the ALTO XML source
      # itself, and process the document immediately.
      #
      # @param xml [String] path to xml file, or xml source
      # @param image_width [Integer, nil] pixel width of the page image
      # @param image_height [Integer, nil] pixel height of the page image
      def initialize(xml, image_width = nil, image_height = nil)
        @source = isxml?(xml) ? xml : File.read(xml)
        @image_width = image_width
        @image_height = image_height
        @doc_stream = AltoDocStream.new(image_width)
        parser = Nokogiri::XML::SAX::Parser.new(doc_stream)
        parser.parse(@source)
      end

      # Determine if source parameter is path or xml
      #
      # @param xml [String] either path to xml file or xml source
      # @return [true, false] true if string appears to be XML source, not path
      def isxml?(xml)
        xml.lstrip.start_with?('<')
      end

      # Output JSON flattened word coordinates
      #
      # @return [String] JSON serialization of flattened word coordinates
      def json
        IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
          words: @doc_stream.words,
          width: @image_width,
          height: @image_height
        )
      end
    end
  end
end