iiif_print 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +2 -0
  3. data/.env +5 -0
  4. data/.fcrepo_wrapper +4 -0
  5. data/.github/release.yml +20 -0
  6. data/.github/workflows/branches.yml +24 -0
  7. data/.github/workflows/build-lint-test-action.yaml +33 -0
  8. data/.github/workflows/release_labels.yml +25 -0
  9. data/.gitignore +52 -0
  10. data/.rubocop.yml +177 -0
  11. data/.solr_wrapper +8 -0
  12. data/.travis.yml +49 -0
  13. data/CONTRIBUTING.md +181 -0
  14. data/Dockerfile +15 -0
  15. data/Gemfile +52 -0
  16. data/LICENSE +203 -0
  17. data/README.md +203 -0
  18. data/Rakefile +38 -0
  19. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +56 -0
  20. data/app/assets/config/iiif_print_manifest.js +2 -0
  21. data/app/assets/images/iiif_print/.keep +0 -0
  22. data/app/assets/javascripts/iiif_print/autocomplete_fix.js +33 -0
  23. data/app/assets/javascripts/iiif_print/ocr_search.js.erb +6 -0
  24. data/app/assets/javascripts/iiif_print.js +3 -0
  25. data/app/assets/stylesheets/iiif_print/_iiif_print.scss +4 -0
  26. data/app/assets/stylesheets/iiif_print/_issue_search.scss +13 -0
  27. data/app/assets/stylesheets/iiif_print/_issues_calendar.scss +18 -0
  28. data/app/assets/stylesheets/iiif_print/_newspapers_search.scss +38 -0
  29. data/app/assets/stylesheets/iiif_print/_search_results.scss +6 -0
  30. data/app/helpers/hyrax/iiif_helper.rb +22 -0
  31. data/app/helpers/iiif_print/application_helper.rb +5 -0
  32. data/app/helpers/iiif_print_helper.rb +64 -0
  33. data/app/indexers/concerns/iiif_print/child_indexer.rb +34 -0
  34. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +29 -0
  35. data/app/mailers/iiif_print/application_mailer.rb +8 -0
  36. data/app/models/concerns/iiif_print/set_child_flag.rb +29 -0
  37. data/app/models/concerns/iiif_print/solr/document.rb +47 -0
  38. data/app/models/iiif_print/application_record.rb +6 -0
  39. data/app/models/iiif_print/derivative_attachment.rb +8 -0
  40. data/app/models/iiif_print/iiif_search_response_decorator.rb +17 -0
  41. data/app/models/iiif_print/ingest_file_relation.rb +14 -0
  42. data/app/models/iiif_print/pending_relationship.rb +7 -0
  43. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +10 -0
  44. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +33 -0
  45. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +29 -0
  46. data/app/renderers/hyrax/renderers/faceted_attribute_renderer_decorator.rb +18 -0
  47. data/app/search_builders/concerns/iiif_print/exclude_models.rb +17 -0
  48. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +14 -0
  49. data/app/services/iiif_print/manifest_builder_service_behavior.rb +97 -0
  50. data/app/services/iiif_print/pluggable_derivative_service.rb +120 -0
  51. data/app/views/catalog/_snippets_more.html.erb +16 -0
  52. data/app/views/hyrax/base/_representative_media.html.erb +9 -0
  53. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +8 -0
  54. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  55. data/bin/rails +13 -0
  56. data/config/fcrepo_wrapper_test.yml +5 -0
  57. data/config/initializers/assets.rb +2 -0
  58. data/config/locales/iiif_print.de.yml +148 -0
  59. data/config/locales/iiif_print.en.yml +119 -0
  60. data/config/locales/iiif_print.es.yml +148 -0
  61. data/config/locales/iiif_print.fr.yml +149 -0
  62. data/config/locales/iiif_print.it.yml +142 -0
  63. data/config/locales/iiif_print.pt-BR.yml +148 -0
  64. data/config/locales/iiif_print.zh.yml +142 -0
  65. data/config/solr_wrapper_test.yml +9 -0
  66. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  67. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  68. data/config/test-fixture/solr-config/elevate.xml +36 -0
  69. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  70. data/config/test-fixture/solr-config/protwords.txt +21 -0
  71. data/config/test-fixture/solr-config/schema.xml +366 -0
  72. data/config/test-fixture/solr-config/scripts.conf +24 -0
  73. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  74. data/config/test-fixture/solr-config/spellings.txt +2 -0
  75. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  76. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  77. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  78. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  79. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  80. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  81. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  82. data/config/vendor/fits.xml +55 -0
  83. data/config/vendor/imagemagick-6-policy.xml +76 -0
  84. data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +12 -0
  85. data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +11 -0
  86. data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +11 -0
  87. data/docker-compose.yml +129 -0
  88. data/iiif_print.gemspec +43 -0
  89. data/lib/generators/iiif_print/assets_generator.rb +29 -0
  90. data/lib/generators/iiif_print/catalog_controller_generator.rb +32 -0
  91. data/lib/generators/iiif_print/install_generator.rb +52 -0
  92. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +22 -0
  93. data/lib/generators/iiif_print/templates/iiif_print.scss +1 -0
  94. data/lib/iiif_print/base_derivative_service.rb +113 -0
  95. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +84 -0
  96. data/lib/iiif_print/catalog_search_builder.rb +31 -0
  97. data/lib/iiif_print/configuration.rb +99 -0
  98. data/lib/iiif_print/data/fileset_helper.rb +25 -0
  99. data/lib/iiif_print/data/path_helper.rb +40 -0
  100. data/lib/iiif_print/data/work_derivatives.rb +323 -0
  101. data/lib/iiif_print/data/work_file.rb +92 -0
  102. data/lib/iiif_print/data/work_files.rb +199 -0
  103. data/lib/iiif_print/data.rb +35 -0
  104. data/lib/iiif_print/engine.rb +77 -0
  105. data/lib/iiif_print/errors.rb +9 -0
  106. data/lib/iiif_print/image_tool.rb +119 -0
  107. data/lib/iiif_print/jobs/application_job.rb +8 -0
  108. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +107 -0
  109. data/lib/iiif_print/jobs/create_relationships_job.rb +78 -0
  110. data/lib/iiif_print/jp2_derivative_service.rb +118 -0
  111. data/lib/iiif_print/jp2_image_metadata.rb +81 -0
  112. data/lib/iiif_print/lineage_service.rb +41 -0
  113. data/lib/iiif_print/metadata.rb +125 -0
  114. data/lib/iiif_print/pdf_derivative_service.rb +42 -0
  115. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +75 -0
  116. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +130 -0
  117. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +85 -0
  118. data/lib/iiif_print/text_extraction/alto_reader.rb +123 -0
  119. data/lib/iiif_print/text_extraction/hocr_reader.rb +172 -0
  120. data/lib/iiif_print/text_extraction/page_ocr.rb +87 -0
  121. data/lib/iiif_print/text_extraction/render_alto.rb +84 -0
  122. data/lib/iiif_print/text_extraction/word_coords_builder.rb +38 -0
  123. data/lib/iiif_print/text_extraction.rb +11 -0
  124. data/lib/iiif_print/text_extraction_derivative_service.rb +47 -0
  125. data/lib/iiif_print/text_formats_from_alto_service.rb +77 -0
  126. data/lib/iiif_print/tiff_derivative_service.rb +50 -0
  127. data/lib/iiif_print/version.rb +3 -0
  128. data/lib/iiif_print/works_controller_behavior.rb +9 -0
  129. data/lib/iiif_print.rb +136 -0
  130. data/lib/tasks/set_child_works.rake +22 -0
  131. data/spec/.keep.txt +1 -0
  132. data/spec/factories/ability.rb +6 -0
  133. data/spec/factories/newspaper_issue.rb +7 -0
  134. data/spec/factories/newspaper_page.rb +7 -0
  135. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  136. data/spec/factories/newspaper_title.rb +8 -0
  137. data/spec/factories/uploaded_pdf_file.rb +9 -0
  138. data/spec/factories/uploaded_txt_file.rb +9 -0
  139. data/spec/factories/user.rb +13 -0
  140. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  141. data/spec/fixtures/files/4.1.07.tiff +0 -0
  142. data/spec/fixtures/files/README.md +7 -0
  143. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  144. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  145. data/spec/fixtures/files/credits.md +16 -0
  146. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  147. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  148. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  149. data/spec/fixtures/files/minimal-alto.xml +31 -0
  150. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  151. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  152. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  153. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  154. data/spec/fixtures/files/ocr_alto.xml +202 -0
  155. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  156. data/spec/fixtures/files/ocr_color.tiff +0 -0
  157. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  158. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  159. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  160. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  161. data/spec/fixtures/files/page1.tiff +0 -0
  162. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  163. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  164. data/spec/fixtures/files/thumbnail.jpg +0 -0
  165. data/spec/helpers/hyrax/iiif_helper_spec.rb +65 -0
  166. data/spec/helpers/iiif_print_helper_spec.rb +43 -0
  167. data/spec/iiif_print/base_derivative_service_spec.rb +11 -0
  168. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +51 -0
  169. data/spec/iiif_print/catalog_search_builder_spec.rb +60 -0
  170. data/spec/iiif_print/configuration_spec.rb +67 -0
  171. data/spec/iiif_print/data/work_derivatives_spec.rb +245 -0
  172. data/spec/iiif_print/data/work_file_spec.rb +99 -0
  173. data/spec/iiif_print/data/work_files_spec.rb +237 -0
  174. data/spec/iiif_print/image_tool_spec.rb +109 -0
  175. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +30 -0
  176. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +17 -0
  177. data/spec/iiif_print/jp2_image_metadata_spec.rb +37 -0
  178. data/spec/iiif_print/lineage_service_spec.rb +13 -0
  179. data/spec/iiif_print/metadata_spec.rb +115 -0
  180. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +6 -0
  181. data/spec/iiif_print/text_extraction/alto_reader_spec.rb +49 -0
  182. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +45 -0
  183. data/spec/iiif_print/text_extraction/page_ocr_spec.rb +84 -0
  184. data/spec/iiif_print/text_extraction/render_alto_spec.rb +54 -0
  185. data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +44 -0
  186. data/spec/iiif_print_spec.rb +51 -0
  187. data/spec/misc_shared.rb +111 -0
  188. data/spec/models/iiif_print/derivative_attachment_spec.rb +37 -0
  189. data/spec/models/iiif_print/ingest_file_relation_spec.rb +56 -0
  190. data/spec/models/solr_document_spec.rb +14 -0
  191. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +19 -0
  192. data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +49 -0
  193. data/spec/services/iiif_print/jp2_derivative_service_spec.rb +59 -0
  194. data/spec/services/iiif_print/pdf_derivative_service_spec.rb +66 -0
  195. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +178 -0
  196. data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +82 -0
  197. data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +127 -0
  198. data/spec/services/iiif_print/tiff_derivative_service_spec.rb +65 -0
  199. data/spec/spec_helper.rb +181 -0
  200. data/spec/support/controller_level_helpers.rb +28 -0
  201. data/spec/support/iiif_print_models.rb +127 -0
  202. data/spec/test_app_templates/blacklight.yml +9 -0
  203. data/spec/test_app_templates/fedora.yml +15 -0
  204. data/spec/test_app_templates/lib/generators/test_app_generator.rb +40 -0
  205. data/spec/test_app_templates/redis.yml +9 -0
  206. data/spec/test_app_templates/solr/conf/schema.xml +362 -0
  207. data/spec/test_app_templates/solr/conf/solrconfig.xml +322 -0
  208. data/spec/test_app_templates/solr.yml +7 -0
  209. data/tasks/iiif_print_dev.rake +34 -0
  210. data/tmp/.keep +0 -0
  211. metadata +605 -0
@@ -0,0 +1,172 @@
1
+ require 'active_support/core_ext/module/delegation'
2
+ require 'json'
3
+ require 'nokogiri'
4
+
5
+ module IiifPrint
6
+ # Module for text extraction
7
+ module TextExtraction
8
# Class to obtain plain text and JSON word-coordinates from hOCR source
# - Coordinates in px units, unlike ALTO, which may have scaling concerns
class HOCRReader
  attr_accessor :source, :doc_stream
  delegate :text, :width, :height, :words, to: :doc_stream

  # SAX Document Stream class to gather text and word tokens from hOCR
  class HOCRDocStream < Nokogiri::XML::SAX::Document
    attr_accessor :text, :words, :width, :height

    def initialize
      super()
      # plain text buffer:
      @text = ''
      # list of word hash, containing word+coord:
      @words = []
      # page width and height to be found in hOCR for `div.ocr_page`
      @width = nil
      @height = nil
      # to hold current word data state across #start_element, #characters,
      # and #end_element methods (to associate word with coordinates).
      @current = nil
      # true only between the start and the end of a `span.ocrx_word`, so
      # that text seen after a word is closed is not appended to that word.
      @in_word = false
      # class name of the element most recently started or closed
      @element_class_name = nil
      # stack of [element name, class name] pairs for currently open
      # elements; lets #end_element know the class of the element that is
      # actually closing, even when elements are nested.
      @open_elements = []
    end

    # Return coordinates from `span.ocrx_word` element attribute hash
    #
    # @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
    # @return [Array] Array of position x, y, width, height in px; empty
    #   when the title attribute carries no complete `bbox` property.
    def s_coords(attrs)
      x1, y1, x2, y2 = bbox_values(attrs['title'])
      return [] if y2.nil?
      [x1, y1, x2 - x1, y2 - y1]
    end

    # Consider element for processing?
    #   - `div.ocr_page` — to get page width/height
    #   - `span.ocr_line` — to help make plain text readable
    #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
    # @param name [String] Element name
    # @param class_name [String] HTML class name
    # @return [Boolean] true if element should be processed; otherwise false
    def consider?(name, class_name)
      selector = "#{name}.#{class_name}"
      ['div.ocr_page', 'span.ocr_line', 'span.ocrx_word'].include?(selector)
    end

    # Begin a new word: reset the current-word state and record its box.
    #
    # @param attrs [Hash] attributes of the `span.ocrx_word` element
    def start_word(attrs)
      @current = {}
      # will be replaced during #characters method call:
      @current[:word] = nil
      @current[:coordinates] = s_coords(attrs)
      @in_word = true
    end

    # Record page width and height from the `bbox` property of the
    # `div.ocr_page` title; both stay nil when no bbox is present.
    #
    # @param attrs [Hash] attributes of the `div.ocr_page` element
    def start_page(attrs)
      bbox = bbox_values(attrs['title'])
      # width and height:
      @width = bbox[2]
      @height = bbox[3]
    end

    # @return [Boolean] true when the current word has non-empty text and
    #   a four-value coordinate array
    def word_complete?
      return false if @current.nil?
      coords = @current[:coordinates]
      @current[:word] && !@current[:word].empty? && coords.size == 4
    end

    # Finish the current word: store it (when complete) and stop
    # accumulating characters into it.
    def end_word
      # add trailing space to plaintext buffer for between words:
      @text += ' '
      @words.push(@current) if word_complete?
      @in_word = false
    end

    def end_line
      # strip trailing whitespace
      @text.strip!
      # then insert a line break
      @text += "\n"
    end

    # Callback for element start, ignores elements except for:
    #   - `div.ocr_page` — to get page width/height
    #   - `span.ocr_line` — to help make plain text readable
    #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
    #
    # @param name [String] element name.
    # @param attrs [Array] Array of key, value pair Arrays.
    def start_element(name, attrs = [])
      attributes = attrs.to_h
      @element_class_name = attributes['class']
      # remember every open element so #end_element can pair up correctly
      @open_elements.push([name, @element_class_name])
      return unless consider?(name, @element_class_name)
      start_word(attributes) if @element_class_name == 'ocrx_word'
      start_page(attributes) if @element_class_name == 'ocr_page'
    end

    # Callback for text content. Text is added to the plain-text buffer
    # once a first word has been seen; it is added to the current word
    # only while that word's element is still open.
    #
    # @param value [String] character data
    def characters(value)
      return if @current.nil?
      return if @current[:coordinates].nil?
      if @in_word
        @current[:word] ||= ''
        @current[:word] += value
      end
      @text += value
    end

    # Callback for element end; at this time, flush word coordinate state
    # for current word, and append line endings to plain text. The class
    # name used is that of the element being closed (taken from the stack
    # of open elements), not of whichever element was started last.
    #
    # @param name [String] element name.
    def end_element(name)
      @element_class_name = close_element(name)
      return unless consider?(name, @element_class_name)
      end_line if @element_class_name == 'ocr_line'
      end_word if @element_class_name == 'ocrx_word'
    end

    # Callback for completion of parsing hOCR, used to normalize generated
    # text content (strip unneeded whitespace incidental to output).
    def end_document
      # postprocess @text to remove trailing spaces on lines
      @text = @text.split("\n").map(&:strip).join("\n")
      # remove excess line break
      @text.gsub!(/\n+/, "\n")
      # String#delete is non-mutating, so the result must be assigned
      @text = @text.delete("\r")
      # remove trailing whitespace at end of buffer
      @text.strip!
    end

    private

    # Extract the integers of the `bbox` property from an hOCR title
    # attribute, wherever that property appears among the `;`-separated
    # fields.
    #
    # @param title [String, nil] value of a `title` attribute
    # @return [Array<Integer>] bbox numbers (x1, y1, x2, y2), or [] when
    #   the title is nil or has no bbox field
    def bbox_values(title)
      return [] if title.nil?
      field = title.split(';').map(&:strip).find { |f| f.start_with?('bbox ') }
      return [] if field.nil?
      field.split(' ').drop(1).map(&:to_i)
    end

    # Pop the innermost open element with the given name (and anything
    # opened after it) off the stack.
    #
    # @param name [String] name of the element being closed
    # @return [String, nil] class name recorded when that element started
    def close_element(name)
      index = @open_elements.rindex { |open_name, _class_name| open_name == name }
      return nil if index.nil?
      _open_name, class_name = @open_elements.slice!(index..-1).first
      class_name
    end
  end

  # Construct with either path or HTML [String]
  #
  # @param html [String] hOCR markup, or a path to a file containing it;
  #   the document is parsed immediately.
  def initialize(html)
    @source = isxml?(html) ? html : File.read(html)
    @doc_stream = HOCRDocStream.new
    parser = Nokogiri::HTML::SAX::Parser.new(doc_stream)
    parser.parse(@source)
  end

  # Determine if source parameter is path or xml/html
  #
  # @param xml [String] either path to xml file or xml source
  # @return [true, false] true if value appears to be XML/HTML, not path
  def isxml?(xml)
    xml.lstrip.start_with?('<')
  end

  # Output JSON flattened word coordinates
  #
  # @return [String] JSON serialization of flattened word coordinates
  def json
    words = @doc_stream.words
    IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
      words: words,
      width: @doc_stream.width,
      height: @doc_stream.height
    )
  end
end
171
+ end
172
+ end
@@ -0,0 +1,87 @@
1
require 'json'
require 'open3'
require 'shellwords'
require 'tmpdir'
4
+
5
+ # --
6
+ module IiifPrint
7
+ # Module for text extraction (OCR or otherwise)
8
+ module TextExtraction
9
# Runs the `tesseract` command over one page image and exposes the result
# as word/coordinate data, plain text, JSON and ALTO XML.
class PageOCR
  attr_accessor :html, :path

  # @param path [String] path to the source page image
  # @param additional_tessearct_options [String, nil] extra options that
  #   are appended verbatim to the tesseract command line (the spelling
  #   of this keyword is part of the public interface and is kept as-is)
  def initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options)
    @path = path
    # hOCR html:
    @html = nil
    @words = nil
    @source_meta = nil
    @box = nil
    @plain = nil
    @additional_tessearct_options = additional_tessearct_options
  end

  # Run tesseract in hOCR mode, writing into a fresh temporary directory.
  # NOTE(review): the temporary directory is never removed here.
  #
  # @return [String] path at which the `.hocr` output is expected
  def run_ocr
    outfile = File.join(Dir.mktmpdir, 'output_html')
    `#{tesseract_command(outfile)}`
    outfile + '.hocr'
  end

  # Preprocess the image, run OCR and cache the words and plain text
  # read back from the hOCR output.
  def load_words
    preprocess_image
    html_path = run_ocr
    reader = IiifPrint::TextExtraction::HOCRReader.new(html_path)
    @words = reader.words
    @plain = reader.text
  end

  # @return [Array<Hash>] word hashes as produced by HOCRReader#words;
  #   OCR is run on first access.
  def words
    load_words if @words.nil?
    @words
  end

  # @return [String] JSON word coordinates built by WordCoordsBuilder
  def word_json
    IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
      words: words,
      width: width,
      height: height
    )
  end

  # @return [String] plain text of the page; OCR is run on first access.
  def plain
    load_words if @plain.nil?
    @plain
  end

  # @return [Hash] image metadata from IiifPrint::ImageTool, memoized
  def identify
    return @source_meta unless @source_meta.nil?
    @source_meta = IiifPrint::ImageTool.new(@path).metadata
  end

  def width
    identify[:width]
  end

  def height
    identify[:height]
  end

  # @return [String] ALTO XML rendered from the OCR words
  def alto
    writer = IiifPrint::TextExtraction::RenderAlto.new(width, height)
    writer.to_alto(words)
  end

  private

  # Build the shell command line for tesseract. The image path and the
  # output base path are shell-escaped so that spaces or shell
  # metacharacters in a path cannot break or alter the command; the
  # additional options are configuration and are appended unescaped.
  #
  # @param outfile [String] output base path (tesseract adds `.hocr`)
  # @return [String] command line to execute
  def tesseract_command(outfile)
    cmd = "tesseract #{Shellwords.escape(path.to_s)} #{Shellwords.escape(outfile)} hocr"
    # stdlib-only blank check (same result as `present?` for String/nil)
    has_options = !@additional_tessearct_options.to_s.strip.empty?
    cmd += " #{@additional_tessearct_options}" if has_options
    cmd
  end

  # transform the image into a one-bit TIFF for OCR
  def preprocess_image
    tool = IiifPrint::ImageTool.new(@path)
    return if tool.metadata[:color] == 'monochrome'
    intermediate_path = File.join(Dir.mktmpdir, 'monochrome-interim.tif')
    # NOTE(review): second argument presumably requests monochrome output
    # — confirm against IiifPrint::ImageTool#convert.
    tool.convert(intermediate_path, true)
    @path = intermediate_path
  end
end
86
+ end
87
+ end
@@ -0,0 +1,84 @@
1
+ require 'nokogiri'
2
+
3
+ module IiifPrint
4
+ # Module for text extraction (OCR or otherwise)
5
+ module TextExtraction
6
# Writes a minimal ALTO XML page (one text block holding one text line)
# from a list of word/coordinate hashes.
class RenderAlto
  # @param width [Numeric] page width written to the ALTO layout
  # @param height [Numeric] page height written to the ALTO layout
  # @param scaling [Numeric] factor applied to every word coordinate
  def initialize(width, height, scaling = 1.0)
    @width = width
    @height = height
    @scaling = scaling
  end

  # @param words [Array<Hash>] hashes with `:word` and `:coordinates`
  #   (coordinates indexed as hpos, vpos, width, height)
  # @return [String] the ALTO document serialized as XML
  def to_alto(words)
    document = alto_page(@width, @height) do |xml|
      words.each do |entry|
        # the empty text child is kept from the original implementation
        xml.String(string_attributes(entry)) { xml.text '' }
      end
    end
    document.to_xml
  end

  private

  # Attributes for one ALTO `String` element, coordinates scaled.
  def string_attributes(entry)
    coords = entry[:coordinates]
    {
      CONTENT: entry[:word],
      WIDTH: scale_point(coords[2]).to_s,
      HEIGHT: scale_point(coords[3]).to_s,
      HPOS: scale_point(coords[0]).to_s,
      VPOS: scale_point(coords[1]).to_s
    }
  end

  # Attributes describing a box that covers the whole page.
  def full_page_box(pxwidth, pxheight)
    { HEIGHT: pxheight.to_i, WIDTH: pxwidth.to_i, HPOS: '0', VPOS: '0' }
  end

  # given block to manage word generation, wrap with page/block/line
  def alto_page(pxwidth, pxheight, &block)
    Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
      xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
        xml.Description { xml.MeasurementUnit 'pixel' }
        alto_layout(xml, pxwidth, pxheight, &block)
      end
    end
  end

  def scale_point(value)
    # NOTE: presuming non-fractional, even though ALTO 2.1
    #       specifies coordinates are xsd:float, not xsd:int,
    #       simplify to integer value for output:
    (value * @scaling).to_i
  end

  # return layout for page
  def alto_layout(xml, pxwidth, pxheight, &block)
    page_attributes = {
      ID: 'ID1',
      PHYSICAL_IMG_NR: '1',
      HEIGHT: pxheight.to_i,
      WIDTH: pxwidth.to_i
    }
    xml.Layout do
      xml.Page(page_attributes) do
        xml.PrintSpace(full_page_box(pxwidth, pxheight)) do
          alto_blockline(xml, pxwidth, pxheight, &block)
        end
      end
    end
  end

  # make block line and call word-block
  def alto_blockline(xml, pxwidth, pxheight)
    block_attributes = { ID: 'ID1a' }.merge(full_page_box(pxwidth, pxheight))
    xml.TextBlock(block_attributes) do
      xml.TextLine(full_page_box(pxwidth, pxheight)) do
        yield(xml)
      end
    end
  end
end
83
+ end
84
+ end
@@ -0,0 +1,38 @@
1
+ module IiifPrint
2
+ # Module for text extraction (OCR or otherwise)
3
+ module TextExtraction
4
# Groups word coordinates by word text and serializes them, together with
# the canvas dimensions, as JSON.
class WordCoordsBuilder
  # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
  # @param width [Integer] the width of the "canvas" on which the words appear.
  # @param height [Integer] the height of the "canvas" on which the words appear.
  # @return [String] a JSON encoded string.
  def self.json_coordinates_for(words:, width: nil, height: nil)
    new(words, width, height).to_json
  end

  # @param words [Array<Hash>, nil] word hashes (`:word`, `:coordinates`)
  # @param width [Integer, nil] canvas width
  # @param height [Integer, nil] canvas height
  def initialize(words, width = nil, height = nil)
    @words = words
    @width = width
    @height = height
  end

  # Output JSON flattened word coordinates
  #
  # Extra arguments are accepted and ignored so that JSON generators,
  # which call `to_json` with a state/options argument on nested objects,
  # do not raise ArgumentError.
  #
  # @return [String] JSON serialization of flattened word coordinates
  def to_json(*_args)
    # a nil word list is treated as empty
    coordinates = (@words || []).each_with_object({}) do |w, grouped|
      # every occurrence of the same word text shares one list of boxes
      (grouped[w[:word]] ||= []) << w[:coordinates]
    end
    payload = { width: @width, height: @height, coords: coordinates }
    JSON.generate(payload)
  end
end
37
+ end
38
+ end
@@ -0,0 +1,11 @@
1
+ require 'iiif_print/text_extraction/alto_reader'
2
+ require 'iiif_print/text_extraction/hocr_reader'
3
+ require 'iiif_print/text_extraction/page_ocr'
4
+ require 'iiif_print/text_extraction/render_alto'
5
+ require 'iiif_print/text_extraction/word_coords_builder'
6
+
7
+ module IiifPrint
8
+ # Namespace module for text extraction (OCR or otherwise); intentionally empty here, the files required at the top of this file supply its contents.
9
+ module TextExtraction
10
+ end
11
+ end
@@ -0,0 +1,47 @@
1
+ require 'iiif_print/text_formats_from_alto_service'
2
+
3
+ module IiifPrint
4
# Derivative service that writes text derivatives for a file set: from an
# ALTO source when one is available, otherwise by running page OCR.
class TextExtractionDerivativeService < BaseDerivativeService
  # @param [Hash<Symbol,Symbol>]
  #
  # The key for the hash represents the file extension.  The key's value represents the instance
  # method to call on {IiifPrint::TextExtraction::PageOCR}
  class_attribute :ocr_derivatives, default: { txt: :plain, xml: :alto, json: :word_json }
  class_attribute :alto_derivative_service_class, default: IiifPrint::TextFormatsFromALTOService
  class_attribute :page_ocr_service_class, default: IiifPrint::TextExtraction::PageOCR

  def initialize(file_set)
    super(file_set)
  end

  # Delegate to the ALTO-based service when it reports an ALTO path;
  # otherwise create the derivatives from OCR.
  #
  # @param src [String] path of the source file
  def create_derivatives(src)
    from_alto = alto_derivative_service_class.new(
      file_set
    )
    return from_alto.create_derivatives(src) unless from_alto.alto_path.nil?
    create_derivatives_from_ocr(src)
  end

  # Run OCR on the file and write one derivative per entry of
  # `ocr_derivatives`.
  #
  # @param filename [String] path of the source file
  def create_derivatives_from_ocr(filename)
    # TODO: Do we need this source_path instance variable?
    @source_path = filename
    ocr = page_ocr_service_class.new(filename)

    ocr_derivatives.each do |extension, method_name|
      path = prepare_path(extension.to_s)
      write(content: ocr.public_send(method_name), path: path)
    end
  end

  # @param content [String] data to write
  # @param path [String] destination file path
  def write(content:, path:)
    File.open(path, 'w') do |outfile|
      outfile.write(content)
    end
  end

  # Clean up every derivative extension listed in `ocr_derivatives`.
  # `each_key` is required here: `Hash#keys` ignores a block, so iterating
  # with `keys do ... end` never calls the parent's cleanup.
  def cleanup_derivatives(*)
    ocr_derivatives.each_key do |extension|
      super(extension.to_s)
    end
  end
end
47
+ end
@@ -0,0 +1,77 @@
1
+ module IiifPrint
2
# Plugin to make text format derivatives (JSON, plain-text) from ALTO,
#   either existing derivative, or an impending attachment.
#   NOTE: to keep this from conflicting with TextExtractionDerivativeService,
#         this class should be invoked by it, not PluggableDerivativeService.
class TextFormatsFromALTOService < BaseDerivativeService
  self.target_extension = 'tiff'.freeze

  # Write one derivative for the file set as UTF-8 text.
  #
  # @param destination [String] derivative extension/name (e.g. 'json')
  # @param data [String] content to write
  def save_derivative(destination, data)
    # Load/prepare base of "pairtree" dir structure for extension, fileset
    prepare_path(destination)
    target = derivative_path_factory.derivative_path_for_reference(@file_set, destination)
    # Write data as UTF-8 encoded text
    File.open(target, "w:UTF-8") { |io| io.write(data) }
  end

  # @param path [String, nil] candidate file path
  # @return [Boolean] true only for an existing file of non-zero size
  def nonempty_file?(path)
    !path.nil? && File.exist?(path) && !File.size(path).zero?
  end

  # if there was no derivative yet, there might be one in-transit from
  #   an ingest, so check for that, and use its source if applicable:
  #
  # @return [String, nil] path of a non-empty attached 'xml' file, if any
  def incoming_alto_path
    attached_paths = IiifPrint::DerivativeAttachment
                     .where(fileset_id: @file_set.id, destination_name: 'xml')
                     .pluck(:path)
    candidate = attached_paths.uniq.first
    nonempty_file?(candidate) ? candidate : nil
  end

  # @return [String, nil] path to ALTO XML: the existing derivative when
  #   it is non-empty, otherwise any incoming attachment
  def alto_path
    # check first for existing, non-empty derivative data:
    existing = derivative_path_factory.derivative_path_for_reference(@file_set, 'xml')
    nonempty_file?(existing) ? existing : incoming_alto_path
  end

  # @return [String, nil] ALTO XML content read as UTF-8, or nil
  def alto
    location = alto_path
    return nil if location.nil?
    File.read(location, encoding: 'UTF-8')
  end

  # Create JSON and plain-text derivatives from the ALTO source.
  #
  # @param _filename [String] ignored
  def create_derivatives(_filename)
    # as this plugin makes derivatives of derivative, _filename is ignored
    alto_xml = alto
    return if alto_xml.nil?
    # Image width from characterized primary file helps ensure proper scaling:
    primary = @file_set.original_file
    width, height =
      if primary.nil?
        [nil, nil]
      else
        [primary.width[0].to_i, primary.height[0].to_i]
      end
    # ALTOReader is responsible for transcoding, this class just saves result
    reader = IiifPrint::TextExtraction::AltoReader.new(alto_xml, width, height)
    save_derivative('json', reader.json)
    save_derivative('txt', reader.text)
  end

  def cleanup_derivatives(*args)
    # do nothing here; IiifPrint::TextExtractionDerivativeService
    #   has this job instead for cleaning ALTO, JSON, TXT.
  end
end
77
+ end
@@ -0,0 +1,50 @@
1
+ require 'open3'
2
+
3
+ module IiifPrint
4
# Derivative service that produces a TIFF derivative via ImageMagick
# `convert` commands (or the JP2 conversion path for JP2 sources).
class TIFFDerivativeService < BaseDerivativeService
  self.target_extension = 'tiff'.freeze

  # For imagemagick commands, the output type is determined by the
  #   output file's extension.
  # TIFF (LZW, 8 bit grayscale)
  GRAY_CMD = 'convert %<source_file>s ' \
             '-depth 8 -colorspace Gray ' \
             '-compress lzw %<out_file>s'.freeze

  # Monochrome one-bit black/white TIFF, Group 4 compressed:
  MONO_CMD = 'convert %<source_file>s ' \
             '-depth 1 -monochrome -compress Group4 -type bilevel ' \
             '%<out_file>s'.freeze

  # sRGB color TIFF (8 bits per channel, lzw)
  # ImageMagick's -depth is the number of bits per color sample, not per
  # pixel, so 8 bits per channel is `-depth 8` (not 24).
  COLOR_CMD = 'convert %<source_file>s ' \
              '-depth 8 ' \
              '-compress lzw %<out_file>s'.freeze

  def initialize(file_set)
    super(file_set)
  end

  # Get conversion command; the template is MONO_CMD for one-bit
  #   sources, otherwise COLOR_CMD or GRAY_CMD depending on use_color?.
  #
  # @return [String] the ImageMagick command line
  def convert_cmd
    source_path = @source_path
    # '[0]' selects only the first page/frame of a PDF source
    source_path += '[0]' if @source_path.ends_with?('pdf')
    template = use_color? ? COLOR_CMD : GRAY_CMD
    template = MONO_CMD if one_bit?
    format(template, source_file: source_path, out_file: @dest_path)
  end

  # @param filename [String] path of the source file
  def create_derivatives(filename)
    # Base class takes care of loading @source_path, @dest_path
    super(filename)

    # no creation of TIFF derivative if primary is TIFF
    return if mime_type == 'image/tiff'

    return jp2_convert if mime_type == 'image/jp2'
    # Otherwise, get, run imagemagick command to convert
    im_convert
  end
end
50
+ end
@@ -0,0 +1,3 @@
1
+ module IiifPrint
2
+ VERSION = '1.0.0'.freeze # version string of the iiif_print gem (frozen)
3
+ end
@@ -0,0 +1,9 @@
1
+ module IiifPrint
2
module WorksControllerBehaviorDecorator
  # Returns the presenter built by the decorated method, after assigning
  # it the request's base url (which includes the protocol).
  # We need the base url to render the facet links.
  def iiif_manifest_presenter
    presenter = super
    presenter.base_url = request.base_url
    presenter
  end
end
9
+ end