iiif_print 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (211) hide show
  1. checksums.yaml +7 -0
  2. data/.coveralls.yml +2 -0
  3. data/.env +5 -0
  4. data/.fcrepo_wrapper +4 -0
  5. data/.github/release.yml +20 -0
  6. data/.github/workflows/branches.yml +24 -0
  7. data/.github/workflows/build-lint-test-action.yaml +33 -0
  8. data/.github/workflows/release_labels.yml +25 -0
  9. data/.gitignore +52 -0
  10. data/.rubocop.yml +177 -0
  11. data/.solr_wrapper +8 -0
  12. data/.travis.yml +49 -0
  13. data/CONTRIBUTING.md +181 -0
  14. data/Dockerfile +15 -0
  15. data/Gemfile +52 -0
  16. data/LICENSE +203 -0
  17. data/README.md +203 -0
  18. data/Rakefile +38 -0
  19. data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +56 -0
  20. data/app/assets/config/iiif_print_manifest.js +2 -0
  21. data/app/assets/images/iiif_print/.keep +0 -0
  22. data/app/assets/javascripts/iiif_print/autocomplete_fix.js +33 -0
  23. data/app/assets/javascripts/iiif_print/ocr_search.js.erb +6 -0
  24. data/app/assets/javascripts/iiif_print.js +3 -0
  25. data/app/assets/stylesheets/iiif_print/_iiif_print.scss +4 -0
  26. data/app/assets/stylesheets/iiif_print/_issue_search.scss +13 -0
  27. data/app/assets/stylesheets/iiif_print/_issues_calendar.scss +18 -0
  28. data/app/assets/stylesheets/iiif_print/_newspapers_search.scss +38 -0
  29. data/app/assets/stylesheets/iiif_print/_search_results.scss +6 -0
  30. data/app/helpers/hyrax/iiif_helper.rb +22 -0
  31. data/app/helpers/iiif_print/application_helper.rb +5 -0
  32. data/app/helpers/iiif_print_helper.rb +64 -0
  33. data/app/indexers/concerns/iiif_print/child_indexer.rb +34 -0
  34. data/app/indexers/concerns/iiif_print/file_set_indexer.rb +29 -0
  35. data/app/mailers/iiif_print/application_mailer.rb +8 -0
  36. data/app/models/concerns/iiif_print/set_child_flag.rb +29 -0
  37. data/app/models/concerns/iiif_print/solr/document.rb +47 -0
  38. data/app/models/iiif_print/application_record.rb +6 -0
  39. data/app/models/iiif_print/derivative_attachment.rb +8 -0
  40. data/app/models/iiif_print/iiif_search_response_decorator.rb +17 -0
  41. data/app/models/iiif_print/ingest_file_relation.rb +14 -0
  42. data/app/models/iiif_print/pending_relationship.rb +7 -0
  43. data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +10 -0
  44. data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +33 -0
  45. data/app/presenters/iiif_print/work_show_presenter_decorator.rb +29 -0
  46. data/app/renderers/hyrax/renderers/faceted_attribute_renderer_decorator.rb +18 -0
  47. data/app/search_builders/concerns/iiif_print/exclude_models.rb +17 -0
  48. data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +14 -0
  49. data/app/services/iiif_print/manifest_builder_service_behavior.rb +97 -0
  50. data/app/services/iiif_print/pluggable_derivative_service.rb +120 -0
  51. data/app/views/catalog/_snippets_more.html.erb +16 -0
  52. data/app/views/hyrax/base/_representative_media.html.erb +9 -0
  53. data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +8 -0
  54. data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
  55. data/bin/rails +13 -0
  56. data/config/fcrepo_wrapper_test.yml +5 -0
  57. data/config/initializers/assets.rb +2 -0
  58. data/config/locales/iiif_print.de.yml +148 -0
  59. data/config/locales/iiif_print.en.yml +119 -0
  60. data/config/locales/iiif_print.es.yml +148 -0
  61. data/config/locales/iiif_print.fr.yml +149 -0
  62. data/config/locales/iiif_print.it.yml +142 -0
  63. data/config/locales/iiif_print.pt-BR.yml +148 -0
  64. data/config/locales/iiif_print.zh.yml +142 -0
  65. data/config/solr_wrapper_test.yml +9 -0
  66. data/config/test-fixture/solr-config/_rest_managed.json +3 -0
  67. data/config/test-fixture/solr-config/admin-extra.html +31 -0
  68. data/config/test-fixture/solr-config/elevate.xml +36 -0
  69. data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
  70. data/config/test-fixture/solr-config/protwords.txt +21 -0
  71. data/config/test-fixture/solr-config/schema.xml +366 -0
  72. data/config/test-fixture/solr-config/scripts.conf +24 -0
  73. data/config/test-fixture/solr-config/solrconfig.xml +322 -0
  74. data/config/test-fixture/solr-config/spellings.txt +2 -0
  75. data/config/test-fixture/solr-config/stopwords.txt +58 -0
  76. data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
  77. data/config/test-fixture/solr-config/synonyms.txt +31 -0
  78. data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
  79. data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
  80. data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
  81. data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
  82. data/config/vendor/fits.xml +55 -0
  83. data/config/vendor/imagemagick-6-policy.xml +76 -0
  84. data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +12 -0
  85. data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +11 -0
  86. data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +11 -0
  87. data/docker-compose.yml +129 -0
  88. data/iiif_print.gemspec +43 -0
  89. data/lib/generators/iiif_print/assets_generator.rb +29 -0
  90. data/lib/generators/iiif_print/catalog_controller_generator.rb +32 -0
  91. data/lib/generators/iiif_print/install_generator.rb +52 -0
  92. data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +22 -0
  93. data/lib/generators/iiif_print/templates/iiif_print.scss +1 -0
  94. data/lib/iiif_print/base_derivative_service.rb +113 -0
  95. data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +84 -0
  96. data/lib/iiif_print/catalog_search_builder.rb +31 -0
  97. data/lib/iiif_print/configuration.rb +99 -0
  98. data/lib/iiif_print/data/fileset_helper.rb +25 -0
  99. data/lib/iiif_print/data/path_helper.rb +40 -0
  100. data/lib/iiif_print/data/work_derivatives.rb +323 -0
  101. data/lib/iiif_print/data/work_file.rb +92 -0
  102. data/lib/iiif_print/data/work_files.rb +199 -0
  103. data/lib/iiif_print/data.rb +35 -0
  104. data/lib/iiif_print/engine.rb +77 -0
  105. data/lib/iiif_print/errors.rb +9 -0
  106. data/lib/iiif_print/image_tool.rb +119 -0
  107. data/lib/iiif_print/jobs/application_job.rb +8 -0
  108. data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +107 -0
  109. data/lib/iiif_print/jobs/create_relationships_job.rb +78 -0
  110. data/lib/iiif_print/jp2_derivative_service.rb +118 -0
  111. data/lib/iiif_print/jp2_image_metadata.rb +81 -0
  112. data/lib/iiif_print/lineage_service.rb +41 -0
  113. data/lib/iiif_print/metadata.rb +125 -0
  114. data/lib/iiif_print/pdf_derivative_service.rb +42 -0
  115. data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +75 -0
  116. data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +130 -0
  117. data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +85 -0
  118. data/lib/iiif_print/text_extraction/alto_reader.rb +123 -0
  119. data/lib/iiif_print/text_extraction/hocr_reader.rb +172 -0
  120. data/lib/iiif_print/text_extraction/page_ocr.rb +87 -0
  121. data/lib/iiif_print/text_extraction/render_alto.rb +84 -0
  122. data/lib/iiif_print/text_extraction/word_coords_builder.rb +38 -0
  123. data/lib/iiif_print/text_extraction.rb +11 -0
  124. data/lib/iiif_print/text_extraction_derivative_service.rb +47 -0
  125. data/lib/iiif_print/text_formats_from_alto_service.rb +77 -0
  126. data/lib/iiif_print/tiff_derivative_service.rb +50 -0
  127. data/lib/iiif_print/version.rb +3 -0
  128. data/lib/iiif_print/works_controller_behavior.rb +9 -0
  129. data/lib/iiif_print.rb +136 -0
  130. data/lib/tasks/set_child_works.rake +22 -0
  131. data/spec/.keep.txt +1 -0
  132. data/spec/factories/ability.rb +6 -0
  133. data/spec/factories/newspaper_issue.rb +7 -0
  134. data/spec/factories/newspaper_page.rb +7 -0
  135. data/spec/factories/newspaper_page_solr_document.rb +12 -0
  136. data/spec/factories/newspaper_title.rb +8 -0
  137. data/spec/factories/uploaded_pdf_file.rb +9 -0
  138. data/spec/factories/uploaded_txt_file.rb +9 -0
  139. data/spec/factories/user.rb +13 -0
  140. data/spec/fixtures/files/4.1.07.jp2 +0 -0
  141. data/spec/fixtures/files/4.1.07.tiff +0 -0
  142. data/spec/fixtures/files/README.md +7 -0
  143. data/spec/fixtures/files/alto-2-0.xsd +714 -0
  144. data/spec/fixtures/files/broken-truncated.pdf +0 -0
  145. data/spec/fixtures/files/credits.md +16 -0
  146. data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
  147. data/spec/fixtures/files/minimal-1-page.pdf +0 -0
  148. data/spec/fixtures/files/minimal-2-page.pdf +0 -0
  149. data/spec/fixtures/files/minimal-alto.xml +31 -0
  150. data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
  151. data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
  152. data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
  153. data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
  154. data/spec/fixtures/files/ocr_alto.xml +202 -0
  155. data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
  156. data/spec/fixtures/files/ocr_color.tiff +0 -0
  157. data/spec/fixtures/files/ocr_gray.jp2 +0 -0
  158. data/spec/fixtures/files/ocr_gray.tiff +0 -0
  159. data/spec/fixtures/files/ocr_mono.tiff +0 -0
  160. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  161. data/spec/fixtures/files/page1.tiff +0 -0
  162. data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
  163. data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
  164. data/spec/fixtures/files/thumbnail.jpg +0 -0
  165. data/spec/helpers/hyrax/iiif_helper_spec.rb +65 -0
  166. data/spec/helpers/iiif_print_helper_spec.rb +43 -0
  167. data/spec/iiif_print/base_derivative_service_spec.rb +11 -0
  168. data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +51 -0
  169. data/spec/iiif_print/catalog_search_builder_spec.rb +60 -0
  170. data/spec/iiif_print/configuration_spec.rb +67 -0
  171. data/spec/iiif_print/data/work_derivatives_spec.rb +245 -0
  172. data/spec/iiif_print/data/work_file_spec.rb +99 -0
  173. data/spec/iiif_print/data/work_files_spec.rb +237 -0
  174. data/spec/iiif_print/image_tool_spec.rb +109 -0
  175. data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +30 -0
  176. data/spec/iiif_print/jobs/create_relationships_job_spec.rb +17 -0
  177. data/spec/iiif_print/jp2_image_metadata_spec.rb +37 -0
  178. data/spec/iiif_print/lineage_service_spec.rb +13 -0
  179. data/spec/iiif_print/metadata_spec.rb +115 -0
  180. data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +6 -0
  181. data/spec/iiif_print/text_extraction/alto_reader_spec.rb +49 -0
  182. data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +45 -0
  183. data/spec/iiif_print/text_extraction/page_ocr_spec.rb +84 -0
  184. data/spec/iiif_print/text_extraction/render_alto_spec.rb +54 -0
  185. data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +44 -0
  186. data/spec/iiif_print_spec.rb +51 -0
  187. data/spec/misc_shared.rb +111 -0
  188. data/spec/models/iiif_print/derivative_attachment_spec.rb +37 -0
  189. data/spec/models/iiif_print/ingest_file_relation_spec.rb +56 -0
  190. data/spec/models/solr_document_spec.rb +14 -0
  191. data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +19 -0
  192. data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +49 -0
  193. data/spec/services/iiif_print/jp2_derivative_service_spec.rb +59 -0
  194. data/spec/services/iiif_print/pdf_derivative_service_spec.rb +66 -0
  195. data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +178 -0
  196. data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +82 -0
  197. data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +127 -0
  198. data/spec/services/iiif_print/tiff_derivative_service_spec.rb +65 -0
  199. data/spec/spec_helper.rb +181 -0
  200. data/spec/support/controller_level_helpers.rb +28 -0
  201. data/spec/support/iiif_print_models.rb +127 -0
  202. data/spec/test_app_templates/blacklight.yml +9 -0
  203. data/spec/test_app_templates/fedora.yml +15 -0
  204. data/spec/test_app_templates/lib/generators/test_app_generator.rb +40 -0
  205. data/spec/test_app_templates/redis.yml +9 -0
  206. data/spec/test_app_templates/solr/conf/schema.xml +362 -0
  207. data/spec/test_app_templates/solr/conf/solrconfig.xml +322 -0
  208. data/spec/test_app_templates/solr.yml +7 -0
  209. data/tasks/iiif_print_dev.rake +34 -0
  210. data/tmp/.keep +0 -0
  211. metadata +605 -0
@@ -0,0 +1,172 @@
1
+ require 'active_support/core_ext/module/delegation'
2
+ require 'json'
3
+ require 'nokogiri'
4
+
5
module IiifPrint
  # Module for text extraction
  module TextExtraction
    # Obtains plain text and JSON word-coordinates from an hOCR source.
    # - Coordinates are in px units, unlike ALTO, which may have scaling
    #   concerns.
    class HOCRReader
      attr_accessor :source, :doc_stream
      delegate :text, :width, :height, :words, to: :doc_stream

      # SAX document stream that gathers plain text and word tokens from hOCR.
      class HOCRDocStream < Nokogiri::XML::SAX::Document
        attr_accessor :text, :words, :width, :height

        # The only `element.class` selectors this stream reacts to.
        CONSIDERED_SELECTORS = ['div.ocr_page', 'span.ocr_line', 'span.ocrx_word'].freeze

        def initialize
          super()
          # plain text buffer:
          @text = ''
          # list of word hashes, each holding :word and :coordinates:
          @words = []
          # page width and height, read from the bbox of `div.ocr_page`
          @width = nil
          @height = nil
          # the word being read; non-nil only between the start and the end
          # of a `span.ocrx_word`, so that text outside of words (and repeated
          # end events) cannot touch a word that was already stored.
          @current = nil
          # One entry per currently open element: the hOCR class name when the
          # element is one we consider, nil otherwise. #end_element pops it to
          # learn which element is closing (SAX end events carry no
          # attributes).
          @open_elements = []
        end

        # Return coordinates from `span.ocrx_word` element attribute hash
        #
        # @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
        # @return [Array] Array of position x, y, width, height in px; empty
        #   when the element title carries no complete bbox.
        def s_coords(attrs)
          x1, y1, x2, y2 = bbox_values(attrs['title'])
          return [] if y2.nil?
          [x1, y1, x2 - x1, y2 - y1]
        end

        # Consider element for processing?
        # - `div.ocr_page` — to get page width/height
        # - `span.ocr_line` — to help make plain text readable
        # - `span.ocrx_word` — for word-coordinate JSON and plain text word
        # @param name [String] Element name
        # @param class_name [String] HTML class name
        # @return [Boolean] true if element should be processed; otherwise false
        def consider?(name, class_name)
          CONSIDERED_SELECTORS.include?("#{name}.#{class_name}")
        end

        # Begin a new word; its text is filled in by #characters.
        #
        # @param attrs [Hash] attributes of the `span.ocrx_word` element
        def start_word(attrs)
          @current = {}
          # will be replaced during #characters method call:
          @current[:word] = nil
          @current[:coordinates] = s_coords(attrs)
        end

        # Record page width and height from the page element's bbox.
        #
        # @param attrs [Hash] attributes of the `div.ocr_page` element
        def start_page(attrs)
          bbox = bbox_values(attrs['title'])
          # leave width/height untouched when the page has no usable bbox
          return if bbox.size < 4
          # width and height:
          @width = bbox[2]
          @height = bbox[3]
        end

        # @return [Boolean] whether the current word has text and four
        #   coordinate values
        def word_complete?
          return false if @current.nil?
          coords = @current[:coordinates]
          @current[:word] && !@current[:word].empty? && coords.size == 4
        end

        # Finish the current word: store it when complete, then reset state.
        def end_word
          # add trailing space to plaintext buffer for between words:
          @text += ' '
          @words.push(@current) if word_complete?
          # the word is finished; later text must not be appended to it
          @current = nil
        end

        # Finish a line of plain text.
        def end_line
          # strip trailing whitespace
          @text.strip!
          # then insert a line break
          @text += "\n"
        end

        # Callback for element start, ignores elements except for:
        # - `div.ocr_page` — to get page width/height
        # - `span.ocr_line` — to help make plain text readable
        # - `span.ocrx_word` — for word-coordinate JSON and plain text word
        #
        # @param name [String] element name.
        # @param attrs [Array] Array of key, value pair Arrays.
        def start_element(name, attrs = [])
          attributes = attrs.to_h
          class_name = attributes['class']
          considered = consider?(name, class_name)
          # remember what was opened so #end_element can act on the element
          # that is actually closing
          @open_elements.push(considered ? class_name : nil)
          return unless considered
          start_word(attributes) if class_name == 'ocrx_word'
          start_page(attributes) if class_name == 'ocr_page'
        end

        # Callback for text nodes; only text inside a word is collected.
        #
        # @param value [String] text content
        def characters(value)
          return if @current.nil?
          return if @current[:coordinates].nil?
          @current[:word] ||= ''
          @current[:word] += value
          @text += value
        end

        # Callback for element end; at this time, flush word coordinate state
        # for the word being closed, and append line endings to plain text
        # when a line closes.
        # NOTE(review): relies on the parser emitting one end event per start
        # event; an unmatched end event pops nil and is ignored.
        #
        # @param _name [String] element name.
        def end_element(_name)
          class_name = @open_elements.pop
          end_line if class_name == 'ocr_line'
          end_word if class_name == 'ocrx_word'
        end

        # Callback for completion of parsing hOCR, used to normalize generated
        # text content (strip unneeded whitespace incidental to output).
        def end_document
          # postprocess @text to remove trailing spaces on lines
          @text = @text.split("\n").map(&:strip).join("\n")
          # remove excess line break
          @text.gsub!(/\n+/, "\n")
          # remove carriage returns in place (String#delete would only return
          # a copy and leave the buffer unchanged)
          @text.delete!("\r")
          # remove trailing whitespace at end of buffer
          @text.strip!
        end

        private

        # Extract the integer values of the `bbox` property from an hOCR
        # `title` attribute, wherever in the property list it appears.
        #
        # @param title [String, nil] e.g. 'image "p.tif"; bbox 0 0 100 200'
        # @return [Array<Integer>] bbox values, or [] when there is no bbox
        def bbox_values(title)
          property = title.to_s.split(';').find { |field| field.include?('bbox ') }
          return [] if property.nil?
          property.split('bbox ')[-1].split(' ').map(&:to_i)
        end
      end

      # Construct with either path or HTML [String], and process the document
      #
      # @param html [String] hOCR markup, or path to a file containing it
      def initialize(html)
        @source = isxml?(html) ? html : File.read(html)
        @doc_stream = HOCRDocStream.new
        parser = Nokogiri::HTML::SAX::Parser.new(doc_stream)
        parser.parse(@source)
      end

      # Determine if source parameter is path or xml/html
      #
      # @param xml [String] either path to xml file or xml source
      # @return [true, false] true if value appears to be XML/HTML, not path
      def isxml?(xml)
        xml.lstrip.start_with?('<')
      end

      # Output JSON flattened word coordinates
      #
      # @return [String] JSON serialization of flattened word coordinates
      def json
        words = @doc_stream.words
        IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
          words: words,
          width: @doc_stream.width,
          height: @doc_stream.height
        )
      end
    end
  end
end
@@ -0,0 +1,87 @@
1
+ require 'json'
2
+ require 'open3'
3
+ require 'tmpdir'
4
+
5
+ # --
6
module IiifPrint
  # Module for text extraction (OCR or otherwise)
  module TextExtraction
    # Runs the `tesseract` command on one page image and exposes the result as
    # words with coordinates, plain text, word-coordinate JSON and ALTO XML.
    class PageOCR
      attr_accessor :html, :path

      # @param path [String] path to the page image
      # @param additional_tessearct_options [String, nil] extra options that
      #   are appended verbatim to the tesseract command line (the keyword's
      #   spelling is kept as-is for callers)
      def initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options)
        @path = path
        # hOCR html:
        @html = nil
        @words = nil
        @source_meta = nil
        @box = nil
        @plain = nil
        @additional_tessearct_options = additional_tessearct_options
      end

      # Run tesseract, writing hOCR into a fresh temporary directory.
      #
      # @return [String] path of the hOCR file tesseract is asked to write
      def run_ocr
        outfile = File.join(Dir.mktmpdir, 'output_html')
        `#{ocr_command(outfile)}`
        outfile + '.hocr'
      end

      # Preprocess the image, run OCR, and cache words and plain text.
      def load_words
        preprocess_image
        html_path = run_ocr
        reader = IiifPrint::TextExtraction::HOCRReader.new(html_path)
        @words = reader.words
        @plain = reader.text
      end

      # @return [Array<Hash>] words with coordinates (runs OCR on first use)
      def words
        load_words if @words.nil?
        @words
      end

      # @return [String] JSON serialization of the word coordinates
      def word_json
        IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
          words: words,
          width: width,
          height: height
        )
      end

      # @return [String] plain text (runs OCR on first use)
      def plain
        load_words if @plain.nil?
        @plain
      end

      # @return [Hash] image metadata for the current path, memoized
      def identify
        return @source_meta unless @source_meta.nil?
        @source_meta = IiifPrint::ImageTool.new(@path).metadata
      end

      def width
        identify[:width]
      end

      def height
        identify[:height]
      end

      # @return [String] ALTO XML for the recognized words
      def alto
        writer = IiifPrint::TextExtraction::RenderAlto.new(width, height)
        writer.to_alto(words)
      end

      private

      # Build the tesseract command line. The image path and output base are
      # shell-escaped so that spaces or shell metacharacters in file names
      # cannot break (or inject into) the command; the additional options are
      # appended verbatim because they are expected to hold several
      # space-separated arguments.
      #
      # @param outfile [String] output base name (tesseract adds `.hocr`)
      # @return [String] the shell command
      def ocr_command(outfile)
        require 'shellwords'
        cmd = "tesseract #{Shellwords.escape(path)} #{Shellwords.escape(outfile)} hocr"
        extra = @additional_tessearct_options
        cmd += " #{extra}" if extra.present?
        cmd
      end

      # transform the image into a one-bit TIFF for OCR
      def preprocess_image
        tool = IiifPrint::ImageTool.new(@path)
        return if tool.metadata[:color] == 'monochrome'
        intermediate_path = File.join(Dir.mktmpdir, 'monochrome-interim.tif')
        tool.convert(intermediate_path, true)
        @path = intermediate_path
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
+ require 'nokogiri'
2
+
3
module IiifPrint
  # Module for text extraction (OCR or otherwise)
  module TextExtraction
    # Serializes words with pixel coordinates as an ALTO XML document holding
    # one page, one text block and one text line.
    class RenderAlto
      # @param width [Numeric] page width
      # @param height [Numeric] page height
      # @param scaling [Numeric] factor applied to every word coordinate
      def initialize(width, height, scaling = 1.0)
        @width = width
        @height = height
        @scaling = scaling
      end

      # @param words [Array<Hash>] hashes with `:word` and `:coordinates`
      #   (x, y, width, height)
      # @return [String] ALTO XML
      def to_alto(words)
        document = alto_page(@width, @height) do |xml|
          words.each { |entry| append_string(xml, entry) }
        end
        document.to_xml
      end

      private

      # emit one ALTO String element for a word
      def append_string(xml, entry)
        box = entry[:coordinates]
        xml.String(
          CONTENT: entry[:word],
          WIDTH: scale_point(box[2]).to_s,
          HEIGHT: scale_point(box[3]).to_s,
          HPOS: scale_point(box[0]).to_s,
          VPOS: scale_point(box[1]).to_s
        ) { xml.text '' }
      end

      # wrap the word-generating block in alto/Description/Layout elements
      # and hand back the builder
      def alto_page(pxwidth, pxheight, &block)
        Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
          xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
            xml.Description { xml.MeasurementUnit 'pixel' }
            alto_layout(xml, pxwidth, pxheight, &block)
          end
        end
      end

      # NOTE: output is truncated to an integer even though ALTO 2.1 declares
      # coordinates as xsd:float rather than xsd:int.
      def scale_point(value)
        scaled = value * @scaling
        scaled.to_i
      end

      # Layout > Page > PrintSpace, all sized to the full page
      def alto_layout(xml, pxwidth, pxheight, &block)
        page_height = pxheight.to_i
        page_width = pxwidth.to_i
        xml.Layout do
          xml.Page(ID: 'ID1', PHYSICAL_IMG_NR: '1', HEIGHT: page_height, WIDTH: page_width) do
            xml.PrintSpace(HEIGHT: page_height, WIDTH: page_width, HPOS: '0', VPOS: '0') do
              alto_blockline(xml, pxwidth, pxheight, &block)
            end
          end
        end
      end

      # TextBlock > TextLine, both sized to the full page; yields the builder
      # so the caller can add the words
      def alto_blockline(xml, pxwidth, pxheight)
        full_page = { HEIGHT: pxheight.to_i, WIDTH: pxwidth.to_i, HPOS: '0', VPOS: '0' }
        xml.TextBlock({ ID: 'ID1a' }.merge(full_page)) do
          xml.TextLine(full_page) do
            yield(xml)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
module IiifPrint
  # Module for text extraction (OCR or otherwise)
  module TextExtraction
    # Builds the JSON payload that maps each distinct word to the list of
    # coordinates at which it occurs.
    class WordCoordsBuilder
      # @param words [Array<Hash>] hashes that have the keys `:word` and `:coordinates`.
      # @param width [Integer] the width of the "canvas" on which the words appear.
      # @param height [Integer] the height of the "canvas" on which the words appear.
      # @return [String] a JSON encoded string.
      def self.json_coordinates_for(words:, width: nil, height: nil)
        builder = new(words, width, height)
        builder.to_json
      end

      def initialize(words, width = nil, height = nil)
        @words = words
        @width = width
        @height = height
      end

      # Serialize the words, grouped by their text, together with the canvas
      # dimensions.
      #
      # @return [String] JSON serialization of flattened word coordinates
      def to_json
        grouped = @words.each_with_object({}) do |entry, by_word|
          (by_word[entry[:word]] ||= []) << entry[:coordinates]
        end
        JSON.generate({ width: @width, height: @height, coords: grouped })
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ require 'iiif_print/text_extraction/alto_reader'
2
+ require 'iiif_print/text_extraction/hocr_reader'
3
+ require 'iiif_print/text_extraction/page_ocr'
4
+ require 'iiif_print/text_extraction/render_alto'
5
+ require 'iiif_print/text_extraction/word_coords_builder'
6
+
7
module IiifPrint
  # Namespace for text extraction (OCR or otherwise); its members are loaded
  # by the require statements at the top of this file.
  module TextExtraction; end
end
@@ -0,0 +1,47 @@
1
+ require 'iiif_print/text_formats_from_alto_service'
2
+
3
module IiifPrint
  # Creates text derivatives for a file set: from ALTO when the ALTO service
  # reports an ALTO path, otherwise by running OCR on the source file.
  class TextExtractionDerivativeService < BaseDerivativeService
    # @param [Hash<Symbol,Symbol>]
    #
    # The key for the hash represents the file extension. The key's value represents the instance
    # method to call on {IiifPrint::TextExtraction::PageOCR}
    class_attribute :ocr_derivatives, default: { txt: :plain, xml: :alto, json: :word_json }
    class_attribute :alto_derivative_service_class, default: IiifPrint::TextFormatsFromALTOService
    class_attribute :page_ocr_service_class, default: IiifPrint::TextExtraction::PageOCR

    def initialize(file_set)
      super(file_set)
    end

    # @param src [String] path of the source file
    def create_derivatives(src)
      from_alto = alto_derivative_service_class.new(
        file_set
      )
      return from_alto.create_derivatives(src) unless from_alto.alto_path.nil?
      create_derivatives_from_ocr(src)
    end

    # Run OCR on the file and write one derivative per configured extension.
    #
    # @param filename [String] path of the source file
    def create_derivatives_from_ocr(filename)
      # TODO: Do we need this source_path instance variable?
      @source_path = filename
      ocr = page_ocr_service_class.new(filename)

      ocr_derivatives.each do |extension, method_name|
        path = prepare_path(extension.to_s)
        write(content: ocr.public_send(method_name), path: path)
      end
    end

    # @param content [String] data to write
    # @param path [String] destination path
    def write(content:, path:)
      File.open(path, 'w') do |outfile|
        outfile.write(content)
      end
    end

    # Clean up the derivative of every configured extension.
    def cleanup_derivatives(*)
      # Hash#keys ignores a block, so iterate with #each_key; otherwise the
      # superclass cleanup is never invoked.
      ocr_derivatives.each_key do |extension|
        super(extension.to_s)
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
module IiifPrint
  # Plugin to make text format derivatives (JSON, plain-text) from ALTO,
  # either an existing derivative, or an impending attachment.
  # NOTE: to keep this from conflicting with TextExtractionDerivativeService,
  #   this class should be invoked by it, not PluggableDerivativeService.
  class TextFormatsFromALTOService < BaseDerivativeService
    self.target_extension = 'tiff'.freeze

    # Write data, UTF-8 encoded, to the file set's derivative path for the
    # given destination name.
    def save_derivative(destination, data)
      # Load/prepare base of "pairtree" dir structure for extension, fileset
      prepare_path(destination)
      target = derivative_path_factory.derivative_path_for_reference(@file_set, destination)
      File.open(target, "w:UTF-8") { |out| out.write(data) }
    end

    # @return [Boolean] true only for a path naming an existing file whose
    #   size is not zero
    def nonempty_file?(path)
      return false if path.nil? || !File.exist?(path)
      File.size(path) != 0
    end

    # When no derivative exists yet there may be one in transit from an
    # ingest; return its source path if it names a non-empty file.
    def incoming_alto_path
      attachments = IiifPrint::DerivativeAttachment.where(
        fileset_id: @file_set.id,
        destination_name: 'xml'
      )
      candidate = attachments.pluck(:path).uniq.first
      candidate if nonempty_file?(candidate)
    end

    # Prefer existing, non-empty derivative data; otherwise fall back to an
    # incoming attachment.
    def alto_path
      existing = derivative_path_factory.derivative_path_for_reference(@file_set, 'xml')
      nonempty_file?(existing) ? existing : incoming_alto_path
    end

    # @return [String, nil] ALTO source read as UTF-8, or nil without a path
    def alto
      located = alto_path
      return nil if located.nil?
      File.read(located, encoding: 'UTF-8')
    end

    # As this plugin makes derivatives of a derivative, _filename is ignored.
    def create_derivatives(_filename)
      alto_source = alto
      return if alto_source.nil?
      # Image size from the characterized primary file helps ensure proper
      # scaling:
      primary = @file_set.original_file
      width = primary.width[0].to_i unless primary.nil?
      height = primary.height[0].to_i unless primary.nil?
      # AltoReader is responsible for transcoding, this class just saves result
      reader = IiifPrint::TextExtraction::AltoReader.new(alto_source, width, height)
      save_derivative('json', reader.json)
      save_derivative('txt', reader.text)
    end

    # Intentionally a no-op: IiifPrint::TextExtractionDerivativeService has
    # the job of cleaning ALTO, JSON, TXT.
    def cleanup_derivatives(*_args); end
  end
end
@@ -0,0 +1,50 @@
1
+ require 'open3'
2
+
3
module IiifPrint
  # Derivative service producing a TIFF from a non-TIFF primary file.
  class TIFFDerivativeService < BaseDerivativeService
    self.target_extension = 'tiff'.freeze

    # For imagemagick commands, the output type is determined by the
    # output file's extension.

    # TIFF (LZW, 8 bit grayscale)
    GRAY_CMD = 'convert %<source_file>s -depth 8 -colorspace Gray -compress lzw %<out_file>s'.freeze

    # Monochrome one-bit black/white TIFF, Group 4 compressed:
    MONO_CMD = 'convert %<source_file>s -depth 1 -monochrome -compress Group4 -type bilevel %<out_file>s'.freeze

    # sRGB color TIFF (8 bits per channel, lzw)
    COLOR_CMD = 'convert %<source_file>s -depth 24 -compress lzw %<out_file>s'.freeze

    def initialize(file_set)
      super
    end

    # Get conversion command; the template depends on whether the material
    # is one-bit, color or grayscale. For a PDF source, `[0]` is appended to
    # the source path.
    def convert_cmd
      input = @source_path.ends_with?('pdf') ? @source_path + '[0]' : @source_path
      template = use_color? ? COLOR_CMD : GRAY_CMD
      template = MONO_CMD if one_bit?
      format(template, source_file: input, out_file: @dest_path)
    end

    def create_derivatives(filename)
      # Base class takes care of loading @source_path, @dest_path
      super(filename)

      if mime_type == 'image/tiff'
        # no creation of TIFF derivative if primary is TIFF
        nil
      elsif mime_type == 'image/jp2'
        jp2_convert
      else
        # Otherwise, get, run imagemagick command to convert
        im_convert
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module IiifPrint
  # Version string of this gem release.
  VERSION = '1.0.0'.freeze
end
@@ -0,0 +1,9 @@
1
module IiifPrint
  module WorksControllerBehaviorDecorator
    # Give the manifest presenter built by the decorated method the request's
    # base url (which includes the protocol); the base url is needed to
    # render the facet links.
    #
    # @return the presenter returned by the decorated method
    def iiif_manifest_presenter
      presenter = super
      presenter.base_url = request.base_url
      presenter
    end
  end
end