iiif_print 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.coveralls.yml +2 -0
- data/.env +5 -0
- data/.fcrepo_wrapper +4 -0
- data/.github/release.yml +20 -0
- data/.github/workflows/branches.yml +24 -0
- data/.github/workflows/build-lint-test-action.yaml +33 -0
- data/.github/workflows/release_labels.yml +25 -0
- data/.gitignore +52 -0
- data/.rubocop.yml +177 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +49 -0
- data/CONTRIBUTING.md +181 -0
- data/Dockerfile +15 -0
- data/Gemfile +52 -0
- data/LICENSE +203 -0
- data/README.md +203 -0
- data/Rakefile +38 -0
- data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +56 -0
- data/app/assets/config/iiif_print_manifest.js +2 -0
- data/app/assets/images/iiif_print/.keep +0 -0
- data/app/assets/javascripts/iiif_print/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/iiif_print/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/iiif_print.js +3 -0
- data/app/assets/stylesheets/iiif_print/_iiif_print.scss +4 -0
- data/app/assets/stylesheets/iiif_print/_issue_search.scss +13 -0
- data/app/assets/stylesheets/iiif_print/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/iiif_print/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/iiif_print/_search_results.scss +6 -0
- data/app/helpers/hyrax/iiif_helper.rb +22 -0
- data/app/helpers/iiif_print/application_helper.rb +5 -0
- data/app/helpers/iiif_print_helper.rb +64 -0
- data/app/indexers/concerns/iiif_print/child_indexer.rb +34 -0
- data/app/indexers/concerns/iiif_print/file_set_indexer.rb +29 -0
- data/app/mailers/iiif_print/application_mailer.rb +8 -0
- data/app/models/concerns/iiif_print/set_child_flag.rb +29 -0
- data/app/models/concerns/iiif_print/solr/document.rb +47 -0
- data/app/models/iiif_print/application_record.rb +6 -0
- data/app/models/iiif_print/derivative_attachment.rb +8 -0
- data/app/models/iiif_print/iiif_search_response_decorator.rb +17 -0
- data/app/models/iiif_print/ingest_file_relation.rb +14 -0
- data/app/models/iiif_print/pending_relationship.rb +7 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +10 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +33 -0
- data/app/presenters/iiif_print/work_show_presenter_decorator.rb +29 -0
- data/app/renderers/hyrax/renderers/faceted_attribute_renderer_decorator.rb +18 -0
- data/app/search_builders/concerns/iiif_print/exclude_models.rb +17 -0
- data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +14 -0
- data/app/services/iiif_print/manifest_builder_service_behavior.rb +97 -0
- data/app/services/iiif_print/pluggable_derivative_service.rb +120 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/hyrax/base/_representative_media.html.erb +9 -0
- data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +8 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/iiif_print.de.yml +148 -0
- data/config/locales/iiif_print.en.yml +119 -0
- data/config/locales/iiif_print.es.yml +148 -0
- data/config/locales/iiif_print.fr.yml +149 -0
- data/config/locales/iiif_print.it.yml +142 -0
- data/config/locales/iiif_print.pt-BR.yml +148 -0
- data/config/locales/iiif_print.zh.yml +142 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/fits.xml +55 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +11 -0
- data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +11 -0
- data/docker-compose.yml +129 -0
- data/iiif_print.gemspec +43 -0
- data/lib/generators/iiif_print/assets_generator.rb +29 -0
- data/lib/generators/iiif_print/catalog_controller_generator.rb +32 -0
- data/lib/generators/iiif_print/install_generator.rb +52 -0
- data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +22 -0
- data/lib/generators/iiif_print/templates/iiif_print.scss +1 -0
- data/lib/iiif_print/base_derivative_service.rb +113 -0
- data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +84 -0
- data/lib/iiif_print/catalog_search_builder.rb +31 -0
- data/lib/iiif_print/configuration.rb +99 -0
- data/lib/iiif_print/data/fileset_helper.rb +25 -0
- data/lib/iiif_print/data/path_helper.rb +40 -0
- data/lib/iiif_print/data/work_derivatives.rb +323 -0
- data/lib/iiif_print/data/work_file.rb +92 -0
- data/lib/iiif_print/data/work_files.rb +199 -0
- data/lib/iiif_print/data.rb +35 -0
- data/lib/iiif_print/engine.rb +77 -0
- data/lib/iiif_print/errors.rb +9 -0
- data/lib/iiif_print/image_tool.rb +119 -0
- data/lib/iiif_print/jobs/application_job.rb +8 -0
- data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +107 -0
- data/lib/iiif_print/jobs/create_relationships_job.rb +78 -0
- data/lib/iiif_print/jp2_derivative_service.rb +118 -0
- data/lib/iiif_print/jp2_image_metadata.rb +81 -0
- data/lib/iiif_print/lineage_service.rb +41 -0
- data/lib/iiif_print/metadata.rb +125 -0
- data/lib/iiif_print/pdf_derivative_service.rb +42 -0
- data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +75 -0
- data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +130 -0
- data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +85 -0
- data/lib/iiif_print/text_extraction/alto_reader.rb +123 -0
- data/lib/iiif_print/text_extraction/hocr_reader.rb +172 -0
- data/lib/iiif_print/text_extraction/page_ocr.rb +87 -0
- data/lib/iiif_print/text_extraction/render_alto.rb +84 -0
- data/lib/iiif_print/text_extraction/word_coords_builder.rb +38 -0
- data/lib/iiif_print/text_extraction.rb +11 -0
- data/lib/iiif_print/text_extraction_derivative_service.rb +47 -0
- data/lib/iiif_print/text_formats_from_alto_service.rb +77 -0
- data/lib/iiif_print/tiff_derivative_service.rb +50 -0
- data/lib/iiif_print/version.rb +3 -0
- data/lib/iiif_print/works_controller_behavior.rb +9 -0
- data/lib/iiif_print.rb +136 -0
- data/lib/tasks/set_child_works.rake +22 -0
- data/spec/.keep.txt +1 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/uploaded_txt_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/helpers/hyrax/iiif_helper_spec.rb +65 -0
- data/spec/helpers/iiif_print_helper_spec.rb +43 -0
- data/spec/iiif_print/base_derivative_service_spec.rb +11 -0
- data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +51 -0
- data/spec/iiif_print/catalog_search_builder_spec.rb +60 -0
- data/spec/iiif_print/configuration_spec.rb +67 -0
- data/spec/iiif_print/data/work_derivatives_spec.rb +245 -0
- data/spec/iiif_print/data/work_file_spec.rb +99 -0
- data/spec/iiif_print/data/work_files_spec.rb +237 -0
- data/spec/iiif_print/image_tool_spec.rb +109 -0
- data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +30 -0
- data/spec/iiif_print/jobs/create_relationships_job_spec.rb +17 -0
- data/spec/iiif_print/jp2_image_metadata_spec.rb +37 -0
- data/spec/iiif_print/lineage_service_spec.rb +13 -0
- data/spec/iiif_print/metadata_spec.rb +115 -0
- data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +6 -0
- data/spec/iiif_print/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +45 -0
- data/spec/iiif_print/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/iiif_print/text_extraction/render_alto_spec.rb +54 -0
- data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +44 -0
- data/spec/iiif_print_spec.rb +51 -0
- data/spec/misc_shared.rb +111 -0
- data/spec/models/iiif_print/derivative_attachment_spec.rb +37 -0
- data/spec/models/iiif_print/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +19 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +49 -0
- data/spec/services/iiif_print/jp2_derivative_service_spec.rb +59 -0
- data/spec/services/iiif_print/pdf_derivative_service_spec.rb +66 -0
- data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +178 -0
- data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +127 -0
- data/spec/services/iiif_print/tiff_derivative_service_spec.rb +65 -0
- data/spec/spec_helper.rb +181 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/support/iiif_print_models.rb +127 -0
- data/spec/test_app_templates/blacklight.yml +9 -0
- data/spec/test_app_templates/fedora.yml +15 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +40 -0
- data/spec/test_app_templates/redis.yml +9 -0
- data/spec/test_app_templates/solr/conf/schema.xml +362 -0
- data/spec/test_app_templates/solr/conf/solrconfig.xml +322 -0
- data/spec/test_app_templates/solr.yml +7 -0
- data/tasks/iiif_print_dev.rake +34 -0
- data/tmp/.keep +0 -0
- metadata +605 -0
@@ -0,0 +1,172 @@
|
|
1
|
+
require 'active_support/core_ext/module/delegation'
|
2
|
+
require 'json'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
module IiifPrint
|
6
|
+
# Module for text extraction
|
7
|
+
module TextExtraction
|
8
|
+
# Class to obtain plain text and JSON word-coordinates from hOCR source
|
9
|
+
# - Coordinates in px units, unlike ALTO, which may have scaling concerns
|
10
|
+
class HOCRReader
|
11
|
+
attr_accessor :source, :doc_stream
|
12
|
+
delegate :text, :width, :height, :words, to: :doc_stream
|
13
|
+
|
14
|
+
# SAX document handler that gathers plain text and word tokens from hOCR.
#
# Elements acted upon (everything else is ignored):
# - `div.ocr_page` — page width/height
# - `span.ocr_line` — line breaks for readable plain text
# - `span.ocrx_word` — word text plus pixel coordinates
#
# NOTE(review): #end_element keys off the class of the most recently
# *started* element, not the element actually being closed, so markup
# nested inside a word or line changes which end handler fires — confirm
# this matches the hOCR being fed in.
class HOCRDocStream < Nokogiri::XML::SAX::Document
  attr_accessor :text, :words, :width, :height

  def initialize
    super()
    # plain text buffer:
    @text = ''
    # list of word hashes, each containing :word and :coordinates:
    @words = []
    # page width and height, found in hOCR on `div.ocr_page`:
    @width = nil
    @height = nil
    # current word state shared across #start_element, #characters and
    # #end_element (associates a word with its coordinates):
    @current = nil
    # class name of the most recently started element, read by #end_element:
    @element_class_name = nil
  end

  # Return coordinates from `span.ocrx_word` element attribute hash
  #
  # @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
  # @return [Array] Array of position x, y, width, height in px.
  def s_coords(attrs)
    x1, y1, x2, y2 = bbox_values(attrs['title'], 0)
    [x1, y1, x2 - x1, y2 - y1]
  end

  # Consider element for processing?
  #
  # @param name [String] Element name
  # @param class_name [String] HTML class name
  # @return [Boolean] true if element should be processed; otherwise false
  def consider?(name, class_name)
    selector = "#{name}.#{class_name}"
    ['div.ocr_page', 'span.ocr_line', 'span.ocrx_word'].include?(selector)
  end

  # Begin a new word: reset the current-word state and record coordinates.
  #
  # @param attrs [Hash] attributes of the `span.ocrx_word` element
  def start_word(attrs)
    @current = {}
    # will be replaced during #characters method call:
    @current[:word] = nil
    @current[:coordinates] = s_coords(attrs)
  end

  # Record page dimensions from the `bbox` property of `div.ocr_page`.
  #
  # @param attrs [Hash] attributes of the `div.ocr_page` element
  def start_page(attrs)
    bbox = bbox_values(attrs['title'], 1)
    # width and height:
    @width = bbox[2]
    @height = bbox[3]
  end

  # @return [Boolean, nil] truthy when the current word has non-empty text
  #   and a four-element coordinate list
  def word_complete?
    return false if @current.nil?
    coords = @current[:coordinates]
    @current[:word] && !@current[:word].empty? && coords.size == 4
  end

  # Finish a word: separate it from the next one in the plain text and
  # keep it if it is complete.
  def end_word
    # add trailing space to plaintext buffer for between words:
    @text += ' '
    @words.push(@current) if word_complete?
  end

  # Finish a line: trim the buffer, then append a line break.
  def end_line
    @text.strip!
    @text += "\n"
  end

  # Callback for element start; remembers the element's class and starts
  # word/page handling for the elements listed on the class.
  #
  # @param name [String] element name.
  # @param attrs [Array] Array of key, value pair Arrays.
  def start_element(name, attrs = [])
    attributes = attrs.to_h
    @element_class_name = attributes['class']
    return unless consider?(name, @element_class_name)
    start_word(attributes) if @element_class_name == 'ocrx_word'
    start_page(attributes) if @element_class_name == 'ocr_page'
  end

  # Callback for character data; appended to both the current word and
  # the plain text buffer once a word with coordinates has been started.
  #
  # @param value [String] character data
  def characters(value)
    return if @current.nil?
    return if @current[:coordinates].nil?
    @current[:word] ||= ''
    @current[:word] += value
    @text += value
  end

  # Callback for element end; flushes word state or appends a line
  # ending, depending on the last recorded element class.
  #
  # @param _name [String] element name.
  def end_element(_name)
    end_line if @element_class_name == 'ocr_line'
    end_word if @element_class_name == 'ocrx_word'
  end

  # Callback for completion of parsing hOCR, used to normalize generated
  # text content (strip unneeded whitespace incidental to output).
  def end_document
    # remove leading/trailing spaces on each line
    @text = @text.split("\n").map(&:strip).join("\n")
    # collapse runs of line breaks
    @text.gsub!(/\n+/, "\n")
    # String#delete returns a copy; the bang form is required to actually
    # drop carriage returns from the buffer.
    @text.delete!("\r")
    # remove trailing whitespace at end of buffer
    @text.strip!
  end

  private

  # Extract the integers of the `bbox` property from an hOCR title string.
  # The field that starts with `bbox ` is used wherever it appears among
  # the `;`-separated properties; when none does, the field at
  # `default_index` is parsed instead.
  #
  # @param title [String] value of an element's `title` attribute
  # @param default_index [Integer] field to fall back on
  # @return [Array<Integer>]
  def bbox_values(title, default_index)
    fields = title.split(';')
    field = fields.find { |f| f.lstrip.start_with?('bbox ') } || fields[default_index]
    field.split('bbox ')[-1].split(' ').map(&:to_i)
  end
end
|
140
|
+
|
141
|
+
# Build a reader and parse the given hOCR straight away.
#
# @param html [String] hOCR markup, or a path to a file containing it;
#   a value that does not look like markup (see #isxml?) is read as a path.
def initialize(html)
  @source = html
  @source = File.read(html) unless isxml?(html)
  @doc_stream = HOCRDocStream.new
  Nokogiri::HTML::SAX::Parser.new(doc_stream).parse(@source)
end
|
150
|
+
|
151
|
+
# Tell markup apart from a file path by its first non-whitespace character.
#
# @param xml [String] either path to xml file or xml source
# @return [true, false] true when the value begins (after leading
#   whitespace) with `<`, i.e. looks like XML/HTML rather than a path
def isxml?(xml)
  xml.lstrip[0] == '<'
end
|
158
|
+
|
159
|
+
# Serialize the parsed words, together with the page dimensions, through
# WordCoordsBuilder.
#
# @return [String] JSON serialization of flattened word coordinates
def json
  stream = @doc_stream
  IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
    words: stream.words,
    width: stream.width,
    height: stream.height
  )
end
|
170
|
+
end
|
171
|
+
end
|
172
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'json'
require 'open3'
require 'shellwords'
require 'tmpdir'
|
4
|
+
|
5
|
+
# --
|
6
|
+
module IiifPrint
|
7
|
+
# Module for text extraction (OCR or otherwise)
|
8
|
+
module TextExtraction
|
9
|
+
# Runs Tesseract over a single page image and exposes the result as plain
# text, word-coordinate JSON and ALTO XML. OCR is run lazily, the first
# time #words or #plain is asked for.
class PageOCR
  attr_accessor :html, :path

  # @param path [String] path to the page image to OCR
  # @param additional_tessearct_options [String, nil] extra command-line
  #   options appended to the tesseract invocation
  def initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options)
    @path = path
    # hOCR html:
    @html = nil
    @words = nil
    @source_meta = nil
    @box = nil
    @plain = nil
    @additional_tessearct_options = additional_tessearct_options
  end

  # Run tesseract in hOCR mode against the current path.
  #
  # The command is executed as an argument vector, not a shell string, so
  # paths containing spaces or shell metacharacters are passed through
  # verbatim instead of being re-parsed by a shell.
  #
  # @return [String] path of the hOCR file tesseract is asked to write
  def run_ocr
    outfile = File.join(Dir.mktmpdir, 'output_html')
    Open3.capture2(*ocr_command(outfile))
    "#{outfile}.hocr"
  end

  # Preprocess the image, OCR it, and cache words and plain text.
  def load_words
    preprocess_image
    html_path = run_ocr
    reader = IiifPrint::TextExtraction::HOCRReader.new(html_path)
    @words = reader.words
    @plain = reader.text
  end

  # @return [Array<Hash>] word hashes as produced by HOCRReader#words
  def words
    load_words if @words.nil?
    @words
  end

  # @return [String] JSON word coordinates, built by WordCoordsBuilder
  def word_json
    IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
      words: words,
      width: width,
      height: height
    )
  end

  # @return [String] plain text as produced by HOCRReader#text
  def plain
    load_words if @plain.nil?
    @plain
  end

  # Image metadata for the current path, fetched once and then cached.
  def identify
    @source_meta ||= IiifPrint::ImageTool.new(@path).metadata
  end

  def width
    identify[:width]
  end

  def height
    identify[:height]
  end

  # @return [String] ALTO XML rendered from the OCR words
  def alto
    writer = IiifPrint::TextExtraction::RenderAlto.new(width, height)
    writer.to_alto(words)
  end

  private

  # Argument vector for the tesseract call. Additional options are split
  # with shell word rules, so quoted values stay together; a nil or blank
  # option string adds nothing.
  #
  # @param outfile [String] output base name handed to tesseract
  # @return [Array<String>]
  def ocr_command(outfile)
    ['tesseract', path.to_s, outfile, 'hocr'] +
      Shellwords.split(@additional_tessearct_options.to_s)
  end

  # transform the image into a one-bit TIFF for OCR
  def preprocess_image
    tool = IiifPrint::ImageTool.new(@path)
    return if tool.metadata[:color] == 'monochrome'
    intermediate_path = File.join(Dir.mktmpdir, 'monochrome-interim.tif')
    tool.convert(intermediate_path, true)
    @path = intermediate_path
  end
end
|
86
|
+
end
|
87
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module IiifPrint
|
4
|
+
# Module for text extraction (OCR or otherwise)
|
5
|
+
module TextExtraction
|
6
|
+
# Renders a flat list of OCR words as a minimal ALTO document: one page,
# one text block, one text line, and a String element per word.
class RenderAlto
  # @param width [Numeric] page width, emitted as an integer
  # @param height [Numeric] page height, emitted as an integer
  # @param scaling [Numeric] factor applied to every word coordinate
  def initialize(width, height, scaling = 1.0)
    @width = width
    @height = height
    @scaling = scaling
  end

  # @param words [Array<Hash>] hashes with `:word` and `:coordinates`
  #   (x, y, width, height)
  # @return [String] serialized ALTO XML
  def to_alto(words)
    document = alto_page(@width, @height) do |xml|
      words.each do |word|
        xml.String(string_attributes(word)) { xml.text '' }
      end
    end
    document.to_xml
  end

  private

  # Attributes for one ALTO String element, with coordinates scaled.
  def string_attributes(word)
    coords = word[:coordinates]
    {
      CONTENT: word[:word],
      WIDTH: scale_point(coords[2]).to_s,
      HEIGHT: scale_point(coords[3]).to_s,
      HPOS: scale_point(coords[0]).to_s,
      VPOS: scale_point(coords[1]).to_s
    }
  end

  # Wrap the caller's word-emitting block in the alto root, description
  # and layout elements; returns the builder.
  def alto_page(pxwidth, pxheight, &block)
    Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
      xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
        xml.Description { xml.MeasurementUnit 'pixel' }
        alto_layout(xml, pxwidth, pxheight, &block)
      end
    end
  end

  # Coordinates are truncated to integers after scaling, although ALTO 2.1
  # declares them as xsd:float.
  def scale_point(value)
    scaled = value * @scaling
    scaled.to_i
  end

  # Emit Layout > Page > PrintSpace, then hand over to the block/line level.
  def alto_layout(xml, pxwidth, pxheight, &block)
    size = { HEIGHT: pxheight.to_i, WIDTH: pxwidth.to_i }
    xml.Layout do
      xml.Page({ ID: 'ID1', PHYSICAL_IMG_NR: '1' }.merge(size)) do
        xml.PrintSpace(size.merge(HPOS: '0', VPOS: '0')) do
          alto_blockline(xml, pxwidth, pxheight, &block)
        end
      end
    end
  end

  # Emit a single page-sized TextBlock > TextLine and yield the builder so
  # the caller can add the words.
  def alto_blockline(xml, pxwidth, pxheight)
    box = { HEIGHT: pxheight.to_i, WIDTH: pxwidth.to_i, HPOS: '0', VPOS: '0' }
    xml.TextBlock({ ID: 'ID1a' }.merge(box)) do
      xml.TextLine(box) { yield(xml) }
    end
  end
end
|
83
|
+
end
|
84
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module IiifPrint
|
2
|
+
# Module for text extraction (OCR or otherwise)
|
3
|
+
module TextExtraction
|
4
|
+
# Builds the flattened word-coordinate JSON: each distinct word maps to
# the list of coordinate sets at which it occurs.
class WordCoordsBuilder
  # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
  # @param width [Integer] the width of the "canvas" on which the words appear.
  # @param height [Integer] the height of the "canvas" on which the words appear.
  # @return [String] a JSON encoded string.
  def self.json_coordinates_for(words:, width: nil, height: nil)
    new(words, width, height).to_json
  end

  def initialize(words, width = nil, height = nil)
    @words = words
    @width = width
    @height = height
  end

  # Output JSON flattened word coordinates.
  #
  # Accepts (and ignores) the generator-state argument the JSON library
  # passes to `to_json`, so an instance can be embedded in a structure
  # given to `JSON.generate`. A nil word list is treated as empty.
  #
  # @return [String] JSON serialization of flattened word coordinates
  def to_json(*_args)
    coordinates = Array(@words).each_with_object({}) do |entry, grouped|
      (grouped[entry[:word]] ||= []) << entry[:coordinates]
    end
    payload = { width: @width, height: @height, coords: coordinates }
    JSON.generate(payload)
  end
end
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'iiif_print/text_extraction/alto_reader'
|
2
|
+
require 'iiif_print/text_extraction/hocr_reader'
|
3
|
+
require 'iiif_print/text_extraction/page_ocr'
|
4
|
+
require 'iiif_print/text_extraction/render_alto'
|
5
|
+
require 'iiif_print/text_extraction/word_coords_builder'
|
6
|
+
|
7
|
+
module IiifPrint
|
8
|
+
# Module for text extraction (OCR or otherwise)
|
9
|
+
module TextExtraction
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'iiif_print/text_formats_from_alto_service'
|
2
|
+
|
3
|
+
module IiifPrint
|
4
|
+
# Derivative service producing the text derivatives listed in
# `ocr_derivatives`: from ALTO when the ALTO service reports a usable
# `alto_path`, otherwise by running OCR on the source file.
class TextExtractionDerivativeService < BaseDerivativeService
  # @param [Hash<Symbol,Symbol>]
  #
  # The key for the hash represents the file extension. The key's value represents the instance
  # method to call on {IiifPrint::TextExtraction::PageOCR}
  class_attribute :ocr_derivatives, default: { txt: :plain, xml: :alto, json: :word_json }
  class_attribute :alto_derivative_service_class, default: IiifPrint::TextFormatsFromALTOService
  class_attribute :page_ocr_service_class, default: IiifPrint::TextExtraction::PageOCR

  def initialize(file_set)
    super(file_set)
  end

  # Delegate to the ALTO-based service when it has an ALTO path available;
  # otherwise derive everything from OCR.
  #
  # @param src [String] path to the source file
  def create_derivatives(src)
    from_alto = alto_derivative_service_class.new(file_set)
    return from_alto.create_derivatives(src) unless from_alto.alto_path.nil?
    create_derivatives_from_ocr(src)
  end

  # OCR the file once, then write one derivative per `ocr_derivatives` entry.
  #
  # @param filename [String] path to the source file
  def create_derivatives_from_ocr(filename)
    # TODO: Do we need this source_path instance variable?
    @source_path = filename
    ocr = page_ocr_service_class.new(filename)

    ocr_derivatives.each do |extension, method_name|
      path = prepare_path(extension.to_s)
      write(content: ocr.public_send(method_name), path: path)
    end
  end

  # @param content [String] data to write
  # @param path [String] destination file path
  def write(content:, path:)
    File.open(path, 'w') do |outfile|
      outfile.write(content)
    end
  end

  # Clean up the derivative for every extension in `ocr_derivatives`.
  #
  # Iterates with `each_key`: `Hash#keys` ignores a block passed to it, so
  # iterating via `keys do ... end` never reaches the superclass cleanup.
  def cleanup_derivatives(*)
    ocr_derivatives.each_key do |extension|
      super(extension.to_s)
    end
  end
end
|
47
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module IiifPrint
|
2
|
+
# Plugin to make text format derivatives (JSON, plain-text) from ALTO,
|
3
|
+
# either existing derivative, or an impending attachment.
|
4
|
+
# NOTE: to keep this from conflicting with TextExtractionDerivativeService,
|
5
|
+
# this class should be invoked by it, not PluggableDerivativeService.
|
6
|
+
class TextFormatsFromALTOService < BaseDerivativeService
  self.target_extension = 'tiff'.freeze

  # Write `data` as UTF-8 text to the derivative path for `destination`.
  #
  # @param destination [String] derivative name/extension, e.g. 'json'
  # @param data [String] content to store
  def save_derivative(destination, data)
    # Load/prepare base of "pairtree" dir structure for extension, fileset
    prepare_path(destination)
    target = derivative_path_factory.derivative_path_for_reference(@file_set, destination)
    File.open(target, "w:UTF-8") { |io| io.write(data) }
  end

  # @return [Boolean] true only for an existing file of non-zero size
  def nonempty_file?(path)
    !path.nil? && File.exist?(path) && !File.size(path).zero?
  end

  # Path of an ALTO file recorded as a pending derivative attachment for
  # this file set (one may be in transit from an ingest), or nil when
  # there is none or it is empty.
  def incoming_alto_path
    attachments = IiifPrint::DerivativeAttachment.where(
      fileset_id: @file_set.id,
      destination_name: 'xml'
    )
    candidate = attachments.pluck(:path).uniq.first
    nonempty_file?(candidate) ? candidate : nil
  end

  # Prefer existing, non-empty ALTO derivative data; fall back to an
  # incoming attachment.
  #
  # @return [String, nil]
  def alto_path
    existing = derivative_path_factory.derivative_path_for_reference(@file_set, 'xml')
    nonempty_file?(existing) ? existing : incoming_alto_path
  end

  # @return [String, nil] ALTO source read as UTF-8, or nil without a path
  def alto
    location = alto_path
    return if location.nil?
    File.read(location, encoding: 'UTF-8')
  end

  # Produce the JSON and plain-text derivatives from the ALTO source.
  # This plugin makes derivatives of a derivative, so `_filename` is ignored.
  def create_derivatives(_filename)
    alto_xml = alto
    return if alto_xml.nil?
    # Image dimensions from the characterized primary file help ensure
    # proper scaling:
    primary = @file_set.original_file
    width, height =
      if primary.nil?
        [nil, nil]
      else
        [primary.width[0].to_i, primary.height[0].to_i]
      end
    # AltoReader is responsible for transcoding; this class only saves results
    reader = IiifPrint::TextExtraction::AltoReader.new(alto_xml, width, height)
    save_derivative('json', reader.json)
    save_derivative('txt', reader.text)
  end

  def cleanup_derivatives(*_args)
    # Intentionally a no-op: IiifPrint::TextExtractionDerivativeService
    # has the job of cleaning ALTO, JSON, TXT.
  end
end
|
77
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'open3'
|
2
|
+
|
3
|
+
module IiifPrint
|
4
|
+
class TIFFDerivativeService < BaseDerivativeService
  self.target_extension = 'tiff'.freeze

  # For imagemagick commands, the output type is determined by the
  # output file's extension.

  # TIFF (LZW, 8 bit grayscale)
  GRAY_CMD = 'convert %<source_file>s -depth 8 -colorspace Gray -compress lzw %<out_file>s'.freeze

  # Monochrome one-bit black/white TIFF, Group 4 compressed:
  MONO_CMD = 'convert %<source_file>s -depth 1 -monochrome -compress Group4 -type bilevel %<out_file>s'.freeze

  # sRGB color TIFF (lzw)
  COLOR_CMD = 'convert %<source_file>s -depth 24 -compress lzw %<out_file>s'.freeze

  def initialize(file_set)
    super
  end

  # Build the conversion command. The template depends on whether the
  # material is color, grayscale or one-bit; for a PDF source only the
  # first page (`[0]`) is converted.
  #
  # @return [String] command line with source and destination filled in
  def convert_cmd
    input = @source_path.ends_with?('pdf') ? "#{@source_path}[0]" : @source_path
    template = use_color? ? COLOR_CMD : GRAY_CMD
    template = MONO_CMD if one_bit?
    format(template, source_file: input, out_file: @dest_path)
  end

  # @param filename [String] path to the source file
  def create_derivatives(filename)
    # Base class takes care of loading @source_path, @dest_path
    super

    # no creation of TIFF derivative if primary is TIFF
    return if mime_type == 'image/tiff'

    # JP2 has its own conversion; anything else goes through imagemagick
    mime_type == 'image/jp2' ? jp2_convert : im_convert
  end
end
|
50
|
+
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
module IiifPrint
|
2
|
+
module WorksControllerBehaviorDecorator
  # Hand the request's base URL (which includes the protocol) to the
  # manifest presenter; it is needed to render the facet links.
  #
  # @return the presenter returned by the overridden method, with
  #   `base_url` assigned
  def iiif_manifest_presenter
    presenter = super
    presenter.base_url = request.base_url
    presenter
  end
end
|
9
|
+
end
|