iiif_print 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.coveralls.yml +2 -0
- data/.env +5 -0
- data/.fcrepo_wrapper +4 -0
- data/.github/release.yml +20 -0
- data/.github/workflows/branches.yml +24 -0
- data/.github/workflows/build-lint-test-action.yaml +33 -0
- data/.github/workflows/release_labels.yml +25 -0
- data/.gitignore +52 -0
- data/.rubocop.yml +177 -0
- data/.solr_wrapper +8 -0
- data/.travis.yml +49 -0
- data/CONTRIBUTING.md +181 -0
- data/Dockerfile +15 -0
- data/Gemfile +52 -0
- data/LICENSE +203 -0
- data/README.md +203 -0
- data/Rakefile +38 -0
- data/app/actors/iiif_print/actors/file_set_actor_decorator.rb +56 -0
- data/app/assets/config/iiif_print_manifest.js +2 -0
- data/app/assets/images/iiif_print/.keep +0 -0
- data/app/assets/javascripts/iiif_print/autocomplete_fix.js +33 -0
- data/app/assets/javascripts/iiif_print/ocr_search.js.erb +6 -0
- data/app/assets/javascripts/iiif_print.js +3 -0
- data/app/assets/stylesheets/iiif_print/_iiif_print.scss +4 -0
- data/app/assets/stylesheets/iiif_print/_issue_search.scss +13 -0
- data/app/assets/stylesheets/iiif_print/_issues_calendar.scss +18 -0
- data/app/assets/stylesheets/iiif_print/_newspapers_search.scss +38 -0
- data/app/assets/stylesheets/iiif_print/_search_results.scss +6 -0
- data/app/helpers/hyrax/iiif_helper.rb +22 -0
- data/app/helpers/iiif_print/application_helper.rb +5 -0
- data/app/helpers/iiif_print_helper.rb +64 -0
- data/app/indexers/concerns/iiif_print/child_indexer.rb +34 -0
- data/app/indexers/concerns/iiif_print/file_set_indexer.rb +29 -0
- data/app/mailers/iiif_print/application_mailer.rb +8 -0
- data/app/models/concerns/iiif_print/set_child_flag.rb +29 -0
- data/app/models/concerns/iiif_print/solr/document.rb +47 -0
- data/app/models/iiif_print/application_record.rb +6 -0
- data/app/models/iiif_print/derivative_attachment.rb +8 -0
- data/app/models/iiif_print/iiif_search_response_decorator.rb +17 -0
- data/app/models/iiif_print/ingest_file_relation.rb +14 -0
- data/app/models/iiif_print/pending_relationship.rb +7 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_behavior.rb +10 -0
- data/app/presenters/iiif_print/iiif_manifest_presenter_factory_behavior.rb +33 -0
- data/app/presenters/iiif_print/work_show_presenter_decorator.rb +29 -0
- data/app/renderers/hyrax/renderers/faceted_attribute_renderer_decorator.rb +18 -0
- data/app/search_builders/concerns/iiif_print/exclude_models.rb +17 -0
- data/app/search_builders/concerns/iiif_print/highlight_search_params.rb +14 -0
- data/app/services/iiif_print/manifest_builder_service_behavior.rb +97 -0
- data/app/services/iiif_print/pluggable_derivative_service.rb +120 -0
- data/app/views/catalog/_snippets_more.html.erb +16 -0
- data/app/views/hyrax/base/_representative_media.html.erb +9 -0
- data/app/views/hyrax/base/iiif_viewers/_universal_viewer.html.erb +8 -0
- data/app/views/hyrax/file_sets/_actions.html.erb +45 -0
- data/bin/rails +13 -0
- data/config/fcrepo_wrapper_test.yml +5 -0
- data/config/initializers/assets.rb +2 -0
- data/config/locales/iiif_print.de.yml +148 -0
- data/config/locales/iiif_print.en.yml +119 -0
- data/config/locales/iiif_print.es.yml +148 -0
- data/config/locales/iiif_print.fr.yml +149 -0
- data/config/locales/iiif_print.it.yml +142 -0
- data/config/locales/iiif_print.pt-BR.yml +148 -0
- data/config/locales/iiif_print.zh.yml +142 -0
- data/config/solr_wrapper_test.yml +9 -0
- data/config/test-fixture/solr-config/_rest_managed.json +3 -0
- data/config/test-fixture/solr-config/admin-extra.html +31 -0
- data/config/test-fixture/solr-config/elevate.xml +36 -0
- data/config/test-fixture/solr-config/mapping-ISOLatin1Accent.txt +246 -0
- data/config/test-fixture/solr-config/protwords.txt +21 -0
- data/config/test-fixture/solr-config/schema.xml +366 -0
- data/config/test-fixture/solr-config/scripts.conf +24 -0
- data/config/test-fixture/solr-config/solrconfig.xml +322 -0
- data/config/test-fixture/solr-config/spellings.txt +2 -0
- data/config/test-fixture/solr-config/stopwords.txt +58 -0
- data/config/test-fixture/solr-config/stopwords_en.txt +58 -0
- data/config/test-fixture/solr-config/synonyms.txt +31 -0
- data/config/test-fixture/solr-config/xslt/example.xsl +132 -0
- data/config/test-fixture/solr-config/xslt/example_atom.xsl +67 -0
- data/config/test-fixture/solr-config/xslt/example_rss.xsl +66 -0
- data/config/test-fixture/solr-config/xslt/luke.xsl +337 -0
- data/config/vendor/fits.xml +55 -0
- data/config/vendor/imagemagick-6-policy.xml +76 -0
- data/db/migrate/20181214181358_create_iiif_print_derivative_attachments.rb +12 -0
- data/db/migrate/20190107165909_create_iiif_print_ingest_file_relations.rb +11 -0
- data/db/migrate/20230109000000_create_iiif_print_pending_relationships.rb +11 -0
- data/docker-compose.yml +129 -0
- data/iiif_print.gemspec +43 -0
- data/lib/generators/iiif_print/assets_generator.rb +29 -0
- data/lib/generators/iiif_print/catalog_controller_generator.rb +32 -0
- data/lib/generators/iiif_print/install_generator.rb +52 -0
- data/lib/generators/iiif_print/templates/config/initializers/iiif_print.rb +22 -0
- data/lib/generators/iiif_print/templates/iiif_print.scss +1 -0
- data/lib/iiif_print/base_derivative_service.rb +113 -0
- data/lib/iiif_print/blacklight_iiif_search/annotation_decorator.rb +84 -0
- data/lib/iiif_print/catalog_search_builder.rb +31 -0
- data/lib/iiif_print/configuration.rb +99 -0
- data/lib/iiif_print/data/fileset_helper.rb +25 -0
- data/lib/iiif_print/data/path_helper.rb +40 -0
- data/lib/iiif_print/data/work_derivatives.rb +323 -0
- data/lib/iiif_print/data/work_file.rb +92 -0
- data/lib/iiif_print/data/work_files.rb +199 -0
- data/lib/iiif_print/data.rb +35 -0
- data/lib/iiif_print/engine.rb +77 -0
- data/lib/iiif_print/errors.rb +9 -0
- data/lib/iiif_print/image_tool.rb +119 -0
- data/lib/iiif_print/jobs/application_job.rb +8 -0
- data/lib/iiif_print/jobs/child_works_from_pdf_job.rb +107 -0
- data/lib/iiif_print/jobs/create_relationships_job.rb +78 -0
- data/lib/iiif_print/jp2_derivative_service.rb +118 -0
- data/lib/iiif_print/jp2_image_metadata.rb +81 -0
- data/lib/iiif_print/lineage_service.rb +41 -0
- data/lib/iiif_print/metadata.rb +125 -0
- data/lib/iiif_print/pdf_derivative_service.rb +42 -0
- data/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb +75 -0
- data/lib/iiif_print/split_pdfs/pages_into_images_service.rb +130 -0
- data/lib/iiif_print/split_pdfs/pdf_image_extraction_service.rb +85 -0
- data/lib/iiif_print/text_extraction/alto_reader.rb +123 -0
- data/lib/iiif_print/text_extraction/hocr_reader.rb +172 -0
- data/lib/iiif_print/text_extraction/page_ocr.rb +87 -0
- data/lib/iiif_print/text_extraction/render_alto.rb +84 -0
- data/lib/iiif_print/text_extraction/word_coords_builder.rb +38 -0
- data/lib/iiif_print/text_extraction.rb +11 -0
- data/lib/iiif_print/text_extraction_derivative_service.rb +47 -0
- data/lib/iiif_print/text_formats_from_alto_service.rb +77 -0
- data/lib/iiif_print/tiff_derivative_service.rb +50 -0
- data/lib/iiif_print/version.rb +3 -0
- data/lib/iiif_print/works_controller_behavior.rb +9 -0
- data/lib/iiif_print.rb +136 -0
- data/lib/tasks/set_child_works.rake +22 -0
- data/spec/.keep.txt +1 -0
- data/spec/factories/ability.rb +6 -0
- data/spec/factories/newspaper_issue.rb +7 -0
- data/spec/factories/newspaper_page.rb +7 -0
- data/spec/factories/newspaper_page_solr_document.rb +12 -0
- data/spec/factories/newspaper_title.rb +8 -0
- data/spec/factories/uploaded_pdf_file.rb +9 -0
- data/spec/factories/uploaded_txt_file.rb +9 -0
- data/spec/factories/user.rb +13 -0
- data/spec/fixtures/files/4.1.07.jp2 +0 -0
- data/spec/fixtures/files/4.1.07.tiff +0 -0
- data/spec/fixtures/files/README.md +7 -0
- data/spec/fixtures/files/alto-2-0.xsd +714 -0
- data/spec/fixtures/files/broken-truncated.pdf +0 -0
- data/spec/fixtures/files/credits.md +16 -0
- data/spec/fixtures/files/lowres-gray-via-ndnp-sample.tiff +0 -0
- data/spec/fixtures/files/minimal-1-page.pdf +0 -0
- data/spec/fixtures/files/minimal-2-page.pdf +0 -0
- data/spec/fixtures/files/minimal-alto.xml +31 -0
- data/spec/fixtures/files/ndnp-alto-sample.xml +24 -0
- data/spec/fixtures/files/ndnp-sample1-json.json +1 -0
- data/spec/fixtures/files/ndnp-sample1-txt.txt +1 -0
- data/spec/fixtures/files/ndnp-sample1.pdf +0 -0
- data/spec/fixtures/files/ocr_alto.xml +202 -0
- data/spec/fixtures/files/ocr_alto_scaled_4pts_per_px.xml +202 -0
- data/spec/fixtures/files/ocr_color.tiff +0 -0
- data/spec/fixtures/files/ocr_gray.jp2 +0 -0
- data/spec/fixtures/files/ocr_gray.tiff +0 -0
- data/spec/fixtures/files/ocr_mono.tiff +0 -0
- data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
- data/spec/fixtures/files/page1.tiff +0 -0
- data/spec/fixtures/files/sample-4page-issue.pdf +0 -0
- data/spec/fixtures/files/sample-color-newsletter.pdf +0 -0
- data/spec/fixtures/files/thumbnail.jpg +0 -0
- data/spec/helpers/hyrax/iiif_helper_spec.rb +65 -0
- data/spec/helpers/iiif_print_helper_spec.rb +43 -0
- data/spec/iiif_print/base_derivative_service_spec.rb +11 -0
- data/spec/iiif_print/blacklight_iiif_search/annotation_decorator_spec.rb +51 -0
- data/spec/iiif_print/catalog_search_builder_spec.rb +60 -0
- data/spec/iiif_print/configuration_spec.rb +67 -0
- data/spec/iiif_print/data/work_derivatives_spec.rb +245 -0
- data/spec/iiif_print/data/work_file_spec.rb +99 -0
- data/spec/iiif_print/data/work_files_spec.rb +237 -0
- data/spec/iiif_print/image_tool_spec.rb +109 -0
- data/spec/iiif_print/jobs/child_works_from_pdf_job_spec.rb +30 -0
- data/spec/iiif_print/jobs/create_relationships_job_spec.rb +17 -0
- data/spec/iiif_print/jp2_image_metadata_spec.rb +37 -0
- data/spec/iiif_print/lineage_service_spec.rb +13 -0
- data/spec/iiif_print/metadata_spec.rb +115 -0
- data/spec/iiif_print/split_pdfs/pages_into_images_service_spec.rb +6 -0
- data/spec/iiif_print/text_extraction/alto_reader_spec.rb +49 -0
- data/spec/iiif_print/text_extraction/hocr_reader_spec.rb +45 -0
- data/spec/iiif_print/text_extraction/page_ocr_spec.rb +84 -0
- data/spec/iiif_print/text_extraction/render_alto_spec.rb +54 -0
- data/spec/iiif_print/text_extraction/word_coords_builder_spec.rb +44 -0
- data/spec/iiif_print_spec.rb +51 -0
- data/spec/misc_shared.rb +111 -0
- data/spec/models/iiif_print/derivative_attachment_spec.rb +37 -0
- data/spec/models/iiif_print/ingest_file_relation_spec.rb +56 -0
- data/spec/models/solr_document_spec.rb +14 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_behavior_spec.rb +19 -0
- data/spec/presenters/iiif_print/iiif_manifest_presenter_factory_behavior_spec.rb +49 -0
- data/spec/services/iiif_print/jp2_derivative_service_spec.rb +59 -0
- data/spec/services/iiif_print/pdf_derivative_service_spec.rb +66 -0
- data/spec/services/iiif_print/pluggable_derivative_service_spec.rb +178 -0
- data/spec/services/iiif_print/text_extraction_derivative_service_spec.rb +82 -0
- data/spec/services/iiif_print/text_formats_from_alto_service_spec.rb +127 -0
- data/spec/services/iiif_print/tiff_derivative_service_spec.rb +65 -0
- data/spec/spec_helper.rb +181 -0
- data/spec/support/controller_level_helpers.rb +28 -0
- data/spec/support/iiif_print_models.rb +127 -0
- data/spec/test_app_templates/blacklight.yml +9 -0
- data/spec/test_app_templates/fedora.yml +15 -0
- data/spec/test_app_templates/lib/generators/test_app_generator.rb +40 -0
- data/spec/test_app_templates/redis.yml +9 -0
- data/spec/test_app_templates/solr/conf/schema.xml +362 -0
- data/spec/test_app_templates/solr/conf/solrconfig.xml +322 -0
- data/spec/test_app_templates/solr.yml +7 -0
- data/tasks/iiif_print_dev.rake +34 -0
- data/tmp/.keep +0 -0
- metadata +605 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
require 'active_support/core_ext/module/delegation'
|
|
2
|
+
require 'json'
|
|
3
|
+
require 'nokogiri'
|
|
4
|
+
|
|
5
|
+
module IiifPrint
|
|
6
|
+
# Module for text extraction
|
|
7
|
+
module TextExtraction
|
|
8
|
+
# Class to obtain plain text and JSON word-coordinates from hOCR source
|
|
9
|
+
# - Coordinates in px units, unlike ALTO, which may have scaling concerns
|
|
10
|
+
class HOCRReader
|
|
11
|
+
attr_accessor :source, :doc_stream
|
|
12
|
+
delegate :text, :width, :height, :words, to: :doc_stream
|
|
13
|
+
|
|
14
|
+
# SAX Document Stream class to gather text and word tokens from hOCR
|
|
15
|
+
class HOCRDocStream < Nokogiri::XML::SAX::Document
|
|
16
|
+
attr_accessor :text, :words, :width, :height
|
|
17
|
+
|
|
18
|
+
def initialize
  super()
  # Accumulated output: the plain-text buffer and the list of word hashes
  # (each hash carries a word and its coordinates).
  @text = ''
  @words = []
  # Page width and height, to be read from the hOCR `div.ocr_page` element.
  @width = @height = nil
  # Parser state shared across #start_element, #characters and
  # #end_element: the word currently being assembled, and the class name
  # of the element most recently opened.
  @current = @element_class_name = nil
end
|
|
33
|
+
|
|
34
|
+
# Return coordinates from `span.ocrx_word` element attribute hash
|
|
35
|
+
#
|
|
36
|
+
# @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
|
|
37
|
+
# @return [Array] Array of position x, y, width, height in px.
|
|
38
|
+
def s_coords(attrs)
  # The hOCR `title` attribute is a `;`-separated property list. Locate the
  # `bbox` property wherever it appears instead of assuming it is the first
  # field, and tolerate a missing title.
  bbox = attrs['title'].to_s.match(
    /\bbbox\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)/
  )
  # Without a usable bbox return an empty Array; #word_complete? requires
  # exactly four values, so such a word is skipped instead of raising.
  return [] unless bbox
  x1, y1, x2, y2 = bbox.captures.map(&:to_i)
  # hOCR gives two corners; callers expect [hpos, vpos, width, height].
  [x1, y1, x2 - x1, y2 - y1]
end
|
|
48
|
+
|
|
49
|
+
# Consider element for processing?
|
|
50
|
+
# - `div.ocr_page` — to get page width/height
|
|
51
|
+
# - `span.ocr_line` — to help make plain text readable
|
|
52
|
+
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
|
|
53
|
+
# @param name [String] Element name
|
|
54
|
+
# @param class_name [String] HTML class name
|
|
55
|
+
# @return [Boolean] true if element should be processed; otherwise false
|
|
56
|
+
def consider?(name, class_name)
  # Build a CSS-like `element.class` selector and test it against the
  # three selectors this stream cares about.
  %w[div.ocr_page span.ocr_line span.ocrx_word].include?("#{name}.#{class_name}")
end
|
|
60
|
+
|
|
61
|
+
# Begin tracking a new word: reset the current-word state and record the
# coordinates parsed from the element attributes.
#
# @param attrs [Hash] attributes of the `span.ocrx_word` element
def start_word(attrs)
  # :word stays nil until #characters supplies the text content.
  @current = { word: nil }
  @current[:coordinates] = s_coords(attrs)
end
|
|
67
|
+
|
|
68
|
+
# Record the page dimensions from a `div.ocr_page` element.
#
# The `bbox` property is located anywhere in the `;`-separated `title`
# attribute, so the result does not depend on it being the second field.
# When no bbox is present, width and height are left untouched.
#
# @param attrs [Hash] attributes of the `div.ocr_page` element
def start_page(attrs)
  bbox = attrs['title'].to_s.match(/\bbbox\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/)
  return unless bbox
  # As before, the third and fourth bbox values are used as width and height.
  _x1, _y1, @width, @height = bbox.captures.map(&:to_i)
end
|
|
76
|
+
|
|
77
|
+
# Is the word currently being tracked ready to be recorded?
#
# @return [Boolean, nil] truthy only when a word is being tracked, it has
#   non-empty text, and it has exactly four coordinate values
def word_complete?
  return false if @current.nil?
  text = @current[:word]
  box = @current[:coordinates]
  text && !text.empty? && box.size == 4
end
|
|
82
|
+
|
|
83
|
+
# Finish the current word: pad the plain-text buffer with a separating
# space, then record the word if it is complete.
def end_word
  @text += ' '
  return unless word_complete?
  @words << @current
end
|
|
88
|
+
|
|
89
|
+
# Finish the current line: drop surrounding whitespace from the plain-text
# buffer and terminate it with a line break.
def end_line
  @text = "#{@text.strip}\n"
end
|
|
95
|
+
|
|
96
|
+
# Callback for element start, ignores elements except for:
|
|
97
|
+
# - `div.ocr_page` — to get page width/height
|
|
98
|
+
# - `span.ocr_line` — to help make plain text readable
|
|
99
|
+
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
|
|
100
|
+
#
|
|
101
|
+
# @param name [String] element name.
|
|
102
|
+
# @param attrs [Array] Array of key, value pair Arrays.
|
|
103
|
+
def start_element(name, attrs = [])
  attributes = attrs.to_h
  @element_class_name = attributes['class']
  relevant = consider?(name, @element_class_name)
  # Keep one entry per open element so #end_element can tell which
  # element is closing. A single "last opened class" variable is
  # overwritten by nested start tags and never restored.
  (@open_elements ||= []).push(relevant ? @element_class_name : nil)
  return unless relevant
  start_word(attributes) if @element_class_name == 'ocrx_word'
  start_page(attributes) if @element_class_name == 'ocr_page'
end

# Callback for text content; only text seen while a word is being tracked
# is kept, both on the word itself and in the plain-text buffer.
#
# @param value [String] text content reported by the parser
def characters(value)
  return if @current.nil?
  return if @current[:coordinates].nil?
  @current[:word] ||= ''
  @current[:word] += value
  @text += value
end

# Callback for element end; flushes word state when a word closes and
# appends a line ending to the plain text when a line closes.
#
# @param _name [String] element name (unused; the class recorded by
#   #start_element for the closing element decides what to do)
def end_element(_name)
  @element_class_name = (@open_elements ||= []).pop
  case @element_class_name
  when 'ocr_line'
    end_line
  when 'ocrx_word'
    end_word
    # Stop tracking the word, so that later text and enclosing close tags
    # can neither alter it nor record it a second time.
    @current = nil
  end
end
|
|
127
|
+
|
|
128
|
+
# Callback for completion of parsing hOCR, used to normalize generated
|
|
129
|
+
# text content (strip unneeded whitespace incidental to output).
|
|
130
|
+
def end_document
  # Remove carriage returns first. String#delete returns a new string, so
  # its result must be used; previously it was discarded.
  without_cr = @text.delete("\r")
  # Strip whitespace around every line and drop empty lines. This also
  # removes excess line breaks and whitespace at either end of the buffer.
  @text = without_cr.split("\n").map(&:strip).reject(&:empty?).join("\n")
end
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
# Construct with either path or HTML [String]
|
|
142
|
+
#
|
|
143
|
+
# @param html [String] hOCR markup, or the path to an hOCR file; the document is parsed immediately
|
|
144
|
+
def initialize(html)
  # Markup is used as given; anything else is treated as a path to read.
  @source = if isxml?(html)
              html
            else
              File.read(html)
            end
  @doc_stream = HOCRDocStream.new
  # Parse right away so text, words, width and height are available
  # from the stream as soon as construction finishes.
  Nokogiri::HTML::SAX::Parser.new(doc_stream).parse(@source)
end
|
|
150
|
+
|
|
151
|
+
# Determine if source parameter is path or xml/html
|
|
152
|
+
#
|
|
153
|
+
# @param xml [String] either path to xml file or xml source
|
|
154
|
+
# @return [true, false] true if value appears to be XML/HTML, not path
|
|
155
|
+
def isxml?(xml)
  # Markup is recognised by its first non-whitespace character.
  first_char = xml.lstrip[0]
  first_char == '<'
end
|
|
158
|
+
|
|
159
|
+
# Output JSON flattened word coordinates
|
|
160
|
+
#
|
|
161
|
+
# @return [String] JSON serialization of flattened word coordinates
|
|
162
|
+
def json
  stream = @doc_stream
  IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
    words: stream.words,
    width: stream.width,
    height: stream.height
  )
end
|
|
170
|
+
end
|
|
171
|
+
end
|
|
172
|
+
end
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
require 'json'
require 'open3'
require 'shellwords'
require 'tmpdir'
|
|
4
|
+
|
|
5
|
+
# --
|
|
6
|
+
module IiifPrint
|
|
7
|
+
# Module for text extraction (OCR or otherwise)
|
|
8
|
+
module TextExtraction
|
|
9
|
+
# Runs the `tesseract` command against a single page image and exposes the
# result as plain text, word-coordinate JSON, or ALTO XML. OCR runs lazily,
# the first time words or plain text are requested.
class PageOCR
  attr_accessor :html, :path

  # @param path [String] path to the source image
  # @param additional_tessearct_options [String, nil] extra arguments
  #   appended verbatim to the tesseract command line
  def initialize(path, additional_tessearct_options: IiifPrint.config.additional_tessearct_options)
    @path = path
    # hOCR html:
    @html = nil
    @words = nil
    @source_meta = nil
    @box = nil
    @plain = nil
    @additional_tessearct_options = additional_tessearct_options
  end

  # Run tesseract in hOCR mode against #path.
  #
  # The input and output paths are shell-escaped, so spaces or shell
  # metacharacters in a path cannot break or alter the command. The
  # additional options string is still handed to the shell as-is, so it
  # may carry several arguments.
  #
  # NOTE: the temporary directory is not removed here, because the caller
  # reads the returned file afterwards. The command's exit status is not
  # checked.
  #
  # @return [String] path where tesseract is expected to write its hOCR
  def run_ocr
    outfile = File.join(Dir.mktmpdir, 'output_html')
    cmd = "tesseract #{Shellwords.escape(path.to_s)} #{Shellwords.escape(outfile)} hocr"
    cmd += " #{@additional_tessearct_options}" if @additional_tessearct_options.present?
    `#{cmd}`
    outfile + '.hocr'
  end

  # Preprocess the image, run OCR, and cache the words and plain text
  # read from the resulting hOCR file.
  def load_words
    preprocess_image
    html_path = run_ocr
    reader = IiifPrint::TextExtraction::HOCRReader.new(html_path)
    @words = reader.words
    @plain = reader.text
  end

  # @return [Array<Hash>] word hashes from the hOCR reader (runs OCR on first use)
  def words
    load_words if @words.nil?
    @words
  end

  # @return [String] JSON serialization of word coordinates and image size
  def word_json
    IiifPrint::TextExtraction::WordCoordsBuilder.json_coordinates_for(
      words: words,
      width: width,
      height: height
    )
  end

  # @return [String] plain text from the hOCR reader (runs OCR on first use)
  def plain
    load_words if @plain.nil?
    @plain
  end

  # @return [Hash] image metadata from IiifPrint::ImageTool, memoized
  def identify
    @source_meta ||= IiifPrint::ImageTool.new(@path).metadata
  end

  def width
    identify[:width]
  end

  def height
    identify[:height]
  end

  # @return [String] ALTO XML rendering of the OCR words
  def alto
    writer = IiifPrint::TextExtraction::RenderAlto.new(width, height)
    writer.to_alto(words)
  end

  private

  # transform the image into a one-bit TIFF for OCR; afterwards @path
  # points at the intermediate file. Skipped for monochrome sources.
  def preprocess_image
    tool = IiifPrint::ImageTool.new(@path)
    return if tool.metadata[:color] == 'monochrome'
    intermediate_path = File.join(Dir.mktmpdir, 'monochrome-interim.tif')
    tool.convert(intermediate_path, true)
    @path = intermediate_path
  end
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
require 'nokogiri'
|
|
2
|
+
|
|
3
|
+
module IiifPrint
|
|
4
|
+
# Module for text extraction (OCR or otherwise)
|
|
5
|
+
module TextExtraction
|
|
6
|
+
# Serializes OCR word hashes as an ALTO XML page made of a single text
# block holding a single text line.
class RenderAlto
  # @param width [Numeric] page width in pixels
  # @param height [Numeric] page height in pixels
  # @param scaling [Numeric] factor applied to every word coordinate
  def initialize(width, height, scaling = 1.0)
    @width = width
    @height = height
    @scaling = scaling
  end

  # @param words [Array<Hash>] hashes with `:word` and `:coordinates`
  #   (read as hpos, vpos, width, height by index 0..3)
  # @return [String] ALTO XML
  def to_alto(words)
    document = alto_page(@width, @height) do |xml|
      words.each do |entry|
        box = entry[:coordinates]
        word_attrs = {
          CONTENT: entry[:word],
          WIDTH: scale_point(box[2]).to_s,
          HEIGHT: scale_point(box[3]).to_s,
          HPOS: scale_point(box[0]).to_s,
          VPOS: scale_point(box[1]).to_s
        }
        xml.String(word_attrs) { xml.text '' }
      end
    end
    document.to_xml
  end

  private

  # Build the document skeleton; the given block emits the words and is
  # passed down to the innermost text line.
  def alto_page(pxwidth, pxheight, &block)
    Nokogiri::XML::Builder.new(encoding: 'UTF-8') do |xml|
      xml.alto(xmlns: 'http://www.loc.gov/standards/alto/ns-v2#') do
        xml.Description { xml.MeasurementUnit 'pixel' }
        alto_layout(xml, pxwidth, pxheight, &block)
      end
    end
  end

  # NOTE: ALTO 2.1 types coordinates as xsd:float; output is simplified
  # to a truncated integer.
  def scale_point(value)
    scaled = value * @scaling
    scaled.to_i
  end

  # Emit Layout > Page > PrintSpace, each spanning the whole page.
  def alto_layout(xml, pxwidth, pxheight, &block)
    page_attrs = {
      ID: 'ID1',
      PHYSICAL_IMG_NR: '1',
      HEIGHT: pxheight.to_i,
      WIDTH: pxwidth.to_i
    }
    xml.Layout do
      xml.Page(page_attrs) do
        xml.PrintSpace(full_page_box(pxwidth, pxheight)) do
          alto_blockline(xml, pxwidth, pxheight, &block)
        end
      end
    end
  end

  # Emit one TextBlock containing one TextLine, then yield the builder so
  # the caller can add the words inside that line.
  def alto_blockline(xml, pxwidth, pxheight)
    block_attrs = { ID: 'ID1a' }.merge(full_page_box(pxwidth, pxheight))
    xml.TextBlock(block_attrs) do
      xml.TextLine(full_page_box(pxwidth, pxheight)) do
        yield(xml)
      end
    end
  end

  # Attributes for a box anchored at the origin and covering the page.
  def full_page_box(pxwidth, pxheight)
    { HEIGHT: pxheight.to_i, WIDTH: pxwidth.to_i, HPOS: '0', VPOS: '0' }
  end
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
module IiifPrint
|
|
2
|
+
# Module for text extraction (OCR or otherwise)
|
|
3
|
+
module TextExtraction
|
|
4
|
+
# Builds the "flattened" word-coordinate JSON: every distinct word maps
# to the list of coordinate sets where it occurs.
class WordCoordsBuilder
  # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
  # @param width [Integer] the width of the "canvas" on which the words appear.
  # @param height [Integer] the height of the "canvas" on which the words appear.
  # @return [String] a JSON encoded string.
  def self.json_coordinates_for(words:, width: nil, height: nil)
    new(words, width, height).to_json
  end

  def initialize(words, width = nil, height = nil)
    @words = words
    @width = width
    @height = height
  end

  # Output JSON flattened word coordinates
  #
  # @return [String] JSON serialization of flattened word coordinates
  def to_json
    grouped = @words.each_with_object({}) do |entry, by_word|
      (by_word[entry[:word]] ||= []) << entry[:coordinates]
    end
    JSON.generate(width: @width, height: @height, coords: grouped)
  end
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
require 'iiif_print/text_extraction/alto_reader'
|
|
2
|
+
require 'iiif_print/text_extraction/hocr_reader'
|
|
3
|
+
require 'iiif_print/text_extraction/page_ocr'
|
|
4
|
+
require 'iiif_print/text_extraction/render_alto'
|
|
5
|
+
require 'iiif_print/text_extraction/word_coords_builder'
|
|
6
|
+
|
|
7
|
+
module IiifPrint
|
|
8
|
+
# Module for text extraction (OCR or otherwise)
|
|
9
|
+
module TextExtraction
|
|
10
|
+
end
|
|
11
|
+
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
require 'iiif_print/text_formats_from_alto_service'
|
|
2
|
+
|
|
3
|
+
module IiifPrint
|
|
4
|
+
class TextExtractionDerivativeService < BaseDerivativeService
|
|
5
|
+
# @param [Hash<Symbol,Symbol>]
|
|
6
|
+
#
|
|
7
|
+
# The key for the hash represents the file extension. The key's value represents the instance
|
|
8
|
+
# method to call on {IiifPrint::TextExtraction::PageOCR}
|
|
9
|
+
class_attribute :ocr_derivatives, default: { txt: :plain, xml: :alto, json: :word_json }
|
|
10
|
+
class_attribute :alto_derivative_service_class, default: IiifPrint::TextFormatsFromALTOService
|
|
11
|
+
class_attribute :page_ocr_service_class, default: IiifPrint::TextExtraction::PageOCR
|
|
12
|
+
# Pass-through constructor; all setup is done by the base class.
#
# @param file_set the file set handed on to the base derivative service
def initialize(file_set)
  super
end
|
|
15
|
+
|
|
16
|
+
# Create the text derivatives, preferring ALTO when the ALTO service
# reports a path, and falling back to OCR otherwise.
#
# @param src [String] path to the source file
def create_derivatives(src)
  alto_service = alto_derivative_service_class.new(file_set)
  if alto_service.alto_path.nil?
    create_derivatives_from_ocr(src)
  else
    alto_service.create_derivatives(src)
  end
end
|
|
23
|
+
|
|
24
|
+
# Run the page OCR service on the file and write one derivative per
# entry of `ocr_derivatives` (extension => OCR method to call).
#
# @param filename [String] path to the source file
def create_derivatives_from_ocr(filename)
  # TODO: Do we need this source_path instance variable?
  @source_path = filename
  page_ocr = page_ocr_service_class.new(filename)

  ocr_derivatives.each_pair do |extension, ocr_method|
    destination = prepare_path(extension.to_s)
    write(content: page_ocr.public_send(ocr_method), path: destination)
  end
end
|
|
34
|
+
|
|
35
|
+
# Write content to path, creating the file or replacing what is there.
#
# @param content [String] data to write
# @param path [String] destination file path
def write(content:, path:)
  File.write(path, content)
end
|
|
40
|
+
|
|
41
|
+
# Clean up every OCR-generated derivative by calling the base class
# cleanup once per extension listed in `ocr_derivatives`.
def cleanup_derivatives(*)
  # Hash#keys ignores a block, so the previous `keys do ... end` never ran
  # its body. Iterate explicitly instead.
  ocr_derivatives.each_key do |extension|
    super(extension.to_s)
  end
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
module IiifPrint
|
|
2
|
+
# Plugin to make text format derivatives (JSON, plain-text) from ALTO,
|
|
3
|
+
# either existing derivative, or an impending attachment.
|
|
4
|
+
# NOTE: to keep this from conflicting with TextExtractionDerivativeService,
|
|
5
|
+
# this class should be invoked by it, not PluggableDerivativeService.
|
|
6
|
+
class TextFormatsFromALTOService < BaseDerivativeService
|
|
7
|
+
self.target_extension = 'tiff'.freeze
|
|
8
|
+
|
|
9
|
+
# Write derivative data for the file set as UTF-8 text.
#
# @param destination [String] destination name (e.g. 'json', 'txt')
# @param data [String] content to store
def save_derivative(destination, data)
  # Load/prepare base of "pairtree" dir structure for extension, fileset
  prepare_path(destination)
  target = derivative_path_factory.derivative_path_for_reference(@file_set, destination)
  # Write data as UTF-8 encoded text
  File.open(target, "w:UTF-8") { |io| io.write(data) }
end
|
|
22
|
+
|
|
23
|
+
# @param path [String, nil] candidate file path
# @return [Boolean] true only when path is given, exists, and has a
#   non-zero size
def nonempty_file?(path)
  return false if path.nil?
  # File.size? is nil for a missing or zero-length file
  !File.size?(path).nil?
end
|
|
28
|
+
|
|
29
|
+
# if there was no derivative yet, there might be one in-transit from
|
|
30
|
+
# an ingest, so check for that, and use its source if applicable:
|
|
31
|
+
def incoming_alto_path
  attachments = IiifPrint::DerivativeAttachment.where(
    fileset_id: @file_set.id,
    destination_name: 'xml'
  )
  candidate = attachments.pluck(:path).uniq.first
  # only a path naming an existing, non-empty file is returned
  nonempty_file?(candidate) ? candidate : nil
end
|
|
38
|
+
|
|
39
|
+
# Path to ALTO XML for the file set: the existing `xml` derivative when it
# is a non-empty file, otherwise whatever #incoming_alto_path returns.
#
# @return [String, nil]
def alto_path
  existing = derivative_path_factory.derivative_path_for_reference(@file_set, 'xml')
  nonempty_file?(existing) ? existing : incoming_alto_path
end
|
|
48
|
+
|
|
49
|
+
# @return [String, nil] the ALTO file's content read as UTF-8, or nil
#   when no ALTO path is available
def alto
  source = alto_path
  return if source.nil?
  File.read(source, encoding: 'UTF-8')
end
|
|
53
|
+
|
|
54
|
+
# Derive JSON and plain-text files from the ALTO XML; does nothing when
# no ALTO is available.
#
# @param _filename ignored, as this plugin makes derivatives of a derivative
def create_derivatives(_filename)
  alto_xml = alto
  return if alto_xml.nil?
  # Image size from the characterized primary file helps ensure proper
  # scaling; both values are nil when there is no original file.
  primary = @file_set.original_file
  width = primary.nil? ? nil : primary.width[0].to_i
  height = primary.nil? ? nil : primary.height[0].to_i
  # The reader does the transcoding; this class just saves the results.
  reader = IiifPrint::TextExtraction::AltoReader.new(alto_xml, width, height)
  save_derivative('json', reader.json)
  save_derivative('txt', reader.text)
end
|
|
71
|
+
|
|
72
|
+
# Intentionally a no-op: IiifPrint::TextExtractionDerivativeService has
# the job of cleaning ALTO, JSON and TXT derivatives instead.
def cleanup_derivatives(*_args); end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
require 'open3'
|
|
2
|
+
|
|
3
|
+
module IiifPrint
|
|
4
|
+
class TIFFDerivativeService < BaseDerivativeService
|
|
5
|
+
self.target_extension = 'tiff'.freeze
|
|
6
|
+
|
|
7
|
+
# For imagemagick commands, the output type is determined by the
|
|
8
|
+
# output file's extension.
|
|
9
|
+
# TIFF (LZW, 8 bit grayscale)
|
|
10
|
+
GRAY_CMD = 'convert %<source_file>s ' \
|
|
11
|
+
'-depth 8 -colorspace Gray ' \
|
|
12
|
+
'-compress lzw %<out_file>s'.freeze
|
|
13
|
+
|
|
14
|
+
# Monochrome one-bit black/white TIFF, Group 4 compressed:
|
|
15
|
+
MONO_CMD = 'convert %<source_file>s ' \
|
|
16
|
+
'-depth 1 -monochrome -compress Group4 -type bilevel ' \
|
|
17
|
+
'%<out_file>s'.freeze
|
|
18
|
+
|
|
19
|
+
# sRGB color TIFF (8 bits per channel, LZW)
|
|
20
|
+
COLOR_CMD = 'convert %<source_file>s ' \
|
|
21
|
+
'-depth 24 ' \
|
|
22
|
+
'-compress lzw %<out_file>s'.freeze
|
|
23
|
+
|
|
24
|
+
# Pass-through constructor; all setup is done by the base class.
#
# @param file_set the file set handed on to the base derivative service
def initialize(file_set)
  super
end
|
|
27
|
+
|
|
28
|
+
# Get conversion command; command varies on whether or not we have
|
|
29
|
+
# JP2 source, and whether we have color or grayscale material.
|
|
30
|
+
def convert_cmd
  # For a PDF source, `[0]` is appended to the source path.
  input = @source_path.ends_with?('pdf') ? "#{@source_path}[0]" : @source_path
  color = use_color?
  # The one-bit template takes precedence over color and gray.
  template = if one_bit?
               MONO_CMD
             elsif color
               COLOR_CMD
             else
               GRAY_CMD
             end
  format(template, source_file: input, out_file: @dest_path)
end
|
|
37
|
+
|
|
38
|
+
# Create a TIFF derivative unless the primary file is already a TIFF.
#
# @param filename [String] path to the source file
def create_derivatives(filename)
  # Base class takes care of loading @source_path, @dest_path
  super(filename)

  source_type = mime_type
  # no creation of TIFF derivative if primary is TIFF
  return if source_type == 'image/tiff'

  # JP2 sources have their own conversion; everything else goes through
  # the imagemagick command.
  source_type == 'image/jp2' ? jp2_convert : im_convert
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
module IiifPrint
|
|
2
|
+
module WorksControllerBehaviorDecorator
  # Extends the presenter with the request's base url, which includes the
  # protocol. The base url is needed to render the facet links.
  #
  # @return the presenter returned by `super`, with `base_url` assigned
  def iiif_manifest_presenter
    presenter = super
    presenter.base_url = request.base_url
    presenter
  end
end
|
|
9
|
+
end
|