newspaper_works 0.1.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.coveralls.yml +2 -0
- data/.gitignore +4 -0
- data/.travis.yml +2 -2
- data/README.md +14 -13
- data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
- data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
- data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
- data/config/locales/newspaper_article.de.yml +1 -1
- data/config/locales/newspaper_article.en.yml +1 -1
- data/config/locales/newspaper_article.es.yml +1 -1
- data/config/locales/newspaper_article.fr.yml +1 -1
- data/config/locales/newspaper_article.it.yml +1 -1
- data/config/locales/newspaper_article.pt-BR.yml +1 -1
- data/config/locales/newspaper_article.zh.yml +1 -1
- data/config/locales/newspaper_container.de.yml +1 -1
- data/config/locales/newspaper_container.en.yml +1 -1
- data/config/locales/newspaper_container.es.yml +1 -1
- data/config/locales/newspaper_container.fr.yml +1 -1
- data/config/locales/newspaper_container.it.yml +1 -1
- data/config/locales/newspaper_container.pt-BR.yml +1 -1
- data/config/locales/newspaper_container.zh.yml +1 -1
- data/config/locales/newspaper_issue.de.yml +1 -1
- data/config/locales/newspaper_issue.en.yml +1 -1
- data/config/locales/newspaper_issue.es.yml +1 -1
- data/config/locales/newspaper_issue.fr.yml +1 -1
- data/config/locales/newspaper_issue.it.yml +2 -2
- data/config/locales/newspaper_issue.pt-BR.yml +2 -2
- data/config/locales/newspaper_issue.zh.yml +2 -2
- data/config/locales/newspaper_page.de.yml +1 -1
- data/config/locales/newspaper_page.en.yml +1 -1
- data/config/locales/newspaper_page.es.yml +1 -1
- data/config/locales/newspaper_page.fr.yml +1 -1
- data/config/locales/newspaper_page.it.yml +1 -1
- data/config/locales/newspaper_page.pt-BR.yml +1 -1
- data/config/locales/newspaper_page.zh.yml +1 -1
- data/config/locales/newspaper_title.de.yml +1 -1
- data/config/locales/newspaper_title.en.yml +1 -1
- data/config/locales/newspaper_title.es.yml +1 -1
- data/config/locales/newspaper_title.fr.yml +1 -1
- data/config/locales/newspaper_title.it.yml +1 -1
- data/config/locales/newspaper_title.pt-BR.yml +1 -1
- data/config/locales/newspaper_title.zh.yml +1 -1
- data/config/locales/newspaper_works.de.yml +98 -0
- data/config/locales/newspaper_works.en.yml +67 -0
- data/config/locales/newspaper_works.es.yml +96 -0
- data/config/locales/newspaper_works.fr.yml +97 -0
- data/config/locales/newspaper_works.it.yml +90 -0
- data/config/locales/newspaper_works.pt-BR.yml +96 -0
- data/config/locales/newspaper_works.zh.yml +90 -0
- data/config/vendor/fits.xml +55 -0
- data/config/vendor/imagemagick-6-policy.xml +39 -39
- data/lib/newspaper_works.rb +2 -0
- data/lib/newspaper_works/image_tool.rb +119 -0
- data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
- data/lib/newspaper_works/text_extraction.rb +1 -0
- data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
- data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
- data/lib/newspaper_works/version.rb +1 -1
- data/newspaper_works.gemspec +2 -3
- data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
- data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
- data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
- data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
- data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
- data/spec/spec_helper.rb +19 -0
- metadata +21 -22
@@ -1,100 +1,86 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'open3'
|
3
|
-
require '
|
3
|
+
require 'tmpdir'
|
4
4
|
|
5
5
|
# --
|
6
6
|
module NewspaperWorks
|
7
7
|
# Module for text extraction (OCR or otherwise)
|
8
8
|
module TextExtraction
|
9
9
|
class PageOCR
|
10
|
-
|
11
|
-
new(path).alto
|
12
|
-
end
|
10
|
+
attr_accessor :html, :path
|
13
11
|
|
14
12
|
def initialize(path)
|
15
13
|
@path = path
|
14
|
+
# hOCR html:
|
15
|
+
@html = nil
|
16
16
|
@words = nil
|
17
|
-
@processor = "mini_magick"
|
18
17
|
@source_meta = nil
|
19
|
-
@use_gm = extension.start_with?('jp2')
|
20
18
|
@box = nil
|
21
19
|
@plain = nil
|
22
20
|
end
|
23
21
|
|
24
|
-
def
|
25
|
-
|
22
|
+
def run_ocr
|
23
|
+
outfile = File.join(Dir.mktmpdir, 'output_html')
|
24
|
+
cmd = "tesseract #{path} #{outfile} hocr"
|
25
|
+
`#{cmd}`
|
26
|
+
outfile + '.hocr'
|
26
27
|
end
|
27
28
|
|
28
|
-
def
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
end
|
35
|
-
else
|
36
|
-
@box = RTesseract::Box.new(@path, processor: @processor)
|
37
|
-
@plain = @box.to_s
|
38
|
-
end
|
39
|
-
end
|
40
|
-
@box
|
29
|
+
def load_words
|
30
|
+
preprocess_image
|
31
|
+
html_path = run_ocr
|
32
|
+
reader = NewspaperWorks::TextExtraction::HOCRReader.new(html_path)
|
33
|
+
@words = reader.words
|
34
|
+
@plain = reader.text
|
41
35
|
end
|
42
36
|
|
43
37
|
def words
|
44
|
-
|
38
|
+
load_words if @words.nil?
|
45
39
|
@words
|
46
40
|
end
|
47
41
|
|
48
|
-
def normalized_coordinate(word)
|
49
|
-
{
|
50
|
-
word: word[:word],
|
51
|
-
coordinates: [
|
52
|
-
word[:x_start],
|
53
|
-
word[:y_start],
|
54
|
-
(word[:x_end] - word[:x_start]),
|
55
|
-
(word[:y_end] - word[:y_start])
|
56
|
-
]
|
57
|
-
}
|
58
|
-
end
|
59
|
-
|
60
42
|
def word_json
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
43
|
+
builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(
|
44
|
+
words,
|
45
|
+
width,
|
46
|
+
height
|
47
|
+
)
|
65
48
|
builder.to_json
|
66
49
|
end
|
67
50
|
|
68
51
|
def plain
|
69
|
-
|
52
|
+
load_words if @plain.nil?
|
70
53
|
@plain
|
71
54
|
end
|
72
55
|
|
73
56
|
def identify
|
74
|
-
|
75
|
-
|
76
|
-
cmd = "identify -verbose #{path}"
|
77
|
-
cmd = 'gm ' + cmd if @use_gm
|
78
|
-
lines = `#{cmd}`.lines
|
79
|
-
geo = lines.select { |line| line.strip.start_with?('Geometry') }[0]
|
80
|
-
img_geo = geo.strip.split(':')[-1].strip.split('+')[0]
|
81
|
-
@source_geometry = img_geo.split('x').map(&:to_i)
|
82
|
-
end
|
83
|
-
@source_geometry
|
57
|
+
return @source_meta unless @source_meta.nil?
|
58
|
+
@source_meta = NewspaperWorks::ImageTool.new(@path).metadata
|
84
59
|
end
|
85
60
|
|
86
61
|
def width
|
87
|
-
identify[
|
62
|
+
identify[:width]
|
88
63
|
end
|
89
64
|
|
90
65
|
def height
|
91
|
-
identify[
|
66
|
+
identify[:height]
|
92
67
|
end
|
93
68
|
|
94
69
|
def alto
|
95
70
|
writer = NewspaperWorks::TextExtraction::RenderAlto.new(width, height)
|
96
71
|
writer.to_alto(words)
|
97
72
|
end
|
73
|
+
|
74
|
+
private
|
75
|
+
|
76
|
+
# transform the image into a one-bit TIFF for OCR
|
77
|
+
def preprocess_image
|
78
|
+
tool = NewspaperWorks::ImageTool.new(@path)
|
79
|
+
return if tool.metadata[:color] == 'monochrome'
|
80
|
+
intermediate_path = File.join(Dir.mktmpdir, 'monochrome-interim.tif')
|
81
|
+
tool.convert(intermediate_path, true)
|
82
|
+
@path = intermediate_path
|
83
|
+
end
|
98
84
|
end
|
99
85
|
end
|
100
86
|
end
|
@@ -15,10 +15,10 @@ module NewspaperWorks
|
|
15
15
|
words.each do |word|
|
16
16
|
xml.String(
|
17
17
|
CONTENT: word[:word],
|
18
|
-
|
19
|
-
|
20
|
-
HPOS: scale_point(word[:
|
21
|
-
VPOS: scale_point(word[:
|
18
|
+
WIDTH: scale_point(word[:coordinates][2]).to_s,
|
19
|
+
HEIGHT: scale_point(word[:coordinates][3]).to_s,
|
20
|
+
HPOS: scale_point(word[:coordinates][0]).to_s,
|
21
|
+
VPOS: scale_point(word[:coordinates][1]).to_s
|
22
22
|
) { xml.text '' }
|
23
23
|
end
|
24
24
|
end
|
data/newspaper_works.gemspec
CHANGED
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
|
|
11
11
|
'Eben English']
|
12
12
|
spec.email = ['sean.upton@utah.edu', 'jacob.reed@utah.edu',
|
13
13
|
'brian.mcbride@utah.edu', 'eenglish@bpl.org']
|
14
|
-
spec.homepage = 'https://github.com/
|
14
|
+
spec.homepage = 'https://github.com/samvera-labs/newspaper_works'
|
15
15
|
spec.description = 'Gem/Engine for Newspaper Works in Hyrax-based Samvera
|
16
16
|
Application.'
|
17
17
|
spec.summary = <<-SUMMARY
|
@@ -22,12 +22,11 @@ SUMMARY
|
|
22
22
|
spec.license = 'Apache-2.0'
|
23
23
|
spec.files = `git ls-files`.split($OUTPUT_RECORD_SEPARATOR)
|
24
24
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
25
|
-
spec.add_dependency 'blacklight_iiif_search'
|
25
|
+
spec.add_dependency 'blacklight_iiif_search', '~> 1.0'
|
26
26
|
spec.add_dependency 'blacklight_advanced_search', '6.4.1'
|
27
27
|
spec.add_dependency 'hyrax', '2.5.1'
|
28
28
|
spec.add_dependency 'nokogiri'
|
29
29
|
spec.add_dependency 'rails', '~> 5.1'
|
30
|
-
spec.add_dependency 'rtesseract', '~> 2.2.0'
|
31
30
|
spec.add_dependency 'sass-rails', '~> 5.0'
|
32
31
|
|
33
32
|
spec.add_development_dependency 'bixby'
|
@@ -15,7 +15,7 @@ RSpec.describe 'thumbnail_highlights', js: true do
|
|
15
15
|
visibility: "open"
|
16
16
|
)
|
17
17
|
attachment = NewspaperWorks::Data::WorkFiles.of(@work)
|
18
|
-
attachment.assign(File.join(fixture_path, '
|
18
|
+
attachment.assign(File.join(fixture_path, 'ocr_mono.tiff'))
|
19
19
|
attachment.derivatives.assign(File.join(fixture_path, 'ndnp-sample1-txt.txt'))
|
20
20
|
attachment.derivatives.assign(File.join(fixture_path, 'ndnp-sample1-json.json'))
|
21
21
|
attachment.commit!
|
@@ -0,0 +1,78 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
3
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
4
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
5
|
+
<head>
|
6
|
+
<title></title>
|
7
|
+
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
8
|
+
<meta name='ocr-system' content='tesseract 4.0.0-beta.1' />
|
9
|
+
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
10
|
+
</head>
|
11
|
+
<body>
|
12
|
+
<div class='ocr_page' id='page_1' title='image "ocr_mono.tiff"; bbox 0 0 1261 1744; ppageno 0'>
|
13
|
+
<div class='ocr_carea' id='block_1_1' title="bbox 155 22 1073 129">
|
14
|
+
<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 155 22 1073 129">
|
15
|
+
<span class='ocr_line' id='line_1_1' title="bbox 155 22 1073 129; baseline 0.003 -18; x_size 111; x_descenders 19; x_ascenders 40"><span class='ocrx_word' id='word_1_1' title='bbox 155 59 247 129; x_wconf 76'>_A</span> <span class='ocrx_word' id='word_1_2' title='bbox 272 28 600 113; x_wconf 95'>FEARFUL</span> <span class='ocrx_word' id='word_1_3' title='bbox 622 22 1073 129; x_wconf 85'>ADVENTURE.</span>
|
16
|
+
</span>
|
17
|
+
</p>
|
18
|
+
</div>
|
19
|
+
<div class='ocr_carea' id='block_1_2' title="bbox 551 152 695 161">
|
20
|
+
<p class='ocr_par' id='par_1_2' lang='eng' title="bbox 551 152 695 161">
|
21
|
+
<span class='ocr_line' id='line_1_2' title="bbox 551 152 695 161; baseline 0 0; x_size 4.5; x_descenders -2.25; x_ascenders 2.25"><span class='ocrx_word' id='word_1_4' title='bbox 551 152 695 161; x_wconf 95'> </span>
|
22
|
+
</span>
|
23
|
+
</p>
|
24
|
+
</div>
|
25
|
+
<div class='ocr_carea' id='block_1_3' title="bbox 11 174 1244 613">
|
26
|
+
<p class='ocr_par' id='par_1_3' lang='eng' title="bbox 11 174 1242 429">
|
27
|
+
<span class='ocr_line' id='line_1_3' title="bbox 69 174 1242 261; baseline 0.003 -21; x_size 70; x_descenders 18; x_ascenders 18"><span class='ocrx_word' id='word_1_5' title='bbox 69 188 193 240; x_wconf 57'>‘The</span> <span class='ocrx_word' id='word_1_6' title='bbox 225 191 464 244; x_wconf 52'>Missouri.</span> <span class='ocrx_word' id='word_1_7' title='bbox 517 174 865 261; x_wconf 92'>Republican,</span> <span class='ocrx_word' id='word_1_8' title='bbox 906 195 954 246; x_wconf 95'>in</span> <span class='ocrx_word' id='word_1_9' title='bbox 1007 211 1040 247; x_wconf 77'>a</span> <span class='ocrx_word' id='word_1_10' title='bbox 1087 195 1242 246; x_wconf 90'>letter</span>
|
28
|
+
</span>
|
29
|
+
<span class='ocr_line' id='line_1_4' title="bbox 11 270 1238 387; baseline 0.006 -63; x_size 71; x_descenders 18; x_ascenders 18"><span class='ocrx_word' id='word_1_11' title='bbox 11 270 139 326; x_wconf 96'>from</span> <span class='ocrx_word' id='word_1_12' title='bbox 167 293 199 328; x_wconf 93'>a</span> <span class='ocrx_word' id='word_1_13' title='bbox 229 277 440 329; x_wconf 95'>Kansas</span> <span class='ocrx_word' id='word_1_14' title='bbox 464 281 888 374; x_wconf 94'>correspondent,</span> <span class='ocrx_word' id='word_1_15' title='bbox 903 281 1008 387; x_wconf 96'>has</span> <span class='ocrx_word' id='word_1_16' title='bbox 1039 282 1128 333; x_wconf 93'>the</span> <span class='ocrx_word' id='word_1_17' title='bbox 1149 279 1238 332; x_wconf 92'>fol-</span>
|
30
|
+
</span>
|
31
|
+
<span class='ocr_line' id='line_1_5' title="bbox 12 361 224 429; baseline 0.021 -19; x_size 66; x_descenders 17; x_ascenders 16"><span class='ocrx_word' id='word_1_18' title='bbox 12 361 224 429; x_wconf 95'>lowing:</span>
|
32
|
+
</span>
|
33
|
+
</p>
|
34
|
+
|
35
|
+
<p class='ocr_par' id='par_1_4' lang='eng' title="bbox 11 407 1244 613">
|
36
|
+
<span class='ocr_line' id='line_1_6' title="bbox 86 407 1244 520; baseline 0.005 -19; x_size 70; x_descenders 16; x_ascenders 19"><span class='ocrx_word' id='word_1_19' title='bbox 86 452 206 501; x_wconf 95'>“At</span> <span class='ocrx_word' id='word_1_20' title='bbox 234 449 311 504; x_wconf 95'>St.</span> <span class='ocrx_word' id='word_1_21' title='bbox 339 415 567 520; x_wconf 49'>Josephs</span> <span class='ocrx_word' id='word_1_22' title='bbox 595 454 751 505; x_wconf 54'>Tsaw</span> <span class='ocrx_word' id='word_1_23' title='bbox 781 456 884 509; x_wconf 95'>Mr,</span> <span class='ocrx_word' id='word_1_24' title='bbox 915 457 982 508; x_wconf 91'>A.</span> <span class='ocrx_word' id='word_1_25' title='bbox 1011 456 1074 508; x_wconf 21'>'T.</span> <span class='ocrx_word' id='word_1_26' title='bbox 1116 407 1244 508; x_wconf 91'>Gor-</span>
|
37
|
+
</span>
|
38
|
+
<span class='ocr_line' id='line_1_7' title="bbox 11 539 1242 613; baseline 0.006 -24; x_size 65; x_descenders 14; x_ascenders 16"><span class='ocrx_word' id='word_1_27' title='bbox 11 554 154 602; x_wconf 96'>man,</span> <span class='ocrx_word' id='word_1_28' title='bbox 177 539 228 590; x_wconf 95'>of</span> <span class='ocrx_word' id='word_1_29' title='bbox 260 539 389 592; x_wconf 95'>New</span> <span class='ocrx_word' id='word_1_30' title='bbox 417 542 580 606; x_wconf 96'>York,</span> <span class='ocrx_word' id='word_1_31' title='bbox 607 544 724 596; x_wconf 96'>who</span> <span class='ocrx_word' id='word_1_32' title='bbox 752 544 859 597; x_wconf 38'>had</span> <span class='ocrx_word' id='word_1_33' title='bbox 861 546 988 613; x_wconf 38'>just</span> <span class='ocrx_word' id='word_1_34' title='bbox 1012 562 1170 598; x_wconf 85'>come:</span> <span class='ocrx_word' id='word_1_35' title='bbox 1194 546 1242 597; x_wconf 96'>in</span>
|
39
|
+
</span>
|
40
|
+
</p>
|
41
|
+
</div>
|
42
|
+
<div class='ocr_carea' id='block_1_4' title="bbox 12 625 1261 699">
|
43
|
+
<p class='ocr_par' id='par_1_5' lang='eng' title="bbox 12 625 1261 699">
|
44
|
+
<span class='ocr_line' id='line_1_8' title="bbox 12 625 1261 699; baseline 0.007 -24; x_size 66; x_descenders 16; x_ascenders 15"><span class='ocrx_word' id='word_1_36' title='bbox 12 625 140 676; x_wconf 95'>from</span> <span class='ocrx_word' id='word_1_37' title='bbox 163 627 257 679; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_38' title='bbox 287 631 576 680; x_wconf 95'>mountains</span> <span class='ocrx_word' id='word_1_39' title='bbox 599 631 650 682; x_wconf 94'>in</span> <span class='ocrx_word' id='word_1_40' title='bbox 678 632 802 683; x_wconf 89'>such</span> <span class='ocrx_word' id='word_1_41' title='bbox 824 648 855 682; x_wconf 89'>a</span> <span class='ocrx_word' id='word_1_42' title='bbox 882 636 1019 683; x_wconf 85'>state</span> <span class='ocrx_word' id='word_1_43' title='bbox 1043 633 1097 683; x_wconf 92'>of</span> <span class='ocrx_word' id='word_1_44' title='bbox 1109 637 1261 699; x_wconf 88'>pros-</span>
|
45
|
+
</span>
|
46
|
+
</p>
|
47
|
+
</div>
|
48
|
+
<div class='ocr_carea' id='block_1_5' title="bbox 11 714 1249 1742">
|
49
|
+
<p class='ocr_par' id='par_1_6' lang='eng' title="bbox 11 714 1249 1742">
|
50
|
+
<span class='ocr_line' id='line_1_9' title="bbox 12 714 1244 787; baseline 0.006 -23; x_size 69; x_descenders 17; x_ascenders 17"><span class='ocrx_word' id='word_1_45' title='bbox 12 714 209 765; x_wconf 95'>tration</span> <span class='ocrx_word' id='word_1_46' title='bbox 238 717 339 783; x_wconf 93'>and</span> <span class='ocrx_word' id='word_1_47' title='bbox 372 717 611 769; x_wconf 91'>affiiction</span> <span class='ocrx_word' id='word_1_48' title='bbox 645 734 701 769; x_wconf 96'>as</span> <span class='ocrx_word' id='word_1_49' title='bbox 746 719 896 772; x_wconf 95'>could</span> <span class='ocrx_word' id='word_1_50' title='bbox 950 722 1067 787; x_wconf 96'>only</span> <span class='ocrx_word' id='word_1_51' title='bbox 1112 719 1244 772; x_wconf 96'>have</span>
|
51
|
+
</span>
|
52
|
+
<span class='ocr_line' id='line_1_10' title="bbox 15 803 1245 882; baseline 0.005 -29; x_size 70; x_descenders 19; x_ascenders 16"><span class='ocrx_word' id='word_1_52' title='bbox 15 803 139 854; x_wconf 95'>been</span> <span class='ocrx_word' id='word_1_53' title='bbox 168 805 474 858; x_wconf 91'>occasioned</span> <span class='ocrx_word' id='word_1_54' title='bbox 514 808 582 873; x_wconf 95'>by</span> <span class='ocrx_word' id='word_1_55' title='bbox 608 807 748 859; x_wconf 96'>such</span> <span class='ocrx_word' id='word_1_56' title='bbox 795 824 1065 882; x_wconf 86'>exposure,</span> <span class='ocrx_word' id='word_1_57' title='bbox 1074 808 1245 859; x_wconf 86'>hard-</span>
|
53
|
+
</span>
|
54
|
+
<span class='ocr_line' id='line_1_11' title="bbox 11 892 1244 966; baseline 0.006 -26; x_size 71; x_descenders 20; x_ascenders 16"><span class='ocrx_word' id='word_1_58' title='bbox 11 892 125 960; x_wconf 95'>ship</span> <span class='ocrx_word' id='word_1_59' title='bbox 155 896 256 944; x_wconf 96'>and</span> <span class='ocrx_word' id='word_1_60' title='bbox 296 894 561 963; x_wconf 91'>suffering,</span> <span class='ocrx_word' id='word_1_61' title='bbox 590 913 649 947; x_wconf 91'>as</span> <span class='ocrx_word' id='word_1_62' title='bbox 699 898 925 966; x_wconf 94'>perhaps</span> <span class='ocrx_word' id='word_1_63' title='bbox 974 913 1042 949; x_wconf 91'>no</span> <span class='ocrx_word' id='word_1_64' title='bbox 1090 898 1244 948; x_wconf 91'>other</span>
|
55
|
+
</span>
|
56
|
+
<span class='ocr_line' id='line_1_12' title="bbox 14 983 1245 1053; baseline 0.005 -23; x_size 71; x_descenders 19; x_ascenders 18"><span class='ocrx_word' id='word_1_65' title='bbox 14 995 128 1030; x_wconf 96'>man</span> <span class='ocrx_word' id='word_1_66' title='bbox 156 997 279 1031; x_wconf 92'>ever</span> <span class='ocrx_word' id='word_1_67' title='bbox 306 983 560 1034; x_wconf 88'>snrvived.</span> <span class='ocrx_word' id='word_1_68' title='bbox 635 984 703 1034; x_wconf 47'>din</span> <span class='ocrx_word' id='word_1_69' title='bbox 732 999 985 1053; x_wconf 47'>company</span> <span class='ocrx_word' id='word_1_70' title='bbox 1042 984 1161 1049; x_wconf 96'>with</span> <span class='ocrx_word' id='word_1_71' title='bbox 1214 1002 1245 1036; x_wconf 93'>a</span>
|
57
|
+
</span>
|
58
|
+
<span class='ocr_line' id='line_1_13' title="bbox 15 1065 1246 1126; baseline 0.004 -11; x_size 62; x_descenders 10; x_ascenders 18"><span class='ocrx_word' id='word_1_72' title='bbox 15 1065 283 1126; x_wconf 47'>Canadian</span> <span class='ocrx_word' id='word_1_73' title='bbox 312 1066 638 1120; x_wconf 96'>Frenchman</span> <span class='ocrx_word' id='word_1_74' title='bbox 669 1071 774 1122; x_wconf 94'>and</span> <span class='ocrx_word' id='word_1_75' title='bbox 804 1077 908 1122; x_wconf 91'>two</span> <span class='ocrx_word' id='word_1_76' title='bbox 961 1069 1246 1123; x_wconf 91'>Kentucki-</span>
|
59
|
+
</span>
|
60
|
+
<span class='ocr_line' id='line_1_14' title="bbox 12 1153 1248 1223; baseline 0.006 -22; x_size 68; x_descenders 17; x_ascenders 18"><span class='ocrx_word' id='word_1_77' title='bbox 12 1169 105 1202; x_wconf 90'>ans</span> <span class='ocrx_word' id='word_1_78' title='bbox 132 1153 310 1205; x_wconf 85'>he-left</span> <span class='ocrx_word' id='word_1_79' title='bbox 337 1155 425 1205; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_80' title='bbox 452 1162 672 1223; x_wconf 96'>country</span> <span class='ocrx_word' id='word_1_81' title='bbox 724 1157 783 1208; x_wconf 96'>of</span> <span class='ocrx_word' id='word_1_82' title='bbox 832 1157 927 1208; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_83' title='bbox 972 1157 1248 1209; x_wconf 90'>Blackfeet</span>
|
61
|
+
</span>
|
62
|
+
<span class='ocr_line' id='line_1_15' title="bbox 16 1239 1246 1306; baseline 0.003 -18; x_size 69; x_descenders 15; x_ascenders 21"><span class='ocrx_word' id='word_1_84' title='bbox 16 1239 228 1311; x_wconf 88'>Indians</span> <span class='ocrx_word' id='word_1_85' title='bbox 247 1241 356 1310; x_wconf 64'>last</span> <span class='ocrx_word' id='word_1_86' title='bbox 366 1240 599 1288; x_wconf 36'>-Fall.:to</span> <span class='ocrx_word' id='word_1_87' title='bbox 623 1244 795 1315; x_wconf 86'>join.</span> <span class='ocrx_word' id='word_1_88' title='bbox 800 1243 1100 1310; x_wconf 76'>Culverson</span> <span class='ocrx_word' id='word_1_89' title='bbox 1140 1245 1246 1309; x_wconf 96'>and</span>
|
63
|
+
</span>
|
64
|
+
<span class='ocr_line' id='line_1_16' title="bbox 14 1303 1246 1393; baseline 0.003 -21; x_size 67; x_descenders 16; x_ascenders 16"><span class='ocrx_word' id='word_1_90' title='bbox 14 1309 167 1388; x_wconf 96'>party</span> <span class='ocrx_word' id='word_1_91' title='bbox 195 1307 253 1373; x_wconf 96'>at</span> <span class='ocrx_word' id='word_1_92' title='bbox 282 1305 413 1373; x_wconf 96'>Fort</span> <span class='ocrx_word' id='word_1_93' title='bbox 443 1313 623 1376; x_wconf 96'>Pierre</span> <span class='ocrx_word' id='word_1_94' title='bbox 647 1326 750 1376; x_wconf 93'>and</span> <span class='ocrx_word' id='word_1_95' title='bbox 774 1358 789 1375; x_wconf 91'>¢</span> <span class='ocrx_word' id='word_1_96' title='bbox 773 1313 1089 1393; x_wconf 96'>accompany</span> <span class='ocrx_word' id='word_1_97' title='bbox 1109 1303 1246 1376; x_wconf 96'>them</span>
|
65
|
+
</span>
|
66
|
+
<span class='ocr_line' id='line_1_17' title="bbox 15 1411 1245 1480; baseline 0.004 -19; x_size 68; x_descenders 17; x_ascenders 15"><span class='ocrx_word' id='word_1_98' title='bbox 15 1419 70 1462; x_wconf 96'>to</span> <span class='ocrx_word' id='word_1_99' title='bbox 99 1411 185 1462; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_100' title='bbox 213 1418 393 1463; x_wconf 96'>states.</span> <span class='ocrx_word' id='word_1_101' title='bbox 432 1412 579 1480; x_wconf 96'>They</span> <span class='ocrx_word' id='word_1_102' title='bbox 608 1414 816 1466; x_wconf 96'>arrived</span> <span class='ocrx_word' id='word_1_103' title='bbox 837 1420 894 1465; x_wconf 96'>at</span> <span class='ocrx_word' id='word_1_104' title='bbox 914 1414 1050 1465; x_wconf 96'>Fort</span> <span class='ocrx_word' id='word_1_105' title='bbox 1067 1414 1245 1466; x_wconf 96'>Pierre</span>
|
67
|
+
</span>
|
68
|
+
<span class='ocr_line' id='line_1_18' title="bbox 15 1497 1246 1568; baseline 0.002 -18; x_size 70; x_descenders 17; x_ascenders 18"><span class='ocrx_word' id='word_1_106' title='bbox 15 1505 120 1550; x_wconf 96'>two</span> <span class='ocrx_word' id='word_1_107' title='bbox 147 1500 276 1564; x_wconf 96'>days</span> <span class='ocrx_word' id='word_1_108' title='bbox 303 1497 436 1550; x_wconf 92'>after</span> <span class='ocrx_word' id='word_1_109' title='bbox 466 1497 794 1551; x_wconf 74'>Calverson’s</span> <span class='ocrx_word' id='word_1_110' title='bbox 821 1500 1116 1568; x_wconf 96'>departure,</span> <span class='ocrx_word' id='word_1_111' title='bbox 1145 1502 1246 1552; x_wconf 96'>and</span>
|
69
|
+
</span>
|
70
|
+
<span class='ocr_line' id='line_1_19' title="bbox 15 1586 1249 1656; baseline 0.001 -19; x_size 68; x_descenders 17; x_ascenders 17"><span class='ocrx_word' id='word_1_112' title='bbox 15 1587 222 1638; x_wconf 51'>hurried</span> <span class='ocrx_word' id='word_1_113' title='bbox 252 1603 316 1638; x_wconf 96'>on</span> <span class='ocrx_word' id='word_1_114' title='bbox 343 1586 498 1650; x_wconf 96'>after,</span> <span class='ocrx_word' id='word_1_115' title='bbox 526 1587 578 1637; x_wconf 96'>in</span> <span class='ocrx_word' id='word_1_116' title='bbox 606 1587 693 1640; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_117' title='bbox 719 1587 853 1656; x_wconf 95'>hope</span> <span class='ocrx_word' id='word_1_118' title='bbox 874 1589 929 1640; x_wconf 95'>of</span> <span class='ocrx_word' id='word_1_119' title='bbox 939 1589 1249 1656; x_wconf 96'>overtaking</span>
|
71
|
+
</span>
|
72
|
+
<span class='ocr_line' id='line_1_20' title="bbox 18 1672 1248 1742; baseline 0.001 -16; x_size 69; x_descenders 16; x_ascenders 18"><span class='ocrx_word' id='word_1_120' title='bbox 18 1675 160 1734; x_wconf 90'>him.</span> <span class='ocrx_word' id='word_1_121' title='bbox 214 1672 296 1726; x_wconf 92'>On</span> <span class='ocrx_word' id='word_1_122' title='bbox 327 1675 416 1726; x_wconf 95'>the</span> <span class='ocrx_word' id='word_1_123' title='bbox 442 1675 584 1727; x_wconf 96'>third</span> <span class='ocrx_word' id='word_1_124' title='bbox 608 1675 711 1742; x_wconf 96'>day</span> <span class='ocrx_word' id='word_1_125' title='bbox 740 1691 837 1727; x_wconf 96'>one</span> <span class='ocrx_word' id='word_1_126' title='bbox 865 1676 922 1727; x_wconf 96'>of</span> <span class='ocrx_word' id='word_1_127' title='bbox 942 1675 1090 1727; x_wconf 96'>those</span> <span class='ocrx_word' id='word_1_128' title='bbox 1110 1692 1248 1727; x_wconf 96'>snow</span>
|
73
|
+
</span>
|
74
|
+
</p>
|
75
|
+
</div>
|
76
|
+
</div>
|
77
|
+
</body>
|
78
|
+
</html>
|
@@ -0,0 +1,109 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
describe NewspaperWorks::ImageTool do
|
5
|
+
let(:fixtures) { File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files') }
|
6
|
+
|
7
|
+
# Image fixtures to test identification, metadata extraction for:
|
8
|
+
let(:gray_jp2) { File.join(fixtures, 'ocr_gray.jp2') }
|
9
|
+
let(:color_jp2) { File.join(fixtures, '4.1.07.jp2') }
|
10
|
+
let(:gray_tiff) { File.join(fixtures, 'ocr_gray.tiff') }
|
11
|
+
let(:mono_tiff) { File.join(fixtures, 'ocr_mono.tiff') }
|
12
|
+
let(:color_tiff) { File.join(fixtures, '4.1.07.tiff') }
|
13
|
+
let(:pdf) { File.join(fixtures, 'minimal-1-page.pdf') }
|
14
|
+
|
15
|
+
describe "Extracts metadata with JP2 backend" do
|
16
|
+
it "constructs with a path" do
|
17
|
+
identify = described_class.new(gray_jp2)
|
18
|
+
expect(identify.path).to eq gray_jp2
|
19
|
+
end
|
20
|
+
|
21
|
+
it "gets metadata for grayscale JP2 image" do
|
22
|
+
result = described_class.new(gray_jp2).metadata
|
23
|
+
expect(result[:color]).to eq 'gray'
|
24
|
+
expect(result[:width]).to eq 418
|
25
|
+
expect(result[:height]).to eq 1046
|
26
|
+
expect(result[:bits_per_component]).to eq 8
|
27
|
+
expect(result[:num_components]).to eq 1
|
28
|
+
end
|
29
|
+
|
30
|
+
it "gets metadata for color JP2 image" do
|
31
|
+
result = described_class.new(color_jp2).metadata
|
32
|
+
expect(result[:color]).to eq 'color'
|
33
|
+
expect(result[:width]).to eq 256
|
34
|
+
expect(result[:height]).to eq 256
|
35
|
+
expect(result[:bits_per_component]).to eq 8
|
36
|
+
# e.g. is 3, but would be four if sample image had an alpha channel
|
37
|
+
expect(result[:num_components]).to eq 3
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
describe "Extracts metadata for non-JP2 images with imagemagick" do
|
42
|
+
it "gets metadata for gray TIFF image" do
|
43
|
+
result = described_class.new(gray_tiff).metadata
|
44
|
+
expect(result[:color]).to eq 'gray'
|
45
|
+
expect(result[:width]).to eq 418
|
46
|
+
expect(result[:height]).to eq 1046
|
47
|
+
expect(result[:bits_per_component]).to eq 8
|
48
|
+
expect(result[:num_components]).to eq 1
|
49
|
+
end
|
50
|
+
|
51
|
+
it "gets metadata for monochrome TIFF image" do
|
52
|
+
result = described_class.new(mono_tiff).metadata
|
53
|
+
expect(result[:color]).to eq 'monochrome'
|
54
|
+
expect(result[:width]).to eq 1261
|
55
|
+
expect(result[:height]).to eq 1744
|
56
|
+
expect(result[:bits_per_component]).to eq 1
|
57
|
+
expect(result[:num_components]).to eq 1
|
58
|
+
end
|
59
|
+
|
60
|
+
it "gets metadata for color TIFF image" do
|
61
|
+
result = described_class.new(color_tiff).metadata
|
62
|
+
expect(result[:color]).to eq 'color'
|
63
|
+
expect(result[:width]).to eq 256
|
64
|
+
expect(result[:height]).to eq 256
|
65
|
+
expect(result[:bits_per_component]).to eq 8
|
66
|
+
# e.g. is 3, but would be four if sample image had an alpha channel
|
67
|
+
expect(result[:num_components]).to eq 3
|
68
|
+
end
|
69
|
+
|
70
|
+
it "detects mime type of pdf" do
|
71
|
+
result = described_class.new(pdf).metadata
|
72
|
+
expect(result[:content_type]).to eq 'application/pdf'
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
describe "converts images" do
|
77
|
+
it "makes a monochrome TIFF from JP2" do
|
78
|
+
tool = described_class.new(gray_jp2)
|
79
|
+
dest = File.join(Dir.mktmpdir, 'mono.tif')
|
80
|
+
tool.convert(dest, true)
|
81
|
+
expect(File.exist?(dest)).to be true
|
82
|
+
expect(described_class.new(dest).metadata[:color]).to eq 'monochrome'
|
83
|
+
end
|
84
|
+
|
85
|
+
it "makes a gray TIFF from JP2" do
|
86
|
+
tool = described_class.new(gray_jp2)
|
87
|
+
dest = File.join(Dir.mktmpdir, 'gray.tif')
|
88
|
+
tool.convert(dest, false)
|
89
|
+
expect(File.exist?(dest)).to be true
|
90
|
+
expect(described_class.new(dest).metadata[:color]).to eq 'gray'
|
91
|
+
end
|
92
|
+
|
93
|
+
it "makes a monochrome TIFF from grayscale TIFF" do
|
94
|
+
tool = described_class.new(gray_tiff)
|
95
|
+
dest = File.join(Dir.mktmpdir, 'mono.tif')
|
96
|
+
tool.convert(dest, true)
|
97
|
+
expect(File.exist?(dest)).to be true
|
98
|
+
expect(described_class.new(dest).metadata[:color]).to eq 'monochrome'
|
99
|
+
end
|
100
|
+
|
101
|
+
# Not yet supported to use this tool to make JP2, for now the only
|
102
|
+
# component in NewspaperWorks doing that is
|
103
|
+
# NewspaperWorks::JP2DerivativeService
|
104
|
+
it "raises error on JP2 destination" do
|
105
|
+
expect { described_class.new(gray_tiff).convert('out.jp2') }.to \
|
106
|
+
raise_error(RuntimeError)
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
@@ -24,13 +24,13 @@ RSpec.shared_examples 'ingest adapter IO' do
|
|
24
24
|
# define the path to the file we will use for multiple examples
|
25
25
|
let(:path) do
|
26
26
|
fixtures = File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files')
|
27
|
-
File.join(fixtures, '
|
27
|
+
File.join(fixtures, 'ocr_mono.tiff')
|
28
28
|
end
|
29
29
|
|
30
30
|
# DRY for this matcher's use in multiple examples:
|
31
31
|
let(:have_io_and_correct_filename) do
|
32
32
|
have_attributes(
|
33
|
-
filename: '
|
33
|
+
filename: 'ocr_mono.tiff',
|
34
34
|
io: an_object_responding_to(:read)
|
35
35
|
)
|
36
36
|
end
|
@@ -62,7 +62,7 @@ RSpec.shared_examples 'ingest adapter IO' do
|
|
62
62
|
it "loads a StringIO with filename" do
|
63
63
|
adapter = build(:newspaper_page_ingest)
|
64
64
|
io = StringIO.new('File Content Here, Maybe')
|
65
|
-
adapter.load(io, filename: '
|
65
|
+
adapter.load(io, filename: 'ocr_mono.tiff')
|
66
66
|
expect(adapter).to have_io_and_correct_filename
|
67
67
|
end
|
68
68
|
|
@@ -8,7 +8,7 @@ RSpec.describe NewspaperWorks::Ingest::NewspaperPageIngest do
|
|
8
8
|
|
9
9
|
# define the path to the file we will use for multiple examples
|
10
10
|
let(:path) do
|
11
|
-
File.join(fixture_path, '
|
11
|
+
File.join(fixture_path, 'ocr_mono.tiff')
|
12
12
|
end
|
13
13
|
|
14
14
|
it_behaves_like('ingest adapter IO')
|
@@ -32,7 +32,7 @@ RSpec.describe NewspaperWorks::Ingest::NewspaperPageIngest do
|
|
32
32
|
|
33
33
|
def verify_pcdm_fileset(fileset)
|
34
34
|
# Hyrax always sets label (if not title) on fileset:
|
35
|
-
expect(fileset.label).to eq '
|
35
|
+
expect(fileset.label).to eq 'ocr_mono.tiff'
|
36
36
|
# reload file set and check on original file
|
37
37
|
fileset.reload
|
38
38
|
file = fileset.original_file
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe NewspaperWorks::JP2ImageMetadata do
|
4
|
+
let(:fixtures) { File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files') }
|
5
|
+
|
6
|
+
let(:gray_jp2) { File.join(fixtures, 'ocr_gray.jp2') }
|
7
|
+
|
8
|
+
let(:color_jp2) { File.join(fixtures, '4.1.07.jp2') }
|
9
|
+
|
10
|
+
describe "Extracts technical metadata from a JP2 file" do
|
11
|
+
it "constructs with a path" do
|
12
|
+
meta = described_class.new(gray_jp2)
|
13
|
+
expect(meta.path).to eq gray_jp2
|
14
|
+
end
|
15
|
+
|
16
|
+
it "gets metadata for grayscale image" do
|
17
|
+
meta = described_class.new(gray_jp2)
|
18
|
+
result = meta.technical_metadata
|
19
|
+
expect(result[:color]).to eq 'gray'
|
20
|
+
expect(result[:width]).to eq 418
|
21
|
+
expect(result[:height]).to eq 1046
|
22
|
+
expect(result[:bits_per_component]).to eq 8
|
23
|
+
expect(result[:num_components]).to eq 1
|
24
|
+
end
|
25
|
+
|
26
|
+
it "gets metadata for color image" do
|
27
|
+
meta = described_class.new(color_jp2)
|
28
|
+
result = meta.technical_metadata
|
29
|
+
expect(result[:color]).to eq 'color'
|
30
|
+
expect(result[:width]).to eq 256
|
31
|
+
expect(result[:height]).to eq 256
|
32
|
+
expect(result[:bits_per_component]).to eq 8
|
33
|
+
# e.g. is 3, but would be four if sample image had an alpha channel
|
34
|
+
expect(result[:num_components]).to eq 3
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|