newspaper_works 0.1.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.coveralls.yml +2 -0
- data/.gitignore +4 -0
- data/.travis.yml +2 -2
- data/README.md +14 -13
- data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
- data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
- data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
- data/config/locales/newspaper_article.de.yml +1 -1
- data/config/locales/newspaper_article.en.yml +1 -1
- data/config/locales/newspaper_article.es.yml +1 -1
- data/config/locales/newspaper_article.fr.yml +1 -1
- data/config/locales/newspaper_article.it.yml +1 -1
- data/config/locales/newspaper_article.pt-BR.yml +1 -1
- data/config/locales/newspaper_article.zh.yml +1 -1
- data/config/locales/newspaper_container.de.yml +1 -1
- data/config/locales/newspaper_container.en.yml +1 -1
- data/config/locales/newspaper_container.es.yml +1 -1
- data/config/locales/newspaper_container.fr.yml +1 -1
- data/config/locales/newspaper_container.it.yml +1 -1
- data/config/locales/newspaper_container.pt-BR.yml +1 -1
- data/config/locales/newspaper_container.zh.yml +1 -1
- data/config/locales/newspaper_issue.de.yml +1 -1
- data/config/locales/newspaper_issue.en.yml +1 -1
- data/config/locales/newspaper_issue.es.yml +1 -1
- data/config/locales/newspaper_issue.fr.yml +1 -1
- data/config/locales/newspaper_issue.it.yml +2 -2
- data/config/locales/newspaper_issue.pt-BR.yml +2 -2
- data/config/locales/newspaper_issue.zh.yml +2 -2
- data/config/locales/newspaper_page.de.yml +1 -1
- data/config/locales/newspaper_page.en.yml +1 -1
- data/config/locales/newspaper_page.es.yml +1 -1
- data/config/locales/newspaper_page.fr.yml +1 -1
- data/config/locales/newspaper_page.it.yml +1 -1
- data/config/locales/newspaper_page.pt-BR.yml +1 -1
- data/config/locales/newspaper_page.zh.yml +1 -1
- data/config/locales/newspaper_title.de.yml +1 -1
- data/config/locales/newspaper_title.en.yml +1 -1
- data/config/locales/newspaper_title.es.yml +1 -1
- data/config/locales/newspaper_title.fr.yml +1 -1
- data/config/locales/newspaper_title.it.yml +1 -1
- data/config/locales/newspaper_title.pt-BR.yml +1 -1
- data/config/locales/newspaper_title.zh.yml +1 -1
- data/config/locales/newspaper_works.de.yml +98 -0
- data/config/locales/newspaper_works.en.yml +67 -0
- data/config/locales/newspaper_works.es.yml +96 -0
- data/config/locales/newspaper_works.fr.yml +97 -0
- data/config/locales/newspaper_works.it.yml +90 -0
- data/config/locales/newspaper_works.pt-BR.yml +96 -0
- data/config/locales/newspaper_works.zh.yml +90 -0
- data/config/vendor/fits.xml +55 -0
- data/config/vendor/imagemagick-6-policy.xml +39 -39
- data/lib/newspaper_works.rb +2 -0
- data/lib/newspaper_works/image_tool.rb +119 -0
- data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
- data/lib/newspaper_works/text_extraction.rb +1 -0
- data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
- data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
- data/lib/newspaper_works/version.rb +1 -1
- data/newspaper_works.gemspec +2 -3
- data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
- data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
- data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
- data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
- data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
- data/spec/spec_helper.rb +19 -0
- metadata +21 -22
@@ -1,100 +1,86 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'open3'
|
3
|
-
require '
|
3
|
+
require 'tmpdir'
|
4
4
|
|
5
5
|
# --
|
6
6
|
module NewspaperWorks
|
7
7
|
# Module for text extraction (OCR or otherwise)
|
8
8
|
module TextExtraction
|
9
9
|
class PageOCR
|
10
|
-
|
11
|
-
new(path).alto
|
12
|
-
end
|
10
|
+
attr_accessor :html, :path
|
13
11
|
|
14
12
|
def initialize(path)
|
15
13
|
@path = path
|
14
|
+
# hOCR html:
|
15
|
+
@html = nil
|
16
16
|
@words = nil
|
17
|
-
@processor = "mini_magick"
|
18
17
|
@source_meta = nil
|
19
|
-
@use_gm = extension.start_with?('jp2')
|
20
18
|
@box = nil
|
21
19
|
@plain = nil
|
22
20
|
end
|
23
21
|
|
24
|
-
def
|
25
|
-
|
22
|
+
def run_ocr
|
23
|
+
outfile = File.join(Dir.mktmpdir, 'output_html')
|
24
|
+
cmd = "tesseract #{path} #{outfile} hocr"
|
25
|
+
`#{cmd}`
|
26
|
+
outfile + '.hocr'
|
26
27
|
end
|
27
28
|
|
28
|
-
def
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
end
|
35
|
-
else
|
36
|
-
@box = RTesseract::Box.new(@path, processor: @processor)
|
37
|
-
@plain = @box.to_s
|
38
|
-
end
|
39
|
-
end
|
40
|
-
@box
|
29
|
+
def load_words
|
30
|
+
preprocess_image
|
31
|
+
html_path = run_ocr
|
32
|
+
reader = NewspaperWorks::TextExtraction::HOCRReader.new(html_path)
|
33
|
+
@words = reader.words
|
34
|
+
@plain = reader.text
|
41
35
|
end
|
42
36
|
|
43
37
|
def words
|
44
|
-
|
38
|
+
load_words if @words.nil?
|
45
39
|
@words
|
46
40
|
end
|
47
41
|
|
48
|
-
def normalized_coordinate(word)
|
49
|
-
{
|
50
|
-
word: word[:word],
|
51
|
-
coordinates: [
|
52
|
-
word[:x_start],
|
53
|
-
word[:y_start],
|
54
|
-
(word[:x_end] - word[:x_start]),
|
55
|
-
(word[:y_end] - word[:y_start])
|
56
|
-
]
|
57
|
-
}
|
58
|
-
end
|
59
|
-
|
60
42
|
def word_json
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
43
|
+
builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(
|
44
|
+
words,
|
45
|
+
width,
|
46
|
+
height
|
47
|
+
)
|
65
48
|
builder.to_json
|
66
49
|
end
|
67
50
|
|
68
51
|
def plain
|
69
|
-
|
52
|
+
load_words if @plain.nil?
|
70
53
|
@plain
|
71
54
|
end
|
72
55
|
|
73
56
|
def identify
|
74
|
-
|
75
|
-
|
76
|
-
cmd = "identify -verbose #{path}"
|
77
|
-
cmd = 'gm ' + cmd if @use_gm
|
78
|
-
lines = `#{cmd}`.lines
|
79
|
-
geo = lines.select { |line| line.strip.start_with?('Geometry') }[0]
|
80
|
-
img_geo = geo.strip.split(':')[-1].strip.split('+')[0]
|
81
|
-
@source_geometry = img_geo.split('x').map(&:to_i)
|
82
|
-
end
|
83
|
-
@source_geometry
|
57
|
+
return @source_meta unless @source_meta.nil?
|
58
|
+
@source_meta = NewspaperWorks::ImageTool.new(@path).metadata
|
84
59
|
end
|
85
60
|
|
86
61
|
def width
|
87
|
-
identify[
|
62
|
+
identify[:width]
|
88
63
|
end
|
89
64
|
|
90
65
|
def height
|
91
|
-
identify[
|
66
|
+
identify[:height]
|
92
67
|
end
|
93
68
|
|
94
69
|
def alto
|
95
70
|
writer = NewspaperWorks::TextExtraction::RenderAlto.new(width, height)
|
96
71
|
writer.to_alto(words)
|
97
72
|
end
|
73
|
+
|
74
|
+
private
|
75
|
+
|
76
|
+
# transform the image into a one-bit TIFF for OCR
|
77
|
+
def preprocess_image
|
78
|
+
tool = NewspaperWorks::ImageTool.new(@path)
|
79
|
+
return if tool.metadata[:color] == 'monochrome'
|
80
|
+
intermediate_path = File.join(Dir.mktmpdir, 'monochrome-interim.tif')
|
81
|
+
tool.convert(intermediate_path, true)
|
82
|
+
@path = intermediate_path
|
83
|
+
end
|
98
84
|
end
|
99
85
|
end
|
100
86
|
end
|
@@ -15,10 +15,10 @@ module NewspaperWorks
|
|
15
15
|
words.each do |word|
|
16
16
|
xml.String(
|
17
17
|
CONTENT: word[:word],
|
18
|
-
|
19
|
-
|
20
|
-
HPOS: scale_point(word[:
|
21
|
-
VPOS: scale_point(word[:
|
18
|
+
WIDTH: scale_point(word[:coordinates][2]).to_s,
|
19
|
+
HEIGHT: scale_point(word[:coordinates][3]).to_s,
|
20
|
+
HPOS: scale_point(word[:coordinates][0]).to_s,
|
21
|
+
VPOS: scale_point(word[:coordinates][1]).to_s
|
22
22
|
) { xml.text '' }
|
23
23
|
end
|
24
24
|
end
|
data/newspaper_works.gemspec
CHANGED
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
|
|
11
11
|
'Eben English']
|
12
12
|
spec.email = ['sean.upton@utah.edu', 'jacob.reed@utah.edu',
|
13
13
|
'brian.mcbride@utah.edu', 'eenglish@bpl.org']
|
14
|
-
spec.homepage = 'https://github.com/
|
14
|
+
spec.homepage = 'https://github.com/samvera-labs/newspaper_works'
|
15
15
|
spec.description = 'Gem/Engine for Newspaper Works in Hyrax-based Samvera
|
16
16
|
Application.'
|
17
17
|
spec.summary = <<-SUMMARY
|
@@ -22,12 +22,11 @@ SUMMARY
|
|
22
22
|
spec.license = 'Apache-2.0'
|
23
23
|
spec.files = `git ls-files`.split($OUTPUT_RECORD_SEPARATOR)
|
24
24
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
25
|
-
spec.add_dependency 'blacklight_iiif_search'
|
25
|
+
spec.add_dependency 'blacklight_iiif_search', '~> 1.0'
|
26
26
|
spec.add_dependency 'blacklight_advanced_search', '6.4.1'
|
27
27
|
spec.add_dependency 'hyrax', '2.5.1'
|
28
28
|
spec.add_dependency 'nokogiri'
|
29
29
|
spec.add_dependency 'rails', '~> 5.1'
|
30
|
-
spec.add_dependency 'rtesseract', '~> 2.2.0'
|
31
30
|
spec.add_dependency 'sass-rails', '~> 5.0'
|
32
31
|
|
33
32
|
spec.add_development_dependency 'bixby'
|
@@ -15,7 +15,7 @@ RSpec.describe 'thumbnail_highlights', js: true do
|
|
15
15
|
visibility: "open"
|
16
16
|
)
|
17
17
|
attachment = NewspaperWorks::Data::WorkFiles.of(@work)
|
18
|
-
attachment.assign(File.join(fixture_path, '
|
18
|
+
attachment.assign(File.join(fixture_path, 'ocr_mono.tiff'))
|
19
19
|
attachment.derivatives.assign(File.join(fixture_path, 'ndnp-sample1-txt.txt'))
|
20
20
|
attachment.derivatives.assign(File.join(fixture_path, 'ndnp-sample1-json.json'))
|
21
21
|
attachment.commit!
|
@@ -0,0 +1,78 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
3
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
4
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
5
|
+
<head>
|
6
|
+
<title></title>
|
7
|
+
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
8
|
+
<meta name='ocr-system' content='tesseract 4.0.0-beta.1' />
|
9
|
+
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
10
|
+
</head>
|
11
|
+
<body>
|
12
|
+
<div class='ocr_page' id='page_1' title='image "ocr_mono.tiff"; bbox 0 0 1261 1744; ppageno 0'>
|
13
|
+
<div class='ocr_carea' id='block_1_1' title="bbox 155 22 1073 129">
|
14
|
+
<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 155 22 1073 129">
|
15
|
+
<span class='ocr_line' id='line_1_1' title="bbox 155 22 1073 129; baseline 0.003 -18; x_size 111; x_descenders 19; x_ascenders 40"><span class='ocrx_word' id='word_1_1' title='bbox 155 59 247 129; x_wconf 76'>_A</span> <span class='ocrx_word' id='word_1_2' title='bbox 272 28 600 113; x_wconf 95'>FEARFUL</span> <span class='ocrx_word' id='word_1_3' title='bbox 622 22 1073 129; x_wconf 85'>ADVENTURE.</span>
|
16
|
+
</span>
|
17
|
+
</p>
|
18
|
+
</div>
|
19
|
+
<div class='ocr_carea' id='block_1_2' title="bbox 551 152 695 161">
|
20
|
+
<p class='ocr_par' id='par_1_2' lang='eng' title="bbox 551 152 695 161">
|
21
|
+
<span class='ocr_line' id='line_1_2' title="bbox 551 152 695 161; baseline 0 0; x_size 4.5; x_descenders -2.25; x_ascenders 2.25"><span class='ocrx_word' id='word_1_4' title='bbox 551 152 695 161; x_wconf 95'> </span>
|
22
|
+
</span>
|
23
|
+
</p>
|
24
|
+
</div>
|
25
|
+
<div class='ocr_carea' id='block_1_3' title="bbox 11 174 1244 613">
|
26
|
+
<p class='ocr_par' id='par_1_3' lang='eng' title="bbox 11 174 1242 429">
|
27
|
+
<span class='ocr_line' id='line_1_3' title="bbox 69 174 1242 261; baseline 0.003 -21; x_size 70; x_descenders 18; x_ascenders 18"><span class='ocrx_word' id='word_1_5' title='bbox 69 188 193 240; x_wconf 57'>‘The</span> <span class='ocrx_word' id='word_1_6' title='bbox 225 191 464 244; x_wconf 52'>Missouri.</span> <span class='ocrx_word' id='word_1_7' title='bbox 517 174 865 261; x_wconf 92'>Republican,</span> <span class='ocrx_word' id='word_1_8' title='bbox 906 195 954 246; x_wconf 95'>in</span> <span class='ocrx_word' id='word_1_9' title='bbox 1007 211 1040 247; x_wconf 77'>a</span> <span class='ocrx_word' id='word_1_10' title='bbox 1087 195 1242 246; x_wconf 90'>letter</span>
|
28
|
+
</span>
|
29
|
+
<span class='ocr_line' id='line_1_4' title="bbox 11 270 1238 387; baseline 0.006 -63; x_size 71; x_descenders 18; x_ascenders 18"><span class='ocrx_word' id='word_1_11' title='bbox 11 270 139 326; x_wconf 96'>from</span> <span class='ocrx_word' id='word_1_12' title='bbox 167 293 199 328; x_wconf 93'>a</span> <span class='ocrx_word' id='word_1_13' title='bbox 229 277 440 329; x_wconf 95'>Kansas</span> <span class='ocrx_word' id='word_1_14' title='bbox 464 281 888 374; x_wconf 94'>correspondent,</span> <span class='ocrx_word' id='word_1_15' title='bbox 903 281 1008 387; x_wconf 96'>has</span> <span class='ocrx_word' id='word_1_16' title='bbox 1039 282 1128 333; x_wconf 93'>the</span> <span class='ocrx_word' id='word_1_17' title='bbox 1149 279 1238 332; x_wconf 92'>fol-</span>
|
30
|
+
</span>
|
31
|
+
<span class='ocr_line' id='line_1_5' title="bbox 12 361 224 429; baseline 0.021 -19; x_size 66; x_descenders 17; x_ascenders 16"><span class='ocrx_word' id='word_1_18' title='bbox 12 361 224 429; x_wconf 95'>lowing:</span>
|
32
|
+
</span>
|
33
|
+
</p>
|
34
|
+
|
35
|
+
<p class='ocr_par' id='par_1_4' lang='eng' title="bbox 11 407 1244 613">
|
36
|
+
<span class='ocr_line' id='line_1_6' title="bbox 86 407 1244 520; baseline 0.005 -19; x_size 70; x_descenders 16; x_ascenders 19"><span class='ocrx_word' id='word_1_19' title='bbox 86 452 206 501; x_wconf 95'>“At</span> <span class='ocrx_word' id='word_1_20' title='bbox 234 449 311 504; x_wconf 95'>St.</span> <span class='ocrx_word' id='word_1_21' title='bbox 339 415 567 520; x_wconf 49'>Josephs</span> <span class='ocrx_word' id='word_1_22' title='bbox 595 454 751 505; x_wconf 54'>Tsaw</span> <span class='ocrx_word' id='word_1_23' title='bbox 781 456 884 509; x_wconf 95'>Mr,</span> <span class='ocrx_word' id='word_1_24' title='bbox 915 457 982 508; x_wconf 91'>A.</span> <span class='ocrx_word' id='word_1_25' title='bbox 1011 456 1074 508; x_wconf 21'>'T.</span> <span class='ocrx_word' id='word_1_26' title='bbox 1116 407 1244 508; x_wconf 91'>Gor-</span>
|
37
|
+
</span>
|
38
|
+
<span class='ocr_line' id='line_1_7' title="bbox 11 539 1242 613; baseline 0.006 -24; x_size 65; x_descenders 14; x_ascenders 16"><span class='ocrx_word' id='word_1_27' title='bbox 11 554 154 602; x_wconf 96'>man,</span> <span class='ocrx_word' id='word_1_28' title='bbox 177 539 228 590; x_wconf 95'>of</span> <span class='ocrx_word' id='word_1_29' title='bbox 260 539 389 592; x_wconf 95'>New</span> <span class='ocrx_word' id='word_1_30' title='bbox 417 542 580 606; x_wconf 96'>York,</span> <span class='ocrx_word' id='word_1_31' title='bbox 607 544 724 596; x_wconf 96'>who</span> <span class='ocrx_word' id='word_1_32' title='bbox 752 544 859 597; x_wconf 38'>had</span> <span class='ocrx_word' id='word_1_33' title='bbox 861 546 988 613; x_wconf 38'>just</span> <span class='ocrx_word' id='word_1_34' title='bbox 1012 562 1170 598; x_wconf 85'>come:</span> <span class='ocrx_word' id='word_1_35' title='bbox 1194 546 1242 597; x_wconf 96'>in</span>
|
39
|
+
</span>
|
40
|
+
</p>
|
41
|
+
</div>
|
42
|
+
<div class='ocr_carea' id='block_1_4' title="bbox 12 625 1261 699">
|
43
|
+
<p class='ocr_par' id='par_1_5' lang='eng' title="bbox 12 625 1261 699">
|
44
|
+
<span class='ocr_line' id='line_1_8' title="bbox 12 625 1261 699; baseline 0.007 -24; x_size 66; x_descenders 16; x_ascenders 15"><span class='ocrx_word' id='word_1_36' title='bbox 12 625 140 676; x_wconf 95'>from</span> <span class='ocrx_word' id='word_1_37' title='bbox 163 627 257 679; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_38' title='bbox 287 631 576 680; x_wconf 95'>mountains</span> <span class='ocrx_word' id='word_1_39' title='bbox 599 631 650 682; x_wconf 94'>in</span> <span class='ocrx_word' id='word_1_40' title='bbox 678 632 802 683; x_wconf 89'>such</span> <span class='ocrx_word' id='word_1_41' title='bbox 824 648 855 682; x_wconf 89'>a</span> <span class='ocrx_word' id='word_1_42' title='bbox 882 636 1019 683; x_wconf 85'>state</span> <span class='ocrx_word' id='word_1_43' title='bbox 1043 633 1097 683; x_wconf 92'>of</span> <span class='ocrx_word' id='word_1_44' title='bbox 1109 637 1261 699; x_wconf 88'>pros-</span>
|
45
|
+
</span>
|
46
|
+
</p>
|
47
|
+
</div>
|
48
|
+
<div class='ocr_carea' id='block_1_5' title="bbox 11 714 1249 1742">
|
49
|
+
<p class='ocr_par' id='par_1_6' lang='eng' title="bbox 11 714 1249 1742">
|
50
|
+
<span class='ocr_line' id='line_1_9' title="bbox 12 714 1244 787; baseline 0.006 -23; x_size 69; x_descenders 17; x_ascenders 17"><span class='ocrx_word' id='word_1_45' title='bbox 12 714 209 765; x_wconf 95'>tration</span> <span class='ocrx_word' id='word_1_46' title='bbox 238 717 339 783; x_wconf 93'>and</span> <span class='ocrx_word' id='word_1_47' title='bbox 372 717 611 769; x_wconf 91'>affiiction</span> <span class='ocrx_word' id='word_1_48' title='bbox 645 734 701 769; x_wconf 96'>as</span> <span class='ocrx_word' id='word_1_49' title='bbox 746 719 896 772; x_wconf 95'>could</span> <span class='ocrx_word' id='word_1_50' title='bbox 950 722 1067 787; x_wconf 96'>only</span> <span class='ocrx_word' id='word_1_51' title='bbox 1112 719 1244 772; x_wconf 96'>have</span>
|
51
|
+
</span>
|
52
|
+
<span class='ocr_line' id='line_1_10' title="bbox 15 803 1245 882; baseline 0.005 -29; x_size 70; x_descenders 19; x_ascenders 16"><span class='ocrx_word' id='word_1_52' title='bbox 15 803 139 854; x_wconf 95'>been</span> <span class='ocrx_word' id='word_1_53' title='bbox 168 805 474 858; x_wconf 91'>occasioned</span> <span class='ocrx_word' id='word_1_54' title='bbox 514 808 582 873; x_wconf 95'>by</span> <span class='ocrx_word' id='word_1_55' title='bbox 608 807 748 859; x_wconf 96'>such</span> <span class='ocrx_word' id='word_1_56' title='bbox 795 824 1065 882; x_wconf 86'>exposure,</span> <span class='ocrx_word' id='word_1_57' title='bbox 1074 808 1245 859; x_wconf 86'>hard-</span>
|
53
|
+
</span>
|
54
|
+
<span class='ocr_line' id='line_1_11' title="bbox 11 892 1244 966; baseline 0.006 -26; x_size 71; x_descenders 20; x_ascenders 16"><span class='ocrx_word' id='word_1_58' title='bbox 11 892 125 960; x_wconf 95'>ship</span> <span class='ocrx_word' id='word_1_59' title='bbox 155 896 256 944; x_wconf 96'>and</span> <span class='ocrx_word' id='word_1_60' title='bbox 296 894 561 963; x_wconf 91'>suffering,</span> <span class='ocrx_word' id='word_1_61' title='bbox 590 913 649 947; x_wconf 91'>as</span> <span class='ocrx_word' id='word_1_62' title='bbox 699 898 925 966; x_wconf 94'>perhaps</span> <span class='ocrx_word' id='word_1_63' title='bbox 974 913 1042 949; x_wconf 91'>no</span> <span class='ocrx_word' id='word_1_64' title='bbox 1090 898 1244 948; x_wconf 91'>other</span>
|
55
|
+
</span>
|
56
|
+
<span class='ocr_line' id='line_1_12' title="bbox 14 983 1245 1053; baseline 0.005 -23; x_size 71; x_descenders 19; x_ascenders 18"><span class='ocrx_word' id='word_1_65' title='bbox 14 995 128 1030; x_wconf 96'>man</span> <span class='ocrx_word' id='word_1_66' title='bbox 156 997 279 1031; x_wconf 92'>ever</span> <span class='ocrx_word' id='word_1_67' title='bbox 306 983 560 1034; x_wconf 88'>snrvived.</span> <span class='ocrx_word' id='word_1_68' title='bbox 635 984 703 1034; x_wconf 47'>din</span> <span class='ocrx_word' id='word_1_69' title='bbox 732 999 985 1053; x_wconf 47'>company</span> <span class='ocrx_word' id='word_1_70' title='bbox 1042 984 1161 1049; x_wconf 96'>with</span> <span class='ocrx_word' id='word_1_71' title='bbox 1214 1002 1245 1036; x_wconf 93'>a</span>
|
57
|
+
</span>
|
58
|
+
<span class='ocr_line' id='line_1_13' title="bbox 15 1065 1246 1126; baseline 0.004 -11; x_size 62; x_descenders 10; x_ascenders 18"><span class='ocrx_word' id='word_1_72' title='bbox 15 1065 283 1126; x_wconf 47'>Canadian</span> <span class='ocrx_word' id='word_1_73' title='bbox 312 1066 638 1120; x_wconf 96'>Frenchman</span> <span class='ocrx_word' id='word_1_74' title='bbox 669 1071 774 1122; x_wconf 94'>and</span> <span class='ocrx_word' id='word_1_75' title='bbox 804 1077 908 1122; x_wconf 91'>two</span> <span class='ocrx_word' id='word_1_76' title='bbox 961 1069 1246 1123; x_wconf 91'>Kentucki-</span>
|
59
|
+
</span>
|
60
|
+
<span class='ocr_line' id='line_1_14' title="bbox 12 1153 1248 1223; baseline 0.006 -22; x_size 68; x_descenders 17; x_ascenders 18"><span class='ocrx_word' id='word_1_77' title='bbox 12 1169 105 1202; x_wconf 90'>ans</span> <span class='ocrx_word' id='word_1_78' title='bbox 132 1153 310 1205; x_wconf 85'>he-left</span> <span class='ocrx_word' id='word_1_79' title='bbox 337 1155 425 1205; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_80' title='bbox 452 1162 672 1223; x_wconf 96'>country</span> <span class='ocrx_word' id='word_1_81' title='bbox 724 1157 783 1208; x_wconf 96'>of</span> <span class='ocrx_word' id='word_1_82' title='bbox 832 1157 927 1208; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_83' title='bbox 972 1157 1248 1209; x_wconf 90'>Blackfeet</span>
|
61
|
+
</span>
|
62
|
+
<span class='ocr_line' id='line_1_15' title="bbox 16 1239 1246 1306; baseline 0.003 -18; x_size 69; x_descenders 15; x_ascenders 21"><span class='ocrx_word' id='word_1_84' title='bbox 16 1239 228 1311; x_wconf 88'>Indians</span> <span class='ocrx_word' id='word_1_85' title='bbox 247 1241 356 1310; x_wconf 64'>last</span> <span class='ocrx_word' id='word_1_86' title='bbox 366 1240 599 1288; x_wconf 36'>-Fall.:to</span> <span class='ocrx_word' id='word_1_87' title='bbox 623 1244 795 1315; x_wconf 86'>join.</span> <span class='ocrx_word' id='word_1_88' title='bbox 800 1243 1100 1310; x_wconf 76'>Culverson</span> <span class='ocrx_word' id='word_1_89' title='bbox 1140 1245 1246 1309; x_wconf 96'>and</span>
|
63
|
+
</span>
|
64
|
+
<span class='ocr_line' id='line_1_16' title="bbox 14 1303 1246 1393; baseline 0.003 -21; x_size 67; x_descenders 16; x_ascenders 16"><span class='ocrx_word' id='word_1_90' title='bbox 14 1309 167 1388; x_wconf 96'>party</span> <span class='ocrx_word' id='word_1_91' title='bbox 195 1307 253 1373; x_wconf 96'>at</span> <span class='ocrx_word' id='word_1_92' title='bbox 282 1305 413 1373; x_wconf 96'>Fort</span> <span class='ocrx_word' id='word_1_93' title='bbox 443 1313 623 1376; x_wconf 96'>Pierre</span> <span class='ocrx_word' id='word_1_94' title='bbox 647 1326 750 1376; x_wconf 93'>and</span> <span class='ocrx_word' id='word_1_95' title='bbox 774 1358 789 1375; x_wconf 91'>¢</span> <span class='ocrx_word' id='word_1_96' title='bbox 773 1313 1089 1393; x_wconf 96'>accompany</span> <span class='ocrx_word' id='word_1_97' title='bbox 1109 1303 1246 1376; x_wconf 96'>them</span>
|
65
|
+
</span>
|
66
|
+
<span class='ocr_line' id='line_1_17' title="bbox 15 1411 1245 1480; baseline 0.004 -19; x_size 68; x_descenders 17; x_ascenders 15"><span class='ocrx_word' id='word_1_98' title='bbox 15 1419 70 1462; x_wconf 96'>to</span> <span class='ocrx_word' id='word_1_99' title='bbox 99 1411 185 1462; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_100' title='bbox 213 1418 393 1463; x_wconf 96'>states.</span> <span class='ocrx_word' id='word_1_101' title='bbox 432 1412 579 1480; x_wconf 96'>They</span> <span class='ocrx_word' id='word_1_102' title='bbox 608 1414 816 1466; x_wconf 96'>arrived</span> <span class='ocrx_word' id='word_1_103' title='bbox 837 1420 894 1465; x_wconf 96'>at</span> <span class='ocrx_word' id='word_1_104' title='bbox 914 1414 1050 1465; x_wconf 96'>Fort</span> <span class='ocrx_word' id='word_1_105' title='bbox 1067 1414 1245 1466; x_wconf 96'>Pierre</span>
|
67
|
+
</span>
|
68
|
+
<span class='ocr_line' id='line_1_18' title="bbox 15 1497 1246 1568; baseline 0.002 -18; x_size 70; x_descenders 17; x_ascenders 18"><span class='ocrx_word' id='word_1_106' title='bbox 15 1505 120 1550; x_wconf 96'>two</span> <span class='ocrx_word' id='word_1_107' title='bbox 147 1500 276 1564; x_wconf 96'>days</span> <span class='ocrx_word' id='word_1_108' title='bbox 303 1497 436 1550; x_wconf 92'>after</span> <span class='ocrx_word' id='word_1_109' title='bbox 466 1497 794 1551; x_wconf 74'>Calverson’s</span> <span class='ocrx_word' id='word_1_110' title='bbox 821 1500 1116 1568; x_wconf 96'>departure,</span> <span class='ocrx_word' id='word_1_111' title='bbox 1145 1502 1246 1552; x_wconf 96'>and</span>
|
69
|
+
</span>
|
70
|
+
<span class='ocr_line' id='line_1_19' title="bbox 15 1586 1249 1656; baseline 0.001 -19; x_size 68; x_descenders 17; x_ascenders 17"><span class='ocrx_word' id='word_1_112' title='bbox 15 1587 222 1638; x_wconf 51'>hurried</span> <span class='ocrx_word' id='word_1_113' title='bbox 252 1603 316 1638; x_wconf 96'>on</span> <span class='ocrx_word' id='word_1_114' title='bbox 343 1586 498 1650; x_wconf 96'>after,</span> <span class='ocrx_word' id='word_1_115' title='bbox 526 1587 578 1637; x_wconf 96'>in</span> <span class='ocrx_word' id='word_1_116' title='bbox 606 1587 693 1640; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_117' title='bbox 719 1587 853 1656; x_wconf 95'>hope</span> <span class='ocrx_word' id='word_1_118' title='bbox 874 1589 929 1640; x_wconf 95'>of</span> <span class='ocrx_word' id='word_1_119' title='bbox 939 1589 1249 1656; x_wconf 96'>overtaking</span>
|
71
|
+
</span>
|
72
|
+
<span class='ocr_line' id='line_1_20' title="bbox 18 1672 1248 1742; baseline 0.001 -16; x_size 69; x_descenders 16; x_ascenders 18"><span class='ocrx_word' id='word_1_120' title='bbox 18 1675 160 1734; x_wconf 90'>him.</span> <span class='ocrx_word' id='word_1_121' title='bbox 214 1672 296 1726; x_wconf 92'>On</span> <span class='ocrx_word' id='word_1_122' title='bbox 327 1675 416 1726; x_wconf 95'>the</span> <span class='ocrx_word' id='word_1_123' title='bbox 442 1675 584 1727; x_wconf 96'>third</span> <span class='ocrx_word' id='word_1_124' title='bbox 608 1675 711 1742; x_wconf 96'>day</span> <span class='ocrx_word' id='word_1_125' title='bbox 740 1691 837 1727; x_wconf 96'>one</span> <span class='ocrx_word' id='word_1_126' title='bbox 865 1676 922 1727; x_wconf 96'>of</span> <span class='ocrx_word' id='word_1_127' title='bbox 942 1675 1090 1727; x_wconf 96'>those</span> <span class='ocrx_word' id='word_1_128' title='bbox 1110 1692 1248 1727; x_wconf 96'>snow</span>
|
73
|
+
</span>
|
74
|
+
</p>
|
75
|
+
</div>
|
76
|
+
</div>
|
77
|
+
</body>
|
78
|
+
</html>
|
@@ -0,0 +1,109 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'tmpdir'
|
3
|
+
|
4
|
+
describe NewspaperWorks::ImageTool do
|
5
|
+
let(:fixtures) { File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files') }
|
6
|
+
|
7
|
+
# Image fixtures to test identification, metadata extraction for:
|
8
|
+
let(:gray_jp2) { File.join(fixtures, 'ocr_gray.jp2') }
|
9
|
+
let(:color_jp2) { File.join(fixtures, '4.1.07.jp2') }
|
10
|
+
let(:gray_tiff) { File.join(fixtures, 'ocr_gray.tiff') }
|
11
|
+
let(:mono_tiff) { File.join(fixtures, 'ocr_mono.tiff') }
|
12
|
+
let(:color_tiff) { File.join(fixtures, '4.1.07.tiff') }
|
13
|
+
let(:pdf) { File.join(fixtures, 'minimal-1-page.pdf') }
|
14
|
+
|
15
|
+
describe "Extracts metadata with JP2 backend" do
|
16
|
+
it "constructs with a path" do
|
17
|
+
identify = described_class.new(gray_jp2)
|
18
|
+
expect(identify.path).to eq gray_jp2
|
19
|
+
end
|
20
|
+
|
21
|
+
it "gets metadata for grayscale JP2 image" do
|
22
|
+
result = described_class.new(gray_jp2).metadata
|
23
|
+
expect(result[:color]).to eq 'gray'
|
24
|
+
expect(result[:width]).to eq 418
|
25
|
+
expect(result[:height]).to eq 1046
|
26
|
+
expect(result[:bits_per_component]).to eq 8
|
27
|
+
expect(result[:num_components]).to eq 1
|
28
|
+
end
|
29
|
+
|
30
|
+
it "gets metadata for color JP2 image" do
|
31
|
+
result = described_class.new(color_jp2).metadata
|
32
|
+
expect(result[:color]).to eq 'color'
|
33
|
+
expect(result[:width]).to eq 256
|
34
|
+
expect(result[:height]).to eq 256
|
35
|
+
expect(result[:bits_per_component]).to eq 8
|
36
|
+
# e.g. is 3, but would be four if sample image had an alpha channel
|
37
|
+
expect(result[:num_components]).to eq 3
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
describe "Extracts metadata for non-JP2 images with imagemagick" do
|
42
|
+
it "gets metadata for gray TIFF image" do
|
43
|
+
result = described_class.new(gray_tiff).metadata
|
44
|
+
expect(result[:color]).to eq 'gray'
|
45
|
+
expect(result[:width]).to eq 418
|
46
|
+
expect(result[:height]).to eq 1046
|
47
|
+
expect(result[:bits_per_component]).to eq 8
|
48
|
+
expect(result[:num_components]).to eq 1
|
49
|
+
end
|
50
|
+
|
51
|
+
it "gets metadata for monochrome TIFF image" do
|
52
|
+
result = described_class.new(mono_tiff).metadata
|
53
|
+
expect(result[:color]).to eq 'monochrome'
|
54
|
+
expect(result[:width]).to eq 1261
|
55
|
+
expect(result[:height]).to eq 1744
|
56
|
+
expect(result[:bits_per_component]).to eq 1
|
57
|
+
expect(result[:num_components]).to eq 1
|
58
|
+
end
|
59
|
+
|
60
|
+
it "gets metadata for color TIFF image" do
|
61
|
+
result = described_class.new(color_tiff).metadata
|
62
|
+
expect(result[:color]).to eq 'color'
|
63
|
+
expect(result[:width]).to eq 256
|
64
|
+
expect(result[:height]).to eq 256
|
65
|
+
expect(result[:bits_per_component]).to eq 8
|
66
|
+
# e.g. is 3, but would be four if sample image had an alpha channel
|
67
|
+
expect(result[:num_components]).to eq 3
|
68
|
+
end
|
69
|
+
|
70
|
+
it "detects mime type of pdf" do
|
71
|
+
result = described_class.new(pdf).metadata
|
72
|
+
expect(result[:content_type]).to eq 'application/pdf'
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
describe "converts images" do
|
77
|
+
it "makes a monochrome TIFF from JP2" do
|
78
|
+
tool = described_class.new(gray_jp2)
|
79
|
+
dest = File.join(Dir.mktmpdir, 'mono.tif')
|
80
|
+
tool.convert(dest, true)
|
81
|
+
expect(File.exist?(dest)).to be true
|
82
|
+
expect(described_class.new(dest).metadata[:color]).to eq 'monochrome'
|
83
|
+
end
|
84
|
+
|
85
|
+
it "makes a gray TIFF from JP2" do
|
86
|
+
tool = described_class.new(gray_jp2)
|
87
|
+
dest = File.join(Dir.mktmpdir, 'gray.tif')
|
88
|
+
tool.convert(dest, false)
|
89
|
+
expect(File.exist?(dest)).to be true
|
90
|
+
expect(described_class.new(dest).metadata[:color]).to eq 'gray'
|
91
|
+
end
|
92
|
+
|
93
|
+
it "makes a monochrome TIFF from grayscale TIFF" do
|
94
|
+
tool = described_class.new(gray_tiff)
|
95
|
+
dest = File.join(Dir.mktmpdir, 'mono.tif')
|
96
|
+
tool.convert(dest, true)
|
97
|
+
expect(File.exist?(dest)).to be true
|
98
|
+
expect(described_class.new(dest).metadata[:color]).to eq 'monochrome'
|
99
|
+
end
|
100
|
+
|
101
|
+
# Not yet supported to use this tool to make JP2, for now the only
|
102
|
+
# component in NewspaperWorks doing that is
|
103
|
+
# NewspaperWorks::JP2DerivativeService
|
104
|
+
it "raises error on JP2 destination" do
|
105
|
+
expect { described_class.new(gray_tiff).convert('out.jp2') }.to \
|
106
|
+
raise_error(RuntimeError)
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
@@ -24,13 +24,13 @@ RSpec.shared_examples 'ingest adapter IO' do
|
|
24
24
|
# define the path to the file we will use for multiple examples
|
25
25
|
let(:path) do
|
26
26
|
fixtures = File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files')
|
27
|
-
File.join(fixtures, '
|
27
|
+
File.join(fixtures, 'ocr_mono.tiff')
|
28
28
|
end
|
29
29
|
|
30
30
|
# DRY for this matcher's use in multiple examples:
|
31
31
|
let(:have_io_and_correct_filename) do
|
32
32
|
have_attributes(
|
33
|
-
filename: '
|
33
|
+
filename: 'ocr_mono.tiff',
|
34
34
|
io: an_object_responding_to(:read)
|
35
35
|
)
|
36
36
|
end
|
@@ -62,7 +62,7 @@ RSpec.shared_examples 'ingest adapter IO' do
|
|
62
62
|
it "loads a StringIO with filename" do
|
63
63
|
adapter = build(:newspaper_page_ingest)
|
64
64
|
io = StringIO.new('File Content Here, Maybe')
|
65
|
-
adapter.load(io, filename: '
|
65
|
+
adapter.load(io, filename: 'ocr_mono.tiff')
|
66
66
|
expect(adapter).to have_io_and_correct_filename
|
67
67
|
end
|
68
68
|
|
@@ -8,7 +8,7 @@ RSpec.describe NewspaperWorks::Ingest::NewspaperPageIngest do
|
|
8
8
|
|
9
9
|
# define the path to the file we will use for multiple examples
|
10
10
|
let(:path) do
|
11
|
-
File.join(fixture_path, '
|
11
|
+
File.join(fixture_path, 'ocr_mono.tiff')
|
12
12
|
end
|
13
13
|
|
14
14
|
it_behaves_like('ingest adapter IO')
|
@@ -32,7 +32,7 @@ RSpec.describe NewspaperWorks::Ingest::NewspaperPageIngest do
|
|
32
32
|
|
33
33
|
def verify_pcdm_fileset(fileset)
|
34
34
|
# Hyrax always sets label (if not title) on fileset:
|
35
|
-
expect(fileset.label).to eq '
|
35
|
+
expect(fileset.label).to eq 'ocr_mono.tiff'
|
36
36
|
# reload file set and check on original file
|
37
37
|
fileset.reload
|
38
38
|
file = fileset.original_file
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe NewspaperWorks::JP2ImageMetadata do
|
4
|
+
let(:fixtures) { File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files') }
|
5
|
+
|
6
|
+
let(:gray_jp2) { File.join(fixtures, 'ocr_gray.jp2') }
|
7
|
+
|
8
|
+
let(:color_jp2) { File.join(fixtures, '4.1.07.jp2') }
|
9
|
+
|
10
|
+
describe "Extracts technical metadata from a JP2 file" do
|
11
|
+
it "constructs with a path" do
|
12
|
+
meta = described_class.new(gray_jp2)
|
13
|
+
expect(meta.path).to eq gray_jp2
|
14
|
+
end
|
15
|
+
|
16
|
+
it "gets metadata for grayscale image" do
|
17
|
+
meta = described_class.new(gray_jp2)
|
18
|
+
result = meta.technical_metadata
|
19
|
+
expect(result[:color]).to eq 'gray'
|
20
|
+
expect(result[:width]).to eq 418
|
21
|
+
expect(result[:height]).to eq 1046
|
22
|
+
expect(result[:bits_per_component]).to eq 8
|
23
|
+
expect(result[:num_components]).to eq 1
|
24
|
+
end
|
25
|
+
|
26
|
+
it "gets metadata for color image" do
|
27
|
+
meta = described_class.new(color_jp2)
|
28
|
+
result = meta.technical_metadata
|
29
|
+
expect(result[:color]).to eq 'color'
|
30
|
+
expect(result[:width]).to eq 256
|
31
|
+
expect(result[:height]).to eq 256
|
32
|
+
expect(result[:bits_per_component]).to eq 8
|
33
|
+
# e.g. is 3, but would be four if sample image had an alpha channel
|
34
|
+
expect(result[:num_components]).to eq 3
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|