newspaper_works 0.1.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. checksums.yaml +5 -5
  2. data/.coveralls.yml +2 -0
  3. data/.gitignore +4 -0
  4. data/.travis.yml +2 -2
  5. data/README.md +14 -13
  6. data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
  7. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
  8. data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
  9. data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
  10. data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
  11. data/config/locales/newspaper_article.de.yml +1 -1
  12. data/config/locales/newspaper_article.en.yml +1 -1
  13. data/config/locales/newspaper_article.es.yml +1 -1
  14. data/config/locales/newspaper_article.fr.yml +1 -1
  15. data/config/locales/newspaper_article.it.yml +1 -1
  16. data/config/locales/newspaper_article.pt-BR.yml +1 -1
  17. data/config/locales/newspaper_article.zh.yml +1 -1
  18. data/config/locales/newspaper_container.de.yml +1 -1
  19. data/config/locales/newspaper_container.en.yml +1 -1
  20. data/config/locales/newspaper_container.es.yml +1 -1
  21. data/config/locales/newspaper_container.fr.yml +1 -1
  22. data/config/locales/newspaper_container.it.yml +1 -1
  23. data/config/locales/newspaper_container.pt-BR.yml +1 -1
  24. data/config/locales/newspaper_container.zh.yml +1 -1
  25. data/config/locales/newspaper_issue.de.yml +1 -1
  26. data/config/locales/newspaper_issue.en.yml +1 -1
  27. data/config/locales/newspaper_issue.es.yml +1 -1
  28. data/config/locales/newspaper_issue.fr.yml +1 -1
  29. data/config/locales/newspaper_issue.it.yml +2 -2
  30. data/config/locales/newspaper_issue.pt-BR.yml +2 -2
  31. data/config/locales/newspaper_issue.zh.yml +2 -2
  32. data/config/locales/newspaper_page.de.yml +1 -1
  33. data/config/locales/newspaper_page.en.yml +1 -1
  34. data/config/locales/newspaper_page.es.yml +1 -1
  35. data/config/locales/newspaper_page.fr.yml +1 -1
  36. data/config/locales/newspaper_page.it.yml +1 -1
  37. data/config/locales/newspaper_page.pt-BR.yml +1 -1
  38. data/config/locales/newspaper_page.zh.yml +1 -1
  39. data/config/locales/newspaper_title.de.yml +1 -1
  40. data/config/locales/newspaper_title.en.yml +1 -1
  41. data/config/locales/newspaper_title.es.yml +1 -1
  42. data/config/locales/newspaper_title.fr.yml +1 -1
  43. data/config/locales/newspaper_title.it.yml +1 -1
  44. data/config/locales/newspaper_title.pt-BR.yml +1 -1
  45. data/config/locales/newspaper_title.zh.yml +1 -1
  46. data/config/locales/newspaper_works.de.yml +98 -0
  47. data/config/locales/newspaper_works.en.yml +67 -0
  48. data/config/locales/newspaper_works.es.yml +96 -0
  49. data/config/locales/newspaper_works.fr.yml +97 -0
  50. data/config/locales/newspaper_works.it.yml +90 -0
  51. data/config/locales/newspaper_works.pt-BR.yml +96 -0
  52. data/config/locales/newspaper_works.zh.yml +90 -0
  53. data/config/vendor/fits.xml +55 -0
  54. data/config/vendor/imagemagick-6-policy.xml +39 -39
  55. data/lib/newspaper_works.rb +2 -0
  56. data/lib/newspaper_works/image_tool.rb +119 -0
  57. data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
  58. data/lib/newspaper_works/text_extraction.rb +1 -0
  59. data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
  60. data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
  61. data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
  62. data/lib/newspaper_works/version.rb +1 -1
  63. data/newspaper_works.gemspec +2 -3
  64. data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
  65. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  66. data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
  67. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
  68. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
  69. data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
  70. data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
  71. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
  72. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
  73. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
  74. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
  75. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
  76. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
  77. data/spec/spec_helper.rb +19 -0
  78. metadata +21 -22
@@ -1,100 +1,86 @@
1
1
  require 'json'
2
2
  require 'open3'
3
- require 'rtesseract'
3
+ require 'tmpdir'
4
4
 
5
5
  # --
6
6
  module NewspaperWorks
7
7
  # Module for text extraction (OCR or otherwise)
8
8
  module TextExtraction
9
9
  class PageOCR
10
- def self.alto_from(path)
11
- new(path).alto
12
- end
10
+ attr_accessor :html, :path
13
11
 
14
12
  def initialize(path)
15
13
  @path = path
14
+ # hOCR html:
15
+ @html = nil
16
16
  @words = nil
17
- @processor = "mini_magick"
18
17
  @source_meta = nil
19
- @use_gm = extension.start_with?('jp2')
20
18
  @box = nil
21
19
  @plain = nil
22
20
  end
23
21
 
24
- def extension
25
- @path.split('.')[-1].downcase
22
+ def run_ocr
23
+ outfile = File.join(Dir.mktmpdir, 'output_html')
24
+ cmd = "tesseract #{path} #{outfile} hocr"
25
+ `#{cmd}`
26
+ outfile + '.hocr'
26
27
  end
27
28
 
28
- def load_box
29
- if @box.nil?
30
- if @use_gm
31
- MiniMagick.with_cli(:graphicsmagick) do
32
- @box = RTesseract::Box.new(@path, processor: @processor)
33
- @plain = @box.to_s
34
- end
35
- else
36
- @box = RTesseract::Box.new(@path, processor: @processor)
37
- @plain = @box.to_s
38
- end
39
- end
40
- @box
29
+ def load_words
30
+ preprocess_image
31
+ html_path = run_ocr
32
+ reader = NewspaperWorks::TextExtraction::HOCRReader.new(html_path)
33
+ @words = reader.words
34
+ @plain = reader.text
41
35
  end
42
36
 
43
37
  def words
44
- @words = load_box.words if @words.nil?
38
+ load_words if @words.nil?
45
39
  @words
46
40
  end
47
41
 
48
- def normalized_coordinate(word)
49
- {
50
- word: word[:word],
51
- coordinates: [
52
- word[:x_start],
53
- word[:y_start],
54
- (word[:x_end] - word[:x_start]),
55
- (word[:y_end] - word[:y_start])
56
- ]
57
- }
58
- end
59
-
60
42
  def word_json
61
- save_words = words.map { |w| normalized_coordinate(w) }
62
- builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(save_words,
63
- width,
64
- height)
43
+ builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(
44
+ words,
45
+ width,
46
+ height
47
+ )
65
48
  builder.to_json
66
49
  end
67
50
 
68
51
  def plain
69
- load_box
52
+ load_words if @plain.nil?
70
53
  @plain
71
54
  end
72
55
 
73
56
  def identify
74
- if @source_geometry.nil?
75
- path = @path
76
- cmd = "identify -verbose #{path}"
77
- cmd = 'gm ' + cmd if @use_gm
78
- lines = `#{cmd}`.lines
79
- geo = lines.select { |line| line.strip.start_with?('Geometry') }[0]
80
- img_geo = geo.strip.split(':')[-1].strip.split('+')[0]
81
- @source_geometry = img_geo.split('x').map(&:to_i)
82
- end
83
- @source_geometry
57
+ return @source_meta unless @source_meta.nil?
58
+ @source_meta = NewspaperWorks::ImageTool.new(@path).metadata
84
59
  end
85
60
 
86
61
  def width
87
- identify[0]
62
+ identify[:width]
88
63
  end
89
64
 
90
65
  def height
91
- identify[1]
66
+ identify[:height]
92
67
  end
93
68
 
94
69
  def alto
95
70
  writer = NewspaperWorks::TextExtraction::RenderAlto.new(width, height)
96
71
  writer.to_alto(words)
97
72
  end
73
+
74
+ private
75
+
76
+ # transform the image into a one-bit TIFF for OCR
77
+ def preprocess_image
78
+ tool = NewspaperWorks::ImageTool.new(@path)
79
+ return if tool.metadata[:color] == 'monochrome'
80
+ intermediate_path = File.join(Dir.mktmpdir, 'monochrome-interim.tif')
81
+ tool.convert(intermediate_path, true)
82
+ @path = intermediate_path
83
+ end
98
84
  end
99
85
  end
100
86
  end
@@ -15,10 +15,10 @@ module NewspaperWorks
15
15
  words.each do |word|
16
16
  xml.String(
17
17
  CONTENT: word[:word],
18
- HEIGHT: scale_point(word[:y_end] - word[:y_start]).to_s,
19
- WIDTH: scale_point(word[:x_end] - word[:x_start]).to_s,
20
- HPOS: scale_point(word[:x_start]).to_s,
21
- VPOS: scale_point(word[:y_start]).to_s
18
+ WIDTH: scale_point(word[:coordinates][2]).to_s,
19
+ HEIGHT: scale_point(word[:coordinates][3]).to_s,
20
+ HPOS: scale_point(word[:coordinates][0]).to_s,
21
+ VPOS: scale_point(word[:coordinates][1]).to_s
22
22
  ) { xml.text '' }
23
23
  end
24
24
  end
@@ -1,3 +1,3 @@
1
1
  module NewspaperWorks
2
- VERSION = '0.1.0'.freeze
2
+ VERSION = '1.0.0'.freeze
3
3
  end
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
11
11
  'Eben English']
12
12
  spec.email = ['sean.upton@utah.edu', 'jacob.reed@utah.edu',
13
13
  'brian.mcbride@utah.edu', 'eenglish@bpl.org']
14
- spec.homepage = 'https://github.com/marriott-library/newspaper_works'
14
+ spec.homepage = 'https://github.com/samvera-labs/newspaper_works'
15
15
  spec.description = 'Gem/Engine for Newspaper Works in Hyrax-based Samvera
16
16
  Application.'
17
17
  spec.summary = <<-SUMMARY
@@ -22,12 +22,11 @@ SUMMARY
22
22
  spec.license = 'Apache-2.0'
23
23
  spec.files = `git ls-files`.split($OUTPUT_RECORD_SEPARATOR)
24
24
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
25
- spec.add_dependency 'blacklight_iiif_search'
25
+ spec.add_dependency 'blacklight_iiif_search', '~> 1.0'
26
26
  spec.add_dependency 'blacklight_advanced_search', '6.4.1'
27
27
  spec.add_dependency 'hyrax', '2.5.1'
28
28
  spec.add_dependency 'nokogiri'
29
29
  spec.add_dependency 'rails', '~> 5.1'
30
- spec.add_dependency 'rtesseract', '~> 2.2.0'
31
30
  spec.add_dependency 'sass-rails', '~> 5.0'
32
31
 
33
32
  spec.add_development_dependency 'bixby'
@@ -15,7 +15,7 @@ RSpec.describe 'thumbnail_highlights', js: true do
15
15
  visibility: "open"
16
16
  )
17
17
  attachment = NewspaperWorks::Data::WorkFiles.of(@work)
18
- attachment.assign(File.join(fixture_path, 'page1.tiff'))
18
+ attachment.assign(File.join(fixture_path, 'ocr_mono.tiff'))
19
19
  attachment.derivatives.assign(File.join(fixture_path, 'ndnp-sample1-txt.txt'))
20
20
  attachment.derivatives.assign(File.join(fixture_path, 'ndnp-sample1-json.json'))
21
21
  attachment.commit!
@@ -0,0 +1,78 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
3
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
4
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5
+ <head>
6
+ <title></title>
7
+ <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
8
+ <meta name='ocr-system' content='tesseract 4.0.0-beta.1' />
9
+ <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
10
+ </head>
11
+ <body>
12
+ <div class='ocr_page' id='page_1' title='image "ocr_mono.tiff"; bbox 0 0 1261 1744; ppageno 0'>
13
+ <div class='ocr_carea' id='block_1_1' title="bbox 155 22 1073 129">
14
+ <p class='ocr_par' id='par_1_1' lang='eng' title="bbox 155 22 1073 129">
15
+ <span class='ocr_line' id='line_1_1' title="bbox 155 22 1073 129; baseline 0.003 -18; x_size 111; x_descenders 19; x_ascenders 40"><span class='ocrx_word' id='word_1_1' title='bbox 155 59 247 129; x_wconf 76'>_A</span> <span class='ocrx_word' id='word_1_2' title='bbox 272 28 600 113; x_wconf 95'>FEARFUL</span> <span class='ocrx_word' id='word_1_3' title='bbox 622 22 1073 129; x_wconf 85'>ADVENTURE.</span>
16
+ </span>
17
+ </p>
18
+ </div>
19
+ <div class='ocr_carea' id='block_1_2' title="bbox 551 152 695 161">
20
+ <p class='ocr_par' id='par_1_2' lang='eng' title="bbox 551 152 695 161">
21
+ <span class='ocr_line' id='line_1_2' title="bbox 551 152 695 161; baseline 0 0; x_size 4.5; x_descenders -2.25; x_ascenders 2.25"><span class='ocrx_word' id='word_1_4' title='bbox 551 152 695 161; x_wconf 95'> </span>
22
+ </span>
23
+ </p>
24
+ </div>
25
+ <div class='ocr_carea' id='block_1_3' title="bbox 11 174 1244 613">
26
+ <p class='ocr_par' id='par_1_3' lang='eng' title="bbox 11 174 1242 429">
27
+ <span class='ocr_line' id='line_1_3' title="bbox 69 174 1242 261; baseline 0.003 -21; x_size 70; x_descenders 18; x_ascenders 18"><span class='ocrx_word' id='word_1_5' title='bbox 69 188 193 240; x_wconf 57'>‘The</span> <span class='ocrx_word' id='word_1_6' title='bbox 225 191 464 244; x_wconf 52'>Missouri.</span> <span class='ocrx_word' id='word_1_7' title='bbox 517 174 865 261; x_wconf 92'>Republican,</span> <span class='ocrx_word' id='word_1_8' title='bbox 906 195 954 246; x_wconf 95'>in</span> <span class='ocrx_word' id='word_1_9' title='bbox 1007 211 1040 247; x_wconf 77'>a</span> <span class='ocrx_word' id='word_1_10' title='bbox 1087 195 1242 246; x_wconf 90'>letter</span>
28
+ </span>
29
+ <span class='ocr_line' id='line_1_4' title="bbox 11 270 1238 387; baseline 0.006 -63; x_size 71; x_descenders 18; x_ascenders 18"><span class='ocrx_word' id='word_1_11' title='bbox 11 270 139 326; x_wconf 96'>from</span> <span class='ocrx_word' id='word_1_12' title='bbox 167 293 199 328; x_wconf 93'>a</span> <span class='ocrx_word' id='word_1_13' title='bbox 229 277 440 329; x_wconf 95'>Kansas</span> <span class='ocrx_word' id='word_1_14' title='bbox 464 281 888 374; x_wconf 94'>correspondent,</span> <span class='ocrx_word' id='word_1_15' title='bbox 903 281 1008 387; x_wconf 96'>has</span> <span class='ocrx_word' id='word_1_16' title='bbox 1039 282 1128 333; x_wconf 93'>the</span> <span class='ocrx_word' id='word_1_17' title='bbox 1149 279 1238 332; x_wconf 92'>fol-</span>
30
+ </span>
31
+ <span class='ocr_line' id='line_1_5' title="bbox 12 361 224 429; baseline 0.021 -19; x_size 66; x_descenders 17; x_ascenders 16"><span class='ocrx_word' id='word_1_18' title='bbox 12 361 224 429; x_wconf 95'>lowing:</span>
32
+ </span>
33
+ </p>
34
+
35
+ <p class='ocr_par' id='par_1_4' lang='eng' title="bbox 11 407 1244 613">
36
+ <span class='ocr_line' id='line_1_6' title="bbox 86 407 1244 520; baseline 0.005 -19; x_size 70; x_descenders 16; x_ascenders 19"><span class='ocrx_word' id='word_1_19' title='bbox 86 452 206 501; x_wconf 95'>“At</span> <span class='ocrx_word' id='word_1_20' title='bbox 234 449 311 504; x_wconf 95'>St.</span> <span class='ocrx_word' id='word_1_21' title='bbox 339 415 567 520; x_wconf 49'>Josephs</span> <span class='ocrx_word' id='word_1_22' title='bbox 595 454 751 505; x_wconf 54'>Tsaw</span> <span class='ocrx_word' id='word_1_23' title='bbox 781 456 884 509; x_wconf 95'>Mr,</span> <span class='ocrx_word' id='word_1_24' title='bbox 915 457 982 508; x_wconf 91'>A.</span> <span class='ocrx_word' id='word_1_25' title='bbox 1011 456 1074 508; x_wconf 21'>&#39;T.</span> <span class='ocrx_word' id='word_1_26' title='bbox 1116 407 1244 508; x_wconf 91'>Gor-</span>
37
+ </span>
38
+ <span class='ocr_line' id='line_1_7' title="bbox 11 539 1242 613; baseline 0.006 -24; x_size 65; x_descenders 14; x_ascenders 16"><span class='ocrx_word' id='word_1_27' title='bbox 11 554 154 602; x_wconf 96'>man,</span> <span class='ocrx_word' id='word_1_28' title='bbox 177 539 228 590; x_wconf 95'>of</span> <span class='ocrx_word' id='word_1_29' title='bbox 260 539 389 592; x_wconf 95'>New</span> <span class='ocrx_word' id='word_1_30' title='bbox 417 542 580 606; x_wconf 96'>York,</span> <span class='ocrx_word' id='word_1_31' title='bbox 607 544 724 596; x_wconf 96'>who</span> <span class='ocrx_word' id='word_1_32' title='bbox 752 544 859 597; x_wconf 38'>had</span> <span class='ocrx_word' id='word_1_33' title='bbox 861 546 988 613; x_wconf 38'>just</span> <span class='ocrx_word' id='word_1_34' title='bbox 1012 562 1170 598; x_wconf 85'>come:</span> <span class='ocrx_word' id='word_1_35' title='bbox 1194 546 1242 597; x_wconf 96'>in</span>
39
+ </span>
40
+ </p>
41
+ </div>
42
+ <div class='ocr_carea' id='block_1_4' title="bbox 12 625 1261 699">
43
+ <p class='ocr_par' id='par_1_5' lang='eng' title="bbox 12 625 1261 699">
44
+ <span class='ocr_line' id='line_1_8' title="bbox 12 625 1261 699; baseline 0.007 -24; x_size 66; x_descenders 16; x_ascenders 15"><span class='ocrx_word' id='word_1_36' title='bbox 12 625 140 676; x_wconf 95'>from</span> <span class='ocrx_word' id='word_1_37' title='bbox 163 627 257 679; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_38' title='bbox 287 631 576 680; x_wconf 95'>mountains</span> <span class='ocrx_word' id='word_1_39' title='bbox 599 631 650 682; x_wconf 94'>in</span> <span class='ocrx_word' id='word_1_40' title='bbox 678 632 802 683; x_wconf 89'>such</span> <span class='ocrx_word' id='word_1_41' title='bbox 824 648 855 682; x_wconf 89'>a</span> <span class='ocrx_word' id='word_1_42' title='bbox 882 636 1019 683; x_wconf 85'>state</span> <span class='ocrx_word' id='word_1_43' title='bbox 1043 633 1097 683; x_wconf 92'>of</span> <span class='ocrx_word' id='word_1_44' title='bbox 1109 637 1261 699; x_wconf 88'>pros-</span>
45
+ </span>
46
+ </p>
47
+ </div>
48
+ <div class='ocr_carea' id='block_1_5' title="bbox 11 714 1249 1742">
49
+ <p class='ocr_par' id='par_1_6' lang='eng' title="bbox 11 714 1249 1742">
50
+ <span class='ocr_line' id='line_1_9' title="bbox 12 714 1244 787; baseline 0.006 -23; x_size 69; x_descenders 17; x_ascenders 17"><span class='ocrx_word' id='word_1_45' title='bbox 12 714 209 765; x_wconf 95'>tration</span> <span class='ocrx_word' id='word_1_46' title='bbox 238 717 339 783; x_wconf 93'>and</span> <span class='ocrx_word' id='word_1_47' title='bbox 372 717 611 769; x_wconf 91'>affiiction</span> <span class='ocrx_word' id='word_1_48' title='bbox 645 734 701 769; x_wconf 96'>as</span> <span class='ocrx_word' id='word_1_49' title='bbox 746 719 896 772; x_wconf 95'>could</span> <span class='ocrx_word' id='word_1_50' title='bbox 950 722 1067 787; x_wconf 96'>only</span> <span class='ocrx_word' id='word_1_51' title='bbox 1112 719 1244 772; x_wconf 96'>have</span>
51
+ </span>
52
+ <span class='ocr_line' id='line_1_10' title="bbox 15 803 1245 882; baseline 0.005 -29; x_size 70; x_descenders 19; x_ascenders 16"><span class='ocrx_word' id='word_1_52' title='bbox 15 803 139 854; x_wconf 95'>been</span> <span class='ocrx_word' id='word_1_53' title='bbox 168 805 474 858; x_wconf 91'>occasioned</span> <span class='ocrx_word' id='word_1_54' title='bbox 514 808 582 873; x_wconf 95'>by</span> <span class='ocrx_word' id='word_1_55' title='bbox 608 807 748 859; x_wconf 96'>such</span> <span class='ocrx_word' id='word_1_56' title='bbox 795 824 1065 882; x_wconf 86'>exposure,</span> <span class='ocrx_word' id='word_1_57' title='bbox 1074 808 1245 859; x_wconf 86'>hard-</span>
53
+ </span>
54
+ <span class='ocr_line' id='line_1_11' title="bbox 11 892 1244 966; baseline 0.006 -26; x_size 71; x_descenders 20; x_ascenders 16"><span class='ocrx_word' id='word_1_58' title='bbox 11 892 125 960; x_wconf 95'>ship</span> <span class='ocrx_word' id='word_1_59' title='bbox 155 896 256 944; x_wconf 96'>and</span> <span class='ocrx_word' id='word_1_60' title='bbox 296 894 561 963; x_wconf 91'>suffering,</span> <span class='ocrx_word' id='word_1_61' title='bbox 590 913 649 947; x_wconf 91'>as</span> <span class='ocrx_word' id='word_1_62' title='bbox 699 898 925 966; x_wconf 94'>perhaps</span> <span class='ocrx_word' id='word_1_63' title='bbox 974 913 1042 949; x_wconf 91'>no</span> <span class='ocrx_word' id='word_1_64' title='bbox 1090 898 1244 948; x_wconf 91'>other</span>
55
+ </span>
56
+ <span class='ocr_line' id='line_1_12' title="bbox 14 983 1245 1053; baseline 0.005 -23; x_size 71; x_descenders 19; x_ascenders 18"><span class='ocrx_word' id='word_1_65' title='bbox 14 995 128 1030; x_wconf 96'>man</span> <span class='ocrx_word' id='word_1_66' title='bbox 156 997 279 1031; x_wconf 92'>ever</span> <span class='ocrx_word' id='word_1_67' title='bbox 306 983 560 1034; x_wconf 88'>snrvived.</span> <span class='ocrx_word' id='word_1_68' title='bbox 635 984 703 1034; x_wconf 47'>din</span> <span class='ocrx_word' id='word_1_69' title='bbox 732 999 985 1053; x_wconf 47'>company</span> <span class='ocrx_word' id='word_1_70' title='bbox 1042 984 1161 1049; x_wconf 96'>with</span> <span class='ocrx_word' id='word_1_71' title='bbox 1214 1002 1245 1036; x_wconf 93'>a</span>
57
+ </span>
58
+ <span class='ocr_line' id='line_1_13' title="bbox 15 1065 1246 1126; baseline 0.004 -11; x_size 62; x_descenders 10; x_ascenders 18"><span class='ocrx_word' id='word_1_72' title='bbox 15 1065 283 1126; x_wconf 47'>Canadian</span> <span class='ocrx_word' id='word_1_73' title='bbox 312 1066 638 1120; x_wconf 96'>Frenchman</span> <span class='ocrx_word' id='word_1_74' title='bbox 669 1071 774 1122; x_wconf 94'>and</span> <span class='ocrx_word' id='word_1_75' title='bbox 804 1077 908 1122; x_wconf 91'>two</span> <span class='ocrx_word' id='word_1_76' title='bbox 961 1069 1246 1123; x_wconf 91'>Kentucki-</span>
59
+ </span>
60
+ <span class='ocr_line' id='line_1_14' title="bbox 12 1153 1248 1223; baseline 0.006 -22; x_size 68; x_descenders 17; x_ascenders 18"><span class='ocrx_word' id='word_1_77' title='bbox 12 1169 105 1202; x_wconf 90'>ans</span> <span class='ocrx_word' id='word_1_78' title='bbox 132 1153 310 1205; x_wconf 85'>he-left</span> <span class='ocrx_word' id='word_1_79' title='bbox 337 1155 425 1205; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_80' title='bbox 452 1162 672 1223; x_wconf 96'>country</span> <span class='ocrx_word' id='word_1_81' title='bbox 724 1157 783 1208; x_wconf 96'>of</span> <span class='ocrx_word' id='word_1_82' title='bbox 832 1157 927 1208; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_83' title='bbox 972 1157 1248 1209; x_wconf 90'>Blackfeet</span>
61
+ </span>
62
+ <span class='ocr_line' id='line_1_15' title="bbox 16 1239 1246 1306; baseline 0.003 -18; x_size 69; x_descenders 15; x_ascenders 21"><span class='ocrx_word' id='word_1_84' title='bbox 16 1239 228 1311; x_wconf 88'>Indians</span> <span class='ocrx_word' id='word_1_85' title='bbox 247 1241 356 1310; x_wconf 64'>last</span> <span class='ocrx_word' id='word_1_86' title='bbox 366 1240 599 1288; x_wconf 36'>-Fall.:to</span> <span class='ocrx_word' id='word_1_87' title='bbox 623 1244 795 1315; x_wconf 86'>join.</span> <span class='ocrx_word' id='word_1_88' title='bbox 800 1243 1100 1310; x_wconf 76'>Culverson</span> <span class='ocrx_word' id='word_1_89' title='bbox 1140 1245 1246 1309; x_wconf 96'>and</span>
63
+ </span>
64
+ <span class='ocr_line' id='line_1_16' title="bbox 14 1303 1246 1393; baseline 0.003 -21; x_size 67; x_descenders 16; x_ascenders 16"><span class='ocrx_word' id='word_1_90' title='bbox 14 1309 167 1388; x_wconf 96'>party</span> <span class='ocrx_word' id='word_1_91' title='bbox 195 1307 253 1373; x_wconf 96'>at</span> <span class='ocrx_word' id='word_1_92' title='bbox 282 1305 413 1373; x_wconf 96'>Fort</span> <span class='ocrx_word' id='word_1_93' title='bbox 443 1313 623 1376; x_wconf 96'>Pierre</span> <span class='ocrx_word' id='word_1_94' title='bbox 647 1326 750 1376; x_wconf 93'>and</span> <span class='ocrx_word' id='word_1_95' title='bbox 774 1358 789 1375; x_wconf 91'>¢</span> <span class='ocrx_word' id='word_1_96' title='bbox 773 1313 1089 1393; x_wconf 96'>accompany</span> <span class='ocrx_word' id='word_1_97' title='bbox 1109 1303 1246 1376; x_wconf 96'>them</span>
65
+ </span>
66
+ <span class='ocr_line' id='line_1_17' title="bbox 15 1411 1245 1480; baseline 0.004 -19; x_size 68; x_descenders 17; x_ascenders 15"><span class='ocrx_word' id='word_1_98' title='bbox 15 1419 70 1462; x_wconf 96'>to</span> <span class='ocrx_word' id='word_1_99' title='bbox 99 1411 185 1462; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_100' title='bbox 213 1418 393 1463; x_wconf 96'>states.</span> <span class='ocrx_word' id='word_1_101' title='bbox 432 1412 579 1480; x_wconf 96'>They</span> <span class='ocrx_word' id='word_1_102' title='bbox 608 1414 816 1466; x_wconf 96'>arrived</span> <span class='ocrx_word' id='word_1_103' title='bbox 837 1420 894 1465; x_wconf 96'>at</span> <span class='ocrx_word' id='word_1_104' title='bbox 914 1414 1050 1465; x_wconf 96'>Fort</span> <span class='ocrx_word' id='word_1_105' title='bbox 1067 1414 1245 1466; x_wconf 96'>Pierre</span>
67
+ </span>
68
+ <span class='ocr_line' id='line_1_18' title="bbox 15 1497 1246 1568; baseline 0.002 -18; x_size 70; x_descenders 17; x_ascenders 18"><span class='ocrx_word' id='word_1_106' title='bbox 15 1505 120 1550; x_wconf 96'>two</span> <span class='ocrx_word' id='word_1_107' title='bbox 147 1500 276 1564; x_wconf 96'>days</span> <span class='ocrx_word' id='word_1_108' title='bbox 303 1497 436 1550; x_wconf 92'>after</span> <span class='ocrx_word' id='word_1_109' title='bbox 466 1497 794 1551; x_wconf 74'>Calverson’s</span> <span class='ocrx_word' id='word_1_110' title='bbox 821 1500 1116 1568; x_wconf 96'>departure,</span> <span class='ocrx_word' id='word_1_111' title='bbox 1145 1502 1246 1552; x_wconf 96'>and</span>
69
+ </span>
70
+ <span class='ocr_line' id='line_1_19' title="bbox 15 1586 1249 1656; baseline 0.001 -19; x_size 68; x_descenders 17; x_ascenders 17"><span class='ocrx_word' id='word_1_112' title='bbox 15 1587 222 1638; x_wconf 51'>hurried</span> <span class='ocrx_word' id='word_1_113' title='bbox 252 1603 316 1638; x_wconf 96'>on</span> <span class='ocrx_word' id='word_1_114' title='bbox 343 1586 498 1650; x_wconf 96'>after,</span> <span class='ocrx_word' id='word_1_115' title='bbox 526 1587 578 1637; x_wconf 96'>in</span> <span class='ocrx_word' id='word_1_116' title='bbox 606 1587 693 1640; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_117' title='bbox 719 1587 853 1656; x_wconf 95'>hope</span> <span class='ocrx_word' id='word_1_118' title='bbox 874 1589 929 1640; x_wconf 95'>of</span> <span class='ocrx_word' id='word_1_119' title='bbox 939 1589 1249 1656; x_wconf 96'>overtaking</span>
71
+ </span>
72
+ <span class='ocr_line' id='line_1_20' title="bbox 18 1672 1248 1742; baseline 0.001 -16; x_size 69; x_descenders 16; x_ascenders 18"><span class='ocrx_word' id='word_1_120' title='bbox 18 1675 160 1734; x_wconf 90'>him.</span> <span class='ocrx_word' id='word_1_121' title='bbox 214 1672 296 1726; x_wconf 92'>On</span> <span class='ocrx_word' id='word_1_122' title='bbox 327 1675 416 1726; x_wconf 95'>the</span> <span class='ocrx_word' id='word_1_123' title='bbox 442 1675 584 1727; x_wconf 96'>third</span> <span class='ocrx_word' id='word_1_124' title='bbox 608 1675 711 1742; x_wconf 96'>day</span> <span class='ocrx_word' id='word_1_125' title='bbox 740 1691 837 1727; x_wconf 96'>one</span> <span class='ocrx_word' id='word_1_126' title='bbox 865 1676 922 1727; x_wconf 96'>of</span> <span class='ocrx_word' id='word_1_127' title='bbox 942 1675 1090 1727; x_wconf 96'>those</span> <span class='ocrx_word' id='word_1_128' title='bbox 1110 1692 1248 1727; x_wconf 96'>snow</span>
73
+ </span>
74
+ </p>
75
+ </div>
76
+ </div>
77
+ </body>
78
+ </html>
@@ -0,0 +1,109 @@
1
+ require 'spec_helper'
2
+ require 'tmpdir'
3
+
4
+ describe NewspaperWorks::ImageTool do
5
+ let(:fixtures) { File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files') }
6
+
7
+ # Image fixtures to test identification, metadata extraction for:
8
+ let(:gray_jp2) { File.join(fixtures, 'ocr_gray.jp2') }
9
+ let(:color_jp2) { File.join(fixtures, '4.1.07.jp2') }
10
+ let(:gray_tiff) { File.join(fixtures, 'ocr_gray.tiff') }
11
+ let(:mono_tiff) { File.join(fixtures, 'ocr_mono.tiff') }
12
+ let(:color_tiff) { File.join(fixtures, '4.1.07.tiff') }
13
+ let(:pdf) { File.join(fixtures, 'minimal-1-page.pdf') }
14
+
15
+ describe "Extracts metadata with JP2 backend" do
16
+ it "constructs with a path" do
17
+ identify = described_class.new(gray_jp2)
18
+ expect(identify.path).to eq gray_jp2
19
+ end
20
+
21
+ it "gets metadata for grayscale JP2 image" do
22
+ result = described_class.new(gray_jp2).metadata
23
+ expect(result[:color]).to eq 'gray'
24
+ expect(result[:width]).to eq 418
25
+ expect(result[:height]).to eq 1046
26
+ expect(result[:bits_per_component]).to eq 8
27
+ expect(result[:num_components]).to eq 1
28
+ end
29
+
30
+ it "gets metadata for color JP2 image" do
31
+ result = described_class.new(color_jp2).metadata
32
+ expect(result[:color]).to eq 'color'
33
+ expect(result[:width]).to eq 256
34
+ expect(result[:height]).to eq 256
35
+ expect(result[:bits_per_component]).to eq 8
36
+ # expected to be 3 here, but would be 4 if the sample image had an alpha channel
37
+ expect(result[:num_components]).to eq 3
38
+ end
39
+ end
40
+
41
+ describe "Extracts metadata for non-JP2 images with imagemagick" do
42
+ it "gets metadata for gray TIFF image" do
43
+ result = described_class.new(gray_tiff).metadata
44
+ expect(result[:color]).to eq 'gray'
45
+ expect(result[:width]).to eq 418
46
+ expect(result[:height]).to eq 1046
47
+ expect(result[:bits_per_component]).to eq 8
48
+ expect(result[:num_components]).to eq 1
49
+ end
50
+
51
+ it "gets metadata for monochrome TIFF image" do
52
+ result = described_class.new(mono_tiff).metadata
53
+ expect(result[:color]).to eq 'monochrome'
54
+ expect(result[:width]).to eq 1261
55
+ expect(result[:height]).to eq 1744
56
+ expect(result[:bits_per_component]).to eq 1
57
+ expect(result[:num_components]).to eq 1
58
+ end
59
+
60
+ it "gets metadata for color TIFF image" do
61
+ result = described_class.new(color_tiff).metadata
62
+ expect(result[:color]).to eq 'color'
63
+ expect(result[:width]).to eq 256
64
+ expect(result[:height]).to eq 256
65
+ expect(result[:bits_per_component]).to eq 8
66
+ # expected to be 3 here, but would be 4 if the sample image had an alpha channel
67
+ expect(result[:num_components]).to eq 3
68
+ end
69
+
70
+ it "detects mime type of pdf" do
71
+ result = described_class.new(pdf).metadata
72
+ expect(result[:content_type]).to eq 'application/pdf'
73
+ end
74
+ end
75
+
76
+ describe "converts images" do
77
+ it "makes a monochrome TIFF from JP2" do
78
+ tool = described_class.new(gray_jp2)
79
+ dest = File.join(Dir.mktmpdir, 'mono.tif')
80
+ tool.convert(dest, true)
81
+ expect(File.exist?(dest)).to be true
82
+ expect(described_class.new(dest).metadata[:color]).to eq 'monochrome'
83
+ end
84
+
85
+ it "makes a gray TIFF from JP2" do
86
+ tool = described_class.new(gray_jp2)
87
+ dest = File.join(Dir.mktmpdir, 'gray.tif')
88
+ tool.convert(dest, false)
89
+ expect(File.exist?(dest)).to be true
90
+ expect(described_class.new(dest).metadata[:color]).to eq 'gray'
91
+ end
92
+
93
+ it "makes a monochrome TIFF from grayscale TIFF" do
94
+ tool = described_class.new(gray_tiff)
95
+ dest = File.join(Dir.mktmpdir, 'mono.tif')
96
+ tool.convert(dest, true)
97
+ expect(File.exist?(dest)).to be true
98
+ expect(described_class.new(dest).metadata[:color]).to eq 'monochrome'
99
+ end
100
+
101
+ # Making a JP2 with this tool is not yet supported; for now, the only
102
+ # component in NewspaperWorks that does so is
103
+ # NewspaperWorks::JP2DerivativeService.
104
+ it "raises error on JP2 destination" do
105
+ expect { described_class.new(gray_tiff).convert('out.jp2') }.to \
106
+ raise_error(RuntimeError)
107
+ end
108
+ end
109
+ end
@@ -24,13 +24,13 @@ RSpec.shared_examples 'ingest adapter IO' do
24
24
  # define the path to the file we will use for multiple examples
25
25
  let(:path) do
26
26
  fixtures = File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files')
27
- File.join(fixtures, 'page1.tiff')
27
+ File.join(fixtures, 'ocr_mono.tiff')
28
28
  end
29
29
 
30
30
  # DRY for this matcher's use in multiple examples:
31
31
  let(:have_io_and_correct_filename) do
32
32
  have_attributes(
33
- filename: 'page1.tiff',
33
+ filename: 'ocr_mono.tiff',
34
34
  io: an_object_responding_to(:read)
35
35
  )
36
36
  end
@@ -62,7 +62,7 @@ RSpec.shared_examples 'ingest adapter IO' do
62
62
  it "loads a StringIO with filename" do
63
63
  adapter = build(:newspaper_page_ingest)
64
64
  io = StringIO.new('File Content Here, Maybe')
65
- adapter.load(io, filename: 'page1.tiff')
65
+ adapter.load(io, filename: 'ocr_mono.tiff')
66
66
  expect(adapter).to have_io_and_correct_filename
67
67
  end
68
68
 
@@ -8,7 +8,7 @@ RSpec.describe NewspaperWorks::Ingest::NewspaperPageIngest do
8
8
 
9
9
  # define the path to the file we will use for multiple examples
10
10
  let(:path) do
11
- File.join(fixture_path, 'page1.tiff')
11
+ File.join(fixture_path, 'ocr_mono.tiff')
12
12
  end
13
13
 
14
14
  it_behaves_like('ingest adapter IO')
@@ -32,7 +32,7 @@ RSpec.describe NewspaperWorks::Ingest::NewspaperPageIngest do
32
32
 
33
33
  def verify_pcdm_fileset(fileset)
34
34
  # Hyrax always sets label (if not title) on fileset:
35
- expect(fileset.label).to eq 'page1.tiff'
35
+ expect(fileset.label).to eq 'ocr_mono.tiff'
36
36
  # reload file set and check on original file
37
37
  fileset.reload
38
38
  file = fileset.original_file
@@ -0,0 +1,37 @@
1
+ require 'spec_helper'
2
+
3
+ describe NewspaperWorks::JP2ImageMetadata do
4
+ let(:fixtures) { File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files') }
5
+
6
+ let(:gray_jp2) { File.join(fixtures, 'ocr_gray.jp2') }
7
+
8
+ let(:color_jp2) { File.join(fixtures, '4.1.07.jp2') }
9
+
10
+ describe "Extracts technical metadata from a JP2 file" do
11
+ it "constructs with a path" do
12
+ meta = described_class.new(gray_jp2)
13
+ expect(meta.path).to eq gray_jp2
14
+ end
15
+
16
+ it "gets metadata for grayscale image" do
17
+ meta = described_class.new(gray_jp2)
18
+ result = meta.technical_metadata
19
+ expect(result[:color]).to eq 'gray'
20
+ expect(result[:width]).to eq 418
21
+ expect(result[:height]).to eq 1046
22
+ expect(result[:bits_per_component]).to eq 8
23
+ expect(result[:num_components]).to eq 1
24
+ end
25
+
26
+ it "gets metadata for color image" do
27
+ meta = described_class.new(color_jp2)
28
+ result = meta.technical_metadata
29
+ expect(result[:color]).to eq 'color'
30
+ expect(result[:width]).to eq 256
31
+ expect(result[:height]).to eq 256
32
+ expect(result[:bits_per_component]).to eq 8
33
+ # expected to be 3 here, but would be 4 if the sample image had an alpha channel
34
+ expect(result[:num_components]).to eq 3
35
+ end
36
+ end
37
+ end