newspaper_works 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. checksums.yaml +5 -5
  2. data/.coveralls.yml +2 -0
  3. data/.gitignore +4 -0
  4. data/.travis.yml +2 -2
  5. data/README.md +14 -13
  6. data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
  7. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
  8. data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
  9. data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
  10. data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
  11. data/config/locales/newspaper_article.de.yml +1 -1
  12. data/config/locales/newspaper_article.en.yml +1 -1
  13. data/config/locales/newspaper_article.es.yml +1 -1
  14. data/config/locales/newspaper_article.fr.yml +1 -1
  15. data/config/locales/newspaper_article.it.yml +1 -1
  16. data/config/locales/newspaper_article.pt-BR.yml +1 -1
  17. data/config/locales/newspaper_article.zh.yml +1 -1
  18. data/config/locales/newspaper_container.de.yml +1 -1
  19. data/config/locales/newspaper_container.en.yml +1 -1
  20. data/config/locales/newspaper_container.es.yml +1 -1
  21. data/config/locales/newspaper_container.fr.yml +1 -1
  22. data/config/locales/newspaper_container.it.yml +1 -1
  23. data/config/locales/newspaper_container.pt-BR.yml +1 -1
  24. data/config/locales/newspaper_container.zh.yml +1 -1
  25. data/config/locales/newspaper_issue.de.yml +1 -1
  26. data/config/locales/newspaper_issue.en.yml +1 -1
  27. data/config/locales/newspaper_issue.es.yml +1 -1
  28. data/config/locales/newspaper_issue.fr.yml +1 -1
  29. data/config/locales/newspaper_issue.it.yml +2 -2
  30. data/config/locales/newspaper_issue.pt-BR.yml +2 -2
  31. data/config/locales/newspaper_issue.zh.yml +2 -2
  32. data/config/locales/newspaper_page.de.yml +1 -1
  33. data/config/locales/newspaper_page.en.yml +1 -1
  34. data/config/locales/newspaper_page.es.yml +1 -1
  35. data/config/locales/newspaper_page.fr.yml +1 -1
  36. data/config/locales/newspaper_page.it.yml +1 -1
  37. data/config/locales/newspaper_page.pt-BR.yml +1 -1
  38. data/config/locales/newspaper_page.zh.yml +1 -1
  39. data/config/locales/newspaper_title.de.yml +1 -1
  40. data/config/locales/newspaper_title.en.yml +1 -1
  41. data/config/locales/newspaper_title.es.yml +1 -1
  42. data/config/locales/newspaper_title.fr.yml +1 -1
  43. data/config/locales/newspaper_title.it.yml +1 -1
  44. data/config/locales/newspaper_title.pt-BR.yml +1 -1
  45. data/config/locales/newspaper_title.zh.yml +1 -1
  46. data/config/locales/newspaper_works.de.yml +98 -0
  47. data/config/locales/newspaper_works.en.yml +67 -0
  48. data/config/locales/newspaper_works.es.yml +96 -0
  49. data/config/locales/newspaper_works.fr.yml +97 -0
  50. data/config/locales/newspaper_works.it.yml +90 -0
  51. data/config/locales/newspaper_works.pt-BR.yml +96 -0
  52. data/config/locales/newspaper_works.zh.yml +90 -0
  53. data/config/vendor/fits.xml +55 -0
  54. data/config/vendor/imagemagick-6-policy.xml +39 -39
  55. data/lib/newspaper_works.rb +2 -0
  56. data/lib/newspaper_works/image_tool.rb +119 -0
  57. data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
  58. data/lib/newspaper_works/text_extraction.rb +1 -0
  59. data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
  60. data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
  61. data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
  62. data/lib/newspaper_works/version.rb +1 -1
  63. data/newspaper_works.gemspec +2 -3
  64. data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
  65. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  66. data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
  67. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
  68. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
  69. data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
  70. data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
  71. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
  72. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
  73. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
  74. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
  75. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
  76. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
  77. data/spec/spec_helper.rb +19 -0
  78. metadata +21 -22
@@ -1,100 +1,86 @@
1
1
  require 'json'
2
2
  require 'open3'
3
- require 'rtesseract'
3
+ require 'tmpdir'
4
4
 
5
5
  # --
6
6
  module NewspaperWorks
7
7
  # Module for text extraction (OCR or otherwise)
8
8
  module TextExtraction
9
9
  class PageOCR
10
- def self.alto_from(path)
11
- new(path).alto
12
- end
10
+ attr_accessor :html, :path
13
11
 
14
12
  def initialize(path)
15
13
  @path = path
14
+ # hOCR html:
15
+ @html = nil
16
16
  @words = nil
17
- @processor = "mini_magick"
18
17
  @source_meta = nil
19
- @use_gm = extension.start_with?('jp2')
20
18
  @box = nil
21
19
  @plain = nil
22
20
  end
23
21
 
24
- def extension
25
- @path.split('.')[-1].downcase
22
+ def run_ocr
23
+ outfile = File.join(Dir.mktmpdir, 'output_html')
24
+ cmd = "tesseract #{path} #{outfile} hocr"
25
+ `#{cmd}`
26
+ outfile + '.hocr'
26
27
  end
27
28
 
28
- def load_box
29
- if @box.nil?
30
- if @use_gm
31
- MiniMagick.with_cli(:graphicsmagick) do
32
- @box = RTesseract::Box.new(@path, processor: @processor)
33
- @plain = @box.to_s
34
- end
35
- else
36
- @box = RTesseract::Box.new(@path, processor: @processor)
37
- @plain = @box.to_s
38
- end
39
- end
40
- @box
29
+ def load_words
30
+ preprocess_image
31
+ html_path = run_ocr
32
+ reader = NewspaperWorks::TextExtraction::HOCRReader.new(html_path)
33
+ @words = reader.words
34
+ @plain = reader.text
41
35
  end
42
36
 
43
37
  def words
44
- @words = load_box.words if @words.nil?
38
+ load_words if @words.nil?
45
39
  @words
46
40
  end
47
41
 
48
- def normalized_coordinate(word)
49
- {
50
- word: word[:word],
51
- coordinates: [
52
- word[:x_start],
53
- word[:y_start],
54
- (word[:x_end] - word[:x_start]),
55
- (word[:y_end] - word[:y_start])
56
- ]
57
- }
58
- end
59
-
60
42
  def word_json
61
- save_words = words.map { |w| normalized_coordinate(w) }
62
- builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(save_words,
63
- width,
64
- height)
43
+ builder = NewspaperWorks::TextExtraction::WordCoordsBuilder.new(
44
+ words,
45
+ width,
46
+ height
47
+ )
65
48
  builder.to_json
66
49
  end
67
50
 
68
51
  def plain
69
- load_box
52
+ load_words if @plain.nil?
70
53
  @plain
71
54
  end
72
55
 
73
56
  def identify
74
- if @source_geometry.nil?
75
- path = @path
76
- cmd = "identify -verbose #{path}"
77
- cmd = 'gm ' + cmd if @use_gm
78
- lines = `#{cmd}`.lines
79
- geo = lines.select { |line| line.strip.start_with?('Geometry') }[0]
80
- img_geo = geo.strip.split(':')[-1].strip.split('+')[0]
81
- @source_geometry = img_geo.split('x').map(&:to_i)
82
- end
83
- @source_geometry
57
+ return @source_meta unless @source_meta.nil?
58
+ @source_meta = NewspaperWorks::ImageTool.new(@path).metadata
84
59
  end
85
60
 
86
61
  def width
87
- identify[0]
62
+ identify[:width]
88
63
  end
89
64
 
90
65
  def height
91
- identify[1]
66
+ identify[:height]
92
67
  end
93
68
 
94
69
  def alto
95
70
  writer = NewspaperWorks::TextExtraction::RenderAlto.new(width, height)
96
71
  writer.to_alto(words)
97
72
  end
73
+
74
+ private
75
+
76
+ # transform the image into a one-bit TIFF for OCR
77
+ def preprocess_image
78
+ tool = NewspaperWorks::ImageTool.new(@path)
79
+ return if tool.metadata[:color] == 'monochrome'
80
+ intermediate_path = File.join(Dir.mktmpdir, 'monochrome-interim.tif')
81
+ tool.convert(intermediate_path, true)
82
+ @path = intermediate_path
83
+ end
98
84
  end
99
85
  end
100
86
  end
@@ -15,10 +15,10 @@ module NewspaperWorks
15
15
  words.each do |word|
16
16
  xml.String(
17
17
  CONTENT: word[:word],
18
- HEIGHT: scale_point(word[:y_end] - word[:y_start]).to_s,
19
- WIDTH: scale_point(word[:x_end] - word[:x_start]).to_s,
20
- HPOS: scale_point(word[:x_start]).to_s,
21
- VPOS: scale_point(word[:y_start]).to_s
18
+ WIDTH: scale_point(word[:coordinates][2]).to_s,
19
+ HEIGHT: scale_point(word[:coordinates][3]).to_s,
20
+ HPOS: scale_point(word[:coordinates][0]).to_s,
21
+ VPOS: scale_point(word[:coordinates][1]).to_s
22
22
  ) { xml.text '' }
23
23
  end
24
24
  end
@@ -1,3 +1,3 @@
1
1
  module NewspaperWorks
2
- VERSION = '0.1.0'.freeze
2
+ VERSION = '1.0.0'.freeze
3
3
  end
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
11
11
  'Eben English']
12
12
  spec.email = ['sean.upton@utah.edu', 'jacob.reed@utah.edu',
13
13
  'brian.mcbride@utah.edu', 'eenglish@bpl.org']
14
- spec.homepage = 'https://github.com/marriott-library/newspaper_works'
14
+ spec.homepage = 'https://github.com/samvera-labs/newspaper_works'
15
15
  spec.description = 'Gem/Engine for Newspaper Works in Hyrax-based Samvera
16
16
  Application.'
17
17
  spec.summary = <<-SUMMARY
@@ -22,12 +22,11 @@ SUMMARY
22
22
  spec.license = 'Apache-2.0'
23
23
  spec.files = `git ls-files`.split($OUTPUT_RECORD_SEPARATOR)
24
24
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
25
- spec.add_dependency 'blacklight_iiif_search'
25
+ spec.add_dependency 'blacklight_iiif_search', '~> 1.0'
26
26
  spec.add_dependency 'blacklight_advanced_search', '6.4.1'
27
27
  spec.add_dependency 'hyrax', '2.5.1'
28
28
  spec.add_dependency 'nokogiri'
29
29
  spec.add_dependency 'rails', '~> 5.1'
30
- spec.add_dependency 'rtesseract', '~> 2.2.0'
31
30
  spec.add_dependency 'sass-rails', '~> 5.0'
32
31
 
33
32
  spec.add_development_dependency 'bixby'
@@ -15,7 +15,7 @@ RSpec.describe 'thumbnail_highlights', js: true do
15
15
  visibility: "open"
16
16
  )
17
17
  attachment = NewspaperWorks::Data::WorkFiles.of(@work)
18
- attachment.assign(File.join(fixture_path, 'page1.tiff'))
18
+ attachment.assign(File.join(fixture_path, 'ocr_mono.tiff'))
19
19
  attachment.derivatives.assign(File.join(fixture_path, 'ndnp-sample1-txt.txt'))
20
20
  attachment.derivatives.assign(File.join(fixture_path, 'ndnp-sample1-json.json'))
21
21
  attachment.commit!
@@ -0,0 +1,78 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
3
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
4
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
5
+ <head>
6
+ <title></title>
7
+ <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
8
+ <meta name='ocr-system' content='tesseract 4.0.0-beta.1' />
9
+ <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
10
+ </head>
11
+ <body>
12
+ <div class='ocr_page' id='page_1' title='image "ocr_mono.tiff"; bbox 0 0 1261 1744; ppageno 0'>
13
+ <div class='ocr_carea' id='block_1_1' title="bbox 155 22 1073 129">
14
+ <p class='ocr_par' id='par_1_1' lang='eng' title="bbox 155 22 1073 129">
15
+ <span class='ocr_line' id='line_1_1' title="bbox 155 22 1073 129; baseline 0.003 -18; x_size 111; x_descenders 19; x_ascenders 40"><span class='ocrx_word' id='word_1_1' title='bbox 155 59 247 129; x_wconf 76'>_A</span> <span class='ocrx_word' id='word_1_2' title='bbox 272 28 600 113; x_wconf 95'>FEARFUL</span> <span class='ocrx_word' id='word_1_3' title='bbox 622 22 1073 129; x_wconf 85'>ADVENTURE.</span>
16
+ </span>
17
+ </p>
18
+ </div>
19
+ <div class='ocr_carea' id='block_1_2' title="bbox 551 152 695 161">
20
+ <p class='ocr_par' id='par_1_2' lang='eng' title="bbox 551 152 695 161">
21
+ <span class='ocr_line' id='line_1_2' title="bbox 551 152 695 161; baseline 0 0; x_size 4.5; x_descenders -2.25; x_ascenders 2.25"><span class='ocrx_word' id='word_1_4' title='bbox 551 152 695 161; x_wconf 95'> </span>
22
+ </span>
23
+ </p>
24
+ </div>
25
+ <div class='ocr_carea' id='block_1_3' title="bbox 11 174 1244 613">
26
+ <p class='ocr_par' id='par_1_3' lang='eng' title="bbox 11 174 1242 429">
27
+ <span class='ocr_line' id='line_1_3' title="bbox 69 174 1242 261; baseline 0.003 -21; x_size 70; x_descenders 18; x_ascenders 18"><span class='ocrx_word' id='word_1_5' title='bbox 69 188 193 240; x_wconf 57'>‘The</span> <span class='ocrx_word' id='word_1_6' title='bbox 225 191 464 244; x_wconf 52'>Missouri.</span> <span class='ocrx_word' id='word_1_7' title='bbox 517 174 865 261; x_wconf 92'>Republican,</span> <span class='ocrx_word' id='word_1_8' title='bbox 906 195 954 246; x_wconf 95'>in</span> <span class='ocrx_word' id='word_1_9' title='bbox 1007 211 1040 247; x_wconf 77'>a</span> <span class='ocrx_word' id='word_1_10' title='bbox 1087 195 1242 246; x_wconf 90'>letter</span>
28
+ </span>
29
+ <span class='ocr_line' id='line_1_4' title="bbox 11 270 1238 387; baseline 0.006 -63; x_size 71; x_descenders 18; x_ascenders 18"><span class='ocrx_word' id='word_1_11' title='bbox 11 270 139 326; x_wconf 96'>from</span> <span class='ocrx_word' id='word_1_12' title='bbox 167 293 199 328; x_wconf 93'>a</span> <span class='ocrx_word' id='word_1_13' title='bbox 229 277 440 329; x_wconf 95'>Kansas</span> <span class='ocrx_word' id='word_1_14' title='bbox 464 281 888 374; x_wconf 94'>correspondent,</span> <span class='ocrx_word' id='word_1_15' title='bbox 903 281 1008 387; x_wconf 96'>has</span> <span class='ocrx_word' id='word_1_16' title='bbox 1039 282 1128 333; x_wconf 93'>the</span> <span class='ocrx_word' id='word_1_17' title='bbox 1149 279 1238 332; x_wconf 92'>fol-</span>
30
+ </span>
31
+ <span class='ocr_line' id='line_1_5' title="bbox 12 361 224 429; baseline 0.021 -19; x_size 66; x_descenders 17; x_ascenders 16"><span class='ocrx_word' id='word_1_18' title='bbox 12 361 224 429; x_wconf 95'>lowing:</span>
32
+ </span>
33
+ </p>
34
+
35
+ <p class='ocr_par' id='par_1_4' lang='eng' title="bbox 11 407 1244 613">
36
+ <span class='ocr_line' id='line_1_6' title="bbox 86 407 1244 520; baseline 0.005 -19; x_size 70; x_descenders 16; x_ascenders 19"><span class='ocrx_word' id='word_1_19' title='bbox 86 452 206 501; x_wconf 95'>“At</span> <span class='ocrx_word' id='word_1_20' title='bbox 234 449 311 504; x_wconf 95'>St.</span> <span class='ocrx_word' id='word_1_21' title='bbox 339 415 567 520; x_wconf 49'>Josephs</span> <span class='ocrx_word' id='word_1_22' title='bbox 595 454 751 505; x_wconf 54'>Tsaw</span> <span class='ocrx_word' id='word_1_23' title='bbox 781 456 884 509; x_wconf 95'>Mr,</span> <span class='ocrx_word' id='word_1_24' title='bbox 915 457 982 508; x_wconf 91'>A.</span> <span class='ocrx_word' id='word_1_25' title='bbox 1011 456 1074 508; x_wconf 21'>&#39;T.</span> <span class='ocrx_word' id='word_1_26' title='bbox 1116 407 1244 508; x_wconf 91'>Gor-</span>
37
+ </span>
38
+ <span class='ocr_line' id='line_1_7' title="bbox 11 539 1242 613; baseline 0.006 -24; x_size 65; x_descenders 14; x_ascenders 16"><span class='ocrx_word' id='word_1_27' title='bbox 11 554 154 602; x_wconf 96'>man,</span> <span class='ocrx_word' id='word_1_28' title='bbox 177 539 228 590; x_wconf 95'>of</span> <span class='ocrx_word' id='word_1_29' title='bbox 260 539 389 592; x_wconf 95'>New</span> <span class='ocrx_word' id='word_1_30' title='bbox 417 542 580 606; x_wconf 96'>York,</span> <span class='ocrx_word' id='word_1_31' title='bbox 607 544 724 596; x_wconf 96'>who</span> <span class='ocrx_word' id='word_1_32' title='bbox 752 544 859 597; x_wconf 38'>had</span> <span class='ocrx_word' id='word_1_33' title='bbox 861 546 988 613; x_wconf 38'>just</span> <span class='ocrx_word' id='word_1_34' title='bbox 1012 562 1170 598; x_wconf 85'>come:</span> <span class='ocrx_word' id='word_1_35' title='bbox 1194 546 1242 597; x_wconf 96'>in</span>
39
+ </span>
40
+ </p>
41
+ </div>
42
+ <div class='ocr_carea' id='block_1_4' title="bbox 12 625 1261 699">
43
+ <p class='ocr_par' id='par_1_5' lang='eng' title="bbox 12 625 1261 699">
44
+ <span class='ocr_line' id='line_1_8' title="bbox 12 625 1261 699; baseline 0.007 -24; x_size 66; x_descenders 16; x_ascenders 15"><span class='ocrx_word' id='word_1_36' title='bbox 12 625 140 676; x_wconf 95'>from</span> <span class='ocrx_word' id='word_1_37' title='bbox 163 627 257 679; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_38' title='bbox 287 631 576 680; x_wconf 95'>mountains</span> <span class='ocrx_word' id='word_1_39' title='bbox 599 631 650 682; x_wconf 94'>in</span> <span class='ocrx_word' id='word_1_40' title='bbox 678 632 802 683; x_wconf 89'>such</span> <span class='ocrx_word' id='word_1_41' title='bbox 824 648 855 682; x_wconf 89'>a</span> <span class='ocrx_word' id='word_1_42' title='bbox 882 636 1019 683; x_wconf 85'>state</span> <span class='ocrx_word' id='word_1_43' title='bbox 1043 633 1097 683; x_wconf 92'>of</span> <span class='ocrx_word' id='word_1_44' title='bbox 1109 637 1261 699; x_wconf 88'>pros-</span>
45
+ </span>
46
+ </p>
47
+ </div>
48
+ <div class='ocr_carea' id='block_1_5' title="bbox 11 714 1249 1742">
49
+ <p class='ocr_par' id='par_1_6' lang='eng' title="bbox 11 714 1249 1742">
50
+ <span class='ocr_line' id='line_1_9' title="bbox 12 714 1244 787; baseline 0.006 -23; x_size 69; x_descenders 17; x_ascenders 17"><span class='ocrx_word' id='word_1_45' title='bbox 12 714 209 765; x_wconf 95'>tration</span> <span class='ocrx_word' id='word_1_46' title='bbox 238 717 339 783; x_wconf 93'>and</span> <span class='ocrx_word' id='word_1_47' title='bbox 372 717 611 769; x_wconf 91'>affiiction</span> <span class='ocrx_word' id='word_1_48' title='bbox 645 734 701 769; x_wconf 96'>as</span> <span class='ocrx_word' id='word_1_49' title='bbox 746 719 896 772; x_wconf 95'>could</span> <span class='ocrx_word' id='word_1_50' title='bbox 950 722 1067 787; x_wconf 96'>only</span> <span class='ocrx_word' id='word_1_51' title='bbox 1112 719 1244 772; x_wconf 96'>have</span>
51
+ </span>
52
+ <span class='ocr_line' id='line_1_10' title="bbox 15 803 1245 882; baseline 0.005 -29; x_size 70; x_descenders 19; x_ascenders 16"><span class='ocrx_word' id='word_1_52' title='bbox 15 803 139 854; x_wconf 95'>been</span> <span class='ocrx_word' id='word_1_53' title='bbox 168 805 474 858; x_wconf 91'>occasioned</span> <span class='ocrx_word' id='word_1_54' title='bbox 514 808 582 873; x_wconf 95'>by</span> <span class='ocrx_word' id='word_1_55' title='bbox 608 807 748 859; x_wconf 96'>such</span> <span class='ocrx_word' id='word_1_56' title='bbox 795 824 1065 882; x_wconf 86'>exposure,</span> <span class='ocrx_word' id='word_1_57' title='bbox 1074 808 1245 859; x_wconf 86'>hard-</span>
53
+ </span>
54
+ <span class='ocr_line' id='line_1_11' title="bbox 11 892 1244 966; baseline 0.006 -26; x_size 71; x_descenders 20; x_ascenders 16"><span class='ocrx_word' id='word_1_58' title='bbox 11 892 125 960; x_wconf 95'>ship</span> <span class='ocrx_word' id='word_1_59' title='bbox 155 896 256 944; x_wconf 96'>and</span> <span class='ocrx_word' id='word_1_60' title='bbox 296 894 561 963; x_wconf 91'>suffering,</span> <span class='ocrx_word' id='word_1_61' title='bbox 590 913 649 947; x_wconf 91'>as</span> <span class='ocrx_word' id='word_1_62' title='bbox 699 898 925 966; x_wconf 94'>perhaps</span> <span class='ocrx_word' id='word_1_63' title='bbox 974 913 1042 949; x_wconf 91'>no</span> <span class='ocrx_word' id='word_1_64' title='bbox 1090 898 1244 948; x_wconf 91'>other</span>
55
+ </span>
56
+ <span class='ocr_line' id='line_1_12' title="bbox 14 983 1245 1053; baseline 0.005 -23; x_size 71; x_descenders 19; x_ascenders 18"><span class='ocrx_word' id='word_1_65' title='bbox 14 995 128 1030; x_wconf 96'>man</span> <span class='ocrx_word' id='word_1_66' title='bbox 156 997 279 1031; x_wconf 92'>ever</span> <span class='ocrx_word' id='word_1_67' title='bbox 306 983 560 1034; x_wconf 88'>snrvived.</span> <span class='ocrx_word' id='word_1_68' title='bbox 635 984 703 1034; x_wconf 47'>din</span> <span class='ocrx_word' id='word_1_69' title='bbox 732 999 985 1053; x_wconf 47'>company</span> <span class='ocrx_word' id='word_1_70' title='bbox 1042 984 1161 1049; x_wconf 96'>with</span> <span class='ocrx_word' id='word_1_71' title='bbox 1214 1002 1245 1036; x_wconf 93'>a</span>
57
+ </span>
58
+ <span class='ocr_line' id='line_1_13' title="bbox 15 1065 1246 1126; baseline 0.004 -11; x_size 62; x_descenders 10; x_ascenders 18"><span class='ocrx_word' id='word_1_72' title='bbox 15 1065 283 1126; x_wconf 47'>Canadian</span> <span class='ocrx_word' id='word_1_73' title='bbox 312 1066 638 1120; x_wconf 96'>Frenchman</span> <span class='ocrx_word' id='word_1_74' title='bbox 669 1071 774 1122; x_wconf 94'>and</span> <span class='ocrx_word' id='word_1_75' title='bbox 804 1077 908 1122; x_wconf 91'>two</span> <span class='ocrx_word' id='word_1_76' title='bbox 961 1069 1246 1123; x_wconf 91'>Kentucki-</span>
59
+ </span>
60
+ <span class='ocr_line' id='line_1_14' title="bbox 12 1153 1248 1223; baseline 0.006 -22; x_size 68; x_descenders 17; x_ascenders 18"><span class='ocrx_word' id='word_1_77' title='bbox 12 1169 105 1202; x_wconf 90'>ans</span> <span class='ocrx_word' id='word_1_78' title='bbox 132 1153 310 1205; x_wconf 85'>he-left</span> <span class='ocrx_word' id='word_1_79' title='bbox 337 1155 425 1205; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_80' title='bbox 452 1162 672 1223; x_wconf 96'>country</span> <span class='ocrx_word' id='word_1_81' title='bbox 724 1157 783 1208; x_wconf 96'>of</span> <span class='ocrx_word' id='word_1_82' title='bbox 832 1157 927 1208; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_83' title='bbox 972 1157 1248 1209; x_wconf 90'>Blackfeet</span>
61
+ </span>
62
+ <span class='ocr_line' id='line_1_15' title="bbox 16 1239 1246 1306; baseline 0.003 -18; x_size 69; x_descenders 15; x_ascenders 21"><span class='ocrx_word' id='word_1_84' title='bbox 16 1239 228 1311; x_wconf 88'>Indians</span> <span class='ocrx_word' id='word_1_85' title='bbox 247 1241 356 1310; x_wconf 64'>last</span> <span class='ocrx_word' id='word_1_86' title='bbox 366 1240 599 1288; x_wconf 36'>-Fall.:to</span> <span class='ocrx_word' id='word_1_87' title='bbox 623 1244 795 1315; x_wconf 86'>join.</span> <span class='ocrx_word' id='word_1_88' title='bbox 800 1243 1100 1310; x_wconf 76'>Culverson</span> <span class='ocrx_word' id='word_1_89' title='bbox 1140 1245 1246 1309; x_wconf 96'>and</span>
63
+ </span>
64
+ <span class='ocr_line' id='line_1_16' title="bbox 14 1303 1246 1393; baseline 0.003 -21; x_size 67; x_descenders 16; x_ascenders 16"><span class='ocrx_word' id='word_1_90' title='bbox 14 1309 167 1388; x_wconf 96'>party</span> <span class='ocrx_word' id='word_1_91' title='bbox 195 1307 253 1373; x_wconf 96'>at</span> <span class='ocrx_word' id='word_1_92' title='bbox 282 1305 413 1373; x_wconf 96'>Fort</span> <span class='ocrx_word' id='word_1_93' title='bbox 443 1313 623 1376; x_wconf 96'>Pierre</span> <span class='ocrx_word' id='word_1_94' title='bbox 647 1326 750 1376; x_wconf 93'>and</span> <span class='ocrx_word' id='word_1_95' title='bbox 774 1358 789 1375; x_wconf 91'>¢</span> <span class='ocrx_word' id='word_1_96' title='bbox 773 1313 1089 1393; x_wconf 96'>accompany</span> <span class='ocrx_word' id='word_1_97' title='bbox 1109 1303 1246 1376; x_wconf 96'>them</span>
65
+ </span>
66
+ <span class='ocr_line' id='line_1_17' title="bbox 15 1411 1245 1480; baseline 0.004 -19; x_size 68; x_descenders 17; x_ascenders 15"><span class='ocrx_word' id='word_1_98' title='bbox 15 1419 70 1462; x_wconf 96'>to</span> <span class='ocrx_word' id='word_1_99' title='bbox 99 1411 185 1462; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_100' title='bbox 213 1418 393 1463; x_wconf 96'>states.</span> <span class='ocrx_word' id='word_1_101' title='bbox 432 1412 579 1480; x_wconf 96'>They</span> <span class='ocrx_word' id='word_1_102' title='bbox 608 1414 816 1466; x_wconf 96'>arrived</span> <span class='ocrx_word' id='word_1_103' title='bbox 837 1420 894 1465; x_wconf 96'>at</span> <span class='ocrx_word' id='word_1_104' title='bbox 914 1414 1050 1465; x_wconf 96'>Fort</span> <span class='ocrx_word' id='word_1_105' title='bbox 1067 1414 1245 1466; x_wconf 96'>Pierre</span>
67
+ </span>
68
+ <span class='ocr_line' id='line_1_18' title="bbox 15 1497 1246 1568; baseline 0.002 -18; x_size 70; x_descenders 17; x_ascenders 18"><span class='ocrx_word' id='word_1_106' title='bbox 15 1505 120 1550; x_wconf 96'>two</span> <span class='ocrx_word' id='word_1_107' title='bbox 147 1500 276 1564; x_wconf 96'>days</span> <span class='ocrx_word' id='word_1_108' title='bbox 303 1497 436 1550; x_wconf 92'>after</span> <span class='ocrx_word' id='word_1_109' title='bbox 466 1497 794 1551; x_wconf 74'>Calverson’s</span> <span class='ocrx_word' id='word_1_110' title='bbox 821 1500 1116 1568; x_wconf 96'>departure,</span> <span class='ocrx_word' id='word_1_111' title='bbox 1145 1502 1246 1552; x_wconf 96'>and</span>
69
+ </span>
70
+ <span class='ocr_line' id='line_1_19' title="bbox 15 1586 1249 1656; baseline 0.001 -19; x_size 68; x_descenders 17; x_ascenders 17"><span class='ocrx_word' id='word_1_112' title='bbox 15 1587 222 1638; x_wconf 51'>hurried</span> <span class='ocrx_word' id='word_1_113' title='bbox 252 1603 316 1638; x_wconf 96'>on</span> <span class='ocrx_word' id='word_1_114' title='bbox 343 1586 498 1650; x_wconf 96'>after,</span> <span class='ocrx_word' id='word_1_115' title='bbox 526 1587 578 1637; x_wconf 96'>in</span> <span class='ocrx_word' id='word_1_116' title='bbox 606 1587 693 1640; x_wconf 96'>the</span> <span class='ocrx_word' id='word_1_117' title='bbox 719 1587 853 1656; x_wconf 95'>hope</span> <span class='ocrx_word' id='word_1_118' title='bbox 874 1589 929 1640; x_wconf 95'>of</span> <span class='ocrx_word' id='word_1_119' title='bbox 939 1589 1249 1656; x_wconf 96'>overtaking</span>
71
+ </span>
72
+ <span class='ocr_line' id='line_1_20' title="bbox 18 1672 1248 1742; baseline 0.001 -16; x_size 69; x_descenders 16; x_ascenders 18"><span class='ocrx_word' id='word_1_120' title='bbox 18 1675 160 1734; x_wconf 90'>him.</span> <span class='ocrx_word' id='word_1_121' title='bbox 214 1672 296 1726; x_wconf 92'>On</span> <span class='ocrx_word' id='word_1_122' title='bbox 327 1675 416 1726; x_wconf 95'>the</span> <span class='ocrx_word' id='word_1_123' title='bbox 442 1675 584 1727; x_wconf 96'>third</span> <span class='ocrx_word' id='word_1_124' title='bbox 608 1675 711 1742; x_wconf 96'>day</span> <span class='ocrx_word' id='word_1_125' title='bbox 740 1691 837 1727; x_wconf 96'>one</span> <span class='ocrx_word' id='word_1_126' title='bbox 865 1676 922 1727; x_wconf 96'>of</span> <span class='ocrx_word' id='word_1_127' title='bbox 942 1675 1090 1727; x_wconf 96'>those</span> <span class='ocrx_word' id='word_1_128' title='bbox 1110 1692 1248 1727; x_wconf 96'>snow</span>
73
+ </span>
74
+ </p>
75
+ </div>
76
+ </div>
77
+ </body>
78
+ </html>
@@ -0,0 +1,109 @@
1
+ require 'spec_helper'
2
+ require 'tmpdir'
3
+
4
+ describe NewspaperWorks::ImageTool do
5
+ let(:fixtures) { File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files') }
6
+
7
+ # Image fixtures to test identification, metadata extraction for:
8
+ let(:gray_jp2) { File.join(fixtures, 'ocr_gray.jp2') }
9
+ let(:color_jp2) { File.join(fixtures, '4.1.07.jp2') }
10
+ let(:gray_tiff) { File.join(fixtures, 'ocr_gray.tiff') }
11
+ let(:mono_tiff) { File.join(fixtures, 'ocr_mono.tiff') }
12
+ let(:color_tiff) { File.join(fixtures, '4.1.07.tiff') }
13
+ let(:pdf) { File.join(fixtures, 'minimal-1-page.pdf') }
14
+
15
+ describe "Extracts metadata with JP2 backend" do
16
+ it "constructs with a path" do
17
+ identify = described_class.new(gray_jp2)
18
+ expect(identify.path).to eq gray_jp2
19
+ end
20
+
21
+ it "gets metadata for grayscale JP2 image" do
22
+ result = described_class.new(gray_jp2).metadata
23
+ expect(result[:color]).to eq 'gray'
24
+ expect(result[:width]).to eq 418
25
+ expect(result[:height]).to eq 1046
26
+ expect(result[:bits_per_component]).to eq 8
27
+ expect(result[:num_components]).to eq 1
28
+ end
29
+
30
+ it "gets metadata for color JP2 image" do
31
+ result = described_class.new(color_jp2).metadata
32
+ expect(result[:color]).to eq 'color'
33
+ expect(result[:width]).to eq 256
34
+ expect(result[:height]).to eq 256
35
+ expect(result[:bits_per_component]).to eq 8
36
+ # e.g. is 3, but would be four if sample image had an alpha channel
37
+ expect(result[:num_components]).to eq 3
38
+ end
39
+ end
40
+
41
+ describe "Extracts metadata for non-JP2 images with imagemagick" do
42
+ it "gets metadata for gray TIFF image" do
43
+ result = described_class.new(gray_tiff).metadata
44
+ expect(result[:color]).to eq 'gray'
45
+ expect(result[:width]).to eq 418
46
+ expect(result[:height]).to eq 1046
47
+ expect(result[:bits_per_component]).to eq 8
48
+ expect(result[:num_components]).to eq 1
49
+ end
50
+
51
+ it "gets metadata for monochrome TIFF image" do
52
+ result = described_class.new(mono_tiff).metadata
53
+ expect(result[:color]).to eq 'monochrome'
54
+ expect(result[:width]).to eq 1261
55
+ expect(result[:height]).to eq 1744
56
+ expect(result[:bits_per_component]).to eq 1
57
+ expect(result[:num_components]).to eq 1
58
+ end
59
+
60
+ it "gets metadata for color TIFF image" do
61
+ result = described_class.new(color_tiff).metadata
62
+ expect(result[:color]).to eq 'color'
63
+ expect(result[:width]).to eq 256
64
+ expect(result[:height]).to eq 256
65
+ expect(result[:bits_per_component]).to eq 8
66
+ # e.g. is 3, but would be four if sample image had an alpha channel
67
+ expect(result[:num_components]).to eq 3
68
+ end
69
+
70
+ it "detects mime type of pdf" do
71
+ result = described_class.new(pdf).metadata
72
+ expect(result[:content_type]).to eq 'application/pdf'
73
+ end
74
+ end
75
+
76
+ describe "converts images" do
77
+ it "makes a monochrome TIFF from JP2" do
78
+ tool = described_class.new(gray_jp2)
79
+ dest = File.join(Dir.mktmpdir, 'mono.tif')
80
+ tool.convert(dest, true)
81
+ expect(File.exist?(dest)).to be true
82
+ expect(described_class.new(dest).metadata[:color]).to eq 'monochrome'
83
+ end
84
+
85
+ it "makes a gray TIFF from JP2" do
86
+ tool = described_class.new(gray_jp2)
87
+ dest = File.join(Dir.mktmpdir, 'gray.tif')
88
+ tool.convert(dest, false)
89
+ expect(File.exist?(dest)).to be true
90
+ expect(described_class.new(dest).metadata[:color]).to eq 'gray'
91
+ end
92
+
93
+ it "makes a monochrome TIFF from grayscale TIFF" do
94
+ tool = described_class.new(gray_tiff)
95
+ dest = File.join(Dir.mktmpdir, 'mono.tif')
96
+ tool.convert(dest, true)
97
+ expect(File.exist?(dest)).to be true
98
+ expect(described_class.new(dest).metadata[:color]).to eq 'monochrome'
99
+ end
100
+
101
+ # Not yet supported to use this tool to make JP2, for now the only
102
+ # component in NewspaperWorks doing that is
103
+ # NewspaperWorks::JP2DerivativeService
104
+ it "raises error on JP2 destination" do
105
+ expect { described_class.new(gray_tiff).convert('out.jp2') }.to \
106
+ raise_error(RuntimeError)
107
+ end
108
+ end
109
+ end
@@ -24,13 +24,13 @@ RSpec.shared_examples 'ingest adapter IO' do
24
24
  # define the path to the file we will use for multiple examples
25
25
  let(:path) do
26
26
  fixtures = File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files')
27
- File.join(fixtures, 'page1.tiff')
27
+ File.join(fixtures, 'ocr_mono.tiff')
28
28
  end
29
29
 
30
30
  # DRY for this matcher's use in multiple examples:
31
31
  let(:have_io_and_correct_filename) do
32
32
  have_attributes(
33
- filename: 'page1.tiff',
33
+ filename: 'ocr_mono.tiff',
34
34
  io: an_object_responding_to(:read)
35
35
  )
36
36
  end
@@ -62,7 +62,7 @@ RSpec.shared_examples 'ingest adapter IO' do
62
62
  it "loads a StringIO with filename" do
63
63
  adapter = build(:newspaper_page_ingest)
64
64
  io = StringIO.new('File Content Here, Maybe')
65
- adapter.load(io, filename: 'page1.tiff')
65
+ adapter.load(io, filename: 'ocr_mono.tiff')
66
66
  expect(adapter).to have_io_and_correct_filename
67
67
  end
68
68
 
@@ -8,7 +8,7 @@ RSpec.describe NewspaperWorks::Ingest::NewspaperPageIngest do
8
8
 
9
9
  # define the path to the file we will use for multiple examples
10
10
  let(:path) do
11
- File.join(fixture_path, 'page1.tiff')
11
+ File.join(fixture_path, 'ocr_mono.tiff')
12
12
  end
13
13
 
14
14
  it_behaves_like('ingest adapter IO')
@@ -32,7 +32,7 @@ RSpec.describe NewspaperWorks::Ingest::NewspaperPageIngest do
32
32
 
33
33
  def verify_pcdm_fileset(fileset)
34
34
  # Hyrax always sets label (if not title) on fileset:
35
- expect(fileset.label).to eq 'page1.tiff'
35
+ expect(fileset.label).to eq 'ocr_mono.tiff'
36
36
  # reload file set and check on original file
37
37
  fileset.reload
38
38
  file = fileset.original_file
@@ -0,0 +1,37 @@
1
+ require 'spec_helper'
2
+
3
+ describe NewspaperWorks::JP2ImageMetadata do
4
+ let(:fixtures) { File.join(NewspaperWorks::GEM_PATH, 'spec/fixtures/files') }
5
+
6
+ let(:gray_jp2) { File.join(fixtures, 'ocr_gray.jp2') }
7
+
8
+ let(:color_jp2) { File.join(fixtures, '4.1.07.jp2') }
9
+
10
+ describe "Extracts technical metadata from a JP2 file" do
11
+ it "constructs with a path" do
12
+ meta = described_class.new(gray_jp2)
13
+ expect(meta.path).to eq gray_jp2
14
+ end
15
+
16
+ it "gets metadata for grayscale image" do
17
+ meta = described_class.new(gray_jp2)
18
+ result = meta.technical_metadata
19
+ expect(result[:color]).to eq 'gray'
20
+ expect(result[:width]).to eq 418
21
+ expect(result[:height]).to eq 1046
22
+ expect(result[:bits_per_component]).to eq 8
23
+ expect(result[:num_components]).to eq 1
24
+ end
25
+
26
+ it "gets metadata for color image" do
27
+ meta = described_class.new(color_jp2)
28
+ result = meta.technical_metadata
29
+ expect(result[:color]).to eq 'color'
30
+ expect(result[:width]).to eq 256
31
+ expect(result[:height]).to eq 256
32
+ expect(result[:bits_per_component]).to eq 8
33
+ # e.g. is 3, but would be four if sample image had an alpha channel
34
+ expect(result[:num_components]).to eq 3
35
+ end
36
+ end
37
+ end