newspaper_works 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. checksums.yaml +5 -5
  2. data/.coveralls.yml +2 -0
  3. data/.gitignore +4 -0
  4. data/.travis.yml +2 -2
  5. data/README.md +14 -13
  6. data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
  7. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
  8. data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
  9. data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
  10. data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
  11. data/config/locales/newspaper_article.de.yml +1 -1
  12. data/config/locales/newspaper_article.en.yml +1 -1
  13. data/config/locales/newspaper_article.es.yml +1 -1
  14. data/config/locales/newspaper_article.fr.yml +1 -1
  15. data/config/locales/newspaper_article.it.yml +1 -1
  16. data/config/locales/newspaper_article.pt-BR.yml +1 -1
  17. data/config/locales/newspaper_article.zh.yml +1 -1
  18. data/config/locales/newspaper_container.de.yml +1 -1
  19. data/config/locales/newspaper_container.en.yml +1 -1
  20. data/config/locales/newspaper_container.es.yml +1 -1
  21. data/config/locales/newspaper_container.fr.yml +1 -1
  22. data/config/locales/newspaper_container.it.yml +1 -1
  23. data/config/locales/newspaper_container.pt-BR.yml +1 -1
  24. data/config/locales/newspaper_container.zh.yml +1 -1
  25. data/config/locales/newspaper_issue.de.yml +1 -1
  26. data/config/locales/newspaper_issue.en.yml +1 -1
  27. data/config/locales/newspaper_issue.es.yml +1 -1
  28. data/config/locales/newspaper_issue.fr.yml +1 -1
  29. data/config/locales/newspaper_issue.it.yml +2 -2
  30. data/config/locales/newspaper_issue.pt-BR.yml +2 -2
  31. data/config/locales/newspaper_issue.zh.yml +2 -2
  32. data/config/locales/newspaper_page.de.yml +1 -1
  33. data/config/locales/newspaper_page.en.yml +1 -1
  34. data/config/locales/newspaper_page.es.yml +1 -1
  35. data/config/locales/newspaper_page.fr.yml +1 -1
  36. data/config/locales/newspaper_page.it.yml +1 -1
  37. data/config/locales/newspaper_page.pt-BR.yml +1 -1
  38. data/config/locales/newspaper_page.zh.yml +1 -1
  39. data/config/locales/newspaper_title.de.yml +1 -1
  40. data/config/locales/newspaper_title.en.yml +1 -1
  41. data/config/locales/newspaper_title.es.yml +1 -1
  42. data/config/locales/newspaper_title.fr.yml +1 -1
  43. data/config/locales/newspaper_title.it.yml +1 -1
  44. data/config/locales/newspaper_title.pt-BR.yml +1 -1
  45. data/config/locales/newspaper_title.zh.yml +1 -1
  46. data/config/locales/newspaper_works.de.yml +98 -0
  47. data/config/locales/newspaper_works.en.yml +67 -0
  48. data/config/locales/newspaper_works.es.yml +96 -0
  49. data/config/locales/newspaper_works.fr.yml +97 -0
  50. data/config/locales/newspaper_works.it.yml +90 -0
  51. data/config/locales/newspaper_works.pt-BR.yml +96 -0
  52. data/config/locales/newspaper_works.zh.yml +90 -0
  53. data/config/vendor/fits.xml +55 -0
  54. data/config/vendor/imagemagick-6-policy.xml +39 -39
  55. data/lib/newspaper_works.rb +2 -0
  56. data/lib/newspaper_works/image_tool.rb +119 -0
  57. data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
  58. data/lib/newspaper_works/text_extraction.rb +1 -0
  59. data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
  60. data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
  61. data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
  62. data/lib/newspaper_works/version.rb +1 -1
  63. data/newspaper_works.gemspec +2 -3
  64. data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
  65. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  66. data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
  67. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
  68. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
  69. data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
  70. data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
  71. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
  72. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
  73. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
  74. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
  75. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
  76. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
  77. data/spec/spec_helper.rb +19 -0
  78. metadata +21 -22
@@ -0,0 +1,45 @@
1
+ require 'json'
2
+ require 'nokogiri'
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe NewspaperWorks::TextExtraction::HOCRReader do
6
+ let(:fixture_path) do
7
+ File.join(
8
+ NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files'
9
+ )
10
+ end
11
+
12
+ let(:minimal_path) { File.join(fixture_path, 'ocr_mono_text_hocr.html') }
13
+ let(:minimal) { File.read(minimal_path) }
14
+
15
+ let(:reader_minimal) { described_class.new(minimal) }
16
+ let(:reader_minimal_path) { described_class.new(minimal_path) }
17
+
18
+ describe "reads hOCR" do
19
+ it "loads hOCR either from path or source text" do
20
+ expect(reader_minimal_path.source).to eq reader_minimal.source
21
+ # size here is in Unicode characters, not bytes:
22
+ expect(reader_minimal_path.source.size).to eq 16_590
23
+ end
24
+
25
+ it "loads document stream" do
26
+ expect(reader_minimal_path.doc_stream).to be_kind_of Nokogiri::XML::SAX::Document
27
+ expect(reader_minimal_path.doc_stream).to respond_to :text
28
+ expect(reader_minimal_path.doc_stream).to respond_to :words
29
+ end
30
+ end
31
+
32
+ describe "outputs text derivative formats" do
33
+ it "outputs plain text" do
34
+ plain_text = reader_minimal.text
35
+ expect(plain_text.slice(0, 40)).to eq "_A FEARFUL ADVENTURE.\n‘The Missouri. "
36
+ expect(reader_minimal.text).to eq reader_minimal.doc_stream.text
37
+ expect(reader_minimal.text.size).to eq 831
38
+ end
39
+
40
+ it "passes args to WordCoordsBuilder and receives output" do
41
+ parsed = JSON.parse(reader_minimal.json)
42
+ expect(parsed['coords'].length).to be > 1
43
+ end
44
+ end
45
+ end
@@ -26,7 +26,7 @@ RSpec.describe NewspaperWorks::TextExtraction::PageOCR do
26
26
  expect(words).to be_an(Array)
27
27
  expect(words).not_to be_empty
28
28
  expect(words[0]).to be_a(Hash)
29
- [:word, :x_start, :y_start, :x_end, :y_end].each do |key|
29
+ [:word, :coordinates].each do |key|
30
30
  expect(words[0].keys).to include key
31
31
  end
32
32
  end
@@ -77,8 +77,8 @@ RSpec.describe NewspaperWorks::TextExtraction::PageOCR do
77
77
  word = ocr_from_gray_tiff.words[0]
78
78
  word1 = parsed['coords'][word[:word]]
79
79
  word1_coords = word1[0]
80
- expect(word1_coords[2]).to eq word[:x_end] - word[:x_start]
81
- expect(word1_coords[3]).to eq word[:y_end] - word[:y_start]
80
+ expect(word1_coords[2]).to eq word[:coordinates][2]
81
+ expect(word1_coords[3]).to eq word[:coordinates][3]
82
82
  end
83
83
  end
84
84
  end
@@ -16,20 +16,20 @@ RSpec.describe NewspaperWorks::TextExtraction::RenderAlto do
16
16
 
17
17
  let(:words) do
18
18
  [
19
- { word: 'If', x_start: 52, y_start: 13, x_end: 63, y_end: 27 },
20
- { word: 'you', x_start: 69, y_start: 17, x_end: 100, y_end: 31 },
21
- { word: 'are', x_start: 108, y_start: 17, x_end: 136, y_end: 27 },
22
- { word: 'a', x_start: 143, y_start: 17, x_end: 151, y_end: 27 },
23
- { word: 'friend,', x_start: 158, y_start: 13, x_end: 214, y_end: 29 },
24
- { word: 'you', x_start: 51, y_start: 39, x_end: 82, y_end: 53 },
25
- { word: 'speak', x_start: 90, y_start: 35, x_end: 140, y_end: 53 },
26
- { word: 'the', x_start: 146, y_start: 35, x_end: 174, y_end: 49 },
27
- { word: 'password,', x_start: 182, y_start: 35, x_end: 267, y_end: 53 },
28
- { word: 'and', x_start: 51, y_start: 57, x_end: 81, y_end: 71 },
29
- { word: 'the', x_start: 89, y_start: 57, x_end: 117, y_end: 71 },
30
- { word: 'doors', x_start: 124, y_start: 57, x_end: 172, y_end: 71 },
31
- { word: 'will', x_start: 180, y_start: 57, x_end: 208, y_end: 71 },
32
- { word: 'open.', x_start: 216, y_start: 61, x_end: 263, y_end: 75 }
19
+ { word: "If", coordinates: [52, 13, 11, 14] },
20
+ { word: "you", coordinates: [69, 17, 31, 14] },
21
+ { word: "are", coordinates: [108, 17, 28, 10] },
22
+ { word: "a", coordinates: [143, 17, 8, 10] },
23
+ { word: "friend,", coordinates: [158, 13, 56, 16] },
24
+ { word: "you", coordinates: [51, 39, 31, 14] },
25
+ { word: "speak", coordinates: [90, 35, 50, 18] },
26
+ { word: "the", coordinates: [146, 35, 28, 14] },
27
+ { word: "password,", coordinates: [182, 35, 85, 18] },
28
+ { word: "and", coordinates: [51, 57, 30, 14] },
29
+ { word: "the", coordinates: [89, 57, 28, 14] },
30
+ { word: "doors", coordinates: [124, 57, 48, 14] },
31
+ { word: "will", coordinates: [180, 57, 28, 14] },
32
+ { word: "open.", coordinates: [216, 61, 47, 14] }
33
33
  ]
34
34
  end
35
35
 
@@ -21,30 +21,27 @@ RSpec.describe NewspaperWorks::JP2DerivativeService do
21
21
  Hyrax::DerivativePath.derivative_path_for_reference(file_set, 'jp2')
22
22
  end
23
23
 
24
- def get_res(path)
25
- lines = `gm identify -verbose #{path}`.lines
26
- lines.select { |line| line.strip.start_with?('Geometry') }[0].strip
27
- end
28
-
29
- def check_dpi_match(orig, dest)
30
- # check ppi, but skip pdf to avoid ghostscript warnings to stderr
31
- expect(get_res(orig)).to eq get_res(dest) unless orig.end_with?('pdf')
24
+ def metadata_match_checker(source, target)
25
+ target_meta = NewspaperWorks::ImageTool.new(target).metadata
26
+ source_meta = NewspaperWorks::ImageTool.new(source).metadata
27
+ expect(target_meta[:content_type]).to eq 'image/jp2'
28
+ expect(target_meta[:width]).to eq source_meta[:width]
29
+ expect(target_meta[:height]).to eq source_meta[:height]
32
30
  end
33
31
 
34
32
  def makes_jp2(filename)
35
33
  expected = expected_path(valid_file_set)
36
34
  expect(File.exist?(expected)).to be false
37
35
  svc = described_class.new(valid_file_set)
38
- svc.create_derivatives(source_image(filename))
36
+ source_path = source_image(filename)
37
+ svc.create_derivatives(source_path)
39
38
  expect(File.exist?(expected)).to be true
40
- desc = `gm identify #{expected}`
41
- expect(desc).to include 'JP2'
42
- check_dpi_match(source_image(filename), expected)
39
+ metadata_match_checker(source_path, expected)
43
40
  svc.cleanup_derivatives
44
41
  end
45
42
 
46
43
  it "creates gray JP2 derivative from one-bit source" do
47
- makes_jp2('page1.tiff')
44
+ makes_jp2('ocr_mono.tiff')
48
45
  end
49
46
 
50
47
  it "creates gray JP2 from grayscale source" do
@@ -96,30 +96,32 @@ RSpec.describe NewspaperWorks::NewspaperPageDerivativeService do
96
96
  end
97
97
 
98
98
  it "identifies a source file using ImageMagick" do
99
- expect(service_for_file('4.1.07.tiff').identify).to include 'TIFF'
100
- expect(service_for_file('4.1.07.tiff').identify).to include '8-bit'
99
+ service = service_for_file('4.1.07.tiff')
100
+ expect(service.identify[:content_type]).to eq 'image/tiff'
101
+ expect(service.identify[:bits_per_component]).to eq 8
101
102
  end
102
103
 
103
104
  it "identifies jp2 source" do
104
- # test/verify jp2 source is identified, which relies on GraphicsMagick
105
- expect(service_for_file('4.1.07.jp2').identify).to include 'JP2'
106
- expect(service_for_file('4.1.07.jp2').identify).to include '8-bit'
105
+ # test/verify jp2 source is identified, which relies on JP2 backend
106
+ service = service_for_file('4.1.07.jp2')
107
+ expect(service.identify[:content_type]).to eq 'image/jp2'
108
+ expect(service.identify[:bits_per_component]).to eq 8
107
109
  end
108
110
 
109
111
  it "identifies color and gray sources" do
110
112
  expect(service_for_file('4.1.07.tiff').use_color?).to be true
111
- expect(service_for_file('page1.tiff').use_color?).to be false
113
+ expect(service_for_file('ocr_gray.tiff').use_color?).to be false
112
114
  end
113
115
 
114
116
  it "identifies a one-bit source" do
115
117
  # 1-bit group4 monochrome TIFF:
116
- expect(service_for_file('page1.tiff').one_bit?).to be true
118
+ expect(service_for_file('ocr_mono.tiff').one_bit?).to be true
117
119
  # 8-bit gray TIFF:
118
120
  expect(
119
121
  service_for_file('lowres-gray-via-ndnp-sample.tiff').one_bit?
120
122
  ).to be false
121
123
  # color TIFF:
122
- expect(service_for_file('4.1.07.tif').one_bit?).to be false
124
+ expect(service_for_file('4.1.07.tiff').one_bit?).to be false
123
125
  end
124
126
  end
125
127
  end
@@ -23,10 +23,10 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
23
23
 
24
24
  # given output file name, check DPI is 150
25
25
  def check_dpi(expected)
26
- desc = `gm identify #{expected}`
27
- # get total width of pdf in points from identify, should be 864x == 12in
28
- page_width = 864
29
- expect(desc).to include "#{page_width}x"
26
+ metadata = NewspaperWorks::ImageTool.new(expected).metadata
27
+ # get width of pdf in points (via imagemagick), should be 864x == 12in
28
+ page_width = metadata[:width]
29
+ expect(page_width).to eq 864
30
30
  # get total width of image in pixels from pdfimages -list, ==> 1800
31
31
  image_width = 1800
32
32
  im_list = `pdfimages -list #{expected}`
@@ -41,14 +41,14 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
41
41
  svc = described_class.new(valid_file_set)
42
42
  svc.create_derivatives(source_image(filename))
43
43
  expect(File.exist?(expected)).to be true
44
- desc = `gm identify #{expected}`
45
- expect(desc).to include 'PDF'
44
+ metadata = NewspaperWorks::ImageTool.new(expected).metadata
45
+ expect(metadata[:content_type]).to eq 'application/pdf'
46
46
  check_dpi(expected)
47
47
  svc.cleanup_derivatives
48
48
  end
49
49
 
50
50
  it "creates gray PDF derivative from one-bit source" do
51
- makes_pdf('page1.tiff')
51
+ makes_pdf('ocr_mono.tiff')
52
52
  end
53
53
 
54
54
  it "creates gray PDF from grayscale source" do
@@ -58,5 +58,9 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
58
58
  it "creates color PDF from color source" do
59
59
  makes_pdf('4.1.07.tiff')
60
60
  end
61
+
62
+ it "creates color PDF from color JP2 source" do
63
+ makes_pdf('4.1.07.jp2')
64
+ end
61
65
  end
62
66
  end
@@ -22,8 +22,8 @@ RSpec.describe NewspaperWorks::TIFFDerivativeService do
22
22
  end
23
23
 
24
24
  def get_res(path)
25
- lines = `gm identify -verbose #{path}`.lines
26
- lines.select { |line| line.strip.start_with?('Geometry') }[0].strip
25
+ tool = NewspaperWorks::ImageTool.new(path)
26
+ "#{tool.metadata[:width]}x#{tool.metadata[:height]}"
27
27
  end
28
28
 
29
29
  def check_dpi_match(orig, dest)
@@ -32,23 +32,30 @@ RSpec.describe NewspaperWorks::TIFFDerivativeService do
32
32
  end
33
33
 
34
34
  def makes_tiff(filename)
35
+ path = source_image(filename)
35
36
  expected = expected_path(valid_file_set)
36
37
  expect(File.exist?(expected)).to be false
37
38
  svc = described_class.new(valid_file_set)
38
- svc.create_derivatives(source_image(filename))
39
+ svc.create_derivatives(path)
39
40
  expect(File.exist?(expected)).to be true
40
- desc = `gm identify #{expected}`
41
- expect(desc).to include 'TIFF'
42
- check_dpi_match(source_image(filename), expected)
41
+ mime = NewspaperWorks::ImageTool.new(expected).metadata[:content_type]
42
+ expect(mime).to eq 'image/tiff'
43
+ check_dpi_match(path, expected)
43
44
  svc.cleanup_derivatives
44
45
  end
45
46
 
46
- it "creates gray TIFF derivative from one-bit source" do
47
- makes_tiff('page1.tiff')
47
+ # for cases where primary file is TIFF already
48
+ def avoids_duplicative_creation(filename)
49
+ expected = expected_path(valid_file_set)
50
+ expect(File.exist?(expected)).to be false
51
+ svc = described_class.new(valid_file_set)
52
+ svc.create_derivatives(source_image(filename))
53
+ expect(File.exist?(expected)).not_to be true
48
54
  end
49
55
 
50
- it "creates gray TIFF from grayscale source" do
51
- makes_tiff('lowres-gray-via-ndnp-sample.tiff')
56
+ it "Does not make TIFF derivatives when primary is TIFF" do
57
+ avoids_duplicative_creation('ocr_mono.tiff')
58
+ avoids_duplicative_creation('ocr_gray.tiff')
52
59
  end
53
60
 
54
61
  it "creates TIFF from PDF source, robust to multi-page" do
@@ -64,6 +64,22 @@ module EngineRoutes
64
64
  end
65
65
  end
66
66
 
67
+ class CSVLoggingFormatter < RSpec::Core::Formatters::JsonFormatter
68
+ RSpec::Core::Formatters.register self
69
+
70
+ def close(_notification)
71
+ with_headers = {
72
+ write_headers: true,
73
+ headers: ['Example', 'Status', 'Run Time', 'Exception']
74
+ }
75
+ CSV.open(output.path, 'w', with_headers) do |csv|
76
+ @output_hash[:examples].map do |ex|
77
+ csv << [ex[:full_description], ex[:status], ex[:run_time], ex[:exception]]
78
+ end
79
+ end
80
+ end
81
+ end
82
+
67
83
  RSpec.configure do |config|
68
84
  # enable FactoryBot:
69
85
  require 'factory_bot'
@@ -226,6 +242,9 @@ RSpec.configure do |config|
226
242
  # config.default_formatter = "doc"
227
243
  # end
228
244
 
245
+ # opt-in CSV logging formatter, set SPEC_CSV environment variable to use:
246
+ config.add_formatter(CSVLoggingFormatter, 'spec_log.csv') unless ENV['SPEC_CSV'].nil?
247
+
229
248
  # Print the 10 slowest examples and example groups at the
230
249
  # end of the spec run, to help surface which specs are running
231
250
  # particularly slow.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: newspaper_works
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sean Upton
@@ -11,22 +11,22 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2019-09-27 00:00:00.000000000 Z
14
+ date: 2019-10-18 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: blacklight_iiif_search
18
18
  requirement: !ruby/object:Gem::Requirement
19
19
  requirements:
20
- - - ">="
20
+ - - "~>"
21
21
  - !ruby/object:Gem::Version
22
- version: '0'
22
+ version: '1.0'
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - ">="
27
+ - - "~>"
28
28
  - !ruby/object:Gem::Version
29
- version: '0'
29
+ version: '1.0'
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: blacklight_advanced_search
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -83,20 +83,6 @@ dependencies:
83
83
  - - "~>"
84
84
  - !ruby/object:Gem::Version
85
85
  version: '5.1'
86
- - !ruby/object:Gem::Dependency
87
- name: rtesseract
88
- requirement: !ruby/object:Gem::Requirement
89
- requirements:
90
- - - "~>"
91
- - !ruby/object:Gem::Version
92
- version: 2.2.0
93
- type: :runtime
94
- prerelease: false
95
- version_requirements: !ruby/object:Gem::Requirement
96
- requirements:
97
- - - "~>"
98
- - !ruby/object:Gem::Version
99
- version: 2.2.0
100
86
  - !ruby/object:Gem::Dependency
101
87
  name: sass-rails
102
88
  requirement: !ruby/object:Gem::Requirement
@@ -371,6 +357,7 @@ executables: []
371
357
  extensions: []
372
358
  extra_rdoc_files: []
373
359
  files:
360
+ - ".coveralls.yml"
374
361
  - ".fcrepo_wrapper"
375
362
  - ".gitignore"
376
363
  - ".rubocop.yml"
@@ -577,6 +564,7 @@ files:
577
564
  - config/test-fixture/solr-config/xslt/example_atom.xsl
578
565
  - config/test-fixture/solr-config/xslt/example_rss.xsl
579
566
  - config/test-fixture/solr-config/xslt/luke.xsl
567
+ - config/vendor/fits.xml
580
568
  - config/vendor/imagemagick-6-policy.xml
581
569
  - db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb
582
570
  - db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb
@@ -603,6 +591,7 @@ files:
603
591
  - lib/newspaper_works/data/work_files.rb
604
592
  - lib/newspaper_works/engine.rb
605
593
  - lib/newspaper_works/errors.rb
594
+ - lib/newspaper_works/image_tool.rb
606
595
  - lib/newspaper_works/ingest.rb
607
596
  - lib/newspaper_works/ingest/base_ingest.rb
608
597
  - lib/newspaper_works/ingest/base_publication_info.rb
@@ -639,11 +628,13 @@ files:
639
628
  - lib/newspaper_works/ingest/pub_finder.rb
640
629
  - lib/newspaper_works/ingest/publication_info.rb
641
630
  - lib/newspaper_works/issue_pdf_composer.rb
631
+ - lib/newspaper_works/jp2_image_metadata.rb
642
632
  - lib/newspaper_works/logging.rb
643
633
  - lib/newspaper_works/page_finder.rb
644
634
  - lib/newspaper_works/resource_fetcher.rb
645
635
  - lib/newspaper_works/text_extraction.rb
646
636
  - lib/newspaper_works/text_extraction/alto_reader.rb
637
+ - lib/newspaper_works/text_extraction/hocr_reader.rb
647
638
  - lib/newspaper_works/text_extraction/page_ocr.rb
648
639
  - lib/newspaper_works/text_extraction/render_alto.rb
649
640
  - lib/newspaper_works/text_extraction/word_coords_builder.rb
@@ -689,6 +680,7 @@ files:
689
680
  - spec/fixtures/files/ocr_gray.jp2
690
681
  - spec/fixtures/files/ocr_gray.tiff
691
682
  - spec/fixtures/files/ocr_mono.tiff
683
+ - spec/fixtures/files/ocr_mono_text_hocr.html
692
684
  - spec/fixtures/files/page1.tiff
693
685
  - spec/fixtures/files/resource_mocks/chronam/http404-expected
694
686
  - spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf
@@ -732,6 +724,7 @@ files:
732
724
  - spec/lib/newspaper_works/data/work_derivatives_spec.rb
733
725
  - spec/lib/newspaper_works/data/work_file_spec.rb
734
726
  - spec/lib/newspaper_works/data/work_files_spec.rb
727
+ - spec/lib/newspaper_works/image_tool_spec.rb
735
728
  - spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb
736
729
  - spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb
737
730
  - spec/lib/newspaper_works/ingest/from_command_spec.rb
@@ -761,10 +754,12 @@ files:
761
754
  - spec/lib/newspaper_works/ingest/publication_info_spec.rb
762
755
  - spec/lib/newspaper_works/ingest_spec.rb
763
756
  - spec/lib/newspaper_works/issue_pdf_composer_spec.rb
757
+ - spec/lib/newspaper_works/jp2_image_metadata_spec.rb
764
758
  - spec/lib/newspaper_works/logging_spec.rb
765
759
  - spec/lib/newspaper_works/page_finder_spec.rb
766
760
  - spec/lib/newspaper_works/resource_fetcher_spec.rb
767
761
  - spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb
762
+ - spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb
768
763
  - spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb
769
764
  - spec/lib/newspaper_works/text_extraction/render_alto_spec.rb
770
765
  - spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb
@@ -830,7 +825,7 @@ files:
830
825
  - test/newspaper_works_test.rb
831
826
  - test/test_helper.rb
832
827
  - tmp/.keep
833
- homepage: https://github.com/marriott-library/newspaper_works
828
+ homepage: https://github.com/samvera-labs/newspaper_works
834
829
  licenses:
835
830
  - Apache-2.0
836
831
  metadata: {}
@@ -850,7 +845,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
850
845
  version: '0'
851
846
  requirements: []
852
847
  rubyforge_project:
853
- rubygems_version: 2.6.14.3
848
+ rubygems_version: 2.7.6.2
854
849
  signing_key:
855
850
  specification_version: 4
856
851
  summary: newspaper_works is a Rails Engine gem providing model and administrative
@@ -896,6 +891,7 @@ test_files:
896
891
  - spec/fixtures/files/ocr_gray.jp2
897
892
  - spec/fixtures/files/ocr_gray.tiff
898
893
  - spec/fixtures/files/ocr_mono.tiff
894
+ - spec/fixtures/files/ocr_mono_text_hocr.html
899
895
  - spec/fixtures/files/page1.tiff
900
896
  - spec/fixtures/files/resource_mocks/chronam/http404-expected
901
897
  - spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf
@@ -939,6 +935,7 @@ test_files:
939
935
  - spec/lib/newspaper_works/data/work_derivatives_spec.rb
940
936
  - spec/lib/newspaper_works/data/work_file_spec.rb
941
937
  - spec/lib/newspaper_works/data/work_files_spec.rb
938
+ - spec/lib/newspaper_works/image_tool_spec.rb
942
939
  - spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb
943
940
  - spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb
944
941
  - spec/lib/newspaper_works/ingest/from_command_spec.rb
@@ -968,10 +965,12 @@ test_files:
968
965
  - spec/lib/newspaper_works/ingest/publication_info_spec.rb
969
966
  - spec/lib/newspaper_works/ingest_spec.rb
970
967
  - spec/lib/newspaper_works/issue_pdf_composer_spec.rb
968
+ - spec/lib/newspaper_works/jp2_image_metadata_spec.rb
971
969
  - spec/lib/newspaper_works/logging_spec.rb
972
970
  - spec/lib/newspaper_works/page_finder_spec.rb
973
971
  - spec/lib/newspaper_works/resource_fetcher_spec.rb
974
972
  - spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb
973
+ - spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb
975
974
  - spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb
976
975
  - spec/lib/newspaper_works/text_extraction/render_alto_spec.rb
977
976
  - spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb