newspaper_works 0.1.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. checksums.yaml +5 -5
  2. data/.coveralls.yml +2 -0
  3. data/.gitignore +4 -0
  4. data/.travis.yml +2 -2
  5. data/README.md +14 -13
  6. data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
  7. data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
  8. data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
  9. data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
  10. data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
  11. data/config/locales/newspaper_article.de.yml +1 -1
  12. data/config/locales/newspaper_article.en.yml +1 -1
  13. data/config/locales/newspaper_article.es.yml +1 -1
  14. data/config/locales/newspaper_article.fr.yml +1 -1
  15. data/config/locales/newspaper_article.it.yml +1 -1
  16. data/config/locales/newspaper_article.pt-BR.yml +1 -1
  17. data/config/locales/newspaper_article.zh.yml +1 -1
  18. data/config/locales/newspaper_container.de.yml +1 -1
  19. data/config/locales/newspaper_container.en.yml +1 -1
  20. data/config/locales/newspaper_container.es.yml +1 -1
  21. data/config/locales/newspaper_container.fr.yml +1 -1
  22. data/config/locales/newspaper_container.it.yml +1 -1
  23. data/config/locales/newspaper_container.pt-BR.yml +1 -1
  24. data/config/locales/newspaper_container.zh.yml +1 -1
  25. data/config/locales/newspaper_issue.de.yml +1 -1
  26. data/config/locales/newspaper_issue.en.yml +1 -1
  27. data/config/locales/newspaper_issue.es.yml +1 -1
  28. data/config/locales/newspaper_issue.fr.yml +1 -1
  29. data/config/locales/newspaper_issue.it.yml +2 -2
  30. data/config/locales/newspaper_issue.pt-BR.yml +2 -2
  31. data/config/locales/newspaper_issue.zh.yml +2 -2
  32. data/config/locales/newspaper_page.de.yml +1 -1
  33. data/config/locales/newspaper_page.en.yml +1 -1
  34. data/config/locales/newspaper_page.es.yml +1 -1
  35. data/config/locales/newspaper_page.fr.yml +1 -1
  36. data/config/locales/newspaper_page.it.yml +1 -1
  37. data/config/locales/newspaper_page.pt-BR.yml +1 -1
  38. data/config/locales/newspaper_page.zh.yml +1 -1
  39. data/config/locales/newspaper_title.de.yml +1 -1
  40. data/config/locales/newspaper_title.en.yml +1 -1
  41. data/config/locales/newspaper_title.es.yml +1 -1
  42. data/config/locales/newspaper_title.fr.yml +1 -1
  43. data/config/locales/newspaper_title.it.yml +1 -1
  44. data/config/locales/newspaper_title.pt-BR.yml +1 -1
  45. data/config/locales/newspaper_title.zh.yml +1 -1
  46. data/config/locales/newspaper_works.de.yml +98 -0
  47. data/config/locales/newspaper_works.en.yml +67 -0
  48. data/config/locales/newspaper_works.es.yml +96 -0
  49. data/config/locales/newspaper_works.fr.yml +97 -0
  50. data/config/locales/newspaper_works.it.yml +90 -0
  51. data/config/locales/newspaper_works.pt-BR.yml +96 -0
  52. data/config/locales/newspaper_works.zh.yml +90 -0
  53. data/config/vendor/fits.xml +55 -0
  54. data/config/vendor/imagemagick-6-policy.xml +39 -39
  55. data/lib/newspaper_works.rb +2 -0
  56. data/lib/newspaper_works/image_tool.rb +119 -0
  57. data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
  58. data/lib/newspaper_works/text_extraction.rb +1 -0
  59. data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
  60. data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
  61. data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
  62. data/lib/newspaper_works/version.rb +1 -1
  63. data/newspaper_works.gemspec +2 -3
  64. data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
  65. data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
  66. data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
  67. data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
  68. data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
  69. data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
  70. data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
  71. data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
  72. data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
  73. data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
  74. data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
  75. data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
  76. data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
  77. data/spec/spec_helper.rb +19 -0
  78. metadata +21 -22
@@ -0,0 +1,45 @@
1
+ require 'json'
2
+ require 'nokogiri'
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe NewspaperWorks::TextExtraction::HOCRReader do
6
+ let(:fixture_path) do
7
+ File.join(
8
+ NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files'
9
+ )
10
+ end
11
+
12
+ let(:minimal_path) { File.join(fixture_path, 'ocr_mono_text_hocr.html') }
13
+ let(:minimal) { File.read(minimal_path) }
14
+
15
+ let(:reader_minimal) { described_class.new(minimal) }
16
+ let(:reader_minimal_path) { described_class.new(minimal_path) }
17
+
18
+ describe "reads hOCR" do
19
+ it "loads hOCR either from path or source text" do
20
+ expect(reader_minimal_path.source).to eq reader_minimal.source
21
+ # size here is in Unicode characters, not bytes:
22
+ expect(reader_minimal_path.source.size).to eq 16_590
23
+ end
24
+
25
+ it "loads document stream" do
26
+ expect(reader_minimal_path.doc_stream).to be_kind_of Nokogiri::XML::SAX::Document
27
+ expect(reader_minimal_path.doc_stream).to respond_to :text
28
+ expect(reader_minimal_path.doc_stream).to respond_to :words
29
+ end
30
+ end
31
+
32
+ describe "outputs text derivative formats" do
33
+ it "outputs plain text" do
34
+ plain_text = reader_minimal.text
35
+ expect(plain_text.slice(0, 40)).to eq "_A FEARFUL ADVENTURE.\n‘The Missouri. "
36
+ expect(reader_minimal.text).to eq reader_minimal.doc_stream.text
37
+ expect(reader_minimal.text.size).to eq 831
38
+ end
39
+
40
+ it "passes args to WordCoordsBuilder and receives output" do
41
+ parsed = JSON.parse(reader_minimal.json)
42
+ expect(parsed['coords'].length).to be > 1
43
+ end
44
+ end
45
+ end
@@ -26,7 +26,7 @@ RSpec.describe NewspaperWorks::TextExtraction::PageOCR do
26
26
  expect(words).to be_an(Array)
27
27
  expect(words).not_to be_empty
28
28
  expect(words[0]).to be_a(Hash)
29
- [:word, :x_start, :y_start, :x_end, :y_end].each do |key|
29
+ [:word, :coordinates].each do |key|
30
30
  expect(words[0].keys).to include key
31
31
  end
32
32
  end
@@ -77,8 +77,8 @@ RSpec.describe NewspaperWorks::TextExtraction::PageOCR do
77
77
  word = ocr_from_gray_tiff.words[0]
78
78
  word1 = parsed['coords'][word[:word]]
79
79
  word1_coords = word1[0]
80
- expect(word1_coords[2]).to eq word[:x_end] - word[:x_start]
81
- expect(word1_coords[3]).to eq word[:y_end] - word[:y_start]
80
+ expect(word1_coords[2]).to eq word[:coordinates][2]
81
+ expect(word1_coords[3]).to eq word[:coordinates][3]
82
82
  end
83
83
  end
84
84
  end
@@ -16,20 +16,20 @@ RSpec.describe NewspaperWorks::TextExtraction::RenderAlto do
16
16
 
17
17
  let(:words) do
18
18
  [
19
- { word: 'If', x_start: 52, y_start: 13, x_end: 63, y_end: 27 },
20
- { word: 'you', x_start: 69, y_start: 17, x_end: 100, y_end: 31 },
21
- { word: 'are', x_start: 108, y_start: 17, x_end: 136, y_end: 27 },
22
- { word: 'a', x_start: 143, y_start: 17, x_end: 151, y_end: 27 },
23
- { word: 'friend,', x_start: 158, y_start: 13, x_end: 214, y_end: 29 },
24
- { word: 'you', x_start: 51, y_start: 39, x_end: 82, y_end: 53 },
25
- { word: 'speak', x_start: 90, y_start: 35, x_end: 140, y_end: 53 },
26
- { word: 'the', x_start: 146, y_start: 35, x_end: 174, y_end: 49 },
27
- { word: 'password,', x_start: 182, y_start: 35, x_end: 267, y_end: 53 },
28
- { word: 'and', x_start: 51, y_start: 57, x_end: 81, y_end: 71 },
29
- { word: 'the', x_start: 89, y_start: 57, x_end: 117, y_end: 71 },
30
- { word: 'doors', x_start: 124, y_start: 57, x_end: 172, y_end: 71 },
31
- { word: 'will', x_start: 180, y_start: 57, x_end: 208, y_end: 71 },
32
- { word: 'open.', x_start: 216, y_start: 61, x_end: 263, y_end: 75 }
19
+ { word: "If", coordinates: [52, 13, 11, 14] },
20
+ { word: "you", coordinates: [69, 17, 31, 14] },
21
+ { word: "are", coordinates: [108, 17, 28, 10] },
22
+ { word: "a", coordinates: [143, 17, 8, 10] },
23
+ { word: "friend,", coordinates: [158, 13, 56, 16] },
24
+ { word: "you", coordinates: [51, 39, 31, 14] },
25
+ { word: "speak", coordinates: [90, 35, 50, 18] },
26
+ { word: "the", coordinates: [146, 35, 28, 14] },
27
+ { word: "password,", coordinates: [182, 35, 85, 18] },
28
+ { word: "and", coordinates: [51, 57, 30, 14] },
29
+ { word: "the", coordinates: [89, 57, 28, 14] },
30
+ { word: "doors", coordinates: [124, 57, 48, 14] },
31
+ { word: "will", coordinates: [180, 57, 28, 14] },
32
+ { word: "open.", coordinates: [216, 61, 47, 14] }
33
33
  ]
34
34
  end
35
35
 
@@ -21,30 +21,27 @@ RSpec.describe NewspaperWorks::JP2DerivativeService do
21
21
  Hyrax::DerivativePath.derivative_path_for_reference(file_set, 'jp2')
22
22
  end
23
23
 
24
- def get_res(path)
25
- lines = `gm identify -verbose #{path}`.lines
26
- lines.select { |line| line.strip.start_with?('Geometry') }[0].strip
27
- end
28
-
29
- def check_dpi_match(orig, dest)
30
- # check ppi, but skip pdf to avoid ghostscript warnings to stderr
31
- expect(get_res(orig)).to eq get_res(dest) unless orig.end_with?('pdf')
24
+ def metadata_match_checker(source, target)
25
+ target_meta = NewspaperWorks::ImageTool.new(target).metadata
26
+ source_meta = NewspaperWorks::ImageTool.new(source).metadata
27
+ expect(target_meta[:content_type]).to eq 'image/jp2'
28
+ expect(target_meta[:width]).to eq source_meta[:width]
29
+ expect(target_meta[:height]).to eq source_meta[:height]
32
30
  end
33
31
 
34
32
  def makes_jp2(filename)
35
33
  expected = expected_path(valid_file_set)
36
34
  expect(File.exist?(expected)).to be false
37
35
  svc = described_class.new(valid_file_set)
38
- svc.create_derivatives(source_image(filename))
36
+ source_path = source_image(filename)
37
+ svc.create_derivatives(source_path)
39
38
  expect(File.exist?(expected)).to be true
40
- desc = `gm identify #{expected}`
41
- expect(desc).to include 'JP2'
42
- check_dpi_match(source_image(filename), expected)
39
+ metadata_match_checker(source_path, expected)
43
40
  svc.cleanup_derivatives
44
41
  end
45
42
 
46
43
  it "creates gray JP2 derivative from one-bit source" do
47
- makes_jp2('page1.tiff')
44
+ makes_jp2('ocr_mono.tiff')
48
45
  end
49
46
 
50
47
  it "creates gray JP2 from grayscale source" do
@@ -96,30 +96,32 @@ RSpec.describe NewspaperWorks::NewspaperPageDerivativeService do
96
96
  end
97
97
 
98
98
  it "identifies a source file using ImageMagick" do
99
- expect(service_for_file('4.1.07.tiff').identify).to include 'TIFF'
100
- expect(service_for_file('4.1.07.tiff').identify).to include '8-bit'
99
+ service = service_for_file('4.1.07.tiff')
100
+ expect(service.identify[:content_type]).to eq 'image/tiff'
101
+ expect(service.identify[:bits_per_component]).to eq 8
101
102
  end
102
103
 
103
104
  it "identifies jp2 source" do
104
- # test/verify jp2 source is identified, which relies on GraphicsMagick
105
- expect(service_for_file('4.1.07.jp2').identify).to include 'JP2'
106
- expect(service_for_file('4.1.07.jp2').identify).to include '8-bit'
105
+ # test/verify jp2 source is identified, which relies on JP2 backend
106
+ service = service_for_file('4.1.07.jp2')
107
+ expect(service.identify[:content_type]).to eq 'image/jp2'
108
+ expect(service.identify[:bits_per_component]).to eq 8
107
109
  end
108
110
 
109
111
  it "identifies color and gray sources" do
110
112
  expect(service_for_file('4.1.07.tiff').use_color?).to be true
111
- expect(service_for_file('page1.tiff').use_color?).to be false
113
+ expect(service_for_file('ocr_gray.tiff').use_color?).to be false
112
114
  end
113
115
 
114
116
  it "identifies a one-bit source" do
115
117
  # 1-bit group4 monochrome TIFF:
116
- expect(service_for_file('page1.tiff').one_bit?).to be true
118
+ expect(service_for_file('ocr_mono.tiff').one_bit?).to be true
117
119
  # 8-bit gray TIFF:
118
120
  expect(
119
121
  service_for_file('lowres-gray-via-ndnp-sample.tiff').one_bit?
120
122
  ).to be false
121
123
  # color TIFF:
122
- expect(service_for_file('4.1.07.tif').one_bit?).to be false
124
+ expect(service_for_file('4.1.07.tiff').one_bit?).to be false
123
125
  end
124
126
  end
125
127
  end
@@ -23,10 +23,10 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
23
23
 
24
24
  # given output file name, check DPI is 150
25
25
  def check_dpi(expected)
26
- desc = `gm identify #{expected}`
27
- # get total width of pdf in points from identify, should be 864x == 12in
28
- page_width = 864
29
- expect(desc).to include "#{page_width}x"
26
+ metadata = NewspaperWorks::ImageTool.new(expected).metadata
27
+ # get width of pdf in points (via imagemagick), should be 864 points == 12in
28
+ page_width = metadata[:width]
29
+ expect(page_width).to eq 864
30
30
  # get total width of image in pixels from pdfimages -list, ==> 1800
31
31
  image_width = 1800
32
32
  im_list = `pdfimages -list #{expected}`
@@ -41,14 +41,14 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
41
41
  svc = described_class.new(valid_file_set)
42
42
  svc.create_derivatives(source_image(filename))
43
43
  expect(File.exist?(expected)).to be true
44
- desc = `gm identify #{expected}`
45
- expect(desc).to include 'PDF'
44
+ metadata = NewspaperWorks::ImageTool.new(expected).metadata
45
+ expect(metadata[:content_type]).to eq 'application/pdf'
46
46
  check_dpi(expected)
47
47
  svc.cleanup_derivatives
48
48
  end
49
49
 
50
50
  it "creates gray PDF derivative from one-bit source" do
51
- makes_pdf('page1.tiff')
51
+ makes_pdf('ocr_mono.tiff')
52
52
  end
53
53
 
54
54
  it "creates gray PDF from grayscale source" do
@@ -58,5 +58,9 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
58
58
  it "creates color PDF from color source" do
59
59
  makes_pdf('4.1.07.tiff')
60
60
  end
61
+
62
+ it "creates color PDF from color JP2 source" do
63
+ makes_pdf('4.1.07.jp2')
64
+ end
61
65
  end
62
66
  end
@@ -22,8 +22,8 @@ RSpec.describe NewspaperWorks::TIFFDerivativeService do
22
22
  end
23
23
 
24
24
  def get_res(path)
25
- lines = `gm identify -verbose #{path}`.lines
26
- lines.select { |line| line.strip.start_with?('Geometry') }[0].strip
25
+ tool = NewspaperWorks::ImageTool.new(path)
26
+ "#{tool.metadata[:width]}x#{tool.metadata[:height]}"
27
27
  end
28
28
 
29
29
  def check_dpi_match(orig, dest)
@@ -32,23 +32,30 @@ RSpec.describe NewspaperWorks::TIFFDerivativeService do
32
32
  end
33
33
 
34
34
  def makes_tiff(filename)
35
+ path = source_image(filename)
35
36
  expected = expected_path(valid_file_set)
36
37
  expect(File.exist?(expected)).to be false
37
38
  svc = described_class.new(valid_file_set)
38
- svc.create_derivatives(source_image(filename))
39
+ svc.create_derivatives(path)
39
40
  expect(File.exist?(expected)).to be true
40
- desc = `gm identify #{expected}`
41
- expect(desc).to include 'TIFF'
42
- check_dpi_match(source_image(filename), expected)
41
+ mime = NewspaperWorks::ImageTool.new(expected).metadata[:content_type]
42
+ expect(mime).to eq 'image/tiff'
43
+ check_dpi_match(path, expected)
43
44
  svc.cleanup_derivatives
44
45
  end
45
46
 
46
- it "creates gray TIFF derivative from one-bit source" do
47
- makes_tiff('page1.tiff')
47
+ # for cases where primary file is TIFF already
48
+ def avoids_duplicative_creation(filename)
49
+ expected = expected_path(valid_file_set)
50
+ expect(File.exist?(expected)).to be false
51
+ svc = described_class.new(valid_file_set)
52
+ svc.create_derivatives(source_image(filename))
53
+ expect(File.exist?(expected)).not_to be true
48
54
  end
49
55
 
50
- it "creates gray TIFF from grayscale source" do
51
- makes_tiff('lowres-gray-via-ndnp-sample.tiff')
56
+ it "Does not make TIFF derivatives when primary is TIFF" do
57
+ avoids_duplicative_creation('ocr_mono.tiff')
58
+ avoids_duplicative_creation('ocr_gray.tiff')
52
59
  end
53
60
 
54
61
  it "creates TIFF from PDF source, robust to multi-page" do
@@ -64,6 +64,22 @@ module EngineRoutes
64
64
  end
65
65
  end
66
66
 
67
+ class CSVLoggingFormatter < RSpec::Core::Formatters::JsonFormatter
68
+ RSpec::Core::Formatters.register self
69
+
70
+ def close(_notification)
71
+ with_headers = {
72
+ write_headers: true,
73
+ headers: ['Example', 'Status', 'Run Time', 'Exception']
74
+ }
75
+ CSV.open(output.path, 'w', with_headers) do |csv|
76
+ @output_hash[:examples].map do |ex|
77
+ csv << [ex[:full_description], ex[:status], ex[:run_time], ex[:exception]]
78
+ end
79
+ end
80
+ end
81
+ end
82
+
67
83
  RSpec.configure do |config|
68
84
  # enable FactoryBot:
69
85
  require 'factory_bot'
@@ -226,6 +242,9 @@ RSpec.configure do |config|
226
242
  # config.default_formatter = "doc"
227
243
  # end
228
244
 
245
+ # opt-in CSV logging formatter, set SPEC_CSV environment variable to use:
246
+ config.add_formatter(CSVLoggingFormatter, 'spec_log.csv') unless ENV['SPEC_CSV'].nil?
247
+
229
248
  # Print the 10 slowest examples and example groups at the
230
249
  # end of the spec run, to help surface which specs are running
231
250
  # particularly slow.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: newspaper_works
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sean Upton
@@ -11,22 +11,22 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2019-09-27 00:00:00.000000000 Z
14
+ date: 2019-10-18 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: blacklight_iiif_search
18
18
  requirement: !ruby/object:Gem::Requirement
19
19
  requirements:
20
- - - ">="
20
+ - - "~>"
21
21
  - !ruby/object:Gem::Version
22
- version: '0'
22
+ version: '1.0'
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - ">="
27
+ - - "~>"
28
28
  - !ruby/object:Gem::Version
29
- version: '0'
29
+ version: '1.0'
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: blacklight_advanced_search
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -83,20 +83,6 @@ dependencies:
83
83
  - - "~>"
84
84
  - !ruby/object:Gem::Version
85
85
  version: '5.1'
86
- - !ruby/object:Gem::Dependency
87
- name: rtesseract
88
- requirement: !ruby/object:Gem::Requirement
89
- requirements:
90
- - - "~>"
91
- - !ruby/object:Gem::Version
92
- version: 2.2.0
93
- type: :runtime
94
- prerelease: false
95
- version_requirements: !ruby/object:Gem::Requirement
96
- requirements:
97
- - - "~>"
98
- - !ruby/object:Gem::Version
99
- version: 2.2.0
100
86
  - !ruby/object:Gem::Dependency
101
87
  name: sass-rails
102
88
  requirement: !ruby/object:Gem::Requirement
@@ -371,6 +357,7 @@ executables: []
371
357
  extensions: []
372
358
  extra_rdoc_files: []
373
359
  files:
360
+ - ".coveralls.yml"
374
361
  - ".fcrepo_wrapper"
375
362
  - ".gitignore"
376
363
  - ".rubocop.yml"
@@ -577,6 +564,7 @@ files:
577
564
  - config/test-fixture/solr-config/xslt/example_atom.xsl
578
565
  - config/test-fixture/solr-config/xslt/example_rss.xsl
579
566
  - config/test-fixture/solr-config/xslt/luke.xsl
567
+ - config/vendor/fits.xml
580
568
  - config/vendor/imagemagick-6-policy.xml
581
569
  - db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb
582
570
  - db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb
@@ -603,6 +591,7 @@ files:
603
591
  - lib/newspaper_works/data/work_files.rb
604
592
  - lib/newspaper_works/engine.rb
605
593
  - lib/newspaper_works/errors.rb
594
+ - lib/newspaper_works/image_tool.rb
606
595
  - lib/newspaper_works/ingest.rb
607
596
  - lib/newspaper_works/ingest/base_ingest.rb
608
597
  - lib/newspaper_works/ingest/base_publication_info.rb
@@ -639,11 +628,13 @@ files:
639
628
  - lib/newspaper_works/ingest/pub_finder.rb
640
629
  - lib/newspaper_works/ingest/publication_info.rb
641
630
  - lib/newspaper_works/issue_pdf_composer.rb
631
+ - lib/newspaper_works/jp2_image_metadata.rb
642
632
  - lib/newspaper_works/logging.rb
643
633
  - lib/newspaper_works/page_finder.rb
644
634
  - lib/newspaper_works/resource_fetcher.rb
645
635
  - lib/newspaper_works/text_extraction.rb
646
636
  - lib/newspaper_works/text_extraction/alto_reader.rb
637
+ - lib/newspaper_works/text_extraction/hocr_reader.rb
647
638
  - lib/newspaper_works/text_extraction/page_ocr.rb
648
639
  - lib/newspaper_works/text_extraction/render_alto.rb
649
640
  - lib/newspaper_works/text_extraction/word_coords_builder.rb
@@ -689,6 +680,7 @@ files:
689
680
  - spec/fixtures/files/ocr_gray.jp2
690
681
  - spec/fixtures/files/ocr_gray.tiff
691
682
  - spec/fixtures/files/ocr_mono.tiff
683
+ - spec/fixtures/files/ocr_mono_text_hocr.html
692
684
  - spec/fixtures/files/page1.tiff
693
685
  - spec/fixtures/files/resource_mocks/chronam/http404-expected
694
686
  - spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf
@@ -732,6 +724,7 @@ files:
732
724
  - spec/lib/newspaper_works/data/work_derivatives_spec.rb
733
725
  - spec/lib/newspaper_works/data/work_file_spec.rb
734
726
  - spec/lib/newspaper_works/data/work_files_spec.rb
727
+ - spec/lib/newspaper_works/image_tool_spec.rb
735
728
  - spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb
736
729
  - spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb
737
730
  - spec/lib/newspaper_works/ingest/from_command_spec.rb
@@ -761,10 +754,12 @@ files:
761
754
  - spec/lib/newspaper_works/ingest/publication_info_spec.rb
762
755
  - spec/lib/newspaper_works/ingest_spec.rb
763
756
  - spec/lib/newspaper_works/issue_pdf_composer_spec.rb
757
+ - spec/lib/newspaper_works/jp2_image_metadata_spec.rb
764
758
  - spec/lib/newspaper_works/logging_spec.rb
765
759
  - spec/lib/newspaper_works/page_finder_spec.rb
766
760
  - spec/lib/newspaper_works/resource_fetcher_spec.rb
767
761
  - spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb
762
+ - spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb
768
763
  - spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb
769
764
  - spec/lib/newspaper_works/text_extraction/render_alto_spec.rb
770
765
  - spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb
@@ -830,7 +825,7 @@ files:
830
825
  - test/newspaper_works_test.rb
831
826
  - test/test_helper.rb
832
827
  - tmp/.keep
833
- homepage: https://github.com/marriott-library/newspaper_works
828
+ homepage: https://github.com/samvera-labs/newspaper_works
834
829
  licenses:
835
830
  - Apache-2.0
836
831
  metadata: {}
@@ -850,7 +845,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
850
845
  version: '0'
851
846
  requirements: []
852
847
  rubyforge_project:
853
- rubygems_version: 2.6.14.3
848
+ rubygems_version: 2.7.6.2
854
849
  signing_key:
855
850
  specification_version: 4
856
851
  summary: newspaper_works is a Rails Engine gem providing model and administrative
@@ -896,6 +891,7 @@ test_files:
896
891
  - spec/fixtures/files/ocr_gray.jp2
897
892
  - spec/fixtures/files/ocr_gray.tiff
898
893
  - spec/fixtures/files/ocr_mono.tiff
894
+ - spec/fixtures/files/ocr_mono_text_hocr.html
899
895
  - spec/fixtures/files/page1.tiff
900
896
  - spec/fixtures/files/resource_mocks/chronam/http404-expected
901
897
  - spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf
@@ -939,6 +935,7 @@ test_files:
939
935
  - spec/lib/newspaper_works/data/work_derivatives_spec.rb
940
936
  - spec/lib/newspaper_works/data/work_file_spec.rb
941
937
  - spec/lib/newspaper_works/data/work_files_spec.rb
938
+ - spec/lib/newspaper_works/image_tool_spec.rb
942
939
  - spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb
943
940
  - spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb
944
941
  - spec/lib/newspaper_works/ingest/from_command_spec.rb
@@ -968,10 +965,12 @@ test_files:
968
965
  - spec/lib/newspaper_works/ingest/publication_info_spec.rb
969
966
  - spec/lib/newspaper_works/ingest_spec.rb
970
967
  - spec/lib/newspaper_works/issue_pdf_composer_spec.rb
968
+ - spec/lib/newspaper_works/jp2_image_metadata_spec.rb
971
969
  - spec/lib/newspaper_works/logging_spec.rb
972
970
  - spec/lib/newspaper_works/page_finder_spec.rb
973
971
  - spec/lib/newspaper_works/resource_fetcher_spec.rb
974
972
  - spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb
973
+ - spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb
975
974
  - spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb
976
975
  - spec/lib/newspaper_works/text_extraction/render_alto_spec.rb
977
976
  - spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb