newspaper_works 0.1.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.coveralls.yml +2 -0
- data/.gitignore +4 -0
- data/.travis.yml +2 -2
- data/README.md +14 -13
- data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
- data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
- data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
- data/config/locales/newspaper_article.de.yml +1 -1
- data/config/locales/newspaper_article.en.yml +1 -1
- data/config/locales/newspaper_article.es.yml +1 -1
- data/config/locales/newspaper_article.fr.yml +1 -1
- data/config/locales/newspaper_article.it.yml +1 -1
- data/config/locales/newspaper_article.pt-BR.yml +1 -1
- data/config/locales/newspaper_article.zh.yml +1 -1
- data/config/locales/newspaper_container.de.yml +1 -1
- data/config/locales/newspaper_container.en.yml +1 -1
- data/config/locales/newspaper_container.es.yml +1 -1
- data/config/locales/newspaper_container.fr.yml +1 -1
- data/config/locales/newspaper_container.it.yml +1 -1
- data/config/locales/newspaper_container.pt-BR.yml +1 -1
- data/config/locales/newspaper_container.zh.yml +1 -1
- data/config/locales/newspaper_issue.de.yml +1 -1
- data/config/locales/newspaper_issue.en.yml +1 -1
- data/config/locales/newspaper_issue.es.yml +1 -1
- data/config/locales/newspaper_issue.fr.yml +1 -1
- data/config/locales/newspaper_issue.it.yml +2 -2
- data/config/locales/newspaper_issue.pt-BR.yml +2 -2
- data/config/locales/newspaper_issue.zh.yml +2 -2
- data/config/locales/newspaper_page.de.yml +1 -1
- data/config/locales/newspaper_page.en.yml +1 -1
- data/config/locales/newspaper_page.es.yml +1 -1
- data/config/locales/newspaper_page.fr.yml +1 -1
- data/config/locales/newspaper_page.it.yml +1 -1
- data/config/locales/newspaper_page.pt-BR.yml +1 -1
- data/config/locales/newspaper_page.zh.yml +1 -1
- data/config/locales/newspaper_title.de.yml +1 -1
- data/config/locales/newspaper_title.en.yml +1 -1
- data/config/locales/newspaper_title.es.yml +1 -1
- data/config/locales/newspaper_title.fr.yml +1 -1
- data/config/locales/newspaper_title.it.yml +1 -1
- data/config/locales/newspaper_title.pt-BR.yml +1 -1
- data/config/locales/newspaper_title.zh.yml +1 -1
- data/config/locales/newspaper_works.de.yml +98 -0
- data/config/locales/newspaper_works.en.yml +67 -0
- data/config/locales/newspaper_works.es.yml +96 -0
- data/config/locales/newspaper_works.fr.yml +97 -0
- data/config/locales/newspaper_works.it.yml +90 -0
- data/config/locales/newspaper_works.pt-BR.yml +96 -0
- data/config/locales/newspaper_works.zh.yml +90 -0
- data/config/vendor/fits.xml +55 -0
- data/config/vendor/imagemagick-6-policy.xml +39 -39
- data/lib/newspaper_works.rb +2 -0
- data/lib/newspaper_works/image_tool.rb +119 -0
- data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
- data/lib/newspaper_works/text_extraction.rb +1 -0
- data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
- data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
- data/lib/newspaper_works/version.rb +1 -1
- data/newspaper_works.gemspec +2 -3
- data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
- data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
- data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
- data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
- data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
- data/spec/spec_helper.rb +19 -0
- metadata +21 -22
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
RSpec.describe NewspaperWorks::TextExtraction::HOCRReader do
|
6
|
+
let(:fixture_path) do
|
7
|
+
File.join(
|
8
|
+
NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files'
|
9
|
+
)
|
10
|
+
end
|
11
|
+
|
12
|
+
let(:minimal_path) { File.join(fixture_path, 'ocr_mono_text_hocr.html') }
|
13
|
+
let(:minimal) { File.read(minimal_path) }
|
14
|
+
|
15
|
+
let(:reader_minimal) { described_class.new(minimal) }
|
16
|
+
let(:reader_minimal_path) { described_class.new(minimal_path) }
|
17
|
+
|
18
|
+
describe "reads hOCR" do
|
19
|
+
it "loads hOCR either from path or source text" do
|
20
|
+
expect(reader_minimal_path.source).to eq reader_minimal.source
|
21
|
+
# size here is in Unicode characters, not bytes:
|
22
|
+
expect(reader_minimal_path.source.size).to eq 16_590
|
23
|
+
end
|
24
|
+
|
25
|
+
it "loads document stream" do
|
26
|
+
expect(reader_minimal_path.doc_stream).to be_kind_of Nokogiri::XML::SAX::Document
|
27
|
+
expect(reader_minimal_path.doc_stream).to respond_to :text
|
28
|
+
expect(reader_minimal_path.doc_stream).to respond_to :words
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
describe "outputs text derivative formats" do
|
33
|
+
it "outputs plain text" do
|
34
|
+
plain_text = reader_minimal.text
|
35
|
+
expect(plain_text.slice(0, 40)).to eq "_A FEARFUL ADVENTURE.\n‘The Missouri. "
|
36
|
+
expect(reader_minimal.text).to eq reader_minimal.doc_stream.text
|
37
|
+
expect(reader_minimal.text.size).to eq 831
|
38
|
+
end
|
39
|
+
|
40
|
+
it "passes args to WordCoordsBuilder and receives output" do
|
41
|
+
parsed = JSON.parse(reader_minimal.json)
|
42
|
+
expect(parsed['coords'].length).to be > 1
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -26,7 +26,7 @@ RSpec.describe NewspaperWorks::TextExtraction::PageOCR do
|
|
26
26
|
expect(words).to be_an(Array)
|
27
27
|
expect(words).not_to be_empty
|
28
28
|
expect(words[0]).to be_a(Hash)
|
29
|
-
[:word, :
|
29
|
+
[:word, :coordinates].each do |key|
|
30
30
|
expect(words[0].keys).to include key
|
31
31
|
end
|
32
32
|
end
|
@@ -77,8 +77,8 @@ RSpec.describe NewspaperWorks::TextExtraction::PageOCR do
|
|
77
77
|
word = ocr_from_gray_tiff.words[0]
|
78
78
|
word1 = parsed['coords'][word[:word]]
|
79
79
|
word1_coords = word1[0]
|
80
|
-
expect(word1_coords[2]).to eq word[:
|
81
|
-
expect(word1_coords[3]).to eq word[:
|
80
|
+
expect(word1_coords[2]).to eq word[:coordinates][2]
|
81
|
+
expect(word1_coords[3]).to eq word[:coordinates][3]
|
82
82
|
end
|
83
83
|
end
|
84
84
|
end
|
@@ -16,20 +16,20 @@ RSpec.describe NewspaperWorks::TextExtraction::RenderAlto do
|
|
16
16
|
|
17
17
|
let(:words) do
|
18
18
|
[
|
19
|
-
{ word:
|
20
|
-
{ word:
|
21
|
-
{ word:
|
22
|
-
{ word:
|
23
|
-
{ word:
|
24
|
-
{ word:
|
25
|
-
{ word:
|
26
|
-
{ word:
|
27
|
-
{ word:
|
28
|
-
{ word:
|
29
|
-
{ word:
|
30
|
-
{ word:
|
31
|
-
{ word:
|
32
|
-
{ word:
|
19
|
+
{ word: "If", coordinates: [52, 13, 11, 14] },
|
20
|
+
{ word: "you", coordinates: [69, 17, 31, 14] },
|
21
|
+
{ word: "are", coordinates: [108, 17, 28, 10] },
|
22
|
+
{ word: "a", coordinates: [143, 17, 8, 10] },
|
23
|
+
{ word: "friend,", coordinates: [158, 13, 56, 16] },
|
24
|
+
{ word: "you", coordinates: [51, 39, 31, 14] },
|
25
|
+
{ word: "speak", coordinates: [90, 35, 50, 18] },
|
26
|
+
{ word: "the", coordinates: [146, 35, 28, 14] },
|
27
|
+
{ word: "password,", coordinates: [182, 35, 85, 18] },
|
28
|
+
{ word: "and", coordinates: [51, 57, 30, 14] },
|
29
|
+
{ word: "the", coordinates: [89, 57, 28, 14] },
|
30
|
+
{ word: "doors", coordinates: [124, 57, 48, 14] },
|
31
|
+
{ word: "will", coordinates: [180, 57, 28, 14] },
|
32
|
+
{ word: "open.", coordinates: [216, 61, 47, 14] }
|
33
33
|
]
|
34
34
|
end
|
35
35
|
|
@@ -21,30 +21,27 @@ RSpec.describe NewspaperWorks::JP2DerivativeService do
|
|
21
21
|
Hyrax::DerivativePath.derivative_path_for_reference(file_set, 'jp2')
|
22
22
|
end
|
23
23
|
|
24
|
-
def
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
# check ppi, but skip pdf to avoid ghostscript warnings to stderr
|
31
|
-
expect(get_res(orig)).to eq get_res(dest) unless orig.end_with?('pdf')
|
24
|
+
def metadata_match_checker(source, target)
|
25
|
+
target_meta = NewspaperWorks::ImageTool.new(target).metadata
|
26
|
+
source_meta = NewspaperWorks::ImageTool.new(source).metadata
|
27
|
+
expect(target_meta[:content_type]).to eq 'image/jp2'
|
28
|
+
expect(target_meta[:width]).to eq source_meta[:width]
|
29
|
+
expect(target_meta[:height]).to eq source_meta[:height]
|
32
30
|
end
|
33
31
|
|
34
32
|
def makes_jp2(filename)
|
35
33
|
expected = expected_path(valid_file_set)
|
36
34
|
expect(File.exist?(expected)).to be false
|
37
35
|
svc = described_class.new(valid_file_set)
|
38
|
-
|
36
|
+
source_path = source_image(filename)
|
37
|
+
svc.create_derivatives(source_path)
|
39
38
|
expect(File.exist?(expected)).to be true
|
40
|
-
|
41
|
-
expect(desc).to include 'JP2'
|
42
|
-
check_dpi_match(source_image(filename), expected)
|
39
|
+
metadata_match_checker(source_path, expected)
|
43
40
|
svc.cleanup_derivatives
|
44
41
|
end
|
45
42
|
|
46
43
|
it "creates gray JP2 derivative from one-bit source" do
|
47
|
-
makes_jp2('
|
44
|
+
makes_jp2('ocr_mono.tiff')
|
48
45
|
end
|
49
46
|
|
50
47
|
it "creates gray JP2 from grayscale source" do
|
@@ -96,30 +96,32 @@ RSpec.describe NewspaperWorks::NewspaperPageDerivativeService do
|
|
96
96
|
end
|
97
97
|
|
98
98
|
it "identifies a source file using ImageMagick" do
|
99
|
-
|
100
|
-
expect(
|
99
|
+
service = service_for_file('4.1.07.tiff')
|
100
|
+
expect(service.identify[:content_type]).to eq 'image/tiff'
|
101
|
+
expect(service.identify[:bits_per_component]).to eq 8
|
101
102
|
end
|
102
103
|
|
103
104
|
it "identifies jp2 source" do
|
104
|
-
# test/verify jp2 source is identified, which relies on
|
105
|
-
|
106
|
-
expect(
|
105
|
+
# test/verify jp2 source is identified, which relies on JP2 backend
|
106
|
+
service = service_for_file('4.1.07.jp2')
|
107
|
+
expect(service.identify[:content_type]).to eq 'image/jp2'
|
108
|
+
expect(service.identify[:bits_per_component]).to eq 8
|
107
109
|
end
|
108
110
|
|
109
111
|
it "identifies color and gray sources" do
|
110
112
|
expect(service_for_file('4.1.07.tiff').use_color?).to be true
|
111
|
-
expect(service_for_file('
|
113
|
+
expect(service_for_file('ocr_gray.tiff').use_color?).to be false
|
112
114
|
end
|
113
115
|
|
114
116
|
it "identifies a one-bit source" do
|
115
117
|
# 1-bit group4 monochrome TIFF:
|
116
|
-
expect(service_for_file('
|
118
|
+
expect(service_for_file('ocr_mono.tiff').one_bit?).to be true
|
117
119
|
# 8-bit gray TIFF:
|
118
120
|
expect(
|
119
121
|
service_for_file('lowres-gray-via-ndnp-sample.tiff').one_bit?
|
120
122
|
).to be false
|
121
123
|
# color TIFF:
|
122
|
-
expect(service_for_file('4.1.07.
|
124
|
+
expect(service_for_file('4.1.07.tiff').one_bit?).to be false
|
123
125
|
end
|
124
126
|
end
|
125
127
|
end
|
@@ -23,10 +23,10 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
|
|
23
23
|
|
24
24
|
# given output file name, check DPI is 150
|
25
25
|
def check_dpi(expected)
|
26
|
-
|
27
|
-
# get
|
28
|
-
page_width =
|
29
|
-
expect(
|
26
|
+
metadata = NewspaperWorks::ImageTool.new(expected).metadata
|
27
|
+
# get width of pdf in points (via imagemagick), should be 864pt == 12in
|
28
|
+
page_width = metadata[:width]
|
29
|
+
expect(page_width).to eq 864
|
30
30
|
# get total width of image in pixels from pdfimages -list, ==> 1800
|
31
31
|
image_width = 1800
|
32
32
|
im_list = `pdfimages -list #{expected}`
|
@@ -41,14 +41,14 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
|
|
41
41
|
svc = described_class.new(valid_file_set)
|
42
42
|
svc.create_derivatives(source_image(filename))
|
43
43
|
expect(File.exist?(expected)).to be true
|
44
|
-
|
45
|
-
expect(
|
44
|
+
metadata = NewspaperWorks::ImageTool.new(expected).metadata
|
45
|
+
expect(metadata[:content_type]).to eq 'application/pdf'
|
46
46
|
check_dpi(expected)
|
47
47
|
svc.cleanup_derivatives
|
48
48
|
end
|
49
49
|
|
50
50
|
it "creates gray PDF derivative from one-bit source" do
|
51
|
-
makes_pdf('
|
51
|
+
makes_pdf('ocr_mono.tiff')
|
52
52
|
end
|
53
53
|
|
54
54
|
it "creates gray PDF from grayscale source" do
|
@@ -58,5 +58,9 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
|
|
58
58
|
it "creates color PDF from color source" do
|
59
59
|
makes_pdf('4.1.07.tiff')
|
60
60
|
end
|
61
|
+
|
62
|
+
it "creates color PDF from color JP2 source" do
|
63
|
+
makes_pdf('4.1.07.jp2')
|
64
|
+
end
|
61
65
|
end
|
62
66
|
end
|
@@ -22,8 +22,8 @@ RSpec.describe NewspaperWorks::TIFFDerivativeService do
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def get_res(path)
|
25
|
-
|
26
|
-
|
25
|
+
tool = NewspaperWorks::ImageTool.new(path)
|
26
|
+
"#{tool.metadata[:width]}x#{tool.metadata[:height]}"
|
27
27
|
end
|
28
28
|
|
29
29
|
def check_dpi_match(orig, dest)
|
@@ -32,23 +32,30 @@ RSpec.describe NewspaperWorks::TIFFDerivativeService do
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def makes_tiff(filename)
|
35
|
+
path = source_image(filename)
|
35
36
|
expected = expected_path(valid_file_set)
|
36
37
|
expect(File.exist?(expected)).to be false
|
37
38
|
svc = described_class.new(valid_file_set)
|
38
|
-
svc.create_derivatives(
|
39
|
+
svc.create_derivatives(path)
|
39
40
|
expect(File.exist?(expected)).to be true
|
40
|
-
|
41
|
-
expect(
|
42
|
-
check_dpi_match(
|
41
|
+
mime = NewspaperWorks::ImageTool.new(expected).metadata[:content_type]
|
42
|
+
expect(mime).to eq 'image/tiff'
|
43
|
+
check_dpi_match(path, expected)
|
43
44
|
svc.cleanup_derivatives
|
44
45
|
end
|
45
46
|
|
46
|
-
|
47
|
-
|
47
|
+
# for cases where primary file is TIFF already
|
48
|
+
def avoids_duplicative_creation(filename)
|
49
|
+
expected = expected_path(valid_file_set)
|
50
|
+
expect(File.exist?(expected)).to be false
|
51
|
+
svc = described_class.new(valid_file_set)
|
52
|
+
svc.create_derivatives(source_image(filename))
|
53
|
+
expect(File.exist?(expected)).not_to be true
|
48
54
|
end
|
49
55
|
|
50
|
-
it "
|
51
|
-
|
56
|
+
it "Does not make TIFF derivatives when primary is TIFF" do
|
57
|
+
avoids_duplicative_creation('ocr_mono.tiff')
|
58
|
+
avoids_duplicative_creation('ocr_gray.tiff')
|
52
59
|
end
|
53
60
|
|
54
61
|
it "creates TIFF from PDF source, robust to multi-page" do
|
data/spec/spec_helper.rb
CHANGED
@@ -64,6 +64,22 @@ module EngineRoutes
|
|
64
64
|
end
|
65
65
|
end
|
66
66
|
|
67
|
+
class CSVLoggingFormatter < RSpec::Core::Formatters::JsonFormatter
|
68
|
+
RSpec::Core::Formatters.register self
|
69
|
+
|
70
|
+
def close(_notification)
|
71
|
+
with_headers = {
|
72
|
+
write_headers: true,
|
73
|
+
headers: ['Example', 'Status', 'Run Time', 'Exception']
|
74
|
+
}
|
75
|
+
CSV.open(output.path, 'w', with_headers) do |csv|
|
76
|
+
@output_hash[:examples].map do |ex|
|
77
|
+
csv << [ex[:full_description], ex[:status], ex[:run_time], ex[:exception]]
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
67
83
|
RSpec.configure do |config|
|
68
84
|
# enable FactoryBot:
|
69
85
|
require 'factory_bot'
|
@@ -226,6 +242,9 @@ RSpec.configure do |config|
|
|
226
242
|
# config.default_formatter = "doc"
|
227
243
|
# end
|
228
244
|
|
245
|
+
# opt-in CSV logging formatter; set the SPEC_CSV environment variable to enable it:
|
246
|
+
config.add_formatter(CSVLoggingFormatter, 'spec_log.csv') unless ENV['SPEC_CSV'].nil?
|
247
|
+
|
229
248
|
# Print the 10 slowest examples and example groups at the
|
230
249
|
# end of the spec run, to help surface which specs are running
|
231
250
|
# particularly slow.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: newspaper_works
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sean Upton
|
@@ -11,22 +11,22 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2019-
|
14
|
+
date: 2019-10-18 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: blacklight_iiif_search
|
18
18
|
requirement: !ruby/object:Gem::Requirement
|
19
19
|
requirements:
|
20
|
-
- - "
|
20
|
+
- - "~>"
|
21
21
|
- !ruby/object:Gem::Version
|
22
|
-
version: '0'
|
22
|
+
version: '1.0'
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- - "
|
27
|
+
- - "~>"
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
version: '0'
|
29
|
+
version: '1.0'
|
30
30
|
- !ruby/object:Gem::Dependency
|
31
31
|
name: blacklight_advanced_search
|
32
32
|
requirement: !ruby/object:Gem::Requirement
|
@@ -83,20 +83,6 @@ dependencies:
|
|
83
83
|
- - "~>"
|
84
84
|
- !ruby/object:Gem::Version
|
85
85
|
version: '5.1'
|
86
|
-
- !ruby/object:Gem::Dependency
|
87
|
-
name: rtesseract
|
88
|
-
requirement: !ruby/object:Gem::Requirement
|
89
|
-
requirements:
|
90
|
-
- - "~>"
|
91
|
-
- !ruby/object:Gem::Version
|
92
|
-
version: 2.2.0
|
93
|
-
type: :runtime
|
94
|
-
prerelease: false
|
95
|
-
version_requirements: !ruby/object:Gem::Requirement
|
96
|
-
requirements:
|
97
|
-
- - "~>"
|
98
|
-
- !ruby/object:Gem::Version
|
99
|
-
version: 2.2.0
|
100
86
|
- !ruby/object:Gem::Dependency
|
101
87
|
name: sass-rails
|
102
88
|
requirement: !ruby/object:Gem::Requirement
|
@@ -371,6 +357,7 @@ executables: []
|
|
371
357
|
extensions: []
|
372
358
|
extra_rdoc_files: []
|
373
359
|
files:
|
360
|
+
- ".coveralls.yml"
|
374
361
|
- ".fcrepo_wrapper"
|
375
362
|
- ".gitignore"
|
376
363
|
- ".rubocop.yml"
|
@@ -577,6 +564,7 @@ files:
|
|
577
564
|
- config/test-fixture/solr-config/xslt/example_atom.xsl
|
578
565
|
- config/test-fixture/solr-config/xslt/example_rss.xsl
|
579
566
|
- config/test-fixture/solr-config/xslt/luke.xsl
|
567
|
+
- config/vendor/fits.xml
|
580
568
|
- config/vendor/imagemagick-6-policy.xml
|
581
569
|
- db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb
|
582
570
|
- db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb
|
@@ -603,6 +591,7 @@ files:
|
|
603
591
|
- lib/newspaper_works/data/work_files.rb
|
604
592
|
- lib/newspaper_works/engine.rb
|
605
593
|
- lib/newspaper_works/errors.rb
|
594
|
+
- lib/newspaper_works/image_tool.rb
|
606
595
|
- lib/newspaper_works/ingest.rb
|
607
596
|
- lib/newspaper_works/ingest/base_ingest.rb
|
608
597
|
- lib/newspaper_works/ingest/base_publication_info.rb
|
@@ -639,11 +628,13 @@ files:
|
|
639
628
|
- lib/newspaper_works/ingest/pub_finder.rb
|
640
629
|
- lib/newspaper_works/ingest/publication_info.rb
|
641
630
|
- lib/newspaper_works/issue_pdf_composer.rb
|
631
|
+
- lib/newspaper_works/jp2_image_metadata.rb
|
642
632
|
- lib/newspaper_works/logging.rb
|
643
633
|
- lib/newspaper_works/page_finder.rb
|
644
634
|
- lib/newspaper_works/resource_fetcher.rb
|
645
635
|
- lib/newspaper_works/text_extraction.rb
|
646
636
|
- lib/newspaper_works/text_extraction/alto_reader.rb
|
637
|
+
- lib/newspaper_works/text_extraction/hocr_reader.rb
|
647
638
|
- lib/newspaper_works/text_extraction/page_ocr.rb
|
648
639
|
- lib/newspaper_works/text_extraction/render_alto.rb
|
649
640
|
- lib/newspaper_works/text_extraction/word_coords_builder.rb
|
@@ -689,6 +680,7 @@ files:
|
|
689
680
|
- spec/fixtures/files/ocr_gray.jp2
|
690
681
|
- spec/fixtures/files/ocr_gray.tiff
|
691
682
|
- spec/fixtures/files/ocr_mono.tiff
|
683
|
+
- spec/fixtures/files/ocr_mono_text_hocr.html
|
692
684
|
- spec/fixtures/files/page1.tiff
|
693
685
|
- spec/fixtures/files/resource_mocks/chronam/http404-expected
|
694
686
|
- spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf
|
@@ -732,6 +724,7 @@ files:
|
|
732
724
|
- spec/lib/newspaper_works/data/work_derivatives_spec.rb
|
733
725
|
- spec/lib/newspaper_works/data/work_file_spec.rb
|
734
726
|
- spec/lib/newspaper_works/data/work_files_spec.rb
|
727
|
+
- spec/lib/newspaper_works/image_tool_spec.rb
|
735
728
|
- spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb
|
736
729
|
- spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb
|
737
730
|
- spec/lib/newspaper_works/ingest/from_command_spec.rb
|
@@ -761,10 +754,12 @@ files:
|
|
761
754
|
- spec/lib/newspaper_works/ingest/publication_info_spec.rb
|
762
755
|
- spec/lib/newspaper_works/ingest_spec.rb
|
763
756
|
- spec/lib/newspaper_works/issue_pdf_composer_spec.rb
|
757
|
+
- spec/lib/newspaper_works/jp2_image_metadata_spec.rb
|
764
758
|
- spec/lib/newspaper_works/logging_spec.rb
|
765
759
|
- spec/lib/newspaper_works/page_finder_spec.rb
|
766
760
|
- spec/lib/newspaper_works/resource_fetcher_spec.rb
|
767
761
|
- spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb
|
762
|
+
- spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb
|
768
763
|
- spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb
|
769
764
|
- spec/lib/newspaper_works/text_extraction/render_alto_spec.rb
|
770
765
|
- spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb
|
@@ -830,7 +825,7 @@ files:
|
|
830
825
|
- test/newspaper_works_test.rb
|
831
826
|
- test/test_helper.rb
|
832
827
|
- tmp/.keep
|
833
|
-
homepage: https://github.com/
|
828
|
+
homepage: https://github.com/samvera-labs/newspaper_works
|
834
829
|
licenses:
|
835
830
|
- Apache-2.0
|
836
831
|
metadata: {}
|
@@ -850,7 +845,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
850
845
|
version: '0'
|
851
846
|
requirements: []
|
852
847
|
rubyforge_project:
|
853
|
-
rubygems_version: 2.6.
|
848
|
+
rubygems_version: 2.7.6.2
|
854
849
|
signing_key:
|
855
850
|
specification_version: 4
|
856
851
|
summary: newspaper_works is a Rails Engine gem providing model and administrative
|
@@ -896,6 +891,7 @@ test_files:
|
|
896
891
|
- spec/fixtures/files/ocr_gray.jp2
|
897
892
|
- spec/fixtures/files/ocr_gray.tiff
|
898
893
|
- spec/fixtures/files/ocr_mono.tiff
|
894
|
+
- spec/fixtures/files/ocr_mono_text_hocr.html
|
899
895
|
- spec/fixtures/files/page1.tiff
|
900
896
|
- spec/fixtures/files/resource_mocks/chronam/http404-expected
|
901
897
|
- spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf
|
@@ -939,6 +935,7 @@ test_files:
|
|
939
935
|
- spec/lib/newspaper_works/data/work_derivatives_spec.rb
|
940
936
|
- spec/lib/newspaper_works/data/work_file_spec.rb
|
941
937
|
- spec/lib/newspaper_works/data/work_files_spec.rb
|
938
|
+
- spec/lib/newspaper_works/image_tool_spec.rb
|
942
939
|
- spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb
|
943
940
|
- spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb
|
944
941
|
- spec/lib/newspaper_works/ingest/from_command_spec.rb
|
@@ -968,10 +965,12 @@ test_files:
|
|
968
965
|
- spec/lib/newspaper_works/ingest/publication_info_spec.rb
|
969
966
|
- spec/lib/newspaper_works/ingest_spec.rb
|
970
967
|
- spec/lib/newspaper_works/issue_pdf_composer_spec.rb
|
968
|
+
- spec/lib/newspaper_works/jp2_image_metadata_spec.rb
|
971
969
|
- spec/lib/newspaper_works/logging_spec.rb
|
972
970
|
- spec/lib/newspaper_works/page_finder_spec.rb
|
973
971
|
- spec/lib/newspaper_works/resource_fetcher_spec.rb
|
974
972
|
- spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb
|
973
|
+
- spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb
|
975
974
|
- spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb
|
976
975
|
- spec/lib/newspaper_works/text_extraction/render_alto_spec.rb
|
977
976
|
- spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb
|