newspaper_works 0.1.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.coveralls.yml +2 -0
- data/.gitignore +4 -0
- data/.travis.yml +2 -2
- data/README.md +14 -13
- data/app/services/newspaper_works/jp2_derivative_service.rb +1 -3
- data/app/services/newspaper_works/newspaper_page_derivative_service.rb +37 -15
- data/app/services/newspaper_works/pdf_derivative_service.rb +4 -7
- data/app/services/newspaper_works/tiff_derivative_service.rb +5 -9
- data/app/views/newspaper_works/base/_attribute_rows.html.erb +72 -24
- data/config/locales/newspaper_article.de.yml +1 -1
- data/config/locales/newspaper_article.en.yml +1 -1
- data/config/locales/newspaper_article.es.yml +1 -1
- data/config/locales/newspaper_article.fr.yml +1 -1
- data/config/locales/newspaper_article.it.yml +1 -1
- data/config/locales/newspaper_article.pt-BR.yml +1 -1
- data/config/locales/newspaper_article.zh.yml +1 -1
- data/config/locales/newspaper_container.de.yml +1 -1
- data/config/locales/newspaper_container.en.yml +1 -1
- data/config/locales/newspaper_container.es.yml +1 -1
- data/config/locales/newspaper_container.fr.yml +1 -1
- data/config/locales/newspaper_container.it.yml +1 -1
- data/config/locales/newspaper_container.pt-BR.yml +1 -1
- data/config/locales/newspaper_container.zh.yml +1 -1
- data/config/locales/newspaper_issue.de.yml +1 -1
- data/config/locales/newspaper_issue.en.yml +1 -1
- data/config/locales/newspaper_issue.es.yml +1 -1
- data/config/locales/newspaper_issue.fr.yml +1 -1
- data/config/locales/newspaper_issue.it.yml +2 -2
- data/config/locales/newspaper_issue.pt-BR.yml +2 -2
- data/config/locales/newspaper_issue.zh.yml +2 -2
- data/config/locales/newspaper_page.de.yml +1 -1
- data/config/locales/newspaper_page.en.yml +1 -1
- data/config/locales/newspaper_page.es.yml +1 -1
- data/config/locales/newspaper_page.fr.yml +1 -1
- data/config/locales/newspaper_page.it.yml +1 -1
- data/config/locales/newspaper_page.pt-BR.yml +1 -1
- data/config/locales/newspaper_page.zh.yml +1 -1
- data/config/locales/newspaper_title.de.yml +1 -1
- data/config/locales/newspaper_title.en.yml +1 -1
- data/config/locales/newspaper_title.es.yml +1 -1
- data/config/locales/newspaper_title.fr.yml +1 -1
- data/config/locales/newspaper_title.it.yml +1 -1
- data/config/locales/newspaper_title.pt-BR.yml +1 -1
- data/config/locales/newspaper_title.zh.yml +1 -1
- data/config/locales/newspaper_works.de.yml +98 -0
- data/config/locales/newspaper_works.en.yml +67 -0
- data/config/locales/newspaper_works.es.yml +96 -0
- data/config/locales/newspaper_works.fr.yml +97 -0
- data/config/locales/newspaper_works.it.yml +90 -0
- data/config/locales/newspaper_works.pt-BR.yml +96 -0
- data/config/locales/newspaper_works.zh.yml +90 -0
- data/config/vendor/fits.xml +55 -0
- data/config/vendor/imagemagick-6-policy.xml +39 -39
- data/lib/newspaper_works.rb +2 -0
- data/lib/newspaper_works/image_tool.rb +119 -0
- data/lib/newspaper_works/jp2_image_metadata.rb +81 -0
- data/lib/newspaper_works/text_extraction.rb +1 -0
- data/lib/newspaper_works/text_extraction/hocr_reader.rb +173 -0
- data/lib/newspaper_works/text_extraction/page_ocr.rb +37 -51
- data/lib/newspaper_works/text_extraction/render_alto.rb +4 -4
- data/lib/newspaper_works/version.rb +1 -1
- data/newspaper_works.gemspec +2 -3
- data/spec/features/search_results_thumbnail_highlights_spec.rb +1 -1
- data/spec/fixtures/files/ocr_mono_text_hocr.html +78 -0
- data/spec/lib/newspaper_works/image_tool_spec.rb +109 -0
- data/spec/lib/newspaper_works/ingest/ingest_shared.rb +3 -3
- data/spec/lib/newspaper_works/ingest/newspaper_page_ingest_spec.rb +2 -2
- data/spec/lib/newspaper_works/jp2_image_metadata_spec.rb +37 -0
- data/spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb +45 -0
- data/spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb +3 -3
- data/spec/lib/newspaper_works/text_extraction/render_alto_spec.rb +14 -14
- data/spec/services/newspaper_works/jp2_derivative_service_spec.rb +10 -13
- data/spec/services/newspaper_works/newspaper_page_derivative_service_spec.rb +10 -8
- data/spec/services/newspaper_works/pdf_derivative_service_spec.rb +11 -7
- data/spec/services/newspaper_works/tiff_derivative_service_spec.rb +17 -10
- data/spec/spec_helper.rb +19 -0
- metadata +21 -22
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
RSpec.describe NewspaperWorks::TextExtraction::HOCRReader do
|
6
|
+
let(:fixture_path) do
|
7
|
+
File.join(
|
8
|
+
NewspaperWorks::GEM_PATH, 'spec', 'fixtures', 'files'
|
9
|
+
)
|
10
|
+
end
|
11
|
+
|
12
|
+
let(:minimal_path) { File.join(fixture_path, 'ocr_mono_text_hocr.html') }
|
13
|
+
let(:minimal) { File.read(minimal_path) }
|
14
|
+
|
15
|
+
let(:reader_minimal) { described_class.new(minimal) }
|
16
|
+
let(:reader_minimal_path) { described_class.new(minimal_path) }
|
17
|
+
|
18
|
+
describe "reads hOCR" do
|
19
|
+
it "loads hOCR either from path or source text" do
|
20
|
+
expect(reader_minimal_path.source).to eq reader_minimal.source
|
21
|
+
# size here is in Unicode characters, not bytes:
|
22
|
+
expect(reader_minimal_path.source.size).to eq 16_590
|
23
|
+
end
|
24
|
+
|
25
|
+
it "loads document stream" do
|
26
|
+
expect(reader_minimal_path.doc_stream).to be_kind_of Nokogiri::XML::SAX::Document
|
27
|
+
expect(reader_minimal_path.doc_stream).to respond_to :text
|
28
|
+
expect(reader_minimal_path.doc_stream).to respond_to :words
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
describe "outputs text derivative formats" do
|
33
|
+
it "outputs plain text" do
|
34
|
+
plain_text = reader_minimal.text
|
35
|
+
expect(plain_text.slice(0, 40)).to eq "_A FEARFUL ADVENTURE.\n‘The Missouri. "
|
36
|
+
expect(reader_minimal.text).to eq reader_minimal.doc_stream.text
|
37
|
+
expect(reader_minimal.text.size).to eq 831
|
38
|
+
end
|
39
|
+
|
40
|
+
it "passes args to WordCoordsBuilder and receives output" do
|
41
|
+
parsed = JSON.parse(reader_minimal.json)
|
42
|
+
expect(parsed['coords'].length).to be > 1
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -26,7 +26,7 @@ RSpec.describe NewspaperWorks::TextExtraction::PageOCR do
|
|
26
26
|
expect(words).to be_an(Array)
|
27
27
|
expect(words).not_to be_empty
|
28
28
|
expect(words[0]).to be_a(Hash)
|
29
|
-
[:word, :
|
29
|
+
[:word, :coordinates].each do |key|
|
30
30
|
expect(words[0].keys).to include key
|
31
31
|
end
|
32
32
|
end
|
@@ -77,8 +77,8 @@ RSpec.describe NewspaperWorks::TextExtraction::PageOCR do
|
|
77
77
|
word = ocr_from_gray_tiff.words[0]
|
78
78
|
word1 = parsed['coords'][word[:word]]
|
79
79
|
word1_coords = word1[0]
|
80
|
-
expect(word1_coords[2]).to eq word[:
|
81
|
-
expect(word1_coords[3]).to eq word[:
|
80
|
+
expect(word1_coords[2]).to eq word[:coordinates][2]
|
81
|
+
expect(word1_coords[3]).to eq word[:coordinates][3]
|
82
82
|
end
|
83
83
|
end
|
84
84
|
end
|
@@ -16,20 +16,20 @@ RSpec.describe NewspaperWorks::TextExtraction::RenderAlto do
|
|
16
16
|
|
17
17
|
let(:words) do
|
18
18
|
[
|
19
|
-
{ word:
|
20
|
-
{ word:
|
21
|
-
{ word:
|
22
|
-
{ word:
|
23
|
-
{ word:
|
24
|
-
{ word:
|
25
|
-
{ word:
|
26
|
-
{ word:
|
27
|
-
{ word:
|
28
|
-
{ word:
|
29
|
-
{ word:
|
30
|
-
{ word:
|
31
|
-
{ word:
|
32
|
-
{ word:
|
19
|
+
{ word: "If", coordinates: [52, 13, 11, 14] },
|
20
|
+
{ word: "you", coordinates: [69, 17, 31, 14] },
|
21
|
+
{ word: "are", coordinates: [108, 17, 28, 10] },
|
22
|
+
{ word: "a", coordinates: [143, 17, 8, 10] },
|
23
|
+
{ word: "friend,", coordinates: [158, 13, 56, 16] },
|
24
|
+
{ word: "you", coordinates: [51, 39, 31, 14] },
|
25
|
+
{ word: "speak", coordinates: [90, 35, 50, 18] },
|
26
|
+
{ word: "the", coordinates: [146, 35, 28, 14] },
|
27
|
+
{ word: "password,", coordinates: [182, 35, 85, 18] },
|
28
|
+
{ word: "and", coordinates: [51, 57, 30, 14] },
|
29
|
+
{ word: "the", coordinates: [89, 57, 28, 14] },
|
30
|
+
{ word: "doors", coordinates: [124, 57, 48, 14] },
|
31
|
+
{ word: "will", coordinates: [180, 57, 28, 14] },
|
32
|
+
{ word: "open.", coordinates: [216, 61, 47, 14] }
|
33
33
|
]
|
34
34
|
end
|
35
35
|
|
@@ -21,30 +21,27 @@ RSpec.describe NewspaperWorks::JP2DerivativeService do
|
|
21
21
|
Hyrax::DerivativePath.derivative_path_for_reference(file_set, 'jp2')
|
22
22
|
end
|
23
23
|
|
24
|
-
def
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
# check ppi, but skip pdf to avoid ghostscript warnings to stderr
|
31
|
-
expect(get_res(orig)).to eq get_res(dest) unless orig.end_with?('pdf')
|
24
|
+
def metadata_match_checker(source, target)
|
25
|
+
target_meta = NewspaperWorks::ImageTool.new(target).metadata
|
26
|
+
source_meta = NewspaperWorks::ImageTool.new(source).metadata
|
27
|
+
expect(target_meta[:content_type]).to eq 'image/jp2'
|
28
|
+
expect(target_meta[:width]).to eq source_meta[:width]
|
29
|
+
expect(target_meta[:height]).to eq source_meta[:height]
|
32
30
|
end
|
33
31
|
|
34
32
|
def makes_jp2(filename)
|
35
33
|
expected = expected_path(valid_file_set)
|
36
34
|
expect(File.exist?(expected)).to be false
|
37
35
|
svc = described_class.new(valid_file_set)
|
38
|
-
|
36
|
+
source_path = source_image(filename)
|
37
|
+
svc.create_derivatives(source_path)
|
39
38
|
expect(File.exist?(expected)).to be true
|
40
|
-
|
41
|
-
expect(desc).to include 'JP2'
|
42
|
-
check_dpi_match(source_image(filename), expected)
|
39
|
+
metadata_match_checker(source_path, expected)
|
43
40
|
svc.cleanup_derivatives
|
44
41
|
end
|
45
42
|
|
46
43
|
it "creates gray JP2 derivative from one-bit source" do
|
47
|
-
makes_jp2('
|
44
|
+
makes_jp2('ocr_mono.tiff')
|
48
45
|
end
|
49
46
|
|
50
47
|
it "creates gray JP2 from grayscale source" do
|
@@ -96,30 +96,32 @@ RSpec.describe NewspaperWorks::NewspaperPageDerivativeService do
|
|
96
96
|
end
|
97
97
|
|
98
98
|
it "identifies a source file using ImageMagick" do
|
99
|
-
|
100
|
-
expect(
|
99
|
+
service = service_for_file('4.1.07.tiff')
|
100
|
+
expect(service.identify[:content_type]).to eq 'image/tiff'
|
101
|
+
expect(service.identify[:bits_per_component]).to eq 8
|
101
102
|
end
|
102
103
|
|
103
104
|
it "identifies jp2 source" do
|
104
|
-
# test/verify jp2 source is identified, which relies on
|
105
|
-
|
106
|
-
expect(
|
105
|
+
# test/verify jp2 source is identified, which relies on JP2 backend
|
106
|
+
service = service_for_file('4.1.07.jp2')
|
107
|
+
expect(service.identify[:content_type]).to eq 'image/jp2'
|
108
|
+
expect(service.identify[:bits_per_component]).to eq 8
|
107
109
|
end
|
108
110
|
|
109
111
|
it "identifies color and gray sources" do
|
110
112
|
expect(service_for_file('4.1.07.tiff').use_color?).to be true
|
111
|
-
expect(service_for_file('
|
113
|
+
expect(service_for_file('ocr_gray.tiff').use_color?).to be false
|
112
114
|
end
|
113
115
|
|
114
116
|
it "identifies a one-bit source" do
|
115
117
|
# 1-bit group4 monochrome TIFF:
|
116
|
-
expect(service_for_file('
|
118
|
+
expect(service_for_file('ocr_mono.tiff').one_bit?).to be true
|
117
119
|
# 8-bit gray TIFF:
|
118
120
|
expect(
|
119
121
|
service_for_file('lowres-gray-via-ndnp-sample.tiff').one_bit?
|
120
122
|
).to be false
|
121
123
|
# color TIFF:
|
122
|
-
expect(service_for_file('4.1.07.
|
124
|
+
expect(service_for_file('4.1.07.tiff').one_bit?).to be false
|
123
125
|
end
|
124
126
|
end
|
125
127
|
end
|
@@ -23,10 +23,10 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
|
|
23
23
|
|
24
24
|
# given output file name, check DPI is 150
|
25
25
|
def check_dpi(expected)
|
26
|
-
|
27
|
-
# get
|
28
|
-
page_width =
|
29
|
-
expect(
|
26
|
+
metadata = NewspaperWorks::ImageTool.new(expected).metadata
|
27
|
+
# get width of pdf in points (via imagemagick), should be 864x == 12in
|
28
|
+
page_width = metadata[:width]
|
29
|
+
expect(page_width).to eq 864
|
30
30
|
# get total width of image in pixels from pdfimages -list, ==> 1800
|
31
31
|
image_width = 1800
|
32
32
|
im_list = `pdfimages -list #{expected}`
|
@@ -41,14 +41,14 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
|
|
41
41
|
svc = described_class.new(valid_file_set)
|
42
42
|
svc.create_derivatives(source_image(filename))
|
43
43
|
expect(File.exist?(expected)).to be true
|
44
|
-
|
45
|
-
expect(
|
44
|
+
metadata = NewspaperWorks::ImageTool.new(expected).metadata
|
45
|
+
expect(metadata[:content_type]).to eq 'application/pdf'
|
46
46
|
check_dpi(expected)
|
47
47
|
svc.cleanup_derivatives
|
48
48
|
end
|
49
49
|
|
50
50
|
it "creates gray PDF derivative from one-bit source" do
|
51
|
-
makes_pdf('
|
51
|
+
makes_pdf('ocr_mono.tiff')
|
52
52
|
end
|
53
53
|
|
54
54
|
it "creates gray PDF from grayscale source" do
|
@@ -58,5 +58,9 @@ RSpec.describe NewspaperWorks::PDFDerivativeService do
|
|
58
58
|
it "creates color PDF from color source" do
|
59
59
|
makes_pdf('4.1.07.tiff')
|
60
60
|
end
|
61
|
+
|
62
|
+
it "creates color PDF from color JP2 source" do
|
63
|
+
makes_pdf('4.1.07.jp2')
|
64
|
+
end
|
61
65
|
end
|
62
66
|
end
|
@@ -22,8 +22,8 @@ RSpec.describe NewspaperWorks::TIFFDerivativeService do
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def get_res(path)
|
25
|
-
|
26
|
-
|
25
|
+
tool = NewspaperWorks::ImageTool.new(path)
|
26
|
+
"#{tool.metadata[:width]}x#{tool.metadata[:height]}"
|
27
27
|
end
|
28
28
|
|
29
29
|
def check_dpi_match(orig, dest)
|
@@ -32,23 +32,30 @@ RSpec.describe NewspaperWorks::TIFFDerivativeService do
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def makes_tiff(filename)
|
35
|
+
path = source_image(filename)
|
35
36
|
expected = expected_path(valid_file_set)
|
36
37
|
expect(File.exist?(expected)).to be false
|
37
38
|
svc = described_class.new(valid_file_set)
|
38
|
-
svc.create_derivatives(
|
39
|
+
svc.create_derivatives(path)
|
39
40
|
expect(File.exist?(expected)).to be true
|
40
|
-
|
41
|
-
expect(
|
42
|
-
check_dpi_match(
|
41
|
+
mime = NewspaperWorks::ImageTool.new(expected).metadata[:content_type]
|
42
|
+
expect(mime).to eq 'image/tiff'
|
43
|
+
check_dpi_match(path, expected)
|
43
44
|
svc.cleanup_derivatives
|
44
45
|
end
|
45
46
|
|
46
|
-
|
47
|
-
|
47
|
+
# for cases where primary file is TIFF already
|
48
|
+
def avoids_duplicative_creation(filename)
|
49
|
+
expected = expected_path(valid_file_set)
|
50
|
+
expect(File.exist?(expected)).to be false
|
51
|
+
svc = described_class.new(valid_file_set)
|
52
|
+
svc.create_derivatives(source_image(filename))
|
53
|
+
expect(File.exist?(expected)).not_to be true
|
48
54
|
end
|
49
55
|
|
50
|
-
it "
|
51
|
-
|
56
|
+
it "Does not make TIFF derivatives when primary is TIFF" do
|
57
|
+
avoids_duplicative_creation('ocr_mono.tiff')
|
58
|
+
avoids_duplicative_creation('ocr_gray.tiff')
|
52
59
|
end
|
53
60
|
|
54
61
|
it "creates TIFF from PDF source, robust to multi-page" do
|
data/spec/spec_helper.rb
CHANGED
@@ -64,6 +64,22 @@ module EngineRoutes
|
|
64
64
|
end
|
65
65
|
end
|
66
66
|
|
67
|
+
class CSVLoggingFormatter < RSpec::Core::Formatters::JsonFormatter
|
68
|
+
RSpec::Core::Formatters.register self
|
69
|
+
|
70
|
+
def close(_notification)
|
71
|
+
with_headers = {
|
72
|
+
write_headers: true,
|
73
|
+
headers: ['Example', 'Status', 'Run Time', 'Exception']
|
74
|
+
}
|
75
|
+
CSV.open(output.path, 'w', with_headers) do |csv|
|
76
|
+
@output_hash[:examples].map do |ex|
|
77
|
+
csv << [ex[:full_description], ex[:status], ex[:run_time], ex[:exception]]
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
67
83
|
RSpec.configure do |config|
|
68
84
|
# enable FactoryBot:
|
69
85
|
require 'factory_bot'
|
@@ -226,6 +242,9 @@ RSpec.configure do |config|
|
|
226
242
|
# config.default_formatter = "doc"
|
227
243
|
# end
|
228
244
|
|
245
|
+
# opt-in CSV logging formatter, set SPEC_CSV environment variable to use:
|
246
|
+
config.add_formatter(CSVLoggingFormatter, 'spec_log.csv') unless ENV['SPEC_CSV'].nil?
|
247
|
+
|
229
248
|
# Print the 10 slowest examples and example groups at the
|
230
249
|
# end of the spec run, to help surface which specs are running
|
231
250
|
# particularly slow.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: newspaper_works
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sean Upton
|
@@ -11,22 +11,22 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2019-
|
14
|
+
date: 2019-10-18 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: blacklight_iiif_search
|
18
18
|
requirement: !ruby/object:Gem::Requirement
|
19
19
|
requirements:
|
20
|
-
- - "
|
20
|
+
- - "~>"
|
21
21
|
- !ruby/object:Gem::Version
|
22
|
-
version: '0'
|
22
|
+
version: '1.0'
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
25
|
version_requirements: !ruby/object:Gem::Requirement
|
26
26
|
requirements:
|
27
|
-
- - "
|
27
|
+
- - "~>"
|
28
28
|
- !ruby/object:Gem::Version
|
29
|
-
version: '0'
|
29
|
+
version: '1.0'
|
30
30
|
- !ruby/object:Gem::Dependency
|
31
31
|
name: blacklight_advanced_search
|
32
32
|
requirement: !ruby/object:Gem::Requirement
|
@@ -83,20 +83,6 @@ dependencies:
|
|
83
83
|
- - "~>"
|
84
84
|
- !ruby/object:Gem::Version
|
85
85
|
version: '5.1'
|
86
|
-
- !ruby/object:Gem::Dependency
|
87
|
-
name: rtesseract
|
88
|
-
requirement: !ruby/object:Gem::Requirement
|
89
|
-
requirements:
|
90
|
-
- - "~>"
|
91
|
-
- !ruby/object:Gem::Version
|
92
|
-
version: 2.2.0
|
93
|
-
type: :runtime
|
94
|
-
prerelease: false
|
95
|
-
version_requirements: !ruby/object:Gem::Requirement
|
96
|
-
requirements:
|
97
|
-
- - "~>"
|
98
|
-
- !ruby/object:Gem::Version
|
99
|
-
version: 2.2.0
|
100
86
|
- !ruby/object:Gem::Dependency
|
101
87
|
name: sass-rails
|
102
88
|
requirement: !ruby/object:Gem::Requirement
|
@@ -371,6 +357,7 @@ executables: []
|
|
371
357
|
extensions: []
|
372
358
|
extra_rdoc_files: []
|
373
359
|
files:
|
360
|
+
- ".coveralls.yml"
|
374
361
|
- ".fcrepo_wrapper"
|
375
362
|
- ".gitignore"
|
376
363
|
- ".rubocop.yml"
|
@@ -577,6 +564,7 @@ files:
|
|
577
564
|
- config/test-fixture/solr-config/xslt/example_atom.xsl
|
578
565
|
- config/test-fixture/solr-config/xslt/example_rss.xsl
|
579
566
|
- config/test-fixture/solr-config/xslt/luke.xsl
|
567
|
+
- config/vendor/fits.xml
|
580
568
|
- config/vendor/imagemagick-6-policy.xml
|
581
569
|
- db/migrate/20181214181358_create_newspaper_works_derivative_attachments.rb
|
582
570
|
- db/migrate/20190107165909_create_newspaper_works_ingest_file_relations.rb
|
@@ -603,6 +591,7 @@ files:
|
|
603
591
|
- lib/newspaper_works/data/work_files.rb
|
604
592
|
- lib/newspaper_works/engine.rb
|
605
593
|
- lib/newspaper_works/errors.rb
|
594
|
+
- lib/newspaper_works/image_tool.rb
|
606
595
|
- lib/newspaper_works/ingest.rb
|
607
596
|
- lib/newspaper_works/ingest/base_ingest.rb
|
608
597
|
- lib/newspaper_works/ingest/base_publication_info.rb
|
@@ -639,11 +628,13 @@ files:
|
|
639
628
|
- lib/newspaper_works/ingest/pub_finder.rb
|
640
629
|
- lib/newspaper_works/ingest/publication_info.rb
|
641
630
|
- lib/newspaper_works/issue_pdf_composer.rb
|
631
|
+
- lib/newspaper_works/jp2_image_metadata.rb
|
642
632
|
- lib/newspaper_works/logging.rb
|
643
633
|
- lib/newspaper_works/page_finder.rb
|
644
634
|
- lib/newspaper_works/resource_fetcher.rb
|
645
635
|
- lib/newspaper_works/text_extraction.rb
|
646
636
|
- lib/newspaper_works/text_extraction/alto_reader.rb
|
637
|
+
- lib/newspaper_works/text_extraction/hocr_reader.rb
|
647
638
|
- lib/newspaper_works/text_extraction/page_ocr.rb
|
648
639
|
- lib/newspaper_works/text_extraction/render_alto.rb
|
649
640
|
- lib/newspaper_works/text_extraction/word_coords_builder.rb
|
@@ -689,6 +680,7 @@ files:
|
|
689
680
|
- spec/fixtures/files/ocr_gray.jp2
|
690
681
|
- spec/fixtures/files/ocr_gray.tiff
|
691
682
|
- spec/fixtures/files/ocr_mono.tiff
|
683
|
+
- spec/fixtures/files/ocr_mono_text_hocr.html
|
692
684
|
- spec/fixtures/files/page1.tiff
|
693
685
|
- spec/fixtures/files/resource_mocks/chronam/http404-expected
|
694
686
|
- spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf
|
@@ -732,6 +724,7 @@ files:
|
|
732
724
|
- spec/lib/newspaper_works/data/work_derivatives_spec.rb
|
733
725
|
- spec/lib/newspaper_works/data/work_file_spec.rb
|
734
726
|
- spec/lib/newspaper_works/data/work_files_spec.rb
|
727
|
+
- spec/lib/newspaper_works/image_tool_spec.rb
|
735
728
|
- spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb
|
736
729
|
- spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb
|
737
730
|
- spec/lib/newspaper_works/ingest/from_command_spec.rb
|
@@ -761,10 +754,12 @@ files:
|
|
761
754
|
- spec/lib/newspaper_works/ingest/publication_info_spec.rb
|
762
755
|
- spec/lib/newspaper_works/ingest_spec.rb
|
763
756
|
- spec/lib/newspaper_works/issue_pdf_composer_spec.rb
|
757
|
+
- spec/lib/newspaper_works/jp2_image_metadata_spec.rb
|
764
758
|
- spec/lib/newspaper_works/logging_spec.rb
|
765
759
|
- spec/lib/newspaper_works/page_finder_spec.rb
|
766
760
|
- spec/lib/newspaper_works/resource_fetcher_spec.rb
|
767
761
|
- spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb
|
762
|
+
- spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb
|
768
763
|
- spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb
|
769
764
|
- spec/lib/newspaper_works/text_extraction/render_alto_spec.rb
|
770
765
|
- spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb
|
@@ -830,7 +825,7 @@ files:
|
|
830
825
|
- test/newspaper_works_test.rb
|
831
826
|
- test/test_helper.rb
|
832
827
|
- tmp/.keep
|
833
|
-
homepage: https://github.com/
|
828
|
+
homepage: https://github.com/samvera-labs/newspaper_works
|
834
829
|
licenses:
|
835
830
|
- Apache-2.0
|
836
831
|
metadata: {}
|
@@ -850,7 +845,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
850
845
|
version: '0'
|
851
846
|
requirements: []
|
852
847
|
rubyforge_project:
|
853
|
-
rubygems_version: 2.6.
|
848
|
+
rubygems_version: 2.7.6.2
|
854
849
|
signing_key:
|
855
850
|
specification_version: 4
|
856
851
|
summary: newspaper_works is a Rails Engine gem providing model and administrative
|
@@ -896,6 +891,7 @@ test_files:
|
|
896
891
|
- spec/fixtures/files/ocr_gray.jp2
|
897
892
|
- spec/fixtures/files/ocr_gray.tiff
|
898
893
|
- spec/fixtures/files/ocr_mono.tiff
|
894
|
+
- spec/fixtures/files/ocr_mono_text_hocr.html
|
899
895
|
- spec/fixtures/files/page1.tiff
|
900
896
|
- spec/fixtures/files/resource_mocks/chronam/http404-expected
|
901
897
|
- spec/fixtures/files/resource_mocks/chronam/sn84038814.rdf
|
@@ -939,6 +935,7 @@ test_files:
|
|
939
935
|
- spec/lib/newspaper_works/data/work_derivatives_spec.rb
|
940
936
|
- spec/lib/newspaper_works/data/work_file_spec.rb
|
941
937
|
- spec/lib/newspaper_works/data/work_files_spec.rb
|
938
|
+
- spec/lib/newspaper_works/image_tool_spec.rb
|
942
939
|
- spec/lib/newspaper_works/ingest/batch_issue_ingester_spec.rb
|
943
940
|
- spec/lib/newspaper_works/ingest/chronam_publication_info_spec.rb
|
944
941
|
- spec/lib/newspaper_works/ingest/from_command_spec.rb
|
@@ -968,10 +965,12 @@ test_files:
|
|
968
965
|
- spec/lib/newspaper_works/ingest/publication_info_spec.rb
|
969
966
|
- spec/lib/newspaper_works/ingest_spec.rb
|
970
967
|
- spec/lib/newspaper_works/issue_pdf_composer_spec.rb
|
968
|
+
- spec/lib/newspaper_works/jp2_image_metadata_spec.rb
|
971
969
|
- spec/lib/newspaper_works/logging_spec.rb
|
972
970
|
- spec/lib/newspaper_works/page_finder_spec.rb
|
973
971
|
- spec/lib/newspaper_works/resource_fetcher_spec.rb
|
974
972
|
- spec/lib/newspaper_works/text_extraction/alto_reader_spec.rb
|
973
|
+
- spec/lib/newspaper_works/text_extraction/hocr_reader_spec.rb
|
975
974
|
- spec/lib/newspaper_works/text_extraction/page_ocr_spec.rb
|
976
975
|
- spec/lib/newspaper_works/text_extraction/render_alto_spec.rb
|
977
976
|
- spec/lib/newspaper_works/text_extraction/word_coords_builder_spec.rb
|