derivative-rodeo 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +6 -0
- data/LICENSE +15 -0
- data/README.md +251 -0
- data/Rakefile +42 -0
- data/derivative_rodeo.gemspec +54 -0
- data/lib/derivative/rodeo.rb +3 -0
- data/lib/derivative-rodeo.rb +3 -0
- data/lib/derivative_rodeo/configuration.rb +95 -0
- data/lib/derivative_rodeo/errors.rb +56 -0
- data/lib/derivative_rodeo/generators/base_generator.rb +200 -0
- data/lib/derivative_rodeo/generators/concerns/copy_file_concern.rb +28 -0
- data/lib/derivative_rodeo/generators/copy_generator.rb +14 -0
- data/lib/derivative_rodeo/generators/hocr_generator.rb +112 -0
- data/lib/derivative_rodeo/generators/monochrome_generator.rb +39 -0
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +61 -0
- data/lib/derivative_rodeo/generators/thumbnail_generator.rb +38 -0
- data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +39 -0
- data/lib/derivative_rodeo/services/base_service.rb +15 -0
- data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +87 -0
- data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +218 -0
- data/lib/derivative_rodeo/services/image_identify_service.rb +89 -0
- data/lib/derivative_rodeo/services/image_jp2_service.rb +112 -0
- data/lib/derivative_rodeo/services/image_service.rb +73 -0
- data/lib/derivative_rodeo/services/pdf_splitter/base.rb +177 -0
- data/lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb +14 -0
- data/lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb +130 -0
- data/lib/derivative_rodeo/services/pdf_splitter/png_page.rb +26 -0
- data/lib/derivative_rodeo/services/pdf_splitter/tiff_page.rb +52 -0
- data/lib/derivative_rodeo/services/pdf_splitter_service.rb +19 -0
- data/lib/derivative_rodeo/services/url_service.rb +42 -0
- data/lib/derivative_rodeo/storage_locations/base_location.rb +251 -0
- data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +67 -0
- data/lib/derivative_rodeo/storage_locations/file_location.rb +39 -0
- data/lib/derivative_rodeo/storage_locations/http_location.rb +13 -0
- data/lib/derivative_rodeo/storage_locations/https_location.rb +13 -0
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +103 -0
- data/lib/derivative_rodeo/storage_locations/sqs_location.rb +187 -0
- data/lib/derivative_rodeo/technical_metadata.rb +23 -0
- data/lib/derivative_rodeo/version.rb +5 -0
- data/lib/derivative_rodeo.rb +36 -0
- metadata +339 -0
@@ -0,0 +1,218 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'forwardable'
|
4
|
+
require 'json'
|
5
|
+
require 'nokogiri'
|
6
|
+
|
7
|
+
module DerivativeRodeo
|
8
|
+
module Services
|
9
|
+
##
|
10
|
+
# Responsible for converting an SGML string into JSON coordinates
|
11
|
+
class ExtractWordCoordinatesFromHocrSgmlService
|
12
|
+
##
|
13
|
+
# @param sgml [String] The SGML (e.g. XML or HTML) text of a HOCR file.
|
14
|
+
# @return [String] A JSON document
|
15
|
+
def self.call(sgml)
  # Build a service instance around the given text and serialize it.
  service = new(sgml)
  service.to_json
end
|
18
|
+
|
19
|
+
##
|
20
|
+
# Construct with either path or HTML [String]
|
21
|
+
#
|
22
|
+
# @param html [String] either an XML string or a path to a file.
|
23
|
+
def initialize(html)
  # Anything that does not look like markup is treated as a path and read from disk.
  @source = if xml?(html)
              html
            else
              File.read(html)
            end
  @doc_stream = DocStream.new
  # Parsing happens eagerly; the DocStream collects words and page size as SAX callbacks fire.
  Nokogiri::HTML::SAX::Parser.new(@doc_stream).parse(@source)
end
|
29
|
+
attr_reader :doc_stream, :source
|
30
|
+
|
31
|
+
delegate :text, :width, :height, :words, to: :doc_stream
|
32
|
+
|
33
|
+
# Output JSON flattened word coordinates
|
34
|
+
#
|
35
|
+
# @return [String] JSON serialization of flattened word coordinates
|
36
|
+
def to_json
  # Memoized: the JSON is generated once per instance.
  return @to_json if @to_json

  stream = doc_stream
  @to_json = WordCoordinates.to_json(words: stream.words, width: stream.width, height: stream.height)
end
|
43
|
+
alias json to_json
|
44
|
+
|
45
|
+
private
|
46
|
+
|
47
|
+
# @param xml [String] candidate markup text or file path
# @return [Boolean] true when the first non-whitespace character is '<'
def xml?(xml)
  xml.lstrip[0] == '<'
end
|
50
|
+
|
51
|
+
# SAX Document Stream class to gather text and word tokens from hOCR
|
52
|
+
class DocStream < Nokogiri::XML::SAX::Document
|
53
|
+
attr_accessor :text, :words, :width, :height
|
54
|
+
|
55
|
+
# Set up empty parse state; everything is filled in by the SAX callbacks.
def initialize
  super()
  # plain text buffer
  @text = ''
  # list of word hashes, each holding :word and :coordinates
  @words = []
  # page width and height, to be found in hOCR for `div.ocr_page`
  @width = @height = nil
  # current word state shared across #start_element, #characters and
  # #end_element (associates a word with its coordinates)
  @current = nil
  # class name of the most recently started element, consulted by #end_element
  @element_class_name = nil
end
|
70
|
+
|
71
|
+
# Return coordinates from `span.ocrx_word` element attribute hash
|
72
|
+
#
|
73
|
+
# @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
|
74
|
+
# @return [Array] Array of position x, y, width, height in px.
|
75
|
+
def s_coords(attrs)
  # The bbox is read from the first `;`-separated field of the title attribute.
  first_field = attrs['title'].split(';').first
  left, top, right, bottom = first_field.split('bbox ').last.split(' ').map(&:to_i)
  # bbox is two corner points; convert to position plus size.
  [left, top, right - left, bottom - top]
end
|
85
|
+
|
86
|
+
# Consider element for processing?
|
87
|
+
# - `div.ocr_page` — to get page width/height
|
88
|
+
# - `span.ocr_line` — to help make plain text readable
|
89
|
+
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
|
90
|
+
# @param name [String] Element name
|
91
|
+
# @param class_name [String] HTML class name
|
92
|
+
# @return [Boolean] true if element should be processed; otherwise false
|
93
|
+
def consider?(name, class_name)
  case "#{name}.#{class_name}"
  when 'div.ocr_page', 'span.ocr_line', 'span.ocrx_word' then true
  else false
  end
end
|
97
|
+
|
98
|
+
# Begin tracking a new word from the attributes of a `span.ocrx_word` element.
#
# @param attrs [Hash] element attributes, passed on to #s_coords
def start_word(attrs)
  # :word stays nil until #characters supplies the text
  @current = { word: nil }
  @current[:coordinates] = s_coords(attrs)
end
|
104
|
+
|
105
|
+
# Record the page size from the attributes of a `div.ocr_page` element.
#
# The bbox is located wherever it appears in the title attribute instead of
# being assumed to be the second `;`-separated field, so titles without a
# leading `image` field are handled too. When the title is missing or has no
# bbox, the width and height are left untouched.
#
# @param attrs [Hash] element attributes; the 'title' entry is inspected
def start_page(attrs)
  bbox = attrs['title'].to_s.match(/bbox\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/)
  return if bbox.nil?

  # width and height are taken from the third and fourth bbox values
  @width = bbox[3].to_i
  @height = bbox[4].to_i
end
|
113
|
+
|
114
|
+
# @return [Boolean] true when the current word has text and four coordinates
def word_complete?
  return false if @current.nil?

  word, coordinates = @current.values_at(:word, :coordinates)
  word.present? && coordinates.size == 4
end
|
119
|
+
|
120
|
+
# Finish the current word: separate it from the next one in the plain text
# buffer and keep it when it is complete.
def end_word
  # trailing space goes between words in the plain text
  @text = "#{@text} "
  @words.push(@current) if word_complete?
end
|
125
|
+
|
126
|
+
# Finish a text line: drop surrounding whitespace from the plain text buffer
# and append a line break.
#
# Builds a new string rather than calling `strip!`, because the buffer starts
# out as a frozen string literal (this file uses `frozen_string_literal: true`)
# and an in-place strip would raise FrozenError if a line ended before any
# text had been appended.
def end_line
  @text = "#{@text.strip}\n"
end
|
132
|
+
|
133
|
+
# Callback for element start, ignores elements except for:
|
134
|
+
# - `div.ocr_page` — to get page width/height
|
135
|
+
# - `span.ocr_line` — to help make plain text readable
|
136
|
+
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
|
137
|
+
#
|
138
|
+
# @param name [String] element name.
|
139
|
+
# @param attrs [Array] Array of key, value pair Arrays.
|
140
|
+
def start_element(name, attrs = [])
  attributes = attrs.to_h
  # remembered for #end_element, even when the element is not considered
  @element_class_name = attributes['class']
  return unless consider?(name, @element_class_name)

  word_element = @element_class_name == 'ocrx_word'
  page_element = @element_class_name == 'ocr_page'
  start_word(attributes) if word_element
  start_page(attributes) if page_element
end
|
147
|
+
|
148
|
+
# Callback for text content: append it to the current word and to the plain
# text buffer. Text seen before any word has been started is ignored.
#
# @param value [String] the text reported by the parser
def characters(value)
  return if @current.nil? || @current[:coordinates].nil?

  @current[:word] = "#{@current[:word]}#{value}"
  @text = "#{@text}#{value}"
end
|
155
|
+
|
156
|
+
# Callback for element end; at this time, flush word coordinate state
|
157
|
+
# for current word, and append line endings to plain text:
|
158
|
+
#
|
159
|
+
# @param _name [String] element name.
|
160
|
+
def end_element(_name)
  # NOTE(review): the decision is based on the class of the most recently
  # *started* element, not on the element being closed — confirm this is
  # intended for nested hOCR elements.
  closing_line = @element_class_name == 'ocr_line'
  closing_word = @element_class_name == 'ocrx_word'
  end_line if closing_line
  end_word if closing_word
end
|
164
|
+
|
165
|
+
# Callback for completion of parsing hOCR, used to normalize generated
|
166
|
+
# text content (strip unneeded whitespace incidental to output).
|
167
|
+
def end_document
  # remove leading/trailing whitespace on every line
  lines = @text.split("\n").map(&:strip)
  @text = lines.join("\n")
             .gsub(/\n+/, "\n") # collapse runs of line breaks
             .delete("\r")      # drop carriage returns (the result was previously discarded)
             .strip             # remove whitespace at either end of the buffer
end
|
176
|
+
end
|
177
|
+
|
178
|
+
# Serializes a list of words and their coordinates, grouped by word, together
# with the size of the canvas they appear on.
class WordCoordinates
  ##
  # @api public
  #
  # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
  # @param width [Integer] the width of the "canvas" on which the words appear.
  # @param height [Integer] the height of the "canvas" on which the words appear.
  #
  # @return [String] a JSON encoded string.
  def self.to_json(words:, width: nil, height: nil)
    builder = new(words: words, width: width, height: height)
    builder.to_json
  end

  attr_reader :words, :width, :height

  def initialize(words:, width:, height:)
    @height = height
    @width = width
    @words = words
  end

  # Output JSON flattened word coordinates
  #
  # @return [String] JSON serialization with `width`, `height` and `coords`,
  #   where `coords` maps each word to the list of its coordinate arrays
  def to_json
    grouped = words.each_with_object({}) do |entry, by_word|
      # the same word may occur several times; collect every occurrence
      (by_word[entry[:word]] ||= []) << entry[:coordinates]
    end
    JSON.generate({ width: width, height: height, coords: grouped })
  end
end
|
216
|
+
end
|
217
|
+
end
|
218
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Services
|
5
|
+
##
|
6
|
+
# This module is responsible for extracting technical_metadata for a given path.
|
7
|
+
#
|
8
|
+
# @see .technical_metadata_for
|
9
|
+
class ImageIdentifyService < BaseService
|
10
|
+
class_attribute :identify_format_option,
|
11
|
+
default: %(Geometry: %G\nDepth: %[bit-depth]\nColorspace: %[colorspace]\nAlpha: %A\nMIME Step: %m\n) # rubocop:disable Layout/LineLength
|
12
|
+
|
13
|
+
##
|
14
|
+
# @api public
|
15
|
+
# @param path [String]
|
16
|
+
# @return [Derivative::Rodeo::TechnicalMetadata]
|
17
|
+
def self.technical_metadata_for(path:)
  service = new(path)
  service.technical_metadata
end
|
20
|
+
|
21
|
+
# @param path [String] path of the file to inspect
def initialize(path)
  super()
  @path = path
  # Keep the first 23 bytes of the file; #pdf? checks them for the PDF magic.
  @initial_file_contents = File.read(path, 23, 0)
end
|
27
|
+
attr_reader :path
|
28
|
+
|
29
|
+
# Return metadata by means of imagemagick identify
|
30
|
+
def technical_metadata
  TechnicalMetadata.new.tap do |metadata|
    lines = im_identify
    metadata.width, metadata.height = im_identify_geometry(lines)
    metadata.content_type = im_mime(lines)
    # fills in color, num_components and bits_per_component
    populate_im_color!(lines, metadata)
  end
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
# @return [Array<String>] lines of output from imagemagick `identify`
|
44
|
+
def im_identify
  return @im_identify if defined?(@im_identify)

  # Rather than parsing the full `identify -verbose` output, request only the
  # specific properties named in identify_format_option.
  #
  # The command is run as an argument vector (no shell), so a path containing
  # spaces or shell metacharacters reaches `identify` unchanged.
  argv = ['identify', '-format', identify_format_option, path.to_s]
  @im_identify = IO.popen(argv, &:read).lines
end
|
52
|
+
|
53
|
+
# @return [Array(Integer, Integer)] width, height in Integer px units
|
54
|
+
def im_identify_geometry(lines)
  # keep only the part before any '+' separator, then split "WxH"
  dimensions = im_line_select(lines, 'geometry').split('+').first
  dimensions.split('x').map(&:to_i)
end
|
58
|
+
|
59
|
+
# @param lines [Array<String>] output lines from `identify`
# @return [String, nil] 'application/pdf' for PDFs, otherwise the value of the
#   "MIME Step" line
def im_mime(lines)
  # the PDF check is a workaround for an older imagemagick bug
  pdf? ? 'application/pdf' : im_line_select(lines, 'mime step')
end
|
64
|
+
|
65
|
+
# @return [Boolean] true when the file starts with the PDF magic bytes
def pdf?
  # File.read with a length returns nil for an empty file; treat that as "not a PDF"
  @initial_file_contents.to_s.start_with?('%PDF-')
end
|
68
|
+
|
69
|
+
# Fill in the color-related fields of the given metadata object from the
# `identify` output.
#
# @param lines [Array<String>] output lines from `identify`
# @param technical_metadata [#num_components=, #color=, #bits_per_component=]
def populate_im_color!(lines, technical_metadata)
  bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
  color = im_line_select(lines, 'colorspace') == 'Gray' ? 'gray' : 'color'
  # The previous `!value == 'Undefined'` negated the value before comparing, so
  # it was always false. An absent line, "Undefined", or "False" (which
  # ImageMagick 6 is understood to print for %A — verify) means no alpha channel.
  alpha = im_line_select(lines, 'alpha')
  has_alpha = !alpha.nil? && !%w[undefined false].include?(alpha.downcase)
  technical_metadata.num_components = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
  technical_metadata.color = bpc == 1 ? 'monochrome' : color
  technical_metadata.bits_per_component = bpc
end
|
78
|
+
|
79
|
+
# Find the first "key: value" line whose key matches (case-insensitively) and
# return its value.
#
# @param lines [Array<String>] output lines from `identify`
# @param key [String] the line prefix to look for
# @return [String, nil] the text after the last ':' with surrounding
#   whitespace removed, or nil when no line matches
def im_line_select(lines, key)
  matching = lines.find { |candidate| candidate.scrub.downcase.strip.start_with?(key.downcase) }
  return if matching.nil?

  matching.strip.split(':').last.strip
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# rubocop:disable Style/FrozenStringLiteralComment
|
2
|
+
# TODO freeze them literals
|
3
|
+
|
4
|
+
module DerivativeRodeo
|
5
|
+
module Services
|
6
|
+
##
|
7
|
+
# A utility class for extracting technical metadata from a JP2.
|
8
|
+
#
|
9
|
+
# @see .technical_metadata_for
|
10
|
+
class ImageJp2Service < BaseService
|
11
|
+
TOKEN_MARKER_START = "\xFF".force_encoding('BINARY')
|
12
|
+
TOKEN_MARKER_SIZ = "\x51".force_encoding('BINARY')
|
13
|
+
TOKEN_IHDR = 'ihdr'.freeze
|
14
|
+
|
15
|
+
##
|
16
|
+
# @api public
|
17
|
+
#
|
18
|
+
# @param path [String] path to jp2, for reading
|
19
|
+
#
|
20
|
+
# @return [Derivative::Rodeo::TechnicalMetadata]
|
21
|
+
def self.technical_metadata_for(path:)
  service = new(path)
  service.technical_metadata
end
|
24
|
+
|
25
|
+
attr_reader :path
|
26
|
+
|
27
|
+
# @param path [String] path to the jp2 file; it is opened later by #technical_metadata
def initialize(path)
  super()
  @path = path
end
|
31
|
+
|
32
|
+
# rubocop:disable Metrics/MethodLength
|
33
|
+
# Read the JP2 header and codestream markers of the file at #path.
#
# The block form of File.open closes the file on every exit path. The previous
# `ensure io.close` raised NoMethodError (io was nil) when the file could not be
# opened, hiding the real error.
#
# @return [TechnicalMetadata]
# @raise [IOError] when the file is not a JP2
def technical_metadata
  File.open(path, 'rb') do |io|
    validate_jp2(io)
    x_siz, y_siz = extract_jp2_dim(io)
    nc, bpc = extract_jp2_components(io)
    color = nc >= 3 ? 'color' : 'gray'
    TechnicalMetadata.new(
      color: bpc == 1 ? 'monochrome' : color,
      num_components: nc,
      bits_per_component: bpc,
      width: x_siz,
      height: y_siz,
      content_type: 'image/jp2'
    )
  end
end
|
51
|
+
# rubocop:enable Metrics/MethodLength
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
# @param io [IO] IO stream opened in binary mode, for reading
|
56
|
+
# @return [Array(Integer, Integer)] X size, Y size, in Integer-typed px
|
57
|
+
# rubocop:disable Metrics/MethodLength
|
58
|
+
# Scans for the SIZ marker (0xFF 0x51) and reads the image size that follows.
# Reaching end of file raises IOError; the earlier loop kept reading nil
# forever when the marker was missing.
#
# @raise [IOError] when the stream is not in binary mode, the SIZ marker is
#   missing, or the segment after it is shorter than 12 bytes
def extract_jp2_dim(io)
  raise IOError, 'file not open in binary mode' unless io.binmode?

  # Informed by ISO/IEC 15444-1:2000, pp. 26-27
  # via:
  # http://hosting.astro.cornell.edu/~carcich/LRO/jp2/ISO_JPEG200_Standard/INCITS+ISO+IEC+15444-1-2000.pdf
  #
  # first 23 bytes are file-magic, we can skip
  io.seek(23, IO::SEEK_SET)
  byte = io.read(1)
  loop do
    raise IOError, 'SIZ marker not found in JP2 codestream' if byte.nil?

    if byte == TOKEN_MARKER_START
      # on 0xFF read the subsequent byte; 0x51 completes the SIZ marker,
      # anything else is re-examined (it may itself be another 0xFF)
      byte = io.read(1)
      break if byte == TOKEN_MARKER_SIZ
    else
      byte = io.read(1)
    end
  end

  # on 0x51, read next 12 bytes
  segment = io.read(12)
  raise IOError, 'truncated SIZ segment in JP2 codestream' if segment.nil? || segment.bytesize < 12

  # discard first 4 bytes; next 4 bytes are XSiz; last 4 bytes are YSiz
  segment.unpack('x4NN')
end
|
85
|
+
# rubocop:enable Metrics/MethodLength
|
86
|
+
|
87
|
+
# @param io [IO] IO stream opened in binary mode, for reading
|
88
|
+
# @return [Array(Integer, Integer)] number components, bits-per-component
|
89
|
+
# Reads the component count and bit depth that follow the `ihdr` label in the
# first 64 bytes. A missing label or truncated data raises IOError; the
# earlier `split`-based lookup returned the whole buffer when the label was
# absent, so its nil guard never fired.
#
# @raise [IOError] when the stream is not in binary mode, or the ihdr data is
#   missing or truncated
def extract_jp2_components(io)
  raise IOError, 'file not open in binary mode' unless io.binmode?

  io.seek(0, IO::SEEK_SET)
  # IHDR should be in first 64 bytes
  header = io.read(64).to_s
  # use the last occurrence, matching the previous split(...)[-1] lookup
  label_at = header.rindex(TOKEN_IHDR)
  raise IOError, 'ihdr box not found in JP2 header' if label_at.nil?

  ihdr_data = header.byteslice(label_at + TOKEN_IHDR.bytesize, 11)
  raise IOError, 'truncated ihdr box in JP2 header' if ihdr_data.nil? || ihdr_data.bytesize < 11

  # layout after the label: 8 bytes skipped, 2-byte component count, 1-byte depth
  num_components, depth_minus_one = ihdr_data.unpack('x8nc')
  # stored as "bit depth of the components in the codestream, minus 1", so add 1
  [num_components, depth_minus_one + 1]
end
|
103
|
+
|
104
|
+
# Verify the file is a JP2 by checking that its first 23 bytes end with the
# `ftypjp2` signature.
#
# @param io [IO] stream positioned at the start of the file
# @raise [IOError] when the signature is absent, including for an empty stream
#   (where the read returns nil)
def validate_jp2(io)
  magic = io.read(23)
  raise IOError, 'Not JP2 file' unless magic&.end_with?('ftypjp2')
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
# rubocop:enable Style/FrozenStringLiteralComment
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'tmpdir'
|
4
|
+
|
5
|
+
module DerivativeRodeo
|
6
|
+
module Services
|
7
|
+
##
|
8
|
+
# @api private
|
9
|
+
#
|
10
|
+
# @see .technical_metadata
|
11
|
+
# @see .convert
|
12
|
+
class ImageService < BaseService
|
13
|
+
attr_accessor :path
|
14
|
+
|
15
|
+
# @param path [String] path of the source image
def initialize(path)
  super()
  @path = path
  # Keep the first 23 bytes of the file; #jp2? checks them for the JP2 signature.
  @initial_file_contents = File.read(path, 23, 0)
end
|
21
|
+
|
22
|
+
# @return [Boolean] true when the leading bytes of the file end with the JP2
#   `ftypjp2` signature
def jp2?
  # File.read with a length returns nil for an empty file; treat that as "not a JP2"
  @initial_file_contents.to_s.end_with?('ftypjp2')
end
|
25
|
+
|
26
|
+
# @return [Derivative::Rodeo::TechnicalMetadata]
|
27
|
+
def technical_metadata
  return @technical_metadata if defined?(@technical_metadata)

  # JP2 files are read directly; everything else goes through `identify`.
  extractor = jp2? ? ImageJp2Service : ImageIdentifyService
  @technical_metadata = extractor.technical_metadata_for(path: path)
end
|
36
|
+
alias metadata technical_metadata
|
37
|
+
|
38
|
+
extend Forwardable
|
39
|
+
def_delegator :technical_metadata, :monochrome?
|
40
|
+
|
41
|
+
# Convert source image to image at destination path, inferring file type from destination
|
42
|
+
# file extension. In case of JP2 files, create intermediate file using OpenJPEG 2000 that
|
43
|
+
# ImageMagick can use. Only outputs monochrome output if monochrome is true, destination
|
44
|
+
# format is TIFF.
|
45
|
+
#
|
46
|
+
# @param destination [String] Path to output / destination file
|
47
|
+
# @param monochrome [Boolean] true if monochrome output, otherwise false
|
48
|
+
def convert(destination:, monochrome: false)
  unsupported_output = destination.end_with?('jp2')
  raise 'JP2 output not yet supported' if unsupported_output

  # JP2 sources are first decoded to an intermediate TIFF.
  source = path
  source = jp2_to_tiff(path) if jp2?
  convert_image(source: source, destination: destination, monochrome: monochrome)
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
# Run ImageMagick `convert` from source to destination.
#
# Monochrome options are applied only when monochrome is requested and the
# last four characters of the destination contain 'tif'. The command runs as
# an argument vector (no shell), so paths with spaces or shell metacharacters
# are passed through unchanged.
#
# @param source [String] path of the input image
# @param destination [String] path of the output image
# @param monochrome [Boolean] whether monochrome output was requested
# @return [String] standard output of the command
def convert_image(source:, destination:, monochrome:)
  # slice returns nil for destinations shorter than four characters
  tiff_destination = destination.to_s.slice(-4, 4).to_s.include?('tif')
  argv = ['convert', source.to_s]
  argv += %w[-depth 1 -monochrome -compress Group4 -type bilevel] if monochrome && tiff_destination
  argv << destination.to_s
  IO.popen(argv, &:read)
end
|
64
|
+
|
65
|
+
# Decode a JP2 into an intermediate TIFF using `opj_decompress`.
#
# The command runs as an argument vector (no shell), so a source path with
# spaces or shell metacharacters is passed through unchanged.
#
# NOTE(review): the temporary directory created here is not removed by this
# method — confirm whether callers clean it up.
#
# @param source [String] path of the JP2 file
# @return [String] path of the intermediate TIFF
def jp2_to_tiff(source)
  intermediate_path = File.join(Dir.mktmpdir, 'intermediate.tif')
  IO.popen(['opj_decompress', '-i', source.to_s, '-o', intermediate_path], &:read)
  intermediate_path
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,177 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'open3'
|
4
|
+
require 'securerandom'
|
5
|
+
require 'tmpdir'
|
6
|
+
|
7
|
+
module DerivativeRodeo
|
8
|
+
module Services
|
9
|
+
module PdfSplitter
|
10
|
+
##
|
11
|
+
# @param name [String]
|
12
|
+
# @return [PdfSplitter::Base]
|
13
|
+
def self.for(name)
  # e.g. a name of "jpg" is turned into the class name "JpgPage"
  splitter_name = "#{name.to_s.classify}_page".classify
  qualified_name = "DerivativeRodeo::Services::PdfSplitter::#{splitter_name}"
  qualified_name.constantize
end
|
17
|
+
|
18
|
+
##
|
19
|
+
# @abstract
|
20
|
+
#
|
21
|
+
# The purpose of this class is to split the PDF into constituent image files.
|
22
|
+
#
|
23
|
+
# @see #each
|
24
|
+
class Base
|
25
|
+
class_attribute :image_extension
|
26
|
+
class_attribute :default_dpi, default: 400
|
27
|
+
# Should we perform compression logic on the images?
|
28
|
+
class_attribute :compression, default: nil
|
29
|
+
# What is the image quality we're using?
|
30
|
+
class_attribute :quality, default: nil
|
31
|
+
|
32
|
+
class_attribute :gsdevice, instance_accessor: false
|
33
|
+
class_attribute :page_count_regexp, instance_accessor: true, default: /^Pages: +(\d+)$/
|
34
|
+
##
|
35
|
+
# @api public
|
36
|
+
#
|
37
|
+
# @param path [String] The path to the PDF
|
38
|
+
#
|
39
|
+
# @return [Enumerable, Utilities::PdfSplitter::Base]
|
40
|
+
# The default baseid previously referenced the undefined constant `SureRandom`,
# raising NameError whenever a caller omitted :baseid.
def self.call(path, baseid: SecureRandom.uuid, tmpdir: Dir.mktmpdir)
  new(path, baseid: baseid, tmpdir: tmpdir)
end
|
43
|
+
|
44
|
+
##
|
45
|
+
# @param path [String] the path to the source PDF that we're processing.
|
46
|
+
# @param baseid [String] used for creating a unique identifier
|
47
|
+
# @param tmpdir [String] place to perform the "work" of splitting the PDF.
|
48
|
+
# @param pdf_pages_summary [Derivative::Rodeo::PdfPagesSummary] by default we'll
|
49
|
+
# extract this from the given path, but for testing purposes, you might want to
|
50
|
+
# provide a specific summary.
|
51
|
+
# @param logger [Logger, #error]
|
52
|
+
def initialize(path,
               baseid: SecureRandom.uuid,
               # TODO: Do we need to provide the :tmpdir for the application?
               tmpdir: Dir.mktmpdir,
               pdf_pages_summary: PagesSummary.extract_from(path: path),
               logger: DerivativeRodeo.config.logger)
  @pdfpath = path
  @baseid = baseid
  @tmpdir = tmpdir
  @pdf_pages_summary = pdf_pages_summary
  @logger = logger
end
|
64
|
+
|
65
|
+
attr_reader :logger
|
66
|
+
|
67
|
+
# In creating {#each} we get many of the methods of array operation (e.g. #to_a).
|
68
|
+
include Enumerable
|
69
|
+
|
70
|
+
##
|
71
|
+
# @api public
|
72
|
+
#
|
73
|
+
# @yieldparam [String] the path to the page's tiff.
|
74
|
+
def each(&block)
  # entries are computed once (see #entries) and then iterated
  pages = entries
  pages.each(&block)
end
|
77
|
+
|
78
|
+
# @api private
|
79
|
+
# @return [Boolean] true when the pages summary does not report itself as valid
def invalid_pdf?
  return false if pdf_pages_summary.valid?

  true
end
|
82
|
+
|
83
|
+
attr_reader :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
|
84
|
+
private :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
|
85
|
+
|
86
|
+
# @api private
|
87
|
+
# @return [String] the class-level gsdevice setting
# @raise [NotImplementedError] when the class has not set gsdevice
def gsdevice
  self.class.gsdevice || raise(NotImplementedError, "#{self.class}#gsdevice")
end
|
92
|
+
|
93
|
+
private
|
94
|
+
|
95
|
+
# entries for each page
|
96
|
+
def entries
  # the conversion runs once; later calls reuse the memoized list
  @entries ||= Array.wrap(gsconvert)
end
|
101
|
+
|
102
|
+
# @return [String] output path template inside tmpdir; `%d` is the placeholder
#   for the page number
def output_base
  return @output_base if @output_base

  file_pattern = "#{baseid}-page%d.#{image_extension}"
  @output_base = File.join(tmpdir, file_pattern)
end
|
105
|
+
|
106
|
+
# Run Ghostscript and return one output file name per page it reports.
#
# Open3.capture3 reads stdout and stderr concurrently. The previous code read
# stderr to EOF before touching stdout, which can deadlock once the child
# fills the stdout pipe.
#
# @return [Array<String>] file names built from #output_base, numbered from 1
def gsconvert
  # NOTE: you must call gsdevice before compression, as compression is
  # updated during the gsdevice call.
  stdout, err, _status = Open3.capture3(gsconvert_cmd(output_base))
  logger.error "#{self.class}#gsconvert encountered the following error with `gs': #{err}" if err.present?

  # each "Page N" line on stdout stands for one generated page
  page_total = stdout.split("\n").count { |line| line.start_with?('Page ') }
  (1..page_total).map { |page_number| format(output_base, page_number) }
end
|
126
|
+
|
127
|
+
# Currently has an empty body and returns nil.
def create_file_name(line:, page_number:)
  # no-op
end
|
128
|
+
|
129
|
+
# Build (once) the Ghostscript command line.
#
# @param output_base [String] value for -sOutputFile
# @return [String] the memoized command; later calls ignore their argument
def gsconvert_cmd(output_base)
  return @gsconvert_cmd if @gsconvert_cmd

  # gsdevice is interpolated first, before compression is consulted
  parts = ["gs -dNOPAUSE -dBATCH -sDEVICE=#{gsdevice} -dTextAlphaBits=4"]
  parts << "-sCompression=#{compression}" if compression?
  parts << "-dJPEGQ=#{quality}" if quality?
  parts << "-sOutputFile=#{output_base} -r#{ppi} -f #{pdfpath}"
  @gsconvert_cmd = parts.join(' ')
end
|
138
|
+
|
139
|
+
# Ask `pdfinfo` for the number of pages in the PDF (memoized).
#
# Open3.capture3 reads stdout and stderr concurrently, and output without a
# page-count line now raises a descriptive error instead of NoMethodError on a
# nil match.
#
# @return [Integer]
# @raise [RuntimeError] when pdfinfo prints nothing, or its output does not
#   match page_count_regexp
def pagecount
  return @pagecount if defined? @pagecount

  output, err, _status = Open3.capture3("pdfinfo #{pdfpath}")
  logger.error "#{self.class}#pagecount encountered the following error with `pdfinfo': #{err}" if err.present?
  raise "pdfinfo failed to return output for #{pdfpath} - #{err}" if output.blank?

  match = page_count_regexp.match(output)
  raise "pdfinfo output for #{pdfpath} did not include a page count" if match.nil?

  @pagecount = match[1].to_i
end
|
154
|
+
|
155
|
+
# @return [Integer] resolution passed to Ghostscript: the detected image PPI
#   for media that looks scanned, otherwise default_dpi
def ppi
  looks_scanned? ? pdf_pages_summary.ppi : default_dpi
end
|
164
|
+
|
165
|
+
# @return [Boolean] true when there is a single image per page and the
#   summary's width times height exceeds 10 * 1024 * 1024 pixels
def looks_scanned?
  summary = pdf_pages_summary
  pixel_count = summary.width * summary.height
  single_image_per_page? && pixel_count > 10 * 1024 * 1024
end
|
170
|
+
|
171
|
+
# @return [Boolean] true when the summary's page_count equals #pagecount
def single_image_per_page?
  summary_count = pdf_pages_summary.page_count
  summary_count == pagecount
end
|
174
|
+
end
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Services
|
5
|
+
module PdfSplitter
|
6
|
+
# The purpose of this class is to split the PDF into constituent jpg files.
|
7
|
+
class JpgPage < PdfSplitter::Base
  # Ghostscript output device name (used for -sDEVICE)
  self.gsdevice = 'jpeg'
  # extension of the generated page files
  self.image_extension = 'jpg'
  # JPEG quality handed to Ghostscript (used for -dJPEGQ)
  self.quality = '50'
end
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|