derivative-rodeo 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +6 -0
- data/LICENSE +15 -0
- data/README.md +251 -0
- data/Rakefile +42 -0
- data/derivative_rodeo.gemspec +54 -0
- data/lib/derivative/rodeo.rb +3 -0
- data/lib/derivative-rodeo.rb +3 -0
- data/lib/derivative_rodeo/configuration.rb +95 -0
- data/lib/derivative_rodeo/errors.rb +56 -0
- data/lib/derivative_rodeo/generators/base_generator.rb +200 -0
- data/lib/derivative_rodeo/generators/concerns/copy_file_concern.rb +28 -0
- data/lib/derivative_rodeo/generators/copy_generator.rb +14 -0
- data/lib/derivative_rodeo/generators/hocr_generator.rb +112 -0
- data/lib/derivative_rodeo/generators/monochrome_generator.rb +39 -0
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +61 -0
- data/lib/derivative_rodeo/generators/thumbnail_generator.rb +38 -0
- data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +39 -0
- data/lib/derivative_rodeo/services/base_service.rb +15 -0
- data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +87 -0
- data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +218 -0
- data/lib/derivative_rodeo/services/image_identify_service.rb +89 -0
- data/lib/derivative_rodeo/services/image_jp2_service.rb +112 -0
- data/lib/derivative_rodeo/services/image_service.rb +73 -0
- data/lib/derivative_rodeo/services/pdf_splitter/base.rb +177 -0
- data/lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb +14 -0
- data/lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb +130 -0
- data/lib/derivative_rodeo/services/pdf_splitter/png_page.rb +26 -0
- data/lib/derivative_rodeo/services/pdf_splitter/tiff_page.rb +52 -0
- data/lib/derivative_rodeo/services/pdf_splitter_service.rb +19 -0
- data/lib/derivative_rodeo/services/url_service.rb +42 -0
- data/lib/derivative_rodeo/storage_locations/base_location.rb +251 -0
- data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +67 -0
- data/lib/derivative_rodeo/storage_locations/file_location.rb +39 -0
- data/lib/derivative_rodeo/storage_locations/http_location.rb +13 -0
- data/lib/derivative_rodeo/storage_locations/https_location.rb +13 -0
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +103 -0
- data/lib/derivative_rodeo/storage_locations/sqs_location.rb +187 -0
- data/lib/derivative_rodeo/technical_metadata.rb +23 -0
- data/lib/derivative_rodeo/version.rb +5 -0
- data/lib/derivative_rodeo.rb +36 -0
- metadata +339 -0
@@ -0,0 +1,218 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'forwardable'
|
4
|
+
require 'json'
|
5
|
+
require 'nokogiri'
|
6
|
+
|
7
|
+
module DerivativeRodeo
|
8
|
+
module Services
|
9
|
+
##
|
10
|
+
# Responsible for converting an SGML string into JSON coordinates
|
11
|
+
class ExtractWordCoordinatesFromHocrSgmlService
|
12
|
+
##
# Parse the given hOCR markup and return the word coordinates it contains as JSON.
#
# @param sgml [String] the SGML (e.g. XML or HTML) text of an hOCR file; a value that does
#   not start with `<` is treated as a file path by the constructor.
# @return [String] a JSON document
def self.call(sgml)
  extractor = new(sgml)
  extractor.to_json
end
|
18
|
+
|
19
|
+
##
# Load the hOCR source and run it through the SAX parser immediately, so that the
# collected results are available from the document stream afterwards.
#
# @param html [String] either the markup itself or the path of a file containing it.
def initialize(html)
  @source = if xml?(html)
              html
            else
              File.read(html)
            end
  @doc_stream = DocStream.new
  Nokogiri::HTML::SAX::Parser.new(@doc_stream).parse(@source)
end
|
29
|
+
attr_reader :doc_stream, :source
|
30
|
+
|
31
|
+
delegate :text, :width, :height, :words, to: :doc_stream
|
32
|
+
|
33
|
+
# Serialize the words and page dimensions gathered by the document stream.
# The result is computed once and memoized.
#
# @return [String] JSON serialization of flattened word coordinates
def to_json
  return @to_json if @to_json

  stream = doc_stream
  @to_json = WordCoordinates.to_json(words: stream.words, width: stream.width, height: stream.height)
end
|
43
|
+
alias json to_json
|
44
|
+
|
45
|
+
private
|
46
|
+
|
47
|
+
# @param xml [String] candidate markup (or a file path).
# @return [Boolean] true when the first non-whitespace character is `<`.
def xml?(xml)
  xml.lstrip[0] == '<'
end
|
50
|
+
|
51
|
+
# SAX Document Stream class to gather text and word tokens from hOCR
|
52
|
+
class DocStream < Nokogiri::XML::SAX::Document
|
53
|
+
attr_accessor :text, :words, :width, :height
|
54
|
+
|
55
|
+
# Prepare the empty state that the SAX callbacks fill in while parsing.
def initialize
  super()
  # page width and height, to be found in hOCR for `div.ocr_page`
  @width = nil
  @height = nil
  # plain text buffer
  @text = ''
  # list of word hashes, each containing word + coordinates
  @words = []
  # current word data, carried across #start_element, #characters and
  # #end_element (to associate a word with its coordinates)
  @current = nil
  # class name recorded by #start_element for later use by #end_element
  @element_class_name = nil
end
|
70
|
+
|
71
|
+
# Return coordinates from `span.ocrx_word` element attribute hash.
#
# The `bbox` property is located by name among the `;`-separated properties of the
# `title` attribute, so it does not have to be the first property.
#
# @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
# @return [Array<Integer>] position x, y, width, height in px; an empty Array when the
#   title is missing or carries no complete `bbox` (such a word fails the
#   four-coordinate check in #word_complete?).
def s_coords(attrs)
  bbox = attrs['title'].to_s.split(';').map(&:strip).find { |property| property.start_with?('bbox ') }
  return [] if bbox.nil?

  x1, y1, x2, y2 = bbox.split.drop(1).map(&:to_i)
  return [] if y2.nil?

  [x1, y1, x2 - x1, y2 - y1]
end
|
85
|
+
|
86
|
+
# Decide whether an element takes part in processing. Only these do:
# - `div.ocr_page` — to get page width/height
# - `span.ocr_line` — to help make plain text readable
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
#
# @param name [String] element name
# @param class_name [String, nil] HTML class name
# @return [Boolean] true if element should be processed; otherwise false
def consider?(name, class_name)
  case "#{name}.#{class_name}"
  when 'div.ocr_page', 'span.ocr_line', 'span.ocrx_word' then true
  else false
  end
end
|
97
|
+
|
98
|
+
# Begin a fresh word record for a `span.ocrx_word` element.
#
# @param attrs [Hash] attributes of the word element
# @return [Array] the coordinates stored for the new word
def start_word(attrs)
  # :word stays nil until #characters supplies the text
  @current = { word: nil }
  @current[:coordinates] = s_coords(attrs)
end
|
104
|
+
|
105
|
+
# Record the page dimensions from a `div.ocr_page` element.
#
# The `bbox` property is located by name among the `;`-separated properties of the
# `title` attribute (it is not assumed to be the second property). As before, the third
# and fourth bbox numbers are stored as width and height. When no `bbox` is present the
# dimensions are left untouched.
#
# @param attrs [Hash] attributes of the page element
def start_page(attrs)
  bbox = attrs['title'].to_s.split(';').map(&:strip).find { |property| property.start_with?('bbox ') }
  return if bbox.nil?

  numbers = bbox.split.drop(1).map(&:to_i)
  # width and height:
  @width = numbers[2]
  @height = numbers[3]
end
|
113
|
+
|
114
|
+
# @return [Boolean] true when a current word exists, has non-blank text and
#   carries exactly four coordinates.
def word_complete?
  if @current.nil?
    false
  else
    @current[:word].present? && @current[:coordinates].size == 4
  end
end
|
119
|
+
|
120
|
+
# Finish the current word: pad the plain text buffer and keep the word if complete.
def end_word
  # trailing space in the plaintext buffer separates this word from the next:
  @text = "#{@text} "
  @words << @current if word_complete?
end
|
125
|
+
|
126
|
+
# Terminate the current line in the plain text buffer: surrounding whitespace is
# stripped from the buffer and a line break is appended.
#
# A new String is built rather than mutating in place, because the buffer may still be
# the frozen `''` literal assigned in #initialize (this file enables
# `frozen_string_literal`), on which `strip!` raises FrozenError.
def end_line
  @text = "#{@text.strip}\n"
end
|
132
|
+
|
133
|
+
# SAX callback for an opening tag. The element's class is always recorded; beyond
# that, only these elements trigger work:
# - `div.ocr_page` — to get page width/height
# - `span.ocr_line` — to help make plain text readable
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
#
# @param name [String] element name.
# @param attrs [Array] Array of key, value pair Arrays.
def start_element(name, attrs = [])
  attributes = attrs.to_h
  @element_class_name = attributes['class']
  return unless consider?(name, @element_class_name)

  case @element_class_name
  when 'ocrx_word' then start_word(attributes)
  when 'ocr_page' then start_page(attributes)
  end
end
|
147
|
+
|
148
|
+
# SAX callback for text nodes. Text is appended to both the current word and the
# plain text buffer, but only while a current word with coordinates exists.
#
# @param value [String] the text content reported by the parser
def characters(value)
  return if @current.nil? || @current[:coordinates].nil?

  @current[:word] = (@current[:word] || '') + value
  @text += value
end
|
155
|
+
|
156
|
+
# SAX callback for a closing tag: flush the current word, or append a line ending to
# the plain text, depending on the recorded class name.
#
# NOTE(review): the class consulted here is the one stored by the most recent
# #start_element call, which is not necessarily the class of the element being
# closed — confirm this is intended for nested elements.
#
# @param _name [String] element name.
def end_element(_name)
  case @element_class_name
  when 'ocr_line' then end_line
  when 'ocrx_word' then end_word
  end
end
|
164
|
+
|
165
|
+
# Callback for completion of parsing hOCR, used to normalize generated
# text content (strip unneeded whitespace incidental to output).
#
# Carriage returns are removed, every line is stripped of surrounding whitespace and
# empty lines are dropped. (Previously the result of `String#delete` was discarded, so
# carriage returns inside a line were never removed.)
def end_document
  @text = @text.delete("\r").split("\n").map(&:strip).reject(&:empty?).join("\n")
end
|
176
|
+
end
|
177
|
+
|
178
|
+
# Serializes word/coordinate pairs, together with the canvas dimensions, as JSON.
class WordCoordinates
  ##
  # @api public
  #
  # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
  # @param width [Integer] the width of the "canvas" on which the words appear.
  # @param height [Integer] the height of the "canvas" on which the words appear.
  #
  # @return [String] a JSON encoded string.
  def self.to_json(words:, width: nil, height: nil)
    new(words: words, width: width, height: height).to_json
  end

  attr_reader :words, :width, :height

  def initialize(words:, width:, height:)
    @height = height
    @width = width
    @words = words
  end

  # Output JSON flattened word coordinates: each distinct word maps to the list of
  # coordinates of all its occurrences, in the order encountered.
  #
  # @return [String] JSON serialization of flattened word coordinates
  def to_json
    grouped = words.each_with_object({}) do |entry, memo|
      (memo[entry[:word]] ||= []) << entry[:coordinates]
    end
    JSON.generate({ width: width, height: height, coords: grouped })
  end
end
|
216
|
+
end
|
217
|
+
end
|
218
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Services
|
5
|
+
##
|
6
|
+
# This module is responsible for extracting technical_metadata for a given path.
|
7
|
+
#
|
8
|
+
# @see .technical_metadata_for
|
9
|
+
class ImageIdentifyService < BaseService
|
10
|
+
class_attribute :identify_format_option,
|
11
|
+
default: %(Geometry: %G\nDepth: %[bit-depth]\nColorspace: %[colorspace]\nAlpha: %A\nMIME Step: %m\n) # rubocop:disable Layout/LineLength
|
12
|
+
|
13
|
+
##
# @api public
#
# Build a service instance for the path and return the metadata it extracts.
#
# @param path [String]
# @return [TechnicalMetadata]
def self.technical_metadata_for(path:)
  service = new(path)
  service.technical_metadata
end
|
20
|
+
|
21
|
+
# @param path [String] path of the file to inspect
def initialize(path)
  super()
  @path = path
  # Only the leading 23 bytes are kept; they hold the file's magic and are
  # inspected by #pdf?.
  @initial_file_contents = File.read(path, 23, 0)
end
|
27
|
+
attr_reader :path
|
28
|
+
|
29
|
+
# Return metadata by means of imagemagick identify: dimensions, content type and
# colour information are parsed from the identify output lines.
#
# @return [TechnicalMetadata]
def technical_metadata
  TechnicalMetadata.new.tap do |metadata|
    lines = im_identify
    metadata.width, metadata.height = im_identify_geometry(lines)
    metadata.content_type = im_mime(lines)
    populate_im_color!(lines, metadata)
  end
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
# Run imagemagick `identify` once and memoize its output.
#
# Rather than asking for every property, only the properties named in
# `identify_format_option` are requested. The command is passed as an argument vector
# (no shell involved), so paths containing spaces or shell metacharacters are handed
# to `identify` verbatim.
#
# @return [Array<String>] lines of output from imagemagick `identify`
def im_identify
  return @im_identify if defined?(@im_identify)

  command = ['identify', '-format', identify_format_option, path.to_s]
  @im_identify = IO.popen(command, &:read).lines
end
|
52
|
+
|
53
|
+
# Parse the `geometry` line of the identify output; anything from the first `+`
# onwards is ignored.
#
# @param lines [Array<String>] identify output lines
# @return [Array(Integer, Integer)] width, height in Integer px units
def im_identify_geometry(lines)
  dimensions, = im_line_select(lines, 'geometry').split('+')
  dimensions.split('x').map(&:to_i)
end
|
58
|
+
|
59
|
+
# @param lines [Array<String>] identify output lines
# @return [String, nil] 'application/pdf' for PDFs, otherwise the value of the
#   `mime step` line of the identify output.
def im_mime(lines)
  # workaround older imagemagick bug
  pdf? ? 'application/pdf' : im_line_select(lines, 'mime step')
end
|
64
|
+
|
65
|
+
# @return [Boolean] true when the leading bytes of the file start with the `%PDF-`
#   signature. An empty file (for which no leading bytes were read) is not a PDF.
def pdf?
  @initial_file_contents.to_s.start_with?('%PDF-')
end
|
68
|
+
|
69
|
+
# Fill in the colour-related fields of the metadata from the identify output.
#
# The alpha check previously read `!value == 'Undefined'`, which Ruby parses as
# `(!value) == 'Undefined'` and is therefore always false; the alpha channel was never
# counted.
#
# @param lines [Array<String>] identify output lines
# @param technical_metadata [#num_components=, #color=, #bits_per_component=]
def populate_im_color!(lines, technical_metadata)
  bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
  color = im_line_select(lines, 'colorspace') == 'Gray' ? 'gray' : 'color'
  alpha = im_line_select(lines, 'alpha')
  # 'Undefined' means no alpha channel; 'False' is presumably what older ImageMagick
  # releases print for `%A` — TODO confirm against the supported versions.
  has_alpha = !alpha.nil? && !%w[undefined false].include?(alpha.downcase)
  technical_metadata.num_components = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
  technical_metadata.color = bpc == 1 ? 'monochrome' : color
  technical_metadata.bits_per_component = bpc
end
|
78
|
+
|
79
|
+
# Find the first "key: value" line whose key starts with the given key
# (case-insensitively) and return its value.
#
# @param lines [Array<String>] identify output lines
# @param key [String] line prefix to look for
# @return [String, nil] the text after the last `:` with surrounding whitespace
#   removed; nil when no line matches.
def im_line_select(lines, key)
  prefix = key.downcase
  match = lines.find { |candidate| candidate.scrub.downcase.strip.start_with?(prefix) }
  return match if match.nil?

  match.strip.split(':').last.strip
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
# rubocop:disable Style/FrozenStringLiteralComment
|
2
|
+
# TODO freeze them literals
|
3
|
+
|
4
|
+
module DerivativeRodeo
|
5
|
+
module Services
|
6
|
+
##
|
7
|
+
# A utility class for extracting technical metadata from a JP2.
|
8
|
+
#
|
9
|
+
# @see .technical_metadata_for
|
10
|
+
class ImageJp2Service < BaseService
|
11
|
+
TOKEN_MARKER_START = "\xFF".force_encoding('BINARY')
|
12
|
+
TOKEN_MARKER_SIZ = "\x51".force_encoding('BINARY')
|
13
|
+
TOKEN_IHDR = 'ihdr'.freeze
|
14
|
+
|
15
|
+
##
# @api public
#
# Build a service instance for the path and return the metadata it extracts.
#
# @param path [String] path to jp2, for reading
#
# @return [TechnicalMetadata]
def self.technical_metadata_for(path:)
  service = new(path)
  service.technical_metadata
end
|
24
|
+
|
25
|
+
attr_reader :path
|
26
|
+
|
27
|
+
# @param path [String] path to the jp2 file; it is only stored here and opened later
#   by #technical_metadata.
def initialize(path)
  super()
  @path = path
end
|
31
|
+
|
32
|
+
# rubocop:disable Metrics/MethodLength
|
33
|
+
# Read dimensions and component information from the JP2 file at #path.
#
# The file is opened with the block form of File.open, so it is closed on every exit
# path and a failure to open it (e.g. a missing file) surfaces as the original error
# rather than a NoMethodError from closing a nil handle.
#
# @return [TechnicalMetadata]
# @raise [IOError] when the file is not a JP2 file
def technical_metadata
  File.open(path, 'rb') do |io|
    validate_jp2(io)
    x_siz, y_siz = extract_jp2_dim(io)
    nc, bpc = extract_jp2_components(io)
    color = nc >= 3 ? 'color' : 'gray'
    TechnicalMetadata.new(
      color: bpc == 1 ? 'monochrome' : color,
      num_components: nc,
      bits_per_component: bpc,
      width: x_siz,
      height: y_siz,
      content_type: 'image/jp2'
    )
  end
end
|
51
|
+
# rubocop:enable Metrics/MethodLength
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
# Locate the SIZ marker (0xFF 0x51) and read the X and Y sizes that follow it.
#
# Informed by ISO/IEC 15444-1:2000, pp. 26-27
# via:
# http://hosting.astro.cornell.edu/~carcich/LRO/jp2/ISO_JPEG200_Standard/INCITS+ISO+IEC+15444-1-2000.pdf
#
# @param io [IO] IO stream opened in binary mode, for reading
# @return [Array(Integer, Integer)] X size, Y size, in Integer px units
# @raise [IOError] when the stream is not in binary mode, when the end of the file is
#   reached before a SIZ marker is seen (previously this looped forever), or when the
#   marker segment is cut short.
# rubocop:disable Metrics/MethodLength
def extract_jp2_dim(io)
  raise IOError, 'file not open in binary mode' unless io.binmode?

  # first 23 bytes are file-magic, we can skip
  io.seek(23, IO::SEEK_SET)
  byte = io.read(1)
  loop do
    raise IOError, 'SIZ marker not found before end of file' if byte.nil?

    # A marker is 0xFF followed by the marker code; a non-matching code byte is
    # itself re-examined as a possible marker start on the next pass.
    at_marker_start = byte == TOKEN_MARKER_START
    byte = io.read(1)
    break if at_marker_start && byte == TOKEN_MARKER_SIZ
  end

  # on 0x51, read next 12 bytes:
  # discard first 4 bytes; next 4 bytes are XSiz; last 4 bytes are YSiz
  siz = io.read(12)
  raise IOError, 'truncated SIZ marker segment' if siz.nil? || siz.bytesize < 12

  [siz.byteslice(4, 4).unpack1('N'), siz.byteslice(8, 4).unpack1('N')]
end
|
85
|
+
# rubocop:enable Metrics/MethodLength
|
86
|
+
|
87
|
+
# Read the component count and bit depth that follow the `ihdr` token.
#
# @param io [IO] IO stream opened in binary mode, for reading
# @return [Array(Integer, Integer)] number components, bits-per-component
# @raise [IOError] when the stream is not in binary mode, when no `ihdr` token occurs
#   in the first 64 bytes, or when fewer than 11 bytes follow it. (The earlier
#   `split(...)[-1]` check could not detect a missing token: without a match, split
#   returns the whole buffer.)
def extract_jp2_components(io)
  raise IOError, 'file not open in binary mode' unless io.binmode?

  io.seek(0, IO::SEEK_SET)
  # IHDR should be in first 64 bytes
  header = io.read(64).to_s
  ihdr_at = header.rindex(TOKEN_IHDR)
  raise IOError, 'ihdr box not found in first 64 bytes' if ihdr_at.nil?

  ihdr_data = header.byteslice(ihdr_at + TOKEN_IHDR.bytesize, header.bytesize)
  raise IOError, 'truncated ihdr box' if ihdr_data.bytesize < 11

  num_components = ihdr_data.byteslice(8, 2).unpack1('n')
  # stored as "bit depth of the components in the codestream, minus 1", so add 1
  bits_per_component = ihdr_data.byteslice(10, 1).unpack1('c') + 1
  [num_components, bits_per_component]
end
|
103
|
+
|
104
|
+
# Verify the file is a JP2 by checking that its first 23 bytes end with `ftypjp2`.
#
# @param io [IO] stream positioned at the start of the file
# @raise [IOError] when the signature is absent, including when nothing could be read
#   (an empty file) — previously that case raised NoMethodError.
def validate_jp2(io)
  magic = io.read(23)
  raise IOError, 'Not JP2 file' unless magic&.end_with?('ftypjp2')
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
# rubocop:enable Style/FrozenStringLiteralComment
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'tmpdir'
|
4
|
+
|
5
|
+
module DerivativeRodeo
|
6
|
+
module Services
|
7
|
+
##
|
8
|
+
# @api private
|
9
|
+
#
|
10
|
+
# @see .technical_metadata
|
11
|
+
# @see .convert
|
12
|
+
class ImageService < BaseService
|
13
|
+
attr_accessor :path
|
14
|
+
|
15
|
+
# @param path [String] path of the source image
def initialize(path)
  super()
  @path = path
  # Only the leading 23 bytes are kept; they hold the file's magic and are
  # inspected by #jp2?.
  @initial_file_contents = File.read(path, 23, 0)
end
|
21
|
+
|
22
|
+
# @return [Boolean] true when the leading bytes of the file end with `ftypjp2`.
#   An empty file (for which no leading bytes were read) is not a JP2.
def jp2?
  @initial_file_contents.to_s.end_with?('ftypjp2')
end
|
25
|
+
|
26
|
+
# Metadata for the source image, computed once and memoized. JP2 files are handled by
# ImageJp2Service, everything else by ImageIdentifyService.
#
# @return [TechnicalMetadata]
def technical_metadata
  return @technical_metadata if defined?(@technical_metadata)

  extractor = jp2? ? ImageJp2Service : ImageIdentifyService
  @technical_metadata = extractor.technical_metadata_for(path: path)
end
|
36
|
+
alias metadata technical_metadata
|
37
|
+
|
38
|
+
extend Forwardable
|
39
|
+
def_delegator :technical_metadata, :monochrome?
|
40
|
+
|
41
|
+
# Convert source image to image at destination path, inferring file type from destination
# file extension. In case of JP2 files, create intermediate file using OpenJPEG 2000 that
# ImageMagick can use. Only outputs monochrome output if monochrome is true, destination
# format is TIFF.
#
# The intermediate file created for JP2 sources is deleted once the conversion has
# finished (or failed), and its directory is removed when that leaves it empty;
# previously both were left behind on every call.
#
# @param destination [String] Path to output / destination file
# @param monochrome [Boolean] true if monochrome output, otherwise false
# @raise [RuntimeError] when the destination ends with `jp2`
def convert(destination:, monochrome: false)
  raise 'JP2 output not yet supported' if destination.end_with?('jp2')
  return convert_image(source: path, destination: destination, monochrome: monochrome) unless jp2?

  intermediate = jp2_to_tiff(path)
  begin
    convert_image(source: intermediate, destination: destination, monochrome: monochrome)
  ensure
    # Never touch the original source, only the temporary intermediate.
    if intermediate != path
      File.delete(intermediate) if File.exist?(intermediate)
      workdir = File.dirname(intermediate)
      Dir.rmdir(workdir) if Dir.exist?(workdir) && Dir.empty?(workdir)
    end
  end
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
# Run ImageMagick `convert` from source to destination.
#
# The command is passed as an argument vector (no shell involved), so paths containing
# spaces or shell metacharacters reach `convert` verbatim. The bilevel options are only
# applied when monochrome output was requested and the last four characters of the
# destination contain `tif`; destinations shorter than four characters no longer raise.
#
# @param source [String] path of the image to read
# @param destination [String] path of the image to write
# @param monochrome [Boolean] whether bilevel output was requested
# @return [String] whatever `convert` wrote to standard output
def convert_image(source:, destination:, monochrome:)
  bilevel = monochrome && destination.slice(-4, 4).to_s.include?('tif')
  command = ['convert', source.to_s]
  command.concat(%w[-depth 1 -monochrome -compress Group4 -type bilevel]) if bilevel
  command << destination.to_s
  IO.popen(command, &:read)
end
|
64
|
+
|
65
|
+
# Decode a JP2 into an `intermediate.tif` inside a freshly created temporary directory,
# using `opj_decompress`.
#
# The command is passed as an argument vector (no shell involved), so a source path
# containing spaces or shell metacharacters is handed over verbatim. This method does
# not remove the temporary directory itself.
#
# @param source [String] path of the JP2 file
# @return [String] path of the intermediate TIFF
def jp2_to_tiff(source)
  intermediate_path = File.join(Dir.mktmpdir, 'intermediate.tif')
  IO.popen(['opj_decompress', '-i', source.to_s, '-o', intermediate_path], &:read)
  intermediate_path
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,177 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'open3'
require 'securerandom'
require 'shellwords'
require 'tmpdir'
|
6
|
+
|
7
|
+
module DerivativeRodeo
|
8
|
+
module Services
|
9
|
+
module PdfSplitter
|
10
|
+
##
# Look up the splitter class for a short name by classifying `<name>_page` and
# resolving it inside this namespace.
#
# @param name [String, Symbol]
# @return [Class] the constant named by the classified name
def self.for(name)
  splitter = [name.to_s.classify, 'page'].join('_').classify
  ['DerivativeRodeo::Services::PdfSplitter', splitter].join('::').constantize
end
|
17
|
+
|
18
|
+
##
|
19
|
+
# @abstract
|
20
|
+
#
|
21
|
+
# The purpose of this class is to split the PDF into constituent image files.
|
22
|
+
#
|
23
|
+
# @see #each
|
24
|
+
class Base
|
25
|
+
class_attribute :image_extension
|
26
|
+
class_attribute :default_dpi, default: 400
|
27
|
+
# Should we perform compression logic on the images?
|
28
|
+
class_attribute :compression, default: nil
|
29
|
+
# What is the image quality we're using?
|
30
|
+
class_attribute :quality, default: nil
|
31
|
+
|
32
|
+
class_attribute :gsdevice, instance_accessor: false
|
33
|
+
class_attribute :page_count_regexp, instance_accessor: true, default: /^Pages: +(\d+)$/
|
34
|
+
##
# @api public
#
# Build a splitter for the given PDF. The default `baseid` previously referenced the
# misspelled constant `SureRandom`, which raised NameError whenever the caller did not
# pass one.
#
# @param path [String] the path to the PDF
# @param baseid [String] used for creating a unique identifier; defaults to a random UUID
# @param tmpdir [String] place to perform the "work" of splitting the PDF; defaults to
#   a newly created temporary directory
#
# @return [Enumerable] the new splitter instance
def self.call(path, baseid: SecureRandom.uuid, tmpdir: Dir.mktmpdir)
  new(path, baseid: baseid, tmpdir: tmpdir)
end
|
43
|
+
|
44
|
+
##
# Store the collaborators used while splitting; nothing is converted here.
#
# @param path [String] the path to the source PDF that we're processing.
# @param baseid [String] used for creating a unique identifier
# @param tmpdir [String] place to perform the "work" of splitting the PDF.
# @param pdf_pages_summary [PagesSummary] by default extracted from the given path;
#   for testing purposes, you might want to provide a specific summary.
# @param logger [Logger, #error]
def initialize(path,
               baseid: SecureRandom.uuid,
               # TODO: Do we need to provide the :tmpdir for the application?
               tmpdir: Dir.mktmpdir,
               pdf_pages_summary: PagesSummary.extract_from(path: path),
               logger: DerivativeRodeo.config.logger)
  @pdfpath = path
  @baseid = baseid
  @tmpdir = tmpdir
  @pdf_pages_summary = pdf_pages_summary
  @logger = logger
end
|
64
|
+
|
65
|
+
attr_reader :logger
|
66
|
+
|
67
|
+
# In creating {#each} we get many of the methods of array operation (e.g. #to_a).
|
68
|
+
include Enumerable
|
69
|
+
|
70
|
+
##
# @api public
#
# Iterate over the generated page files; without a block, the underlying
# enumerator is returned.
#
# @yieldparam [String] the path to the page's image file.
def each(&block)
  pages = entries
  pages.each(&block)
end
|
77
|
+
|
78
|
+
# @api private
#
# @return [Boolean] the negation of the pages summary's `valid?`.
def invalid_pdf?
  pdf_pages_summary.valid? ? false : true
end
|
82
|
+
|
83
|
+
attr_reader :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
|
84
|
+
private :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
|
85
|
+
|
86
|
+
# @api private
#
# @return [String] the ghostscript device configured on the class
# @raise [NotImplementedError] when the class has not configured a device
def gsdevice
  self.class.gsdevice || raise(NotImplementedError, "#{self.class}#gsdevice")
end
|
92
|
+
|
93
|
+
private
|
94
|
+
|
95
|
+
# Entries for each page: the result of #gsconvert wrapped in an Array. The
# conversion runs on first use only; the result is memoized.
def entries
  return @entries if instance_variable_defined?(:@entries)

  converted = gsconvert
  @entries = Array.wrap(converted)
end
|
101
|
+
|
102
|
+
# Template for the per-page output files inside the working directory. The literal
# `%d` is the placeholder for the page number. Memoized.
#
# @return [String]
def output_base
  return @output_base if @output_base

  file_name = [baseid, '-page%d.', image_extension].join
  @output_base = File.join(tmpdir, file_name)
end
|
105
|
+
|
106
|
+
# Run ghostscript and return one output file name per page it reports.
#
# Both output streams are collected with Open3.capture3. Draining stderr to the end
# before touching stdout (as was done with popen3) can block forever once the child
# fills the stdout pipe while this process is still waiting on stderr.
#
# NOTE: you must call gsdevice before compression, as compression is
# updated during the gsdevice call.
#
# @return [Array<String>] {#output_base} formatted with 1, 2, … for every stdout line
#   that starts with `Page `
def gsconvert
  stdout, err, _status = Open3.capture3(gsconvert_cmd(output_base))
  logger.error "#{self.class}#gsconvert encountered the following error with `gs': #{err}" if err.present?

  page_lines = stdout.split("\n").select { |line| line.start_with?('Page ') }
  page_lines.each_index.map { |index| format(output_base, index + 1) }
end
|
126
|
+
|
127
|
+
# Accepts the keywords and does nothing; always returns nil.
def create_file_name(line:, page_number:)
  nil
end
|
128
|
+
|
129
|
+
# Build the ghostscript command line. The result is memoized, so the argument only
# matters on the first call.
#
# The output template and the PDF path are shell-escaped, because the command is
# handed to a shell as a single string; paths containing spaces or shell
# metacharacters would otherwise be split or interpreted.
#
# @param output_base [String] output file template (contains `%d` for the page number)
# @return [String] the complete `gs` command
def gsconvert_cmd(output_base)
  @gsconvert_cmd ||= begin
    parts = ["gs -dNOPAUSE -dBATCH -sDEVICE=#{gsdevice} -dTextAlphaBits=4"]
    parts << "-sCompression=#{compression}" if compression?
    parts << "-dJPEGQ=#{quality}" if quality?
    parts << "-sOutputFile=#{Shellwords.escape(output_base.to_s)}"
    parts << "-r#{ppi}"
    parts << "-f #{Shellwords.escape(pdfpath.to_s)}"
    parts.join(' ')
  end
end
|
138
|
+
|
139
|
+
# Number of pages reported by `pdfinfo` for the PDF; memoized.
#
# `pdfinfo` is invoked with an argument vector (no shell involved), so the path is
# passed verbatim, and both output streams are collected with Open3.capture3 rather
# than being read one after the other.
#
# @return [Integer]
# @raise [RuntimeError] when `pdfinfo` produces no output, or when the output has no
#   line matching `page_count_regexp` (previously a NoMethodError on nil).
def pagecount
  return @pagecount if defined? @pagecount

  output, err, _status = Open3.capture3('pdfinfo', pdfpath.to_s)
  logger.error "#{self.class}#pagecount encountered the following error with `pdfinfo': #{err}" if err.present?
  raise "pdfinfo failed to return output for #{pdfpath} - #{err}" if output.blank?

  match = page_count_regexp.match(output)
  raise "pdfinfo output for #{pdfpath} did not report a page count" if match.nil?

  @pagecount = match[1].to_i
end
|
154
|
+
|
155
|
+
# Resolution to render at: for scanned media, defer to the detected image PPI;
# otherwise use the class default (400 dpi unless overridden).
def ppi
  looks_scanned? ? pdf_pages_summary.ppi : default_dpi
end
|
164
|
+
|
165
|
+
# Heuristic: single 10mp+ image per page?
#
# @return [Boolean] true when there is a single image per page and the summary's
#   width times height exceeds 10 * 1024 * 1024.
def looks_scanned?
  pixel_area = pdf_pages_summary.width * pdf_pages_summary.height
  return false unless single_image_per_page?

  pixel_area > 10 * 1024 * 1024
end
|
170
|
+
|
171
|
+
# @return [Boolean] true when the summary's page count equals the page count
#   reported by #pagecount.
def single_image_per_page?
  summary_count = pdf_pages_summary.page_count
  summary_count == pagecount
end
|
174
|
+
end
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DerivativeRodeo
|
4
|
+
module Services
|
5
|
+
module PdfSplitter
|
6
|
+
# Splits a PDF into one jpg file per page: configures the ghostscript `jpeg`
# device, the `jpg` file extension and a quality setting of 50.
class JpgPage < PdfSplitter::Base
  self.gsdevice = 'jpeg'
  self.image_extension = 'jpg'
  self.quality = '50'
end
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|