derivative-rodeo 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +6 -0
  3. data/LICENSE +15 -0
  4. data/README.md +251 -0
  5. data/Rakefile +42 -0
  6. data/derivative_rodeo.gemspec +54 -0
  7. data/lib/derivative/rodeo.rb +3 -0
  8. data/lib/derivative-rodeo.rb +3 -0
  9. data/lib/derivative_rodeo/configuration.rb +95 -0
  10. data/lib/derivative_rodeo/errors.rb +56 -0
  11. data/lib/derivative_rodeo/generators/base_generator.rb +200 -0
  12. data/lib/derivative_rodeo/generators/concerns/copy_file_concern.rb +28 -0
  13. data/lib/derivative_rodeo/generators/copy_generator.rb +14 -0
  14. data/lib/derivative_rodeo/generators/hocr_generator.rb +112 -0
  15. data/lib/derivative_rodeo/generators/monochrome_generator.rb +39 -0
  16. data/lib/derivative_rodeo/generators/pdf_split_generator.rb +61 -0
  17. data/lib/derivative_rodeo/generators/thumbnail_generator.rb +38 -0
  18. data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +39 -0
  19. data/lib/derivative_rodeo/services/base_service.rb +15 -0
  20. data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +87 -0
  21. data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +218 -0
  22. data/lib/derivative_rodeo/services/image_identify_service.rb +89 -0
  23. data/lib/derivative_rodeo/services/image_jp2_service.rb +112 -0
  24. data/lib/derivative_rodeo/services/image_service.rb +73 -0
  25. data/lib/derivative_rodeo/services/pdf_splitter/base.rb +177 -0
  26. data/lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb +14 -0
  27. data/lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb +130 -0
  28. data/lib/derivative_rodeo/services/pdf_splitter/png_page.rb +26 -0
  29. data/lib/derivative_rodeo/services/pdf_splitter/tiff_page.rb +52 -0
  30. data/lib/derivative_rodeo/services/pdf_splitter_service.rb +19 -0
  31. data/lib/derivative_rodeo/services/url_service.rb +42 -0
  32. data/lib/derivative_rodeo/storage_locations/base_location.rb +251 -0
  33. data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +67 -0
  34. data/lib/derivative_rodeo/storage_locations/file_location.rb +39 -0
  35. data/lib/derivative_rodeo/storage_locations/http_location.rb +13 -0
  36. data/lib/derivative_rodeo/storage_locations/https_location.rb +13 -0
  37. data/lib/derivative_rodeo/storage_locations/s3_location.rb +103 -0
  38. data/lib/derivative_rodeo/storage_locations/sqs_location.rb +187 -0
  39. data/lib/derivative_rodeo/technical_metadata.rb +23 -0
  40. data/lib/derivative_rodeo/version.rb +5 -0
  41. data/lib/derivative_rodeo.rb +36 -0
  42. metadata +339 -0
@@ -0,0 +1,218 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'forwardable'
4
+ require 'json'
5
+ require 'nokogiri'
6
+
7
+ module DerivativeRodeo
8
+ module Services
9
##
# Responsible for converting an SGML string (hOCR) into a JSON document of word coordinates.
class ExtractWordCoordinatesFromHocrSgmlService
  ##
  # @param sgml [String] The SGML (e.g. XML or HTML) text of a HOCR file.
  # @return [String] A JSON document
  def self.call(sgml)
    new(sgml).to_json
  end

  ##
  # Construct with either path or HTML [String]
  #
  # @param html [String] either an XML string or a path to a file.
  def initialize(html)
    @source = xml?(html) ? html : File.read(html)
    @doc_stream = DocStream.new
    parser = Nokogiri::HTML::SAX::Parser.new(@doc_stream)
    parser.parse(@source)
  end
  attr_reader :doc_stream, :source

  delegate :text, :width, :height, :words, to: :doc_stream

  # Output JSON flattened word coordinates
  #
  # Extra arguments are accepted (and ignored) so that the object can also be serialized by
  # callers that follow the `to_json(state)` convention of the JSON library.
  #
  # @return [String] JSON serialization of flattened word coordinates
  def to_json(*_args)
    @to_json ||= WordCoordinates.to_json(
      words: doc_stream.words,
      width: doc_stream.width,
      height: doc_stream.height
    )
  end
  alias json to_json

  private

  # @param xml [String]
  # @return [Boolean] true when the given string looks like markup rather than a file path.
  def xml?(xml)
    xml.lstrip.start_with?('<')
  end

  # SAX Document Stream class to gather text and word tokens from hOCR
  class DocStream < Nokogiri::XML::SAX::Document
    # The `name.class` selectors of the only elements that are processed.
    CONSIDERED_SELECTORS = ['div.ocr_page', 'span.ocr_line', 'span.ocrx_word'].freeze
    # Matches the four numbers of a `bbox x1 y1 x2 y2` property anywhere in a `title` attribute.
    BBOX_PATTERN = /bbox\s+(-?\d+)(?:\.\d+)?\s+(-?\d+)(?:\.\d+)?\s+(-?\d+)(?:\.\d+)?\s+(-?\d+)(?:\.\d+)?/.freeze

    attr_accessor :text, :words, :width, :height

    def initialize
      super()
      # plain text buffer; unary plus because this file uses frozen string literals and the
      # buffer is mutated in place:
      @text = +''
      # list of word hash, containing word+coord:
      @words = []
      # page width and height to be found in hOCR for `div.ocr_page`
      @width = nil
      @height = nil
      # to hold current word data state across #start_element, #characters,
      # and #end_element methods (to associate word with coordinates).
      @current = nil
      # class name of the element whose start/end callback is being handled
      @element_class_name = nil
      # One entry per open element (nil for elements we do not consider), so that
      # #end_element can tell which element is actually being closed.
      @class_stack = []
    end

    # Return coordinates from `span.ocrx_word` element attribute hash
    #
    # @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
    # @return [Array, nil] Array of position x, y, width, height in px; nil when the
    #   element's title carries no bbox.
    def s_coords(attrs)
      bbox = bbox_from(attrs['title'])
      return nil if bbox.nil?

      x1, y1, x2, y2 = bbox
      [x1, y1, x2 - x1, y2 - y1]
    end

    # Consider element for processing?
    #   - `div.ocr_page` — to get page width/height
    #   - `span.ocr_line` — to help make plain text readable
    #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
    # @param name [String] Element name
    # @param class_name [String] HTML class name
    # @return [Boolean] true if element should be processed; otherwise false
    def consider?(name, class_name)
      CONSIDERED_SELECTORS.include?("#{name}.#{class_name}")
    end

    def start_word(attrs)
      # :word will be filled in during #characters method calls
      @current = { word: nil, coordinates: s_coords(attrs) }
    end

    def start_page(attrs)
      bbox = bbox_from(attrs['title'])
      return if bbox.nil?

      # width and height are the far corner of the page's bounding box:
      @width = bbox[2]
      @height = bbox[3]
    end

    def word_complete?
      return false if @current.nil?

      coords = @current[:coordinates]
      @current[:word].present? && !coords.nil? && coords.size == 4
    end

    def end_word
      # add trailing space to plaintext buffer for between words:
      @text << ' '
      @words.push(@current) if word_complete?
      # The word is finished; without this reset, text that follows the word's closing tag
      # would keep being appended to it and the word could be pushed more than once.
      @current = nil
    end

    def end_line
      # strip trailing whitespace
      @text.strip!
      # then insert a line break
      @text << "\n"
    end

    # Callback for element start, ignores elements except for:
    #   - `div.ocr_page` — to get page width/height
    #   - `span.ocr_line` — to help make plain text readable
    #   - `span.ocrx_word` — for word-coordinate JSON and plain text word
    #
    # @param name [String] element name.
    # @param attrs [Array] Array of key, value pair Arrays.
    def start_element(name, attrs = [])
      attributes = attrs.to_h
      class_name = attributes['class']
      considered = consider?(name, class_name)
      @element_class_name = class_name
      @class_stack.push(considered ? class_name : nil)
      return unless considered

      start_word(attributes) if class_name == 'ocrx_word'
      start_page(attributes) if class_name == 'ocr_page'
    end

    # Callback for text; only text inside a word that has coordinates is collected.
    #
    # @param value [String]
    def characters(value)
      return if @current.nil?
      return if @current[:coordinates].nil?

      @current[:word] = "#{@current[:word]}#{value}"
      @text << value
    end

    # Callback for element end; at this time, flush word coordinate state
    # for current word, and append line endings to plain text:
    #
    # @param _name [String] element name.
    def end_element(_name)
      # Pop the class recorded when this element was opened, rather than relying on whichever
      # element happened to start most recently (which is wrong for every enclosing element).
      @element_class_name = @class_stack.pop
      end_line if @element_class_name == 'ocr_line'
      end_word if @element_class_name == 'ocrx_word'
    end

    # Callback for completion of parsing hOCR, used to normalize generated
    # text content (strip unneeded whitespace incidental to output).
    def end_document
      # drop carriage returns, trim each line, collapse runs of line breaks, and remove
      # leading/trailing whitespace from the buffer as a whole
      lines = @text.delete("\r").split("\n").map(&:strip)
      @text = lines.join("\n").gsub(/\n+/, "\n").strip
    end

    private

    # @param title [String, nil] the value of an hOCR element's `title` attribute
    # @return [Array<Integer>, nil] x1, y1, x2, y2 or nil when there is no bbox property
    def bbox_from(title)
      match = BBOX_PATTERN.match(title.to_s)
      match && match.captures.map(&:to_i)
    end
  end

  # Serializes a list of words (with coordinates) and the canvas size into JSON.
  class WordCoordinates
    ##
    # @api public
    #
    # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
    # @param width [Integer] the width of the "canvas" on which the words appear.
    # @param height [Integer] the height of the "canvas" on which the words appear.
    #
    # @return [String] a JSON encoded string.
    def self.to_json(words:, width: nil, height: nil)
      new(words: words, width: width, height: height).to_json
    end

    def initialize(words:, width:, height:)
      @words = words
      @width = width
      @height = height
    end
    attr_reader :words, :width, :height

    # Output JSON flattened word coordinates; each distinct word maps to the list of
    # coordinates of all of its occurrences.
    #
    # @return [String] JSON serialization of flattened word coordinates
    def to_json(*_args)
      coordinates = words.each_with_object({}) do |word, by_word|
        (by_word[word[:word]] ||= []) << word[:coordinates]
      end
      payload = { width: width, height: height, coords: coordinates }
      JSON.generate(payload)
    end
  end
end
217
+ end
218
+ end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DerivativeRodeo
4
+ module Services
5
##
# This class is responsible for extracting technical_metadata for a given path, by running
# the `identify` command and parsing its output.
#
# @see .technical_metadata_for
class ImageIdentifyService < BaseService
  class_attribute :identify_format_option,
                  default: %(Geometry: %G\nDepth: %[bit-depth]\nColorspace: %[colorspace]\nAlpha: %A\nMIME Step: %m\n) # rubocop:disable Layout/LineLength

  ##
  # @api public
  # @param path [String]
  # @return [Derivative::Rodeo::TechnicalMetadata]
  def self.technical_metadata_for(path:)
    new(path).technical_metadata
  end

  # @param path [String] path to the file to inspect; its first 23 bytes are read immediately.
  def initialize(path)
    super()
    @path = path
    # The first 23 characters of a file contains the magic.
    @initial_file_contents = File.read(@path, 23, 0)
  end
  attr_reader :path

  # Return metadata by means of imagemagick identify
  def technical_metadata
    technical_metadata = TechnicalMetadata.new
    lines = im_identify
    width, height = im_identify_geometry(lines)
    technical_metadata.width = width
    technical_metadata.height = height
    technical_metadata.content_type = im_mime(lines)
    populate_im_color!(lines, technical_metadata)
    technical_metadata
  end

  private

  # @return [Array<String>] lines of output from imagemagick `identify`
  def im_identify
    return @im_identify if defined?(@im_identify)

    # Instead of relying on all of the properties, we're requesting on the specific properties.
    # The command is given as an argument list so that no shell is involved: paths containing
    # spaces, quotes or other shell metacharacters are passed to `identify` verbatim.
    cmd = ['identify', '-format', identify_format_option, path.to_s]
    @im_identify = IO.popen(cmd, &:read).lines
  end

  # @return [Array(Integer, Integer)] width, height in Integer px units
  def im_identify_geometry(lines)
    img_geo = im_line_select(lines, 'geometry').split('+')[0]
    img_geo.split('x').map(&:to_i)
  end

  def im_mime(lines)
    return 'application/pdf' if pdf? # workaround older imagemagick bug

    im_line_select(lines, 'mime step')
  end

  def pdf?
    @initial_file_contents.start_with?('%PDF-')
  end

  def populate_im_color!(lines, technical_metadata)
    bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
    colorspace = im_line_select(lines, 'colorspace')
    color = colorspace == 'Gray' ? 'gray' : 'color'
    has_alpha = alpha_channel?(im_line_select(lines, 'alpha'))
    technical_metadata.num_components = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
    technical_metadata.color = bpc == 1 ? 'monochrome' : color
    technical_metadata.bits_per_component = bpc
  end

  # @param value [String, nil] the value reported for the `Alpha:` line
  # @return [Boolean] true when the reported value indicates an alpha channel.
  def alpha_channel?(value)
    return false if value.nil? || value.empty?

    # 'Undefined' is the no-alpha value the original comparison targeted.
    # NOTE(review): some ImageMagick versions appear to print True/False for %A instead;
    # 'false' is treated as no-alpha on that assumption - confirm against the deployed version.
    !%w[undefined false].include?(value.downcase)
  end

  # Given the "key: value" line whose key matches, return the value as String stripped of
  # leading and trailing whitespace.
  #
  # @param lines [Array<String>]
  # @param key [String] matched case-insensitively against the start of each line
  # @return [String, nil] nil when no line starts with the key
  def im_line_select(lines, key)
    prefix = key.downcase
    line = lines.find { |l| l.scrub.downcase.strip.start_with?(prefix) }
    return line if line.nil?

    # Everything after the first colon; this yields '' (not the key) when the value is empty.
    line.strip.partition(':').last.strip
  end
end
88
+ end
89
+ end
@@ -0,0 +1,112 @@
1
+ # rubocop:disable Style/FrozenStringLiteralComment
2
+ # TODO freeze them literals
3
+
4
+ module DerivativeRodeo
5
+ module Services
6
##
# A utility class for extracting technical metadata from a JP2.
#
# @see .technical_metadata_for
class ImageJp2Service < BaseService
  # `String#b` returns a binary-encoded copy, which works whether or not string literals
  # are frozen in this file.
  TOKEN_MARKER_START = "\xFF".b.freeze
  TOKEN_MARKER_SIZ = "\x51".b.freeze
  TOKEN_IHDR = 'ihdr'.freeze

  ##
  # @api public
  #
  # @param path [String] path to jp2, for reading
  #
  # @return [Derivative::Rodeo::TechnicalMetadata]
  def self.technical_metadata_for(path:)
    new(path).technical_metadata
  end

  attr_reader :path

  def initialize(path)
    super()
    @path = path
  end

  # Read the file at {#path} and build its technical metadata.
  #
  # @return [Derivative::Rodeo::TechnicalMetadata]
  # @raise [IOError] when the file is not a JP2 or its headers cannot be located
  def technical_metadata
    # Block form closes the file for us, and does not try to close a handle that was never
    # opened when File.open itself raises.
    File.open(path, 'rb') do |io|
      validate_jp2(io)
      x_siz, y_siz = extract_jp2_dim(io)
      nc, bpc = extract_jp2_components(io)
      color = nc >= 3 ? 'color' : 'gray'
      TechnicalMetadata.new(
        color: bpc == 1 ? 'monochrome' : color,
        num_components: nc,
        bits_per_component: bpc,
        width: x_siz,
        height: y_siz,
        content_type: 'image/jp2'
      )
    end
  end

  private

  # @param io [IO] IO stream opened in binary mode, for reading
  # @return [Array(Integer, Integer)] X size, Y size, in Integer-typed px
  # @raise [IOError] when the stream is not binary or no complete SIZ segment is found
  def extract_jp2_dim(io)
    raise IOError, 'file not open in binary mode' unless io.binmode?

    # Informed by ISO/IEC 15444-1:2000, pp. 26-27
    # via:
    # http://hosting.astro.cornell.edu/~carcich/LRO/jp2/ISO_JPEG200_Standard/INCITS+ISO+IEC+15444-1-2000.pdf
    #
    # first 23 bytes are file-magic, we can skip
    io.seek(23, IO::SEEK_SET)
    siz = nil
    previous = nil
    # Scan byte by byte for the first 0xFF 0x51 pair. `io.read(1)` returns nil at end of
    # file, which ends the loop instead of spinning forever on a file without a SIZ marker.
    while (byte = io.read(1))
      if previous == TOKEN_MARKER_START && byte == TOKEN_MARKER_SIZ
        # - on 0x51, read next 12 bytes
        siz = io.read(12)
        break
      end
      previous = byte
    end
    raise IOError, 'JP2 SIZ marker not found' if siz.nil? || siz.bytesize < 12

    # discard first 4 bytes; next 4 bytes are XSiz; last 4 bytes are YSiz
    siz.unpack('x4NN')
  end

  # @param io [IO] IO stream opened in binary mode, for reading
  # @return [Array(Integer, Integer)] number components, bits-per-component
  # @raise [IOError] when the stream is not binary or no complete ihdr box is found
  def extract_jp2_components(io)
    raise IOError, 'file not open in binary mode' unless io.binmode?

    io.seek(0, IO::SEEK_SET)
    # IHDR should be in first 64 bytes
    header = io.read(64).to_s
    offset = header.index(TOKEN_IHDR)
    raise IOError, 'JP2 ihdr box not found' if offset.nil?

    # The 11 bytes after the token are unpacked as 4 + 4 + 2 (component count) + 1 (bit depth).
    ihdr_data = header.byteslice(offset + TOKEN_IHDR.bytesize, 11)
    raise IOError, 'JP2 ihdr box is truncated' if ihdr_data.nil? || ihdr_data.bytesize < 11

    _first, _second, num_components, depth_byte = ihdr_data.unpack('NNnC')
    # stored as "bit depth of the components in the codestream, minus 1", so add 1.
    # The byte is read unsigned and its high bit masked off; reading it as a signed char
    # yields a negative depth whenever that bit is set.
    # NOTE(review): the high bit is understood to flag signed components - confirm in ISO/IEC 15444-1.
    bits_per_component = (depth_byte & 0x7F) + 1
    [num_components, bits_per_component]
  end

  # Verify that the stream starts with the JP2 file magic.
  #
  # @param io [IO] IO stream positioned at the start of the file
  # @raise [IOError] when the first 23 bytes do not end with `ftypjp2`
  def validate_jp2(io)
    # io.read returns nil for an empty file
    magic = io.read(23).to_s
    raise IOError, 'Not JP2 file' unless magic.end_with?('ftypjp2')
  end
end
110
+ end
111
+ end
112
+ # rubocop:enable Style/FrozenStringLiteralComment
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'tmpdir'
4
+
5
+ module DerivativeRodeo
6
+ module Services
7
##
# @api private
#
# Wraps an image file on disk to report its technical metadata and to convert it to
# another format by running external commands.
#
# @see .technical_metadata
# @see .convert
class ImageService < BaseService
  # Arguments added to the `convert` call for monochrome TIFF output.
  MONOCHROME_OPTIONS = %w[-depth 1 -monochrome -compress Group4 -type bilevel].freeze

  attr_accessor :path

  # @param path [String] path to the source image; its first 23 bytes are read immediately.
  def initialize(path)
    super()
    @path = path
    # The first 23 characters of a file contains the magic.
    @initial_file_contents = File.read(@path, 23, 0)
  end

  # @return [Boolean] true when the file magic identifies a JP2.
  def jp2?
    @initial_file_contents.end_with?('ftypjp2')
  end

  # @return [Derivative::Rodeo::TechnicalMetadata]
  def technical_metadata
    return @technical_metadata if defined?(@technical_metadata)

    @technical_metadata = if jp2?
                            ImageJp2Service.technical_metadata_for(path: path)
                          else
                            ImageIdentifyService.technical_metadata_for(path: path)
                          end
  end
  alias metadata technical_metadata

  extend Forwardable
  def_delegator :technical_metadata, :monochrome?

  # Convert source image to image at destination path, inferring file type from destination
  # file extension. In case of JP2 files, create intermediate file using OpenJPEG 2000 that
  # ImageMagick can use. Only outputs monochrome output if monochrome is true, destination
  # format is TIFF.
  #
  # @param destination [String] Path to output / destination file
  # @param monochrome [Boolean] true if monochrome output, otherwise false
  def convert(destination:, monochrome: false)
    raise 'JP2 output not yet supported' if destination.end_with?('jp2')

    return convert_image(source: path, destination: destination, monochrome: monochrome) unless jp2?

    # The block form removes the working directory (and the intermediate TIFF inside it)
    # once the conversion has finished, instead of leaving one behind per call.
    Dir.mktmpdir do |workdir|
      source = jp2_to_tiff(path, workdir)
      convert_image(source: source, destination: destination, monochrome: monochrome)
    end
  end

  private

  # Run `convert`. The command is given as an argument list, so no shell is involved and
  # paths containing spaces or shell metacharacters are passed through verbatim.
  #
  # @return [String] whatever the command wrote to standard output
  def convert_image(source:, destination:, monochrome:)
    args = ['convert', source.to_s]
    args.concat(MONOCHROME_OPTIONS) if monochrome && tiff?(destination)
    args << destination.to_s
    IO.popen(args, &:read)
  end

  # @param destination [String]
  # @return [Boolean] true when the destination's extension is .tif or .tiff (any case)
  def tiff?(destination)
    File.extname(destination.to_s).downcase.start_with?('.tif')
  end

  # Decompress a JP2 into an intermediate TIFF by running `opj_decompress`.
  #
  # @param source [String] path to the JP2
  # @param workdir [String] directory in which to write the intermediate file
  # @return [String] path to the intermediate TIFF
  def jp2_to_tiff(source, workdir = Dir.mktmpdir)
    intermediate_path = File.join(workdir, 'intermediate.tif')
    IO.popen(['opj_decompress', '-i', source.to_s, '-o', intermediate_path], &:read)
    intermediate_path
  end
end
72
+ end
73
+ end
@@ -0,0 +1,177 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+ require 'securerandom'
5
+ require 'tmpdir'
6
+
7
+ module DerivativeRodeo
8
+ module Services
9
+ module PdfSplitter
10
##
# Resolve the page-splitter constant for the given name: the name is turned into a
# "<Name>Page" constant name, which is looked up inside DerivativeRodeo::Services::PdfSplitter
# (for example, a JpgPage constant is what the name "jpg" is expected to resolve to).
#
# NOTE(review): relies on String#classify and String#constantize, which are not defined in
# this file (presumably ActiveSupport) - confirm.
#
# @param name [String]
# @return [PdfSplitter::Base]
def self.for(name)
  page_class_name = "#{name.to_s.classify}_page".classify
  qualified_name = "DerivativeRodeo::Services::PdfSplitter::#{page_class_name}"
  qualified_name.constantize
end
17
+
18
##
# @abstract
#
# The purpose of this class is to split the PDF into constituent image files.
#
# @see #each
class Base
  class_attribute :image_extension
  class_attribute :default_dpi, default: 400
  # Should we perform compression logic on the images?
  class_attribute :compression, default: nil
  # What is the image quality we're using?
  class_attribute :quality, default: nil

  class_attribute :gsdevice, instance_accessor: false
  class_attribute :page_count_regexp, instance_accessor: true, default: /^Pages: +(\d+)$/

  ##
  # @api public
  #
  # @param path [String] The path to the PDF
  # @param baseid [String] used for creating a unique identifier
  # @param tmpdir [String] place to perform the "work" of splitting the PDF.
  #
  # @return [Enumerable, Utilities::PdfSplitter::Base]
  def self.call(path, baseid: SecureRandom.uuid, tmpdir: Dir.mktmpdir)
    new(path, baseid: baseid, tmpdir: tmpdir)
  end

  ##
  # @param path [String] the path to the source PDF that we're processing.
  # @param baseid [String] used for creating a unique identifier
  # @param tmpdir [String] place to perform the "work" of splitting the PDF.
  # @param pdf_pages_summary [Derivative::Rodeo::PdfPagesSummary] by default we'll
  #        extract this from the given path, but for testing purposes, you might want to
  #        provide a specific summary.
  # @param logger [Logger, #error]
  def initialize(path,
                 baseid: SecureRandom.uuid,
                 # TODO: Do we need to provide the :tmpdir for the application?
                 tmpdir: Dir.mktmpdir,
                 pdf_pages_summary: PagesSummary.extract_from(path: path),
                 logger: DerivativeRodeo.config.logger)
    @baseid = baseid
    @pdfpath = path
    @pdf_pages_summary = pdf_pages_summary
    @tmpdir = tmpdir
    @logger = logger
  end

  attr_reader :logger

  # In creating {#each} we get many of the methods of array operation (e.g. #to_a).
  include Enumerable

  ##
  # @api public
  #
  # @yieldparam [String] the path to the page's image file.
  def each(&block)
    entries.each(&block)
  end

  # @api private
  def invalid_pdf?
    !pdf_pages_summary.valid?
  end

  attr_reader :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
  private :pdf_pages_summary, :tmpdir, :baseid, :pdfpath

  # @api private
  #
  # @return [String] the ghostscript device configured on the class
  # @raise [NotImplementedError] when the class has not configured one
  def gsdevice
    return self.class.gsdevice if self.class.gsdevice

    raise NotImplementedError, "#{self.class}#gsdevice"
  end

  private

  # entries for each page
  def entries
    return @entries if defined? @entries

    @entries = Array.wrap(gsconvert)
  end

  # @return [String] format string (with a %d page-number placeholder) for the page files
  def output_base
    @output_base ||= File.join(tmpdir, "#{baseid}-page%d.#{image_extension}")
  end

  # Run ghostscript and work out the file names it wrote, one per "Page " line of output.
  #
  # @return [Array<String>] paths of the generated page images
  def gsconvert
    # NOTE: you must call gsdevice before compression, as compression is
    # updated during the gsdevice call.
    #
    # capture3 drains stdout and stderr concurrently; reading one stream to the end before
    # touching the other can block forever once the unread pipe fills up.
    stdout, err, _status = Open3.capture3(gsconvert_cmd(output_base))
    logger.error "#{self.class}#gsconvert encountered the following error with `gs': #{err}" if err.present?

    page_total = stdout.split("\n").count { |line| line.start_with?('Page ') }
    (1..page_total).map { |page_number| format(output_base, page_number) }
  end

  def create_file_name(line:, page_number:); end

  # NOTE(review): output_base and pdfpath are interpolated into a shell command without
  # escaping, so paths containing spaces or shell metacharacters will not work as intended.
  #
  # @return [String] the ghostscript command line
  def gsconvert_cmd(output_base)
    @gsconvert_cmd ||= begin
      cmd = "gs -dNOPAUSE -dBATCH -sDEVICE=#{gsdevice} -dTextAlphaBits=4"
      cmd += " -sCompression=#{compression}" if compression?
      cmd += " -dJPEGQ=#{quality}" if quality?
      cmd += " -sOutputFile=#{output_base} -r#{ppi} -f #{pdfpath}"
      cmd
    end
  end

  # @return [Integer] the page count reported by `pdfinfo`
  # @raise [RuntimeError] when `pdfinfo` prints nothing or no page count can be found
  def pagecount
    return @pagecount if defined? @pagecount

    output, err, _status = Open3.capture3("pdfinfo #{pdfpath}")
    logger.error "#{self.class}#pagecount encountered the following error with `pdfinfo': #{err}" if err.present?
    raise "pdfinfo failed to return output for #{pdfpath} - #{err}" if output.blank?

    match = page_count_regexp.match(output)
    # Fail with a descriptive message rather than a NoMethodError on nil.
    raise "pdfinfo output for #{pdfpath} did not include a page count" if match.nil?

    @pagecount = match[1].to_i
  end

  def ppi
    if looks_scanned?
      # For scanned media, defer to detected image PPI:
      pdf_pages_summary.ppi
    else
      # 400 dpi for something that does not look like scanned media:
      default_dpi
    end
  end

  def looks_scanned?
    max_image_px = pdf_pages_summary.width * pdf_pages_summary.height
    # single 10mp+ image per page?
    single_image_per_page? && max_image_px > 1024 * 1024 * 10
  end

  def single_image_per_page?
    pdf_pages_summary.page_count == pagecount
  end
end
175
+ end
176
+ end
177
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DerivativeRodeo
4
+ module Services
5
+ module PdfSplitter
6
# Splits a PDF into one jpg file per page, by configuring the settings that
# PdfSplitter::Base reads: the ghostscript device, the output file extension
# and the image quality.
class JpgPage < PdfSplitter::Base
  self.gsdevice = 'jpeg'
  self.image_extension = 'jpg'
  self.quality = '50'
end
12
+ end
13
+ end
14
+ end