derivative-rodeo 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +6 -0
  3. data/LICENSE +15 -0
  4. data/README.md +251 -0
  5. data/Rakefile +42 -0
  6. data/derivative_rodeo.gemspec +54 -0
  7. data/lib/derivative/rodeo.rb +3 -0
  8. data/lib/derivative-rodeo.rb +3 -0
  9. data/lib/derivative_rodeo/configuration.rb +95 -0
  10. data/lib/derivative_rodeo/errors.rb +56 -0
  11. data/lib/derivative_rodeo/generators/base_generator.rb +200 -0
  12. data/lib/derivative_rodeo/generators/concerns/copy_file_concern.rb +28 -0
  13. data/lib/derivative_rodeo/generators/copy_generator.rb +14 -0
  14. data/lib/derivative_rodeo/generators/hocr_generator.rb +112 -0
  15. data/lib/derivative_rodeo/generators/monochrome_generator.rb +39 -0
  16. data/lib/derivative_rodeo/generators/pdf_split_generator.rb +61 -0
  17. data/lib/derivative_rodeo/generators/thumbnail_generator.rb +38 -0
  18. data/lib/derivative_rodeo/generators/word_coordinates_generator.rb +39 -0
  19. data/lib/derivative_rodeo/services/base_service.rb +15 -0
  20. data/lib/derivative_rodeo/services/convert_uri_via_template_service.rb +87 -0
  21. data/lib/derivative_rodeo/services/extract_word_coordinates_from_hocr_sgml_service.rb +218 -0
  22. data/lib/derivative_rodeo/services/image_identify_service.rb +89 -0
  23. data/lib/derivative_rodeo/services/image_jp2_service.rb +112 -0
  24. data/lib/derivative_rodeo/services/image_service.rb +73 -0
  25. data/lib/derivative_rodeo/services/pdf_splitter/base.rb +177 -0
  26. data/lib/derivative_rodeo/services/pdf_splitter/jpg_page.rb +14 -0
  27. data/lib/derivative_rodeo/services/pdf_splitter/pages_summary.rb +130 -0
  28. data/lib/derivative_rodeo/services/pdf_splitter/png_page.rb +26 -0
  29. data/lib/derivative_rodeo/services/pdf_splitter/tiff_page.rb +52 -0
  30. data/lib/derivative_rodeo/services/pdf_splitter_service.rb +19 -0
  31. data/lib/derivative_rodeo/services/url_service.rb +42 -0
  32. data/lib/derivative_rodeo/storage_locations/base_location.rb +251 -0
  33. data/lib/derivative_rodeo/storage_locations/concerns/download_concern.rb +67 -0
  34. data/lib/derivative_rodeo/storage_locations/file_location.rb +39 -0
  35. data/lib/derivative_rodeo/storage_locations/http_location.rb +13 -0
  36. data/lib/derivative_rodeo/storage_locations/https_location.rb +13 -0
  37. data/lib/derivative_rodeo/storage_locations/s3_location.rb +103 -0
  38. data/lib/derivative_rodeo/storage_locations/sqs_location.rb +187 -0
  39. data/lib/derivative_rodeo/technical_metadata.rb +23 -0
  40. data/lib/derivative_rodeo/version.rb +5 -0
  41. data/lib/derivative_rodeo.rb +36 -0
  42. metadata +339 -0
@@ -0,0 +1,218 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'forwardable'
4
+ require 'json'
5
+ require 'nokogiri'
6
+
7
+ module DerivativeRodeo
8
+ module Services
9
+ ##
10
+ # Responsible for converting an SGML string into JSON coordinates
11
+ class ExtractWordCoordinatesFromHocrSgmlService
12
##
# Convenience entry point: build a service for the given markup and serialize it.
#
# @param sgml [String] The SGML (e.g. XML or HTML) text of a HOCR file.
# @return [String] A JSON document
def self.call(sgml)
  service = new(sgml)
  service.to_json
end
18
+
19
##
# Parse hOCR markup given either inline or as a path to a file on disk.
#
# The argument is treated as markup when its first non-blank character is `<`
# (see #xml?); anything else is read from disk with File.read.
#
# @param html [String] either an XML string or a path to a file.
def initialize(html)
  @source = if xml?(html)
              html
            else
              File.read(html)
            end
  @doc_stream = DocStream.new
  # The SAX parser pushes events into @doc_stream while it walks @source.
  Nokogiri::HTML::SAX::Parser.new(@doc_stream).parse(@source)
end
29
+ attr_reader :doc_stream, :source
30
+
31
+ delegate :text, :width, :height, :words, to: :doc_stream
32
+
33
# Serialize the parsed words and page dimensions; the result is memoized.
#
# @return [String] JSON serialization of flattened word coordinates
def to_json
  return @to_json if @to_json

  stream = doc_stream
  @to_json = WordCoordinates.to_json(words: stream.words, width: stream.width, height: stream.height)
end
43
+ alias json to_json
44
+
45
+ private
46
+
47
# @param xml [String] candidate markup or file path
# @return [Boolean] true when the first non-whitespace character is `<`
def xml?(xml)
  first_visible = xml.lstrip[0]
  first_visible == '<'
end
50
+
51
+ # SAX Document Stream class to gather text and word tokens from hOCR
52
+ class DocStream < Nokogiri::XML::SAX::Document
53
+ attr_accessor :text, :words, :width, :height
54
+
55
# Set up the empty accumulators that the SAX callbacks fill in.
def initialize
  super()
  # Plain text buffer.
  @text = ''
  # Word hashes, each holding :word and :coordinates.
  @words = []
  # Page width and height, to be found in hOCR for `div.ocr_page`.
  @width, @height = nil
  # Word currently being assembled; shared by #start_element, #characters
  # and #end_element so text can be associated with coordinates.
  @current = nil
  # Class attribute of the most recently started element, read by #end_element.
  @element_class_name = nil
end
70
+
71
# Return coordinates from `span.ocrx_word` element attribute hash.
#
# The `bbox` property is located wherever it appears among the
# `;`-separated properties of the `title` attribute, rather than assuming
# it is the first one.
#
# @param attrs [Hash] hash with hOCR `span.ocrx_word` element attributes
# @return [Array] Array of position x, y, width, height in px; an empty
#   Array when the title is missing or carries no complete `bbox`, which
#   makes #word_complete? reject the word instead of raising.
def s_coords(attrs)
  properties = attrs['title'].to_s.split(';').map(&:strip)
  bbox = properties.find { |property| property.start_with?('bbox ') }
  return [] if bbox.nil?

  x1, y1, x2, y2 = bbox.split.drop(1).map(&:to_i)
  # y2 is nil when fewer than four numbers were given.
  return [] if y2.nil?

  [x1, y1, x2 - x1, y2 - y1]
end
85
+
86
# Decide whether an element is one the stream cares about:
# - `div.ocr_page` — to get page width/height
# - `span.ocr_line` — to help make plain text readable
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
#
# @param name [String] Element name
# @param class_name [String] HTML class name
# @return [Boolean] true if element should be processed; otherwise false
def consider?(name, class_name)
  case "#{name}.#{class_name}"
  when 'div.ocr_page', 'span.ocr_line', 'span.ocrx_word' then true
  else false
  end
end
97
+
98
# Begin a new word record for a `span.ocrx_word` element.
#
# @param attrs [Hash] the element's attributes
def start_word(attrs)
  # :word is a placeholder until #characters supplies the text.
  @current = { word: nil }
  @current[:coordinates] = s_coords(attrs)
end
104
+
105
# Record the page dimensions from a `div.ocr_page` element.
#
# The `bbox` property is looked up by name among the `;`-separated
# properties of the `title` attribute, so it no longer has to be the second
# property (e.g. preceded by `image "..."`). When no usable bbox is present
# the dimensions are left untouched.
#
# @param attrs [Hash] the element's attributes
def start_page(attrs)
  properties = attrs['title'].to_s.split(';').map(&:strip)
  bbox = properties.find { |property| property.start_with?('bbox ') }
  return if bbox.nil?

  _left, _top, right, bottom = bbox.split.drop(1).map(&:to_i)
  # width and height:
  @width = right
  @height = bottom
end
113
+
114
# @return [Boolean] true when the in-progress word has text and a
#   four-element coordinate array
def word_complete?
  if @current.nil?
    false
  else
    has_text = @current[:word].present?
    has_text && @current[:coordinates].size == 4
  end
end
119
+
120
# Finish the current word: pad the plain text and keep the word if usable.
def end_word
  # A trailing space separates this word from the next in the text buffer.
  @text = @text + ' '
  return unless word_complete?

  @words.push(@current)
end
125
+
126
# Finish the current line of plain text: drop surrounding whitespace and
# append a line break.
#
# Builds a new String instead of calling `strip!`, because @text starts out
# as a frozen literal (the file uses `frozen_string_literal: true`) and
# mutating it in place raises FrozenError when a line closes before any
# text has been buffered.
def end_line
  @text = @text.strip + "\n"
end
132
+
133
# SAX callback fired when an element opens. Remembers the element's class,
# then dispatches only for:
# - `div.ocr_page` — to get page width/height
# - `span.ocr_line` — to help make plain text readable
# - `span.ocrx_word` — for word-coordinate JSON and plain text word
#
# @param name [String] element name.
# @param attrs [Array] Array of key, value pair Arrays.
def start_element(name, attrs = [])
  attributes = attrs.to_h
  css_class = attributes['class']
  # Stored for every element, considered or not; #end_element reads it.
  @element_class_name = css_class
  return unless consider?(name, css_class)

  start_word(attributes) if css_class == 'ocrx_word'
  css_class == 'ocr_page' ? start_page(attributes) : nil
end
147
+
148
# SAX callback for text content; appends it to the in-progress word and to
# the plain text buffer. Ignored until a word with coordinates has started.
#
# @param value [String] the text reported by the parser
def characters(value)
  return if @current.nil? || @current[:coordinates].nil?

  @current[:word] = (@current[:word] || '') + value
  @text += value
end
155
+
156
# SAX callback fired when an element closes; flushes the current word or
# terminates the current plain-text line, based on the remembered class.
#
# NOTE(review): @element_class_name is overwritten by every #start_element,
# so when an enclosing element closes it still holds the class of the most
# recently opened element — confirm nested `ocr_line`/`ocrx_word` markup is
# handled as intended.
#
# @param _name [String] element name.
def end_element(_name)
  case @element_class_name
  when 'ocr_line'
    end_line
    nil
  when 'ocrx_word'
    end_word
  end
end
164
+
165
# Callback for completion of parsing hOCR, used to normalize generated
# text content (strip unneeded whitespace incidental to output).
#
# Carriage returns are now actually removed: the previous
# `@text.delete("\r")` discarded its result, so the call had no effect.
def end_document
  # Drop carriage returns, then trim each line.
  lines = @text.delete("\r").split("\n").map(&:strip)
  # Skipping empty lines collapses runs of line breaks and removes leading
  # and trailing blank lines in one step.
  @text = lines.reject(&:empty?).join("\n")
end
176
+ end
177
+
178
# Groups word coordinates by word text and renders them as JSON.
class WordCoordinates
  ##
  # @api public
  #
  # @param words [Array<Hash>] an array of hash objects that have the keys `:word` and `:coordinates`.
  # @param width [Integer] the width of the "canvas" on which the words appear.
  # @param height [Integer] the height of the "canvas" on which the words appear.
  #
  # @return [String] a JSON encoded string.
  def self.to_json(words:, width: nil, height: nil)
    builder = new(words: words, width: width, height: height)
    builder.to_json
  end

  attr_reader :words, :width, :height

  def initialize(words:, width:, height:)
    @height = height
    @width = width
    @words = words
  end

  # Output JSON flattened word coordinates: every distinct word maps to the
  # list of coordinate arrays at which it occurs.
  #
  # @return [String] JSON serialization of flattened word coordinates
  def to_json
    grouped = words.each_with_object({}) do |entry, by_word|
      (by_word[entry[:word]] ||= []) << entry[:coordinates]
    end
    JSON.generate({ width: width, height: height, coords: grouped })
  end
end
216
+ end
217
+ end
218
+ end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DerivativeRodeo
4
+ module Services
5
+ ##
6
+ # This class is responsible for extracting technical metadata for a given path.
7
+ #
8
+ # @see .technical_metadata_for
9
+ class ImageIdentifyService < BaseService
10
+ class_attribute :identify_format_option,
11
+ default: %(Geometry: %G\nDepth: %[bit-depth]\nColorspace: %[colorspace]\nAlpha: %A\nMIME Step: %m\n) # rubocop:disable Layout/LineLength
12
+
13
##
# @api public
# @param path [String]
# @return [DerivativeRodeo::TechnicalMetadata]
def self.technical_metadata_for(path:)
  service = new(path)
  service.technical_metadata
end
20
+
21
# @param path [String] path to the image to inspect
def initialize(path)
  super()
  @path = path
  # Keep the first 23 bytes of the file; they hold the file magic.
  @initial_file_contents = File.open(@path) { |file| file.read(23) }
end
27
+ attr_reader :path
28
+
29
# Build technical metadata from the output of imagemagick `identify`.
#
# @return [TechnicalMetadata] populated with width, height, content type and
#   the colour attributes set by #populate_im_color!
def technical_metadata
  TechnicalMetadata.new.tap do |metadata|
    identify_lines = im_identify
    metadata.width, metadata.height = im_identify_geometry(identify_lines)
    metadata.content_type = im_mime(identify_lines)
    populate_im_color!(identify_lines, metadata)
  end
end
40
+
41
+ private
42
+
43
# Run imagemagick `identify` once and memoize its output.
#
# The command is passed to IO.popen as an argument array, so no shell is
# involved and a path containing spaces or shell metacharacters reaches
# `identify` as a single argument.
#
# @return [Array<String>] lines of output from imagemagick `identify`
def im_identify
  return @im_identify if defined?(@im_identify)

  # Instead of relying on all of the properties, we request only the specific ones we read.
  command = ['identify', '-format', identify_format_option, path.to_s]
  @im_identify = IO.popen(command, &:read).lines
end
52
+
53
# @param lines [Array<String>] output lines from `identify`
# @return [Array(Integer, Integer)] width, height in Integer px units
def im_identify_geometry(lines)
  geometry = im_line_select(lines, 'geometry')
  # Keep only the part before the first '+'.
  dimensions, = geometry.split('+')
  dimensions.split('x').map { |px| px.to_i }
end
58
+
59
# @param lines [Array<String>] output lines from `identify`
# @return [String, nil] 'application/pdf' for PDFs, otherwise the value of
#   the "MIME Step" line
def im_mime(lines)
  if pdf? # workaround older imagemagick bug
    'application/pdf'
  else
    im_line_select(lines, 'mime step')
  end
end
64
+
65
# @return [Boolean] true when the file starts with the PDF magic `%PDF-`.
#   An empty file (whose magic was read as nil) is reported as not a PDF
#   instead of raising NoMethodError.
def pdf?
  @initial_file_contents.to_s.start_with?('%PDF-')
end
68
+
69
# Fill in the colour-related fields of the metadata from `identify` output.
#
# Fixes the alpha detection: the previous `!value == 'Undefined'` negated
# the value before comparing, so it was always false and an alpha channel
# was never counted. A missing line, "Undefined" and "False" (any case) all
# mean "no alpha channel"; any other value counts as one.
#
# @param lines [Array<String>] output lines from `identify`
# @param technical_metadata [#num_components=, #color=, #bits_per_component=]
def populate_im_color!(lines, technical_metadata)
  bpc = im_line_select(lines, 'depth').split('-')[0].to_i # '1-bit' -> 1
  color = im_line_select(lines, 'colorspace') == 'Gray' ? 'gray' : 'color'
  alpha = im_line_select(lines, 'alpha').to_s.downcase
  has_alpha = !['', 'undefined', 'false'].include?(alpha)
  technical_metadata.num_components = (color == 'gray' ? 1 : 3) + (has_alpha ? 1 : 0)
  technical_metadata.color = bpc == 1 ? 'monochrome' : color
  technical_metadata.bits_per_component = bpc
end
78
+
79
# Find the first "key: value" line whose key matches and return its value.
#
# The line is split on the first colon only, so an empty value yields ''
# (previously the key itself was returned) and a value containing ':' is
# returned whole. The selected line is scrubbed before it is split, matching
# the scrubbing already applied while searching.
#
# @param lines [Array<String>] output lines from `identify`
# @param key [String] case-insensitive key prefix to look for
# @return [String, nil] the stripped value, or nil when no line matches
def im_line_select(lines, key)
  wanted = key.downcase
  line = lines.find { |l| l.scrub.downcase.strip.start_with?(wanted) }
  return if line.nil?

  line.scrub.split(':', 2).last.strip
end
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,112 @@
1
+ # rubocop:disable Style/FrozenStringLiteralComment
2
+ # TODO freeze them literals
3
+
4
+ module DerivativeRodeo
5
+ module Services
6
+ ##
7
+ # A utility class for extracting technical metadata from a JP2.
8
+ #
9
+ # @see .technical_metadata_for
10
+ class ImageJp2Service < BaseService
11
+ TOKEN_MARKER_START = "\xFF".force_encoding('BINARY')
12
+ TOKEN_MARKER_SIZ = "\x51".force_encoding('BINARY')
13
+ TOKEN_IHDR = 'ihdr'.freeze
14
+
15
##
# @api public
#
# @param path [String] path to jp2, for reading
#
# @return [DerivativeRodeo::TechnicalMetadata]
def self.technical_metadata_for(path:)
  service = new(path)
  service.technical_metadata
end
24
+
25
+ attr_reader :path
26
+
27
# @param path [String] path to the JP2 file to read
def initialize(path)
  super()

  @path = path
end
31
+
32
+ # rubocop:disable Metrics/MethodLength
33
# Read the JP2 header and codestream markers and report what they describe.
#
# The file is opened with the block form of File.open so it is always
# closed, and a failure to open it (e.g. Errno::ENOENT) surfaces as that
# error. Previously `ensure io.close` ran with `io` still nil and replaced
# the real error with a NoMethodError.
#
# @return [TechnicalMetadata]
# @raise [IOError] when the file is not a JP2 or its header cannot be read
def technical_metadata
  File.open(path, 'rb') do |io|
    validate_jp2(io)
    x_siz, y_siz = extract_jp2_dim(io)
    nc, bpc = extract_jp2_components(io)
    color = nc >= 3 ? 'color' : 'gray'
    TechnicalMetadata.new(
      color: bpc == 1 ? 'monochrome' : color,
      num_components: nc,
      bits_per_component: bpc,
      width: x_siz,
      height: y_siz,
      content_type: 'image/jp2'
    )
  end
end
51
+ # rubocop:enable Metrics/MethodLength
52
+
53
+ private
54
+
55
# Scan for the SIZ marker (0xFF 0x51) and read the image size that follows.
#
# Reaching end of file without finding the marker now raises IOError; the
# previous loop kept re-reading nil forever. A SIZ segment shorter than the
# 12 bytes needed is reported the same way instead of failing on nil.
#
# @param io [IO] IO stream opened in binary mode, for reading
# @return [Array(Integer, Integer)] X size, Y size, in Integer px
# @raise [IOError] when io is not in binary mode, or the SIZ marker is
#   missing or truncated
# rubocop:disable Metrics/MethodLength
def extract_jp2_dim(io)
  raise IOError, 'file not open in binary mode' unless io.binmode?

  # Informed by ISO/IEC 15444-1:2000, pp. 26-27
  # via:
  # http://hosting.astro.cornell.edu/~carcich/LRO/jp2/ISO_JPEG200_Standard/INCITS+ISO+IEC+15444-1-2000.pdf
  #
  # first 23 bytes are file-magic, we can skip
  io.seek(23, IO::SEEK_SET)
  previous = nil
  loop do
    # read one byte at a time until 0xFF is immediately followed by 0x51
    current = io.read(1)
    raise IOError, 'SIZ marker not found' if current.nil?
    break if previous == TOKEN_MARKER_START && current == TOKEN_MARKER_SIZ

    previous = current
  end
  siz = io.read(12)
  raise IOError, 'SIZ marker segment is truncated' if siz.nil? || siz.bytesize < 12

  # discard first 4 bytes; next 4 bytes are XSiz; last 4 bytes are YSiz
  [siz.byteslice(4, 4).unpack1('N'), siz.byteslice(8, 4).unpack1('N')]
end
85
+ # rubocop:enable Metrics/MethodLength
86
+
87
# Read the number of components and bit depth that follow the `ihdr` token.
#
# A header without `ihdr` now raises IOError. Previously the whole buffer
# was used as if it followed the token, because String#split returns the
# original string when the separator is absent. A truncated box or an empty
# file also raises IOError instead of failing on nil.
#
# NOTE(review): the bit-depth byte is unpacked as a signed char ('c'), so a
# value with the high bit set comes out negative — confirm whether such
# files need handling.
#
# @param io [IO] IO stream opened in binary mode, for reading
# @return [Array(Integer, Integer)] number components, bits-per-component
# @raise [IOError] when io is not in binary mode, or `ihdr` is missing or truncated
def extract_jp2_components(io)
  raise IOError, 'file not open in binary mode' unless io.binmode?

  io.seek(0, IO::SEEK_SET)
  # IHDR should be in first 64 bytes
  header = io.read(64).to_s
  _before, token, ihdr_data = header.rpartition(TOKEN_IHDR)
  raise IOError, 'ihdr not found in first 64 bytes' if token.empty?
  raise IOError, 'ihdr is truncated' if ihdr_data.bytesize < 11

  num_components = ihdr_data.byteslice(8, 2).unpack1('n')
  # stored as "bit depth of the components in the codestream, minus 1", so add 1
  bits_per_component = ihdr_data.byteslice(10, 1).unpack1('c') + 1
  [num_components, bits_per_component]
end
103
+
104
# Verify that the stream holds a JP2 file by checking that its first 23
# bytes end with `ftypjp2`.
#
# An empty stream (where the read returns nil) is rejected with the same
# IOError as any other non-JP2 input instead of raising NoMethodError.
#
# @param io [IO] stream positioned at the start of the file
# @raise [IOError] when the magic is absent
def validate_jp2(io)
  magic = io.read(23).to_s
  raise IOError, 'Not JP2 file' unless magic.end_with?('ftypjp2')
end
109
+ end
110
+ end
111
+ end
112
+ # rubocop:enable Style/FrozenStringLiteralComment
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'tmpdir'
4
+
5
+ module DerivativeRodeo
6
+ module Services
7
+ ##
8
+ # @api private
9
+ #
10
+ # @see .technical_metadata
11
+ # @see .convert
12
+ class ImageService < BaseService
13
+ attr_accessor :path
14
+
15
# @param path [String] path to the source image
def initialize(path)
  super()
  @path = path
  # Keep the first 23 bytes of the file; they hold the file magic.
  @initial_file_contents = File.open(@path) { |file| file.read(23) }
end
21
+
22
# @return [Boolean] true when the file magic ends with `ftypjp2`. An empty
#   file (whose magic was read as nil) is reported as not a JP2 instead of
#   raising NoMethodError.
def jp2?
  @initial_file_contents.to_s.end_with?('ftypjp2')
end
25
+
26
# Memoized technical metadata, read by the JP2-specific service for JP2
# files and by the `identify`-based service for everything else.
#
# @return [DerivativeRodeo::TechnicalMetadata]
def technical_metadata
  return @technical_metadata if defined?(@technical_metadata)

  extractor = jp2? ? ImageJp2Service : ImageIdentifyService
  @technical_metadata = extractor.technical_metadata_for(path: path)
end
36
+ alias metadata technical_metadata
37
+
38
+ extend Forwardable
39
+ def_delegator :technical_metadata, :monochrome?
40
+
41
# Convert source image to image at destination path, inferring file type from destination
# file extension. In case of JP2 files, create intermediate file using OpenJPEG 2000 that
# ImageMagick can use. Only outputs monochrome output if monochrome is true, destination
# format is TIFF.
#
# The intermediate TIFF created for a JP2 source is deleted once the
# conversion has finished or failed, and its directory is removed when that
# leaves it empty; previously both were left behind on every conversion.
#
# @param destination [String] Path to output / destination file
# @param monochrome [Boolean] true if monochrome output, otherwise false
# @raise [RuntimeError] when the destination ends with 'jp2'
def convert(destination:, monochrome: false)
  raise 'JP2 output not yet supported' if destination.end_with?('jp2')

  intermediate = jp2? ? jp2_to_tiff(path) : nil
  convert_image(source: intermediate || path, destination: destination, monochrome: monochrome)
ensure
  if intermediate
    File.delete(intermediate) if File.exist?(intermediate)
    begin
      # rmdir only succeeds on an empty directory, so unrelated files are never removed.
      Dir.rmdir(File.dirname(intermediate))
    rescue SystemCallError
      nil
    end
  end
end
54
+
55
+ private
56
+
57
# Run ImageMagick `convert` from source to destination.
#
# The command is passed to IO.popen as an argument array, so no shell is
# involved and paths containing spaces or shell metacharacters are passed
# through intact. A destination shorter than four characters no longer
# raises on a nil slice.
#
# @param source [String] path of the image to read
# @param destination [String] path of the image to write
# @param monochrome [Boolean] request bilevel output; only honoured when the
#   last four characters of destination contain 'tif'
# @return [String] whatever `convert` wrote to standard output
def convert_image(source:, destination:, monochrome:)
  monochrome &&= destination.slice(-4, 4).to_s.include?('tif')
  mono_opts = %w[-depth 1 -monochrome -compress Group4 -type bilevel]
  opts = monochrome ? mono_opts : []
  IO.popen(['convert', source.to_s, *opts, destination.to_s], &:read)
end
64
+
65
# Decode a JP2 into an intermediate TIFF with `opj_decompress`.
#
# The command is passed to IO.popen as an argument array, so no shell is
# involved and a source path containing spaces or shell metacharacters is
# passed through intact.
#
# @param source [String] path to the JP2 file
# @return [String] path of the intermediate TIFF, inside a new temporary directory
def jp2_to_tiff(source)
  intermediate_path = File.join(Dir.mktmpdir, 'intermediate.tif')
  IO.popen(['opj_decompress', '-i', source.to_s, '-o', intermediate_path], &:read)
  intermediate_path
end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,177 @@
1
+ # frozen_string_literal: true
2
+
3
require 'open3'
require 'securerandom'
require 'shellwords'
require 'tmpdir'
6
+
7
+ module DerivativeRodeo
8
+ module Services
9
+ module PdfSplitter
10
##
# Look up the splitter class for an image format name, e.g. `JpgPage`
# for "jpg".
#
# @param name [String]
# @return [Class] the matching constant under DerivativeRodeo::Services::PdfSplitter
def self.for(name)
  format_name = name.to_s.classify
  klass_name = "#{format_name}_page".classify
  "DerivativeRodeo::Services::PdfSplitter::#{klass_name}".constantize
end
17
+
18
+ ##
19
+ # @abstract
20
+ #
21
+ # The purpose of this class is to split the PDF into constituent image files.
22
+ #
23
+ # @see #each
24
+ class Base
25
+ class_attribute :image_extension
26
+ class_attribute :default_dpi, default: 400
27
+ # Should we perform compression logic on the images?
28
+ class_attribute :compression, default: nil
29
+ # What is the image quality we're using?
30
+ class_attribute :quality, default: nil
31
+
32
+ class_attribute :gsdevice, instance_accessor: false
33
+ class_attribute :page_count_regexp, instance_accessor: true, default: /^Pages: +(\d+)$/
34
##
# @api public
#
# Fixes the default for `baseid`, which referenced the non-existent
# constant `SureRandom` and raised NameError whenever the caller did not
# pass a baseid.
#
# @param path [String] The path to the PDF
# @param baseid [String] used for creating a unique identifier
# @param tmpdir [String] place to perform the "work" of splitting the PDF.
#
# @return [PdfSplitter::Base] an Enumerable that yields the page image paths
def self.call(path, baseid: SecureRandom.uuid, tmpdir: Dir.mktmpdir)
  new(path, baseid: baseid, tmpdir: tmpdir)
end
43
+
44
##
# @param path [String] the path to the source PDF that we're processing.
# @param baseid [String] used for creating a unique identifier
# @param tmpdir [String] place to perform the "work" of splitting the PDF.
# @param pdf_pages_summary [PdfSplitter::PagesSummary] by default we'll
#   extract this from the given path, but for testing purposes, you might want to
#   provide a specific summary.
# @param logger [Logger, #error]
def initialize(path,
               baseid: SecureRandom.uuid,
               # TODO: Do we need to provide the :tmpdir for the application?
               tmpdir: Dir.mktmpdir,
               pdf_pages_summary: PagesSummary.extract_from(path: path),
               logger: DerivativeRodeo.config.logger)
  @pdfpath = path
  @baseid = baseid
  @tmpdir = tmpdir
  @pdf_pages_summary = pdf_pages_summary
  @logger = logger
end
64
+
65
+ attr_reader :logger
66
+
67
+ # In creating {#each} we get many of the methods of array operation (e.g. #to_a).
68
+ include Enumerable
69
+
70
##
# @api public
#
# Iterate over the generated page images; this is the method Enumerable
# builds on.
#
# @yieldparam [String] the path to the page's image file.
def each(&block)
  pages = entries
  pages.each(&block)
end
77
+
78
# @api private
# @return [Boolean] true when the pages summary reports the PDF as not valid
def invalid_pdf?
  pdf_pages_summary.valid? ? false : true
end
82
+
83
+ attr_reader :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
84
+ private :pdf_pages_summary, :tmpdir, :baseid, :pdfpath
85
+
86
# @api private
# @return [String] the class-level gsdevice setting
# @raise [NotImplementedError] when the class has not set one
def gsdevice
  device = self.class.gsdevice
  raise NotImplementedError, "#{self.class}#gsdevice" unless device

  device
end
92
+
93
+ private
94
+
95
# Memoized list of generated page image paths, one entry per page.
def entries
  @entries = Array.wrap(gsconvert) unless defined?(@entries)
  @entries
end
101
+
102
# @return [String] memoized output path pattern; `%d` is the page number placeholder
def output_base
  @output_base ||= begin
    page_pattern = "#{baseid}-page%d.#{image_extension}"
    File.join(tmpdir, page_pattern)
  end
end
105
+
106
# Run Ghostscript and derive the generated file names from its output.
#
# Uses Open3.capture3, which collects stdout and stderr together. The
# previous popen3 code read stderr to the end before touching stdout, which
# can block forever once the child fills the stdout pipe.
#
# @return [Array<String>] one path per "Page " line printed by `gs`,
#   numbered from 1 using #output_base
def gsconvert
  stdout, err, _status = Open3.capture3(gsconvert_cmd(output_base))
  logger.error "#{self.class}#gsconvert encountered the following error with `gs': #{err}" if err.present?

  page_total = stdout.split("\n").count { |line| line.start_with?('Page ') }
  (1..page_total).map { |page_number| format(output_base, page_number) }
end
126
+
127
# Placeholder with no behaviour; accepts both keywords and returns nil.
def create_file_name(line:, page_number:)
  nil
end
128
+
129
# Build (and memoize) the Ghostscript command line.
#
# The output pattern and the PDF path are shell-escaped, so directories or
# file names containing spaces or shell metacharacters reach `gs` as single
# arguments.
#
# @param output_base [String] output path pattern containing `%d`
# @return [String] the command to run
def gsconvert_cmd(output_base)
  @gsconvert_cmd ||= begin
    cmd = "gs -dNOPAUSE -dBATCH -sDEVICE=#{gsdevice} -dTextAlphaBits=4"
    cmd += " -sCompression=#{compression}" if compression?
    cmd += " -dJPEGQ=#{quality}" if quality?
    cmd + " -sOutputFile=#{Shellwords.escape(output_base.to_s)} -r#{ppi} -f #{Shellwords.escape(pdfpath.to_s)}"
  end
end
138
+
139
# Ask `pdfinfo` how many pages the PDF has; the answer is memoized.
#
# `pdfinfo` is invoked with an argument list (no shell), so a path
# containing spaces or shell metacharacters is passed through intact.
# Output that lacks a page count line now raises a descriptive error
# instead of failing on a nil match.
#
# @return [Integer] number of pages
# @raise [RuntimeError] when pdfinfo prints nothing or no page count
def pagecount
  return @pagecount if defined? @pagecount

  output, err, _status = Open3.capture3('pdfinfo', pdfpath.to_s)
  logger.error "#{self.class}#pagecount encountered the following error with `pdfinfo': #{err}" if err.present?
  raise "pdfinfo failed to return output for #{pdfpath} - #{err}" if output.blank?

  match = page_count_regexp.match(output)
  raise "pdfinfo output for #{pdfpath} did not include a page count" if match.nil?

  @pagecount = match[1].to_i
end
154
+
155
# Resolution to render at: the PPI reported by the pages summary for media
# that looks scanned, otherwise the class default (400 dpi unless overridden).
def ppi
  looks_scanned? ? pdf_pages_summary.ppi : default_dpi
end
164
+
165
# @return [Boolean] true when #single_image_per_page? holds and the
#   summary's width times height exceeds 10 * 1024 * 1024 pixels
def looks_scanned?
  summary = pdf_pages_summary
  pixel_area = summary.width * summary.height
  return false unless single_image_per_page?

  # single 10mp+ image per page?
  pixel_area > 1024 * 1024 * 10
end
170
+
171
# @return [Boolean] true when the summary's page_count equals the page
#   count reported by #pagecount
def single_image_per_page?
  summary_pages = pdf_pages_summary.page_count
  pdf_pages = pagecount
  summary_pages == pdf_pages
end
174
+ end
175
+ end
176
+ end
177
+ end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DerivativeRodeo
4
+ module Services
5
+ module PdfSplitter
6
# Splits a PDF into one jpg file per page: Ghostscript's `jpeg` device,
# JPEG quality 50, files written with a `.jpg` extension.
class JpgPage < PdfSplitter::Base
  self.gsdevice = 'jpeg'
  self.quality = '50'
  self.image_extension = 'jpg'
end
12
+ end
13
+ end
14
+ end