derivative-rodeo 0.4.2 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 62872d16bfd5d73940f87d5c09f61f2a88ee67414f51905ce503f411b9b2fb37
4
- data.tar.gz: 742d63ca02418b3453824655738e25b47d3cca918f030e2fb5db4c997d52e945
3
+ metadata.gz: d2dc233be659f66709737ebdebc8ac6a69802f74bce6ac450e5eabb383ea4e54
4
+ data.tar.gz: 7a72ce66e69f827374b6f8e72767aca12638c71a9e5b2c63be09bbca08c12640
5
5
  SHA512:
6
- metadata.gz: e43b94745f35474edf4b463cd11b8c7d7bb29391f443c7ef2a9e84966d969aa6cf7c205a92c32a03452ba28c490c790ccd05d6569fd9db59d8c119b7e38f1dde
7
- data.tar.gz: '07962c3175aed6d77295e473ad8462d0ae634c931a2f3f5bc75be1195977dded3cec013448c9d2fe2653fa83b9d178dd3513aaa1eda8eeafe9da539e3dbf06b0'
6
+ metadata.gz: de5ff8fc29943f5dc52b49c7741ae4a742793439dcca97ea94259d044cc15870dd19ced0911d9305901757e838a8ecb401eb81744c4f0529eef9607534d5dcb8
7
+ data.tar.gz: bec393ace49d0e6689a07d335017987fa6e6edf9aae632159b87150dc2d4a0720441d64f6c6ce48ceb7c4498df3b6ada40b010cd0d439cb76cb071bd65c2c19d
@@ -35,7 +35,7 @@ module DerivativeRodeo
35
35
  ##
36
36
  # Raised when AWS bucket does not exist or is not accessible by current permissions
37
37
  class BucketMissingError < Error
38
- def initialize
38
+ def initialize(file_uri:)
39
39
  super("Bucket part missing #{file_uri}")
40
40
  end
41
41
  end
@@ -43,6 +43,9 @@ module DerivativeRodeo
43
43
  ##
44
44
  # Raised when trying to write a tmp file that does not exist
45
45
  class FileMissingError < Error
46
+ def self.with_info(**info)
47
+ new(info.inspect)
48
+ end
46
49
  end
47
50
 
48
51
  ##
@@ -11,11 +11,21 @@ module DerivativeRodeo
11
11
  ##
12
12
  # The Base Generator defines the interface and common methods.
13
13
  #
14
+ # Fundamentally, they are about ensuring the files end up at the specified location, based on
15
+ # the given:
16
+ #
17
+ # - {#input_uris}
18
+ # - {#output_location_template}
19
+ # - {#preprocessed_location_template}
20
+ #
14
21
  # In extending a BaseGenerator you:
15
22
  #
16
23
  # - must assign an {.output_extension}
17
24
  # - must implement a {#build_step} method
18
25
  # - may override {#with_each_requisite_location_and_tmp_file_path}
26
+ #
27
+ # {#generated_files} is "where the magic happens"
28
+ # rubocop:disable Metrics/ClassLength
19
29
  class BaseGenerator
20
30
  ##
21
31
  # @!group Class Attributes
@@ -26,9 +36,27 @@ module DerivativeRodeo
26
36
  class_attribute :output_extension
27
37
  # @!endgroup Class Attributes
28
38
 
29
- attr_reader :input_uris,
30
- :output_location_template,
31
- :preprocessed_location_template
39
+ ##
40
+ # @!group Attributes
41
+ #
42
+ # The "original" files that we'll be processing (via {#generated_files})
43
+ # @return [Array<String>]
44
+ attr_reader :input_uris
45
+
46
+ ##
47
+ # The template that defines where we'll be writing the {#input_uris} (via {#generated_files})
48
+ # @return [String]
49
+ # @see DerivativeRodeo::Services::ConvertUriViaTemplateService
50
+ attr_reader :output_location_template
51
+
52
+ ##
53
+ # The template that defines where we might find existing processed files for the given
54
+ # {#input_uris} (via {#generated_files})
55
+ #
56
+ # @return [String, NilClass]
57
+ # @see DerivativeRodeo::Services::ConvertUriViaTemplateService
58
+ attr_reader :preprocessed_location_template
59
+ # @!endgroup Attributes
32
60
 
33
61
  ##
34
62
  # @param input_uris [Array<String>]
@@ -78,6 +106,15 @@ module DerivativeRodeo
78
106
  ##
79
107
  # @api public
80
108
  #
109
+ # Based on the {#input_uris} ensure that we have files at the given output location (as
110
+ # derived from the {#output_location_template}). We ensure that by:
111
+ #
112
+ # - Checking if a file already exists at the output location
113
+ # - Copying a preprocessed file to the output location if a preprocessed file exists
114
+ # - Generating the file based on the input location
115
+ #
116
+ # @note This is the method where the magic happens!
117
+ #
81
118
  # @return [Array<StorageLocations::BaseLocation>]
82
119
  #
83
120
  # @see #build_step
@@ -87,6 +124,10 @@ module DerivativeRodeo
87
124
  # TODO: Examples please
88
125
  return @generated_files if defined?(@generated_files)
89
126
 
127
+ logger.info("Starting #{self.class}#generated_files with " \
128
+ "input_uris: #{input_uris.inspect}, " \
129
+ "output_location_template: #{output_location_template.inspect}, and " \
130
+ "preprocessed_location_template: #{preprocessed_location_template.inspect}.")
90
131
  # As much as I would like to use map or returned values; given the implementations it's
91
132
  # better to explicitly require that; reducing downstream implementation headaches.
92
133
  #
@@ -98,15 +139,15 @@ module DerivativeRodeo
98
139
  # BaseLocation is like the Ruby `File` (Pathname) "File.exist?(path) :: location.exist?"
99
140
  # "file:///Users/jfriesen/.profile"
100
141
  with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
101
- generated_file = destination(input_location)
102
- @generated_files << if generated_file.exist?
103
- generated_file
142
+ output_location = destination(input_location)
143
+ @generated_files << if output_location.exist?
144
+ output_location
104
145
  else
105
146
  log_message = "#{self.class}#generated_files :: " \
106
147
  "input_location file_uri #{input_location.file_uri} :: " \
107
- "Generating output_location file_uri #{generated_file.file_uri} via build_step."
148
+ "Generating output_location file_uri #{output_location.file_uri} via build_step."
108
149
  logger.info(log_message)
109
- build_step(input_location: input_location, output_location: generated_file, input_tmp_file_path: input_tmp_file_path)
150
+ build_step(input_location: input_location, output_location: output_location, input_tmp_file_path: input_tmp_file_path)
110
151
  end
111
152
  end
112
153
  @generated_files
@@ -163,9 +204,13 @@ module DerivativeRodeo
163
204
  end
164
205
 
165
206
  ##
166
- # Returns the location destination for the given :input_file. The file at the location
167
- # destination might exist or might not. In the case of non-existence, then the {#build_step}
168
- # will create the file.
207
+ # Returns the output location for the given :input_location. The file at the location
208
+ # destination might exist or might not. In the case where we have a
209
+ # {#preprocessed_location_template}, we'll also check the preprocessed location for the file,
210
+ # and if it exists there copy it to the target output location.
211
+ #
212
+ # In the case of non-existence, then the {#build_step} will create
213
+ # the file.
169
214
  #
170
215
  # @param input_location [StorageLocations::BaseLocation]
171
216
  #
@@ -191,22 +236,31 @@ module DerivativeRodeo
191
236
  log_message = "#{self.class}#destination :: " \
192
237
  "input_location file_uri #{input_location.file_uri} :: " \
193
238
  "No preprocessed_location_template provided " \
194
- "nor does a file exist at output_location file_uri #{output_location.file_uri};" \
195
- " moving on to generation via #{self.class}#build_step."
239
+ "nor does a file exist at output_location file_uri #{output_location.file_uri}; " \
240
+ "moving on to generation via #{self.class}#build_step."
196
241
  logger.info(log_message)
197
242
 
198
243
  return output_location
199
244
  end
200
245
 
201
- preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template, extension: output_extension)
246
+ template = derive_preprocessed_template_from(input_location: input_location, preprocessed_location_template: preprocessed_location_template)
247
+
248
+ preprocessed_location = input_location.derived_file_from(template: template, extension: output_extension)
202
249
  # We only want the location if it exists
203
- if preprocessed_location&.exist?
250
+ if preprocessed_location.exist?
204
251
  log_message = "#{self.class}#destination :: " \
205
252
  "input_location file_uri #{input_location.file_uri} :: " \
206
- "Found preprocessed_location file_uri #{output_location.file_uri}."
253
+ "Found preprocessed_location file_uri #{preprocessed_location.file_uri}."
207
254
  logger.info(log_message)
208
255
 
209
- return preprocessed_location
256
+ # Let's make sure we reap the fruits of the pre-processing; and don't worry that generator
257
+ # will also write some logs.
258
+ output_location = CopyGenerator.new(
259
+ input_uris: [preprocessed_location.file_uri],
260
+ output_location_template: output_location.file_uri
261
+ ).generated_files.first
262
+
263
+ return output_location
210
264
  end
211
265
 
212
266
  log_message = "#{self.class}#destination :: " \
@@ -223,6 +277,22 @@ module DerivativeRodeo
223
277
  # rubocop:enable Metrics/AbcSize
224
278
  # rubocop:enable Metrics/MethodLength
225
279
 
280
+ ##
281
+ # Some generators (e.g. {PdfSplitGenerator}) need to coerce the location template based on
282
+ # the input location. Most often, however, the given :preprocessed_location_template is
283
+ # adequate and would be the typical returned value.
284
+ #
285
+ # @param input_location [StorageLocations::BaseLocation]
286
+ # @param preprocessed_location_template [String]
287
+ #
288
+ # @return [String]
289
+ #
290
+ # rubocop:disable Lint/UnusedMethodArgument
291
+ def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
292
+ preprocessed_location_template
293
+ end
294
+ # rubocop:enable Lint/UnusedMethodArgument
295
+
226
296
  ##
227
297
  # A bit of indirection to create a common interface for running a shell command.
228
298
  #
@@ -237,6 +307,7 @@ module DerivativeRodeo
237
307
  result
238
308
  end
239
309
  end
310
+ # rubocop:enable Metrics/ClassLength
240
311
  end
241
312
  end
242
313
 
@@ -66,7 +66,7 @@ module DerivativeRodeo
66
66
  #
67
67
  # @note There is relation to {Generators::BaseGenerator#destination} and this method.
68
68
  #
69
- # @note The tail_glob is in relation to the {#image_file_basename_template}
69
+ # @note The tail_regexp is in relation to the {#image_file_basename_template}
70
70
  def existing_page_locations(input_location:)
71
71
  # See image_file_basename_template
72
72
  tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}
@@ -76,12 +76,14 @@ module DerivativeRodeo
76
76
 
77
77
  return [] if preprocessed_location_template.blank?
78
78
 
79
- input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_regexp: tail_regexp)
79
+ input_location.derived_file_from(template: preprocessed_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
80
80
  end
81
81
 
82
82
  ##
83
83
  # @api public
84
84
  #
85
+ # @param splitter [#call]
86
+ #
85
87
  # Take the given PDF(s) and split them into one image per page. Remember that the URL should account for
86
88
  # the page number.
87
89
  #
@@ -98,22 +100,27 @@ module DerivativeRodeo
98
100
  # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
99
101
  #
100
102
  # rubocop:disable Metrics/MethodLength
101
- def with_each_requisite_location_and_tmp_file_path
103
+ # rubocop:disable Metrics/AbcSize
104
+ def with_each_requisite_location_and_tmp_file_path(splitter: Services::PdfSplitter)
102
105
  input_files.each do |input_location|
103
106
  input_location.with_existing_tmp_path do |input_tmp_file_path|
104
107
  existing_locations = existing_page_locations(input_location: input_location)
105
108
 
106
109
  if existing_locations.count.positive?
107
- existing_locations.each do |location|
110
+ logger.info("#{self.class}##{__method__} found #{existing_locations.count} file(s) at existing split location for #{input_location.file_uri.inspect}.")
111
+ existing_locations.each_with_index do |location, index|
112
+ logger.info("#{self.class}##{__method__} found ##{index} split file #{location.file_path.inspect} for #{input_location.file_uri.inspect}.")
108
113
  yield(location, location.file_path)
109
114
  end
110
115
  else
116
+ logger.info("#{self.class}##{__method__} did not find at existing location split files for #{input_location.file_uri.inspect}. Proceeding with #{splitter}.call")
111
117
  # We're going to need to create the files and "cast" them to locations.
112
- Services::PdfSplitter.call(
118
+ splitter.call(
113
119
  input_tmp_file_path,
114
120
  image_extension: output_extension,
115
121
  image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
116
- ).each do |image_path|
122
+ ).each_with_index do |image_path, index|
123
+ logger.info("#{self.class}##{__method__} generated (via #{splitter}.call) ##{index} split file #{image_path.inspect} for #{input_location.file_uri.inspect}.")
117
124
  image_location = StorageLocations::FileLocation.new("file://#{image_path}")
118
125
  yield(image_location, image_path)
119
126
  end
@@ -121,7 +128,18 @@ module DerivativeRodeo
121
128
  end
122
129
  end
123
130
  end
131
+ # rubocop:enable Metrics/AbcSize
124
132
  # rubocop:enable Metrics/MethodLength
133
+
134
+ ##
135
+ # We're working with an input location with a filename basename of "123.ARCHIVAL--page-1.tiff"
136
+ # The :preprocessed_location_template, due to constraints, likely ends with the original PDF's
137
+ # filename (e.g. "123.ARCHIVAL.pdf")
138
+ #
139
+ # And since the template doesn't have a concept of page number, we introduce this kludge.
140
+ def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
141
+ File.join(File.dirname(preprocessed_location_template), input_location.file_name)
142
+ end
125
143
  end
126
144
  end
127
145
  end
@@ -33,6 +33,11 @@ module DerivativeRodeo
33
33
  File.open(path_to_coordinate, "w+") do |file|
34
34
  file.puts service.call(hocr_html).to_json
35
35
  end
36
+ rescue => e
37
+ message = "#{self.class}##{__method__} encountered `#{e.class}' error “#{e}” for path_to_hocr: #{path_to_hocr.inspect} and path_to_coordinate: #{path_to_coordinate.inspect}"
38
+ exception = RuntimeError.new(message)
39
+ exception.set_backtrace(e.backtrace)
40
+ raise exception
36
41
  end
37
42
  end
38
43
  end
@@ -46,6 +46,8 @@ module DerivativeRodeo
46
46
  delegate :config, to: DerivativeRodeo
47
47
  end
48
48
 
49
+ delegate :logger, to: DerivativeRodeo
50
+
49
51
  ##
50
52
  # @param location_name [String]
51
53
  #
@@ -23,8 +23,8 @@ module DerivativeRodeo
23
23
  delegate :config, to: DerivativeRodeo
24
24
 
25
25
  def with_existing_tmp_path(&block)
26
- with_tmp_path(lambda { |_file_path, tmp_file_path, exist|
27
- raise Errors::FileMissingError unless exist
26
+ with_tmp_path(lambda { |file_path, tmp_file_path, exist|
27
+ raise Errors::FileMissingError.with_info(method: __method__, context: self, file_path: file_path, tmp_file_path: tmp_file_path) unless exist
28
28
 
29
29
  response = get(file_uri)
30
30
  File.open(tmp_file_path, 'wb') { |fp| fp.write(response.body) }
@@ -16,7 +16,7 @@ module DerivativeRodeo
16
16
 
17
17
  def with_existing_tmp_path(&block)
18
18
  with_tmp_path(lambda { |file_path, tmp_file_path, exist|
19
- raise Errors::FileMissingError unless exist
19
+ raise Errors::FileMissingError.with_info(method: __method__, context: self, file_path: file_path, tmp_file_path: tmp_file_path) unless exist
20
20
 
21
21
  FileUtils.cp(file_path, tmp_file_path)
22
22
  }, &block)
@@ -42,6 +42,9 @@ module DerivativeRodeo
42
42
  #
43
43
  # @param tail_regexp [Regexp]
44
44
  def matching_locations_in_file_dir(tail_regexp:)
45
+ logger.debug("#{self.class}##{__method__} searching for matching files in " \
46
+ "file_dir: #{file_dir.inspect} " \
47
+ "with tail_regexp: #{tail_regexp.inspect}.")
45
48
  Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
46
49
  accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
47
50
  end
@@ -47,7 +47,7 @@ module DerivativeRodeo
47
47
  # @return [String] the path to the tmp file
48
48
  def with_existing_tmp_path(&block)
49
49
  with_tmp_path(lambda { |file_path, tmp_file_path, exist|
50
- raise Errors::FileMissingError unless exist
50
+ raise Errors::FileMissingError.with_info(method: __method__, context: self, file_path: file_path, tmp_file_path: tmp_file_path) unless exist
51
51
  obj = bucket.object(file_path)
52
52
  obj.download_file(tmp_file_path)
53
53
  }, &block)
@@ -73,7 +73,10 @@ module DerivativeRodeo
73
73
  def matching_locations_in_file_dir(tail_regexp:)
74
74
  uri = URI.parse(file_uri)
75
75
  scheme_and_host = "#{uri.scheme}://#{uri.host}"
76
-
76
+ logger.debug("#{self.class}##{__method__} searching for matching files for " \
77
+ "scheme_and_host: #{scheme_and_host.inspect} " \
78
+ "file_dir: #{file_dir.inspect} " \
79
+ "with tail_regexp: #{tail_regexp.inspect}.")
77
80
  bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
78
81
  if tail_regexp.match(object.key)
79
82
  template = File.join(scheme_and_host, object.key)
@@ -120,7 +123,7 @@ module DerivativeRodeo
120
123
  def bucket_name
121
124
  @bucket_name ||= file_uri.match(%r{s3://(.+)\.s3})&.[](1)
122
125
  rescue StandardError
123
- raise Errors::BucketMissingError
126
+ raise Errors::BucketMissingError.new(file_uri: file_uri)
124
127
  end
125
128
 
126
129
  # @see .use_actual_s3_bucket
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DerivativeRodeo
4
- VERSION = '0.4.2'
4
+ VERSION = '0.5.1'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: derivative-rodeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Kaufman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-07-10 00:00:00.000000000 Z
12
+ date: 2023-11-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport