derivative-rodeo 0.4.2 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 62872d16bfd5d73940f87d5c09f61f2a88ee67414f51905ce503f411b9b2fb37
4
- data.tar.gz: 742d63ca02418b3453824655738e25b47d3cca918f030e2fb5db4c997d52e945
3
+ metadata.gz: f5ea42b2912b6b1eb6981b8a72a3d39e4d6f466e07ad60381ba4880431214b18
4
+ data.tar.gz: 4e4ca5cdd6ba61898ba13970fbd95c8836c33492c377eb6f258255c8ebf79e67
5
5
  SHA512:
6
- metadata.gz: e43b94745f35474edf4b463cd11b8c7d7bb29391f443c7ef2a9e84966d969aa6cf7c205a92c32a03452ba28c490c790ccd05d6569fd9db59d8c119b7e38f1dde
7
- data.tar.gz: '07962c3175aed6d77295e473ad8462d0ae634c931a2f3f5bc75be1195977dded3cec013448c9d2fe2653fa83b9d178dd3513aaa1eda8eeafe9da539e3dbf06b0'
6
+ metadata.gz: 6ebaacf09b1459e8bb8527ddd3b45b6e84106d335e91cc506a6b9b31aae6c74931c755dc5e084e9a39a6ddb0ca8f1b0f85ee21e461cac445edd3eea45384b1b5
7
+ data.tar.gz: e11195b3b7169a1f7e2f5df821c681512af93123fc272d3b1dd35d9c971f15ad58a86b4ab1cabf38b463c77d5531114af7ca36eb9197275e46549788a4ec6c1f
@@ -35,7 +35,7 @@ module DerivativeRodeo
35
35
  ##
36
36
  # Raised when AWS bucket does not exist or is not accessible by current permissions
37
37
  class BucketMissingError < Error
38
- def initialize
38
+ def initialize(file_uri:)
39
39
  super("Bucket part missing #{file_uri}")
40
40
  end
41
41
  end
@@ -11,11 +11,21 @@ module DerivativeRodeo
11
11
  ##
12
12
  # The Base Generator defines the interface and common methods.
13
13
  #
14
+ # Fundamentally, they are about ensuring the files end up at the specified location, based on
15
+ # the given:
16
+ #
17
+ # - {#input_uris}
18
+ # - {#output_location_template}
19
+ # - {#preprocessed_location_template}
20
+ #
14
21
  # In extending a BaseGenerator you:
15
22
  #
16
23
  # - must assign an {.output_extension}
17
24
  # - must implement a {#build_step} method
18
25
  # - may override {#with_each_requisite_location_and_tmp_file_path}
26
+ #
27
+ # {#generated_files} is "where the magic happens"
28
+ # rubocop:disable Metrics/ClassLength
19
29
  class BaseGenerator
20
30
  ##
21
31
  # @!group Class Attributes
@@ -26,9 +36,27 @@ module DerivativeRodeo
26
36
  class_attribute :output_extension
27
37
  # @!endgroup Class Attributes
28
38
 
29
- attr_reader :input_uris,
30
- :output_location_template,
31
- :preprocessed_location_template
39
+ ##
40
+ # @!group Attributes
41
+ #
42
+ # The "original" files that we'll be processing (via {#generated_files})
43
+ # @return [Array<String>]
44
+ attr_reader :input_uris
45
+
46
+ ##
47
+ # The template that defines where we'll be writing the {#input_uris} (via {#generated_files})
48
+ # @return [String]
49
+ # @see DerivativeRodeo::Services::ConvertUriViaTemplateService
50
+ attr_reader :output_location_template
51
+
52
+ ##
53
+ # The template that defines where we might find existing processed files for the given
54
+ # {#input_uris} (via {#generated_files})
55
+ #
56
+ # @return [String, NilClass]
57
+ # @see DerivativeRodeo::Services::ConvertUriViaTemplateService
58
+ attr_reader :preprocessed_location_template
59
+ # @!endgroup Attributes
32
60
 
33
61
  ##
34
62
  # @param input_uris [Array<String>]
@@ -78,6 +106,15 @@ module DerivativeRodeo
78
106
  ##
79
107
  # @api public
80
108
  #
109
+ # Based on the {#input_uris} ensure that we have files at the given output location (as
110
+ # derived from the {#output_location_template}). We ensure that by:
111
+ #
112
+ # - Checking if a file already exists at the output location
113
+ # - Copying a preprocessed file to the output location if a preprocessed file exists
114
+ # - Generating the file based on the input location
115
+ #
116
+ # @note This is the method where the magic happens!
117
+ #
81
118
  # @return [Array<StorageLocations::BaseLocation>]
82
119
  #
83
120
  # @see #build_step
@@ -87,6 +124,10 @@ module DerivativeRodeo
87
124
  # TODO: Examples please
88
125
  return @generated_files if defined?(@generated_files)
89
126
 
127
+ logger.info("Starting #{self.class}#generated_files with " \
128
+ "input_uris: #{input_uris.inspect}, " \
129
+ "output_location_template: #{output_location_template.inspect}, and " \
130
+ "preprocessed_location_template: #{preprocessed_location_template.inspect}.")
90
131
  # As much as I would like to use map or returned values; given the implementations it's
91
132
  # better to explicitly require that; reducing downstream implementation headaches.
92
133
  #
@@ -98,15 +139,15 @@ module DerivativeRodeo
98
139
  # BaseLocation is like the Ruby `File` (Pathname) "File.exist?(path) :: location.exist?"
99
140
  # "file:///Users/jfriesen/.profile"
100
141
  with_each_requisite_location_and_tmp_file_path do |input_location, input_tmp_file_path|
101
- generated_file = destination(input_location)
102
- @generated_files << if generated_file.exist?
103
- generated_file
142
+ output_location = destination(input_location)
143
+ @generated_files << if output_location.exist?
144
+ output_location
104
145
  else
105
146
  log_message = "#{self.class}#generated_files :: " \
106
147
  "input_location file_uri #{input_location.file_uri} :: " \
107
- "Generating output_location file_uri #{generated_file.file_uri} via build_step."
148
+ "Generating output_location file_uri #{output_location.file_uri} via build_step."
108
149
  logger.info(log_message)
109
- build_step(input_location: input_location, output_location: generated_file, input_tmp_file_path: input_tmp_file_path)
150
+ build_step(input_location: input_location, output_location: output_location, input_tmp_file_path: input_tmp_file_path)
110
151
  end
111
152
  end
112
153
  @generated_files
@@ -163,9 +204,13 @@ module DerivativeRodeo
163
204
  end
164
205
 
165
206
  ##
166
- # Returns the location destination for the given :input_file. The file at the location
167
- # destination might exist or might not. In the case of non-existence, then the {#build_step}
168
- # will create the file.
207
+ # Returns the output location for the given :input_location. The file at the location
208
+ # destination might exist or might not. In the case where we have a
209
+ # {#preprocessed_location_template}, we'll also check the preprocessed location for the file,
210
+ # and if it exists there copy it to the target output location.
211
+ #
212
+ # In the case of non-existence, then the {#build_step} will create
213
+ # the file.
169
214
  #
170
215
  # @param input_location [StorageLocations::BaseLocation]
171
216
  #
@@ -191,22 +236,31 @@ module DerivativeRodeo
191
236
  log_message = "#{self.class}#destination :: " \
192
237
  "input_location file_uri #{input_location.file_uri} :: " \
193
238
  "No preprocessed_location_template provided " \
194
- "nor does a file exist at output_location file_uri #{output_location.file_uri};" \
195
- " moving on to generation via #{self.class}#build_step."
239
+ "nor does a file exist at output_location file_uri #{output_location.file_uri}; " \
240
+ "moving on to generation via #{self.class}#build_step."
196
241
  logger.info(log_message)
197
242
 
198
243
  return output_location
199
244
  end
200
245
 
201
- preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template, extension: output_extension)
246
+ template = derive_preprocessed_template_from(input_location: input_location, preprocessed_location_template: preprocessed_location_template)
247
+
248
+ preprocessed_location = input_location.derived_file_from(template: template, extension: output_extension)
202
249
  # We only want the location if it exists
203
- if preprocessed_location&.exist?
250
+ if preprocessed_location.exist?
204
251
  log_message = "#{self.class}#destination :: " \
205
252
  "input_location file_uri #{input_location.file_uri} :: " \
206
- "Found preprocessed_location file_uri #{output_location.file_uri}."
253
+ "Found preprocessed_location file_uri #{preprocessed_location.file_uri}."
207
254
  logger.info(log_message)
208
255
 
209
- return preprocessed_location
256
+ # Let's make sure we reap the fruits of the pre-processing; and don't worry that generator
257
+ # will also write some logs.
258
+ output_location = CopyGenerator.new(
259
+ input_uris: [preprocessed_location.file_uri],
260
+ output_location_template: output_location.file_uri
261
+ ).generated_files.first
262
+
263
+ return output_location
210
264
  end
211
265
 
212
266
  log_message = "#{self.class}#destination :: " \
@@ -223,6 +277,22 @@ module DerivativeRodeo
223
277
  # rubocop:enable Metrics/AbcSize
224
278
  # rubocop:enable Metrics/MethodLength
225
279
 
280
+ ##
281
+ # Some generators (e.g. {PdfSplitGenerator}) need to coerce the location template based on
282
+ # the input location. Most often, however, the given :preprocessed_location_template is
283
+ # adequate and would be the typical returned value.
284
+ #
285
+ # @param input_location [StorageLocations::BaseLocation]
286
+ # @param preprocessed_location_template [String]
287
+ #
288
+ # @return [String]
289
+ #
290
+ # rubocop:disable Lint/UnusedMethodArgument
291
+ def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
292
+ preprocessed_location_template
293
+ end
294
+ # rubocop:enable Lint/UnusedMethodArgument
295
+
226
296
  ##
227
297
  # A bit of indirection to create a common interface for running a shell command.
228
298
  #
@@ -237,6 +307,7 @@ module DerivativeRodeo
237
307
  result
238
308
  end
239
309
  end
310
+ # rubocop:enable Metrics/ClassLength
240
311
  end
241
312
  end
242
313
 
@@ -66,7 +66,7 @@ module DerivativeRodeo
66
66
  #
67
67
  # @note There is relation to {Generators::BaseGenerator#destination} and this method.
68
68
  #
69
- # @note The tail_glob is in relation to the {#image_file_basename_template}
69
+ # @note The tail_regexp is in relation to the {#image_file_basename_template}
70
70
  def existing_page_locations(input_location:)
71
71
  # See image_file_basename_template
72
72
  tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}
@@ -76,12 +76,14 @@ module DerivativeRodeo
76
76
 
77
77
  return [] if preprocessed_location_template.blank?
78
78
 
79
- input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_regexp: tail_regexp)
79
+ input_location.derived_file_from(template: preprocessed_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
80
80
  end
81
81
 
82
82
  ##
83
83
  # @api public
84
84
  #
85
+ # @param splitter [#call]
86
+ #
85
87
  # Take the given PDF(s) and split them into one image per page. Remember that the URL should account for
86
88
  # the page number.
87
89
  #
@@ -98,22 +100,27 @@ module DerivativeRodeo
98
100
  # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
99
101
  #
100
102
  # rubocop:disable Metrics/MethodLength
101
- def with_each_requisite_location_and_tmp_file_path
103
+ # rubocop:disable Metrics/AbcSize
104
+ def with_each_requisite_location_and_tmp_file_path(splitter: Services::PdfSplitter)
102
105
  input_files.each do |input_location|
103
106
  input_location.with_existing_tmp_path do |input_tmp_file_path|
104
107
  existing_locations = existing_page_locations(input_location: input_location)
105
108
 
106
109
  if existing_locations.count.positive?
107
- existing_locations.each do |location|
110
+ logger.info("#{self.class}##{__method__} found #{existing_locations.count} file(s) at existing split location for #{input_location.file_uri.inspect}.")
111
+ existing_locations.each_with_index do |location, index|
112
+ logger.info("#{self.class}##{__method__} found ##{index} split file #{location.file_path.inspect} for #{input_location.file_uri.inspect}.")
108
113
  yield(location, location.file_path)
109
114
  end
110
115
  else
116
+ logger.info("#{self.class}##{__method__} did not find at existing location split files for #{input_location.file_uri.inspect}. Proceeding with #{splitter}.call")
111
117
  # We're going to need to create the files and "cast" them to locations.
112
- Services::PdfSplitter.call(
118
+ splitter.call(
113
119
  input_tmp_file_path,
114
120
  image_extension: output_extension,
115
121
  image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
116
- ).each do |image_path|
122
+ ).each_with_index do |image_path, index|
123
+ logger.info("#{self.class}##{__method__} generated (via #{splitter}.call) ##{index} split file #{image_path.inspect} for #{input_location.file_uri.inspect}.")
117
124
  image_location = StorageLocations::FileLocation.new("file://#{image_path}")
118
125
  yield(image_location, image_path)
119
126
  end
@@ -121,7 +128,18 @@ module DerivativeRodeo
121
128
  end
122
129
  end
123
130
  end
131
+ # rubocop:enable Metrics/AbcSize
124
132
  # rubocop:enable Metrics/MethodLength
133
+
134
+ ##
135
+ # We're working with an input location with a filename basename of "123.ARCHIVAL--page-1.tiff"
136
+ # The :preprocessed_location_template, due to constraints, likely ends with the original PDF's
137
+ # filename (e.g. "123.ARCHIVAL.pdf")
138
+ #
139
+ # And since the template doesn't have a concept of page number, we introduce this kludge.
140
+ def derive_preprocessed_template_from(input_location:, preprocessed_location_template:)
141
+ File.join(File.dirname(preprocessed_location_template), input_location.file_name)
142
+ end
125
143
  end
126
144
  end
127
145
  end
@@ -33,6 +33,11 @@ module DerivativeRodeo
33
33
  File.open(path_to_coordinate, "w+") do |file|
34
34
  file.puts service.call(hocr_html).to_json
35
35
  end
36
+ rescue => e
37
+ message = "#{self.class}##{__method__} encountered `#{e.class}' error “#{e}” for path_to_hocr: #{path_to_hocr.inspect} and path_to_coordinate: #{path_to_coordinate.inspect}"
38
+ exception = RuntimeError.new(message)
39
+ exception.set_backtrace(e.backtrace)
40
+ raise exception
36
41
  end
37
42
  end
38
43
  end
@@ -46,6 +46,8 @@ module DerivativeRodeo
46
46
  delegate :config, to: DerivativeRodeo
47
47
  end
48
48
 
49
+ delegate :logger, to: DerivativeRodeo
50
+
49
51
  ##
50
52
  # @param location_name [String]
51
53
  #
@@ -42,6 +42,9 @@ module DerivativeRodeo
42
42
  #
43
43
  # @param tail_regexp [Regexp]
44
44
  def matching_locations_in_file_dir(tail_regexp:)
45
+ logger.debug("#{self.class}##{__method__} searching for matching files in " \
46
+ "file_dir: #{file_dir.inspect} " \
47
+ "with tail_regexp: #{tail_regexp.inspect}.")
45
48
  Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
46
49
  accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
47
50
  end
@@ -73,7 +73,10 @@ module DerivativeRodeo
73
73
  def matching_locations_in_file_dir(tail_regexp:)
74
74
  uri = URI.parse(file_uri)
75
75
  scheme_and_host = "#{uri.scheme}://#{uri.host}"
76
-
76
+ logger.debug("#{self.class}##{__method__} searching for matching files for " \
77
+ "scheme_and_host: #{scheme_and_host.inspect} " \
78
+ "file_dir: #{file_dir.inspect} " \
79
+ "with tail_regexp: #{tail_regexp.inspect}.")
77
80
  bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
78
81
  if tail_regexp.match(object.key)
79
82
  template = File.join(scheme_and_host, object.key)
@@ -120,7 +123,7 @@ module DerivativeRodeo
120
123
  def bucket_name
121
124
  @bucket_name ||= file_uri.match(%r{s3://(.+)\.s3})&.[](1)
122
125
  rescue StandardError
123
- raise Errors::BucketMissingError
126
+ raise Errors::BucketMissingError.new(file_uri: file_uri)
124
127
  end
125
128
 
126
129
  # @see .use_actual_s3_bucket
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DerivativeRodeo
4
- VERSION = '0.4.2'
4
+ VERSION = '0.5.0'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: derivative-rodeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Kaufman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-07-10 00:00:00.000000000 Z
12
+ date: 2023-07-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport