derivative-rodeo 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc2eed3e32c7a4558d55e9d530b6790a5b876dcdfc4ced421cfa4894aa977d44
4
- data.tar.gz: 6e16e4bd7b9d38a1a19b1768a5cdb021c6aa946287f430c6a6c62fa26a215ca6
3
+ metadata.gz: bee0908ac5d045db1948b062d3a4e1569fff41b4a5a0f1521f31b620c15c53a6
4
+ data.tar.gz: 328de6f1cde3bcdadca31a361821b2c573425d15bfddbea5bf85d9475a8a3d0e
5
5
  SHA512:
6
- metadata.gz: 0ac19d20f92490eed508949b18df66ce61d0850a22a2b8b1e514673ddd447afb578e8090d4234dc0a179b85c25a145e44bce6a1e71cfe2f67d2e3b438cb4b9ff
7
- data.tar.gz: 6f503dd265243982bc9163b7fb6da42211eca3eb647b1ee9491fcbc06b373c6822222ee6d72c190f2e1bbd7ca63c8126102acd841b1e8f0240434a1af3a69a4f
6
+ metadata.gz: 2d8ba019ef30666d9633b73e90fe599b53d5f6698de0400b43a25c37a06470542962db8f61d74ed6a444f1ecc5ffbd604d2c0891dffcf5fa86d9dce87ace1bb5
7
+ data.tar.gz: b682377ce08e4379f1323bfdb6c6b8f46c97d45ebb392bb1dc7f5bc27f5b01701731880dc08f96cb5ef2b0da8ad39349b4534a63b4039e78616331b93bd9516c
@@ -39,7 +39,6 @@ module DerivativeRodeo
39
39
  # {Services::ConvertUriViaTemplateService} with the given
40
40
  # :preprocessed_location_template.
41
41
  def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
42
- # NOTE: Are we using this preprocessed_location_template? Wondering?
43
42
  @input_uris = Array.wrap(input_uris)
44
43
  @output_location_template = output_location_template
45
44
  @preprocessed_location_template = preprocessed_location_template
@@ -52,7 +52,7 @@ module DerivativeRodeo
52
52
  # @see #existing_page_locations
53
53
  # @see .filename_for_a_derived_page_from_a_pdf?
54
54
  def image_file_basename_template(basename:)
55
- "#{basename}/pages/#{basename}--page-%d.#{output_extension}"
55
+ "#{basename}--page-%d.#{output_extension}"
56
56
  end
57
57
 
58
58
  ##
@@ -62,21 +62,21 @@ module DerivativeRodeo
62
62
  # @param input_location [StorageLocations::BaseLocation]
63
63
  #
64
64
  # @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
65
- # with :tail_glob.
65
+ # with :tail_regexp.
66
66
  #
67
67
  # @note There is a relationship between {Generators::BaseGenerator#destination} and this method.
68
68
  #
69
69
  # @note The tail_regexp is in relation to the {#image_file_basename_template}
70
70
  def existing_page_locations(input_location:)
71
71
  # See image_file_basename_template
72
- tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"
72
+ tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}
73
73
 
74
- output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
74
+ output_locations = input_location.derived_file_from(template: output_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
75
75
  return output_locations if output_locations.count.positive?
76
76
 
77
77
  return [] if preprocessed_location_template.blank?
78
78
 
79
- input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_glob: tail_glob)
79
+ input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_regexp: tail_regexp)
80
80
  end
81
81
 
82
82
  ##
@@ -101,20 +101,22 @@ module DerivativeRodeo
101
101
  def with_each_requisite_location_and_tmp_file_path
102
102
  input_files.each do |input_location|
103
103
  input_location.with_existing_tmp_path do |input_tmp_file_path|
104
- ## We want a single call for a directory listing of the image_file_basename_template
105
- generated_files = existing_page_locations(input_location: input_location)
104
+ existing_locations = existing_page_locations(input_location: input_location)
106
105
 
107
- if generated_files.count.zero?
108
- generated_files = Services::PdfSplitter.call(
106
+ if existing_locations.count.positive?
107
+ existing_locations.each do |location|
108
+ yield(location, location.file_path)
109
+ end
110
+ else
111
+ # We're going to need to create the files and "cast" them to locations.
112
+ Services::PdfSplitter.call(
109
113
  input_tmp_file_path,
110
114
  image_extension: output_extension,
111
115
  image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
112
- )
113
- end
114
-
115
- generated_files.each do |image_path|
116
- image_location = StorageLocations::FileLocation.new("file://#{image_path}")
117
- yield(image_location, image_path)
116
+ ).each do |image_path|
117
+ image_location = StorageLocations::FileLocation.new("file://#{image_path}")
118
+ yield(image_location, image_path)
119
+ end
118
120
  end
119
121
  end
120
122
  end
@@ -210,18 +210,18 @@ module DerivativeRodeo
210
210
 
211
211
  ##
212
212
  # When you have a known location and want to check for files that are within that location,
213
- # use the {#globbed_tail_locations} method. In the case of {Generators::PdfSplitGenerator} we
213
+ # use the {#matching_locations_in_file_dir} method. In the case of {Generators::PdfSplitGenerator} we
214
214
  # need to know the path to all of the image files we "split" off of the given PDF.
215
215
  #
216
216
  # We can use the :file_path as the prefix and the given :tail_regexp as the suffix for a "fully
217
217
  # qualified" Dir.glob type search.
218
218
  #
219
- # @param tail_glob [String]
219
+ # @param tail_regexp [Regexp]
220
220
  #
221
221
  # @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
222
222
  # array when there are none.
223
- def globbed_tail_locations(tail_glob:)
224
- raise NotImplementedError, "#{self.class}#globbed_locations"
223
+ def matching_locations_in_file_dir(tail_regexp:)
224
+ raise NotImplementedError, "#{self.class}#matching_locations_in_file_dir"
225
225
  end
226
226
 
227
227
  ##
@@ -35,8 +35,16 @@ module DerivativeRodeo
35
35
  file_uri
36
36
  end
37
37
 
38
- def globbed_tail_locations(tail_glob:)
39
- Dir.glob(File.join(file_dir, tail_glob))
38
+ ##
39
+ # @return [Enumerable<DerivativeRodeo::StorageLocations::FileLocation>]
40
+ #
41
+ # @see Generators::PdfSplitGenerator#image_file_basename_template
42
+ #
43
+ # @param tail_regexp [Regexp]
44
+ def matching_locations_in_file_dir(tail_regexp:)
45
+ Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
46
+ accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
47
+ end
40
48
  end
41
49
  end
42
50
  end
@@ -65,31 +65,19 @@ module DerivativeRodeo
65
65
  ##
66
66
  # @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
67
67
  #
68
- # @note S3 allows searching on a prefix but does not allow for "wildcard" searches. We can
69
- # use the components of the file_path to fake that behavior.
68
+ # @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
70
69
  #
71
70
  # @see Generators::PdfSplitGenerator#image_file_basename_template
72
- def globbed_tail_locations(tail_glob:)
73
- # file_path = "s3://blah/1234/hello-world/pages/*.tiff"
74
- #
75
- # NOTE: Should we be storing our files as such? The pattern we need is
76
- # :parent_identifier/:file_set_identifier/files There are probably cases where a work has
77
- # more than one PDF (that we intend to split); we don't want to trample on those split files
78
- # and miscolate two PDFs.
79
- #
80
- # file_path = "s3://blah/1234/hello-world/hello-world.pdf
81
- globname = File.join(file_dir, tail_glob)
82
- regexp = %r{#{File.extname(globname)}$}
83
-
84
- # NOTE: We're making some informed guesses, needing to include the fully qualified template
85
- # based on both the key of the item in the bucket as well as the bucket's host.
71
+ #
72
+ # @param tail_regexp [Regexp]
73
+ def matching_locations_in_file_dir(tail_regexp:)
86
74
  uri = URI.parse(file_uri)
87
75
  scheme_and_host = "#{uri.scheme}://#{uri.host}"
88
76
 
89
- bucket.objects(prefix: File.dirname(globname)).flat_map do |object|
90
- if object.key.match(regexp)
77
+ bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
78
+ if tail_regexp.match(object.key)
91
79
  template = File.join(scheme_and_host, object.key)
92
- derived_file_from(template: template)
80
+ accumulator << derived_file_from(template: template)
93
81
  end
94
82
  end
95
83
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DerivativeRodeo
4
- VERSION = '0.3.0'
4
+ VERSION = '0.4.0'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: derivative-rodeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Kaufman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-06-05 00:00:00.000000000 Z
12
+ date: 2023-06-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -337,7 +337,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
337
337
  - !ruby/object:Gem::Version
338
338
  version: '0'
339
339
  requirements: []
340
- rubygems_version: 3.3.7
340
+ rubygems_version: 3.1.6
341
341
  signing_key:
342
342
  specification_version: 4
343
343
  summary: An ETL Ecosystem for Derivative Processing.