derivative-rodeo 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc2eed3e32c7a4558d55e9d530b6790a5b876dcdfc4ced421cfa4894aa977d44
4
- data.tar.gz: 6e16e4bd7b9d38a1a19b1768a5cdb021c6aa946287f430c6a6c62fa26a215ca6
3
+ metadata.gz: bee0908ac5d045db1948b062d3a4e1569fff41b4a5a0f1521f31b620c15c53a6
4
+ data.tar.gz: 328de6f1cde3bcdadca31a361821b2c573425d15bfddbea5bf85d9475a8a3d0e
5
5
  SHA512:
6
- metadata.gz: 0ac19d20f92490eed508949b18df66ce61d0850a22a2b8b1e514673ddd447afb578e8090d4234dc0a179b85c25a145e44bce6a1e71cfe2f67d2e3b438cb4b9ff
7
- data.tar.gz: 6f503dd265243982bc9163b7fb6da42211eca3eb647b1ee9491fcbc06b373c6822222ee6d72c190f2e1bbd7ca63c8126102acd841b1e8f0240434a1af3a69a4f
6
+ metadata.gz: 2d8ba019ef30666d9633b73e90fe599b53d5f6698de0400b43a25c37a06470542962db8f61d74ed6a444f1ecc5ffbd604d2c0891dffcf5fa86d9dce87ace1bb5
7
+ data.tar.gz: b682377ce08e4379f1323bfdb6c6b8f46c97d45ebb392bb1dc7f5bc27f5b01701731880dc08f96cb5ef2b0da8ad39349b4534a63b4039e78616331b93bd9516c
@@ -39,7 +39,6 @@ module DerivativeRodeo
39
39
  # {Services::ConvertUriViaTemplateService} with the given
40
40
  # :preprocessed_location_template.
41
41
  def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
42
- # NOTE: Are we using this preprocessed_location_template? Wondering?
43
42
  @input_uris = Array.wrap(input_uris)
44
43
  @output_location_template = output_location_template
45
44
  @preprocessed_location_template = preprocessed_location_template
@@ -52,7 +52,7 @@ module DerivativeRodeo
52
52
  # @see #existing_page_locations
53
53
  # @see .filename_for_a_derived_page_from_a_pdf?
54
54
  def image_file_basename_template(basename:)
55
- "#{basename}/pages/#{basename}--page-%d.#{output_extension}"
55
+ "#{basename}--page-%d.#{output_extension}"
56
56
  end
57
57
 
58
58
  ##
@@ -62,21 +62,21 @@ module DerivativeRodeo
62
62
  # @param input_location [StorageLocations::BaseLocation]
63
63
  #
64
64
  # @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
65
- # with :tail_glob.
65
+ # with :tail_regexp.
66
66
  #
67
67
  # @note There is relation to {Generators::BaseGenerator#destination} and this method.
68
68
  #
69
69
  # @note The tail_regexp is in relation to the {#image_file_basename_template}
70
70
  def existing_page_locations(input_location:)
71
71
  # See image_file_basename_template
72
- tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"
72
+ tail_regexp = %r{#{Regexp.escape(input_location.file_basename)}--page-\d+\.#{Regexp.escape(output_extension)}\z} # escape so names match literally
73
73
 
74
- output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
74
+ output_locations = input_location.derived_file_from(template: output_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
75
75
  return output_locations if output_locations.count.positive?
76
76
 
77
77
  return [] if preprocessed_location_template.blank?
78
78
 
79
- input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_glob: tail_glob)
79
+ input_location.derived_file_from(template: preprocessed_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
80
80
  end
81
81
 
82
82
  ##
@@ -101,20 +101,22 @@ module DerivativeRodeo
101
101
  def with_each_requisite_location_and_tmp_file_path
102
102
  input_files.each do |input_location|
103
103
  input_location.with_existing_tmp_path do |input_tmp_file_path|
104
- ## We want a single call for a directory listing of the image_file_basename_template
105
- generated_files = existing_page_locations(input_location: input_location)
104
+ existing_locations = existing_page_locations(input_location: input_location)
106
105
 
107
- if generated_files.count.zero?
108
- generated_files = Services::PdfSplitter.call(
106
+ if existing_locations.count.positive?
107
+ existing_locations.each do |location|
108
+ yield(location, location.file_path)
109
+ end
110
+ else
111
+ # We're going to need to create the files and "cast" them to locations.
112
+ Services::PdfSplitter.call(
109
113
  input_tmp_file_path,
110
114
  image_extension: output_extension,
111
115
  image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
112
- )
113
- end
114
-
115
- generated_files.each do |image_path|
116
- image_location = StorageLocations::FileLocation.new("file://#{image_path}")
117
- yield(image_location, image_path)
116
+ ).each do |image_path|
117
+ image_location = StorageLocations::FileLocation.new("file://#{image_path}")
118
+ yield(image_location, image_path)
119
+ end
118
120
  end
119
121
  end
120
122
  end
@@ -210,18 +210,18 @@ module DerivativeRodeo
210
210
 
211
211
  ##
212
212
  # When you have a known location and want to check for files that are within that location,
213
- # use the {#globbed_tail_locations} method. In the case of {Generators::PdfSplitGenerator} we
213
+ # use the {#matching_locations_in_file_dir} method. In the case of {Generators::PdfSplitGenerator} we
214
214
  # need to know the path to all of the image files we "split" off of the given PDF.
215
215
  #
216
216
  # We look in the directory of the :file_path and keep the entries that match the given
217
217
  # :tail_regexp; this is a regular expression match, not a Dir.glob pattern.
218
218
  #
219
- # @param tail_glob [String]
219
+ # @param tail_regexp [Regexp]
220
220
  #
221
221
  # @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
222
222
  # array when there are none.
223
- def globbed_tail_locations(tail_glob:)
224
- raise NotImplementedError, "#{self.class}#globbed_locations"
223
+ def matching_locations_in_file_dir(tail_regexp:)
224
+ raise NotImplementedError, "#{self.class}#matching_locations_in_file_dir"
225
225
  end
226
226
 
227
227
  ##
@@ -35,8 +35,16 @@ module DerivativeRodeo
35
35
  file_uri
36
36
  end
37
37
 
38
- def globbed_tail_locations(tail_glob:)
39
- Dir.glob(File.join(file_dir, tail_glob))
38
+ ##
39
+ # @return [Enumerable<DerivativeRodeo::StorageLocations::FileLocation>]
40
+ #
41
+ # @see Generators::PdfSplitGenerator#image_file_basename_template
42
+ #
43
+ # @param tail_regexp [Regexp]
44
+ def matching_locations_in_file_dir(tail_regexp:)
45
+ Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
46
+ accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
47
+ end
40
48
  end
41
49
  end
42
50
  end
@@ -65,31 +65,19 @@ module DerivativeRodeo
65
65
  ##
66
66
  # @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
67
67
  #
68
- # @note S3 allows searching on a prefix but does not allow for "wildcard" searches. We can
69
- # use the components of the file_path to fake that behavior.
68
+ # @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
70
69
  #
71
70
  # @see Generators::PdfSplitGenerator#image_file_basename_template
72
- def globbed_tail_locations(tail_glob:)
73
- # file_path = "s3://blah/1234/hello-world/pages/*.tiff"
74
- #
75
- # NOTE: Should we be storing our files as such? The pattern we need is
76
- # :parent_identifier/:file_set_identifier/files There are probably cases where a work has
77
- # more than one PDF (that we intend to split); we don't want to trample on those split files
78
- # and miscolate two PDFs.
79
- #
80
- # file_path = "s3://blah/1234/hello-world/hello-world.pdf
81
- globname = File.join(file_dir, tail_glob)
82
- regexp = %r{#{File.extname(globname)}$}
83
-
84
- # NOTE: We're making some informed guesses, needing to include the fully qualified template
85
- # based on both the key of the item in the bucket as well as the bucket's host.
71
+ #
72
+ # @param tail_regexp [Regexp]
73
+ def matching_locations_in_file_dir(tail_regexp:)
86
74
  uri = URI.parse(file_uri)
87
75
  scheme_and_host = "#{uri.scheme}://#{uri.host}"
88
76
 
89
- bucket.objects(prefix: File.dirname(globname)).flat_map do |object|
90
- if object.key.match(regexp)
77
+ bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
78
+ if tail_regexp.match(object.key)
91
79
  template = File.join(scheme_and_host, object.key)
92
- derived_file_from(template: template)
80
+ accumulator << derived_file_from(template: template)
93
81
  end
94
82
  end
95
83
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DerivativeRodeo
4
- VERSION = '0.3.0'
4
+ VERSION = '0.4.0'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: derivative-rodeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Kaufman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-06-05 00:00:00.000000000 Z
12
+ date: 2023-06-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport
@@ -337,7 +337,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
337
337
  - !ruby/object:Gem::Version
338
338
  version: '0'
339
339
  requirements: []
340
- rubygems_version: 3.3.7
340
+ rubygems_version: 3.1.6
341
341
  signing_key:
342
342
  specification_version: 4
343
343
  summary: An ETL Ecosystem for Derivative Processing.