derivative-rodeo 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/derivative_rodeo/generators/base_generator.rb +0 -1
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +17 -15
- data/lib/derivative_rodeo/storage_locations/base_location.rb +4 -4
- data/lib/derivative_rodeo/storage_locations/file_location.rb +10 -2
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +7 -19
- data/lib/derivative_rodeo/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bee0908ac5d045db1948b062d3a4e1569fff41b4a5a0f1521f31b620c15c53a6
|
4
|
+
data.tar.gz: 328de6f1cde3bcdadca31a361821b2c573425d15bfddbea5bf85d9475a8a3d0e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2d8ba019ef30666d9633b73e90fe599b53d5f6698de0400b43a25c37a06470542962db8f61d74ed6a444f1ecc5ffbd604d2c0891dffcf5fa86d9dce87ace1bb5
|
7
|
+
data.tar.gz: b682377ce08e4379f1323bfdb6c6b8f46c97d45ebb392bb1dc7f5bc27f5b01701731880dc08f96cb5ef2b0da8ad39349b4534a63b4039e78616331b93bd9516c
|
@@ -39,7 +39,6 @@ module DerivativeRodeo
|
|
39
39
|
# {Services::ConvertUriViaTemplateService} with the given
|
40
40
|
# :preprocessed_location_template.
|
41
41
|
def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
|
42
|
-
# NOTE: Are we using this preprocessed_location_template? Wondering?
|
43
42
|
@input_uris = Array.wrap(input_uris)
|
44
43
|
@output_location_template = output_location_template
|
45
44
|
@preprocessed_location_template = preprocessed_location_template
|
@@ -52,7 +52,7 @@ module DerivativeRodeo
|
|
52
52
|
# @see #existing_page_locations
|
53
53
|
# @see .filename_for_a_derived_page_from_a_pdf?
|
54
54
|
def image_file_basename_template(basename:)
|
55
|
-
"#{basename}
|
55
|
+
"#{basename}--page-%d.#{output_extension}"
|
56
56
|
end
|
57
57
|
|
58
58
|
##
|
@@ -62,21 +62,21 @@ module DerivativeRodeo
|
|
62
62
|
# @param input_location [StorageLocations::BaseLocation]
|
63
63
|
#
|
64
64
|
# @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
|
65
|
-
# with :
|
65
|
+
# with :tail_regexp.
|
66
66
|
#
|
67
67
|
# @note There is a relation between {Generators::BaseGenerator#destination} and this method.
|
68
68
|
#
|
69
69
|
# @note The tail_regexp is in relation to the {#image_file_basename_template}
|
70
70
|
def existing_page_locations(input_location:)
|
71
71
|
# See image_file_basename_template
|
72
|
-
|
72
|
+
tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}
|
73
73
|
|
74
|
-
output_locations = input_location.derived_file_from(template: output_location_template).
|
74
|
+
output_locations = input_location.derived_file_from(template: output_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
|
75
75
|
return output_locations if output_locations.count.positive?
|
76
76
|
|
77
77
|
return [] if preprocessed_location_template.blank?
|
78
78
|
|
79
|
-
input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(
|
79
|
+
input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_regexp: tail_regexp)
|
80
80
|
end
|
81
81
|
|
82
82
|
##
|
@@ -101,20 +101,22 @@ module DerivativeRodeo
|
|
101
101
|
def with_each_requisite_location_and_tmp_file_path
|
102
102
|
input_files.each do |input_location|
|
103
103
|
input_location.with_existing_tmp_path do |input_tmp_file_path|
|
104
|
-
|
105
|
-
generated_files = existing_page_locations(input_location: input_location)
|
104
|
+
existing_locations = existing_page_locations(input_location: input_location)
|
106
105
|
|
107
|
-
if
|
108
|
-
|
106
|
+
if existing_locations.count.positive?
|
107
|
+
existing_locations.each do |location|
|
108
|
+
yield(location, location.file_path)
|
109
|
+
end
|
110
|
+
else
|
111
|
+
# We're going to need to create the files and "cast" them to locations.
|
112
|
+
Services::PdfSplitter.call(
|
109
113
|
input_tmp_file_path,
|
110
114
|
image_extension: output_extension,
|
111
115
|
image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
|
112
|
-
)
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
117
|
-
yield(image_location, image_path)
|
116
|
+
).each do |image_path|
|
117
|
+
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
118
|
+
yield(image_location, image_path)
|
119
|
+
end
|
118
120
|
end
|
119
121
|
end
|
120
122
|
end
|
@@ -210,18 +210,18 @@ module DerivativeRodeo
|
|
210
210
|
|
211
211
|
##
|
212
212
|
# When you have a known location and want to check for files that are within that location,
|
213
|
-
# use the {#
|
213
|
+
# use the {#matching_locations_in_file_dir} method. In the case of {Generators::PdfSplitGenerator} we
|
214
214
|
# need to know the path to all of the image files we "split" off of the given PDF.
|
215
215
|
#
|
216
216
|
# We can use the :file_path as the prefix and the given :tail_regexp as the suffix for a "fully
|
217
217
|
# qualified" Dir.glob type search.
|
218
218
|
#
|
219
|
-
# @param
|
219
|
+
# @param tail_regexp [Regexp]
|
220
220
|
#
|
221
221
|
# @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
|
222
222
|
# array when there are none.
|
223
|
-
def
|
224
|
-
raise NotImplementedError, "#{self.class}#
|
223
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
224
|
+
raise NotImplementedError, "#{self.class}#matching_locations_in_file_dir"
|
225
225
|
end
|
226
226
|
|
227
227
|
##
|
@@ -35,8 +35,16 @@ module DerivativeRodeo
|
|
35
35
|
file_uri
|
36
36
|
end
|
37
37
|
|
38
|
-
|
39
|
-
|
38
|
+
##
|
39
|
+
# @return [Enumerable<DerivativeRodeo::StorageLocations::FileLocation>]
|
40
|
+
#
|
41
|
+
# @see Generators::PdfSplitGenerator#image_file_basename_template
|
42
|
+
#
|
43
|
+
# @param tail_regexp [Regexp]
|
44
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
45
|
+
Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
|
46
|
+
accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
|
47
|
+
end
|
40
48
|
end
|
41
49
|
end
|
42
50
|
end
|
@@ -65,31 +65,19 @@ module DerivativeRodeo
|
|
65
65
|
##
|
66
66
|
# @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
|
67
67
|
#
|
68
|
-
# @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
|
69
|
-
# use the components of the file_path to fake that behavior.
|
68
|
+
# @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
|
70
69
|
#
|
71
70
|
# @see Generators::PdfSplitGenerator#image_file_basename_template
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
# NOTE: Should we be storing our files as such? The pattern we need is
|
76
|
-
# :parent_identifier/:file_set_identifier/files There are probably cases where a work has
|
77
|
-
# more than one PDF (that we intend to split); we don't want to trample on those split files
|
78
|
-
# and miscolate two PDFs.
|
79
|
-
#
|
80
|
-
# file_path = "s3://blah/1234/hello-world/hello-world.pdf
|
81
|
-
globname = File.join(file_dir, tail_glob)
|
82
|
-
regexp = %r{#{File.extname(globname)}$}
|
83
|
-
|
84
|
-
# NOTE: We're making some informed guesses, needing to include the fully qualified template
|
85
|
-
# based on both the key of the item in the bucket as well as the bucket's host.
|
71
|
+
#
|
72
|
+
# @param tail_regexp [Regexp]
|
73
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
86
74
|
uri = URI.parse(file_uri)
|
87
75
|
scheme_and_host = "#{uri.scheme}://#{uri.host}"
|
88
76
|
|
89
|
-
bucket.objects(prefix:
|
90
|
-
if object.key
|
77
|
+
bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
|
78
|
+
if tail_regexp.match(object.key)
|
91
79
|
template = File.join(scheme_and_host, object.key)
|
92
|
-
derived_file_from(template: template)
|
80
|
+
accumulator << derived_file_from(template: template)
|
93
81
|
end
|
94
82
|
end
|
95
83
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: derivative-rodeo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-06-
|
12
|
+
date: 2023-06-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
@@ -337,7 +337,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
337
337
|
- !ruby/object:Gem::Version
|
338
338
|
version: '0'
|
339
339
|
requirements: []
|
340
|
-
rubygems_version: 3.
|
340
|
+
rubygems_version: 3.1.6
|
341
341
|
signing_key:
|
342
342
|
specification_version: 4
|
343
343
|
summary: An ETL Ecosystem for Derivative Processing.
|