derivative-rodeo 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/derivative_rodeo/generators/base_generator.rb +0 -1
- data/lib/derivative_rodeo/generators/pdf_split_generator.rb +17 -15
- data/lib/derivative_rodeo/storage_locations/base_location.rb +4 -4
- data/lib/derivative_rodeo/storage_locations/file_location.rb +10 -2
- data/lib/derivative_rodeo/storage_locations/s3_location.rb +7 -19
- data/lib/derivative_rodeo/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bee0908ac5d045db1948b062d3a4e1569fff41b4a5a0f1521f31b620c15c53a6
|
4
|
+
data.tar.gz: 328de6f1cde3bcdadca31a361821b2c573425d15bfddbea5bf85d9475a8a3d0e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2d8ba019ef30666d9633b73e90fe599b53d5f6698de0400b43a25c37a06470542962db8f61d74ed6a444f1ecc5ffbd604d2c0891dffcf5fa86d9dce87ace1bb5
|
7
|
+
data.tar.gz: b682377ce08e4379f1323bfdb6c6b8f46c97d45ebb392bb1dc7f5bc27f5b01701731880dc08f96cb5ef2b0da8ad39349b4534a63b4039e78616331b93bd9516c
|
@@ -39,7 +39,6 @@ module DerivativeRodeo
|
|
39
39
|
# {Services::ConvertUriViaTemplateService} with the given
|
40
40
|
# :preprocessed_location_template.
|
41
41
|
def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
|
42
|
-
# NOTE: Are we using this preprocessed_location_template? Wondering?
|
43
42
|
@input_uris = Array.wrap(input_uris)
|
44
43
|
@output_location_template = output_location_template
|
45
44
|
@preprocessed_location_template = preprocessed_location_template
|
@@ -52,7 +52,7 @@ module DerivativeRodeo
|
|
52
52
|
# @see #existing_page_locations
|
53
53
|
# @see .filename_for_a_derived_page_from_a_pdf?
|
54
54
|
def image_file_basename_template(basename:)
|
55
|
-
"#{basename}
|
55
|
+
"#{basename}--page-%d.#{output_extension}"
|
56
56
|
end
|
57
57
|
|
58
58
|
##
|
@@ -62,21 +62,21 @@ module DerivativeRodeo
|
|
62
62
|
# @param input_location [StorageLocations::BaseLocation]
|
63
63
|
#
|
64
64
|
# @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
|
65
|
-
# with :
|
65
|
+
# with :tail_regexp.
|
66
66
|
#
|
67
67
|
# @note There is relation to {Generators::BaseGenerator#destination} and this method.
|
68
68
|
#
|
69
69
|
# @note The tail_glob is in relation to the {#image_file_basename_template}
|
70
70
|
def existing_page_locations(input_location:)
|
71
71
|
# See image_file_basename_template
|
72
|
-
|
72
|
+
tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}
|
73
73
|
|
74
|
-
output_locations = input_location.derived_file_from(template: output_location_template).
|
74
|
+
output_locations = input_location.derived_file_from(template: output_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
|
75
75
|
return output_locations if output_locations.count.positive?
|
76
76
|
|
77
77
|
return [] if preprocessed_location_template.blank?
|
78
78
|
|
79
|
-
input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(
|
79
|
+
input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_regexp: tail_regexp)
|
80
80
|
end
|
81
81
|
|
82
82
|
##
|
@@ -101,20 +101,22 @@ module DerivativeRodeo
|
|
101
101
|
def with_each_requisite_location_and_tmp_file_path
|
102
102
|
input_files.each do |input_location|
|
103
103
|
input_location.with_existing_tmp_path do |input_tmp_file_path|
|
104
|
-
|
105
|
-
generated_files = existing_page_locations(input_location: input_location)
|
104
|
+
existing_locations = existing_page_locations(input_location: input_location)
|
106
105
|
|
107
|
-
if
|
108
|
-
|
106
|
+
if existing_locations.count.positive?
|
107
|
+
existing_locations.each do |location|
|
108
|
+
yield(location, location.file_path)
|
109
|
+
end
|
110
|
+
else
|
111
|
+
# We're going to need to create the files and "cast" them to locations.
|
112
|
+
Services::PdfSplitter.call(
|
109
113
|
input_tmp_file_path,
|
110
114
|
image_extension: output_extension,
|
111
115
|
image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
|
112
|
-
)
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
117
|
-
yield(image_location, image_path)
|
116
|
+
).each do |image_path|
|
117
|
+
image_location = StorageLocations::FileLocation.new("file://#{image_path}")
|
118
|
+
yield(image_location, image_path)
|
119
|
+
end
|
118
120
|
end
|
119
121
|
end
|
120
122
|
end
|
@@ -210,18 +210,18 @@ module DerivativeRodeo
|
|
210
210
|
|
211
211
|
##
|
212
212
|
# When you have a known location and want to check for files that are within that location,
|
213
|
-
# use the {#
|
213
|
+
# use the {#matching_locations_in_file_dir} method. In the case of {Generators::PdfSplitGenerator} we
|
214
214
|
# need to know the path to all of the image files we "split" off of the given PDF.
|
215
215
|
#
|
216
216
|
# We can use the :file_path as the prefix the given :tail_glob as the suffix for a "fully
|
217
217
|
# qualified" Dir.glob type search.
|
218
218
|
#
|
219
|
-
# @param
|
219
|
+
# @param tail_regexp [Regexp]
|
220
220
|
#
|
221
221
|
# @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
|
222
222
|
# array when there are none.
|
223
|
-
def
|
224
|
-
raise NotImplementedError, "#{self.class}#
|
223
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
224
|
+
raise NotImplementedError, "#{self.class}#matching_locations_in_file_dir"
|
225
225
|
end
|
226
226
|
|
227
227
|
##
|
@@ -35,8 +35,16 @@ module DerivativeRodeo
|
|
35
35
|
file_uri
|
36
36
|
end
|
37
37
|
|
38
|
-
|
39
|
-
|
38
|
+
##
|
39
|
+
# @return [Enumerable<DerivativeRodeo::StorageLocations::FileLocation>]
|
40
|
+
#
|
41
|
+
# @see Generators::PdfSplitGenerator#image_file_basename_template
|
42
|
+
#
|
43
|
+
# @param tail_regexp [Regexp]
|
44
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
45
|
+
Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
|
46
|
+
accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
|
47
|
+
end
|
40
48
|
end
|
41
49
|
end
|
42
50
|
end
|
@@ -65,31 +65,19 @@ module DerivativeRodeo
|
|
65
65
|
##
|
66
66
|
# @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
|
67
67
|
#
|
68
|
-
# @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
|
69
|
-
# use the components of the file_path to fake that behavior.
|
68
|
+
# @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
|
70
69
|
#
|
71
70
|
# @see Generators::PdfSplitGenerator#image_file_basename_template
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
# NOTE: Should we be storing our files as such? The pattern we need is
|
76
|
-
# :parent_identifier/:file_set_identifier/files There are probably cases where a work has
|
77
|
-
# more than one PDF (that we intend to split); we don't want to trample on those split files
|
78
|
-
# and miscolate two PDFs.
|
79
|
-
#
|
80
|
-
# file_path = "s3://blah/1234/hello-world/hello-world.pdf
|
81
|
-
globname = File.join(file_dir, tail_glob)
|
82
|
-
regexp = %r{#{File.extname(globname)}$}
|
83
|
-
|
84
|
-
# NOTE: We're making some informed guesses, needing to include the fully qualified template
|
85
|
-
# based on both the key of the item in the bucket as well as the bucket's host.
|
71
|
+
#
|
72
|
+
# @param tail_regexp [Regexp]
|
73
|
+
def matching_locations_in_file_dir(tail_regexp:)
|
86
74
|
uri = URI.parse(file_uri)
|
87
75
|
scheme_and_host = "#{uri.scheme}://#{uri.host}"
|
88
76
|
|
89
|
-
bucket.objects(prefix:
|
90
|
-
if object.key
|
77
|
+
bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
|
78
|
+
if tail_regexp.match(object.key)
|
91
79
|
template = File.join(scheme_and_host, object.key)
|
92
|
-
derived_file_from(template: template)
|
80
|
+
accumulator << derived_file_from(template: template)
|
93
81
|
end
|
94
82
|
end
|
95
83
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: derivative-rodeo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-06-
|
12
|
+
date: 2023-06-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
@@ -337,7 +337,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
337
337
|
- !ruby/object:Gem::Version
|
338
338
|
version: '0'
|
339
339
|
requirements: []
|
340
|
-
rubygems_version: 3.
|
340
|
+
rubygems_version: 3.1.6
|
341
341
|
signing_key:
|
342
342
|
specification_version: 4
|
343
343
|
summary: An ETL Ecosystem for Derivative Processing.
|