derivative-rodeo 0.3.0 → 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc2eed3e32c7a4558d55e9d530b6790a5b876dcdfc4ced421cfa4894aa977d44
4
- data.tar.gz: 6e16e4bd7b9d38a1a19b1768a5cdb021c6aa946287f430c6a6c62fa26a215ca6
3
+ metadata.gz: 62872d16bfd5d73940f87d5c09f61f2a88ee67414f51905ce503f411b9b2fb37
4
+ data.tar.gz: 742d63ca02418b3453824655738e25b47d3cca918f030e2fb5db4c997d52e945
5
5
  SHA512:
6
- metadata.gz: 0ac19d20f92490eed508949b18df66ce61d0850a22a2b8b1e514673ddd447afb578e8090d4234dc0a179b85c25a145e44bce6a1e71cfe2f67d2e3b438cb4b9ff
7
- data.tar.gz: 6f503dd265243982bc9163b7fb6da42211eca3eb647b1ee9491fcbc06b373c6822222ee6d72c190f2e1bbd7ca63c8126102acd841b1e8f0240434a1af3a69a4f
6
+ metadata.gz: e43b94745f35474edf4b463cd11b8c7d7bb29391f443c7ef2a9e84966d969aa6cf7c205a92c32a03452ba28c490c790ccd05d6569fd9db59d8c119b7e38f1dde
7
+ data.tar.gz: '07962c3175aed6d77295e473ad8462d0ae634c931a2f3f5bc75be1195977dded3cec013448c9d2fe2653fa83b9d178dd3513aaa1eda8eeafe9da539e3dbf06b0'
@@ -39,7 +39,6 @@ module DerivativeRodeo
39
39
  # {Services::ConvertUriViaTemplateService} with the given
40
40
  # :preprocessed_location_template.
41
41
  def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
42
- # NOTE: Are we using this preprocessed_location_template? Wondering?
43
42
  @input_uris = Array.wrap(input_uris)
44
43
  @output_location_template = output_location_template
45
44
  @preprocessed_location_template = preprocessed_location_template
@@ -83,6 +82,7 @@ module DerivativeRodeo
83
82
  #
84
83
  # @see #build_step
85
84
  # @see #with_each_requisite_location_and_tmp_file_path
85
+ # rubocop:disable Metrics/MethodLength
86
86
  def generated_files
87
87
  # TODO: Examples please
88
88
  return @generated_files if defined?(@generated_files)
@@ -102,11 +102,16 @@ module DerivativeRodeo
102
102
  @generated_files << if generated_file.exist?
103
103
  generated_file
104
104
  else
105
+ log_message = "#{self.class}#generated_files :: " \
106
+ "input_location file_uri #{input_location.file_uri} :: " \
107
+ "Generating output_location file_uri #{generated_file.file_uri} via build_step."
108
+ logger.info(log_message)
105
109
  build_step(input_location: input_location, output_location: generated_file, input_tmp_file_path: input_tmp_file_path)
106
110
  end
107
111
  end
108
112
  @generated_files
109
113
  end
114
+ # rubocop:enable Metrics/MethodLength
110
115
 
111
116
  ##
112
117
  # @return [Array<String>]
@@ -168,20 +173,55 @@ module DerivativeRodeo
168
173
  # {#output_location_template} or {#preprocessed_location_template}.
169
174
  #
170
175
  # @see [StorageLocations::BaseLocation#exist?]
176
+ # rubocop:disable Metrics/MethodLength
177
+ # rubocop:disable Metrics/AbcSize
171
178
  def destination(input_location)
172
- output_location = input_location.derived_file_from(template: output_location_template)
179
+ output_location = input_location.derived_file_from(template: output_location_template, extension: output_extension)
173
180
 
174
- return output_location if output_location.exist?
175
- return output_location unless preprocessed_location_template
181
+ if output_location.exist?
182
+ log_message = "#{self.class}#destination :: " \
183
+ "input_location file_uri #{input_location.file_uri} :: " \
184
+ "Found output_location file_uri #{output_location.file_uri}."
185
+ logger.info(log_message)
176
186
 
177
- preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template)
187
+ return output_location
188
+ end
189
+
190
+ unless preprocessed_location_template
191
+ log_message = "#{self.class}#destination :: " \
192
+ "input_location file_uri #{input_location.file_uri} :: " \
193
+ "No preprocessed_location_template provided " \
194
+ "nor does a file exist at output_location file_uri #{output_location.file_uri};" \
195
+ " moving on to generation via #{self.class}#build_step."
196
+ logger.info(log_message)
197
+
198
+ return output_location
199
+ end
200
+
201
+ preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template, extension: output_extension)
178
202
  # We only want the location if it exists
179
- return preprocessed_location if preprocessed_location&.exist?
203
+ if preprocessed_location&.exist?
204
+ log_message = "#{self.class}#destination :: " \
205
+ "input_location file_uri #{input_location.file_uri} :: " \
206
+ "Found preprocessed_location file_uri #{preprocessed_location.file_uri}."
207
+ logger.info(log_message)
208
+
209
+ return preprocessed_location
210
+ end
211
+
212
+ log_message = "#{self.class}#destination :: " \
213
+ "input_location file_uri #{input_location.file_uri} :: " \
214
+ "No file exists at preprocessed_location file_uri #{preprocessed_location.file_uri} " \
215
+ "nor output_location file_uri #{output_location.file_uri}; " \
216
+ "moving on to generation via #{self.class}#build_step."
217
+ logger.info(log_message)
180
218
 
181
219
  # NOTE: The file does not exist at the output_location; but we pass this information along so
182
220
  # that the #build_step knows where to write the file.
183
221
  output_location
184
222
  end
223
+ # rubocop:enable Metrics/AbcSize
224
+ # rubocop:enable Metrics/MethodLength
185
225
 
186
226
  ##
187
227
  # A bit of indirection to create a common interface for running a shell command.
@@ -5,7 +5,8 @@ module DerivativeRodeo
5
5
  ##
6
6
  # Takes images and ensures that we have a monochrome derivative of those images.
7
7
  class MonochromeGenerator < BaseGenerator
8
- # TODO: Can we assume a tiff?
8
+ # @see DerivativeRodeo::Services::ConvertUriViaTemplateService for the interaction of the
9
+ # magic ".mono" suffix
9
10
  self.output_extension = 'mono.tiff'
10
11
 
11
12
  ##
@@ -52,7 +52,7 @@ module DerivativeRodeo
52
52
  # @see #existing_page_locations
53
53
  # @see .filename_for_a_derived_page_from_a_pdf?
54
54
  def image_file_basename_template(basename:)
55
- "#{basename}/pages/#{basename}--page-%d.#{output_extension}"
55
+ "#{basename}--page-%d.#{output_extension}"
56
56
  end
57
57
 
58
58
  ##
@@ -62,21 +62,21 @@ module DerivativeRodeo
62
62
  # @param input_location [StorageLocations::BaseLocation]
63
63
  #
64
64
  # @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
65
- # with :tail_glob.
65
+ # with :tail_regexp.
66
66
  #
67
67
  # @note There is relation to {Generators::BaseGenerator#destination} and this method.
68
68
  #
69
69
  # @note The tail_regexp is in relation to the {#image_file_basename_template}
70
70
  def existing_page_locations(input_location:)
71
71
  # See image_file_basename_template
72
- tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"
72
+ tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}
73
73
 
74
- output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
74
+ output_locations = input_location.derived_file_from(template: output_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
75
75
  return output_locations if output_locations.count.positive?
76
76
 
77
77
  return [] if preprocessed_location_template.blank?
78
78
 
79
- input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_glob: tail_glob)
79
+ input_location.derived_file_from(template: preprocessed_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
80
80
  end
81
81
 
82
82
  ##
@@ -101,20 +101,22 @@ module DerivativeRodeo
101
101
  def with_each_requisite_location_and_tmp_file_path
102
102
  input_files.each do |input_location|
103
103
  input_location.with_existing_tmp_path do |input_tmp_file_path|
104
- ## We want a single call for a directory listing of the image_file_basename_template
105
- generated_files = existing_page_locations(input_location: input_location)
104
+ existing_locations = existing_page_locations(input_location: input_location)
106
105
 
107
- if generated_files.count.zero?
108
- generated_files = Services::PdfSplitter.call(
106
+ if existing_locations.count.positive?
107
+ existing_locations.each do |location|
108
+ yield(location, location.file_path)
109
+ end
110
+ else
111
+ # We're going to need to create the files and "cast" them to locations.
112
+ Services::PdfSplitter.call(
109
113
  input_tmp_file_path,
110
114
  image_extension: output_extension,
111
115
  image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
112
- )
113
- end
114
-
115
- generated_files.each do |image_path|
116
- image_location = StorageLocations::FileLocation.new("file://#{image_path}")
117
- yield(image_location, image_path)
116
+ ).each do |image_path|
117
+ image_location = StorageLocations::FileLocation.new("file://#{image_path}")
118
+ yield(image_location, image_path)
119
+ end
118
120
  end
119
121
  end
120
122
  end
@@ -46,11 +46,12 @@ module DerivativeRodeo
46
46
  # from_uris: ["file:///path1/A/file.pdf", "aws:///path2/B/file.pdf"],
47
47
  # template: "file:///dest1/{{dir_parts[-1..-1]}}/{{ filename }}")
48
48
  # => ["file:///dest1/A/file.pdf", "aws:///dest1/B/file.pdf"]
49
- def self.call(from_uri:, template:, adapter: nil, separator: "/")
50
- new(from_uri: from_uri, template: template, adapter: adapter, separator: separator).call
49
+ def self.call(from_uri:, template:, adapter: nil, separator: "/", **options)
50
+ new(from_uri: from_uri, template: template, adapter: adapter, separator: separator, **options).call
51
51
  end
52
52
 
53
- def initialize(from_uri:, template:, adapter: nil, separator: "/")
53
+ # rubocop:disable Metrics/MethodLength
54
+ def initialize(from_uri:, template:, adapter: nil, separator: "/", **options)
54
55
  @from_uri = from_uri
55
56
  @template = template
56
57
  @adapter = adapter
@@ -60,12 +61,23 @@ module DerivativeRodeo
60
61
  @from_scheme, @path = uri.split("://")
61
62
  @parts = @path.split(separator)
62
63
  @dir_parts = @parts[0..-2]
63
- @filename = @parts[-1]
64
- @basename = File.basename(@filename, ".*")
65
- @extension = File.extname(@filename)
64
+ @filename = options[:filename] || @parts[-1]
65
+ @basename = options[:basename] || File.basename(@filename, ".*")
66
+
67
+ ##
68
+ # HACK: Because the HocrGenerator has `.mono.tiff` and we are not interested in carrying
69
+ # forward the `.mono` suffix as that makes it hard to find the preprocessed word
70
+ # coordinates, alto, and plain text. This ensures files derived from the .mono are findable
71
+ # in IIIF Print.
72
+ @basename = @basename.sub(/\.mono\z/, '')
73
+ @extension = options[:extension] || File.extname(@filename)
74
+ # When a generator specifies "same" we want to use the given file's extension
75
+ @extension = File.extname(@filename) if @extension == DerivativeRodeo::StorageLocations::SAME
76
+ @extension = ".#{@extension}" unless @extension.start_with?(".")
66
77
 
67
78
  @template_without_query, @template_query = template.split("?")
68
79
  end
80
+ # rubocop:enable Metrics/MethodLength
69
81
 
70
82
  def call
71
83
  to_uri = template_without_query.gsub(DIR_PARTS_REPLACEMENT_REGEXP) do |text|
@@ -101,10 +101,10 @@ module DerivativeRodeo
101
101
  # @param service [#call, Module<DerivativeRodeo::Services::ConvertUriViaTemplateService>]
102
102
  #
103
103
  # @return [StorageLocations::BaseLocation]
104
- def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService)
104
+ def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService, **options)
105
105
  # HACK: Ensuring that we have the correct scheme. Maybe this is a hack?
106
106
  from_uri = "#{scheme}://#{from_uri}" unless from_uri.start_with?("#{scheme}://")
107
- to_uri = service.call(from_uri: from_uri, template: template, adapter: self)
107
+ to_uri = service.call(from_uri: from_uri, template: template, adapter: self, **options)
108
108
  new(to_uri)
109
109
  end
110
110
 
@@ -203,25 +203,25 @@ module DerivativeRodeo
203
203
  # @return [StorageLocations::BaseLocation]
204
204
  #
205
205
  # @see DerivativeRodeo::Services::ConvertUriViaTemplateService
206
- def derived_file_from(template:)
206
+ def derived_file_from(template:, **options)
207
207
  klass = DerivativeRodeo::StorageLocations::BaseLocation.load_location(template)
208
- klass.build(from_uri: file_path, template: template)
208
+ klass.build(from_uri: file_path, template: template, **options)
209
209
  end
210
210
 
211
211
  ##
212
212
  # When you have a known location and want to check for files that are within that location,
213
- # use the {#globbed_tail_locations} method. In the case of {Generators::PdfSplitGenerator} we
213
+ # use the {#matching_locations_in_file_dir} method. In the case of {Generators::PdfSplitGenerator} we
214
214
  # need to know the path to all of the image files we "split" off of the given PDF.
215
215
  #
216
216
  # The given :tail_regexp is matched against the entries found in the directory of the
217
217
  # :file_path; only the matching entries are returned as locations.
218
218
  #
219
- # @param tail_glob [String]
219
+ # @param tail_regexp [Regexp]
220
220
  #
221
221
  # @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
222
222
  # array when there are none.
223
- def globbed_tail_locations(tail_glob:)
224
- raise NotImplementedError, "#{self.class}#globbed_locations"
223
+ def matching_locations_in_file_dir(tail_regexp:)
224
+ raise NotImplementedError, "#{self.class}#matching_locations_in_file_dir"
225
225
  end
226
226
 
227
227
  ##
@@ -231,6 +231,7 @@ module DerivativeRodeo
231
231
  def with_new_extension(extension)
232
232
  return file_path if extension == StorageLocations::SAME
233
233
 
234
+ # NOTE: May need to revisit this
234
235
  "#{file_path.split('.')[0]}.#{extension}"
235
236
  end
236
237
 
@@ -35,8 +35,16 @@ module DerivativeRodeo
35
35
  file_uri
36
36
  end
37
37
 
38
- def globbed_tail_locations(tail_glob:)
39
- Dir.glob(File.join(file_dir, tail_glob))
38
+ ##
39
+ # @return [Enumerable<DerivativeRodeo::StorageLocations::FileLocation>]
40
+ #
41
+ # @see Generators::PdfSplitGenerator#image_file_basename_template
42
+ #
43
+ # @param tail_regexp [Regexp]
44
+ def matching_locations_in_file_dir(tail_regexp:)
45
+ Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
46
+ accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
47
+ end
40
48
  end
41
49
  end
42
50
  end
@@ -65,31 +65,19 @@ module DerivativeRodeo
65
65
  ##
66
66
  # @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
67
67
  #
68
- # @note S3 allows searching on a prefix but does not allow for "wildcard" searches. We can
69
- # use the components of the file_path to fake that behavior.
68
+ # @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
70
69
  #
71
70
  # @see Generators::PdfSplitGenerator#image_file_basename_template
72
- def globbed_tail_locations(tail_glob:)
73
- # file_path = "s3://blah/1234/hello-world/pages/*.tiff"
74
- #
75
- # NOTE: Should we be storing our files as such? The pattern we need is
76
- # :parent_identifier/:file_set_identifier/files There are probably cases where a work has
77
- # more than one PDF (that we intend to split); we don't want to trample on those split files
78
- # and miscolate two PDFs.
79
- #
80
- # file_path = "s3://blah/1234/hello-world/hello-world.pdf
81
- globname = File.join(file_dir, tail_glob)
82
- regexp = %r{#{File.extname(globname)}$}
83
-
84
- # NOTE: We're making some informed guesses, needing to include the fully qualified template
85
- # based on both the key of the item in the bucket as well as the bucket's host.
71
+ #
72
+ # @param tail_regexp [Regexp]
73
+ def matching_locations_in_file_dir(tail_regexp:)
86
74
  uri = URI.parse(file_uri)
87
75
  scheme_and_host = "#{uri.scheme}://#{uri.host}"
88
76
 
89
- bucket.objects(prefix: File.dirname(globname)).flat_map do |object|
90
- if object.key.match(regexp)
77
+ bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
78
+ if tail_regexp.match(object.key)
91
79
  template = File.join(scheme_and_host, object.key)
92
- derived_file_from(template: template)
80
+ accumulator << derived_file_from(template: template)
93
81
  end
94
82
  end
95
83
  end
@@ -13,6 +13,8 @@ module DerivativeRodeo
13
13
  # Location to download and upload files to Sqs
14
14
  # It uploads a file_uri to the queue, not the contents of that file
15
15
  # reading from the queue is not currently implemented
16
+ #
17
+ # rubocop:disable Metrics/ClassLength
16
18
  class SqsLocation < BaseLocation
17
19
  ##
18
20
  # @!group Class Attributes
@@ -85,11 +87,14 @@ module DerivativeRodeo
85
87
  batch = []
86
88
  Dir.glob("#{File.dirname(tmp_file_path)}/**/**").each.with_index do |fp, i|
87
89
  batch << { id: SecureRandom.uuid, message_body: output_json("file://#{fp}") }
88
- if (i % batch_size).zero?
90
+ if (i + 1 % batch_size).zero?
89
91
  add_batch(messages: batch)
90
92
  batch = []
91
93
  end
92
94
  end
95
+
96
+ # Ensure we're flushing the batched up queue as part of completing the write.
97
+ add_batch(messages: batch) if batch.present?
93
98
  file_uri
94
99
  end
95
100
 
@@ -181,6 +186,7 @@ module DerivativeRodeo
181
186
  end
182
187
 
183
188
  def output_json(uri)
189
+ # TODO: Add ability to handle a pre-process-template given to an SQS, and pass that along to the generator when applicable.
184
190
  key = DerivativeRodeo::Services::ConvertUriViaTemplateService.call(from_uri: uri, template: template, adapter: self)
185
191
  { key => [template] }.to_json
186
192
  end
@@ -201,5 +207,6 @@ module DerivativeRodeo
201
207
  @file_uri_parts
202
208
  end
203
209
  end
210
+ # rubocop:enable Metrics/ClassLength
204
211
  end
205
212
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DerivativeRodeo
4
- VERSION = '0.3.0'
4
+ VERSION = '0.4.2'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: derivative-rodeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Kaufman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-06-05 00:00:00.000000000 Z
12
+ date: 2023-07-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport