derivative-rodeo 0.3.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc2eed3e32c7a4558d55e9d530b6790a5b876dcdfc4ced421cfa4894aa977d44
4
- data.tar.gz: 6e16e4bd7b9d38a1a19b1768a5cdb021c6aa946287f430c6a6c62fa26a215ca6
3
+ metadata.gz: 62872d16bfd5d73940f87d5c09f61f2a88ee67414f51905ce503f411b9b2fb37
4
+ data.tar.gz: 742d63ca02418b3453824655738e25b47d3cca918f030e2fb5db4c997d52e945
5
5
  SHA512:
6
- metadata.gz: 0ac19d20f92490eed508949b18df66ce61d0850a22a2b8b1e514673ddd447afb578e8090d4234dc0a179b85c25a145e44bce6a1e71cfe2f67d2e3b438cb4b9ff
7
- data.tar.gz: 6f503dd265243982bc9163b7fb6da42211eca3eb647b1ee9491fcbc06b373c6822222ee6d72c190f2e1bbd7ca63c8126102acd841b1e8f0240434a1af3a69a4f
6
+ metadata.gz: e43b94745f35474edf4b463cd11b8c7d7bb29391f443c7ef2a9e84966d969aa6cf7c205a92c32a03452ba28c490c790ccd05d6569fd9db59d8c119b7e38f1dde
7
+ data.tar.gz: '07962c3175aed6d77295e473ad8462d0ae634c931a2f3f5bc75be1195977dded3cec013448c9d2fe2653fa83b9d178dd3513aaa1eda8eeafe9da539e3dbf06b0'
@@ -39,7 +39,6 @@ module DerivativeRodeo
39
39
  # {Services::ConvertUriViaTemplateService} with the given
40
40
  # :preprocessed_location_template.
41
41
  def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
42
- # NOTE: Are we using this preprocessed_location_template? Wondering?
43
42
  @input_uris = Array.wrap(input_uris)
44
43
  @output_location_template = output_location_template
45
44
  @preprocessed_location_template = preprocessed_location_template
@@ -83,6 +82,7 @@ module DerivativeRodeo
83
82
  #
84
83
  # @see #build_step
85
84
  # @see #with_each_requisite_location_and_tmp_file_path
85
+ # rubocop:disable Metrics/MethodLength
86
86
  def generated_files
87
87
  # TODO: Examples please
88
88
  return @generated_files if defined?(@generated_files)
@@ -102,11 +102,16 @@ module DerivativeRodeo
102
102
  @generated_files << if generated_file.exist?
103
103
  generated_file
104
104
  else
105
+ log_message = "#{self.class}#generated_files :: " \
106
+ "input_location file_uri #{input_location.file_uri} :: " \
107
+ "Generating output_location file_uri #{generated_file.file_uri} via build_step."
108
+ logger.info(log_message)
105
109
  build_step(input_location: input_location, output_location: generated_file, input_tmp_file_path: input_tmp_file_path)
106
110
  end
107
111
  end
108
112
  @generated_files
109
113
  end
114
+ # rubocop:enable Metrics/MethodLength
110
115
 
111
116
  ##
112
117
  # @return [Array<String>]
@@ -168,20 +173,55 @@ module DerivativeRodeo
168
173
  # {#output_location_template} or {#preprocessed_location_template}.
169
174
  #
170
175
  # @see [StorageLocations::BaseLocation#exist?]
176
+ # rubocop:disable Metrics/MethodLength
177
+ # rubocop:disable Metrics/AbcSize
171
178
  def destination(input_location)
172
- output_location = input_location.derived_file_from(template: output_location_template)
179
+ output_location = input_location.derived_file_from(template: output_location_template, extension: output_extension)
173
180
 
174
- return output_location if output_location.exist?
175
- return output_location unless preprocessed_location_template
181
+ if output_location.exist?
182
+ log_message = "#{self.class}#destination :: " \
183
+ "input_location file_uri #{input_location.file_uri} :: " \
184
+ "Found output_location file_uri #{output_location.file_uri}."
185
+ logger.info(log_message)
176
186
 
177
- preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template)
187
+ return output_location
188
+ end
189
+
190
+ unless preprocessed_location_template
191
+ log_message = "#{self.class}#destination :: " \
192
+ "input_location file_uri #{input_location.file_uri} :: " \
193
+ "No preprocessed_location_template provided " \
194
+ "nor does a file exist at output_location file_uri #{output_location.file_uri};" \
195
+ " moving on to generation via #{self.class}#build_step."
196
+ logger.info(log_message)
197
+
198
+ return output_location
199
+ end
200
+
201
+ preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template, extension: output_extension)
178
202
  # We only want the location if it exists
179
- return preprocessed_location if preprocessed_location&.exist?
203
+ if preprocessed_location&.exist?
204
+ log_message = "#{self.class}#destination :: " \
205
+ "input_location file_uri #{input_location.file_uri} :: " \
206
+ "Found preprocessed_location file_uri #{preprocessed_location.file_uri}."
207
+ logger.info(log_message)
208
+
209
+ return preprocessed_location
210
+ end
211
+
212
+ log_message = "#{self.class}#destination :: " \
213
+ "input_location file_uri #{input_location.file_uri} :: " \
214
+ "No file exists at preprocessed_location file_uri #{preprocessed_location.file_uri} " \
215
+ "nor output_location file_uri #{output_location.file_uri}; " \
216
+ "moving on to generation via #{self.class}#build_step."
217
+ logger.info(log_message)
180
218
 
181
219
  # NOTE: The file does not exist at the output_location; but we pass this information along so
182
220
  # that the #build_step knows where to write the file.
183
221
  output_location
184
222
  end
223
+ # rubocop:enable Metrics/AbcSize
224
+ # rubocop:enable Metrics/MethodLength
185
225
 
186
226
  ##
187
227
  # A bit of indirection to create a common interface for running a shell command.
@@ -5,7 +5,8 @@ module DerivativeRodeo
5
5
  ##
6
6
  # Takes images and ensures that we have a monochrome derivative of those images.
7
7
  class MonochromeGenerator < BaseGenerator
8
- # TODO: Can we assume a tiff?
8
+ # @see DerivativeRodeo::Services::ConvertUriViaTemplateService for the interaction of the
9
+ # magic ".mono" suffix
9
10
  self.output_extension = 'mono.tiff'
10
11
 
11
12
  ##
@@ -52,7 +52,7 @@ module DerivativeRodeo
52
52
  # @see #existing_page_locations
53
53
  # @see .filename_for_a_derived_page_from_a_pdf?
54
54
  def image_file_basename_template(basename:)
55
- "#{basename}/pages/#{basename}--page-%d.#{output_extension}"
55
+ "#{basename}--page-%d.#{output_extension}"
56
56
  end
57
57
 
58
58
  ##
@@ -62,21 +62,21 @@ module DerivativeRodeo
62
62
  # @param input_location [StorageLocations::BaseLocation]
63
63
  #
64
64
  # @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
65
- # with :tail_glob.
65
+ # with :tail_regexp.
66
66
  #
67
67
  # @note There is relation to {Generators::BaseGenerator#destination} and this method.
68
68
  #
69
69
  # @note The tail_regexp is in relation to the {#image_file_basename_template}
70
70
  def existing_page_locations(input_location:)
71
71
  # See image_file_basename_template
72
- tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"
72
+ tail_regexp = %r{#{Regexp.escape(input_location.file_basename)}--page-\d+\.#{Regexp.escape(output_extension)}$}
73
73
 
74
- output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
74
+ output_locations = input_location.derived_file_from(template: output_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
75
75
  return output_locations if output_locations.count.positive?
76
76
 
77
77
  return [] if preprocessed_location_template.blank?
78
78
 
79
- input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_glob: tail_glob)
79
+ input_location.derived_file_from(template: preprocessed_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
80
80
  end
81
81
 
82
82
  ##
@@ -101,20 +101,22 @@ module DerivativeRodeo
101
101
  def with_each_requisite_location_and_tmp_file_path
102
102
  input_files.each do |input_location|
103
103
  input_location.with_existing_tmp_path do |input_tmp_file_path|
104
- ## We want a single call for a directory listing of the image_file_basename_template
105
- generated_files = existing_page_locations(input_location: input_location)
104
+ existing_locations = existing_page_locations(input_location: input_location)
106
105
 
107
- if generated_files.count.zero?
108
- generated_files = Services::PdfSplitter.call(
106
+ if existing_locations.count.positive?
107
+ existing_locations.each do |location|
108
+ yield(location, location.file_path)
109
+ end
110
+ else
111
+ # We're going to need to create the files and "cast" them to locations.
112
+ Services::PdfSplitter.call(
109
113
  input_tmp_file_path,
110
114
  image_extension: output_extension,
111
115
  image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
112
- )
113
- end
114
-
115
- generated_files.each do |image_path|
116
- image_location = StorageLocations::FileLocation.new("file://#{image_path}")
117
- yield(image_location, image_path)
116
+ ).each do |image_path|
117
+ image_location = StorageLocations::FileLocation.new("file://#{image_path}")
118
+ yield(image_location, image_path)
119
+ end
118
120
  end
119
121
  end
120
122
  end
@@ -46,11 +46,12 @@ module DerivativeRodeo
46
46
  # from_uris: ["file:///path1/A/file.pdf", "aws:///path2/B/file.pdf"],
47
47
  # template: "file:///dest1/{{dir_parts[-1..-1]}}/{{ filename }}")
48
48
  # => ["file:///dest1/A/file.pdf", "aws:///dest1/B/file.pdf"]
49
- def self.call(from_uri:, template:, adapter: nil, separator: "/")
50
- new(from_uri: from_uri, template: template, adapter: adapter, separator: separator).call
49
+ def self.call(from_uri:, template:, adapter: nil, separator: "/", **options)
50
+ new(from_uri: from_uri, template: template, adapter: adapter, separator: separator, **options).call
51
51
  end
52
52
 
53
- def initialize(from_uri:, template:, adapter: nil, separator: "/")
53
+ # rubocop:disable Metrics/MethodLength
54
+ def initialize(from_uri:, template:, adapter: nil, separator: "/", **options)
54
55
  @from_uri = from_uri
55
56
  @template = template
56
57
  @adapter = adapter
@@ -60,12 +61,23 @@ module DerivativeRodeo
60
61
  @from_scheme, @path = uri.split("://")
61
62
  @parts = @path.split(separator)
62
63
  @dir_parts = @parts[0..-2]
63
- @filename = @parts[-1]
64
- @basename = File.basename(@filename, ".*")
65
- @extension = File.extname(@filename)
64
+ @filename = options[:filename] || @parts[-1]
65
+ @basename = options[:basename] || File.basename(@filename, ".*")
66
+
67
+ ##
68
+ # HACK: Because the HocrGenerator has `.mono.tiff` and we are not interested in carrying
69
+ # forward the `.mono` suffix as that makes it hard to find the preprocessed word
70
+ # coordinates, alto, and plain text. This ensures files derived from the .mono are findable
71
+ # in IIIF Print.
72
+ @basename = @basename.sub(/\.mono\z/, '')
73
+ @extension = options[:extension] || File.extname(@filename)
74
+ # When a generator specifies "same" we want to use the given file's extension
75
+ @extension = File.extname(@filename) if @extension == DerivativeRodeo::StorageLocations::SAME
76
+ @extension = ".#{@extension}" unless @extension.start_with?(".")
66
77
 
67
78
  @template_without_query, @template_query = template.split("?")
68
79
  end
80
+ # rubocop:enable Metrics/MethodLength
69
81
 
70
82
  def call
71
83
  to_uri = template_without_query.gsub(DIR_PARTS_REPLACEMENT_REGEXP) do |text|
@@ -101,10 +101,10 @@ module DerivativeRodeo
101
101
  # @param service [#call, Module<DerivativeRodeo::Services::ConvertUriViaTemplateService>]
102
102
  #
103
103
  # @return [StorageLocations::BaseLocation]
104
- def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService)
104
+ def self.build(from_uri:, template:, service: DerivativeRodeo::Services::ConvertUriViaTemplateService, **options)
105
105
  # HACK: Ensuring that we have the correct scheme. Maybe this is a hack?
106
106
  from_uri = "#{scheme}://#{from_uri}" unless from_uri.start_with?("#{scheme}://")
107
- to_uri = service.call(from_uri: from_uri, template: template, adapter: self)
107
+ to_uri = service.call(from_uri: from_uri, template: template, adapter: self, **options)
108
108
  new(to_uri)
109
109
  end
110
110
 
@@ -203,25 +203,25 @@ module DerivativeRodeo
203
203
  # @return [StorageLocations::BaseLocation]
204
204
  #
205
205
  # @see DerivativeRodeo::Services::ConvertUriViaTemplateService
206
- def derived_file_from(template:)
206
+ def derived_file_from(template:, **options)
207
207
  klass = DerivativeRodeo::StorageLocations::BaseLocation.load_location(template)
208
- klass.build(from_uri: file_path, template: template)
208
+ klass.build(from_uri: file_path, template: template, **options)
209
209
  end
210
210
 
211
211
  ##
212
212
  # When you have a known location and want to check for files that are within that location,
213
- # use the {#globbed_tail_locations} method. In the case of {Generators::PdfSplitGenerator} we
213
+ # use the {#matching_locations_in_file_dir} method. In the case of {Generators::PdfSplitGenerator} we
214
214
  # need to know the path to all of the image files we "split" off of the given PDF.
215
215
  #
216
216
  # Implementations list the entries in the directory of the :file_path and keep those that
217
217
  # match the given :tail_regexp.
218
218
  #
219
- # @param tail_glob [String]
219
+ # @param tail_regexp [Regexp]
220
220
  #
221
221
  # @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
222
222
  # array when there are none.
223
- def globbed_tail_locations(tail_glob:)
224
- raise NotImplementedError, "#{self.class}#globbed_locations"
223
+ def matching_locations_in_file_dir(tail_regexp:)
224
+ raise NotImplementedError, "#{self.class}#matching_locations_in_file_dir"
225
225
  end
226
226
 
227
227
  ##
@@ -231,6 +231,7 @@ module DerivativeRodeo
231
231
  def with_new_extension(extension)
232
232
  return file_path if extension == StorageLocations::SAME
233
233
 
234
+ # NOTE: May need to revisit this; splitting on '.' keeps only the text before the first dot anywhere in the path.
234
235
  "#{file_path.split('.')[0]}.#{extension}"
235
236
  end
236
237
 
@@ -35,8 +35,16 @@ module DerivativeRodeo
35
35
  file_uri
36
36
  end
37
37
 
38
- def globbed_tail_locations(tail_glob:)
39
- Dir.glob(File.join(file_dir, tail_glob))
38
+ ##
39
+ # @return [Enumerable<DerivativeRodeo::StorageLocations::FileLocation>]
40
+ #
41
+ # @see Generators::PdfSplitGenerator#image_file_basename_template
42
+ #
43
+ # @param tail_regexp [Regexp]
44
+ def matching_locations_in_file_dir(tail_regexp:)
45
+ Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
46
+ accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
47
+ end
40
48
  end
41
49
  end
42
50
  end
@@ -65,31 +65,19 @@ module DerivativeRodeo
65
65
  ##
66
66
  # @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
67
67
  #
68
- # @note S3 allows searching on a prefix but does not allow for "wildcard" searches. We can
69
- # use the components of the file_path to fake that behavior.
68
+ # @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
70
69
  #
71
70
  # @see Generators::PdfSplitGenerator#image_file_basename_template
72
- def globbed_tail_locations(tail_glob:)
73
- # file_path = "s3://blah/1234/hello-world/pages/*.tiff"
74
- #
75
- # NOTE: Should we be storing our files as such? The pattern we need is
76
- # :parent_identifier/:file_set_identifier/files There are probably cases where a work has
77
- # more than one PDF (that we intend to split); we don't want to trample on those split files
78
- # and miscolate two PDFs.
79
- #
80
- # file_path = "s3://blah/1234/hello-world/hello-world.pdf
81
- globname = File.join(file_dir, tail_glob)
82
- regexp = %r{#{File.extname(globname)}$}
83
-
84
- # NOTE: We're making some informed guesses, needing to include the fully qualified template
85
- # based on both the key of the item in the bucket as well as the bucket's host.
71
+ #
72
+ # @param tail_regexp [Regexp]
73
+ def matching_locations_in_file_dir(tail_regexp:)
86
74
  uri = URI.parse(file_uri)
87
75
  scheme_and_host = "#{uri.scheme}://#{uri.host}"
88
76
 
89
- bucket.objects(prefix: File.dirname(globname)).flat_map do |object|
90
- if object.key.match(regexp)
77
+ bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
78
+ if tail_regexp.match(object.key)
91
79
  template = File.join(scheme_and_host, object.key)
92
- derived_file_from(template: template)
80
+ accumulator << derived_file_from(template: template)
93
81
  end
94
82
  end
95
83
  end
@@ -13,6 +13,8 @@ module DerivativeRodeo
13
13
  # Location to download and upload files to Sqs
14
14
  # It uploads a file_uri to the queue, not the contents of that file
15
15
  # reading from the queue is not currently implemented
16
+ #
17
+ # rubocop:disable Metrics/ClassLength
16
18
  class SqsLocation < BaseLocation
17
19
  ##
18
20
  # @!group Class Attributes
@@ -85,11 +87,14 @@ module DerivativeRodeo
85
87
  batch = []
86
88
  Dir.glob("#{File.dirname(tmp_file_path)}/**/**").each.with_index do |fp, i|
87
89
  batch << { id: SecureRandom.uuid, message_body: output_json("file://#{fp}") }
88
- if (i % batch_size).zero?
90
+ if ((i + 1) % batch_size).zero?
89
91
  add_batch(messages: batch)
90
92
  batch = []
91
93
  end
92
94
  end
95
+
96
+ # Ensure we're flushing the batched up queue as part of completing the write.
97
+ add_batch(messages: batch) if batch.present?
93
98
  file_uri
94
99
  end
95
100
 
@@ -181,6 +186,7 @@ module DerivativeRodeo
181
186
  end
182
187
 
183
188
  def output_json(uri)
189
+ # TODO: Add ability to handle a pre-process-template given to an SQS, and pass that along to the generator when applicable.
184
190
  key = DerivativeRodeo::Services::ConvertUriViaTemplateService.call(from_uri: uri, template: template, adapter: self)
185
191
  { key => [template] }.to_json
186
192
  end
@@ -201,5 +207,6 @@ module DerivativeRodeo
201
207
  @file_uri_parts
202
208
  end
203
209
  end
210
+ # rubocop:enable Metrics/ClassLength
204
211
  end
205
212
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DerivativeRodeo
4
- VERSION = '0.3.0'
4
+ VERSION = '0.4.2'
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: derivative-rodeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Kaufman
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-06-05 00:00:00.000000000 Z
12
+ date: 2023-07-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: activesupport