bulkrax 9.0.2 → 9.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +26 -0
  3. data/app/assets/javascripts/bulkrax/datatables.js +12 -0
  4. data/app/assets/javascripts/bulkrax/importers.js.erb +4 -1
  5. data/app/factories/bulkrax/object_factory.rb +36 -2
  6. data/app/factories/bulkrax/object_factory_interface.rb +26 -0
  7. data/app/factories/bulkrax/valkyrie_object_factory.rb +109 -27
  8. data/app/jobs/bulkrax/create_relationships_job.rb +123 -76
  9. data/app/jobs/bulkrax/delete_job.rb +11 -0
  10. data/app/jobs/bulkrax/importer_job.rb +1 -0
  11. data/app/matchers/bulkrax/application_matcher.rb +2 -1
  12. data/app/models/bulkrax/csv_entry.rb +41 -10
  13. data/app/models/bulkrax/importer.rb +9 -1
  14. data/app/models/bulkrax/status.rb +1 -1
  15. data/app/models/concerns/bulkrax/export_behavior.rb +28 -15
  16. data/app/models/concerns/bulkrax/file_set_entry_behavior.rb +13 -4
  17. data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +1 -1
  18. data/app/parsers/bulkrax/application_parser.rb +22 -4
  19. data/app/parsers/bulkrax/csv_parser.rb +36 -6
  20. data/app/parsers/bulkrax/oai_dc_parser.rb +0 -2
  21. data/app/parsers/bulkrax/xml_parser.rb +1 -1
  22. data/app/services/bulkrax/factory_class_finder.rb +56 -15
  23. data/app/services/hyrax/custom_queries/find_by_source_identifier.rb +6 -11
  24. data/app/services/wings/custom_queries/find_by_source_identifier.rb +15 -6
  25. data/app/views/bulkrax/entries/show.html.erb +15 -9
  26. data/app/views/bulkrax/importers/_bagit_fields.html.erb +1 -1
  27. data/app/views/bulkrax/importers/_csv_fields.html.erb +1 -1
  28. data/app/views/bulkrax/importers/_oai_fields.html.erb +1 -1
  29. data/app/views/bulkrax/importers/_xml_fields.html.erb +1 -1
  30. data/app/views/bulkrax/importers/show.html.erb +4 -4
  31. data/app/views/bulkrax/shared/_entries_tab.html.erb +1 -1
  32. data/config/locales/bulkrax.en.yml +5 -3
  33. data/lib/bulkrax/engine.rb +1 -1
  34. data/lib/bulkrax/version.rb +1 -1
  35. data/lib/bulkrax.rb +6 -11
  36. data/lib/generators/bulkrax/templates/bin/importer +1 -5
  37. metadata +8 -3
  38. data/app/factories/bulkrax/valkyrize-hyku.code-workspace +0 -19
@@ -44,7 +44,7 @@ module Bulkrax
44
44
 
45
45
  queue_as Bulkrax.config.ingest_queue_name
46
46
 
47
- attr_accessor :user, :importer_run, :errors
47
+ attr_accessor :user, :importer_run, :errors, :importer_run_id, :ability, :number_of_successes, :number_of_failures
48
48
  ##
49
49
  # @param parent_identifier [String] Work/Collection ID or Bulkrax::Entry source_identifiers
50
50
  # @param importer_run [Bulkrax::ImporterRun] current importer run (needed to properly update counters)
@@ -57,72 +57,52 @@ module Bulkrax
57
57
  #
58
58
  # rubocop:disable Metrics/MethodLength
59
59
  def perform(parent_identifier:, importer_run_id: nil, run_user: nil, failure_count: 0) # rubocop:disable Metrics/AbcSize
60
- importer_run = Bulkrax::ImporterRun.find(importer_run_id) if importer_run_id
61
- user = run_user || importer_run&.user
62
- ability = Ability.new(user)
63
-
64
- parent_entry, parent_record = find_record(parent_identifier, importer_run_id)
65
-
66
- number_of_successes = 0
67
- number_of_failures = 0
68
- errors = []
60
+ @importer_run_id = importer_run_id
61
+ @importer_run = Bulkrax::ImporterRun.find(@importer_run_id) if @importer_run_id
62
+ @user = run_user || importer_run&.user
63
+ @ability = Ability.new(@user)
64
+
65
+ @number_of_successes = 0
66
+ @number_of_failures = 0
67
+ @errors = []
69
68
  @parent_record_members_added = false
70
- @child_members_added = []
71
69
 
70
+ parent_entry, parent_record = find_record(parent_identifier, @importer_run_id)
72
71
  if parent_record
73
- conditionally_acquire_lock_for(parent_record.id) do
74
- ActiveRecord::Base.uncached do
75
- Bulkrax::PendingRelationship.where(parent_id: parent_identifier)
76
- .ordered.find_each do |rel|
77
- process(relationship: rel, importer_run_id: importer_run_id, parent_record: parent_record, ability: ability)
78
- number_of_successes += 1
79
- @parent_record_members_added = true
80
- rescue => e
81
- number_of_failures += 1
82
- rel.set_status_info(e, importer_run)
83
- errors << e
84
- end
85
- end
86
-
87
- # save record if members were added
88
- if @parent_record_members_added
89
- Bulkrax.object_factory.save!(resource: parent_record, user: user)
90
- Bulkrax.object_factory.publish(event: 'object.membership.updated', object: parent_record)
91
- Bulkrax.object_factory.update_index(resources: @child_members_added)
92
- end
72
+ # Works and collections are different breeds of animals:
73
+ # - works know both their children (file_sets and child works) in member_ids
74
+ # - works and collections know their parents (collections) in member_of_collection_ids
75
+ # We need to handle the two differently by locking the records appropriately to avoid race condition errors.
76
+ if parent_record.is_a?(Bulkrax.collection_model_class)
77
+ process_parent_as_collection(parent_record: parent_record, parent_identifier: parent_identifier)
78
+ else
79
+ process_parent_as_work(parent_record: parent_record, parent_identifier: parent_identifier)
93
80
  end
94
81
  else
95
- # In moving the check of the parent record "up" we've exposed a hidden reporting foible.
96
- # Namely we were reporting one error per child record when the parent record was itself
97
- # unavailable.
98
- #
99
- # We have chosen not to duplicate that "number of errors" as it does not seem like the
100
- # correct pattern for reporting a singular error (the previous pattern being one error per
101
- # child who's parent is not yet created).
102
- number_of_failures = 1
103
- errors = ["Parent record not yet available for creating relationships with children records."]
82
+ @number_of_failures = 1
83
+ @errors = ["Parent record #{parent_identifier} not yet available for creating relationships with children records."]
104
84
  end
105
85
 
106
- if errors.present?
86
+ if @errors.present?
107
87
  # rubocop:disable Rails/SkipsModelValidations
108
- ImporterRun.update_counters(importer_run_id, failed_relationships: number_of_failures)
88
+ ImporterRun.update_counters(@importer_run_id, failed_relationships: @number_of_failures)
109
89
  # rubocop:enable Rails/SkipsModelValidations
110
90
 
111
- parent_entry&.set_status_info(errors.last, importer_run)
91
+ parent_entry&.set_status_info(@errors.last, importer_run)
112
92
  failure_count += 1
113
93
 
114
94
  if failure_count < max_failure_count
115
95
  reschedule(
116
96
  parent_identifier: parent_identifier,
117
- importer_run_id: importer_run_id,
118
- run_user: run_user,
97
+ importer_run_id: @importer_run_id,
98
+ run_user: @user,
119
99
  failure_count: failure_count
120
100
  )
121
101
  end
122
- return errors # stop current job from continuing to run after rescheduling
102
+ return @errors # stop current job from continuing to run after rescheduling
123
103
  else
124
104
  # rubocop:disable Rails/SkipsModelValidations
125
- ImporterRun.update_counters(importer_run_id, processed_relationships: number_of_successes)
105
+ ImporterRun.update_counters(@importer_run_id, processed_relationships: @number_of_successes)
126
106
  # rubocop:enable Rails/SkipsModelValidations
127
107
  end
128
108
  end
@@ -132,6 +112,8 @@ module Bulkrax
132
112
 
133
113
  ##
134
114
  # We can use Hyrax's lock manager when we have one available.
115
+ # However it's not certain that this is actually working, so to be
116
+ # as safe as possible, we will reload resources before we update.
135
117
  if defined?(::Hyrax)
136
118
  include Hyrax::Lockable
137
119
 
@@ -151,46 +133,111 @@ module Bulkrax
151
133
  alias conditionally_acquire_lock_for acquire_lock_for
152
134
  end
153
135
 
154
- def process(relationship:, importer_run_id:, parent_record:, ability:)
155
- raise "#{relationship} needs a child to create relationship" if relationship.child_id.nil?
156
- raise "#{relationship} needs a parent to create relationship" if relationship.parent_id.nil?
157
-
158
- _child_entry, child_record = find_record(relationship.child_id, importer_run_id)
159
- raise "#{relationship} could not find child record" unless child_record
160
-
161
- raise "Cannot add child collection (ID=#{relationship.child_id}) to parent work (ID=#{relationship.parent_id})" if child_record.collection? && parent_record.work?
136
+ # When the parent is a collection, we save the relationship on each child.
137
+ # The parent does not need to be saved, as the relationship is stored on the child.
138
+ # However, we do reindex the parent after all the children are added.
139
+ def process_parent_as_collection(parent_record:, parent_identifier:)
140
+ ActiveRecord::Base.uncached do
141
+ Bulkrax::PendingRelationship.where(parent_id: parent_identifier, importer_run_id: @importer_run_id)
142
+ .ordered.find_each do |rel|
143
+ raise "#{rel} needs a child to create relationship" if rel.child_id.nil?
144
+ raise "#{rel} needs a parent to create relationship" if rel.parent_id.nil?
145
+ add_to_collection(relationship: rel, parent_record: parent_record, ability: ability)
146
+ @number_of_successes += 1
147
+ @parent_record_members_added = true
148
+ rescue => e
149
+ rel.update(status_message: e.message)
150
+ @number_of_failures += 1
151
+ @errors << e
152
+ end
153
+ end
162
154
 
163
- ability.authorize!(:edit, child_record)
155
+ # if collection members were added, we reindex the collection
156
+ # The collection members have already saved the relationships
157
+ # To index the parent, we want to make sure we have the latest version of the parent,
158
+ # because another job may have updated it in the meantime.
159
+ return unless @parent_record_members_added
160
+ reloaded_parent = Bulkrax.object_factory.find(parent_record.id)
161
+ Bulkrax.object_factory.update_index(resources: [reloaded_parent])
162
+ Bulkrax.object_factory.publish(event: 'object.membership.updated', object: reloaded_parent, user: @user)
163
+ end
164
164
 
165
- # We could do this outside of the loop, but that could lead to odd counter failures.
166
- ability.authorize!(:edit, parent_record)
165
+ # When the parent is a work, we save the relationship on the parent.
166
+ # We prefer to save all of the member relationships and then save the parent once. Concurrent
167
+ # jobs may be trying to save the parent at the same time, so we need to lock the parent
168
+ # record while we are adding the children to it.
169
+ # However the locking appears to not be working so as a workaround we will save each member as we go,
170
+ # but only index the parent once at the end.
171
+ def process_parent_as_work(parent_record:, parent_identifier:)
172
+ conditionally_acquire_lock_for(parent_record.id.to_s) do
173
+ ActiveRecord::Base.uncached do
174
+ Bulkrax::PendingRelationship.where(parent_id: parent_identifier, importer_run_id: @importer_run_id)
175
+ .ordered.find_each do |rel|
176
+ raise "#{rel} needs a child to create relationship" if rel.child_id.nil?
177
+ raise "#{rel} needs a parent to create relationship" if rel.parent_id.nil?
178
+ add_to_work(relationship: rel, parent_record: parent_record, ability: ability)
179
+ self.number_of_successes += 1
180
+ @parent_record_members_added = true
181
+ rescue => e
182
+ rel.update(status_message: e.message)
183
+ @number_of_failures += 1
184
+ @errors << e
185
+ end
186
+ end
167
187
 
168
- if parent_record.is_a?(Bulkrax.collection_model_class)
169
- add_to_collection(child_record, parent_record)
170
- else
171
- add_to_work(child_record, parent_record)
188
+ # save record if members were added
189
+ if @parent_record_members_added
190
+ reloaded_parent = Bulkrax.object_factory.find(parent_record.id)
191
+ Bulkrax.object_factory.update_index(resources: [reloaded_parent])
192
+ Bulkrax.object_factory.publish(event: 'object.membership.updated', object: reloaded_parent, user: @user)
193
+ end
172
194
  end
173
-
174
- Bulkrax.object_factory.update_index_for_file_sets_of(resource: child_record) if update_child_records_works_file_sets?
175
-
176
- relationship.destroy
177
195
  end
178
196
 
179
- def add_to_collection(child_record, parent_record)
180
- Bulkrax.object_factory.add_resource_to_collection(
181
- collection: parent_record,
182
- resource: child_record,
183
- user: user
184
- )
197
+ # NOTE: the child changes are saved in the object factory.
198
+ def add_to_collection(relationship:, parent_record:, ability:)
199
+ ActiveRecord::Base.uncached do
200
+ _child_entry, child_record = find_record(relationship.child_id, @importer_run_id)
201
+ raise "#{relationship} could not find child record" unless child_record
202
+ raise "Cannot add child collection (ID=#{relationship.child_id}) to parent work (ID=#{relationship.parent_id})" if child_record.collection? && parent_record.work?
203
+ ability.authorize!(:edit, child_record)
204
+ # We could do this outside of the loop, but that could lead to odd counter failures.
205
+ ability.authorize!(:edit, parent_record)
206
+ # It is important to lock the child records as they are the ones being saved.
207
+ # However, locking doesn't seem to be working so we will reload the child record before saving.
208
+ # This is a workaround for the fact that the lock manager doesn't seem to be working.
209
+ conditionally_acquire_lock_for(child_record.id.to_s) do
210
+ Bulkrax.object_factory.add_resource_to_collection(
211
+ collection: parent_record,
212
+ resource: child_record,
213
+ user: @user
214
+ )
215
+ end
216
+ relationship.destroy
217
+ end
185
218
  end
186
219
 
187
- def add_to_work(child_record, parent_record)
188
- # NOTE: The .add_child_to_parent_work should not persist changes to the
189
- # child nor parent. We'll do that elsewhere in this loop.
190
- Bulkrax.object_factory.add_child_to_parent_work(
220
+ # NOTE: we only update the parent's member_ids and prefer to not save the parent until all children are added.
221
+ # However, the locking appears to not be working so as a workaround we will save each member as we go.
222
+ # This is a workaround for the fact that the lock manager doesn't seem to be working.
223
+ # To avoid having to reload the parent, we return the updated parent to the calling method.
224
+ def add_to_work(relationship:, parent_record:, ability:)
225
+ _child_entry, child_record = find_record(relationship.child_id, @importer_run_id)
226
+ raise "#{relationship} could not find child record" unless child_record
227
+ raise "Cannot add child collection (ID=#{relationship.child_id}) to parent work (ID=#{relationship.parent_id})" if child_record.collection? && parent_record.work?
228
+
229
+ ability.authorize!(:edit, child_record)
230
+ # We could do this outside of the loop, but that could lead to odd counter failures.
231
+ ability.authorize!(:edit, parent_record)
232
+ updated_parent = Bulkrax.object_factory.add_child_to_parent_work(
191
233
  parent: parent_record,
192
234
  child: child_record
193
235
  )
236
+ # default is false for this... do not typically need to index file sets of child records
237
+ Bulkrax.object_factory.update_index_for_file_sets_of(resource: child_record) if update_child_records_works_file_sets?
238
+ relationship.destroy
239
+
240
+ updated_parent
194
241
  end
195
242
 
196
243
  def reschedule(**kargs)
@@ -6,6 +6,17 @@ module Bulkrax
6
6
 
7
7
  def perform(entry, importer_run)
8
8
  user = importer_run.importer.user
9
+
10
+ # When we delete, we don't go through the build process.
11
+ # However, we need the identifier to be set for the entry.
12
+ # This enables us to delete based on the ID, not just the source_identifier.
13
+ if entry.respond_to?(:build_metadata_for_delete) &&
14
+ entry.parsed_metadata.nil? &&
15
+ entry.raw_metadata.present?
16
+ entry.build_metadata_for_delete
17
+ entry.save!
18
+ end
19
+
9
20
  entry.factory.delete(user)
10
21
 
11
22
  # rubocop:disable Rails/SkipsModelValidations
@@ -30,6 +30,7 @@ module Bulkrax
30
30
  return unless parser.file? && parser.zip?
31
31
 
32
32
  parser.unzip(parser.parser_fields['import_file_path'])
33
+ parser.remove_spaces_from_filenames
33
34
  end
34
35
 
35
36
  def update_current_run_counters(importer)
@@ -16,8 +16,9 @@ module Bulkrax
16
16
 
17
17
  def result(_parser, content)
18
18
  return nil if self.excluded == true || Bulkrax.reserved_properties.include?(self.to)
19
+ # rubocop:disable Style/RedundantParentheses
19
20
  return nil if self.if && (!self.if.is_a?(Array) && self.if.length != 2)
20
-
21
+ # rubocop:enable Style/RedundantParentheses
21
22
  if self.if
22
23
  return unless content.send(self.if[0], Regexp.new(self.if[1]))
23
24
  end
@@ -5,6 +5,23 @@ module Bulkrax
5
5
  # We do too much in these entry classes. We need to extract the common logic from the various
6
6
  # entry models into a module that can be shared between them.
7
7
  class CsvEntry < Entry # rubocop:disable Metrics/ClassLength
8
+ class CsvPathError < StandardError
9
+ def initialize(message)
10
+ super(message)
11
+ end
12
+ end
13
+
14
+ class RecordNotFound < StandardError
15
+ def initialize(message)
16
+ super(message)
17
+ end
18
+ end
19
+
20
+ class MissingMetadata < StandardError
21
+ def initialize(message)
22
+ super(message)
23
+ end
24
+ end
8
25
  serialize :raw_metadata, Bulkrax::NormalizedJson
9
26
 
10
27
  def self.fields_from_data(data)
@@ -16,7 +33,7 @@ module Bulkrax
16
33
  # there's a risk that this reads the whole file into memory and could cause a memory leak
17
34
  # we strip any special characters out of the headers. looking at you Excel
18
35
  def self.read_data(path)
19
- raise StandardError, 'CSV path empty' if path.blank?
36
+ raise CsvPathError, 'CSV path empty' if path.blank?
20
37
  options = {
21
38
  headers: true,
22
39
  header_converters: ->(h) { h.to_s.gsub(/[^\w\d\. -]+/, '').strip.to_sym },
@@ -85,10 +102,18 @@ module Bulkrax
85
102
  self.parsed_metadata
86
103
  end
87
104
 
105
+ # limited metadata is needed for delete jobs
106
+ def build_metadata_for_delete
107
+ self.parsed_metadata = {}
108
+ establish_factory_class
109
+ add_ingested_metadata
110
+ self.parsed_metadata
111
+ end
112
+
88
113
  def validate_record
89
- raise StandardError, 'Record not found' if record.nil?
114
+ raise RecordNotFound, 'Record not found' if record.nil?
90
115
  unless importerexporter.parser.required_elements?(record)
91
- raise StandardError, "Missing required elements, missing element(s) are: "\
116
+ raise MissingMetadata, "Missing required elements, missing element(s) are: "\
92
117
  "#{importerexporter.parser.missing_elements(record).join(', ')}"
93
118
  end
94
119
  end
@@ -160,7 +185,7 @@ module Bulkrax
160
185
  source_id = source_id.to_a if source_id.is_a?(ActiveTriples::Relation)
161
186
  source_id = Array.wrap(source_id).first
162
187
  self.parsed_metadata[source_identifier] = source_id
163
- model_name = hyrax_record.respond_to?(:to_rdf_representation) ? hyrax_record.to_rdf_representation : hyrax_record.has_model.first
188
+ model_name = Bulkrax.object_factory.model_name(resource: hyrax_record)
164
189
  self.parsed_metadata[key_for_export('model')] = model_name
165
190
  end
166
191
 
@@ -179,9 +204,13 @@ module Bulkrax
179
204
 
180
205
  def build_relationship_metadata
181
206
  # Includes all relationship methods for all exportable record types (works, Collections, FileSets)
207
+ # @TODO: this logic assumes that the relationships are all available via a method that can be called
208
+ # on the object. With Valkyrie, this is only true for Hyrax-based models which include the
209
+ # ArResource module. We need to consider reworking this logic into an object factory method
210
+ # that can handle different types of models.
182
211
  relationship_methods = {
183
- related_parents_parsed_mapping => %i[member_of_collection_ids member_of_work_ids in_work_ids],
184
- related_children_parsed_mapping => %i[member_collection_ids member_work_ids file_set_ids]
212
+ related_parents_parsed_mapping => %i[member_of_collection_ids member_of_work_ids in_work_ids parent],
213
+ related_children_parsed_mapping => %i[member_collection_ids member_work_ids file_set_ids member_ids]
185
214
  }
186
215
 
187
216
  relationship_methods.each do |relationship_key, methods|
@@ -189,7 +218,9 @@ module Bulkrax
189
218
 
190
219
  values = []
191
220
  methods.each do |m|
192
- values << hyrax_record.public_send(m) if hyrax_record.respond_to?(m)
221
+ value = hyrax_record.public_send(m) if hyrax_record.respond_to?(m)
222
+ value_id = value.try(:id)&.to_s || value # get the id if it's an object
223
+ values << value_id if value_id.present?
193
224
  end
194
225
  values = values.flatten.uniq
195
226
  next if values.blank?
@@ -316,11 +347,11 @@ module Bulkrax
316
347
 
317
348
  def build_thumbnail_files
318
349
  return unless importerexporter.include_thumbnails
350
+ thumbnail = Bulkrax.object_factory.thumbnail_for(resource: hyrax_record)
351
+ return unless thumbnail
319
352
 
353
+ filenames = map_file_sets(Array.wrap(thumbnail))
320
354
  thumbnail_mapping = 'thumbnail_file'
321
- file_sets = Array.wrap(hyrax_record.thumbnail)
322
-
323
- filenames = map_file_sets(file_sets)
324
355
  handle_join_on_export(thumbnail_mapping, filenames, false)
325
356
  end
326
357
 
@@ -237,8 +237,16 @@ module Bulkrax
237
237
  # end
238
238
 
239
239
  # If the import data is zipped, unzip it to this path
240
- def importer_unzip_path
240
+ def importer_unzip_path(mkdir: false)
241
241
  @importer_unzip_path ||= File.join(parser.base_path, "import_#{path_string}")
242
+ return @importer_unzip_path if Dir.exist?(@importer_unzip_path) || mkdir == true
243
+
244
+ # turns "tmp/imports/tenant/import_1_20250122035229_1" to "tmp/imports/tenant/import_1_20250122035229"
245
+ base_importer_unzip_path = @importer_unzip_path.split('_')[0...-1].join('_')
246
+
247
+ # If we don't have an existing unzip path, we'll try and find it.
248
+ # Just in case there are multiple paths, we sort by the number at the end of the path and get the last one
249
+ @importer_unzip_path = Dir.glob(base_importer_unzip_path + '*').sort_by { |path| path.split(base_importer_unzip_path).last[1..-1].to_i }.last
242
250
  end
243
251
 
244
252
  def errored_entries_csv_path
@@ -23,7 +23,7 @@ module Bulkrax
23
23
  end
24
24
 
25
25
  def latest?
26
- # TODO: remove if statment when we stop supporting Hyrax < 4
26
+ # TODO: remove if statement when we stop supporting Hyrax < 4
27
27
  self.id == if Gem::Version.new(Rails::VERSION::STRING) >= Gem::Version.new('6.0.0')
28
28
  self.class.where(statusable_id: self.statusable_id, statusable_type: self.statusable_type).order('id desc').pick(:id)
29
29
  else
@@ -26,25 +26,38 @@ module Bulkrax
26
26
 
27
27
  # Prepend the file_set id to ensure a unique filename and also one that is not longer than 255 characters
28
28
  def filename(file_set)
29
- return if file_set.original_file.blank?
30
- if file_set.original_file.respond_to?(:original_filename) # valkyrie
31
- fn = file_set.original_file.original_filename
32
- mime = ::Marcel::MimeType.for(file_set.original_file.file.io)
33
- else # original non valkyrie version
34
- fn = file_set.original_file.file_name.first
35
- mime = ::Marcel::MimeType.for(declared_type: file_set.original_file.mime_type)
36
- end
37
- ext_mime = ::Marcel::MimeType.for(name: fn)
29
+ # return if there are no files on the fileset
30
+ return if Bulkrax.object_factory.original_file(fileset: file_set).blank?
31
+
32
+ fn = Bulkrax.object_factory.filename_for(fileset: file_set)
33
+ file = Bulkrax.object_factory.original_file(fileset: file_set)
34
+ ext = file_extension(file: file, filename: fn)
35
+
36
+ # Prepend the file_set id to ensure a unique filename
37
+ filename = File.basename(fn, ".*")
38
+ # Skip modification if file already has ID or we're in metadata-only mode
38
39
  if fn.include?(file_set.id) || importerexporter.metadata_only?
39
- filename = "#{fn}.#{mime.to_sym}"
40
- filename = fn if mime.to_s == ext_mime.to_s
40
+ # keep filename as is
41
41
  else
42
- filename = "#{file_set.id}_#{fn}.#{mime.to_sym}"
43
- filename = "#{file_set.id}_#{fn}" if mime.to_s == ext_mime.to_s
42
+ filename = "#{file_set.id}_#{filename}"
44
43
  end
45
- # Remove extention truncate and reattach
46
- ext = File.extname(filename)
44
+ filename = ext.present? ? "#{filename}.#{ext}" : fn
45
+
46
+ # Remove extension, truncate and reattach
47
47
  "#{File.basename(filename, ext)[0...(220 - ext.length)]}#{ext}"
48
48
  end
49
+
50
+ ##
51
+ # Generate the appropriate file extension based on the mime type of the file
52
+ # @return [String] the file extension for the given file
53
+ def file_extension(file:, filename:)
54
+ declared_mime = ::Marcel::MimeType.for(declared_type: file.mime_type)
55
+ # validate the declared mime type
56
+ declared_mime = ::Marcel::MimeType.for(name: filename) if declared_mime.nil? || declared_mime == "application/octet-stream"
57
+ # convert the mime type to a file extension
58
+ Mime::Type.lookup(declared_mime).symbol.to_s
59
+ rescue Mime::Type::InvalidMimeType
60
+ nil
61
+ end
49
62
  end
50
63
  end
@@ -2,6 +2,15 @@
2
2
 
3
3
  module Bulkrax
4
4
  module FileSetEntryBehavior
5
+ class FileNameError < StandardError
6
+ end
7
+
8
+ class OrphanFileSetError < StandardError
9
+ end
10
+
11
+ class FilePathError < StandardError
12
+ end
13
+
5
14
  extend ActiveSupport::Concern
6
15
 
7
16
  included do
@@ -21,11 +30,11 @@ module Bulkrax
21
30
 
22
31
  path_to_file = parser.path_to_files(filename: filename)
23
32
 
24
- parsed_metadata['file'][i] = path_to_file
33
+ parsed_metadata['file'][i] = path_to_file if path_to_file.present?
25
34
  end
26
35
  parsed_metadata['file'].delete('')
27
36
 
28
- raise ::StandardError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?
37
+ raise FilePathError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?
29
38
 
30
39
  parsed_metadata['file']
31
40
  end
@@ -33,13 +42,13 @@ module Bulkrax
33
42
  def validate_presence_of_filename!
34
43
  return if parsed_metadata&.[](file_reference)&.map(&:present?)&.any?
35
44
 
36
- raise StandardError, 'File set must have a filename'
45
+ raise FileNameError, 'File set must have a filename'
37
46
  end
38
47
 
39
48
  def validate_presence_of_parent!
40
49
  return if parsed_metadata[related_parents_parsed_mapping]&.map(&:present?)&.any?
41
50
 
42
- raise StandardError, 'File set must be related to at least one work'
51
+ raise OrphanFileSetError, 'File set must be related to at least one work'
43
52
  end
44
53
 
45
54
  def parent_jobs
@@ -56,7 +56,7 @@ module Bulkrax
56
56
 
57
57
  returning_value = false
58
58
  File.open(filename) do |file|
59
- mime_type = ::Marcel::MimeType.for(file)
59
+ mime_type = ::Marcel::MimeType.for(name: file)
60
60
  returning_value = mime_type.include?('application/zip') || mime_type.include?('application/gzip')
61
61
  end
62
62
  returning_value
@@ -209,8 +209,11 @@ module Bulkrax
209
209
  def rebuild_entries(types_array = nil)
210
210
  index = 0
211
211
  (types_array || %w[collection work file_set relationship]).each do |type|
212
- # works are not gurneteed to have Work in the type
213
-
212
+ # works are not guaranteed to have Work in the type
213
+ if type.eql?('relationship')
214
+ ScheduleRelationshipsJob.set(wait: 5.minutes).perform_later(importer_id: importerexporter.id)
215
+ next
216
+ end
214
217
  importer.entries.where(rebuild_entry_query(type, parser_fields['entry_statuses'])).find_each do |e|
215
218
  seen[e.identifier] = true
216
219
  e.status_info('Pending', importer.current_run)
@@ -432,7 +435,7 @@ module Bulkrax
432
435
 
433
436
  Zip::File.open(file_to_unzip) do |zip_file|
434
437
  zip_file.each do |entry|
435
- entry_path = File.join(importer_unzip_path, entry.name)
438
+ entry_path = File.join(importer_unzip_path(mkdir: true), entry.name)
436
439
  FileUtils.mkdir_p(File.dirname(entry_path))
437
440
  zip_file.extract(entry, entry_path) unless File.exist?(entry_path)
438
441
  end
@@ -440,12 +443,27 @@ module Bulkrax
440
443
  end
441
444
 
442
445
  def untar(file_to_untar)
443
- Dir.mkdir(importer_unzip_path) unless File.directory?(importer_unzip_path)
446
+ Dir.mkdir(importer_unzip_path(mkdir: true)) unless File.directory?(importer_unzip_path(mkdir: true))
444
447
  command = "tar -xzf #{Shellwords.escape(file_to_untar)} -C #{Shellwords.escape(importer_unzip_path)}"
445
448
  result = system(command)
446
449
  raise "Failed to extract #{file_to_untar}" unless result
447
450
  end
448
451
 
452
+ # File names referenced in CSVs have spaces replaced with underscores
453
+ # @see Bulkrax::CsvParser#file_paths
454
+ def remove_spaces_from_filenames
455
+ files = Dir.glob(File.join(importer_unzip_path, 'files', '*'))
456
+ files_with_spaces = files.select { |f| f.split('/').last.match?(' ') }
457
+ return if files_with_spaces.blank?
458
+
459
+ files_with_spaces.map! { |path| Pathname.new(path) }
460
+ files_with_spaces.each do |path|
461
+ filename = path.basename
462
+ filename_without_spaces = filename.to_s.tr(' ', '_')
463
+ path.rename(File.join(path.dirname, filename_without_spaces))
464
+ end
465
+ end
466
+
449
467
  def zip
450
468
  FileUtils.mkdir_p(exporter_export_zip_path)
451
469