bulkrax 9.0.2 → 9.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +26 -0
- data/app/assets/javascripts/bulkrax/datatables.js +12 -0
- data/app/assets/javascripts/bulkrax/importers.js.erb +4 -1
- data/app/factories/bulkrax/object_factory.rb +36 -2
- data/app/factories/bulkrax/object_factory_interface.rb +26 -0
- data/app/factories/bulkrax/valkyrie_object_factory.rb +109 -27
- data/app/jobs/bulkrax/create_relationships_job.rb +123 -76
- data/app/jobs/bulkrax/delete_job.rb +11 -0
- data/app/jobs/bulkrax/importer_job.rb +1 -0
- data/app/matchers/bulkrax/application_matcher.rb +2 -1
- data/app/models/bulkrax/csv_entry.rb +41 -10
- data/app/models/bulkrax/importer.rb +9 -1
- data/app/models/bulkrax/status.rb +1 -1
- data/app/models/concerns/bulkrax/export_behavior.rb +28 -15
- data/app/models/concerns/bulkrax/file_set_entry_behavior.rb +13 -4
- data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +1 -1
- data/app/parsers/bulkrax/application_parser.rb +22 -4
- data/app/parsers/bulkrax/csv_parser.rb +36 -6
- data/app/parsers/bulkrax/oai_dc_parser.rb +0 -2
- data/app/parsers/bulkrax/xml_parser.rb +1 -1
- data/app/services/bulkrax/factory_class_finder.rb +56 -15
- data/app/services/hyrax/custom_queries/find_by_source_identifier.rb +6 -11
- data/app/services/wings/custom_queries/find_by_source_identifier.rb +15 -6
- data/app/views/bulkrax/entries/show.html.erb +15 -9
- data/app/views/bulkrax/importers/_bagit_fields.html.erb +1 -1
- data/app/views/bulkrax/importers/_csv_fields.html.erb +1 -1
- data/app/views/bulkrax/importers/_oai_fields.html.erb +1 -1
- data/app/views/bulkrax/importers/_xml_fields.html.erb +1 -1
- data/app/views/bulkrax/importers/show.html.erb +4 -4
- data/app/views/bulkrax/shared/_entries_tab.html.erb +1 -1
- data/config/locales/bulkrax.en.yml +5 -3
- data/lib/bulkrax/engine.rb +1 -1
- data/lib/bulkrax/version.rb +1 -1
- data/lib/bulkrax.rb +6 -11
- data/lib/generators/bulkrax/templates/bin/importer +1 -5
- metadata +8 -3
- data/app/factories/bulkrax/valkyrize-hyku.code-workspace +0 -19
@@ -44,7 +44,7 @@ module Bulkrax
|
|
44
44
|
|
45
45
|
queue_as Bulkrax.config.ingest_queue_name
|
46
46
|
|
47
|
-
attr_accessor :user, :importer_run, :errors
|
47
|
+
attr_accessor :user, :importer_run, :errors, :importer_run_id, :ability, :number_of_successes, :number_of_failures
|
48
48
|
##
|
49
49
|
# @param parent_identifier [String] Work/Collection ID or Bulkrax::Entry source_identifiers
|
50
50
|
# @param importer_run [Bulkrax::ImporterRun] current importer run (needed to properly update counters)
|
@@ -57,72 +57,52 @@ module Bulkrax
|
|
57
57
|
#
|
58
58
|
# rubocop:disable Metrics/MethodLength
|
59
59
|
def perform(parent_identifier:, importer_run_id: nil, run_user: nil, failure_count: 0) # rubocop:disable Metrics/AbcSize
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
errors = []
|
60
|
+
@importer_run_id = importer_run_id
|
61
|
+
@importer_run = Bulkrax::ImporterRun.find(@importer_run_id) if @importer_run_id
|
62
|
+
@user = run_user || importer_run&.user
|
63
|
+
@ability = Ability.new(@user)
|
64
|
+
|
65
|
+
@number_of_successes = 0
|
66
|
+
@number_of_failures = 0
|
67
|
+
@errors = []
|
69
68
|
@parent_record_members_added = false
|
70
|
-
@child_members_added = []
|
71
69
|
|
70
|
+
parent_entry, parent_record = find_record(parent_identifier, @importer_run_id)
|
72
71
|
if parent_record
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
number_of_failures += 1
|
82
|
-
rel.set_status_info(e, importer_run)
|
83
|
-
errors << e
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
# save record if members were added
|
88
|
-
if @parent_record_members_added
|
89
|
-
Bulkrax.object_factory.save!(resource: parent_record, user: user)
|
90
|
-
Bulkrax.object_factory.publish(event: 'object.membership.updated', object: parent_record)
|
91
|
-
Bulkrax.object_factory.update_index(resources: @child_members_added)
|
92
|
-
end
|
72
|
+
# Works and collections are different breeds of animals:
|
73
|
+
# - works know both their children (file_sets and child works) in member_ids
|
74
|
+
# - works and collections know their parents (collections) in member_of_collection_ids
|
75
|
+
# We need to handle the two differently by locking the records appropriately to avoid race condition errors.
|
76
|
+
if parent_record.is_a?(Bulkrax.collection_model_class)
|
77
|
+
process_parent_as_collection(parent_record: parent_record, parent_identifier: parent_identifier)
|
78
|
+
else
|
79
|
+
process_parent_as_work(parent_record: parent_record, parent_identifier: parent_identifier)
|
93
80
|
end
|
94
81
|
else
|
95
|
-
|
96
|
-
|
97
|
-
# unavailable.
|
98
|
-
#
|
99
|
-
# We have chosen not to duplicate that "number of errors" as it does not seem like the
|
100
|
-
# correct pattern for reporting a singular error (the previous pattern being one error per
|
101
|
-
# child whose parent is not yet created).
|
102
|
-
number_of_failures = 1
|
103
|
-
errors = ["Parent record not yet available for creating relationships with children records."]
|
82
|
+
@number_of_failures = 1
|
83
|
+
@errors = ["Parent record #{parent_identifier} not yet available for creating relationships with children records."]
|
104
84
|
end
|
105
85
|
|
106
|
-
if errors.present?
|
86
|
+
if @errors.present?
|
107
87
|
# rubocop:disable Rails/SkipsModelValidations
|
108
|
-
ImporterRun.update_counters(importer_run_id, failed_relationships: number_of_failures)
|
88
|
+
ImporterRun.update_counters(@importer_run_id, failed_relationships: @number_of_failures)
|
109
89
|
# rubocop:enable Rails/SkipsModelValidations
|
110
90
|
|
111
|
-
parent_entry&.set_status_info(errors.last, importer_run)
|
91
|
+
parent_entry&.set_status_info(@errors.last, importer_run)
|
112
92
|
failure_count += 1
|
113
93
|
|
114
94
|
if failure_count < max_failure_count
|
115
95
|
reschedule(
|
116
96
|
parent_identifier: parent_identifier,
|
117
|
-
importer_run_id: importer_run_id,
|
118
|
-
run_user:
|
97
|
+
importer_run_id: @importer_run_id,
|
98
|
+
run_user: @user,
|
119
99
|
failure_count: failure_count
|
120
100
|
)
|
121
101
|
end
|
122
|
-
return errors # stop current job from continuing to run after rescheduling
|
102
|
+
return @errors # stop current job from continuing to run after rescheduling
|
123
103
|
else
|
124
104
|
# rubocop:disable Rails/SkipsModelValidations
|
125
|
-
ImporterRun.update_counters(importer_run_id, processed_relationships: number_of_successes)
|
105
|
+
ImporterRun.update_counters(@importer_run_id, processed_relationships: @number_of_successes)
|
126
106
|
# rubocop:enable Rails/SkipsModelValidations
|
127
107
|
end
|
128
108
|
end
|
@@ -132,6 +112,8 @@ module Bulkrax
|
|
132
112
|
|
133
113
|
##
|
134
114
|
# We can use Hyrax's lock manager when we have one available.
|
115
|
+
# However it's not certain that this is actually working, so to be
|
116
|
+
# as safe as possible, we will reload resources before we update.
|
135
117
|
if defined?(::Hyrax)
|
136
118
|
include Hyrax::Lockable
|
137
119
|
|
@@ -151,46 +133,111 @@ module Bulkrax
|
|
151
133
|
alias conditionally_acquire_lock_for acquire_lock_for
|
152
134
|
end
|
153
135
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
136
|
+
# When the parent is a collection, we save the relationship on each child.
|
137
|
+
# The parent does not need to be saved, as the relationship is stored on the child.
|
138
|
+
# However, we do reindex the parent after all the children are added.
|
139
|
+
def process_parent_as_collection(parent_record:, parent_identifier:)
|
140
|
+
ActiveRecord::Base.uncached do
|
141
|
+
Bulkrax::PendingRelationship.where(parent_id: parent_identifier, importer_run_id: @importer_run_id)
|
142
|
+
.ordered.find_each do |rel|
|
143
|
+
raise "#{rel} needs a child to create relationship" if rel.child_id.nil?
|
144
|
+
raise "#{rel} needs a parent to create relationship" if rel.parent_id.nil?
|
145
|
+
add_to_collection(relationship: rel, parent_record: parent_record, ability: ability)
|
146
|
+
@number_of_successes += 1
|
147
|
+
@parent_record_members_added = true
|
148
|
+
rescue => e
|
149
|
+
rel.update(status_message: e.message)
|
150
|
+
@number_of_failures += 1
|
151
|
+
@errors << e
|
152
|
+
end
|
153
|
+
end
|
162
154
|
|
163
|
-
|
155
|
+
# if collection members were added, we reindex the collection
|
156
|
+
# The collection members have already saved the relationships
|
157
|
+
# To index the parent, we want to make sure we have the latest version of the parent,
|
158
|
+
# because another job may have updated it in the meantime.
|
159
|
+
return unless @parent_record_members_added
|
160
|
+
reloaded_parent = Bulkrax.object_factory.find(parent_record.id)
|
161
|
+
Bulkrax.object_factory.update_index(resources: [reloaded_parent])
|
162
|
+
Bulkrax.object_factory.publish(event: 'object.membership.updated', object: reloaded_parent, user: @user)
|
163
|
+
end
|
164
164
|
|
165
|
-
|
166
|
-
|
165
|
+
# When the parent is a work, we save the relationship on the parent.
|
166
|
+
# We prefer to save all of the member relationships and then save the parent once. Concurrent
|
167
|
+
# jobs may be trying to save the parent at the same time, so we need to lock the parent
|
168
|
+
# record while we are adding the children to it.
|
169
|
+
# However the locking appears to not be working so as a workaround we will save each member as we go,
|
170
|
+
# but only index the parent once at the end.
|
171
|
+
def process_parent_as_work(parent_record:, parent_identifier:)
|
172
|
+
conditionally_acquire_lock_for(parent_record.id.to_s) do
|
173
|
+
ActiveRecord::Base.uncached do
|
174
|
+
Bulkrax::PendingRelationship.where(parent_id: parent_identifier, importer_run_id: @importer_run_id)
|
175
|
+
.ordered.find_each do |rel|
|
176
|
+
raise "#{rel} needs a child to create relationship" if rel.child_id.nil?
|
177
|
+
raise "#{rel} needs a parent to create relationship" if rel.parent_id.nil?
|
178
|
+
add_to_work(relationship: rel, parent_record: parent_record, ability: ability)
|
179
|
+
self.number_of_successes += 1
|
180
|
+
@parent_record_members_added = true
|
181
|
+
rescue => e
|
182
|
+
rel.update(status_message: e.message)
|
183
|
+
@number_of_failures += 1
|
184
|
+
@errors << e
|
185
|
+
end
|
186
|
+
end
|
167
187
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
188
|
+
# save record if members were added
|
189
|
+
if @parent_record_members_added
|
190
|
+
reloaded_parent = Bulkrax.object_factory.find(parent_record.id)
|
191
|
+
Bulkrax.object_factory.update_index(resources: [reloaded_parent])
|
192
|
+
Bulkrax.object_factory.publish(event: 'object.membership.updated', object: reloaded_parent, user: @user)
|
193
|
+
end
|
172
194
|
end
|
173
|
-
|
174
|
-
Bulkrax.object_factory.update_index_for_file_sets_of(resource: child_record) if update_child_records_works_file_sets?
|
175
|
-
|
176
|
-
relationship.destroy
|
177
195
|
end
|
178
196
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
197
|
+
# NOTE: the child changes are saved in the object factory.
|
198
|
+
def add_to_collection(relationship:, parent_record:, ability:)
|
199
|
+
ActiveRecord::Base.uncached do
|
200
|
+
_child_entry, child_record = find_record(relationship.child_id, @importer_run_id)
|
201
|
+
raise "#{relationship} could not find child record" unless child_record
|
202
|
+
raise "Cannot add child collection (ID=#{relationship.child_id}) to parent work (ID=#{relationship.parent_id})" if child_record.collection? && parent_record.work?
|
203
|
+
ability.authorize!(:edit, child_record)
|
204
|
+
# We could do this outside of the loop, but that could lead to odd counter failures.
|
205
|
+
ability.authorize!(:edit, parent_record)
|
206
|
+
# It is important to lock the child records as they are the ones being saved.
|
207
|
+
# However, locking doesn't seem to be working so we will reload the child record before saving.
|
208
|
+
# This is a workaround for the fact that the lock manager doesn't seem to be working.
|
209
|
+
conditionally_acquire_lock_for(child_record.id.to_s) do
|
210
|
+
Bulkrax.object_factory.add_resource_to_collection(
|
211
|
+
collection: parent_record,
|
212
|
+
resource: child_record,
|
213
|
+
user: @user
|
214
|
+
)
|
215
|
+
end
|
216
|
+
relationship.destroy
|
217
|
+
end
|
185
218
|
end
|
186
219
|
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
220
|
+
# NOTE: we only update the parent's member_ids and prefer to not save the parent until all children are added.
|
221
|
+
# However, the locking appears not to be working, so as a workaround we will save each member as we go.
|
222
|
+
# This is a workaround for the fact that the lock manager doesn't seem to be working.
|
223
|
+
# To avoid having to reload the parent, we return the updated parent to the calling method.
|
224
|
+
def add_to_work(relationship:, parent_record:, ability:)
|
225
|
+
_child_entry, child_record = find_record(relationship.child_id, @importer_run_id)
|
226
|
+
raise "#{relationship} could not find child record" unless child_record
|
227
|
+
raise "Cannot add child collection (ID=#{relationship.child_id}) to parent work (ID=#{relationship.parent_id})" if child_record.collection? && parent_record.work?
|
228
|
+
|
229
|
+
ability.authorize!(:edit, child_record)
|
230
|
+
# We could do this outside of the loop, but that could lead to odd counter failures.
|
231
|
+
ability.authorize!(:edit, parent_record)
|
232
|
+
updated_parent = Bulkrax.object_factory.add_child_to_parent_work(
|
191
233
|
parent: parent_record,
|
192
234
|
child: child_record
|
193
235
|
)
|
236
|
+
# default is false for this... do not typically need to index file sets of child records
|
237
|
+
Bulkrax.object_factory.update_index_for_file_sets_of(resource: child_record) if update_child_records_works_file_sets?
|
238
|
+
relationship.destroy
|
239
|
+
|
240
|
+
updated_parent
|
194
241
|
end
|
195
242
|
|
196
243
|
def reschedule(**kargs)
|
@@ -6,6 +6,17 @@ module Bulkrax
|
|
6
6
|
|
7
7
|
def perform(entry, importer_run)
|
8
8
|
user = importer_run.importer.user
|
9
|
+
|
10
|
+
# When we delete, we don't go through the build process.
|
11
|
+
# However, we need the identifier to be set for the entry.
|
12
|
+
# This enables us to delete based on the ID, not just the source_identifier.
|
13
|
+
if entry.respond_to?(:build_metadata_for_delete) &&
|
14
|
+
entry.parsed_metadata.nil? &&
|
15
|
+
entry.raw_metadata.present?
|
16
|
+
entry.build_metadata_for_delete
|
17
|
+
entry.save!
|
18
|
+
end
|
19
|
+
|
9
20
|
entry.factory.delete(user)
|
10
21
|
|
11
22
|
# rubocop:disable Rails/SkipsModelValidations
|
@@ -16,8 +16,9 @@ module Bulkrax
|
|
16
16
|
|
17
17
|
def result(_parser, content)
|
18
18
|
return nil if self.excluded == true || Bulkrax.reserved_properties.include?(self.to)
|
19
|
+
# rubocop:disable Style/RedundantParentheses
|
19
20
|
return nil if self.if && (!self.if.is_a?(Array) && self.if.length != 2)
|
20
|
-
|
21
|
+
# rubocop:enable Style/RedundantParentheses
|
21
22
|
if self.if
|
22
23
|
return unless content.send(self.if[0], Regexp.new(self.if[1]))
|
23
24
|
end
|
@@ -5,6 +5,23 @@ module Bulkrax
|
|
5
5
|
# We do too much in these entry classes. We need to extract the common logic from the various
|
6
6
|
# entry models into a module that can be shared between them.
|
7
7
|
class CsvEntry < Entry # rubocop:disable Metrics/ClassLength
|
8
|
+
class CsvPathError < StandardError
|
9
|
+
def initialize(message)
|
10
|
+
super(message)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
class RecordNotFound < StandardError
|
15
|
+
def initialize(message)
|
16
|
+
super(message)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
class MissingMetadata < StandardError
|
21
|
+
def initialize(message)
|
22
|
+
super(message)
|
23
|
+
end
|
24
|
+
end
|
8
25
|
serialize :raw_metadata, Bulkrax::NormalizedJson
|
9
26
|
|
10
27
|
def self.fields_from_data(data)
|
@@ -16,7 +33,7 @@ module Bulkrax
|
|
16
33
|
# there's a risk that this reads the whole file into memory and could cause a memory leak
|
17
34
|
# we strip any special characters out of the headers. looking at you Excel
|
18
35
|
def self.read_data(path)
|
19
|
-
raise
|
36
|
+
raise CsvPathError, 'CSV path empty' if path.blank?
|
20
37
|
options = {
|
21
38
|
headers: true,
|
22
39
|
header_converters: ->(h) { h.to_s.gsub(/[^\w\d\. -]+/, '').strip.to_sym },
|
@@ -85,10 +102,18 @@ module Bulkrax
|
|
85
102
|
self.parsed_metadata
|
86
103
|
end
|
87
104
|
|
105
|
+
# limited metadata is needed for delete jobs
|
106
|
+
def build_metadata_for_delete
|
107
|
+
self.parsed_metadata = {}
|
108
|
+
establish_factory_class
|
109
|
+
add_ingested_metadata
|
110
|
+
self.parsed_metadata
|
111
|
+
end
|
112
|
+
|
88
113
|
def validate_record
|
89
|
-
raise
|
114
|
+
raise RecordNotFound, 'Record not found' if record.nil?
|
90
115
|
unless importerexporter.parser.required_elements?(record)
|
91
|
-
raise
|
116
|
+
raise MissingMetadata, "Missing required elements, missing element(s) are: "\
|
92
117
|
"#{importerexporter.parser.missing_elements(record).join(', ')}"
|
93
118
|
end
|
94
119
|
end
|
@@ -160,7 +185,7 @@ module Bulkrax
|
|
160
185
|
source_id = source_id.to_a if source_id.is_a?(ActiveTriples::Relation)
|
161
186
|
source_id = Array.wrap(source_id).first
|
162
187
|
self.parsed_metadata[source_identifier] = source_id
|
163
|
-
model_name =
|
188
|
+
model_name = Bulkrax.object_factory.model_name(resource: hyrax_record)
|
164
189
|
self.parsed_metadata[key_for_export('model')] = model_name
|
165
190
|
end
|
166
191
|
|
@@ -179,9 +204,13 @@ module Bulkrax
|
|
179
204
|
|
180
205
|
def build_relationship_metadata
|
181
206
|
# Includes all relationship methods for all exportable record types (works, Collections, FileSets)
|
207
|
+
# @TODO: this logic assumes that the relationships are all available via a method that can be called
|
208
|
+
# on the object. With Valkyrie, this is only true for Hyrax-based models which include the
|
209
|
+
# ArResource module. We need to consider reworking this logic into an object factory method
|
210
|
+
# that can handle different types of models.
|
182
211
|
relationship_methods = {
|
183
|
-
related_parents_parsed_mapping => %i[member_of_collection_ids member_of_work_ids in_work_ids],
|
184
|
-
related_children_parsed_mapping => %i[member_collection_ids member_work_ids file_set_ids]
|
212
|
+
related_parents_parsed_mapping => %i[member_of_collection_ids member_of_work_ids in_work_ids parent],
|
213
|
+
related_children_parsed_mapping => %i[member_collection_ids member_work_ids file_set_ids member_ids]
|
185
214
|
}
|
186
215
|
|
187
216
|
relationship_methods.each do |relationship_key, methods|
|
@@ -189,7 +218,9 @@ module Bulkrax
|
|
189
218
|
|
190
219
|
values = []
|
191
220
|
methods.each do |m|
|
192
|
-
|
221
|
+
value = hyrax_record.public_send(m) if hyrax_record.respond_to?(m)
|
222
|
+
value_id = value.try(:id)&.to_s || value # get the id if it's an object
|
223
|
+
values << value_id if value_id.present?
|
193
224
|
end
|
194
225
|
values = values.flatten.uniq
|
195
226
|
next if values.blank?
|
@@ -316,11 +347,11 @@ module Bulkrax
|
|
316
347
|
|
317
348
|
def build_thumbnail_files
|
318
349
|
return unless importerexporter.include_thumbnails
|
350
|
+
thumbnail = Bulkrax.object_factory.thumbnail_for(resource: hyrax_record)
|
351
|
+
return unless thumbnail
|
319
352
|
|
353
|
+
filenames = map_file_sets(Array.wrap(thumbnail))
|
320
354
|
thumbnail_mapping = 'thumbnail_file'
|
321
|
-
file_sets = Array.wrap(hyrax_record.thumbnail)
|
322
|
-
|
323
|
-
filenames = map_file_sets(file_sets)
|
324
355
|
handle_join_on_export(thumbnail_mapping, filenames, false)
|
325
356
|
end
|
326
357
|
|
@@ -237,8 +237,16 @@ module Bulkrax
|
|
237
237
|
# end
|
238
238
|
|
239
239
|
# If the import data is zipped, unzip it to this path
|
240
|
-
def importer_unzip_path
|
240
|
+
def importer_unzip_path(mkdir: false)
|
241
241
|
@importer_unzip_path ||= File.join(parser.base_path, "import_#{path_string}")
|
242
|
+
return @importer_unzip_path if Dir.exist?(@importer_unzip_path) || mkdir == true
|
243
|
+
|
244
|
+
# turns "tmp/imports/tenant/import_1_20250122035229_1" to "tmp/imports/tenant/import_1_20250122035229"
|
245
|
+
base_importer_unzip_path = @importer_unzip_path.split('_')[0...-1].join('_')
|
246
|
+
|
247
|
+
# If we don't have an existing unzip path, we'll try and find it.
|
248
|
+
# Just in case there are multiple paths, we sort by the number at the end of the path and get the last one
|
249
|
+
@importer_unzip_path = Dir.glob(base_importer_unzip_path + '*').sort_by { |path| path.split(base_importer_unzip_path).last[1..-1].to_i }.last
|
242
250
|
end
|
243
251
|
|
244
252
|
def errored_entries_csv_path
|
@@ -23,7 +23,7 @@ module Bulkrax
|
|
23
23
|
end
|
24
24
|
|
25
25
|
def latest?
|
26
|
-
# TODO: remove if
|
26
|
+
# TODO: remove if statement when we stop supporting Hyrax < 4
|
27
27
|
self.id == if Gem::Version.new(Rails::VERSION::STRING) >= Gem::Version.new('6.0.0')
|
28
28
|
self.class.where(statusable_id: self.statusable_id, statusable_type: self.statusable_type).order('id desc').pick(:id)
|
29
29
|
else
|
@@ -26,25 +26,38 @@ module Bulkrax
|
|
26
26
|
|
27
27
|
# Prepend the file_set id to ensure a unique filename and also one that is not longer than 255 characters
|
28
28
|
def filename(file_set)
|
29
|
-
return if
|
30
|
-
if
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
29
|
+
# return if there are no files on the fileset
|
30
|
+
return if Bulkrax.object_factory.original_file(fileset: file_set).blank?
|
31
|
+
|
32
|
+
fn = Bulkrax.object_factory.filename_for(fileset: file_set)
|
33
|
+
file = Bulkrax.object_factory.original_file(fileset: file_set)
|
34
|
+
ext = file_extension(file: file, filename: fn)
|
35
|
+
|
36
|
+
# Prepend the file_set id to ensure a unique filename
|
37
|
+
filename = File.basename(fn, ".*")
|
38
|
+
# Skip modification if file already has ID or we're in metadata-only mode
|
38
39
|
if fn.include?(file_set.id) || importerexporter.metadata_only?
|
39
|
-
filename
|
40
|
-
filename = fn if mime.to_s == ext_mime.to_s
|
40
|
+
# keep filename as is
|
41
41
|
else
|
42
|
-
filename = "#{file_set.id}_#{
|
43
|
-
filename = "#{file_set.id}_#{fn}" if mime.to_s == ext_mime.to_s
|
42
|
+
filename = "#{file_set.id}_#{filename}"
|
44
43
|
end
|
45
|
-
|
46
|
-
|
44
|
+
filename = ext.present? ? "#{filename}.#{ext}" : fn
|
45
|
+
|
46
|
+
# Remove extension, truncate and reattach
|
47
47
|
"#{File.basename(filename, ext)[0...(220 - ext.length)]}#{ext}"
|
48
48
|
end
|
49
|
+
|
50
|
+
##
|
51
|
+
# Generate the appropriate file extension based on the mime type of the file
|
52
|
+
# @return [String] the file extension for the given file
|
53
|
+
def file_extension(file:, filename:)
|
54
|
+
declared_mime = ::Marcel::MimeType.for(declared_type: file.mime_type)
|
55
|
+
# validate the declared mime type
|
56
|
+
declared_mime = ::Marcel::MimeType.for(name: filename) if declared_mime.nil? || declared_mime == "application/octet-stream"
|
57
|
+
# convert the mime type to a file extension
|
58
|
+
Mime::Type.lookup(declared_mime).symbol.to_s
|
59
|
+
rescue Mime::Type::InvalidMimeType
|
60
|
+
nil
|
61
|
+
end
|
49
62
|
end
|
50
63
|
end
|
@@ -2,6 +2,15 @@
|
|
2
2
|
|
3
3
|
module Bulkrax
|
4
4
|
module FileSetEntryBehavior
|
5
|
+
class FileNameError < StandardError
|
6
|
+
end
|
7
|
+
|
8
|
+
class OrphanFileSetError < StandardError
|
9
|
+
end
|
10
|
+
|
11
|
+
class FilePathError < StandardError
|
12
|
+
end
|
13
|
+
|
5
14
|
extend ActiveSupport::Concern
|
6
15
|
|
7
16
|
included do
|
@@ -21,11 +30,11 @@ module Bulkrax
|
|
21
30
|
|
22
31
|
path_to_file = parser.path_to_files(filename: filename)
|
23
32
|
|
24
|
-
parsed_metadata['file'][i] = path_to_file
|
33
|
+
parsed_metadata['file'][i] = path_to_file if path_to_file.present?
|
25
34
|
end
|
26
35
|
parsed_metadata['file'].delete('')
|
27
36
|
|
28
|
-
raise
|
37
|
+
raise FilePathError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?
|
29
38
|
|
30
39
|
parsed_metadata['file']
|
31
40
|
end
|
@@ -33,13 +42,13 @@ module Bulkrax
|
|
33
42
|
def validate_presence_of_filename!
|
34
43
|
return if parsed_metadata&.[](file_reference)&.map(&:present?)&.any?
|
35
44
|
|
36
|
-
raise
|
45
|
+
raise FileNameError, 'File set must have a filename'
|
37
46
|
end
|
38
47
|
|
39
48
|
def validate_presence_of_parent!
|
40
49
|
return if parsed_metadata[related_parents_parsed_mapping]&.map(&:present?)&.any?
|
41
50
|
|
42
|
-
raise
|
51
|
+
raise OrphanFileSetError, 'File set must be related to at least one work'
|
43
52
|
end
|
44
53
|
|
45
54
|
def parent_jobs
|
@@ -56,7 +56,7 @@ module Bulkrax
|
|
56
56
|
|
57
57
|
returning_value = false
|
58
58
|
File.open(filename) do |file|
|
59
|
-
mime_type = ::Marcel::MimeType.for(file)
|
59
|
+
mime_type = ::Marcel::MimeType.for(name: file)
|
60
60
|
returning_value = mime_type.include?('application/zip') || mime_type.include?('application/gzip')
|
61
61
|
end
|
62
62
|
returning_value
|
@@ -209,8 +209,11 @@ module Bulkrax
|
|
209
209
|
def rebuild_entries(types_array = nil)
|
210
210
|
index = 0
|
211
211
|
(types_array || %w[collection work file_set relationship]).each do |type|
|
212
|
-
# works are not
|
213
|
-
|
212
|
+
# works are not guaranteed to have Work in the type
|
213
|
+
if type.eql?('relationship')
|
214
|
+
ScheduleRelationshipsJob.set(wait: 5.minutes).perform_later(importer_id: importerexporter.id)
|
215
|
+
next
|
216
|
+
end
|
214
217
|
importer.entries.where(rebuild_entry_query(type, parser_fields['entry_statuses'])).find_each do |e|
|
215
218
|
seen[e.identifier] = true
|
216
219
|
e.status_info('Pending', importer.current_run)
|
@@ -432,7 +435,7 @@ module Bulkrax
|
|
432
435
|
|
433
436
|
Zip::File.open(file_to_unzip) do |zip_file|
|
434
437
|
zip_file.each do |entry|
|
435
|
-
entry_path = File.join(importer_unzip_path, entry.name)
|
438
|
+
entry_path = File.join(importer_unzip_path(mkdir: true), entry.name)
|
436
439
|
FileUtils.mkdir_p(File.dirname(entry_path))
|
437
440
|
zip_file.extract(entry, entry_path) unless File.exist?(entry_path)
|
438
441
|
end
|
@@ -440,12 +443,27 @@ module Bulkrax
|
|
440
443
|
end
|
441
444
|
|
442
445
|
def untar(file_to_untar)
|
443
|
-
Dir.mkdir(importer_unzip_path) unless File.directory?(importer_unzip_path)
|
446
|
+
Dir.mkdir(importer_unzip_path(mkdir: true)) unless File.directory?(importer_unzip_path(mkdir: true))
|
444
447
|
command = "tar -xzf #{Shellwords.escape(file_to_untar)} -C #{Shellwords.escape(importer_unzip_path)}"
|
445
448
|
result = system(command)
|
446
449
|
raise "Failed to extract #{file_to_untar}" unless result
|
447
450
|
end
|
448
451
|
|
452
|
+
# File names referenced in CSVs have spaces replaced with underscores
|
453
|
+
# @see Bulkrax::CsvParser#file_paths
|
454
|
+
def remove_spaces_from_filenames
|
455
|
+
files = Dir.glob(File.join(importer_unzip_path, 'files', '*'))
|
456
|
+
files_with_spaces = files.select { |f| f.split('/').last.match?(' ') }
|
457
|
+
return if files_with_spaces.blank?
|
458
|
+
|
459
|
+
files_with_spaces.map! { |path| Pathname.new(path) }
|
460
|
+
files_with_spaces.each do |path|
|
461
|
+
filename = path.basename
|
462
|
+
filename_without_spaces = filename.to_s.tr(' ', '_')
|
463
|
+
path.rename(File.join(path.dirname, filename_without_spaces))
|
464
|
+
end
|
465
|
+
end
|
466
|
+
|
449
467
|
def zip
|
450
468
|
FileUtils.mkdir_p(exporter_export_zip_path)
|
451
469
|
|