bulkrax 9.3.5 → 9.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -1
- data/app/assets/javascripts/bulkrax/application.js +2 -1
- data/app/assets/javascripts/bulkrax/bulkrax.js +13 -4
- data/app/assets/javascripts/bulkrax/bulkrax_utils.js +96 -0
- data/app/assets/javascripts/bulkrax/datatables.js +1 -0
- data/app/assets/javascripts/bulkrax/entries.js +17 -10
- data/app/assets/javascripts/bulkrax/importers.js.erb +9 -2
- data/app/assets/javascripts/bulkrax/importers_stepper.js +2420 -0
- data/app/assets/stylesheets/bulkrax/application.css +1 -1
- data/app/assets/stylesheets/bulkrax/stepper/_header.scss +83 -0
- data/app/assets/stylesheets/bulkrax/stepper/_mixins.scss +26 -0
- data/app/assets/stylesheets/bulkrax/stepper/_navigation.scss +103 -0
- data/app/assets/stylesheets/bulkrax/stepper/_responsive.scss +46 -0
- data/app/assets/stylesheets/bulkrax/stepper/_review.scss +92 -0
- data/app/assets/stylesheets/bulkrax/stepper/_settings.scss +106 -0
- data/app/assets/stylesheets/bulkrax/stepper/_success.scss +26 -0
- data/app/assets/stylesheets/bulkrax/stepper/_summary.scss +171 -0
- data/app/assets/stylesheets/bulkrax/stepper/_upload.scss +339 -0
- data/app/assets/stylesheets/bulkrax/stepper/_validation.scss +237 -0
- data/app/assets/stylesheets/bulkrax/stepper/_variables.scss +46 -0
- data/app/assets/stylesheets/bulkrax/stepper.scss +32 -0
- data/app/controllers/bulkrax/guided_imports_controller.rb +175 -0
- data/app/controllers/bulkrax/importers_controller.rb +28 -31
- data/app/controllers/concerns/bulkrax/guided_import_demo_scenarios.rb +201 -0
- data/app/controllers/concerns/bulkrax/importer_file_handler.rb +212 -0
- data/app/errors/bulkrax/unzip_error.rb +16 -0
- data/app/factories/bulkrax/object_factory.rb +3 -2
- data/app/factories/bulkrax/valkyrie_object_factory.rb +61 -17
- data/app/jobs/bulkrax/importer_job.rb +42 -4
- data/app/models/bulkrax/csv_entry.rb +27 -7
- data/app/models/bulkrax/entry.rb +4 -0
- data/app/models/bulkrax/importer.rb +27 -10
- data/app/models/concerns/bulkrax/has_matchers.rb +2 -2
- data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +6 -5
- data/app/parsers/bulkrax/application_parser.rb +63 -20
- data/app/parsers/bulkrax/bagit_parser.rb +12 -0
- data/app/parsers/bulkrax/csv_parser.rb +168 -25
- data/app/parsers/concerns/bulkrax/csv_parser/csv_template_generation.rb +73 -0
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb +133 -0
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb +282 -0
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_hierarchy.rb +96 -0
- data/app/services/bulkrax/csv_template/column_builder.rb +60 -0
- data/app/services/bulkrax/csv_template/column_descriptor.rb +58 -0
- data/app/services/bulkrax/csv_template/csv_builder.rb +83 -0
- data/app/services/bulkrax/csv_template/explanation_builder.rb +57 -0
- data/app/services/bulkrax/csv_template/field_analyzer.rb +56 -0
- data/app/services/bulkrax/csv_template/file_path_generator.rb +47 -0
- data/app/services/bulkrax/csv_template/file_validator.rb +68 -0
- data/app/services/bulkrax/csv_template/mapping_manager.rb +55 -0
- data/app/services/bulkrax/csv_template/model_loader.rb +50 -0
- data/app/services/bulkrax/csv_template/row_builder.rb +35 -0
- data/app/services/bulkrax/csv_template/schema_analyzer.rb +70 -0
- data/app/services/bulkrax/csv_template/split_formatter.rb +44 -0
- data/app/services/bulkrax/csv_template/value_determiner.rb +68 -0
- data/app/services/bulkrax/stepper_response_formatter.rb +347 -0
- data/app/services/bulkrax/validation_error_csv_builder.rb +99 -0
- data/app/validators/bulkrax/csv_row/child_reference.rb +56 -0
- data/app/validators/bulkrax/csv_row/circular_reference.rb +71 -0
- data/app/validators/bulkrax/csv_row/controlled_vocabulary.rb +74 -0
- data/app/validators/bulkrax/csv_row/duplicate_identifier.rb +63 -0
- data/app/validators/bulkrax/csv_row/missing_source_identifier.rb +31 -0
- data/app/validators/bulkrax/csv_row/parent_reference.rb +59 -0
- data/app/validators/bulkrax/csv_row/required_values.rb +64 -0
- data/app/views/bulkrax/guided_imports/new.html.erb +567 -0
- data/app/views/bulkrax/importers/index.html.erb +6 -1
- data/app/views/bulkrax/importers/new.html.erb +1 -1
- data/app/views/bulkrax/importers/show.html.erb +17 -1
- data/config/i18n-tasks.yml +195 -0
- data/config/locales/bulkrax.de.yml +508 -0
- data/config/locales/bulkrax.en.yml +463 -233
- data/config/locales/bulkrax.es.yml +508 -0
- data/config/locales/bulkrax.fr.yml +508 -0
- data/config/locales/bulkrax.it.yml +508 -0
- data/config/locales/bulkrax.pt-BR.yml +508 -0
- data/config/locales/bulkrax.zh.yml +507 -0
- data/config/routes.rb +10 -1
- data/lib/bulkrax/data/demo_scenarios.json +2235 -0
- data/lib/bulkrax/version.rb +1 -1
- data/lib/bulkrax.rb +31 -0
- metadata +56 -16
- data/app/services/bulkrax/sample_csv_service/column_builder.rb +0 -58
- data/app/services/bulkrax/sample_csv_service/column_descriptor.rb +0 -56
- data/app/services/bulkrax/sample_csv_service/csv_builder.rb +0 -82
- data/app/services/bulkrax/sample_csv_service/explanation_builder.rb +0 -51
- data/app/services/bulkrax/sample_csv_service/field_analyzer.rb +0 -54
- data/app/services/bulkrax/sample_csv_service/file_path_generator.rb +0 -16
- data/app/services/bulkrax/sample_csv_service/mapping_manager.rb +0 -36
- data/app/services/bulkrax/sample_csv_service/model_loader.rb +0 -40
- data/app/services/bulkrax/sample_csv_service/row_builder.rb +0 -33
- data/app/services/bulkrax/sample_csv_service/schema_analyzer.rb +0 -69
- data/app/services/bulkrax/sample_csv_service/split_formatter.rb +0 -42
- data/app/services/bulkrax/sample_csv_service/value_determiner.rb +0 -67
- data/app/services/bulkrax/sample_csv_service.rb +0 -78
- /data/{app/services → lib}/wings/custom_queries/find_by_source_identifier.rb +0 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Bulkrax
|
|
4
|
+
# Raised when a zip cannot be safely or meaningfully extracted during
|
|
5
|
+
# import. Covered scenarios include:
|
|
6
|
+
#
|
|
7
|
+
# - A single upload zip has no CSV at any level.
|
|
8
|
+
# - A single upload zip has multiple CSVs at its shallowest level
|
|
9
|
+
# (primary CSV cannot be determined).
|
|
10
|
+
# - A zip entry's name would escape the destination directory
|
|
11
|
+
# (Zip Slip: absolute paths, `..` traversal, etc.).
|
|
12
|
+
#
|
|
13
|
+
# Defined in its own file so Zeitwerk can autoload the constant by name
|
|
14
|
+
# from any parser or job that raises or rescues it.
|
|
15
|
+
class UnzipError < StandardError; end
|
|
16
|
+
end
|
|
@@ -70,14 +70,15 @@ module Bulkrax
|
|
|
70
70
|
properties.reject { |prop| Bulkrax.reserved_properties.include?(prop) }
|
|
71
71
|
end
|
|
72
72
|
|
|
73
|
-
|
|
73
|
+
# `admin_set_id` is unused here; it is accepted only so the signature matches the Valkyrie factory, which uses it to resolve flexible-metadata contexts.
|
|
74
|
+
def self.field_multi_value?(field:, model:, admin_set_id: nil) # rubocop:disable Lint/UnusedMethodArgument
|
|
74
75
|
return false unless field_supported?(field: field, model: model)
|
|
75
76
|
return false unless model.singleton_methods.include?(:properties)
|
|
76
77
|
|
|
77
78
|
model&.properties&.[](field)&.[]("multiple")
|
|
78
79
|
end
|
|
79
80
|
|
|
80
|
-
def self.field_supported?(field:, model:)
|
|
81
|
+
def self.field_supported?(field:, model:, admin_set_id: nil) # rubocop:disable Lint/UnusedMethodArgument
|
|
81
82
|
model.method_defined?(field) && model.properties[field].present?
|
|
82
83
|
end
|
|
83
84
|
|
|
@@ -120,11 +120,11 @@ module Bulkrax
|
|
|
120
120
|
save!(resource: resource, user: user)
|
|
121
121
|
end
|
|
122
122
|
|
|
123
|
-
def self.field_multi_value?(field:, model:)
|
|
124
|
-
return false unless field_supported?(field: field, model: model)
|
|
123
|
+
def self.field_multi_value?(field:, model:, admin_set_id: nil)
|
|
124
|
+
return false unless field_supported?(field: field, model: model, admin_set_id: admin_set_id)
|
|
125
125
|
|
|
126
126
|
if model.respond_to?(:schema)
|
|
127
|
-
schema = model
|
|
127
|
+
schema = cached_schema_for(klass: model, admin_set_id: admin_set_id)
|
|
128
128
|
dry_type = schema.key(field.to_sym)
|
|
129
129
|
return true if dry_type.respond_to?(:primitive) && dry_type.primitive == Array
|
|
130
130
|
|
|
@@ -134,9 +134,9 @@ module Bulkrax
|
|
|
134
134
|
end
|
|
135
135
|
end
|
|
136
136
|
|
|
137
|
-
def self.field_supported?(field:, model:)
|
|
137
|
+
def self.field_supported?(field:, model:, admin_set_id: nil)
|
|
138
138
|
if model.respond_to?(:schema)
|
|
139
|
-
schema_properties(model).include?(field)
|
|
139
|
+
schema_properties(klass: model, admin_set_id: admin_set_id).include?(field)
|
|
140
140
|
else
|
|
141
141
|
# We *might* have a Fedora object, so we need to consider that approach as
|
|
142
142
|
# well.
|
|
@@ -272,17 +272,34 @@ module Bulkrax
|
|
|
272
272
|
# rubocop:enable Metrics/ParameterLists
|
|
273
273
|
|
|
274
274
|
##
|
|
275
|
-
# Retrieve
|
|
276
|
-
#
|
|
275
|
+
# Retrieve schema property names for a model, respecting admin set contexts
|
|
276
|
+
# when using flexible metadata. Delegates context resolution to Hyrax so
|
|
277
|
+
# Bulkrax does not need to know about HYRAX_FLEXIBLE or contexts.
|
|
278
|
+
#
|
|
279
|
+
# @param klass [Class] the model class
|
|
280
|
+
# @param admin_set_id [String, nil] admin set used to resolve contexts
|
|
277
281
|
# @return [Array<String>]
|
|
278
|
-
def self.schema_properties(klass)
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
klass_key = klass.name
|
|
282
|
-
schema = klass.new.singleton_class.schema || klass.schema
|
|
283
|
-
@schema_properties_map[klass_key] = schema.map { |k| k.name.to_s } unless @schema_properties_map.key?(klass_key)
|
|
282
|
+
def self.schema_properties(klass:, admin_set_id: nil)
|
|
283
|
+
cached_schema_for(klass: klass, admin_set_id: admin_set_id).map { |k| k.name.to_s }
|
|
284
|
+
end
|
|
284
285
|
|
|
285
|
-
|
|
286
|
+
##
|
|
287
|
+
# Returns the schema for a model, memoized per (klass, admin_set_id) pair.
|
|
288
|
+
# Delegates to +Hyrax.schema_for+ when available so that context-gated
|
|
289
|
+
# properties are included without Bulkrax knowing about flexibility internals.
|
|
290
|
+
#
|
|
291
|
+
# @param klass [Class]
|
|
292
|
+
# @param admin_set_id [String, nil]
|
|
293
|
+
# @return [Dry::Types::Hash]
|
|
294
|
+
def self.cached_schema_for(klass:, admin_set_id: nil)
|
|
295
|
+
@cached_schema_map ||= {}
|
|
296
|
+
key = [klass.name, admin_set_id].compact.join('|')
|
|
297
|
+
@cached_schema_map[key] ||=
|
|
298
|
+
if admin_set_id.present? && defined?(Hyrax) && Hyrax.respond_to?(:schema_for)
|
|
299
|
+
Hyrax.schema_for(klass: klass, admin_set_id: admin_set_id)
|
|
300
|
+
else
|
|
301
|
+
klass.new.singleton_class.schema || klass.schema
|
|
302
|
+
end
|
|
286
303
|
end
|
|
287
304
|
|
|
288
305
|
def self.ordered_file_sets_for(object)
|
|
@@ -457,7 +474,9 @@ module Bulkrax
|
|
|
457
474
|
# TODO What do we return when the calculated form fails?
|
|
458
475
|
# @raise [StandardError] when there was a failure calling the transaction.
|
|
459
476
|
def perform_transaction_for(object:, attrs:)
|
|
460
|
-
|
|
477
|
+
admin_set_id = attrs[:admin_set_id] || attrs['admin_set_id'] ||
|
|
478
|
+
attributes[:admin_set_id] || attributes['admin_set_id']
|
|
479
|
+
form = Hyrax::Forms::ResourceForm.for(resource: object, admin_set_id: admin_set_id).prepopulate!
|
|
461
480
|
|
|
462
481
|
# TODO: Handle validations
|
|
463
482
|
form.validate(attrs)
|
|
@@ -474,13 +493,15 @@ module Bulkrax
|
|
|
474
493
|
end
|
|
475
494
|
|
|
476
495
|
##
|
|
477
|
-
# We accept attributes based on the model schema
|
|
496
|
+
# We accept attributes based on the model schema. Passes the admin set ID
|
|
497
|
+
# so that context-restricted properties are included in the permitted list.
|
|
478
498
|
#
|
|
479
499
|
# @return [Array<Symbol>]
|
|
480
500
|
def permitted_attributes
|
|
481
501
|
@permitted_attributes ||= (
|
|
482
502
|
base_permitted_attributes + if klass.respond_to?(:schema)
|
|
483
|
-
|
|
503
|
+
admin_set_id = attributes[:admin_set_id] || attributes['admin_set_id']
|
|
504
|
+
Bulkrax::ValkyrieObjectFactory.schema_properties(klass: klass, admin_set_id: admin_set_id)
|
|
484
505
|
else
|
|
485
506
|
klass.properties.keys.map(&:to_sym)
|
|
486
507
|
end
|
|
@@ -590,6 +611,29 @@ module Bulkrax
|
|
|
590
611
|
.symbolize_keys
|
|
591
612
|
|
|
592
613
|
attrs[:title] = [] if attrs[:title].blank?
|
|
614
|
+
attrs = convert_based_near_to_attributes(attrs)
|
|
615
|
+
attrs
|
|
616
|
+
end
|
|
617
|
+
|
|
618
|
+
# Hyrax's ResourceForm strips the plain `based_near` key during validation
|
|
619
|
+
# (BasedNearFieldBehavior#deserialize calls params.except('based_near')).
|
|
620
|
+
# Values must be passed as `based_near_attributes` — a numbered hash of
|
|
621
|
+
# { "0" => { "id" => uri, "_destroy" => "false" } } — so the populator
|
|
622
|
+
# can set them. Hyrax accepts any valid URI; note that only GeoNames URIs
|
|
623
|
+
# will resolve to a display label via LocationService.
|
|
624
|
+
def convert_based_near_to_attributes(attrs)
|
|
625
|
+
values = Array.wrap(attrs.delete(:based_near)).reject(&:blank?)
|
|
626
|
+
return attrs if values.empty?
|
|
627
|
+
|
|
628
|
+
invalid = values.reject { |v| v.to_s.match?(::URI::DEFAULT_PARSER.make_regexp) }
|
|
629
|
+
if invalid.any?
|
|
630
|
+
raise ::StandardError, "Invalid value(s) for location (based_near): #{invalid.join(', ')}. " \
|
|
631
|
+
"Values must be valid URIs (e.g. http://sws.geonames.org/5128581/)."
|
|
632
|
+
end
|
|
633
|
+
|
|
634
|
+
attrs[:based_near_attributes] = values.each_with_index.to_h do |uri, i|
|
|
635
|
+
[i.to_s, { "id" => uri.to_s, "_destroy" => "false" }]
|
|
636
|
+
end
|
|
593
637
|
attrs
|
|
594
638
|
end
|
|
595
639
|
end
|
|
@@ -13,7 +13,7 @@ module Bulkrax
|
|
|
13
13
|
import(importer, only_updates_since_last_import)
|
|
14
14
|
update_current_run_counters(importer)
|
|
15
15
|
schedule(importer) if importer.schedulable?
|
|
16
|
-
rescue ::CSV::MalformedCSVError => e
|
|
16
|
+
rescue ::CSV::MalformedCSVError, Bulkrax::UnzipError => e
|
|
17
17
|
importer.set_status_info(e)
|
|
18
18
|
end
|
|
19
19
|
|
|
@@ -26,11 +26,49 @@ module Bulkrax
|
|
|
26
26
|
importer.import_objects
|
|
27
27
|
end
|
|
28
28
|
|
|
29
|
+
# Populates `importer_unzip_path` with the uploaded file(s), leaving
|
|
30
|
+
# the working directory in the shape each parser expects.
|
|
31
|
+
#
|
|
32
|
+
# Dispatch by parser capability rather than class name:
|
|
33
|
+
# - CsvParser (and subclasses that replicate its shape) implements
|
|
34
|
+
# `#unzip_with_primary_csv` and `#unzip_attachments_only`, which
|
|
35
|
+
# place the primary CSV at root and attachments under `files/`.
|
|
36
|
+
# - Other parsers (XML, raw BagIt) inherit the base-class `#unzip`,
|
|
37
|
+
# which extracts the zip verbatim.
|
|
38
|
+
# - The separate attachments-zip flow is CSV-only (guided import is
|
|
39
|
+
# the only UI that produces it).
|
|
40
|
+
#
|
|
41
|
+
# A retry of this job gets a clean working directory: any prior
|
|
42
|
+
# extraction state from an earlier attempt is wiped, so nothing runs
|
|
43
|
+
# against partially-populated state.
|
|
29
44
|
def unzip_imported_file(parser)
|
|
30
|
-
return unless parser.file?
|
|
45
|
+
return unless parser.file?
|
|
31
46
|
|
|
32
|
-
|
|
33
|
-
|
|
47
|
+
reset_unzip_path(parser)
|
|
48
|
+
|
|
49
|
+
import_file_path = parser.parser_fields['import_file_path']
|
|
50
|
+
attachments_zip_path = parser.parser_fields['attachments_zip_path']
|
|
51
|
+
|
|
52
|
+
if parser.zip?
|
|
53
|
+
if parser.respond_to?(:unzip_with_primary_csv)
|
|
54
|
+
parser.unzip_with_primary_csv(import_file_path)
|
|
55
|
+
else
|
|
56
|
+
parser.unzip(import_file_path)
|
|
57
|
+
end
|
|
58
|
+
elsif parser.respond_to?(:unzip_attachments_only) && parser.zip_file?(attachments_zip_path)
|
|
59
|
+
parser.copy_file(import_file_path)
|
|
60
|
+
parser.unzip_attachments_only(attachments_zip_path)
|
|
61
|
+
else
|
|
62
|
+
parser.copy_file(import_file_path)
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
parser.remove_spaces_from_filenames if parser.respond_to?(:remove_spaces_from_filenames)
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def reset_unzip_path(parser)
|
|
69
|
+
path = parser.importer_unzip_path
|
|
70
|
+
FileUtils.rm_rf(path) if Dir.exist?(path)
|
|
71
|
+
FileUtils.mkdir_p(path)
|
|
34
72
|
end
|
|
35
73
|
|
|
36
74
|
def update_current_run_counters(importer)
|
|
@@ -45,7 +45,12 @@ module Bulkrax
|
|
|
45
45
|
encoding: 'utf-8'
|
|
46
46
|
}.merge(csv_read_data_options)
|
|
47
47
|
|
|
48
|
-
results =
|
|
48
|
+
results = if path.respond_to?(:read)
|
|
49
|
+
path.rewind if path.respond_to?(:rewind)
|
|
50
|
+
CSV.parse(path.read, **options)
|
|
51
|
+
else
|
|
52
|
+
CSV.read(path, **options)
|
|
53
|
+
end
|
|
49
54
|
csv_wrapper_class.new(results)
|
|
50
55
|
end
|
|
51
56
|
|
|
@@ -83,9 +88,10 @@ module Bulkrax
|
|
|
83
88
|
# model has to be separated so that it doesn't get mistranslated by to_h
|
|
84
89
|
raw_data = data.to_h
|
|
85
90
|
raw_data[:model] = data[:model] if data[:model].present?
|
|
86
|
-
# If the
|
|
87
|
-
#
|
|
88
|
-
raw_data[:parents] = raw_data[
|
|
91
|
+
# If the parents/children field mapping uses a custom column name, alias it to the standard key
|
|
92
|
+
# so downstream code can find it regardless of what the CSV column is named.
|
|
93
|
+
raw_data[:parents] = raw_data[parser.related_parents_raw_mapping.to_sym] if parser.related_parents_raw_mapping.present? && raw_data.key?(parser.related_parents_raw_mapping.to_sym) && parser.related_parents_raw_mapping != 'parents'
|
|
94
|
+
raw_data[:children] = raw_data[parser.related_children_raw_mapping.to_sym] if parser.related_children_raw_mapping.present? && raw_data.key?(parser.related_children_raw_mapping.to_sym) && parser.related_children_raw_mapping != 'children'
|
|
89
95
|
return raw_data
|
|
90
96
|
end
|
|
91
97
|
|
|
@@ -416,18 +422,32 @@ module Bulkrax
|
|
|
416
422
|
self.collection_ids
|
|
417
423
|
end
|
|
418
424
|
|
|
419
|
-
# If only filename is given, construct the path (/files/my_file)
|
|
425
|
+
# If only filename is given, construct the path (/files/my_file).
|
|
426
|
+
# If file contains a path separator (e.g. attachments/cat_scan.jpg), resolve relative to the CSV's directory.
|
|
420
427
|
def path_to_file(file)
|
|
421
|
-
# return if we already have the full file path
|
|
422
428
|
return file if File.exist?(file)
|
|
429
|
+
|
|
430
|
+
# Relative path: resolve from CSV's directory (allows arbitrary subdirectory names, not just "files")
|
|
431
|
+
return resolve_relative_file_path(file) if file.include?('/')
|
|
432
|
+
|
|
433
|
+
# Bare filename: use legacy files/ directory for backward compatibility and round-tripping
|
|
423
434
|
path = importerexporter.parser.path_to_files
|
|
435
|
+
raise "Could not determine path to files directory. Ensure the import package contains a zip or a valid import_file_path." if path.nil?
|
|
436
|
+
|
|
424
437
|
f = File.join(path, file)
|
|
425
438
|
return f if File.exist?(f)
|
|
426
|
-
raise "File #{f}
|
|
439
|
+
raise "File not found: #{f}. Check the file column in your CSV and ensure the file exists in the import package or path_to_files directory."
|
|
427
440
|
end
|
|
428
441
|
|
|
429
442
|
private
|
|
430
443
|
|
|
444
|
+
def resolve_relative_file_path(file)
|
|
445
|
+
base = File.dirname(importerexporter.parser.import_file_path)
|
|
446
|
+
candidate = File.join(base, file)
|
|
447
|
+
return candidate if File.exist?(candidate)
|
|
448
|
+
raise "File not found: #{candidate}. Check the file path in your CSV and ensure the file exists in the import package or directory."
|
|
449
|
+
end
|
|
450
|
+
|
|
431
451
|
def map_file_sets(file_sets)
|
|
432
452
|
# rubocop:disable Rails/Presence
|
|
433
453
|
file_sets.map { |fs| filename(fs).to_s if filename(fs).present? }.compact
|
data/app/models/bulkrax/entry.rb
CHANGED
|
@@ -169,6 +169,30 @@ module Bulkrax
|
|
|
169
169
|
import_file_path if original_file?
|
|
170
170
|
end
|
|
171
171
|
|
|
172
|
+
# Returns all available original files (CSV and ZIP if present)
|
|
173
|
+
# @return [Array<Hash>] Array of hashes with :path and :name keys
|
|
174
|
+
def original_files
|
|
175
|
+
files = []
|
|
176
|
+
|
|
177
|
+
if import_file_path && File.exist?(import_file_path)
|
|
178
|
+
files << {
|
|
179
|
+
path: import_file_path,
|
|
180
|
+
name: File.basename(import_file_path),
|
|
181
|
+
type: :csv
|
|
182
|
+
}
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
if parser_fields['attachments_zip_path'] && File.exist?(parser_fields['attachments_zip_path'])
|
|
186
|
+
files << {
|
|
187
|
+
path: parser_fields['attachments_zip_path'],
|
|
188
|
+
name: File.basename(parser_fields['attachments_zip_path']),
|
|
189
|
+
type: :zip
|
|
190
|
+
}
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
files
|
|
194
|
+
end
|
|
195
|
+
|
|
172
196
|
def replace_files
|
|
173
197
|
self.parser_fields['replace_files']
|
|
174
198
|
end
|
|
@@ -241,17 +265,10 @@ module Bulkrax
|
|
|
241
265
|
# [['Single Metadata File for all works', 'single'], ['Multiple Files, one per Work', 'multi']]
|
|
242
266
|
# end
|
|
243
267
|
|
|
244
|
-
# If the import data is zipped, unzip it to this path
|
|
245
268
|
def importer_unzip_path(mkdir: false)
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
# turns "tmp/imports/tenant/import_1_20250122035229_1" to "tmp/imports/tenant/import_1_20250122035229"
|
|
250
|
-
base_importer_unzip_path = @importer_unzip_path.split('_')[0...-1].join('_')
|
|
251
|
-
|
|
252
|
-
# If we don't have an existing unzip path, we'll try and find it.
|
|
253
|
-
# Just in case there are multiple paths, we sort by the number at the end of the path and get the last one
|
|
254
|
-
@importer_unzip_path = Dir.glob(base_importer_unzip_path + '*').sort_by { |path| path.split(base_importer_unzip_path).last[1..-1].to_i }.last
|
|
269
|
+
path = File.join(parser.base_path, "import_#{path_string}")
|
|
270
|
+
FileUtils.mkdir_p(path) if mkdir
|
|
271
|
+
path
|
|
255
272
|
end
|
|
256
273
|
|
|
257
274
|
def errored_entries_csv_path
|
|
@@ -131,7 +131,7 @@ module Bulkrax
|
|
|
131
131
|
return false if excluded?(field)
|
|
132
132
|
return true if supported_bulkrax_fields.include?(field)
|
|
133
133
|
|
|
134
|
-
Bulkrax.object_factory.field_supported?(field: field, model: factory_class)
|
|
134
|
+
Bulkrax.object_factory.field_supported?(field: field, model: factory_class, admin_set_id: importerexporter.try(:admin_set_id))
|
|
135
135
|
end
|
|
136
136
|
|
|
137
137
|
def supported_bulkrax_fields
|
|
@@ -145,7 +145,7 @@ module Bulkrax
|
|
|
145
145
|
return true if fields_that_are_always_singular.include?(field.to_s)
|
|
146
146
|
return false if fields_that_are_always_multiple.include?(field.to_s)
|
|
147
147
|
|
|
148
|
-
Bulkrax.object_factory.field_multi_value?(field: field, model: factory_class)
|
|
148
|
+
Bulkrax.object_factory.field_multi_value?(field: field, model: factory_class, admin_set_id: importerexporter.try(:admin_set_id))
|
|
149
149
|
end
|
|
150
150
|
|
|
151
151
|
def fields_that_are_always_multiple
|
|
@@ -51,15 +51,16 @@ module Bulkrax
|
|
|
51
51
|
# Is this a zip file?
|
|
52
52
|
def zip?
|
|
53
53
|
filename = parser_fields&.[]('import_file_path')
|
|
54
|
-
return false unless filename
|
|
55
|
-
|
|
54
|
+
return false unless filename && File.file?(filename)
|
|
55
|
+
zip_file?(filename)
|
|
56
|
+
end
|
|
56
57
|
|
|
57
|
-
|
|
58
|
+
def zip_file?(filename)
|
|
59
|
+
return false unless filename && File.file?(filename)
|
|
58
60
|
File.open(filename) do |file|
|
|
59
61
|
mime_type = ::Marcel::MimeType.for(name: file)
|
|
60
|
-
|
|
62
|
+
mime_type.include?('application/zip') || mime_type.include?('application/gzip')
|
|
61
63
|
end
|
|
62
|
-
returning_value
|
|
63
64
|
end
|
|
64
65
|
end
|
|
65
66
|
end
|
|
@@ -12,7 +12,7 @@ module Bulkrax
|
|
|
12
12
|
:seen, :increment_counters, :parser_fields, :user, :keys_without_numbers,
|
|
13
13
|
:key_without_numbers, :status, :set_status_info, :status_info, :status_at,
|
|
14
14
|
:exporter_export_path, :exporter_export_zip_path, :importer_unzip_path, :validate_only,
|
|
15
|
-
:zip?, :file?, :remove_and_rerun,
|
|
15
|
+
:zip?, :file?, :remove_and_rerun, :zip_file?,
|
|
16
16
|
to: :importerexporter
|
|
17
17
|
|
|
18
18
|
# @todo Convert to `class_attribute :parser_fields, default: {}`
|
|
@@ -430,18 +430,77 @@ module Bulkrax
|
|
|
430
430
|
zip
|
|
431
431
|
end
|
|
432
432
|
|
|
433
|
+
# Extracts a zip verbatim into {#importer_unzip_path}, preserving the zip's
|
|
434
|
+
# internal structure. Filters macOS junk (`__MACOSX/`, `.DS_Store`, `._*`).
|
|
435
|
+
# Parser subclasses that need to interpret the zip's structure (e.g.
|
|
436
|
+
# {Bulkrax::CsvParser#unzip_with_primary_csv}) should call a more specific
|
|
437
|
+
# method rather than this one.
|
|
433
438
|
def unzip(file_to_unzip)
|
|
434
439
|
return untar(file_to_unzip) if file_to_unzip.end_with?('.tar.gz')
|
|
435
440
|
|
|
441
|
+
dest_dir = importer_unzip_path(mkdir: true)
|
|
436
442
|
Zip::File.open(file_to_unzip) do |zip_file|
|
|
437
443
|
zip_file.each do |entry|
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
444
|
+
next unless entry.file?
|
|
445
|
+
next if macos_junk_entry?(entry.name)
|
|
446
|
+
reject_unsafe_entry!(entry.name)
|
|
447
|
+
dest_path = safe_extract_path(dest_dir, entry.name)
|
|
448
|
+
FileUtils.mkdir_p(File.dirname(dest_path))
|
|
449
|
+
next if File.exist?(dest_path)
|
|
450
|
+
extract_zip_entry(zip_file, entry, dest_dir, entry.name, dest_path)
|
|
441
451
|
end
|
|
442
452
|
end
|
|
443
453
|
end
|
|
444
454
|
|
|
455
|
+
# rubyzip 2.x: extract(entry, absolute_dest_path)
|
|
456
|
+
# rubyzip 3.x: extract(entry, relative_name, destination_directory: dir)
|
|
457
|
+
#
|
|
458
|
+
# Callers are responsible for passing a `dest_path` produced by
|
|
459
|
+
# {#safe_extract_path} so the write can't escape `dest_dir`.
|
|
460
|
+
def extract_zip_entry(zip_file, entry, dest_dir, relative_name, dest_path)
|
|
461
|
+
if zip_file.method(:extract).arity == 2
|
|
462
|
+
zip_file.extract(entry, dest_path)
|
|
463
|
+
else
|
|
464
|
+
zip_file.extract(entry, relative_name, destination_directory: dest_dir)
|
|
465
|
+
end
|
|
466
|
+
end
|
|
467
|
+
|
|
468
|
+
def macos_junk_entry?(name)
|
|
469
|
+
name.start_with?('__MACOSX/') || name.split('/').any? { |part| part == '.DS_Store' || part.start_with?('._') }
|
|
470
|
+
end
|
|
471
|
+
|
|
472
|
+
# Zip Slip preflight — reject entries whose names are obviously unsafe
|
|
473
|
+
# (absolute paths, `..` segments) before we touch the filesystem.
|
|
474
|
+
# {#safe_extract_path} is the final line of defense; this check just
|
|
475
|
+
# fails fast with a clear message.
|
|
476
|
+
#
|
|
477
|
+
# @raise [Bulkrax::UnzipError] if the entry name is unsafe
|
|
478
|
+
def reject_unsafe_entry!(name)
|
|
479
|
+
return unless name.start_with?('/') || name.split('/').include?('..')
|
|
480
|
+
raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: name)
|
|
481
|
+
end
|
|
482
|
+
|
|
483
|
+
# Zip Slip chokepoint. Resolves `relative_dest` against `dest_dir` and
|
|
484
|
+
# returns the absolute destination path — but only if it stays inside
|
|
485
|
+
# `dest_dir`. Callers must use this value rather than building their
|
|
486
|
+
# own path with `File.join`, so the path returned is always safe by
|
|
487
|
+
# construction.
|
|
488
|
+
#
|
|
489
|
+
# @return [String] absolute destination path, validated to be inside `dest_dir`
|
|
490
|
+
# @raise [Bulkrax::UnzipError] if `relative_dest` escapes `dest_dir`
|
|
491
|
+
def safe_extract_path(dest_dir, relative_dest)
|
|
492
|
+
expanded_dest_dir = File.expand_path(dest_dir)
|
|
493
|
+
dest_path = File.expand_path(relative_dest.to_s, expanded_dest_dir)
|
|
494
|
+
return dest_path if dest_path == expanded_dest_dir
|
|
495
|
+
return dest_path if dest_path.start_with?("#{expanded_dest_dir}#{File::SEPARATOR}")
|
|
496
|
+
raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: relative_dest)
|
|
497
|
+
end
|
|
498
|
+
|
|
499
|
+
def copy_file(file_to_copy)
|
|
500
|
+
destination = File.join(importer_unzip_path(mkdir: true), File.basename(file_to_copy))
|
|
501
|
+
FileUtils.cp(file_to_copy, destination)
|
|
502
|
+
end
|
|
503
|
+
|
|
445
504
|
def untar(file_to_untar)
|
|
446
505
|
Dir.mkdir(importer_unzip_path(mkdir: true)) unless File.directory?(importer_unzip_path(mkdir: true))
|
|
447
506
|
command = "tar -xzf #{Shellwords.escape(file_to_untar)} -C #{Shellwords.escape(importer_unzip_path)}"
|
|
@@ -449,21 +508,6 @@ module Bulkrax
|
|
|
449
508
|
raise "Failed to extract #{file_to_untar}" unless result
|
|
450
509
|
end
|
|
451
510
|
|
|
452
|
-
# File names referenced in CSVs have spaces replaced with underscores
|
|
453
|
-
# @see Bulkrax::CsvParser#file_paths
|
|
454
|
-
def remove_spaces_from_filenames
|
|
455
|
-
files = Dir.glob(File.join(importer_unzip_path, 'files', '*'))
|
|
456
|
-
files_with_spaces = files.select { |f| f.split('/').last.match?(' ') }
|
|
457
|
-
return if files_with_spaces.blank?
|
|
458
|
-
|
|
459
|
-
files_with_spaces.map! { |path| Pathname.new(path) }
|
|
460
|
-
files_with_spaces.each do |path|
|
|
461
|
-
filename = path.basename
|
|
462
|
-
filename_without_spaces = filename.to_s.tr(' ', '_')
|
|
463
|
-
path.rename(File.join(path.dirname, filename_without_spaces))
|
|
464
|
-
end
|
|
465
|
-
end
|
|
466
|
-
|
|
467
511
|
def zip
|
|
468
512
|
FileUtils.mkdir_p(exporter_export_zip_path)
|
|
469
513
|
|
|
@@ -489,7 +533,6 @@ module Bulkrax
|
|
|
489
533
|
|
|
490
534
|
# @return [String]
|
|
491
535
|
def real_import_file_path
|
|
492
|
-
return importer_unzip_path if file? && zip?
|
|
493
536
|
parser_fields['import_file_path']
|
|
494
537
|
end
|
|
495
538
|
end
|
|
@@ -25,6 +25,18 @@ unless ENV.fetch('BULKRAX_NO_BAGIT', 'false').to_s == 'true'
|
|
|
25
25
|
@path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
+
# BagIt archives are not CSV imports: they don't contain a primary
|
|
29
|
+
# CSV at a shallowest level, and their structure (bagit.txt + data/
|
|
30
|
+
# + manifests) must be preserved verbatim. Override both CSV-flavored
|
|
31
|
+
# unzip entry points to use the base-class verbatim extraction.
|
|
32
|
+
def unzip_with_primary_csv(file_to_unzip)
|
|
33
|
+
unzip(file_to_unzip)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def unzip_attachments_only(file_to_unzip)
|
|
37
|
+
unzip(file_to_unzip)
|
|
38
|
+
end
|
|
39
|
+
|
|
28
40
|
# Take a random sample of 10 metadata_paths and work out the import fields from that
|
|
29
41
|
def import_fields
|
|
30
42
|
raise StandardError, 'No metadata files were found' if metadata_paths.blank?
|