bulkrax 9.4.0 → 9.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -2
- data/app/assets/javascripts/bulkrax/datatables.js +43 -8
- data/app/assets/javascripts/bulkrax/importers_stepper.js +221 -26
- data/app/assets/stylesheets/bulkrax/stepper/_review.scss +14 -12
- data/app/controllers/bulkrax/entries_controller.rb +2 -2
- data/app/controllers/bulkrax/exporters_controller.rb +3 -3
- data/app/controllers/bulkrax/guided_imports_controller.rb +3 -1
- data/app/controllers/bulkrax/importers_controller.rb +5 -5
- data/app/controllers/concerns/bulkrax/importer_file_handler.rb +1 -6
- data/app/errors/bulkrax/unzip_error.rb +16 -0
- data/app/jobs/bulkrax/importer_job.rb +40 -9
- data/app/matchers/bulkrax/application_matcher.rb +5 -6
- data/app/models/bulkrax/csv_entry.rb +1 -1
- data/app/models/bulkrax/importer.rb +3 -16
- data/app/parsers/bulkrax/application_parser.rb +50 -33
- data/app/parsers/bulkrax/bagit_parser.rb +12 -0
- data/app/parsers/bulkrax/csv_parser.rb +163 -49
- data/app/parsers/concerns/bulkrax/csv_parser/csv_template_generation.rb +4 -1
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb +10 -8
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb +69 -36
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_hierarchy.rb +9 -7
- data/app/services/bulkrax/csv_template/file_validator.rb +1 -1
- data/app/services/bulkrax/csv_template/mapping_manager.rb +15 -6
- data/app/services/bulkrax/csv_template/split_formatter.rb +10 -3
- data/app/services/bulkrax/split_pattern_coercion.rb +42 -0
- data/app/services/bulkrax/stepper_response_formatter.rb +2 -1
- data/app/services/bulkrax/validation_error_csv_builder.rb +36 -12
- data/app/validators/bulkrax/csv_row/child_reference.rb +2 -1
- data/app/validators/bulkrax/csv_row/parent_reference.rb +1 -1
- data/app/validators/bulkrax/csv_row/required_values.rb +17 -3
- data/app/views/bulkrax/exporters/edit.html.erb +1 -1
- data/app/views/bulkrax/exporters/index.html.erb +3 -1
- data/app/views/bulkrax/exporters/new.html.erb +1 -1
- data/app/views/bulkrax/exporters/show.html.erb +1 -1
- data/app/views/bulkrax/guided_imports/new.html.erb +7 -0
- data/app/views/bulkrax/importers/_edit_item_buttons.html.erb +3 -3
- data/app/views/bulkrax/importers/index.html.erb +2 -0
- data/app/views/bulkrax/importers/new.html.erb +1 -1
- data/app/views/bulkrax/importers/show.html.erb +3 -1
- data/app/views/bulkrax/shared/_datatable_i18n.html.erb +3 -0
- data/config/locales/bulkrax.de.yml +95 -2
- data/config/locales/bulkrax.en.yml +58 -2
- data/config/locales/bulkrax.es.yml +95 -2
- data/config/locales/bulkrax.fr.yml +95 -2
- data/config/locales/bulkrax.it.yml +95 -2
- data/config/locales/bulkrax.pt-BR.yml +95 -2
- data/config/locales/bulkrax.zh.yml +96 -2
- data/db/migrate/20260424081537_remove_parents_from_bulkrax_importer_runs.rb +9 -0
- data/lib/bulkrax/version.rb +1 -1
- data/lib/bulkrax.rb +15 -1
- metadata +8 -4
|
@@ -13,7 +13,7 @@ module Bulkrax
|
|
|
13
13
|
import(importer, only_updates_since_last_import)
|
|
14
14
|
update_current_run_counters(importer)
|
|
15
15
|
schedule(importer) if importer.schedulable?
|
|
16
|
-
rescue ::CSV::MalformedCSVError => e
|
|
16
|
+
rescue ::CSV::MalformedCSVError, Bulkrax::UnzipError => e
|
|
17
17
|
importer.set_status_info(e)
|
|
18
18
|
end
|
|
19
19
|
|
|
@@ -26,18 +26,49 @@ module Bulkrax
|
|
|
26
26
|
importer.import_objects
|
|
27
27
|
end
|
|
28
28
|
|
|
29
|
+
# Populates `importer_unzip_path` with the uploaded file(s), leaving
|
|
30
|
+
# the working directory in the shape each parser expects.
|
|
31
|
+
#
|
|
32
|
+
# Dispatch by parser capability rather than class name:
|
|
33
|
+
# - CsvParser (and subclasses that replicate its shape) implements
|
|
34
|
+
# `#unzip_with_primary_csv` and `#unzip_attachments_only`, which
|
|
35
|
+
# place the primary CSV at root and attachments under `files/`.
|
|
36
|
+
# - Other parsers (XML, raw BagIt) inherit the base-class `#unzip`,
|
|
37
|
+
# which extracts the zip verbatim.
|
|
38
|
+
# - The separate attachments-zip flow is CSV-only (guided import is
|
|
39
|
+
# the only UI that produces it).
|
|
40
|
+
#
|
|
41
|
+
# A retry of this job gets a clean working directory: any prior
|
|
42
|
+
# extraction state from an earlier attempt is wiped, so nothing runs
|
|
43
|
+
# against partially-populated state.
|
|
29
44
|
def unzip_imported_file(parser)
|
|
30
45
|
return unless parser.file?
|
|
46
|
+
|
|
47
|
+
reset_unzip_path(parser)
|
|
48
|
+
|
|
49
|
+
import_file_path = parser.parser_fields['import_file_path']
|
|
50
|
+
attachments_zip_path = parser.parser_fields['attachments_zip_path']
|
|
51
|
+
|
|
31
52
|
if parser.zip?
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
parser.copy_file(
|
|
39
|
-
parser.
|
|
53
|
+
if parser.respond_to?(:unzip_with_primary_csv)
|
|
54
|
+
parser.unzip_with_primary_csv(import_file_path)
|
|
55
|
+
else
|
|
56
|
+
parser.unzip(import_file_path)
|
|
57
|
+
end
|
|
58
|
+
elsif parser.respond_to?(:unzip_attachments_only) && parser.zip_file?(attachments_zip_path)
|
|
59
|
+
parser.copy_file(import_file_path)
|
|
60
|
+
parser.unzip_attachments_only(attachments_zip_path)
|
|
61
|
+
else
|
|
62
|
+
parser.copy_file(import_file_path)
|
|
40
63
|
end
|
|
64
|
+
|
|
65
|
+
parser.remove_spaces_from_filenames if parser.respond_to?(:remove_spaces_from_filenames)
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def reset_unzip_path(parser)
|
|
69
|
+
path = parser.importer_unzip_path
|
|
70
|
+
FileUtils.rm_rf(path) if Dir.exist?(path)
|
|
71
|
+
FileUtils.mkdir_p(path)
|
|
41
72
|
end
|
|
42
73
|
|
|
43
74
|
def update_current_run_counters(importer)
|
|
@@ -33,12 +33,11 @@ module Bulkrax
|
|
|
33
33
|
end
|
|
34
34
|
|
|
35
35
|
def process_split
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
end
|
|
36
|
+
pattern = Bulkrax::SplitPatternCoercion.coerce(self.split)
|
|
37
|
+
return unless pattern
|
|
38
|
+
|
|
39
|
+
@result = @result.split(pattern)
|
|
40
|
+
@result = @result.map(&:strip).select(&:present?) unless self.split.is_a?(TrueClass)
|
|
42
41
|
end
|
|
43
42
|
|
|
44
43
|
def process_parse
|
|
@@ -165,7 +165,7 @@ module Bulkrax
|
|
|
165
165
|
def add_file
|
|
166
166
|
self.parsed_metadata['file'] ||= []
|
|
167
167
|
if record['file']&.is_a?(String)
|
|
168
|
-
self.parsed_metadata['file'] = record['file'].split(Bulkrax.
|
|
168
|
+
self.parsed_metadata['file'] = record['file'].split(Bulkrax::CsvParser.file_split_pattern)
|
|
169
169
|
elsif record['file'].is_a?(Array)
|
|
170
170
|
self.parsed_metadata['file'] = record['file']
|
|
171
171
|
end
|
|
@@ -266,22 +266,9 @@ module Bulkrax
|
|
|
266
266
|
# end
|
|
267
267
|
|
|
268
268
|
def importer_unzip_path(mkdir: false)
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
FileUtils.mkdir_p(unzip_dir) if mkdir
|
|
273
|
-
return unzip_dir
|
|
274
|
-
end
|
|
275
|
-
|
|
276
|
-
@importer_unzip_path ||= File.join(parser.base_path, "import_#{path_string}")
|
|
277
|
-
return @importer_unzip_path if Dir.exist?(@importer_unzip_path) || mkdir == true
|
|
278
|
-
|
|
279
|
-
# turns "tmp/imports/tenant/import_1_20250122035229_1" to "tmp/imports/tenant/import_1_20250122035229"
|
|
280
|
-
base_importer_unzip_path = @importer_unzip_path.split('_')[0...-1].join('_')
|
|
281
|
-
|
|
282
|
-
# If we don't have an existing unzip path, we'll try and find it.
|
|
283
|
-
# Just in case there are multiple paths, we sort by the number at the end of the path and get the last one
|
|
284
|
-
@importer_unzip_path = Dir.glob(base_importer_unzip_path + '*').sort_by { |path| path.split(base_importer_unzip_path).last[1..-1].to_i }.last
|
|
269
|
+
path = File.join(parser.base_path, "import_#{path_string}")
|
|
270
|
+
FileUtils.mkdir_p(path) if mkdir
|
|
271
|
+
path
|
|
285
272
|
end
|
|
286
273
|
|
|
287
274
|
def errored_entries_csv_path
|
|
@@ -430,39 +430,72 @@ module Bulkrax
|
|
|
430
430
|
zip
|
|
431
431
|
end
|
|
432
432
|
|
|
433
|
+
# Extracts a zip verbatim into {#importer_unzip_path}, preserving the zip's
|
|
434
|
+
# internal structure. Filters macOS junk (`__MACOSX/`, `.DS_Store`, `._*`).
|
|
435
|
+
# Parser subclasses that need to interpret the zip's structure (e.g.
|
|
436
|
+
# {Bulkrax::CsvParser#unzip_with_primary_csv}) should call a more specific
|
|
437
|
+
# method rather than this one.
|
|
433
438
|
def unzip(file_to_unzip)
|
|
434
439
|
return untar(file_to_unzip) if file_to_unzip.end_with?('.tar.gz')
|
|
435
440
|
|
|
441
|
+
dest_dir = importer_unzip_path(mkdir: true)
|
|
436
442
|
Zip::File.open(file_to_unzip) do |zip_file|
|
|
437
|
-
real_entries = zip_file.reject { |e| macos_junk_entry?(e.name) }
|
|
438
|
-
top_level_dirs = real_entries.map { |e| e.name.split('/').first }.uniq
|
|
439
|
-
strip_prefix = top_level_dirs.size == 1 ? "#{top_level_dirs.first}/" : nil
|
|
440
|
-
|
|
441
|
-
dest_dir = importer_unzip_path(mkdir: true)
|
|
442
443
|
zip_file.each do |entry|
|
|
443
444
|
next unless entry.file?
|
|
444
445
|
next if macos_junk_entry?(entry.name)
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
dest_path = File.join(dest_dir, name)
|
|
446
|
+
reject_unsafe_entry!(entry.name)
|
|
447
|
+
dest_path = safe_extract_path(dest_dir, entry.name)
|
|
448
448
|
FileUtils.mkdir_p(File.dirname(dest_path))
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
# rubyzip 3.x: extract(entry, relative_name, destination_directory: dir)
|
|
452
|
-
if zip_file.method(:extract).arity == 2
|
|
453
|
-
zip_file.extract(entry, dest_path)
|
|
454
|
-
else
|
|
455
|
-
zip_file.extract(entry, name, destination_directory: dest_dir)
|
|
456
|
-
end
|
|
457
|
-
end
|
|
449
|
+
next if File.exist?(dest_path)
|
|
450
|
+
extract_zip_entry(zip_file, entry, dest_dir, entry.name, dest_path)
|
|
458
451
|
end
|
|
459
452
|
end
|
|
460
453
|
end
|
|
461
454
|
|
|
455
|
+
# rubyzip 2.x: extract(entry, absolute_dest_path)
|
|
456
|
+
# rubyzip 3.x: extract(entry, relative_name, destination_directory: dir)
|
|
457
|
+
#
|
|
458
|
+
# Callers are responsible for passing a `dest_path` produced by
|
|
459
|
+
# {#safe_extract_path} so the write can't escape `dest_dir`.
|
|
460
|
+
def extract_zip_entry(zip_file, entry, dest_dir, relative_name, dest_path)
|
|
461
|
+
if zip_file.method(:extract).arity == 2
|
|
462
|
+
zip_file.extract(entry, dest_path)
|
|
463
|
+
else
|
|
464
|
+
zip_file.extract(entry, relative_name, destination_directory: dest_dir)
|
|
465
|
+
end
|
|
466
|
+
end
|
|
467
|
+
|
|
462
468
|
def macos_junk_entry?(name)
|
|
463
469
|
name.start_with?('__MACOSX/') || name.split('/').any? { |part| part == '.DS_Store' || part.start_with?('._') }
|
|
464
470
|
end
|
|
465
471
|
|
|
472
|
+
# Zip Slip preflight — reject entries whose names are obviously unsafe
|
|
473
|
+
# (absolute paths, `..` segments) before we touch the filesystem.
|
|
474
|
+
# {#safe_extract_path} is the final line of defense; this check just
|
|
475
|
+
# fails fast with a clear message.
|
|
476
|
+
#
|
|
477
|
+
# @raise [Bulkrax::UnzipError] if the entry name is unsafe
|
|
478
|
+
def reject_unsafe_entry!(name)
|
|
479
|
+
return unless name.start_with?('/') || name.split('/').include?('..')
|
|
480
|
+
raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: name)
|
|
481
|
+
end
|
|
482
|
+
|
|
483
|
+
# Zip Slip chokepoint. Resolves `relative_dest` against `dest_dir` and
|
|
484
|
+
# returns the absolute destination path — but only if it stays inside
|
|
485
|
+
# `dest_dir`. Callers must use this value rather than building their
|
|
486
|
+
# own path with `File.join`, so the path returned is always safe by
|
|
487
|
+
# construction.
|
|
488
|
+
#
|
|
489
|
+
# @return [String] absolute destination path, validated to be inside `dest_dir`
|
|
490
|
+
# @raise [Bulkrax::UnzipError] if `relative_dest` escapes `dest_dir`
|
|
491
|
+
def safe_extract_path(dest_dir, relative_dest)
|
|
492
|
+
expanded_dest_dir = File.expand_path(dest_dir)
|
|
493
|
+
dest_path = File.expand_path(relative_dest.to_s, expanded_dest_dir)
|
|
494
|
+
return dest_path if dest_path == expanded_dest_dir
|
|
495
|
+
return dest_path if dest_path.start_with?("#{expanded_dest_dir}#{File::SEPARATOR}")
|
|
496
|
+
raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: relative_dest)
|
|
497
|
+
end
|
|
498
|
+
|
|
466
499
|
def copy_file(file_to_copy)
|
|
467
500
|
destination = File.join(importer_unzip_path(mkdir: true), File.basename(file_to_copy))
|
|
468
501
|
FileUtils.cp(file_to_copy, destination)
|
|
@@ -475,21 +508,6 @@ module Bulkrax
|
|
|
475
508
|
raise "Failed to extract #{file_to_untar}" unless result
|
|
476
509
|
end
|
|
477
510
|
|
|
478
|
-
# File names referenced in CSVs have spaces replaced with underscores
|
|
479
|
-
# @see Bulkrax::CsvParser#file_paths
|
|
480
|
-
def remove_spaces_from_filenames
|
|
481
|
-
files = Dir.glob(File.join(importer_unzip_path, 'files', '*')).uniq
|
|
482
|
-
files_with_spaces = files.select { |f| f.split('/').last.match?(' ') }
|
|
483
|
-
return if files_with_spaces.blank?
|
|
484
|
-
|
|
485
|
-
files_with_spaces.map! { |path| Pathname.new(path) }
|
|
486
|
-
files_with_spaces.each do |path|
|
|
487
|
-
filename = path.basename
|
|
488
|
-
filename_without_spaces = filename.to_s.tr(' ', '_')
|
|
489
|
-
path.rename(File.join(path.dirname, filename_without_spaces))
|
|
490
|
-
end
|
|
491
|
-
end
|
|
492
|
-
|
|
493
511
|
def zip
|
|
494
512
|
FileUtils.mkdir_p(exporter_export_zip_path)
|
|
495
513
|
|
|
@@ -515,7 +533,6 @@ module Bulkrax
|
|
|
515
533
|
|
|
516
534
|
# @return [String]
|
|
517
535
|
def real_import_file_path
|
|
518
|
-
return importer_unzip_path if file? && zip?
|
|
519
536
|
parser_fields['import_file_path']
|
|
520
537
|
end
|
|
521
538
|
end
|
|
@@ -25,6 +25,18 @@ unless ENV.fetch('BULKRAX_NO_BAGIT', 'false').to_s == 'true'
|
|
|
25
25
|
@path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
+
# BagIt archives are not CSV imports: they don't contain a primary
|
|
29
|
+
# CSV at a shallowest level, and their structure (bagit.txt + data/
|
|
30
|
+
# + manifests) must be preserved verbatim. Override both CSV-flavored
|
|
31
|
+
# unzip entry points to use the base-class verbatim extraction.
|
|
32
|
+
def unzip_with_primary_csv(file_to_unzip)
|
|
33
|
+
unzip(file_to_unzip)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def unzip_attachments_only(file_to_unzip)
|
|
37
|
+
unzip(file_to_unzip)
|
|
38
|
+
end
|
|
39
|
+
|
|
28
40
|
# Take a random sample of 10 metadata_paths and work out the import fields from that
|
|
29
41
|
def import_fields
|
|
30
42
|
raise StandardError, 'No metadata files were found' if metadata_paths.blank?
|
|
@@ -13,6 +13,16 @@ module Bulkrax
|
|
|
13
13
|
true
|
|
14
14
|
end
|
|
15
15
|
|
|
16
|
+
# @return [Regexp] the pattern String#split should use on a `file` cell.
|
|
17
|
+
# Honours the `file` mapping's `split:` when set, otherwise falls back
|
|
18
|
+
# to {Bulkrax.multi_value_element_split_on}.
|
|
19
|
+
def self.file_split_pattern
|
|
20
|
+
file_mapping = Bulkrax.field_mappings.dig(to_s, 'file') ||
|
|
21
|
+
Bulkrax.field_mappings.dig(to_s, :file) || {}
|
|
22
|
+
split_value = file_mapping['split'] || file_mapping[:split]
|
|
23
|
+
Bulkrax::SplitPatternCoercion.coerce(split_value) || Bulkrax.multi_value_element_split_on
|
|
24
|
+
end
|
|
25
|
+
|
|
16
26
|
def records(_opts = {})
|
|
17
27
|
return @records if @records.present?
|
|
18
28
|
|
|
@@ -352,20 +362,13 @@ module Bulkrax
|
|
|
352
362
|
raise StandardError, 'No records were found' if records.blank?
|
|
353
363
|
return [] if importerexporter.metadata_only?
|
|
354
364
|
|
|
365
|
+
# Compute once — these don't vary per record.
|
|
366
|
+
file_mapping = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file
|
|
367
|
+
split_pattern = self.class.file_split_pattern
|
|
368
|
+
files_dir = path_to_files
|
|
369
|
+
|
|
355
370
|
@file_paths ||= records.map do |r|
|
|
356
|
-
file_mapping = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file
|
|
357
371
|
next if r[file_mapping].blank?
|
|
358
|
-
|
|
359
|
-
split_value = Bulkrax.field_mappings.dig(self.class.to_s, :file, :split)
|
|
360
|
-
split_pattern = case split_value
|
|
361
|
-
when Regexp
|
|
362
|
-
split_value
|
|
363
|
-
when String
|
|
364
|
-
Regexp.new(split_value)
|
|
365
|
-
else
|
|
366
|
-
Bulkrax.multi_value_element_split_on
|
|
367
|
-
end
|
|
368
|
-
files_dir = path_to_files
|
|
369
372
|
raise StandardError, "Record references local files but no files directory could be resolved from the import path" if files_dir.nil?
|
|
370
373
|
|
|
371
374
|
r[file_mapping].split(split_pattern).map do |f|
|
|
@@ -379,47 +382,161 @@ module Bulkrax
|
|
|
379
382
|
end.flatten.compact.uniq
|
|
380
383
|
end
|
|
381
384
|
|
|
382
|
-
# Retrieve the path where we expect to find the files
|
|
385
|
+
# Retrieve the path where we expect to find the files for this import.
|
|
386
|
+
# After {ImporterJob#unzip_imported_file} runs (zip cases), attachments
|
|
387
|
+
# live under `{importer_unzip_path}/files/`. For a server-path-style
|
|
388
|
+
# import (the user specified a CSV file path with a sibling `files/`
|
|
389
|
+
# directory on disk), resolve relative to the CSV's directory instead.
|
|
390
|
+
#
|
|
391
|
+
# When called with `filename:`, returns the full path to that file if
|
|
392
|
+
# it exists on disk, or `nil` otherwise — callers like
|
|
393
|
+
# `Bulkrax::FileSetEntryBehavior#add_path_to_file` rely on the nil
|
|
394
|
+
# sentinel to fall back to the raw filename in their error messages.
|
|
395
|
+
#
|
|
396
|
+
# When called with no filename, returns the `files/` directory itself
|
|
397
|
+
# (only when that directory exists on disk — else `nil` so callers can
|
|
398
|
+
# raise a clear "no files directory" error).
|
|
383
399
|
def path_to_files(**args)
|
|
384
400
|
filename = args.fetch(:filename, '')
|
|
401
|
+
base_dir = files_dir
|
|
402
|
+
return base_dir if filename.blank? && Dir.exist?(base_dir)
|
|
403
|
+
return nil if filename.blank?
|
|
404
|
+
|
|
405
|
+
candidate = File.join(base_dir, filename)
|
|
406
|
+
candidate if File.exist?(candidate)
|
|
407
|
+
end
|
|
408
|
+
|
|
409
|
+
# Extracts a zip that contains a primary CSV. The primary CSV lands at
|
|
410
|
+
# the root of {#importer_unzip_path}; every other entry lands under
|
|
411
|
+
# {#importer_unzip_path}/files/, preserving its path relative to the
|
|
412
|
+
# primary CSV's directory.
|
|
413
|
+
#
|
|
414
|
+
# Primary-CSV selection matches the guided-import validator's rule
|
|
415
|
+
# (see {Bulkrax::ImporterFileHandler#locate_csv_entry_in_zip}): the CSV
|
|
416
|
+
# entry at the shallowest directory level. Visible errors are raised on
|
|
417
|
+
# zero CSVs or multiple CSVs at the shallowest level.
|
|
418
|
+
#
|
|
419
|
+
# @param file_to_unzip [String] absolute path to a .zip
|
|
420
|
+
# @raise [Bulkrax::UnzipError] on no CSV or ambiguous CSVs
|
|
421
|
+
def unzip_with_primary_csv(file_to_unzip)
|
|
422
|
+
dest_dir = importer_unzip_path(mkdir: true)
|
|
423
|
+
Zip::File.open(file_to_unzip) do |zip_file|
|
|
424
|
+
entries = real_zip_entries(zip_file)
|
|
425
|
+
primary = select_primary_csv!(entries)
|
|
426
|
+
primary_dir = File.dirname(primary.name)
|
|
427
|
+
|
|
428
|
+
entries.each do |entry|
|
|
429
|
+
if entry == primary
|
|
430
|
+
extract_to(zip_file, entry, dest_dir, File.basename(entry.name))
|
|
431
|
+
else
|
|
432
|
+
extract_to(zip_file, entry, dest_dir, File.join('files', relative_to(primary_dir, entry.name)))
|
|
433
|
+
end
|
|
434
|
+
end
|
|
435
|
+
end
|
|
436
|
+
end
|
|
385
437
|
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
438
|
+
# Extracts a zip that accompanies a separately-uploaded CSV. Every
|
|
439
|
+
# entry lands under {#importer_unzip_path}/files/ — including any
|
|
440
|
+
# CSVs inside the zip, which are treated as attachments since the
|
|
441
|
+
# primary CSV was uploaded outside the zip. Strips a single top-level
|
|
442
|
+
# wrapper directory if present, so users can zip either the contents
|
|
443
|
+
# or the enclosing folder.
|
|
444
|
+
#
|
|
445
|
+
# @param file_to_unzip [String] absolute path to a .zip
|
|
446
|
+
def unzip_attachments_only(file_to_unzip)
|
|
447
|
+
dest_dir = importer_unzip_path(mkdir: true)
|
|
448
|
+
Zip::File.open(file_to_unzip) do |zip_file|
|
|
449
|
+
entries = real_zip_entries(zip_file)
|
|
450
|
+
wrapper = single_top_level_wrapper(entries)
|
|
451
|
+
|
|
452
|
+
entries.each do |entry|
|
|
453
|
+
relative = wrapper ? entry.name.delete_prefix("#{wrapper}/") : entry.name
|
|
454
|
+
next if relative.empty?
|
|
455
|
+
extract_to(zip_file, entry, dest_dir, File.join('files', relative))
|
|
456
|
+
end
|
|
457
|
+
end
|
|
398
458
|
end
|
|
399
459
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
460
|
+
# File names referenced in CSVs have spaces replaced with underscores.
|
|
461
|
+
# @see #file_paths
|
|
462
|
+
def remove_spaces_from_filenames
|
|
463
|
+
files = Dir.glob(File.join(importer_unzip_path, 'files', '*'))
|
|
464
|
+
files_with_spaces = files.select { |f| f.split('/').last.include?(' ') }
|
|
465
|
+
return if files_with_spaces.blank?
|
|
466
|
+
|
|
467
|
+
files_with_spaces.map! { |path| Pathname.new(path) }
|
|
468
|
+
files_with_spaces.each do |path|
|
|
469
|
+
filename_without_spaces = path.basename.to_s.tr(' ', '_')
|
|
470
|
+
path.rename(File.join(path.dirname, filename_without_spaces))
|
|
471
|
+
end
|
|
403
472
|
end
|
|
404
473
|
|
|
405
474
|
private
|
|
406
475
|
|
|
407
|
-
#
|
|
408
|
-
#
|
|
409
|
-
#
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
files_dir = File.join(dest_dir, 'files')
|
|
416
|
-
FileUtils.mkdir_p(files_dir)
|
|
417
|
-
flat_files.each do |f|
|
|
418
|
-
dest = File.join(files_dir, File.basename(f))
|
|
419
|
-
FileUtils.mv(f, dest) unless File.exist?(dest)
|
|
476
|
+
# Memoized base directory under which import attachments live. Kept
|
|
477
|
+
# separate from `#path_to_files`' per-filename return value to avoid
|
|
478
|
+
# cross-contamination between directory lookups and file lookups.
|
|
479
|
+
def files_dir
|
|
480
|
+
@files_dir ||= begin
|
|
481
|
+
has_attachments_zip = parser_fields['attachments_zip_path'].present? && zip_file?(parser_fields['attachments_zip_path'])
|
|
482
|
+
base = zip? || has_attachments_zip ? importer_unzip_path : File.dirname(import_file_path)
|
|
483
|
+
File.join(base, 'files')
|
|
420
484
|
end
|
|
421
485
|
end
|
|
422
486
|
|
|
487
|
+
# Returns zip entries filtered down to real files (no directories, no
|
|
488
|
+
# macOS junk). Raises {Bulkrax::UnzipError} if any entry's name would
|
|
489
|
+
# escape the destination directory (Zip Slip).
|
|
490
|
+
def real_zip_entries(zip_file)
|
|
491
|
+
entries = zip_file.entries.select { |e| e.file? && !macos_junk_entry?(e.name) }
|
|
492
|
+
entries.each { |e| reject_unsafe_entry!(e.name) }
|
|
493
|
+
entries
|
|
494
|
+
end
|
|
495
|
+
|
|
496
|
+
# Picks the single primary CSV from zip entries, enforcing the
|
|
497
|
+
# shallowest-level rule. Raises {Bulkrax::UnzipError} on failure.
|
|
498
|
+
def select_primary_csv!(entries)
|
|
499
|
+
csvs = entries.select { |e| e.name.end_with?('.csv') }
|
|
500
|
+
raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.no_csv') if csvs.empty?
|
|
501
|
+
|
|
502
|
+
by_depth = csvs.group_by { |e| e.name.count('/') }
|
|
503
|
+
shallowest = by_depth[by_depth.keys.min]
|
|
504
|
+
|
|
505
|
+
raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.multiple_csv') if shallowest.size > 1
|
|
506
|
+
|
|
507
|
+
shallowest.first
|
|
508
|
+
end
|
|
509
|
+
|
|
510
|
+
# If every entry shares a single top-level directory, returns that
|
|
511
|
+
# directory name; otherwise nil.
|
|
512
|
+
def single_top_level_wrapper(entries)
|
|
513
|
+
tops = entries.map { |e| e.name.split('/').first }.uniq
|
|
514
|
+
return nil unless tops.size == 1
|
|
515
|
+
# If the single top segment is a file (no slashes in the entry), not a dir,
|
|
516
|
+
# there's no wrapper to strip.
|
|
517
|
+
return nil if entries.any? { |e| e.name == tops.first }
|
|
518
|
+
tops.first
|
|
519
|
+
end
|
|
520
|
+
|
|
521
|
+
# Returns `path` with `prefix/` removed from the front, if present, and
|
|
522
|
+
# a leading `files/` segment also stripped so callers can join under
|
|
523
|
+
# `files/` without doubling when the zip already uses that convention.
|
|
524
|
+
def relative_to(prefix, path)
|
|
525
|
+
remaining = prefix == '.' || prefix.empty? ? path : path.delete_prefix("#{prefix}/")
|
|
526
|
+
remaining.delete_prefix('files/')
|
|
527
|
+
end
|
|
528
|
+
|
|
529
|
+
# Extracts a zip entry to `dest_dir/relative_dest`. Creates intermediate
|
|
530
|
+
# directories and honors the rubyzip 2/3 extract-method signature.
|
|
531
|
+
# The destination path is validated by {#safe_extract_path} — an unsafe
|
|
532
|
+
# `relative_dest` raises {Bulkrax::UnzipError} before any write.
|
|
533
|
+
def extract_to(zip_file, entry, dest_dir, relative_dest)
|
|
534
|
+
dest_path = safe_extract_path(dest_dir, relative_dest)
|
|
535
|
+
FileUtils.mkdir_p(File.dirname(dest_path))
|
|
536
|
+
return if File.exist?(dest_path)
|
|
537
|
+
extract_zip_entry(zip_file, entry, dest_dir, relative_dest, dest_path)
|
|
538
|
+
end
|
|
539
|
+
|
|
423
540
|
def unique_collection_identifier(collection_hash)
|
|
424
541
|
entry_uid = collection_hash[source_identifier]
|
|
425
542
|
entry_uid ||= if Bulkrax.fill_in_blank_source_identifiers.present?
|
|
@@ -434,16 +551,13 @@ module Bulkrax
|
|
|
434
551
|
# Override to return the first CSV in the path, if a zip file is supplied
|
|
435
552
|
# We expect a single CSV at the top level of the zip in the CSVParser
|
|
436
553
|
# but we are willing to go look for it if need be
|
|
554
|
+
# When the user uploaded a zip containing a CSV, the job extracts the
|
|
555
|
+
# primary CSV to the root of `importer_unzip_path` (see
|
|
556
|
+
# {#unzip_with_primary_csv}). Any non-primary CSVs live under `files/`
|
|
557
|
+
# and are treated as attachments, so a shallow glob suffices.
|
|
437
558
|
def real_import_file_path
|
|
438
|
-
return Dir["#{importer_unzip_path}
|
|
439
|
-
|
|
559
|
+
return Dir["#{importer_unzip_path}/*.csv"].first if file? && zip?
|
|
440
560
|
parser_fields['import_file_path']
|
|
441
561
|
end
|
|
442
|
-
|
|
443
|
-
# If there are CSVs that are meant to be attachments in the files directory,
|
|
444
|
-
# we don't want to consider them as the import CSV
|
|
445
|
-
def in_files_dir?(path)
|
|
446
|
-
File.dirname(path).ends_with?('files')
|
|
447
|
-
end
|
|
448
562
|
end
|
|
449
563
|
end
|
|
@@ -27,7 +27,10 @@ module Bulkrax
|
|
|
27
27
|
|
|
28
28
|
def initialize(models: nil, admin_set_id: nil)
|
|
29
29
|
@admin_set_id = admin_set_id
|
|
30
|
-
|
|
30
|
+
# Template generation excludes system-maintained fields (generated:
|
|
31
|
+
# true) so users don't see columns like date_uploaded, depositor,
|
|
32
|
+
# etc. on the downloadable template.
|
|
33
|
+
@mapping_manager = CsvTemplate::MappingManager.new(include_generated: false)
|
|
31
34
|
@mappings = @mapping_manager.mappings
|
|
32
35
|
@field_analyzer = CsvTemplate::FieldAnalyzer.new(@mappings, admin_set_id)
|
|
33
36
|
@all_models = CsvTemplate::ModelLoader.new(Array.wrap(models)).models
|
|
@@ -28,14 +28,13 @@ module Bulkrax
|
|
|
28
28
|
header_issues = check_headers(headers, raw_csv, mapping_manager, mappings, field_metadata, field_analyzer)
|
|
29
29
|
missing_required = header_issues[:missing_required]
|
|
30
30
|
notices, row_errors, file_validator, collections, works, file_sets =
|
|
31
|
-
run_validations(csv_data, all_ids, headers, source_id_key, mappings, field_metadata, missing_required, zip_file, admin_set_id)
|
|
31
|
+
run_validations(csv_data, all_ids, headers, source_id_key, mappings, field_metadata, missing_required, zip_file, admin_set_id, mapping_manager: mapping_manager)
|
|
32
32
|
|
|
33
33
|
result = assemble_result(
|
|
34
34
|
headers: headers, missing_required: missing_required, header_issues: header_issues,
|
|
35
35
|
row_errors: row_errors, csv_data: csv_data, file_validator: file_validator,
|
|
36
36
|
collections: collections, works: works, file_sets: file_sets, notices: notices
|
|
37
37
|
)
|
|
38
|
-
apply_rights_statement_validation_override!(result, missing_required)
|
|
39
38
|
result[:raw_csv_data] = csv_data
|
|
40
39
|
result
|
|
41
40
|
end
|
|
@@ -44,13 +43,13 @@ module Bulkrax
|
|
|
44
43
|
|
|
45
44
|
# Builds notices, runs row validators, file validator, and hierarchy extraction.
|
|
46
45
|
# Returns [notices, row_errors, file_validator, collections, works, file_sets].
|
|
47
|
-
def run_validations(csv_data, all_ids, headers, source_id_key, mappings, field_metadata, missing_required, zip_file, admin_set_id) # rubocop:disable Metrics/ParameterLists
|
|
46
|
+
def run_validations(csv_data, all_ids, headers, source_id_key, mappings, field_metadata, missing_required, zip_file, admin_set_id, mapping_manager: nil) # rubocop:disable Metrics/ParameterLists
|
|
48
47
|
find_record = build_find_record
|
|
49
48
|
notices = []
|
|
50
49
|
append_missing_source_id!(missing_required, headers, source_id_key, csv_data.map { |r| r[:model] }.compact.uniq)
|
|
51
50
|
append_missing_model_notice!(notices, headers, csv_data)
|
|
52
51
|
|
|
53
|
-
row_errors = run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record, notices)
|
|
52
|
+
row_errors = run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record, notices, mapping_manager: mapping_manager)
|
|
54
53
|
file_validator = CsvTemplate::FileValidator.new(csv_data, zip_file, admin_set_id)
|
|
55
54
|
collections, works, file_sets = extract_hierarchy_items(csv_data, all_ids, find_record, mappings)
|
|
56
55
|
[notices, row_errors, file_validator, collections, works, file_sets]
|
|
@@ -72,7 +71,7 @@ module Bulkrax
|
|
|
72
71
|
file_key = resolve_validation_key(mapping_manager, key: 'file', default: :file)
|
|
73
72
|
|
|
74
73
|
csv_data = parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key)
|
|
75
|
-
all_models = csv_data.map { |r| r[:model] }.
|
|
74
|
+
all_models = csv_data.map { |r| r[:model].to_s }.reject(&:blank?).uniq
|
|
76
75
|
all_models |= [Bulkrax.default_work_type] if Bulkrax.default_work_type.present?
|
|
77
76
|
field_analyzer = CsvTemplate::FieldAnalyzer.new(mappings, admin_set_id)
|
|
78
77
|
field_metadata = build_validation_field_metadata(all_models, field_analyzer)
|
|
@@ -90,7 +89,9 @@ module Bulkrax
|
|
|
90
89
|
|
|
91
90
|
{
|
|
92
91
|
missing_required: find_missing_required_headers(headers, field_metadata, mapping_manager),
|
|
93
|
-
unrecognized: find_unrecognized_validation_headers(headers, valid_headers
|
|
92
|
+
unrecognized: find_unrecognized_validation_headers(headers, valid_headers,
|
|
93
|
+
mapping_manager: mapping_manager,
|
|
94
|
+
field_metadata: field_metadata),
|
|
94
95
|
empty_columns: find_empty_column_positions(headers, raw_csv)
|
|
95
96
|
}
|
|
96
97
|
end
|
|
@@ -99,12 +100,12 @@ module Bulkrax
|
|
|
99
100
|
extract_validation_items(
|
|
100
101
|
csv_data, all_ids, find_record,
|
|
101
102
|
parent_split_pattern: resolve_parent_split_pattern(mappings),
|
|
102
|
-
child_split_pattern: resolve_children_split_pattern(mappings) ||
|
|
103
|
+
child_split_pattern: resolve_children_split_pattern(mappings) || Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON
|
|
103
104
|
)
|
|
104
105
|
end
|
|
105
106
|
|
|
106
107
|
# Runs all registered row validators and returns the collected errors.
|
|
107
|
-
def run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record, notices = []) # rubocop:disable Metrics/ParameterLists
|
|
108
|
+
def run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record, notices = [], mapping_manager: nil) # rubocop:disable Metrics/ParameterLists
|
|
108
109
|
context = {
|
|
109
110
|
errors: [],
|
|
110
111
|
warnings: [],
|
|
@@ -116,6 +117,7 @@ module Bulkrax
|
|
|
116
117
|
parent_column: resolve_relationship_column(mappings, 'related_parents_field_mapping', 'parents'),
|
|
117
118
|
children_column: resolve_relationship_column(mappings, 'related_children_field_mapping', 'children'),
|
|
118
119
|
mappings: mappings,
|
|
120
|
+
mapping_manager: mapping_manager,
|
|
119
121
|
field_metadata: field_metadata,
|
|
120
122
|
find_record_by_source_identifier: find_record,
|
|
121
123
|
relationship_graph: build_relationship_graph(csv_data, mappings),
|