bulkrax 9.4.0 → 9.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +8 -2
  3. data/app/assets/javascripts/bulkrax/datatables.js +43 -8
  4. data/app/assets/javascripts/bulkrax/importers_stepper.js +221 -26
  5. data/app/assets/stylesheets/bulkrax/stepper/_review.scss +14 -12
  6. data/app/controllers/bulkrax/entries_controller.rb +2 -2
  7. data/app/controllers/bulkrax/exporters_controller.rb +3 -3
  8. data/app/controllers/bulkrax/guided_imports_controller.rb +3 -1
  9. data/app/controllers/bulkrax/importers_controller.rb +5 -5
  10. data/app/controllers/concerns/bulkrax/importer_file_handler.rb +1 -6
  11. data/app/errors/bulkrax/unzip_error.rb +16 -0
  12. data/app/jobs/bulkrax/importer_job.rb +40 -9
  13. data/app/matchers/bulkrax/application_matcher.rb +5 -6
  14. data/app/models/bulkrax/csv_entry.rb +1 -1
  15. data/app/models/bulkrax/importer.rb +3 -16
  16. data/app/parsers/bulkrax/application_parser.rb +50 -33
  17. data/app/parsers/bulkrax/bagit_parser.rb +12 -0
  18. data/app/parsers/bulkrax/csv_parser.rb +163 -49
  19. data/app/parsers/concerns/bulkrax/csv_parser/csv_template_generation.rb +4 -1
  20. data/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb +10 -8
  21. data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb +69 -36
  22. data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_hierarchy.rb +9 -7
  23. data/app/services/bulkrax/csv_template/file_validator.rb +1 -1
  24. data/app/services/bulkrax/csv_template/mapping_manager.rb +15 -6
  25. data/app/services/bulkrax/csv_template/split_formatter.rb +10 -3
  26. data/app/services/bulkrax/split_pattern_coercion.rb +42 -0
  27. data/app/services/bulkrax/stepper_response_formatter.rb +2 -1
  28. data/app/services/bulkrax/validation_error_csv_builder.rb +36 -12
  29. data/app/validators/bulkrax/csv_row/child_reference.rb +2 -1
  30. data/app/validators/bulkrax/csv_row/parent_reference.rb +1 -1
  31. data/app/validators/bulkrax/csv_row/required_values.rb +17 -3
  32. data/app/views/bulkrax/exporters/edit.html.erb +1 -1
  33. data/app/views/bulkrax/exporters/index.html.erb +3 -1
  34. data/app/views/bulkrax/exporters/new.html.erb +1 -1
  35. data/app/views/bulkrax/exporters/show.html.erb +1 -1
  36. data/app/views/bulkrax/guided_imports/new.html.erb +7 -0
  37. data/app/views/bulkrax/importers/_edit_item_buttons.html.erb +3 -3
  38. data/app/views/bulkrax/importers/index.html.erb +2 -0
  39. data/app/views/bulkrax/importers/new.html.erb +1 -1
  40. data/app/views/bulkrax/importers/show.html.erb +3 -1
  41. data/app/views/bulkrax/shared/_datatable_i18n.html.erb +3 -0
  42. data/config/locales/bulkrax.de.yml +95 -2
  43. data/config/locales/bulkrax.en.yml +58 -2
  44. data/config/locales/bulkrax.es.yml +95 -2
  45. data/config/locales/bulkrax.fr.yml +95 -2
  46. data/config/locales/bulkrax.it.yml +95 -2
  47. data/config/locales/bulkrax.pt-BR.yml +95 -2
  48. data/config/locales/bulkrax.zh.yml +96 -2
  49. data/db/migrate/20260424081537_remove_parents_from_bulkrax_importer_runs.rb +9 -0
  50. data/lib/bulkrax/version.rb +1 -1
  51. data/lib/bulkrax.rb +15 -1
  52. metadata +8 -4
@@ -13,7 +13,7 @@ module Bulkrax
13
13
  import(importer, only_updates_since_last_import)
14
14
  update_current_run_counters(importer)
15
15
  schedule(importer) if importer.schedulable?
16
- rescue ::CSV::MalformedCSVError => e
16
+ rescue ::CSV::MalformedCSVError, Bulkrax::UnzipError => e
17
17
  importer.set_status_info(e)
18
18
  end
19
19
 
@@ -26,18 +26,49 @@ module Bulkrax
26
26
  importer.import_objects
27
27
  end
28
28
 
29
+ # Populates `importer_unzip_path` with the uploaded file(s), leaving
30
+ # the working directory in the shape each parser expects.
31
+ #
32
+ # Dispatch by parser capability rather than class name:
33
+ # - CsvParser (and subclasses that replicate its shape) implements
34
+ # `#unzip_with_primary_csv` and `#unzip_attachments_only`, which
35
+ # place the primary CSV at root and attachments under `files/`.
36
+ # - Other parsers (XML, raw BagIt) inherit the base-class `#unzip`,
37
+ # which extracts the zip verbatim.
38
+ # - The separate attachments-zip flow is CSV-only (guided import is
39
+ # the only UI that produces it).
40
+ #
41
+ # A retry of this job gets a clean working directory: any prior
42
+ # extraction state from an earlier attempt is wiped, so nothing runs
43
+ # against partially-populated state.
29
44
  def unzip_imported_file(parser)
30
45
  return unless parser.file?
46
+
47
+ reset_unzip_path(parser)
48
+
49
+ import_file_path = parser.parser_fields['import_file_path']
50
+ attachments_zip_path = parser.parser_fields['attachments_zip_path']
51
+
31
52
  if parser.zip?
32
- # we have a zip file, and we need to unzip it before we can import the files
33
- parser.unzip(parser.parser_fields['import_file_path'])
34
- parser.remove_spaces_from_filenames
35
- elsif parser.zip_file?(parser.parser_fields['attachments_zip_path'])
36
- # we have a separate csv and zip file. We need to unzip the zip file, and move the csv file to the unzip location before we can import the files
37
- parser.unzip(parser.parser_fields['attachments_zip_path'])
38
- parser.copy_file(parser.parser_fields['import_file_path'])
39
- parser.remove_spaces_from_filenames
53
+ if parser.respond_to?(:unzip_with_primary_csv)
54
+ parser.unzip_with_primary_csv(import_file_path)
55
+ else
56
+ parser.unzip(import_file_path)
57
+ end
58
+ elsif parser.respond_to?(:unzip_attachments_only) && parser.zip_file?(attachments_zip_path)
59
+ parser.copy_file(import_file_path)
60
+ parser.unzip_attachments_only(attachments_zip_path)
61
+ else
62
+ parser.copy_file(import_file_path)
40
63
  end
64
+
65
+ parser.remove_spaces_from_filenames if parser.respond_to?(:remove_spaces_from_filenames)
66
+ end
67
+
68
+ def reset_unzip_path(parser)
69
+ path = parser.importer_unzip_path
70
+ FileUtils.rm_rf(path) if Dir.exist?(path)
71
+ FileUtils.mkdir_p(path)
41
72
  end
42
73
 
43
74
  def update_current_run_counters(importer)
@@ -33,12 +33,11 @@ module Bulkrax
33
33
  end
34
34
 
35
35
  def process_split
36
- if self.split.is_a?(TrueClass)
37
- @result = @result.split(Bulkrax.multi_value_element_split_on)
38
- elsif self.split
39
- @result = @result.split(Regexp.new(self.split))
40
- @result = @result.map(&:strip).select(&:present?)
41
- end
36
+ pattern = Bulkrax::SplitPatternCoercion.coerce(self.split)
37
+ return unless pattern
38
+
39
+ @result = @result.split(pattern)
40
+ @result = @result.map(&:strip).select(&:present?) unless self.split.is_a?(TrueClass)
42
41
  end
43
42
 
44
43
  def process_parse
@@ -165,7 +165,7 @@ module Bulkrax
165
165
  def add_file
166
166
  self.parsed_metadata['file'] ||= []
167
167
  if record['file']&.is_a?(String)
168
- self.parsed_metadata['file'] = record['file'].split(Bulkrax.multi_value_element_split_on)
168
+ self.parsed_metadata['file'] = record['file'].split(Bulkrax::CsvParser.file_split_pattern)
169
169
  elsif record['file'].is_a?(Array)
170
170
  self.parsed_metadata['file'] = record['file']
171
171
  end
@@ -266,22 +266,9 @@ module Bulkrax
266
266
  # end
267
267
 
268
268
  def importer_unzip_path(mkdir: false)
269
- entry = parser_fields&.[]('import_file_path')
270
- if entry.is_a?(String) && entry.end_with?('.zip') && File.file?(entry) && parser_fields["file_style"] != I18n.t('bulkrax.importer.xml.file_style.server_path')
271
- unzip_dir = File.dirname(entry)
272
- FileUtils.mkdir_p(unzip_dir) if mkdir
273
- return unzip_dir
274
- end
275
-
276
- @importer_unzip_path ||= File.join(parser.base_path, "import_#{path_string}")
277
- return @importer_unzip_path if Dir.exist?(@importer_unzip_path) || mkdir == true
278
-
279
- # turns "tmp/imports/tenant/import_1_20250122035229_1" to "tmp/imports/tenant/import_1_20250122035229"
280
- base_importer_unzip_path = @importer_unzip_path.split('_')[0...-1].join('_')
281
-
282
- # If we don't have an existing unzip path, we'll try and find it.
283
- # Just in case there are multiple paths, we sort by the number at the end of the path and get the last one
284
- @importer_unzip_path = Dir.glob(base_importer_unzip_path + '*').sort_by { |path| path.split(base_importer_unzip_path).last[1..-1].to_i }.last
269
+ path = File.join(parser.base_path, "import_#{path_string}")
270
+ FileUtils.mkdir_p(path) if mkdir
271
+ path
285
272
  end
286
273
 
287
274
  def errored_entries_csv_path
@@ -430,39 +430,72 @@ module Bulkrax
430
430
  zip
431
431
  end
432
432
 
433
+ # Extracts a zip verbatim into {#importer_unzip_path}, preserving the zip's
434
+ # internal structure. Filters macOS junk (`__MACOSX/`, `.DS_Store`, `._*`).
435
+ # Parser subclasses that need to interpret the zip's structure (e.g.
436
+ # {Bulkrax::CsvParser#unzip_with_primary_csv}) should call a more specific
437
+ # method rather than this one.
433
438
  def unzip(file_to_unzip)
434
439
  return untar(file_to_unzip) if file_to_unzip.end_with?('.tar.gz')
435
440
 
441
+ dest_dir = importer_unzip_path(mkdir: true)
436
442
  Zip::File.open(file_to_unzip) do |zip_file|
437
- real_entries = zip_file.reject { |e| macos_junk_entry?(e.name) }
438
- top_level_dirs = real_entries.map { |e| e.name.split('/').first }.uniq
439
- strip_prefix = top_level_dirs.size == 1 ? "#{top_level_dirs.first}/" : nil
440
-
441
- dest_dir = importer_unzip_path(mkdir: true)
442
443
  zip_file.each do |entry|
443
444
  next unless entry.file?
444
445
  next if macos_junk_entry?(entry.name)
445
- name = strip_prefix ? entry.name.delete_prefix(strip_prefix) : entry.name
446
- next if name.empty?
447
- dest_path = File.join(dest_dir, name)
446
+ reject_unsafe_entry!(entry.name)
447
+ dest_path = safe_extract_path(dest_dir, entry.name)
448
448
  FileUtils.mkdir_p(File.dirname(dest_path))
449
- unless File.exist?(dest_path)
450
- # rubyzip 2.x: extract(entry, absolute_dest_path)
451
- # rubyzip 3.x: extract(entry, relative_name, destination_directory: dir)
452
- if zip_file.method(:extract).arity == 2
453
- zip_file.extract(entry, dest_path)
454
- else
455
- zip_file.extract(entry, name, destination_directory: dest_dir)
456
- end
457
- end
449
+ next if File.exist?(dest_path)
450
+ extract_zip_entry(zip_file, entry, dest_dir, entry.name, dest_path)
458
451
  end
459
452
  end
460
453
  end
461
454
 
455
+ # rubyzip 2.x: extract(entry, absolute_dest_path)
456
+ # rubyzip 3.x: extract(entry, relative_name, destination_directory: dir)
457
+ #
458
+ # Callers are responsible for passing a `dest_path` produced by
459
+ # {#safe_extract_path} so the write can't escape `dest_dir`.
460
+ def extract_zip_entry(zip_file, entry, dest_dir, relative_name, dest_path)
461
+ if zip_file.method(:extract).arity == 2
462
+ zip_file.extract(entry, dest_path)
463
+ else
464
+ zip_file.extract(entry, relative_name, destination_directory: dest_dir)
465
+ end
466
+ end
467
+
462
468
  def macos_junk_entry?(name)
463
469
  name.start_with?('__MACOSX/') || name.split('/').any? { |part| part == '.DS_Store' || part.start_with?('._') }
464
470
  end
465
471
 
472
+ # Zip Slip preflight — reject entries whose names are obviously unsafe
473
+ # (absolute paths, `..` segments) before we touch the filesystem.
474
+ # {#safe_extract_path} is the final line of defense; this check just
475
+ # fails fast with a clear message.
476
+ #
477
+ # @raise [Bulkrax::UnzipError] if the entry name is unsafe
478
+ def reject_unsafe_entry!(name)
479
+ return unless name.start_with?('/') || name.split('/').include?('..')
480
+ raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: name)
481
+ end
482
+
483
+ # Zip Slip chokepoint. Resolves `relative_dest` against `dest_dir` and
484
+ # returns the absolute destination path — but only if it stays inside
485
+ # `dest_dir`. Callers must use this value rather than building their
486
+ # own path with `File.join`, so the path returned is always safe by
487
+ # construction.
488
+ #
489
+ # @return [String] absolute destination path, validated to be inside `dest_dir`
490
+ # @raise [Bulkrax::UnzipError] if `relative_dest` escapes `dest_dir`
491
+ def safe_extract_path(dest_dir, relative_dest)
492
+ expanded_dest_dir = File.expand_path(dest_dir)
493
+ dest_path = File.expand_path(relative_dest.to_s, expanded_dest_dir)
494
+ return dest_path if dest_path == expanded_dest_dir
495
+ return dest_path if dest_path.start_with?("#{expanded_dest_dir}#{File::SEPARATOR}")
496
+ raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: relative_dest)
497
+ end
498
+
466
499
  def copy_file(file_to_copy)
467
500
  destination = File.join(importer_unzip_path(mkdir: true), File.basename(file_to_copy))
468
501
  FileUtils.cp(file_to_copy, destination)
@@ -475,21 +508,6 @@ module Bulkrax
475
508
  raise "Failed to extract #{file_to_untar}" unless result
476
509
  end
477
510
 
478
- # File names referenced in CSVs have spaces replaced with underscores
479
- # @see Bulkrax::CsvParser#file_paths
480
- def remove_spaces_from_filenames
481
- files = Dir.glob(File.join(importer_unzip_path, 'files', '*')).uniq
482
- files_with_spaces = files.select { |f| f.split('/').last.match?(' ') }
483
- return if files_with_spaces.blank?
484
-
485
- files_with_spaces.map! { |path| Pathname.new(path) }
486
- files_with_spaces.each do |path|
487
- filename = path.basename
488
- filename_without_spaces = filename.to_s.tr(' ', '_')
489
- path.rename(File.join(path.dirname, filename_without_spaces))
490
- end
491
- end
492
-
493
511
  def zip
494
512
  FileUtils.mkdir_p(exporter_export_zip_path)
495
513
 
@@ -515,7 +533,6 @@ module Bulkrax
515
533
 
516
534
  # @return [String]
517
535
  def real_import_file_path
518
- return importer_unzip_path if file? && zip?
519
536
  parser_fields['import_file_path']
520
537
  end
521
538
  end
@@ -25,6 +25,18 @@ unless ENV.fetch('BULKRAX_NO_BAGIT', 'false').to_s == 'true'
25
25
  @path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
26
26
  end
27
27
 
28
+ # BagIt archives are not CSV imports: they don't contain a primary
29
+ # CSV at a shallowest level, and their structure (bagit.txt + data/
30
+ # + manifests) must be preserved verbatim. Override both CSV-flavored
31
+ # unzip entry points to use the base-class verbatim extraction.
32
+ def unzip_with_primary_csv(file_to_unzip)
33
+ unzip(file_to_unzip)
34
+ end
35
+
36
+ def unzip_attachments_only(file_to_unzip)
37
+ unzip(file_to_unzip)
38
+ end
39
+
28
40
  # Take a random sample of 10 metadata_paths and work out the import fields from that
29
41
  def import_fields
30
42
  raise StandardError, 'No metadata files were found' if metadata_paths.blank?
@@ -13,6 +13,16 @@ module Bulkrax
13
13
  true
14
14
  end
15
15
 
16
+ # @return [Regexp] the pattern String#split should use on a `file` cell.
17
+ # Honours the `file` mapping's `split:` when set, otherwise falls back
18
+ # to {Bulkrax.multi_value_element_split_on}.
19
+ def self.file_split_pattern
20
+ file_mapping = Bulkrax.field_mappings.dig(to_s, 'file') ||
21
+ Bulkrax.field_mappings.dig(to_s, :file) || {}
22
+ split_value = file_mapping['split'] || file_mapping[:split]
23
+ Bulkrax::SplitPatternCoercion.coerce(split_value) || Bulkrax.multi_value_element_split_on
24
+ end
25
+
16
26
  def records(_opts = {})
17
27
  return @records if @records.present?
18
28
 
@@ -352,20 +362,13 @@ module Bulkrax
352
362
  raise StandardError, 'No records were found' if records.blank?
353
363
  return [] if importerexporter.metadata_only?
354
364
 
365
+ # Compute once — these don't vary per record.
366
+ file_mapping = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file
367
+ split_pattern = self.class.file_split_pattern
368
+ files_dir = path_to_files
369
+
355
370
  @file_paths ||= records.map do |r|
356
- file_mapping = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file
357
371
  next if r[file_mapping].blank?
358
-
359
- split_value = Bulkrax.field_mappings.dig(self.class.to_s, :file, :split)
360
- split_pattern = case split_value
361
- when Regexp
362
- split_value
363
- when String
364
- Regexp.new(split_value)
365
- else
366
- Bulkrax.multi_value_element_split_on
367
- end
368
- files_dir = path_to_files
369
372
  raise StandardError, "Record references local files but no files directory could be resolved from the import path" if files_dir.nil?
370
373
 
371
374
  r[file_mapping].split(split_pattern).map do |f|
@@ -379,47 +382,161 @@ module Bulkrax
379
382
  end.flatten.compact.uniq
380
383
  end
381
384
 
382
- # Retrieve the path where we expect to find the files
385
+ # Retrieve the path where we expect to find the files for this import.
386
+ # After {ImporterJob#unzip_imported_file} runs (zip cases), attachments
387
+ # live under `{importer_unzip_path}/files/`. For a server-path-style
388
+ # import (the user specified a CSV file path with a sibling `files/`
389
+ # directory on disk), resolve relative to the CSV's directory instead.
390
+ #
391
+ # When called with `filename:`, returns the full path to that file if
392
+ # it exists on disk, or `nil` otherwise — callers like
393
+ # `Bulkrax::FileSetEntryBehavior#add_path_to_file` rely on the nil
394
+ # sentinel to fall back to the raw filename in their error messages.
395
+ #
396
+ # When called with no filename, returns the `files/` directory itself
397
+ # (only when that directory exists on disk — else `nil` so callers can
398
+ # raise a clear "no files directory" error).
383
399
  def path_to_files(**args)
384
400
  filename = args.fetch(:filename, '')
401
+ base_dir = files_dir
402
+ return base_dir if filename.blank? && Dir.exist?(base_dir)
403
+ return nil if filename.blank?
404
+
405
+ candidate = File.join(base_dir, filename)
406
+ candidate if File.exist?(candidate)
407
+ end
408
+
409
+ # Extracts a zip that contains a primary CSV. The primary CSV lands at
410
+ # the root of {#importer_unzip_path}; every other entry lands under
411
+ # {#importer_unzip_path}/files/, preserving its path relative to the
412
+ # primary CSV's directory.
413
+ #
414
+ # Primary-CSV selection matches the guided-import validator's rule
415
+ # (see {Bulkrax::ImporterFileHandler#locate_csv_entry_in_zip}): the CSV
416
+ # entry at the shallowest directory level. Visible errors are raised on
417
+ # zero CSVs or multiple CSVs at the shallowest level.
418
+ #
419
+ # @param file_to_unzip [String] absolute path to a .zip
420
+ # @raise [Bulkrax::UnzipError] on no CSV or ambiguous CSVs
421
+ def unzip_with_primary_csv(file_to_unzip)
422
+ dest_dir = importer_unzip_path(mkdir: true)
423
+ Zip::File.open(file_to_unzip) do |zip_file|
424
+ entries = real_zip_entries(zip_file)
425
+ primary = select_primary_csv!(entries)
426
+ primary_dir = File.dirname(primary.name)
427
+
428
+ entries.each do |entry|
429
+ if entry == primary
430
+ extract_to(zip_file, entry, dest_dir, File.basename(entry.name))
431
+ else
432
+ extract_to(zip_file, entry, dest_dir, File.join('files', relative_to(primary_dir, entry.name)))
433
+ end
434
+ end
435
+ end
436
+ end
385
437
 
386
- return @path_to_files if @path_to_files.present? && filename.blank?
387
- # The zip file could be either the main import file, or a separate attachments zip file.
388
- # We want to check for both of those before we determine the path to the files.
389
- have_zip_file = zip? || (parser_fields['attachments_zip_path'] && zip_file?(parser_fields['attachments_zip_path']))
390
- @path_to_files = File.join(
391
- have_zip_file ? importer_unzip_path : File.dirname(import_file_path), 'files', filename
392
- )
393
-
394
- return @path_to_files if File.exist?(@path_to_files)
395
-
396
- # TODO: This method silently returns nil if there is no file & no zip file
397
- File.join(importer_unzip_path, 'files', filename) if file? && zip?
438
+ # Extracts a zip that accompanies a separately-uploaded CSV. Every
439
+ # entry lands under {#importer_unzip_path}/files/ including any
440
+ # CSVs inside the zip, which are treated as attachments since the
441
+ # primary CSV was uploaded outside the zip. Strips a single top-level
442
+ # wrapper directory if present, so users can zip either the contents
443
+ # or the enclosing folder.
444
+ #
445
+ # @param file_to_unzip [String] absolute path to a .zip
446
+ def unzip_attachments_only(file_to_unzip)
447
+ dest_dir = importer_unzip_path(mkdir: true)
448
+ Zip::File.open(file_to_unzip) do |zip_file|
449
+ entries = real_zip_entries(zip_file)
450
+ wrapper = single_top_level_wrapper(entries)
451
+
452
+ entries.each do |entry|
453
+ relative = wrapper ? entry.name.delete_prefix("#{wrapper}/") : entry.name
454
+ next if relative.empty?
455
+ extract_to(zip_file, entry, dest_dir, File.join('files', relative))
456
+ end
457
+ end
398
458
  end
399
459
 
400
- def unzip(file_to_unzip)
401
- super
402
- normalize_unzipped_files_structure(importer_unzip_path)
460
+ # File names referenced in CSVs have spaces replaced with underscores.
461
+ # @see #file_paths
462
+ def remove_spaces_from_filenames
463
+ files = Dir.glob(File.join(importer_unzip_path, 'files', '*'))
464
+ files_with_spaces = files.select { |f| f.split('/').last.include?(' ') }
465
+ return if files_with_spaces.blank?
466
+
467
+ files_with_spaces.map! { |path| Pathname.new(path) }
468
+ files_with_spaces.each do |path|
469
+ filename_without_spaces = path.basename.to_s.tr(' ', '_')
470
+ path.rename(File.join(path.dirname, filename_without_spaces))
471
+ end
403
472
  end
404
473
 
405
474
  private
406
475
 
407
- # Ensure files extracted from a zip always land in a `files/` subdirectory
408
- # regardless of how the zip was structured. If files were extracted directly
409
- # into dest_dir (flat zip with no `files/` folder), move them into
410
- # dest_dir/files/ so that path_to_files can reliably locate them.
411
- def normalize_unzipped_files_structure(dest_dir)
412
- flat_files = Dir.glob(File.join(dest_dir, '*')).select { |f| File.file?(f) && !f.end_with?('.csv') }
413
- return if flat_files.empty?
414
-
415
- files_dir = File.join(dest_dir, 'files')
416
- FileUtils.mkdir_p(files_dir)
417
- flat_files.each do |f|
418
- dest = File.join(files_dir, File.basename(f))
419
- FileUtils.mv(f, dest) unless File.exist?(dest)
476
+ # Memoized base directory under which import attachments live. Kept
477
+ # separate from `#path_to_files`' per-filename return value to avoid
478
+ # cross-contamination between directory lookups and file lookups.
479
+ def files_dir
480
+ @files_dir ||= begin
481
+ has_attachments_zip = parser_fields['attachments_zip_path'].present? && zip_file?(parser_fields['attachments_zip_path'])
482
+ base = zip? || has_attachments_zip ? importer_unzip_path : File.dirname(import_file_path)
483
+ File.join(base, 'files')
420
484
  end
421
485
  end
422
486
 
487
+ # Returns zip entries filtered down to real files (no directories, no
488
+ # macOS junk). Raises {Bulkrax::UnzipError} if any entry's name would
489
+ # escape the destination directory (Zip Slip).
490
+ def real_zip_entries(zip_file)
491
+ entries = zip_file.entries.select { |e| e.file? && !macos_junk_entry?(e.name) }
492
+ entries.each { |e| reject_unsafe_entry!(e.name) }
493
+ entries
494
+ end
495
+
496
+ # Picks the single primary CSV from zip entries, enforcing the
497
+ # shallowest-level rule. Raises {Bulkrax::UnzipError} on failure.
498
+ def select_primary_csv!(entries)
499
+ csvs = entries.select { |e| e.name.end_with?('.csv') }
500
+ raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.no_csv') if csvs.empty?
501
+
502
+ by_depth = csvs.group_by { |e| e.name.count('/') }
503
+ shallowest = by_depth[by_depth.keys.min]
504
+
505
+ raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.multiple_csv') if shallowest.size > 1
506
+
507
+ shallowest.first
508
+ end
509
+
510
+ # If every entry shares a single top-level directory, returns that
511
+ # directory name; otherwise nil.
512
+ def single_top_level_wrapper(entries)
513
+ tops = entries.map { |e| e.name.split('/').first }.uniq
514
+ return nil unless tops.size == 1
515
+ # If the single top segment is a file (no slashes in the entry), not a dir,
516
+ # there's no wrapper to strip.
517
+ return nil if entries.any? { |e| e.name == tops.first }
518
+ tops.first
519
+ end
520
+
521
+ # Returns `path` with `prefix/` removed from the front, if present, and
522
+ # a leading `files/` segment also stripped so callers can join under
523
+ # `files/` without doubling when the zip already uses that convention.
524
+ def relative_to(prefix, path)
525
+ remaining = prefix == '.' || prefix.empty? ? path : path.delete_prefix("#{prefix}/")
526
+ remaining.delete_prefix('files/')
527
+ end
528
+
529
+ # Extracts a zip entry to `dest_dir/relative_dest`. Creates intermediate
530
+ # directories and honors the rubyzip 2/3 extract-method signature.
531
+ # The destination path is validated by {#safe_extract_path} — an unsafe
532
+ # `relative_dest` raises {Bulkrax::UnzipError} before any write.
533
+ def extract_to(zip_file, entry, dest_dir, relative_dest)
534
+ dest_path = safe_extract_path(dest_dir, relative_dest)
535
+ FileUtils.mkdir_p(File.dirname(dest_path))
536
+ return if File.exist?(dest_path)
537
+ extract_zip_entry(zip_file, entry, dest_dir, relative_dest, dest_path)
538
+ end
539
+
423
540
  def unique_collection_identifier(collection_hash)
424
541
  entry_uid = collection_hash[source_identifier]
425
542
  entry_uid ||= if Bulkrax.fill_in_blank_source_identifiers.present?
@@ -434,16 +551,13 @@ module Bulkrax
434
551
  # Override to return the first CSV in the path, if a zip file is supplied
435
552
  # We expect a single CSV at the top level of the zip in the CSVParser
436
553
  # but we are willing to go look for it if need be
554
+ # When the user uploaded a zip containing a CSV, the job extracts the
555
+ # primary CSV to the root of `importer_unzip_path` (see
556
+ # {#unzip_with_primary_csv}). Any non-primary CSVs live under `files/`
557
+ # and are treated as attachments, so a shallow glob suffices.
437
558
  def real_import_file_path
438
- return Dir["#{importer_unzip_path}/**/*.csv"].reject { |path| in_files_dir?(path) }.first if file? && zip?
439
-
559
+ return Dir["#{importer_unzip_path}/*.csv"].first if file? && zip?
440
560
  parser_fields['import_file_path']
441
561
  end
442
-
443
- # If there are CSVs that are meant to be attachments in the files directory,
444
- # we don't want to consider them as the import CSV
445
- def in_files_dir?(path)
446
- File.dirname(path).ends_with?('files')
447
- end
448
562
  end
449
563
  end
@@ -27,7 +27,10 @@ module Bulkrax
27
27
 
28
28
  def initialize(models: nil, admin_set_id: nil)
29
29
  @admin_set_id = admin_set_id
30
- @mapping_manager = CsvTemplate::MappingManager.new
30
+ # Template generation excludes system-maintained fields (generated:
31
+ # true) so users don't see columns like date_uploaded, depositor,
32
+ # etc. on the downloadable template.
33
+ @mapping_manager = CsvTemplate::MappingManager.new(include_generated: false)
31
34
  @mappings = @mapping_manager.mappings
32
35
  @field_analyzer = CsvTemplate::FieldAnalyzer.new(@mappings, admin_set_id)
33
36
  @all_models = CsvTemplate::ModelLoader.new(Array.wrap(models)).models
@@ -28,14 +28,13 @@ module Bulkrax
28
28
  header_issues = check_headers(headers, raw_csv, mapping_manager, mappings, field_metadata, field_analyzer)
29
29
  missing_required = header_issues[:missing_required]
30
30
  notices, row_errors, file_validator, collections, works, file_sets =
31
- run_validations(csv_data, all_ids, headers, source_id_key, mappings, field_metadata, missing_required, zip_file, admin_set_id)
31
+ run_validations(csv_data, all_ids, headers, source_id_key, mappings, field_metadata, missing_required, zip_file, admin_set_id, mapping_manager: mapping_manager)
32
32
 
33
33
  result = assemble_result(
34
34
  headers: headers, missing_required: missing_required, header_issues: header_issues,
35
35
  row_errors: row_errors, csv_data: csv_data, file_validator: file_validator,
36
36
  collections: collections, works: works, file_sets: file_sets, notices: notices
37
37
  )
38
- apply_rights_statement_validation_override!(result, missing_required)
39
38
  result[:raw_csv_data] = csv_data
40
39
  result
41
40
  end
@@ -44,13 +43,13 @@ module Bulkrax
44
43
 
45
44
  # Builds notices, runs row validators, file validator, and hierarchy extraction.
46
45
  # Returns [notices, row_errors, file_validator, collections, works, file_sets].
47
- def run_validations(csv_data, all_ids, headers, source_id_key, mappings, field_metadata, missing_required, zip_file, admin_set_id) # rubocop:disable Metrics/ParameterLists
46
+ def run_validations(csv_data, all_ids, headers, source_id_key, mappings, field_metadata, missing_required, zip_file, admin_set_id, mapping_manager: nil) # rubocop:disable Metrics/ParameterLists
48
47
  find_record = build_find_record
49
48
  notices = []
50
49
  append_missing_source_id!(missing_required, headers, source_id_key, csv_data.map { |r| r[:model] }.compact.uniq)
51
50
  append_missing_model_notice!(notices, headers, csv_data)
52
51
 
53
- row_errors = run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record, notices)
52
+ row_errors = run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record, notices, mapping_manager: mapping_manager)
54
53
  file_validator = CsvTemplate::FileValidator.new(csv_data, zip_file, admin_set_id)
55
54
  collections, works, file_sets = extract_hierarchy_items(csv_data, all_ids, find_record, mappings)
56
55
  [notices, row_errors, file_validator, collections, works, file_sets]
@@ -72,7 +71,7 @@ module Bulkrax
72
71
  file_key = resolve_validation_key(mapping_manager, key: 'file', default: :file)
73
72
 
74
73
  csv_data = parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key)
75
- all_models = csv_data.map { |r| r[:model] }.compact.uniq
74
+ all_models = csv_data.map { |r| r[:model].to_s }.reject(&:blank?).uniq
76
75
  all_models |= [Bulkrax.default_work_type] if Bulkrax.default_work_type.present?
77
76
  field_analyzer = CsvTemplate::FieldAnalyzer.new(mappings, admin_set_id)
78
77
  field_metadata = build_validation_field_metadata(all_models, field_analyzer)
@@ -90,7 +89,9 @@ module Bulkrax
90
89
 
91
90
  {
92
91
  missing_required: find_missing_required_headers(headers, field_metadata, mapping_manager),
93
- unrecognized: find_unrecognized_validation_headers(headers, valid_headers),
92
+ unrecognized: find_unrecognized_validation_headers(headers, valid_headers,
93
+ mapping_manager: mapping_manager,
94
+ field_metadata: field_metadata),
94
95
  empty_columns: find_empty_column_positions(headers, raw_csv)
95
96
  }
96
97
  end
@@ -99,12 +100,12 @@ module Bulkrax
99
100
  extract_validation_items(
100
101
  csv_data, all_ids, find_record,
101
102
  parent_split_pattern: resolve_parent_split_pattern(mappings),
102
- child_split_pattern: resolve_children_split_pattern(mappings) || '|'
103
+ child_split_pattern: resolve_children_split_pattern(mappings) || Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON
103
104
  )
104
105
  end
105
106
 
106
107
  # Runs all registered row validators and returns the collected errors.
107
- def run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record, notices = []) # rubocop:disable Metrics/ParameterLists
108
+ def run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record, notices = [], mapping_manager: nil) # rubocop:disable Metrics/ParameterLists
108
109
  context = {
109
110
  errors: [],
110
111
  warnings: [],
@@ -116,6 +117,7 @@ module Bulkrax
116
117
  parent_column: resolve_relationship_column(mappings, 'related_parents_field_mapping', 'parents'),
117
118
  children_column: resolve_relationship_column(mappings, 'related_children_field_mapping', 'children'),
118
119
  mappings: mappings,
120
+ mapping_manager: mapping_manager,
119
121
  field_metadata: field_metadata,
120
122
  find_record_by_source_identifier: find_record,
121
123
  relationship_graph: build_relationship_graph(csv_data, mappings),