bulkrax 9.4.0 → 9.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 05440b3212ae1571e13cd9ee789c0222b8113a2cf717e4fc84a14f3c2aa2627a
4
- data.tar.gz: 99d843e51cb362be57e279246fc63937b0772429368142a5a383e7b0a1ccfa4a
3
+ metadata.gz: 420e0b83f78ad1c411b0532bda121bebe74a651a9d1abec51549273896a00bcb
4
+ data.tar.gz: 892e143d2de6c714121804bf547b9473545c1071ecce6ef9a6269cb60eeaf66f
5
5
  SHA512:
6
- metadata.gz: 7c8d2bd4ec608ceea8b567ebf39972c3e14b300731b8202d133e9a541e29995e69ed6f1dc1528ed5d0c877e38664f05260a60147985da08b65a8c3630cb80ebb
7
- data.tar.gz: b4a1956d149a23c0bfc299ef3235a36c1d680784283957cb8fe56b06a3745f94d810318a58c9b791a193a76cd218f78129d8e8773b0ff78f8c73bb38d9e6975f
6
+ metadata.gz: 497fe999aa3d39f3e7281b5e743a75d5b6c60ba93d0a7a40bd63bdfe248b0c35e52dffaa9b8aebe59489e68999ea9a0e22826ed30ffea0f2d8927cfaa61852d5
7
+ data.tar.gz: 176a04163d610ad5241b96ecd107ae4bc79e64dddaff94f2fa80b7b93ce48951934ffb8f4a3438169c185f7c5a6e9f468aa8371c3c1c3d282d7c9d59ebfde946
@@ -121,12 +121,7 @@ module Bulkrax
121
121
  csv_by_depth = get_directory_depth_for_each_csv(csv_entries)
122
122
  csvs_at_level = determine_csvs_at_shallowest_level(csv_by_depth)
123
123
 
124
- csvs_by_directory = csvs_at_level.group_by { |entry| File.dirname(entry.name) }
125
- csvs_by_directory.each do |_dir, csvs|
126
- return StepperResponseFormatter.error(message: I18n.t('bulkrax.importer.guided_import.validation.multiple_csv_same_dir')) if csvs.count > 1
127
- end
128
-
129
- return StepperResponseFormatter.error(message: I18n.t('bulkrax.importer.guided_import.validation.multiple_csv_same_level')) if csvs_at_level.size > 1
124
+ return StepperResponseFormatter.error(message: I18n.t('bulkrax.importer.guided_import.validation.multiple_csv')) if csvs_at_level.size > 1
130
125
 
131
126
  csvs_at_level.first
132
127
  end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Bulkrax
4
+ # Raised when a zip cannot be safely or meaningfully extracted during
5
+ # import. Covered scenarios include:
6
+ #
7
+ # - A single upload zip has no CSV at any level.
8
+ # - A single upload zip has multiple CSVs at its shallowest level
9
+ # (primary CSV cannot be determined).
10
+ # - A zip entry's name would escape the destination directory
11
+ # (Zip Slip: absolute paths, `..` traversal, etc.).
12
+ #
13
+ # Defined in its own file so Zeitwerk can autoload the constant by name
14
+ # from any parser or job that raises or rescues it.
15
+ class UnzipError < StandardError; end
16
+ end
@@ -13,7 +13,7 @@ module Bulkrax
13
13
  import(importer, only_updates_since_last_import)
14
14
  update_current_run_counters(importer)
15
15
  schedule(importer) if importer.schedulable?
16
- rescue ::CSV::MalformedCSVError => e
16
+ rescue ::CSV::MalformedCSVError, Bulkrax::UnzipError => e
17
17
  importer.set_status_info(e)
18
18
  end
19
19
 
@@ -26,18 +26,49 @@ module Bulkrax
26
26
  importer.import_objects
27
27
  end
28
28
 
29
+ # Populates `importer_unzip_path` with the uploaded file(s), leaving
30
+ # the working directory in the shape each parser expects.
31
+ #
32
+ # Dispatch by parser capability rather than class name:
33
+ # - CsvParser (and subclasses that replicate its shape) implements
34
+ # `#unzip_with_primary_csv` and `#unzip_attachments_only`, which
35
+ # place the primary CSV at root and attachments under `files/`.
36
+ # - Other parsers (XML, raw BagIt) inherit the base-class `#unzip`,
37
+ # which extracts the zip verbatim.
38
+ # - The separate attachments-zip flow is CSV-only (guided import is
39
+ # the only UI that produces it).
40
+ #
41
+ # A retry of this job gets a clean working directory: any prior
42
+ # extraction state from an earlier attempt is wiped, so nothing runs
43
+ # against partially-populated state.
29
44
  def unzip_imported_file(parser)
30
45
  return unless parser.file?
46
+
47
+ reset_unzip_path(parser)
48
+
49
+ import_file_path = parser.parser_fields['import_file_path']
50
+ attachments_zip_path = parser.parser_fields['attachments_zip_path']
51
+
31
52
  if parser.zip?
32
- # we have a zip file, and we need to unzip it before we can import the files
33
- parser.unzip(parser.parser_fields['import_file_path'])
34
- parser.remove_spaces_from_filenames
35
- elsif parser.zip_file?(parser.parser_fields['attachments_zip_path'])
36
- # we have a separate csv and zip file. We need to unzip the zip file, and move the csv file to the unzip location before we can import the files
37
- parser.unzip(parser.parser_fields['attachments_zip_path'])
38
- parser.copy_file(parser.parser_fields['import_file_path'])
39
- parser.remove_spaces_from_filenames
53
+ if parser.respond_to?(:unzip_with_primary_csv)
54
+ parser.unzip_with_primary_csv(import_file_path)
55
+ else
56
+ parser.unzip(import_file_path)
57
+ end
58
+ elsif parser.respond_to?(:unzip_attachments_only) && parser.zip_file?(attachments_zip_path)
59
+ parser.copy_file(import_file_path)
60
+ parser.unzip_attachments_only(attachments_zip_path)
61
+ else
62
+ parser.copy_file(import_file_path)
40
63
  end
64
+
65
+ parser.remove_spaces_from_filenames if parser.respond_to?(:remove_spaces_from_filenames)
66
+ end
67
+
68
+ def reset_unzip_path(parser)
69
+ path = parser.importer_unzip_path
70
+ FileUtils.rm_rf(path) if Dir.exist?(path)
71
+ FileUtils.mkdir_p(path)
41
72
  end
42
73
 
43
74
  def update_current_run_counters(importer)
@@ -266,22 +266,9 @@ module Bulkrax
266
266
  # end
267
267
 
268
268
  def importer_unzip_path(mkdir: false)
269
- entry = parser_fields&.[]('import_file_path')
270
- if entry.is_a?(String) && entry.end_with?('.zip') && File.file?(entry) && parser_fields["file_style"] != I18n.t('bulkrax.importer.xml.file_style.server_path')
271
- unzip_dir = File.dirname(entry)
272
- FileUtils.mkdir_p(unzip_dir) if mkdir
273
- return unzip_dir
274
- end
275
-
276
- @importer_unzip_path ||= File.join(parser.base_path, "import_#{path_string}")
277
- return @importer_unzip_path if Dir.exist?(@importer_unzip_path) || mkdir == true
278
-
279
- # turns "tmp/imports/tenant/import_1_20250122035229_1" to "tmp/imports/tenant/import_1_20250122035229"
280
- base_importer_unzip_path = @importer_unzip_path.split('_')[0...-1].join('_')
281
-
282
- # If we don't have an existing unzip path, we'll try and find it.
283
- # Just in case there are multiple paths, we sort by the number at the end of the path and get the last one
284
- @importer_unzip_path = Dir.glob(base_importer_unzip_path + '*').sort_by { |path| path.split(base_importer_unzip_path).last[1..-1].to_i }.last
269
+ path = File.join(parser.base_path, "import_#{path_string}")
270
+ FileUtils.mkdir_p(path) if mkdir
271
+ path
285
272
  end
286
273
 
287
274
  def errored_entries_csv_path
@@ -430,39 +430,72 @@ module Bulkrax
430
430
  zip
431
431
  end
432
432
 
433
+ # Extracts a zip verbatim into {#importer_unzip_path}, preserving the zip's
434
+ # internal structure. Filters macOS junk (`__MACOSX/`, `.DS_Store`, `._*`).
435
+ # Parser subclasses that need to interpret the zip's structure (e.g.
436
+ # {Bulkrax::CsvParser#unzip_with_primary_csv}) should call a more specific
437
+ # method rather than this one.
433
438
  def unzip(file_to_unzip)
434
439
  return untar(file_to_unzip) if file_to_unzip.end_with?('.tar.gz')
435
440
 
441
+ dest_dir = importer_unzip_path(mkdir: true)
436
442
  Zip::File.open(file_to_unzip) do |zip_file|
437
- real_entries = zip_file.reject { |e| macos_junk_entry?(e.name) }
438
- top_level_dirs = real_entries.map { |e| e.name.split('/').first }.uniq
439
- strip_prefix = top_level_dirs.size == 1 ? "#{top_level_dirs.first}/" : nil
440
-
441
- dest_dir = importer_unzip_path(mkdir: true)
442
443
  zip_file.each do |entry|
443
444
  next unless entry.file?
444
445
  next if macos_junk_entry?(entry.name)
445
- name = strip_prefix ? entry.name.delete_prefix(strip_prefix) : entry.name
446
- next if name.empty?
447
- dest_path = File.join(dest_dir, name)
446
+ reject_unsafe_entry!(entry.name)
447
+ dest_path = safe_extract_path(dest_dir, entry.name)
448
448
  FileUtils.mkdir_p(File.dirname(dest_path))
449
- unless File.exist?(dest_path)
450
- # rubyzip 2.x: extract(entry, absolute_dest_path)
451
- # rubyzip 3.x: extract(entry, relative_name, destination_directory: dir)
452
- if zip_file.method(:extract).arity == 2
453
- zip_file.extract(entry, dest_path)
454
- else
455
- zip_file.extract(entry, name, destination_directory: dest_dir)
456
- end
457
- end
449
+ next if File.exist?(dest_path)
450
+ extract_zip_entry(zip_file, entry, dest_dir, entry.name, dest_path)
458
451
  end
459
452
  end
460
453
  end
461
454
 
455
+ # rubyzip 2.x: extract(entry, absolute_dest_path)
456
+ # rubyzip 3.x: extract(entry, relative_name, destination_directory: dir)
457
+ #
458
+ # Callers are responsible for passing a `dest_path` produced by
459
+ # {#safe_extract_path} so the write can't escape `dest_dir`.
460
+ def extract_zip_entry(zip_file, entry, dest_dir, relative_name, dest_path)
461
+ if zip_file.method(:extract).arity == 2
462
+ zip_file.extract(entry, dest_path)
463
+ else
464
+ zip_file.extract(entry, relative_name, destination_directory: dest_dir)
465
+ end
466
+ end
467
+
462
468
  def macos_junk_entry?(name)
463
469
  name.start_with?('__MACOSX/') || name.split('/').any? { |part| part == '.DS_Store' || part.start_with?('._') }
464
470
  end
465
471
 
472
+ # Zip Slip preflight — reject entries whose names are obviously unsafe
473
+ # (absolute paths, `..` segments) before we touch the filesystem.
474
+ # {#safe_extract_path} is the final line of defense; this check just
475
+ # fails fast with a clear message.
476
+ #
477
+ # @raise [Bulkrax::UnzipError] if the entry name is unsafe
478
+ def reject_unsafe_entry!(name)
479
+ return unless name.start_with?('/') || name.split('/').include?('..')
480
+ raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: name)
481
+ end
482
+
483
+ # Zip Slip chokepoint. Resolves `relative_dest` against `dest_dir` and
484
+ # returns the absolute destination path — but only if it stays inside
485
+ # `dest_dir`. Callers must use this value rather than building their
486
+ # own path with `File.join`, so the path returned is always safe by
487
+ # construction.
488
+ #
489
+ # @return [String] absolute destination path, validated to be inside `dest_dir`
490
+ # @raise [Bulkrax::UnzipError] if `relative_dest` escapes `dest_dir`
491
+ def safe_extract_path(dest_dir, relative_dest)
492
+ expanded_dest_dir = File.expand_path(dest_dir)
493
+ dest_path = File.expand_path(relative_dest.to_s, expanded_dest_dir)
494
+ return dest_path if dest_path == expanded_dest_dir
495
+ return dest_path if dest_path.start_with?("#{expanded_dest_dir}#{File::SEPARATOR}")
496
+ raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: relative_dest)
497
+ end
498
+
466
499
  def copy_file(file_to_copy)
467
500
  destination = File.join(importer_unzip_path(mkdir: true), File.basename(file_to_copy))
468
501
  FileUtils.cp(file_to_copy, destination)
@@ -475,21 +508,6 @@ module Bulkrax
475
508
  raise "Failed to extract #{file_to_untar}" unless result
476
509
  end
477
510
 
478
- # File names referenced in CSVs have spaces replaced with underscores
479
- # @see Bulkrax::CsvParser#file_paths
480
- def remove_spaces_from_filenames
481
- files = Dir.glob(File.join(importer_unzip_path, 'files', '*')).uniq
482
- files_with_spaces = files.select { |f| f.split('/').last.match?(' ') }
483
- return if files_with_spaces.blank?
484
-
485
- files_with_spaces.map! { |path| Pathname.new(path) }
486
- files_with_spaces.each do |path|
487
- filename = path.basename
488
- filename_without_spaces = filename.to_s.tr(' ', '_')
489
- path.rename(File.join(path.dirname, filename_without_spaces))
490
- end
491
- end
492
-
493
511
  def zip
494
512
  FileUtils.mkdir_p(exporter_export_zip_path)
495
513
 
@@ -515,7 +533,6 @@ module Bulkrax
515
533
 
516
534
  # @return [String]
517
535
  def real_import_file_path
518
- return importer_unzip_path if file? && zip?
519
536
  parser_fields['import_file_path']
520
537
  end
521
538
  end
@@ -25,6 +25,18 @@ unless ENV.fetch('BULKRAX_NO_BAGIT', 'false').to_s == 'true'
25
25
  @path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
26
26
  end
27
27
 
28
+ # BagIt archives are not CSV imports: they don't contain a primary
29
+ # CSV at a shallowest level, and their structure (bagit.txt + data/
30
+ # + manifests) must be preserved verbatim. Override both CSV-flavored
31
+ # unzip entry points to use the base-class verbatim extraction.
32
+ def unzip_with_primary_csv(file_to_unzip)
33
+ unzip(file_to_unzip)
34
+ end
35
+
36
+ def unzip_attachments_only(file_to_unzip)
37
+ unzip(file_to_unzip)
38
+ end
39
+
28
40
  # Take a random sample of 10 metadata_paths and work out the import fields from that
29
41
  def import_fields
30
42
  raise StandardError, 'No metadata files were found' if metadata_paths.blank?
@@ -379,47 +379,161 @@ module Bulkrax
379
379
  end.flatten.compact.uniq
380
380
  end
381
381
 
382
- # Retrieve the path where we expect to find the files
382
+ # Retrieve the path where we expect to find the files for this import.
383
+ # After {ImporterJob#unzip_imported_file} runs (zip cases), attachments
384
+ # live under `{importer_unzip_path}/files/`. For a server-path-style
385
+ # import (the user specified a CSV file path with a sibling `files/`
386
+ # directory on disk), resolve relative to the CSV's directory instead.
387
+ #
388
+ # When called with `filename:`, returns the full path to that file if
389
+ # it exists on disk, or `nil` otherwise — callers like
390
+ # `Bulkrax::FileSetEntryBehavior#add_path_to_file` rely on the nil
391
+ # sentinel to fall back to the raw filename in their error messages.
392
+ #
393
+ # When called with no filename, returns the `files/` directory itself
394
+ # (only when that directory exists on disk — else `nil` so callers can
395
+ # raise a clear "no files directory" error).
383
396
  def path_to_files(**args)
384
397
  filename = args.fetch(:filename, '')
398
+ base_dir = files_dir
399
+ return base_dir if filename.blank? && Dir.exist?(base_dir)
400
+ return nil if filename.blank?
401
+
402
+ candidate = File.join(base_dir, filename)
403
+ candidate if File.exist?(candidate)
404
+ end
405
+
406
+ # Extracts a zip that contains a primary CSV. The primary CSV lands at
407
+ # the root of {#importer_unzip_path}; every other entry lands under
408
+ # {#importer_unzip_path}/files/, preserving its path relative to the
409
+ # primary CSV's directory.
410
+ #
411
+ # Primary-CSV selection matches the guided-import validator's rule
412
+ # (see {Bulkrax::ImporterFileHandler#locate_csv_entry_in_zip}): the CSV
413
+ # entry at the shallowest directory level. Visible errors are raised on
414
+ # zero CSVs or multiple CSVs at the shallowest level.
415
+ #
416
+ # @param file_to_unzip [String] absolute path to a .zip
417
+ # @raise [Bulkrax::UnzipError] on no CSV or ambiguous CSVs
418
+ def unzip_with_primary_csv(file_to_unzip)
419
+ dest_dir = importer_unzip_path(mkdir: true)
420
+ Zip::File.open(file_to_unzip) do |zip_file|
421
+ entries = real_zip_entries(zip_file)
422
+ primary = select_primary_csv!(entries)
423
+ primary_dir = File.dirname(primary.name)
424
+
425
+ entries.each do |entry|
426
+ if entry == primary
427
+ extract_to(zip_file, entry, dest_dir, File.basename(entry.name))
428
+ else
429
+ extract_to(zip_file, entry, dest_dir, File.join('files', relative_to(primary_dir, entry.name)))
430
+ end
431
+ end
432
+ end
433
+ end
385
434
 
386
- return @path_to_files if @path_to_files.present? && filename.blank?
387
- # The zip file could be either the main import file, or a separate attachments zip file.
388
- # We want to check for both of those before we determine the path to the files.
389
- have_zip_file = zip? || (parser_fields['attachments_zip_path'] && zip_file?(parser_fields['attachments_zip_path']))
390
- @path_to_files = File.join(
391
- have_zip_file ? importer_unzip_path : File.dirname(import_file_path), 'files', filename
392
- )
393
-
394
- return @path_to_files if File.exist?(@path_to_files)
395
-
396
- # TODO: This method silently returns nil if there is no file & no zip file
397
- File.join(importer_unzip_path, 'files', filename) if file? && zip?
435
+ # Extracts a zip that accompanies a separately-uploaded CSV. Every
436
+ # entry lands under {#importer_unzip_path}/files/ including any
437
+ # CSVs inside the zip, which are treated as attachments since the
438
+ # primary CSV was uploaded outside the zip. Strips a single top-level
439
+ # wrapper directory if present, so users can zip either the contents
440
+ # or the enclosing folder.
441
+ #
442
+ # @param file_to_unzip [String] absolute path to a .zip
443
+ def unzip_attachments_only(file_to_unzip)
444
+ dest_dir = importer_unzip_path(mkdir: true)
445
+ Zip::File.open(file_to_unzip) do |zip_file|
446
+ entries = real_zip_entries(zip_file)
447
+ wrapper = single_top_level_wrapper(entries)
448
+
449
+ entries.each do |entry|
450
+ relative = wrapper ? entry.name.delete_prefix("#{wrapper}/") : entry.name
451
+ next if relative.empty?
452
+ extract_to(zip_file, entry, dest_dir, File.join('files', relative))
453
+ end
454
+ end
398
455
  end
399
456
 
400
- def unzip(file_to_unzip)
401
- super
402
- normalize_unzipped_files_structure(importer_unzip_path)
457
+ # File names referenced in CSVs have spaces replaced with underscores.
458
+ # @see #file_paths
459
+ def remove_spaces_from_filenames
460
+ files = Dir.glob(File.join(importer_unzip_path, 'files', '*'))
461
+ files_with_spaces = files.select { |f| f.split('/').last.include?(' ') }
462
+ return if files_with_spaces.blank?
463
+
464
+ files_with_spaces.map! { |path| Pathname.new(path) }
465
+ files_with_spaces.each do |path|
466
+ filename_without_spaces = path.basename.to_s.tr(' ', '_')
467
+ path.rename(File.join(path.dirname, filename_without_spaces))
468
+ end
403
469
  end
404
470
 
405
471
  private
406
472
 
407
- # Ensure files extracted from a zip always land in a `files/` subdirectory
408
- # regardless of how the zip was structured. If files were extracted directly
409
- # into dest_dir (flat zip with no `files/` folder), move them into
410
- # dest_dir/files/ so that path_to_files can reliably locate them.
411
- def normalize_unzipped_files_structure(dest_dir)
412
- flat_files = Dir.glob(File.join(dest_dir, '*')).select { |f| File.file?(f) && !f.end_with?('.csv') }
413
- return if flat_files.empty?
414
-
415
- files_dir = File.join(dest_dir, 'files')
416
- FileUtils.mkdir_p(files_dir)
417
- flat_files.each do |f|
418
- dest = File.join(files_dir, File.basename(f))
419
- FileUtils.mv(f, dest) unless File.exist?(dest)
473
+ # Memoized base directory under which import attachments live. Kept
474
+ # separate from `#path_to_files`' per-filename return value to avoid
475
+ # cross-contamination between directory lookups and file lookups.
476
+ def files_dir
477
+ @files_dir ||= begin
478
+ has_attachments_zip = parser_fields['attachments_zip_path'].present? && zip_file?(parser_fields['attachments_zip_path'])
479
+ base = zip? || has_attachments_zip ? importer_unzip_path : File.dirname(import_file_path)
480
+ File.join(base, 'files')
420
481
  end
421
482
  end
422
483
 
484
+ # Returns zip entries filtered down to real files (no directories, no
485
+ # macOS junk). Raises {Bulkrax::UnzipError} if any entry's name would
486
+ # escape the destination directory (Zip Slip).
487
+ def real_zip_entries(zip_file)
488
+ entries = zip_file.entries.select { |e| e.file? && !macos_junk_entry?(e.name) }
489
+ entries.each { |e| reject_unsafe_entry!(e.name) }
490
+ entries
491
+ end
492
+
493
+ # Picks the single primary CSV from zip entries, enforcing the
494
+ # shallowest-level rule. Raises {Bulkrax::UnzipError} on failure.
495
+ def select_primary_csv!(entries)
496
+ csvs = entries.select { |e| e.name.end_with?('.csv') }
497
+ raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.no_csv') if csvs.empty?
498
+
499
+ by_depth = csvs.group_by { |e| e.name.count('/') }
500
+ shallowest = by_depth[by_depth.keys.min]
501
+
502
+ raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.multiple_csv') if shallowest.size > 1
503
+
504
+ shallowest.first
505
+ end
506
+
507
+ # If every entry shares a single top-level directory, returns that
508
+ # directory name; otherwise nil.
509
+ def single_top_level_wrapper(entries)
510
+ tops = entries.map { |e| e.name.split('/').first }.uniq
511
+ return nil unless tops.size == 1
512
+ # If the single top segment is a file (no slashes in the entry), not a dir,
513
+ # there's no wrapper to strip.
514
+ return nil if entries.any? { |e| e.name == tops.first }
515
+ tops.first
516
+ end
517
+
518
+ # Returns `path` with `prefix/` removed from the front, if present, and
519
+ # a leading `files/` segment also stripped so callers can join under
520
+ # `files/` without doubling when the zip already uses that convention.
521
+ def relative_to(prefix, path)
522
+ remaining = prefix == '.' || prefix.empty? ? path : path.delete_prefix("#{prefix}/")
523
+ remaining.delete_prefix('files/')
524
+ end
525
+
526
+ # Extracts a zip entry to `dest_dir/relative_dest`. Creates intermediate
527
+ # directories and honors the rubyzip 2/3 extract-method signature.
528
+ # The destination path is validated by {#safe_extract_path} — an unsafe
529
+ # `relative_dest` raises {Bulkrax::UnzipError} before any write.
530
+ def extract_to(zip_file, entry, dest_dir, relative_dest)
531
+ dest_path = safe_extract_path(dest_dir, relative_dest)
532
+ FileUtils.mkdir_p(File.dirname(dest_path))
533
+ return if File.exist?(dest_path)
534
+ extract_zip_entry(zip_file, entry, dest_dir, relative_dest, dest_path)
535
+ end
536
+
423
537
  def unique_collection_identifier(collection_hash)
424
538
  entry_uid = collection_hash[source_identifier]
425
539
  entry_uid ||= if Bulkrax.fill_in_blank_source_identifiers.present?
@@ -434,16 +548,13 @@ module Bulkrax
434
548
  # Override to return the first CSV in the path, if a zip file is supplied
435
549
  # We expect a single CSV at the top level of the zip in the CSVParser
436
550
  # but we are willing to go look for it if need be
551
+ # When the user uploaded a zip containing a CSV, the job extracts the
552
+ # primary CSV to the root of `importer_unzip_path` (see
553
+ # {#unzip_with_primary_csv}). Any non-primary CSVs live under `files/`
554
+ # and are treated as attachments, so a shallow glob suffices.
437
555
  def real_import_file_path
438
- return Dir["#{importer_unzip_path}/**/*.csv"].reject { |path| in_files_dir?(path) }.first if file? && zip?
439
-
556
+ return Dir["#{importer_unzip_path}/*.csv"].first if file? && zip?
440
557
  parser_fields['import_file_path']
441
558
  end
442
-
443
- # If there are CSVs that are meant to be attachments in the files directory,
444
- # we don't want to consider them as the import CSV
445
- def in_files_dir?(path)
446
- File.dirname(path).ends_with?('files')
447
- end
448
559
  end
449
560
  end
@@ -46,7 +46,7 @@ module Bulkrax
46
46
  end
47
47
 
48
48
  def build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata)
49
- svc = ValidationContext.new(
49
+ svc = Bulkrax::CsvParser::ValidationContext.new(
50
50
  mapping_manager: mapping_manager,
51
51
  field_analyzer: field_analyzer,
52
52
  all_models: all_models,
@@ -315,8 +315,7 @@ de:
315
315
  missing_required_hint: Fügen Sie diese Spalte zu Ihrer CSV-Datei hinzu.
316
316
  missing_required_title: Fehlende Pflichtfelder
317
317
  missing_rights_desc: Ihre CSV-Datei enthält keine Spalte „rights_statement“. Sie können diese entweder Ihrer CSV-Datei hinzufügen oder im nächsten Schritt eine Standard-Rechteerklärung auswählen.
318
- multiple_csv_same_dir: Mehrere CSV-Dateien im selben Verzeichnis innerhalb der ZIP-Datei gefunden
319
- multiple_csv_same_level: Mehrere CSV-Dateien auf derselben Ebene innerhalb der ZIP-Datei gefunden
318
+ multiple_csv: Mehrere CSV-Dateien befinden sich auf der obersten Ebene des ZIP-Archivs, sodass die primäre CSV nicht bestimmt werden kann. Belassen Sie genau eine CSV auf dieser Ebene; weitere CSVs müssen tiefer verschachtelt sein.
320
319
  no_csv_in_zip: Es wurden keine CSV-Dateien im ZIP-Archiv gefunden.
321
320
  no_csv_uploaded: Es wurde keine CSV-Metadatendatei hochgeladen.
322
321
  no_files_uploaded: Es wurden keine Dateien hochgeladen.
@@ -346,6 +345,11 @@ de:
346
345
  notices_title: Importhinweise
347
346
  unrecognized_desc: 'Diese Spalten werden beim Import ignoriert:'
348
347
  unrecognized_title: Nicht anerkannte Felder
348
+ unzip:
349
+ errors:
350
+ multiple_csv: Mehrere CSV-Dateien befinden sich auf der obersten Ebene des ZIP-Archivs, sodass die primäre CSV nicht bestimmt werden kann. Belassen Sie genau eine CSV auf dieser Ebene; weitere CSVs müssen tiefer verschachtelt sein.
351
+ no_csv: Es wurden keine CSV-Dateien im ZIP-Archiv gefunden.
352
+ unsafe_entry: "Das ZIP enthält einen Eintrag mit unsicherem Pfad (%{name}). Einträge dürfen weder absolute Pfade noch Referenzen auf übergeordnete Verzeichnisse verwenden."
349
353
  validations:
350
354
  errors_prohibited:
351
355
  one: 'Ein Fehler verhinderte das Speichern dieses Importers:'
@@ -342,8 +342,7 @@ en:
342
342
  missing_required_hint: add this column to your CSV
343
343
  missing_required_title: Missing Required Fields
344
344
  missing_rights_desc: Your CSV does not include a rights_statement column. You can add it to your CSV or select a Default Rights Statement in the next step.
345
- multiple_csv_same_dir: Multiple CSV files found in the same directory within ZIP
346
- multiple_csv_same_level: Multiple CSV files found at the same level within ZIP
345
+ multiple_csv: Multiple CSV files share the shallowest directory level in the ZIP, so the primary CSV cannot be determined. Keep exactly one CSV at that level; any additional CSVs must be nested deeper.
347
346
  no_csv_in_zip: No CSV files found in ZIP
348
347
  no_csv_uploaded: No CSV metadata file uploaded
349
348
  no_files_uploaded: No files uploaded
@@ -382,6 +381,11 @@ en:
382
381
  notices_title: Import Notices
383
382
  unrecognized_desc: 'These columns will be ignored during import:'
384
383
  unrecognized_title: Unrecognized Fields
384
+ unzip:
385
+ errors:
386
+ multiple_csv: Multiple CSV files share the shallowest directory level in the ZIP, so the primary CSV cannot be determined. Keep exactly one CSV at that level; any additional CSVs must be nested deeper.
387
+ no_csv: No CSV file found in the ZIP
388
+ unsafe_entry: "The ZIP contains an entry with an unsafe path (%{name}). Entries must not use absolute paths or parent-directory references."
385
389
  validations:
386
390
  errors_prohibited:
387
391
  one: '1 error prohibited this importer from being saved:'
@@ -315,8 +315,7 @@ es:
315
315
  missing_required_hint: Añade esta columna a tu CSV
316
316
  missing_required_title: Campos obligatorios faltantes
317
317
  missing_rights_desc: Su archivo CSV no incluye la columna "rights_statement". Puede añadirla o seleccionar una "Declaración de derechos predeterminada" en el siguiente paso.
318
- multiple_csv_same_dir: Se encontraron varios archivos CSV en el mismo directorio dentro de ZIP
319
- multiple_csv_same_level: Se encontraron varios archivos CSV en el mismo nivel dentro de ZIP
318
+ multiple_csv: Hay varios archivos CSV en el nivel menos profundo del ZIP, por lo que no se puede determinar el CSV principal. Mantén exactamente un CSV en ese nivel; cualquier CSV adicional debe estar anidado más profundamente.
320
319
  no_csv_in_zip: No se encontraron archivos CSV en ZIP
321
320
  no_csv_uploaded: No se cargó ningún archivo de metadatos CSV
322
321
  no_files_uploaded: No hay archivos subidos
@@ -346,6 +345,11 @@ es:
346
345
  notices_title: Avisos de importación
347
346
  unrecognized_desc: 'Estas columnas se ignorarán durante la importación:'
348
347
  unrecognized_title: Campos no reconocidos
348
+ unzip:
349
+ errors:
350
+ multiple_csv: Hay varios archivos CSV en el nivel menos profundo del ZIP, por lo que no se puede determinar el CSV principal. Mantén exactamente un CSV en ese nivel; cualquier CSV adicional debe estar anidado más profundamente.
351
+ no_csv: No se encontraron archivos CSV en ZIP
352
+ unsafe_entry: "El ZIP contiene una entrada con una ruta no segura (%{name}). Las entradas no deben usar rutas absolutas ni referencias al directorio padre."
349
353
  validations:
350
354
  errors_prohibited:
351
355
  one: '1 error impidió que se guardara este importador:'
@@ -315,8 +315,7 @@ fr:
315
315
  missing_required_hint: Ajoutez cette colonne à votre fichier CSV.
316
316
  missing_required_title: Champs obligatoires manquants
317
317
  missing_rights_desc: Votre fichier CSV ne contient pas de colonne « droits_statement ». Vous pouvez l'ajouter ou sélectionner une déclaration de droits par défaut à l'étape suivante.
318
- multiple_csv_same_dir: Plusieurs fichiers CSV trouvés dans le même répertoire à l'intérieur du fichier ZIP
319
- multiple_csv_same_level: Plusieurs fichiers CSV trouvés au même niveau dans le fichier ZIP
318
+ multiple_csv: Plusieurs fichiers CSV se trouvent au niveau le moins profond dans le ZIP, ce qui empêche d'identifier le CSV principal. Conservez exactement un fichier CSV à ce niveau ; les CSV supplémentaires doivent être imbriqués plus profondément.
320
319
  no_csv_in_zip: Aucun fichier CSV trouvé dans le fichier ZIP
321
320
  no_csv_uploaded: Aucun fichier de métadonnées CSV n'a été téléchargé.
322
321
  no_files_uploaded: Aucun fichier téléchargé
@@ -346,6 +345,11 @@ fr:
346
345
  notices_title: Avis d'importation
347
346
  unrecognized_desc: 'Ces colonnes seront ignorées lors de l''importation :'
348
347
  unrecognized_title: Champs non reconnus
348
+ unzip:
349
+ errors:
350
+ multiple_csv: Plusieurs fichiers CSV se trouvent au niveau le moins profond dans le ZIP, ce qui empêche d'identifier le CSV principal. Conservez exactement un fichier CSV à ce niveau ; les CSV supplémentaires doivent être imbriqués plus profondément.
351
+ no_csv: Aucun fichier CSV trouvé dans le fichier ZIP
352
+ unsafe_entry: "Le ZIP contient une entrée avec un chemin non sûr (%{name}). Les entrées ne doivent pas utiliser de chemins absolus ni de références au répertoire parent."
349
353
  validations:
350
354
  errors_prohibited:
351
355
  one: 'Une erreur a empêché l''enregistrement de cet importateur :'
@@ -315,8 +315,7 @@ it:
315
315
  missing_required_hint: aggiungi questa colonna al tuo CSV
316
316
  missing_required_title: Campi obbligatori mancanti
317
317
  missing_rights_desc: Il tuo file CSV non include una colonna rights_statement. Puoi aggiungerla al tuo file CSV o selezionare una colonna "Default Rights Statement" nel passaggio successivo.
318
- multiple_csv_same_dir: Sono stati trovati più file CSV nella stessa directory all'interno di ZIP
319
- multiple_csv_same_level: Sono stati trovati più file CSV allo stesso livello all'interno dello ZIP
318
+ multiple_csv: Sono stati trovati più file CSV al livello meno profondo all'interno dello ZIP, quindi non è possibile determinare il CSV principale. Mantieni esattamente un CSV a quel livello; eventuali CSV aggiuntivi devono essere annidati più in profondità.
320
319
  no_csv_in_zip: Nessun file CSV trovato nello ZIP
321
320
  no_csv_uploaded: Nessun file di metadati CSV caricato
322
321
  no_files_uploaded: Nessun file caricato
@@ -346,6 +345,11 @@ it:
346
345
  notices_title: Avvisi di importazione
347
346
  unrecognized_desc: 'Queste colonne verranno ignorate durante l''importazione:'
348
347
  unrecognized_title: Campi non riconosciuti
348
+ unzip:
349
+ errors:
350
+ multiple_csv: Sono stati trovati più file CSV al livello meno profondo all'interno dello ZIP, quindi non è possibile determinare il CSV principale. Mantieni esattamente un CSV a quel livello; eventuali CSV aggiuntivi devono essere annidati più in profondità.
351
+ no_csv: Nessun file CSV trovato nello ZIP
352
+ unsafe_entry: "Lo ZIP contiene una voce con un percorso non sicuro (%{name}). Le voci non devono utilizzare percorsi assoluti né riferimenti alla directory superiore."
349
353
  validations:
350
354
  errors_prohibited:
351
355
  one: '1 errore ha impedito il salvataggio di questo importatore:'
@@ -315,8 +315,7 @@ pt-BR:
315
315
  missing_required_hint: Adicione esta coluna ao seu arquivo CSV.
316
316
  missing_required_title: Campos obrigatórios ausentes
317
317
  missing_rights_desc: Seu arquivo CSV não inclui uma coluna `rights_statement`. Você pode adicioná-la ao seu CSV ou selecionar uma Declaração de Direitos Padrão na próxima etapa.
318
- multiple_csv_same_dir: Vários arquivos CSV encontrados no mesmo diretório dentro do arquivo ZIP.
319
- multiple_csv_same_level: Vários arquivos CSV encontrados no mesmo nível dentro do arquivo ZIP.
318
+ multiple_csv: Vários arquivos CSV estão no nível menos profundo dentro do ZIP, então não é possível determinar qual é o CSV principal. Mantenha exatamente um CSV nesse nível; CSVs adicionais devem estar aninhados em níveis mais profundos.
320
319
  no_csv_in_zip: Nenhum arquivo CSV encontrado no arquivo ZIP.
321
320
  no_csv_uploaded: Nenhum arquivo de metadados CSV foi carregado.
322
321
  no_files_uploaded: Nenhum arquivo foi enviado.
@@ -346,6 +345,11 @@ pt-BR:
346
345
  notices_title: Avisos de importação
347
346
  unrecognized_desc: 'Estas colunas serão ignoradas durante a importação:'
348
347
  unrecognized_title: Campos não reconhecidos
348
+ unzip:
349
+ errors:
350
+ multiple_csv: Vários arquivos CSV estão no nível menos profundo dentro do ZIP, então não é possível determinar qual é o CSV principal. Mantenha exatamente um CSV nesse nível; CSVs adicionais devem estar aninhados em níveis mais profundos.
351
+ no_csv: Nenhum arquivo CSV encontrado no arquivo ZIP.
352
+ unsafe_entry: "O ZIP contém uma entrada com caminho inseguro (%{name}). As entradas não devem usar caminhos absolutos nem referências ao diretório pai."
349
353
  validations:
350
354
  errors_prohibited:
351
355
  one: '1 erro impediu que este importador fosse salvo:'
@@ -314,8 +314,7 @@ zh:
314
314
  missing_required_hint: 将此列添加到您的 CSV 文件中
315
315
  missing_required_title: 缺少必填字段
316
316
  missing_rights_desc: 您的 CSV 文件不包含 rights_statement 列。您可以在下一步中将其添加到 CSV 文件中,或选择默认的权利声明。
317
- multiple_csv_same_dir: ZIP 文件中的同一目录下发现了多个 CSV 文件
318
- multiple_csv_same_level: 在 ZIP 文件中的同一层级发现了多个 CSV 文件
317
+ multiple_csv: ZIP 文件中的同一最浅层级下发现了多个 CSV 文件,无法确定主 CSV。请在该层级仅保留一个 CSV,其他 CSV 必须位于更深的目录中。
319
318
  no_csv_in_zip: ZIP 文件中未找到 CSV 文件
320
319
  no_csv_uploaded: 未上传 CSV 元数据文件。
321
320
  no_files_uploaded: 未上传任何文件。
@@ -345,6 +344,11 @@ zh:
345
344
  notices_title: 导入通知
346
345
  unrecognized_desc: 导入过程中将忽略以下列:
347
346
  unrecognized_title: 未识别字段
347
+ unzip:
348
+ errors:
349
+ multiple_csv: ZIP 文件中的同一最浅层级下发现了多个 CSV 文件,无法确定主 CSV。请在该层级仅保留一个 CSV,其他 CSV 必须位于更深的目录中。
350
+ no_csv: ZIP 文件中未找到 CSV 文件
351
+ unsafe_entry: "ZIP 文件包含路径不安全的条目(%{name})。条目不得使用绝对路径或父目录引用。"
348
352
  validations:
349
353
  errors_prohibited:
350
354
  one: 1 个错误导致此导入程序无法保存:
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Bulkrax
4
- VERSION = '9.4.0'
4
+ VERSION = '9.4.1'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bulkrax
3
3
  version: !ruby/object:Gem::Version
4
- version: 9.4.0
4
+ version: 9.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rob Kaufman
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-04-15 00:00:00.000000000 Z
11
+ date: 2026-04-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rails
@@ -365,6 +365,7 @@ files:
365
365
  - app/controllers/concerns/bulkrax/download_behavior.rb
366
366
  - app/controllers/concerns/bulkrax/guided_import_demo_scenarios.rb
367
367
  - app/controllers/concerns/bulkrax/importer_file_handler.rb
368
+ - app/errors/bulkrax/unzip_error.rb
368
369
  - app/factories/bulkrax/object_factory.rb
369
370
  - app/factories/bulkrax/object_factory_interface.rb
370
371
  - app/factories/bulkrax/valkyrie_object_factory.rb