bulkrax 9.4.0 → 9.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/concerns/bulkrax/importer_file_handler.rb +1 -6
- data/app/errors/bulkrax/unzip_error.rb +16 -0
- data/app/jobs/bulkrax/importer_job.rb +40 -9
- data/app/models/bulkrax/importer.rb +3 -16
- data/app/parsers/bulkrax/application_parser.rb +50 -33
- data/app/parsers/bulkrax/bagit_parser.rb +12 -0
- data/app/parsers/bulkrax/csv_parser.rb +148 -37
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb +1 -1
- data/config/locales/bulkrax.de.yml +6 -2
- data/config/locales/bulkrax.en.yml +6 -2
- data/config/locales/bulkrax.es.yml +6 -2
- data/config/locales/bulkrax.fr.yml +6 -2
- data/config/locales/bulkrax.it.yml +6 -2
- data/config/locales/bulkrax.pt-BR.yml +6 -2
- data/config/locales/bulkrax.zh.yml +6 -2
- data/lib/bulkrax/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 420e0b83f78ad1c411b0532bda121bebe74a651a9d1abec51549273896a00bcb
|
|
4
|
+
data.tar.gz: 892e143d2de6c714121804bf547b9473545c1071ecce6ef9a6269cb60eeaf66f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 497fe999aa3d39f3e7281b5e743a75d5b6c60ba93d0a7a40bd63bdfe248b0c35e52dffaa9b8aebe59489e68999ea9a0e22826ed30ffea0f2d8927cfaa61852d5
|
|
7
|
+
data.tar.gz: 176a04163d610ad5241b96ecd107ae4bc79e64dddaff94f2fa80b7b93ce48951934ffb8f4a3438169c185f7c5a6e9f468aa8371c3c1c3d282d7c9d59ebfde946
|
|
@@ -121,12 +121,7 @@ module Bulkrax
|
|
|
121
121
|
csv_by_depth = get_directory_depth_for_each_csv(csv_entries)
|
|
122
122
|
csvs_at_level = determine_csvs_at_shallowest_level(csv_by_depth)
|
|
123
123
|
|
|
124
|
-
|
|
125
|
-
csvs_by_directory.each do |_dir, csvs|
|
|
126
|
-
return StepperResponseFormatter.error(message: I18n.t('bulkrax.importer.guided_import.validation.multiple_csv_same_dir')) if csvs.count > 1
|
|
127
|
-
end
|
|
128
|
-
|
|
129
|
-
return StepperResponseFormatter.error(message: I18n.t('bulkrax.importer.guided_import.validation.multiple_csv_same_level')) if csvs_at_level.size > 1
|
|
124
|
+
return StepperResponseFormatter.error(message: I18n.t('bulkrax.importer.guided_import.validation.multiple_csv')) if csvs_at_level.size > 1
|
|
130
125
|
|
|
131
126
|
csvs_at_level.first
|
|
132
127
|
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Bulkrax
|
|
4
|
+
# Raised when a zip cannot be safely or meaningfully extracted during
|
|
5
|
+
# import. Covered scenarios include:
|
|
6
|
+
#
|
|
7
|
+
# - A single upload zip has no CSV at any level.
|
|
8
|
+
# - A single upload zip has multiple CSVs at its shallowest level
|
|
9
|
+
# (primary CSV cannot be determined).
|
|
10
|
+
# - A zip entry's name would escape the destination directory
|
|
11
|
+
# (Zip Slip: absolute paths, `..` traversal, etc.).
|
|
12
|
+
#
|
|
13
|
+
# Defined in its own file so Zeitwerk can autoload the constant by name
|
|
14
|
+
# from any parser or job that raises or rescues it.
|
|
15
|
+
class UnzipError < StandardError; end
|
|
16
|
+
end
|
|
@@ -13,7 +13,7 @@ module Bulkrax
|
|
|
13
13
|
import(importer, only_updates_since_last_import)
|
|
14
14
|
update_current_run_counters(importer)
|
|
15
15
|
schedule(importer) if importer.schedulable?
|
|
16
|
-
rescue ::CSV::MalformedCSVError => e
|
|
16
|
+
rescue ::CSV::MalformedCSVError, Bulkrax::UnzipError => e
|
|
17
17
|
importer.set_status_info(e)
|
|
18
18
|
end
|
|
19
19
|
|
|
@@ -26,18 +26,49 @@ module Bulkrax
|
|
|
26
26
|
importer.import_objects
|
|
27
27
|
end
|
|
28
28
|
|
|
29
|
+
# Populates `importer_unzip_path` with the uploaded file(s), leaving
|
|
30
|
+
# the working directory in the shape each parser expects.
|
|
31
|
+
#
|
|
32
|
+
# Dispatch by parser capability rather than class name:
|
|
33
|
+
# - CsvParser (and subclasses that replicate its shape) implements
|
|
34
|
+
# `#unzip_with_primary_csv` and `#unzip_attachments_only`, which
|
|
35
|
+
# place the primary CSV at root and attachments under `files/`.
|
|
36
|
+
# - Other parsers (XML, raw BagIt) inherit the base-class `#unzip`,
|
|
37
|
+
# which extracts the zip verbatim.
|
|
38
|
+
# - The separate attachments-zip flow is CSV-only (guided import is
|
|
39
|
+
# the only UI that produces it).
|
|
40
|
+
#
|
|
41
|
+
# A retry of this job gets a clean working directory: any prior
|
|
42
|
+
# extraction state from an earlier attempt is wiped, so nothing runs
|
|
43
|
+
# against partially-populated state.
|
|
29
44
|
def unzip_imported_file(parser)
|
|
30
45
|
return unless parser.file?
|
|
46
|
+
|
|
47
|
+
reset_unzip_path(parser)
|
|
48
|
+
|
|
49
|
+
import_file_path = parser.parser_fields['import_file_path']
|
|
50
|
+
attachments_zip_path = parser.parser_fields['attachments_zip_path']
|
|
51
|
+
|
|
31
52
|
if parser.zip?
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
parser.copy_file(
|
|
39
|
-
parser.
|
|
53
|
+
if parser.respond_to?(:unzip_with_primary_csv)
|
|
54
|
+
parser.unzip_with_primary_csv(import_file_path)
|
|
55
|
+
else
|
|
56
|
+
parser.unzip(import_file_path)
|
|
57
|
+
end
|
|
58
|
+
elsif parser.respond_to?(:unzip_attachments_only) && parser.zip_file?(attachments_zip_path)
|
|
59
|
+
parser.copy_file(import_file_path)
|
|
60
|
+
parser.unzip_attachments_only(attachments_zip_path)
|
|
61
|
+
else
|
|
62
|
+
parser.copy_file(import_file_path)
|
|
40
63
|
end
|
|
64
|
+
|
|
65
|
+
parser.remove_spaces_from_filenames if parser.respond_to?(:remove_spaces_from_filenames)
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def reset_unzip_path(parser)
|
|
69
|
+
path = parser.importer_unzip_path
|
|
70
|
+
FileUtils.rm_rf(path) if Dir.exist?(path)
|
|
71
|
+
FileUtils.mkdir_p(path)
|
|
41
72
|
end
|
|
42
73
|
|
|
43
74
|
def update_current_run_counters(importer)
|
|
@@ -266,22 +266,9 @@ module Bulkrax
|
|
|
266
266
|
# end
|
|
267
267
|
|
|
268
268
|
def importer_unzip_path(mkdir: false)
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
FileUtils.mkdir_p(unzip_dir) if mkdir
|
|
273
|
-
return unzip_dir
|
|
274
|
-
end
|
|
275
|
-
|
|
276
|
-
@importer_unzip_path ||= File.join(parser.base_path, "import_#{path_string}")
|
|
277
|
-
return @importer_unzip_path if Dir.exist?(@importer_unzip_path) || mkdir == true
|
|
278
|
-
|
|
279
|
-
# turns "tmp/imports/tenant/import_1_20250122035229_1" to "tmp/imports/tenant/import_1_20250122035229"
|
|
280
|
-
base_importer_unzip_path = @importer_unzip_path.split('_')[0...-1].join('_')
|
|
281
|
-
|
|
282
|
-
# If we don't have an existing unzip path, we'll try and find it.
|
|
283
|
-
# Just in case there are multiple paths, we sort by the number at the end of the path and get the last one
|
|
284
|
-
@importer_unzip_path = Dir.glob(base_importer_unzip_path + '*').sort_by { |path| path.split(base_importer_unzip_path).last[1..-1].to_i }.last
|
|
269
|
+
path = File.join(parser.base_path, "import_#{path_string}")
|
|
270
|
+
FileUtils.mkdir_p(path) if mkdir
|
|
271
|
+
path
|
|
285
272
|
end
|
|
286
273
|
|
|
287
274
|
def errored_entries_csv_path
|
|
@@ -430,39 +430,72 @@ module Bulkrax
|
|
|
430
430
|
zip
|
|
431
431
|
end
|
|
432
432
|
|
|
433
|
+
# Extracts a zip verbatim into {#importer_unzip_path}, preserving the zip's
|
|
434
|
+
# internal structure. Filters macOS junk (`__MACOSX/`, `.DS_Store`, `._*`).
|
|
435
|
+
# Parser subclasses that need to interpret the zip's structure (e.g.
|
|
436
|
+
# {Bulkrax::CsvParser#unzip_with_primary_csv}) should call a more specific
|
|
437
|
+
# method rather than this one.
|
|
433
438
|
def unzip(file_to_unzip)
|
|
434
439
|
return untar(file_to_unzip) if file_to_unzip.end_with?('.tar.gz')
|
|
435
440
|
|
|
441
|
+
dest_dir = importer_unzip_path(mkdir: true)
|
|
436
442
|
Zip::File.open(file_to_unzip) do |zip_file|
|
|
437
|
-
real_entries = zip_file.reject { |e| macos_junk_entry?(e.name) }
|
|
438
|
-
top_level_dirs = real_entries.map { |e| e.name.split('/').first }.uniq
|
|
439
|
-
strip_prefix = top_level_dirs.size == 1 ? "#{top_level_dirs.first}/" : nil
|
|
440
|
-
|
|
441
|
-
dest_dir = importer_unzip_path(mkdir: true)
|
|
442
443
|
zip_file.each do |entry|
|
|
443
444
|
next unless entry.file?
|
|
444
445
|
next if macos_junk_entry?(entry.name)
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
dest_path = File.join(dest_dir, name)
|
|
446
|
+
reject_unsafe_entry!(entry.name)
|
|
447
|
+
dest_path = safe_extract_path(dest_dir, entry.name)
|
|
448
448
|
FileUtils.mkdir_p(File.dirname(dest_path))
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
# rubyzip 3.x: extract(entry, relative_name, destination_directory: dir)
|
|
452
|
-
if zip_file.method(:extract).arity == 2
|
|
453
|
-
zip_file.extract(entry, dest_path)
|
|
454
|
-
else
|
|
455
|
-
zip_file.extract(entry, name, destination_directory: dest_dir)
|
|
456
|
-
end
|
|
457
|
-
end
|
|
449
|
+
next if File.exist?(dest_path)
|
|
450
|
+
extract_zip_entry(zip_file, entry, dest_dir, entry.name, dest_path)
|
|
458
451
|
end
|
|
459
452
|
end
|
|
460
453
|
end
|
|
461
454
|
|
|
455
|
+
# rubyzip 2.x: extract(entry, absolute_dest_path)
|
|
456
|
+
# rubyzip 3.x: extract(entry, relative_name, destination_directory: dir)
|
|
457
|
+
#
|
|
458
|
+
# Callers are responsible for passing a `dest_path` produced by
|
|
459
|
+
# {#safe_extract_path} so the write can't escape `dest_dir`.
|
|
460
|
+
def extract_zip_entry(zip_file, entry, dest_dir, relative_name, dest_path)
|
|
461
|
+
if zip_file.method(:extract).arity == 2
|
|
462
|
+
zip_file.extract(entry, dest_path)
|
|
463
|
+
else
|
|
464
|
+
zip_file.extract(entry, relative_name, destination_directory: dest_dir)
|
|
465
|
+
end
|
|
466
|
+
end
|
|
467
|
+
|
|
462
468
|
def macos_junk_entry?(name)
|
|
463
469
|
name.start_with?('__MACOSX/') || name.split('/').any? { |part| part == '.DS_Store' || part.start_with?('._') }
|
|
464
470
|
end
|
|
465
471
|
|
|
472
|
+
# Zip Slip preflight — reject entries whose names are obviously unsafe
|
|
473
|
+
# (absolute paths, `..` segments) before we touch the filesystem.
|
|
474
|
+
# {#safe_extract_path} is the final line of defense; this check just
|
|
475
|
+
# fails fast with a clear message.
|
|
476
|
+
#
|
|
477
|
+
# @raise [Bulkrax::UnzipError] if the entry name is unsafe
|
|
478
|
+
def reject_unsafe_entry!(name)
|
|
479
|
+
return unless name.start_with?('/') || name.split('/').include?('..')
|
|
480
|
+
raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: name)
|
|
481
|
+
end
|
|
482
|
+
|
|
483
|
+
# Zip Slip chokepoint. Resolves `relative_dest` against `dest_dir` and
|
|
484
|
+
# returns the absolute destination path — but only if it stays inside
|
|
485
|
+
# `dest_dir`. Callers must use this value rather than building their
|
|
486
|
+
# own path with `File.join`, so the path returned is always safe by
|
|
487
|
+
# construction.
|
|
488
|
+
#
|
|
489
|
+
# @return [String] absolute destination path, validated to be inside `dest_dir`
|
|
490
|
+
# @raise [Bulkrax::UnzipError] if `relative_dest` escapes `dest_dir`
|
|
491
|
+
def safe_extract_path(dest_dir, relative_dest)
|
|
492
|
+
expanded_dest_dir = File.expand_path(dest_dir)
|
|
493
|
+
dest_path = File.expand_path(relative_dest.to_s, expanded_dest_dir)
|
|
494
|
+
return dest_path if dest_path == expanded_dest_dir
|
|
495
|
+
return dest_path if dest_path.start_with?("#{expanded_dest_dir}#{File::SEPARATOR}")
|
|
496
|
+
raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: relative_dest)
|
|
497
|
+
end
|
|
498
|
+
|
|
466
499
|
def copy_file(file_to_copy)
|
|
467
500
|
destination = File.join(importer_unzip_path(mkdir: true), File.basename(file_to_copy))
|
|
468
501
|
FileUtils.cp(file_to_copy, destination)
|
|
@@ -475,21 +508,6 @@ module Bulkrax
|
|
|
475
508
|
raise "Failed to extract #{file_to_untar}" unless result
|
|
476
509
|
end
|
|
477
510
|
|
|
478
|
-
# File names referenced in CSVs have spaces replaced with underscores
|
|
479
|
-
# @see Bulkrax::CsvParser#file_paths
|
|
480
|
-
def remove_spaces_from_filenames
|
|
481
|
-
files = Dir.glob(File.join(importer_unzip_path, 'files', '*')).uniq
|
|
482
|
-
files_with_spaces = files.select { |f| f.split('/').last.match?(' ') }
|
|
483
|
-
return if files_with_spaces.blank?
|
|
484
|
-
|
|
485
|
-
files_with_spaces.map! { |path| Pathname.new(path) }
|
|
486
|
-
files_with_spaces.each do |path|
|
|
487
|
-
filename = path.basename
|
|
488
|
-
filename_without_spaces = filename.to_s.tr(' ', '_')
|
|
489
|
-
path.rename(File.join(path.dirname, filename_without_spaces))
|
|
490
|
-
end
|
|
491
|
-
end
|
|
492
|
-
|
|
493
511
|
def zip
|
|
494
512
|
FileUtils.mkdir_p(exporter_export_zip_path)
|
|
495
513
|
|
|
@@ -515,7 +533,6 @@ module Bulkrax
|
|
|
515
533
|
|
|
516
534
|
# @return [String]
|
|
517
535
|
def real_import_file_path
|
|
518
|
-
return importer_unzip_path if file? && zip?
|
|
519
536
|
parser_fields['import_file_path']
|
|
520
537
|
end
|
|
521
538
|
end
|
|
@@ -25,6 +25,18 @@ unless ENV.fetch('BULKRAX_NO_BAGIT', 'false').to_s == 'true'
|
|
|
25
25
|
@path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
+
# BagIt archives are not CSV imports: they don't contain a primary
|
|
29
|
+
# CSV at a shallowest level, and their structure (bagit.txt + data/
|
|
30
|
+
# + manifests) must be preserved verbatim. Override both CSV-flavored
|
|
31
|
+
# unzip entry points to use the base-class verbatim extraction.
|
|
32
|
+
def unzip_with_primary_csv(file_to_unzip)
|
|
33
|
+
unzip(file_to_unzip)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def unzip_attachments_only(file_to_unzip)
|
|
37
|
+
unzip(file_to_unzip)
|
|
38
|
+
end
|
|
39
|
+
|
|
28
40
|
# Take a random sample of 10 metadata_paths and work out the import fields from that
|
|
29
41
|
def import_fields
|
|
30
42
|
raise StandardError, 'No metadata files were found' if metadata_paths.blank?
|
|
@@ -379,47 +379,161 @@ module Bulkrax
|
|
|
379
379
|
end.flatten.compact.uniq
|
|
380
380
|
end
|
|
381
381
|
|
|
382
|
-
# Retrieve the path where we expect to find the files
|
|
382
|
+
# Retrieve the path where we expect to find the files for this import.
|
|
383
|
+
# After {ImporterJob#unzip_imported_file} runs (zip cases), attachments
|
|
384
|
+
# live under `{importer_unzip_path}/files/`. For a server-path-style
|
|
385
|
+
# import (the user specified a CSV file path with a sibling `files/`
|
|
386
|
+
# directory on disk), resolve relative to the CSV's directory instead.
|
|
387
|
+
#
|
|
388
|
+
# When called with `filename:`, returns the full path to that file if
|
|
389
|
+
# it exists on disk, or `nil` otherwise — callers like
|
|
390
|
+
# `Bulkrax::FileSetEntryBehavior#add_path_to_file` rely on the nil
|
|
391
|
+
# sentinel to fall back to the raw filename in their error messages.
|
|
392
|
+
#
|
|
393
|
+
# When called with no filename, returns the `files/` directory itself
|
|
394
|
+
# (only when that directory exists on disk — else `nil` so callers can
|
|
395
|
+
# raise a clear "no files directory" error).
|
|
383
396
|
def path_to_files(**args)
|
|
384
397
|
filename = args.fetch(:filename, '')
|
|
398
|
+
base_dir = files_dir
|
|
399
|
+
return base_dir if filename.blank? && Dir.exist?(base_dir)
|
|
400
|
+
return nil if filename.blank?
|
|
401
|
+
|
|
402
|
+
candidate = File.join(base_dir, filename)
|
|
403
|
+
candidate if File.exist?(candidate)
|
|
404
|
+
end
|
|
405
|
+
|
|
406
|
+
# Extracts a zip that contains a primary CSV. The primary CSV lands at
|
|
407
|
+
# the root of {#importer_unzip_path}; every other entry lands under
|
|
408
|
+
# {#importer_unzip_path}/files/, preserving its path relative to the
|
|
409
|
+
# primary CSV's directory.
|
|
410
|
+
#
|
|
411
|
+
# Primary-CSV selection matches the guided-import validator's rule
|
|
412
|
+
# (see {Bulkrax::ImporterFileHandler#locate_csv_entry_in_zip}): the CSV
|
|
413
|
+
# entry at the shallowest directory level. Visible errors are raised on
|
|
414
|
+
# zero CSVs or multiple CSVs at the shallowest level.
|
|
415
|
+
#
|
|
416
|
+
# @param file_to_unzip [String] absolute path to a .zip
|
|
417
|
+
# @raise [Bulkrax::UnzipError] on no CSV or ambiguous CSVs
|
|
418
|
+
def unzip_with_primary_csv(file_to_unzip)
|
|
419
|
+
dest_dir = importer_unzip_path(mkdir: true)
|
|
420
|
+
Zip::File.open(file_to_unzip) do |zip_file|
|
|
421
|
+
entries = real_zip_entries(zip_file)
|
|
422
|
+
primary = select_primary_csv!(entries)
|
|
423
|
+
primary_dir = File.dirname(primary.name)
|
|
424
|
+
|
|
425
|
+
entries.each do |entry|
|
|
426
|
+
if entry == primary
|
|
427
|
+
extract_to(zip_file, entry, dest_dir, File.basename(entry.name))
|
|
428
|
+
else
|
|
429
|
+
extract_to(zip_file, entry, dest_dir, File.join('files', relative_to(primary_dir, entry.name)))
|
|
430
|
+
end
|
|
431
|
+
end
|
|
432
|
+
end
|
|
433
|
+
end
|
|
385
434
|
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
435
|
+
# Extracts a zip that accompanies a separately-uploaded CSV. Every
|
|
436
|
+
# entry lands under {#importer_unzip_path}/files/ — including any
|
|
437
|
+
# CSVs inside the zip, which are treated as attachments since the
|
|
438
|
+
# primary CSV was uploaded outside the zip. Strips a single top-level
|
|
439
|
+
# wrapper directory if present, so users can zip either the contents
|
|
440
|
+
# or the enclosing folder.
|
|
441
|
+
#
|
|
442
|
+
# @param file_to_unzip [String] absolute path to a .zip
|
|
443
|
+
def unzip_attachments_only(file_to_unzip)
|
|
444
|
+
dest_dir = importer_unzip_path(mkdir: true)
|
|
445
|
+
Zip::File.open(file_to_unzip) do |zip_file|
|
|
446
|
+
entries = real_zip_entries(zip_file)
|
|
447
|
+
wrapper = single_top_level_wrapper(entries)
|
|
448
|
+
|
|
449
|
+
entries.each do |entry|
|
|
450
|
+
relative = wrapper ? entry.name.delete_prefix("#{wrapper}/") : entry.name
|
|
451
|
+
next if relative.empty?
|
|
452
|
+
extract_to(zip_file, entry, dest_dir, File.join('files', relative))
|
|
453
|
+
end
|
|
454
|
+
end
|
|
398
455
|
end
|
|
399
456
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
457
|
+
# File names referenced in CSVs have spaces replaced with underscores.
|
|
458
|
+
# @see #file_paths
|
|
459
|
+
def remove_spaces_from_filenames
|
|
460
|
+
files = Dir.glob(File.join(importer_unzip_path, 'files', '*'))
|
|
461
|
+
files_with_spaces = files.select { |f| f.split('/').last.include?(' ') }
|
|
462
|
+
return if files_with_spaces.blank?
|
|
463
|
+
|
|
464
|
+
files_with_spaces.map! { |path| Pathname.new(path) }
|
|
465
|
+
files_with_spaces.each do |path|
|
|
466
|
+
filename_without_spaces = path.basename.to_s.tr(' ', '_')
|
|
467
|
+
path.rename(File.join(path.dirname, filename_without_spaces))
|
|
468
|
+
end
|
|
403
469
|
end
|
|
404
470
|
|
|
405
471
|
private
|
|
406
472
|
|
|
407
|
-
#
|
|
408
|
-
#
|
|
409
|
-
#
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
files_dir = File.join(dest_dir, 'files')
|
|
416
|
-
FileUtils.mkdir_p(files_dir)
|
|
417
|
-
flat_files.each do |f|
|
|
418
|
-
dest = File.join(files_dir, File.basename(f))
|
|
419
|
-
FileUtils.mv(f, dest) unless File.exist?(dest)
|
|
473
|
+
# Memoized base directory under which import attachments live. Kept
|
|
474
|
+
# separate from `#path_to_files`' per-filename return value to avoid
|
|
475
|
+
# cross-contamination between directory lookups and file lookups.
|
|
476
|
+
def files_dir
|
|
477
|
+
@files_dir ||= begin
|
|
478
|
+
has_attachments_zip = parser_fields['attachments_zip_path'].present? && zip_file?(parser_fields['attachments_zip_path'])
|
|
479
|
+
base = zip? || has_attachments_zip ? importer_unzip_path : File.dirname(import_file_path)
|
|
480
|
+
File.join(base, 'files')
|
|
420
481
|
end
|
|
421
482
|
end
|
|
422
483
|
|
|
484
|
+
# Returns zip entries filtered down to real files (no directories, no
|
|
485
|
+
# macOS junk). Raises {Bulkrax::UnzipError} if any entry's name would
|
|
486
|
+
# escape the destination directory (Zip Slip).
|
|
487
|
+
def real_zip_entries(zip_file)
|
|
488
|
+
entries = zip_file.entries.select { |e| e.file? && !macos_junk_entry?(e.name) }
|
|
489
|
+
entries.each { |e| reject_unsafe_entry!(e.name) }
|
|
490
|
+
entries
|
|
491
|
+
end
|
|
492
|
+
|
|
493
|
+
# Picks the single primary CSV from zip entries, enforcing the
|
|
494
|
+
# shallowest-level rule. Raises {Bulkrax::UnzipError} on failure.
|
|
495
|
+
def select_primary_csv!(entries)
|
|
496
|
+
csvs = entries.select { |e| e.name.end_with?('.csv') }
|
|
497
|
+
raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.no_csv') if csvs.empty?
|
|
498
|
+
|
|
499
|
+
by_depth = csvs.group_by { |e| e.name.count('/') }
|
|
500
|
+
shallowest = by_depth[by_depth.keys.min]
|
|
501
|
+
|
|
502
|
+
raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.multiple_csv') if shallowest.size > 1
|
|
503
|
+
|
|
504
|
+
shallowest.first
|
|
505
|
+
end
|
|
506
|
+
|
|
507
|
+
# If every entry shares a single top-level directory, returns that
|
|
508
|
+
# directory name; otherwise nil.
|
|
509
|
+
def single_top_level_wrapper(entries)
|
|
510
|
+
tops = entries.map { |e| e.name.split('/').first }.uniq
|
|
511
|
+
return nil unless tops.size == 1
|
|
512
|
+
# If the single top segment is a file (no slashes in the entry), not a dir,
|
|
513
|
+
# there's no wrapper to strip.
|
|
514
|
+
return nil if entries.any? { |e| e.name == tops.first }
|
|
515
|
+
tops.first
|
|
516
|
+
end
|
|
517
|
+
|
|
518
|
+
# Returns `path` with `prefix/` removed from the front, if present, and
|
|
519
|
+
# a leading `files/` segment also stripped so callers can join under
|
|
520
|
+
# `files/` without doubling when the zip already uses that convention.
|
|
521
|
+
def relative_to(prefix, path)
|
|
522
|
+
remaining = prefix == '.' || prefix.empty? ? path : path.delete_prefix("#{prefix}/")
|
|
523
|
+
remaining.delete_prefix('files/')
|
|
524
|
+
end
|
|
525
|
+
|
|
526
|
+
# Extracts a zip entry to `dest_dir/relative_dest`. Creates intermediate
|
|
527
|
+
# directories and honors the rubyzip 2/3 extract-method signature.
|
|
528
|
+
# The destination path is validated by {#safe_extract_path} — an unsafe
|
|
529
|
+
# `relative_dest` raises {Bulkrax::UnzipError} before any write.
|
|
530
|
+
def extract_to(zip_file, entry, dest_dir, relative_dest)
|
|
531
|
+
dest_path = safe_extract_path(dest_dir, relative_dest)
|
|
532
|
+
FileUtils.mkdir_p(File.dirname(dest_path))
|
|
533
|
+
return if File.exist?(dest_path)
|
|
534
|
+
extract_zip_entry(zip_file, entry, dest_dir, relative_dest, dest_path)
|
|
535
|
+
end
|
|
536
|
+
|
|
423
537
|
def unique_collection_identifier(collection_hash)
|
|
424
538
|
entry_uid = collection_hash[source_identifier]
|
|
425
539
|
entry_uid ||= if Bulkrax.fill_in_blank_source_identifiers.present?
|
|
@@ -434,16 +548,13 @@ module Bulkrax
|
|
|
434
548
|
# Override to return the first CSV in the path, if a zip file is supplied
|
|
435
549
|
# We expect a single CSV at the top level of the zip in the CSVParser
|
|
436
550
|
# but we are willing to go look for it if need be
|
|
551
|
+
# When the user uploaded a zip containing a CSV, the job extracts the
|
|
552
|
+
# primary CSV to the root of `importer_unzip_path` (see
|
|
553
|
+
# {#unzip_with_primary_csv}). Any non-primary CSVs live under `files/`
|
|
554
|
+
# and are treated as attachments, so a shallow glob suffices.
|
|
437
555
|
def real_import_file_path
|
|
438
|
-
return Dir["#{importer_unzip_path}
|
|
439
|
-
|
|
556
|
+
return Dir["#{importer_unzip_path}/*.csv"].first if file? && zip?
|
|
440
557
|
parser_fields['import_file_path']
|
|
441
558
|
end
|
|
442
|
-
|
|
443
|
-
# If there are CSVs that are meant to be attachments in the files directory,
|
|
444
|
-
# we don't want to consider them as the import CSV
|
|
445
|
-
def in_files_dir?(path)
|
|
446
|
-
File.dirname(path).ends_with?('files')
|
|
447
|
-
end
|
|
448
559
|
end
|
|
449
560
|
end
|
|
@@ -46,7 +46,7 @@ module Bulkrax
|
|
|
46
46
|
end
|
|
47
47
|
|
|
48
48
|
def build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata)
|
|
49
|
-
svc = ValidationContext.new(
|
|
49
|
+
svc = Bulkrax::CsvParser::ValidationContext.new(
|
|
50
50
|
mapping_manager: mapping_manager,
|
|
51
51
|
field_analyzer: field_analyzer,
|
|
52
52
|
all_models: all_models,
|
|
@@ -315,8 +315,7 @@ de:
|
|
|
315
315
|
missing_required_hint: Fügen Sie diese Spalte zu Ihrer CSV-Datei hinzu.
|
|
316
316
|
missing_required_title: Fehlende Pflichtfelder
|
|
317
317
|
missing_rights_desc: Ihre CSV-Datei enthält keine Spalte „rights_statement“. Sie können diese entweder Ihrer CSV-Datei hinzufügen oder im nächsten Schritt eine Standard-Rechteerklärung auswählen.
|
|
318
|
-
|
|
319
|
-
multiple_csv_same_level: Mehrere CSV-Dateien auf derselben Ebene innerhalb der ZIP-Datei gefunden
|
|
318
|
+
multiple_csv: Mehrere CSV-Dateien befinden sich auf der obersten Ebene des ZIP-Archivs, sodass die primäre CSV nicht bestimmt werden kann. Belassen Sie genau eine CSV auf dieser Ebene; weitere CSVs müssen tiefer verschachtelt sein.
|
|
320
319
|
no_csv_in_zip: Es wurden keine CSV-Dateien im ZIP-Archiv gefunden.
|
|
321
320
|
no_csv_uploaded: Es wurde keine CSV-Metadatendatei hochgeladen.
|
|
322
321
|
no_files_uploaded: Es wurden keine Dateien hochgeladen.
|
|
@@ -346,6 +345,11 @@ de:
|
|
|
346
345
|
notices_title: Importhinweise
|
|
347
346
|
unrecognized_desc: 'Diese Spalten werden beim Import ignoriert:'
|
|
348
347
|
unrecognized_title: Nicht anerkannte Felder
|
|
348
|
+
unzip:
|
|
349
|
+
errors:
|
|
350
|
+
multiple_csv: Mehrere CSV-Dateien befinden sich auf der obersten Ebene des ZIP-Archivs, sodass die primäre CSV nicht bestimmt werden kann. Belassen Sie genau eine CSV auf dieser Ebene; weitere CSVs müssen tiefer verschachtelt sein.
|
|
351
|
+
no_csv: Es wurden keine CSV-Dateien im ZIP-Archiv gefunden.
|
|
352
|
+
unsafe_entry: "Das ZIP enthält einen Eintrag mit unsicherem Pfad (%{name}). Einträge dürfen weder absolute Pfade noch Referenzen auf übergeordnete Verzeichnisse verwenden."
|
|
349
353
|
validations:
|
|
350
354
|
errors_prohibited:
|
|
351
355
|
one: 'Ein Fehler verhinderte das Speichern dieses Importers:'
|
|
@@ -342,8 +342,7 @@ en:
|
|
|
342
342
|
missing_required_hint: add this column to your CSV
|
|
343
343
|
missing_required_title: Missing Required Fields
|
|
344
344
|
missing_rights_desc: Your CSV does not include a rights_statement column. You can add it to your CSV or select a Default Rights Statement in the next step.
|
|
345
|
-
|
|
346
|
-
multiple_csv_same_level: Multiple CSV files found at the same level within ZIP
|
|
345
|
+
multiple_csv: Multiple CSV files share the shallowest directory level in the ZIP, so the primary CSV cannot be determined. Keep exactly one CSV at that level; any additional CSVs must be nested deeper.
|
|
347
346
|
no_csv_in_zip: No CSV files found in ZIP
|
|
348
347
|
no_csv_uploaded: No CSV metadata file uploaded
|
|
349
348
|
no_files_uploaded: No files uploaded
|
|
@@ -382,6 +381,11 @@ en:
|
|
|
382
381
|
notices_title: Import Notices
|
|
383
382
|
unrecognized_desc: 'These columns will be ignored during import:'
|
|
384
383
|
unrecognized_title: Unrecognized Fields
|
|
384
|
+
unzip:
|
|
385
|
+
errors:
|
|
386
|
+
multiple_csv: Multiple CSV files share the shallowest directory level in the ZIP, so the primary CSV cannot be determined. Keep exactly one CSV at that level; any additional CSVs must be nested deeper.
|
|
387
|
+
no_csv: No CSV file found in the ZIP
|
|
388
|
+
unsafe_entry: "The ZIP contains an entry with an unsafe path (%{name}). Entries must not use absolute paths or parent-directory references."
|
|
385
389
|
validations:
|
|
386
390
|
errors_prohibited:
|
|
387
391
|
one: '1 error prohibited this importer from being saved:'
|
|
@@ -315,8 +315,7 @@ es:
|
|
|
315
315
|
missing_required_hint: Añade esta columna a tu CSV
|
|
316
316
|
missing_required_title: Campos obligatorios faltantes
|
|
317
317
|
missing_rights_desc: Su archivo CSV no incluye la columna "rights_statement". Puede añadirla o seleccionar una "Declaración de derechos predeterminada" en el siguiente paso.
|
|
318
|
-
|
|
319
|
-
multiple_csv_same_level: Se encontraron varios archivos CSV en el mismo nivel dentro de ZIP
|
|
318
|
+
multiple_csv: Hay varios archivos CSV en el nivel menos profundo del ZIP, por lo que no se puede determinar el CSV principal. Mantén exactamente un CSV en ese nivel; cualquier CSV adicional debe estar anidado más profundamente.
|
|
320
319
|
no_csv_in_zip: No se encontraron archivos CSV en ZIP
|
|
321
320
|
no_csv_uploaded: No se cargó ningún archivo de metadatos CSV
|
|
322
321
|
no_files_uploaded: No hay archivos subidos
|
|
@@ -346,6 +345,11 @@ es:
|
|
|
346
345
|
notices_title: Avisos de importación
|
|
347
346
|
unrecognized_desc: 'Estas columnas se ignorarán durante la importación:'
|
|
348
347
|
unrecognized_title: Campos no reconocidos
|
|
348
|
+
unzip:
|
|
349
|
+
errors:
|
|
350
|
+
multiple_csv: Hay varios archivos CSV en el nivel menos profundo del ZIP, por lo que no se puede determinar el CSV principal. Mantén exactamente un CSV en ese nivel; cualquier CSV adicional debe estar anidado más profundamente.
|
|
351
|
+
no_csv: No se encontraron archivos CSV en ZIP
|
|
352
|
+
unsafe_entry: "El ZIP contiene una entrada con una ruta no segura (%{name}). Las entradas no deben usar rutas absolutas ni referencias al directorio padre."
|
|
349
353
|
validations:
|
|
350
354
|
errors_prohibited:
|
|
351
355
|
one: '1 error impidió que se guardara este importador:'
|
|
@@ -315,8 +315,7 @@ fr:
|
|
|
315
315
|
missing_required_hint: Ajoutez cette colonne à votre fichier CSV.
|
|
316
316
|
missing_required_title: Champs obligatoires manquants
|
|
317
317
|
missing_rights_desc: Votre fichier CSV ne contient pas de colonne « droits_statement ». Vous pouvez l'ajouter ou sélectionner une déclaration de droits par défaut à l'étape suivante.
|
|
318
|
-
|
|
319
|
-
multiple_csv_same_level: Plusieurs fichiers CSV trouvés au même niveau dans le fichier ZIP
|
|
318
|
+
multiple_csv: Plusieurs fichiers CSV se trouvent au niveau le moins profond dans le ZIP, ce qui empêche d'identifier le CSV principal. Conservez exactement un fichier CSV à ce niveau ; les CSV supplémentaires doivent être imbriqués plus profondément.
|
|
320
319
|
no_csv_in_zip: Aucun fichier CSV trouvé dans le fichier ZIP
|
|
321
320
|
no_csv_uploaded: Aucun fichier de métadonnées CSV n'a été téléchargé.
|
|
322
321
|
no_files_uploaded: Aucun fichier téléchargé
|
|
@@ -346,6 +345,11 @@ fr:
|
|
|
346
345
|
notices_title: Avis d'importation
|
|
347
346
|
unrecognized_desc: 'Ces colonnes seront ignorées lors de l''importation :'
|
|
348
347
|
unrecognized_title: Champs non reconnus
|
|
348
|
+
unzip:
|
|
349
|
+
errors:
|
|
350
|
+
multiple_csv: Plusieurs fichiers CSV se trouvent au niveau le moins profond dans le ZIP, ce qui empêche d'identifier le CSV principal. Conservez exactement un fichier CSV à ce niveau ; les CSV supplémentaires doivent être imbriqués plus profondément.
|
|
351
|
+
no_csv: Aucun fichier CSV trouvé dans le fichier ZIP
|
|
352
|
+
unsafe_entry: "Le ZIP contient une entrée avec un chemin non sûr (%{name}). Les entrées ne doivent pas utiliser de chemins absolus ni de références au répertoire parent."
|
|
349
353
|
validations:
|
|
350
354
|
errors_prohibited:
|
|
351
355
|
one: 'Une erreur a empêché l''enregistrement de cet importateur :'
|
|
@@ -315,8 +315,7 @@ it:
|
|
|
315
315
|
missing_required_hint: aggiungi questa colonna al tuo CSV
|
|
316
316
|
missing_required_title: Campi obbligatori mancanti
|
|
317
317
|
missing_rights_desc: Il tuo file CSV non include una colonna rights_statement. Puoi aggiungerla al tuo file CSV o selezionare una colonna "Default Rights Statement" nel passaggio successivo.
|
|
318
|
-
|
|
319
|
-
multiple_csv_same_level: Sono stati trovati più file CSV allo stesso livello all'interno dello ZIP
|
|
318
|
+
multiple_csv: Sono stati trovati più file CSV al livello meno profondo all'interno dello ZIP, quindi non è possibile determinare il CSV principale. Mantieni esattamente un CSV a quel livello; eventuali CSV aggiuntivi devono essere annidati più in profondità.
|
|
320
319
|
no_csv_in_zip: Nessun file CSV trovato nello ZIP
|
|
321
320
|
no_csv_uploaded: Nessun file di metadati CSV caricato
|
|
322
321
|
no_files_uploaded: Nessun file caricato
|
|
@@ -346,6 +345,11 @@ it:
|
|
|
346
345
|
notices_title: Avvisi di importazione
|
|
347
346
|
unrecognized_desc: 'Queste colonne verranno ignorate durante l''importazione:'
|
|
348
347
|
unrecognized_title: Campi non riconosciuti
|
|
348
|
+
unzip:
|
|
349
|
+
errors:
|
|
350
|
+
multiple_csv: Sono stati trovati più file CSV al livello meno profondo all'interno dello ZIP, quindi non è possibile determinare il CSV principale. Mantieni esattamente un CSV a quel livello; eventuali CSV aggiuntivi devono essere annidati più in profondità.
|
|
351
|
+
no_csv: Nessun file CSV trovato nello ZIP
|
|
352
|
+
unsafe_entry: "Lo ZIP contiene una voce con un percorso non sicuro (%{name}). Le voci non devono utilizzare percorsi assoluti né riferimenti alla directory superiore."
|
|
349
353
|
validations:
|
|
350
354
|
errors_prohibited:
|
|
351
355
|
one: '1 errore ha impedito il salvataggio di questo importatore:'
|
|
@@ -315,8 +315,7 @@ pt-BR:
|
|
|
315
315
|
missing_required_hint: Adicione esta coluna ao seu arquivo CSV.
|
|
316
316
|
missing_required_title: Campos obrigatórios ausentes
|
|
317
317
|
missing_rights_desc: Seu arquivo CSV não inclui uma coluna `rights_statement`. Você pode adicioná-la ao seu CSV ou selecionar uma Declaração de Direitos Padrão na próxima etapa.
|
|
318
|
-
|
|
319
|
-
multiple_csv_same_level: Vários arquivos CSV encontrados no mesmo nível dentro do arquivo ZIP.
|
|
318
|
+
multiple_csv: Vários arquivos CSV estão no nível menos profundo dentro do ZIP, então não é possível determinar qual é o CSV principal. Mantenha exatamente um CSV nesse nível; CSVs adicionais devem estar aninhados em níveis mais profundos.
|
|
320
319
|
no_csv_in_zip: Nenhum arquivo CSV encontrado no arquivo ZIP.
|
|
321
320
|
no_csv_uploaded: Nenhum arquivo de metadados CSV foi carregado.
|
|
322
321
|
no_files_uploaded: Nenhum arquivo foi enviado.
|
|
@@ -346,6 +345,11 @@ pt-BR:
|
|
|
346
345
|
notices_title: Avisos de importação
|
|
347
346
|
unrecognized_desc: 'Estas colunas serão ignoradas durante a importação:'
|
|
348
347
|
unrecognized_title: Campos não reconhecidos
|
|
348
|
+
unzip:
|
|
349
|
+
errors:
|
|
350
|
+
multiple_csv: Vários arquivos CSV estão no nível menos profundo dentro do ZIP, então não é possível determinar qual é o CSV principal. Mantenha exatamente um CSV nesse nível; CSVs adicionais devem estar aninhados em níveis mais profundos.
|
|
351
|
+
no_csv: Nenhum arquivo CSV encontrado no arquivo ZIP.
|
|
352
|
+
unsafe_entry: "O ZIP contém uma entrada com caminho inseguro (%{name}). As entradas não devem usar caminhos absolutos nem referências ao diretório pai."
|
|
349
353
|
validations:
|
|
350
354
|
errors_prohibited:
|
|
351
355
|
one: '1 erro impediu que este importador fosse salvo:'
|
|
@@ -314,8 +314,7 @@ zh:
|
|
|
314
314
|
missing_required_hint: 将此列添加到您的 CSV 文件中
|
|
315
315
|
missing_required_title: 缺少必填字段
|
|
316
316
|
missing_rights_desc: 您的 CSV 文件不包含 rights_statement 列。您可以在下一步中将其添加到 CSV 文件中,或选择默认的权利声明。
|
|
317
|
-
|
|
318
|
-
multiple_csv_same_level: 在 ZIP 文件中的同一层级发现了多个 CSV 文件
|
|
317
|
+
multiple_csv: ZIP 文件中的同一最浅层级下发现了多个 CSV 文件,无法确定主 CSV。请在该层级仅保留一个 CSV,其他 CSV 必须位于更深的目录中。
|
|
319
318
|
no_csv_in_zip: ZIP 文件中未找到 CSV 文件
|
|
320
319
|
no_csv_uploaded: 未上传 CSV 元数据文件。
|
|
321
320
|
no_files_uploaded: 未上传任何文件。
|
|
@@ -345,6 +344,11 @@ zh:
|
|
|
345
344
|
notices_title: 导入通知
|
|
346
345
|
unrecognized_desc: 导入过程中将忽略以下列:
|
|
347
346
|
unrecognized_title: 未识别字段
|
|
347
|
+
unzip:
|
|
348
|
+
errors:
|
|
349
|
+
multiple_csv: ZIP 文件中的同一最浅层级下发现了多个 CSV 文件,无法确定主 CSV。请在该层级仅保留一个 CSV,其他 CSV 必须位于更深的目录中。
|
|
350
|
+
no_csv: ZIP 文件中未找到 CSV 文件
|
|
351
|
+
unsafe_entry: "ZIP 文件包含路径不安全的条目(%{name})。条目不得使用绝对路径或父目录引用。"
|
|
348
352
|
validations:
|
|
349
353
|
errors_prohibited:
|
|
350
354
|
one: 1 个错误导致此导入程序无法保存:
|
data/lib/bulkrax/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: bulkrax
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 9.4.0
|
|
4
|
+
version: 9.4.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Rob Kaufman
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-21 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rails
|
|
@@ -365,6 +365,7 @@ files:
|
|
|
365
365
|
- app/controllers/concerns/bulkrax/download_behavior.rb
|
|
366
366
|
- app/controllers/concerns/bulkrax/guided_import_demo_scenarios.rb
|
|
367
367
|
- app/controllers/concerns/bulkrax/importer_file_handler.rb
|
|
368
|
+
- app/errors/bulkrax/unzip_error.rb
|
|
368
369
|
- app/factories/bulkrax/object_factory.rb
|
|
369
370
|
- app/factories/bulkrax/object_factory_interface.rb
|
|
370
371
|
- app/factories/bulkrax/valkyrie_object_factory.rb
|