RubyGems - bulkrax - Versions diffs - 9.3.5 → 9.4.1 - Mend

bulkrax 9.3.5 → 9.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95)

checksums.yaml +4 -4
data/README.md +11 -1
data/app/assets/javascripts/bulkrax/application.js +2 -1
data/app/assets/javascripts/bulkrax/bulkrax.js +13 -4
data/app/assets/javascripts/bulkrax/bulkrax_utils.js +96 -0
data/app/assets/javascripts/bulkrax/datatables.js +1 -0
data/app/assets/javascripts/bulkrax/entries.js +17 -10
data/app/assets/javascripts/bulkrax/importers.js.erb +9 -2
data/app/assets/javascripts/bulkrax/importers_stepper.js +2420 -0
data/app/assets/stylesheets/bulkrax/application.css +1 -1
data/app/assets/stylesheets/bulkrax/stepper/_header.scss +83 -0
data/app/assets/stylesheets/bulkrax/stepper/_mixins.scss +26 -0
data/app/assets/stylesheets/bulkrax/stepper/_navigation.scss +103 -0
data/app/assets/stylesheets/bulkrax/stepper/_responsive.scss +46 -0
data/app/assets/stylesheets/bulkrax/stepper/_review.scss +92 -0
data/app/assets/stylesheets/bulkrax/stepper/_settings.scss +106 -0
data/app/assets/stylesheets/bulkrax/stepper/_success.scss +26 -0
data/app/assets/stylesheets/bulkrax/stepper/_summary.scss +171 -0
data/app/assets/stylesheets/bulkrax/stepper/_upload.scss +339 -0
data/app/assets/stylesheets/bulkrax/stepper/_validation.scss +237 -0
data/app/assets/stylesheets/bulkrax/stepper/_variables.scss +46 -0
data/app/assets/stylesheets/bulkrax/stepper.scss +32 -0
data/app/controllers/bulkrax/guided_imports_controller.rb +175 -0
data/app/controllers/bulkrax/importers_controller.rb +28 -31
data/app/controllers/concerns/bulkrax/guided_import_demo_scenarios.rb +201 -0
data/app/controllers/concerns/bulkrax/importer_file_handler.rb +212 -0
data/app/errors/bulkrax/unzip_error.rb +16 -0
data/app/factories/bulkrax/object_factory.rb +3 -2
data/app/factories/bulkrax/valkyrie_object_factory.rb +61 -17
data/app/jobs/bulkrax/importer_job.rb +42 -4
data/app/models/bulkrax/csv_entry.rb +27 -7
data/app/models/bulkrax/entry.rb +4 -0
data/app/models/bulkrax/importer.rb +27 -10
data/app/models/concerns/bulkrax/has_matchers.rb +2 -2
data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +6 -5
data/app/parsers/bulkrax/application_parser.rb +63 -20
data/app/parsers/bulkrax/bagit_parser.rb +12 -0
data/app/parsers/bulkrax/csv_parser.rb +168 -25
data/app/parsers/concerns/bulkrax/csv_parser/csv_template_generation.rb +73 -0
data/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb +133 -0
data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb +282 -0
data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_hierarchy.rb +96 -0
data/app/services/bulkrax/csv_template/column_builder.rb +60 -0
data/app/services/bulkrax/csv_template/column_descriptor.rb +58 -0
data/app/services/bulkrax/csv_template/csv_builder.rb +83 -0
data/app/services/bulkrax/csv_template/explanation_builder.rb +57 -0
data/app/services/bulkrax/csv_template/field_analyzer.rb +56 -0
data/app/services/bulkrax/csv_template/file_path_generator.rb +47 -0
data/app/services/bulkrax/csv_template/file_validator.rb +68 -0
data/app/services/bulkrax/csv_template/mapping_manager.rb +55 -0
data/app/services/bulkrax/csv_template/model_loader.rb +50 -0
data/app/services/bulkrax/csv_template/row_builder.rb +35 -0
data/app/services/bulkrax/csv_template/schema_analyzer.rb +70 -0
data/app/services/bulkrax/csv_template/split_formatter.rb +44 -0
data/app/services/bulkrax/csv_template/value_determiner.rb +68 -0
data/app/services/bulkrax/stepper_response_formatter.rb +347 -0
data/app/services/bulkrax/validation_error_csv_builder.rb +99 -0
data/app/validators/bulkrax/csv_row/child_reference.rb +56 -0
data/app/validators/bulkrax/csv_row/circular_reference.rb +71 -0
data/app/validators/bulkrax/csv_row/controlled_vocabulary.rb +74 -0
data/app/validators/bulkrax/csv_row/duplicate_identifier.rb +63 -0
data/app/validators/bulkrax/csv_row/missing_source_identifier.rb +31 -0
data/app/validators/bulkrax/csv_row/parent_reference.rb +59 -0
data/app/validators/bulkrax/csv_row/required_values.rb +64 -0
data/app/views/bulkrax/guided_imports/new.html.erb +567 -0
data/app/views/bulkrax/importers/index.html.erb +6 -1
data/app/views/bulkrax/importers/new.html.erb +1 -1
data/app/views/bulkrax/importers/show.html.erb +17 -1
data/config/i18n-tasks.yml +195 -0
data/config/locales/bulkrax.de.yml +508 -0
data/config/locales/bulkrax.en.yml +463 -233
data/config/locales/bulkrax.es.yml +508 -0
data/config/locales/bulkrax.fr.yml +508 -0
data/config/locales/bulkrax.it.yml +508 -0
data/config/locales/bulkrax.pt-BR.yml +508 -0
data/config/locales/bulkrax.zh.yml +507 -0
data/config/routes.rb +10 -1
data/lib/bulkrax/data/demo_scenarios.json +2235 -0
data/lib/bulkrax/version.rb +1 -1
data/lib/bulkrax.rb +31 -0
metadata +56 -16
data/app/services/bulkrax/sample_csv_service/column_builder.rb +0 -58
data/app/services/bulkrax/sample_csv_service/column_descriptor.rb +0 -56
data/app/services/bulkrax/sample_csv_service/csv_builder.rb +0 -82
data/app/services/bulkrax/sample_csv_service/explanation_builder.rb +0 -51
data/app/services/bulkrax/sample_csv_service/field_analyzer.rb +0 -54
data/app/services/bulkrax/sample_csv_service/file_path_generator.rb +0 -16
data/app/services/bulkrax/sample_csv_service/mapping_manager.rb +0 -36
data/app/services/bulkrax/sample_csv_service/model_loader.rb +0 -40
data/app/services/bulkrax/sample_csv_service/row_builder.rb +0 -33
data/app/services/bulkrax/sample_csv_service/schema_analyzer.rb +0 -69
data/app/services/bulkrax/sample_csv_service/split_formatter.rb +0 -42
data/app/services/bulkrax/sample_csv_service/value_determiner.rb +0 -67
data/app/services/bulkrax/sample_csv_service.rb +0 -78
/data/{app/services → lib}/wings/custom_queries/find_by_source_identifier.rb +0 -0

data/app/errors/bulkrax/unzip_error.rb ADDED Viewed

@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+module Bulkrax
+  # Raised when a zip cannot be safely or meaningfully extracted during
+  # import. Covered scenarios include:
+  #
+  # - A single upload zip has no CSV at any level.
+  # - A single upload zip has multiple CSVs at its shallowest level
+  #   (primary CSV cannot be determined).
+  # - A zip entry's name would escape the destination directory
+  #   (Zip Slip: absolute paths, `..` traversal, etc.).
+  #
+  # Defined in its own file so Zeitwerk can autoload the constant by name
+  # from any parser or job that raises or rescues it.
+  class UnzipError < StandardError; end
+end

data/app/factories/bulkrax/object_factory.rb CHANGED Viewed

@@ -70,14 +70,15 @@ module Bulkrax
       properties.reject { |prop| Bulkrax.reserved_properties.include?(prop) }
     end
-    def self.field_multi_value?(field:, model:)
+    # admin_set_id is unused here; it is accepted so the signature matches the Valkyrie factory, which uses it to resolve flexible contexts.
+    def self.field_multi_value?(field:, model:, admin_set_id: nil) # rubocop:disable Lint/UnusedMethodArgument
       return false unless field_supported?(field: field, model: model)
       return false unless model.singleton_methods.include?(:properties)
       model&.properties&.[](field)&.[]("multiple")
     end
-    def self.field_supported?(field:, model:)
+    def self.field_supported?(field:, model:, admin_set_id: nil) # rubocop:disable Lint/UnusedMethodArgument
       model.method_defined?(field) && model.properties[field].present?
     end

data/app/factories/bulkrax/valkyrie_object_factory.rb CHANGED Viewed

@@ -120,11 +120,11 @@ module Bulkrax
       save!(resource: resource, user: user)
     end
-    def self.field_multi_value?(field:, model:)
-      return false unless field_supported?(field: field, model: model)
+    def self.field_multi_value?(field:, model:, admin_set_id: nil)
+      return false unless field_supported?(field: field, model: model, admin_set_id: admin_set_id)
       if model.respond_to?(:schema)
-        schema = model.new.singleton_class.schema || model.schema
+        schema = cached_schema_for(klass: model, admin_set_id: admin_set_id)
         dry_type = schema.key(field.to_sym)
         return true if dry_type.respond_to?(:primitive) && dry_type.primitive == Array
@@ -134,9 +134,9 @@ module Bulkrax
       end
     end
-    def self.field_supported?(field:, model:)
+    def self.field_supported?(field:, model:, admin_set_id: nil)
       if model.respond_to?(:schema)
-        schema_properties(model).include?(field)
+        schema_properties(klass: model, admin_set_id: admin_set_id).include?(field)
       else
         # We *might* have a Fedora object, so we need to consider that approach as
         # well.
@@ -272,17 +272,34 @@ module Bulkrax
     # rubocop:enable Metrics/ParameterLists
     ##
-    # Retrieve properties from M3 model
-    # @param klass the model
+    # Retrieve schema property names for a model, respecting admin set contexts
+    # when using flexible metadata. Delegates context resolution to Hyrax so
+    # Bulkrax does not need to know about HYRAX_FLEXIBLE or contexts.
+    #
+    # @param klass [Class] the model class
+    # @param admin_set_id [String, nil] admin set used to resolve contexts
     # @return [Array<String>]
-    def self.schema_properties(klass)
-      @schema_properties_map ||= {}
-      klass_key = klass.name
-      schema = klass.new.singleton_class.schema || klass.schema
-      @schema_properties_map[klass_key] = schema.map { |k| k.name.to_s } unless @schema_properties_map.key?(klass_key)
+    def self.schema_properties(klass:, admin_set_id: nil)
+      cached_schema_for(klass: klass, admin_set_id: admin_set_id).map { |k| k.name.to_s }
+    end
-      @schema_properties_map[klass_key]
+    ##
+    # Returns the schema for a model, memoized per (klass, admin_set_id) pair.
+    # Delegates to +Hyrax.schema_for+ when available so that context-gated
+    # properties are included without Bulkrax knowing about flexibility internals.
+    #
+    # @param klass [Class]
+    # @param admin_set_id [String, nil]
+    # @return [Dry::Types::Hash]
+    def self.cached_schema_for(klass:, admin_set_id: nil)
+      @cached_schema_map ||= {}
+      key = [klass.name, admin_set_id].compact.join('|')
+      @cached_schema_map[key] ||=
+        if admin_set_id.present? && defined?(Hyrax) && Hyrax.respond_to?(:schema_for)
+          Hyrax.schema_for(klass: klass, admin_set_id: admin_set_id)
+        else
+          klass.new.singleton_class.schema || klass.schema
+        end
     end
     def self.ordered_file_sets_for(object)
@@ -457,7 +474,9 @@ module Bulkrax
     # TODO What do we return when the calculated form fails?
     # @raise [StandardError] when there was a failure calling the transaction.
     def perform_transaction_for(object:, attrs:)
-      form = Hyrax::Forms::ResourceForm.for(object).prepopulate!
+      admin_set_id = attrs[:admin_set_id] || attrs['admin_set_id'] ||
+                     attributes[:admin_set_id] || attributes['admin_set_id']
+      form = Hyrax::Forms::ResourceForm.for(resource: object, admin_set_id: admin_set_id).prepopulate!
       # TODO: Handle validations
       form.validate(attrs)
@@ -474,13 +493,15 @@ module Bulkrax
     end
     ##
-    # We accept attributes based on the model schema
+    # We accept attributes based on the model schema. Passes the admin set ID
+    # so that context-restricted properties are included in the permitted list.
     #
     # @return [Array<Symbol, String>]
     def permitted_attributes
       @permitted_attributes ||= (
         base_permitted_attributes + if klass.respond_to?(:schema)
-                                      Bulkrax::ValkyrieObjectFactory.schema_properties(klass)
+                                      admin_set_id = attributes[:admin_set_id] || attributes['admin_set_id']
+                                      Bulkrax::ValkyrieObjectFactory.schema_properties(klass: klass, admin_set_id: admin_set_id)
                                     else
                                       klass.properties.keys.map(&:to_sym)
                                     end
@@ -590,6 +611,29 @@ module Bulkrax
                    .symbolize_keys
       attrs[:title] = [] if attrs[:title].blank?
+      attrs = convert_based_near_to_attributes(attrs)
+      attrs
+    end
+    # Hyrax's ResourceForm strips the plain `based_near` key during validation
+    # (BasedNearFieldBehavior#deserialize calls params.except('based_near')).
+    # Values must be passed as `based_near_attributes` — a numbered hash of
+    # { "0" => { "id" => uri, "_destroy" => "false" } } — so the populator
+    # can set them. Hyrax accepts any valid URI; note that only GeoNames URIs
+    # will resolve to a display label via LocationService.
+    def convert_based_near_to_attributes(attrs)
+      values = Array.wrap(attrs.delete(:based_near)).reject(&:blank?)
+      return attrs if values.empty?
+      invalid = values.reject { |v| v.to_s.match?(::URI::DEFAULT_PARSER.make_regexp) }
+      if invalid.any?
+        raise ::StandardError, "Invalid value(s) for location (based_near): #{invalid.join(', ')}. " \
+                               "Values must be valid URIs (e.g. http://sws.geonames.org/5128581/)."
+      end
+      attrs[:based_near_attributes] = values.each_with_index.to_h do |uri, i|
+        [i.to_s, { "id" => uri.to_s, "_destroy" => "false" }]
+      end
       attrs
     end
   end

data/app/jobs/bulkrax/importer_job.rb CHANGED Viewed

@@ -13,7 +13,7 @@ module Bulkrax
       import(importer, only_updates_since_last_import)
       update_current_run_counters(importer)
       schedule(importer) if importer.schedulable?
-    rescue ::CSV::MalformedCSVError => e
+    rescue ::CSV::MalformedCSVError, Bulkrax::UnzipError => e
       importer.set_status_info(e)
     end
@@ -26,11 +26,49 @@ module Bulkrax
       importer.import_objects
     end
+    # Populates `importer_unzip_path` with the uploaded file(s), leaving
+    # the working directory in the shape each parser expects.
+    #
+    # Dispatch by parser capability rather than class name:
+    # - CsvParser (and subclasses that replicate its shape) implements
+    #   `#unzip_with_primary_csv` and `#unzip_attachments_only`, which
+    #   place the primary CSV at root and attachments under `files/`.
+    # - Other parsers (XML, raw BagIt) inherit the base-class `#unzip`,
+    #   which extracts the zip verbatim.
+    # - The separate attachments-zip flow is CSV-only (guided import is
+    #   the only UI that produces it).
+    #
+    # A retry of this job gets a clean working directory: any prior
+    # extraction state from an earlier attempt is wiped, so nothing runs
+    # against partially-populated state.
     def unzip_imported_file(parser)
-      return unless parser.file? && parser.zip?
+      return unless parser.file?
-      parser.unzip(parser.parser_fields['import_file_path'])
-      parser.remove_spaces_from_filenames
+      reset_unzip_path(parser)
+      import_file_path = parser.parser_fields['import_file_path']
+      attachments_zip_path = parser.parser_fields['attachments_zip_path']
+      if parser.zip?
+        if parser.respond_to?(:unzip_with_primary_csv)
+          parser.unzip_with_primary_csv(import_file_path)
+        else
+          parser.unzip(import_file_path)
+        end
+      elsif parser.respond_to?(:unzip_attachments_only) && parser.zip_file?(attachments_zip_path)
+        parser.copy_file(import_file_path)
+        parser.unzip_attachments_only(attachments_zip_path)
+      else
+        parser.copy_file(import_file_path)
+      end
+      parser.remove_spaces_from_filenames if parser.respond_to?(:remove_spaces_from_filenames)
+    end
+    def reset_unzip_path(parser)
+      path = parser.importer_unzip_path
+      FileUtils.rm_rf(path) if Dir.exist?(path)
+      FileUtils.mkdir_p(path)
     end
     def update_current_run_counters(importer)

data/app/models/bulkrax/csv_entry.rb CHANGED Viewed

@@ -45,7 +45,12 @@ module Bulkrax
         encoding: 'utf-8'
       }.merge(csv_read_data_options)
-      results = CSV.read(path, **options)
+      results = if path.respond_to?(:read)
+                  path.rewind if path.respond_to?(:rewind)
+                  CSV.parse(path.read, **options)
+                else
+                  CSV.read(path, **options)
+                end
       csv_wrapper_class.new(results)
     end
@@ -83,9 +88,10 @@ module Bulkrax
       # model has to be separated so that it doesn't get mistranslated by to_h
       raw_data = data.to_h
       raw_data[:model] = data[:model] if data[:model].present?
-      # If the collection field mapping is not 'collection', add 'collection' - the parser needs it
-      # TODO: change to :parents
-      raw_data[:parents] = raw_data[parent_field(parser).to_sym] if raw_data.keys.include?(parent_field(parser).to_sym) && parent_field(parser) != 'parents'
+      # If the parents/children field mapping uses a custom column name, alias it to the standard key
+      # so downstream code can find it regardless of what the CSV column is named.
+      raw_data[:parents] = raw_data[parser.related_parents_raw_mapping.to_sym] if parser.related_parents_raw_mapping.present? && raw_data.key?(parser.related_parents_raw_mapping.to_sym) && parser.related_parents_raw_mapping != 'parents'
+      raw_data[:children] = raw_data[parser.related_children_raw_mapping.to_sym] if parser.related_children_raw_mapping.present? && raw_data.key?(parser.related_children_raw_mapping.to_sym) && parser.related_children_raw_mapping != 'children'
       return raw_data
     end
@@ -416,18 +422,32 @@ module Bulkrax
       self.collection_ids
     end
-    # If only filename is given, construct the path (/files/my_file)
+    # If only filename is given, construct the path (/files/my_file).
+    # If file contains a path separator (e.g. attachments/cat_scan.jpg), resolve relative to the CSV's directory.
     def path_to_file(file)
-      # return if we already have the full file path
       return file if File.exist?(file)
+      # Relative path: resolve from CSV's directory (allows arbitrary subdirectory names, not just "files")
+      return resolve_relative_file_path(file) if file.include?('/')
+      # Bare filename: use legacy files/ directory for backward compatibility and round-tripping
       path = importerexporter.parser.path_to_files
+      raise "Could not determine path to files directory. Ensure the import package contains a zip or a valid import_file_path." if path.nil?
       f = File.join(path, file)
       return f if File.exist?(f)
-      raise "File #{f} does not exist"
+      raise "File not found: #{f}. Check the file column in your CSV and ensure the file exists in the import package or path_to_files directory."
     end
     private
+    def resolve_relative_file_path(file)
+      base = File.dirname(importerexporter.parser.import_file_path)
+      candidate = File.join(base, file)
+      return candidate if File.exist?(candidate)
+      raise "File not found: #{candidate}. Check the file path in your CSV and ensure the file exists in the import package or directory."
+    end
     def map_file_sets(file_sets)
       # rubocop:disable Rails/Presence
       file_sets.map { |fs| filename(fs).to_s if filename(fs).present? }.compact

data/app/models/bulkrax/entry.rb CHANGED Viewed

@@ -93,6 +93,10 @@ module Bulkrax
       parser.related_parents_parsed_mapping
     end
+    def self.child_field(parser)
+      parser.related_children_parsed_mapping
+    end
     def build
       return if type.nil?
       self.save if self.new_record? # must be saved for statuses

data/app/models/bulkrax/importer.rb CHANGED Viewed

@@ -169,6 +169,30 @@ module Bulkrax
       import_file_path if original_file?
     end
+    # Returns all available original files (CSV and ZIP if present)
+    # @return [Array<Hash>] Array of hashes with :path, :name, and :type keys
+    def original_files
+      files = []
+      if import_file_path && File.exist?(import_file_path)
+        files << {
+          path: import_file_path,
+          name: File.basename(import_file_path),
+          type: :csv
+        }
+      end
+      if parser_fields['attachments_zip_path'] && File.exist?(parser_fields['attachments_zip_path'])
+        files << {
+          path: parser_fields['attachments_zip_path'],
+          name: File.basename(parser_fields['attachments_zip_path']),
+          type: :zip
+        }
+      end
+      files
+    end
     def replace_files
       self.parser_fields['replace_files']
     end
@@ -241,17 +265,10 @@ module Bulkrax
     #   [['Single Metadata File for all works', 'single'], ['Multiple Files, one per Work', 'multi']]
     # end
-    # If the import data is zipped, unzip it to this path
     def importer_unzip_path(mkdir: false)
-      @importer_unzip_path ||= File.join(parser.base_path, "import_#{path_string}")
-      return @importer_unzip_path if Dir.exist?(@importer_unzip_path) || mkdir == true
-      # turns "tmp/imports/tenant/import_1_20250122035229_1" to "tmp/imports/tenant/import_1_20250122035229"
-      base_importer_unzip_path = @importer_unzip_path.split('_')[0...-1].join('_')
-      # If we don't have an existing unzip path, we'll try and find it.
-      # Just in case there are multiple paths, we sort by the number at the end of the path and get the last one
-      @importer_unzip_path = Dir.glob(base_importer_unzip_path + '*').sort_by { |path| path.split(base_importer_unzip_path).last[1..-1].to_i }.last
+      path = File.join(parser.base_path, "import_#{path_string}")
+      FileUtils.mkdir_p(path) if mkdir
+      path
     end
     def errored_entries_csv_path

data/app/models/concerns/bulkrax/has_matchers.rb CHANGED Viewed

@@ -131,7 +131,7 @@ module Bulkrax
       return false if excluded?(field)
       return true if supported_bulkrax_fields.include?(field)
-      Bulkrax.object_factory.field_supported?(field: field, model: factory_class)
+      Bulkrax.object_factory.field_supported?(field: field, model: factory_class, admin_set_id: importerexporter.try(:admin_set_id))
     end
     def supported_bulkrax_fields
@@ -145,7 +145,7 @@ module Bulkrax
       return true if fields_that_are_always_singular.include?(field.to_s)
       return false if fields_that_are_always_multiple.include?(field.to_s)
-      Bulkrax.object_factory.field_multi_value?(field: field, model: factory_class)
+      Bulkrax.object_factory.field_multi_value?(field: field, model: factory_class, admin_set_id: importerexporter.try(:admin_set_id))
     end
     def fields_that_are_always_multiple

data/app/models/concerns/bulkrax/importer_exporter_behavior.rb CHANGED Viewed

@@ -51,15 +51,16 @@ module Bulkrax
     # Is this a zip file?
     def zip?
       filename = parser_fields&.[]('import_file_path')
-      return false unless filename
-      return false unless File.file?(filename)
+      return false unless filename && File.file?(filename)
+      zip_file?(filename)
+    end
-      returning_value = false
+    def zip_file?(filename)
+      return false unless filename && File.file?(filename)
       File.open(filename) do |file|
         mime_type = ::Marcel::MimeType.for(name: file)
-        returning_value = mime_type.include?('application/zip') || mime_type.include?('application/gzip')
+        mime_type.include?('application/zip') || mime_type.include?('application/gzip')
       end
-      returning_value
     end
   end
 end

data/app/parsers/bulkrax/application_parser.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module Bulkrax
              :seen, :increment_counters, :parser_fields, :user, :keys_without_numbers,
              :key_without_numbers, :status, :set_status_info, :status_info, :status_at,
              :exporter_export_path, :exporter_export_zip_path, :importer_unzip_path, :validate_only,
-             :zip?, :file?, :remove_and_rerun,
+             :zip?, :file?, :remove_and_rerun, :zip_file?,
              to: :importerexporter
     # @todo Convert to `class_attribute :parser_fields, default: {}`
@@ -430,18 +430,77 @@ module Bulkrax
       zip
     end
+    # Extracts a zip verbatim into {#importer_unzip_path}, preserving the zip's
+    # internal structure. Filters macOS junk (`__MACOSX/`, `.DS_Store`, `._*`).
+    # Parser subclasses that need to interpret the zip's structure (e.g.
+    # {Bulkrax::CsvParser#unzip_with_primary_csv}) should call a more specific
+    # method rather than this one.
     def unzip(file_to_unzip)
       return untar(file_to_unzip) if file_to_unzip.end_with?('.tar.gz')
+      dest_dir = importer_unzip_path(mkdir: true)
       Zip::File.open(file_to_unzip) do |zip_file|
         zip_file.each do |entry|
-          entry_path = File.join(importer_unzip_path(mkdir: true), entry.name)
-          FileUtils.mkdir_p(File.dirname(entry_path))
-          zip_file.extract(entry, entry_path) unless File.exist?(entry_path)
+          next unless entry.file?
+          next if macos_junk_entry?(entry.name)
+          reject_unsafe_entry!(entry.name)
+          dest_path = safe_extract_path(dest_dir, entry.name)
+          FileUtils.mkdir_p(File.dirname(dest_path))
+          next if File.exist?(dest_path)
+          extract_zip_entry(zip_file, entry, dest_dir, entry.name, dest_path)
         end
       end
     end
+    # rubyzip 2.x: extract(entry, absolute_dest_path)
+    # rubyzip 3.x: extract(entry, relative_name, destination_directory: dir)
+    #
+    # Callers are responsible for passing a `dest_path` produced by
+    # {#safe_extract_path} so the write can't escape `dest_dir`.
+    def extract_zip_entry(zip_file, entry, dest_dir, relative_name, dest_path)
+      if zip_file.method(:extract).arity == 2
+        zip_file.extract(entry, dest_path)
+      else
+        zip_file.extract(entry, relative_name, destination_directory: dest_dir)
+      end
+    end
+    def macos_junk_entry?(name)
+      name.start_with?('__MACOSX/') || name.split('/').any? { |part| part == '.DS_Store' || part.start_with?('._') }
+    end
+    # Zip Slip preflight — reject entries whose names are obviously unsafe
+    # (absolute paths, `..` segments) before we touch the filesystem.
+    # {#safe_extract_path} is the final line of defense; this check just
+    # fails fast with a clear message.
+    #
+    # @raise [Bulkrax::UnzipError] if the entry name is unsafe
+    def reject_unsafe_entry!(name)
+      return unless name.start_with?('/') || name.split('/').include?('..')
+      raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: name)
+    end
+    # Zip Slip chokepoint. Resolves `relative_dest` against `dest_dir` and
+    # returns the absolute destination path — but only if it stays inside
+    # `dest_dir`. Callers must use this value rather than building their
+    # own path with `File.join`, so the path returned is always safe by
+    # construction.
+    #
+    # @return [String] absolute destination path, validated to be inside `dest_dir`
+    # @raise  [Bulkrax::UnzipError] if `relative_dest` escapes `dest_dir`
+    def safe_extract_path(dest_dir, relative_dest)
+      expanded_dest_dir = File.expand_path(dest_dir)
+      dest_path = File.expand_path(relative_dest.to_s, expanded_dest_dir)
+      return dest_path if dest_path == expanded_dest_dir
+      return dest_path if dest_path.start_with?("#{expanded_dest_dir}#{File::SEPARATOR}")
+      raise Bulkrax::UnzipError, I18n.t('bulkrax.importer.unzip.errors.unsafe_entry', name: relative_dest)
+    end
+    def copy_file(file_to_copy)
+      destination = File.join(importer_unzip_path(mkdir: true), File.basename(file_to_copy))
+      FileUtils.cp(file_to_copy, destination)
+    end
     def untar(file_to_untar)
       Dir.mkdir(importer_unzip_path(mkdir: true)) unless File.directory?(importer_unzip_path(mkdir: true))
       command = "tar -xzf #{Shellwords.escape(file_to_untar)} -C #{Shellwords.escape(importer_unzip_path)}"
@@ -449,21 +508,6 @@ module Bulkrax
       raise "Failed to extract #{file_to_untar}" unless result
     end
-    # File names referenced in CSVs have spaces replaced with underscores
-    # @see Bulkrax::CsvParser#file_paths
-    def remove_spaces_from_filenames
-      files = Dir.glob(File.join(importer_unzip_path, 'files', '*'))
-      files_with_spaces = files.select { |f| f.split('/').last.match?(' ') }
-      return if files_with_spaces.blank?
-      files_with_spaces.map! { |path| Pathname.new(path) }
-      files_with_spaces.each do |path|
-        filename = path.basename
-        filename_without_spaces = filename.to_s.tr(' ', '_')
-        path.rename(File.join(path.dirname, filename_without_spaces))
-      end
-    end
     def zip
       FileUtils.mkdir_p(exporter_export_zip_path)
@@ -489,7 +533,6 @@ module Bulkrax
     # @return [String]
     def real_import_file_path
-      return importer_unzip_path if file? && zip?
       parser_fields['import_file_path']
     end
   end

data/app/parsers/bulkrax/bagit_parser.rb CHANGED Viewed

@@ -25,6 +25,18 @@ unless ENV.fetch('BULKRAX_NO_BAGIT', 'false').to_s == 'true'
         @path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
       end
+      # BagIt archives are not CSV imports: they don't contain a primary
+      # CSV at the shallowest level, and their structure (bagit.txt + data/
+      # + manifests) must be preserved verbatim. Override both CSV-flavored
+      # unzip entry points to use the base-class verbatim extraction.
+      def unzip_with_primary_csv(file_to_unzip)
+        unzip(file_to_unzip)
+      end
+      def unzip_attachments_only(file_to_unzip)
+        unzip(file_to_unzip)
+      end
       # Take a random sample of 10 metadata_paths and work out the import fields from that
       def import_fields
         raise StandardError, 'No metadata files were found' if metadata_paths.blank?