RubyGems - bulkrax - Versions diffs - 9.0.2 → 9.2.0 - Mend

bulkrax 9.0.2 → 9.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +4 -4
data/README.md +26 -0
data/app/assets/javascripts/bulkrax/datatables.js +12 -0
data/app/assets/javascripts/bulkrax/importers.js.erb +4 -1
data/app/factories/bulkrax/object_factory.rb +36 -2
data/app/factories/bulkrax/object_factory_interface.rb +26 -0
data/app/factories/bulkrax/valkyrie_object_factory.rb +109 -27
data/app/jobs/bulkrax/create_relationships_job.rb +123 -76
data/app/jobs/bulkrax/delete_job.rb +11 -0
data/app/jobs/bulkrax/importer_job.rb +1 -0
data/app/matchers/bulkrax/application_matcher.rb +2 -1
data/app/models/bulkrax/csv_entry.rb +41 -10
data/app/models/bulkrax/importer.rb +9 -1
data/app/models/bulkrax/status.rb +1 -1
data/app/models/concerns/bulkrax/export_behavior.rb +28 -15
data/app/models/concerns/bulkrax/file_set_entry_behavior.rb +13 -4
data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +1 -1
data/app/parsers/bulkrax/application_parser.rb +22 -4
data/app/parsers/bulkrax/csv_parser.rb +36 -6
data/app/parsers/bulkrax/oai_dc_parser.rb +0 -2
data/app/parsers/bulkrax/xml_parser.rb +1 -1
data/app/services/bulkrax/factory_class_finder.rb +56 -15
data/app/services/hyrax/custom_queries/find_by_source_identifier.rb +6 -11
data/app/services/wings/custom_queries/find_by_source_identifier.rb +15 -6
data/app/views/bulkrax/entries/show.html.erb +15 -9
data/app/views/bulkrax/importers/_bagit_fields.html.erb +1 -1
data/app/views/bulkrax/importers/_csv_fields.html.erb +1 -1
data/app/views/bulkrax/importers/_oai_fields.html.erb +1 -1
data/app/views/bulkrax/importers/_xml_fields.html.erb +1 -1
data/app/views/bulkrax/importers/show.html.erb +4 -4
data/app/views/bulkrax/shared/_entries_tab.html.erb +1 -1
data/config/locales/bulkrax.en.yml +5 -3
data/lib/bulkrax/engine.rb +1 -1
data/lib/bulkrax/version.rb +1 -1
data/lib/bulkrax.rb +6 -11
data/lib/generators/bulkrax/templates/bin/importer +1 -5
metadata +8 -3
data/app/factories/bulkrax/valkyrize-hyku.code-workspace +0 -19

data/app/jobs/bulkrax/create_relationships_job.rb CHANGED Viewed

@@ -44,7 +44,7 @@ module Bulkrax
     queue_as Bulkrax.config.ingest_queue_name
-    attr_accessor :user, :importer_run, :errors
+    attr_accessor :user, :importer_run, :errors, :importer_run_id, :ability, :number_of_successes, :number_of_failures
     ##
     # @param parent_identifier [String] Work/Collection ID or Bulkrax::Entry source_identifiers
     # @param importer_run [Bulkrax::ImporterRun] current importer run (needed to properly update counters)
@@ -57,72 +57,52 @@ module Bulkrax
     #
     # rubocop:disable Metrics/MethodLength
     def perform(parent_identifier:, importer_run_id: nil, run_user: nil, failure_count: 0) # rubocop:disable Metrics/AbcSize
-      importer_run = Bulkrax::ImporterRun.find(importer_run_id) if importer_run_id
-      user = run_user || importer_run&.user
-      ability = Ability.new(user)
-      parent_entry, parent_record = find_record(parent_identifier, importer_run_id)
-      number_of_successes = 0
-      number_of_failures = 0
-      errors = []
+      @importer_run_id = importer_run_id
+      @importer_run = Bulkrax::ImporterRun.find(@importer_run_id) if @importer_run_id
+      @user = run_user || importer_run&.user
+      @ability = Ability.new(@user)
+      @number_of_successes = 0
+      @number_of_failures = 0
+      @errors = []
       @parent_record_members_added = false
-      @child_members_added = []
+      parent_entry, parent_record = find_record(parent_identifier, @importer_run_id)
       if parent_record
-        conditionally_acquire_lock_for(parent_record.id) do
-          ActiveRecord::Base.uncached do
-            Bulkrax::PendingRelationship.where(parent_id: parent_identifier)
-                                        .ordered.find_each do |rel|
-              process(relationship: rel, importer_run_id: importer_run_id, parent_record: parent_record, ability: ability)
-              number_of_successes += 1
-              @parent_record_members_added = true
-            rescue => e
-              number_of_failures += 1
-              rel.set_status_info(e, importer_run)
-              errors << e
-            end
-          end
-          # save record if members were added
-          if @parent_record_members_added
-            Bulkrax.object_factory.save!(resource: parent_record, user: user)
-            Bulkrax.object_factory.publish(event: 'object.membership.updated', object: parent_record)
-            Bulkrax.object_factory.update_index(resources: @child_members_added)
-          end
+        # Works and collections are different breeds of animals:
+        # - works know both their children (file_sets and child works) in member_ids
+        # - works and collections know their parents (collections) in member_of_collection_ids
+        # We need to handle the two differently by locking the records appropriately to avoid race condition errors.
+        if parent_record.is_a?(Bulkrax.collection_model_class)
+          process_parent_as_collection(parent_record: parent_record, parent_identifier: parent_identifier)
+        else
+          process_parent_as_work(parent_record: parent_record, parent_identifier: parent_identifier)
         end
       else
-        # In moving the check of the parent record "up" we've exposed a hidden reporting foible.
-        # Namely we were reporting one error per child record when the parent record was itself
-        # unavailable.
-        #
-        # We have chosen not to duplicate that "number of errors" as it does not seem like the
-        # correct pattern for reporting a singular error (the previous pattern being one error per
-        # child who's parent is not yet created).
-        number_of_failures = 1
-        errors = ["Parent record not yet available for creating relationships with children records."]
+        @number_of_failures = 1
+        @errors = ["Parent record #{parent_identifier} not yet available for creating relationships with children records."]
       end
-      if errors.present?
+      if @errors.present?
         # rubocop:disable Rails/SkipsModelValidations
-        ImporterRun.update_counters(importer_run_id, failed_relationships: number_of_failures)
+        ImporterRun.update_counters(@importer_run_id, failed_relationships: @number_of_failures)
         # rubocop:enable Rails/SkipsModelValidations
-        parent_entry&.set_status_info(errors.last, importer_run)
+        parent_entry&.set_status_info(@errors.last, importer_run)
         failure_count += 1
         if failure_count < max_failure_count
           reschedule(
             parent_identifier: parent_identifier,
-            importer_run_id: importer_run_id,
-            run_user: run_user,
+            importer_run_id: @importer_run_id,
+            run_user: @user,
             failure_count: failure_count
           )
         end
-        return errors # stop current job from continuing to run after rescheduling
+        return @errors # stop current job from continuing to run after rescheduling
       else
         # rubocop:disable Rails/SkipsModelValidations
-        ImporterRun.update_counters(importer_run_id, processed_relationships: number_of_successes)
+        ImporterRun.update_counters(@importer_run_id, processed_relationships: @number_of_successes)
         # rubocop:enable Rails/SkipsModelValidations
       end
     end
@@ -132,6 +112,8 @@ module Bulkrax
     ##
     # We can use Hyrax's lock manager when we have one available.
+    # However it's not certain that this is actually working, so to be
+    # as safe as possible, we will reload resources before we update.
     if defined?(::Hyrax)
       include Hyrax::Lockable
@@ -151,46 +133,111 @@ module Bulkrax
       alias conditionally_acquire_lock_for acquire_lock_for
     end
-    def process(relationship:, importer_run_id:, parent_record:, ability:)
-      raise "#{relationship} needs a child to create relationship" if relationship.child_id.nil?
-      raise "#{relationship} needs a parent to create relationship" if relationship.parent_id.nil?
-      _child_entry, child_record = find_record(relationship.child_id, importer_run_id)
-      raise "#{relationship} could not find child record" unless child_record
-      raise "Cannot add child collection (ID=#{relationship.child_id}) to parent work (ID=#{relationship.parent_id})" if child_record.collection? && parent_record.work?
+    # When the parent is a collection, we save the relationship on each child.
+    # The parent does not need to be saved, as the relationship is stored on the child.
+    # but we do reindex the parent after all the children are added.
+    def process_parent_as_collection(parent_record:, parent_identifier:)
+      ActiveRecord::Base.uncached do
+        Bulkrax::PendingRelationship.where(parent_id: parent_identifier, importer_run_id: @importer_run_id)
+                                    .ordered.find_each do |rel|
+          raise "#{rel} needs a child to create relationship" if rel.child_id.nil?
+          raise "#{rel} needs a parent to create relationship" if rel.parent_id.nil?
+          add_to_collection(relationship: rel, parent_record: parent_record, ability: ability)
+          @number_of_successes += 1
+          @parent_record_members_added = true
+        rescue => e
+          rel.update(status_message: e.message)
+          @number_of_failures += 1
+          @errors << e
+        end
+      end
-      ability.authorize!(:edit, child_record)
+      # if collection members were added, we reindex the collection
+      # The collection members have already saved the relationships
+      # To index the parent, we want to make sure we have the latest version of the parent,
+      # because another job may have updated it in the meantime.
+      return unless @parent_record_members_added
+      reloaded_parent = Bulkrax.object_factory.find(parent_record.id)
+      Bulkrax.object_factory.update_index(resources: [reloaded_parent])
+      Bulkrax.object_factory.publish(event: 'object.membership.updated', object: reloaded_parent, user: @user)
+    end
-      # We could do this outside of the loop, but that could lead to odd counter failures.
-      ability.authorize!(:edit, parent_record)
+    # When the parent is a work, we save the relationship on the parent.
+    # We prefer to save all of the member relationships and then save the parent once. Concurrent
+    # jobs may be trying to save the parent at the same time, so we need to lock the parent
+    # record while we are adding the children to it.
+    # However the locking appears to not be working so as a workaround we will save each member as we go,
+    # but only index the parent once at the end.
+    def process_parent_as_work(parent_record:, parent_identifier:)
+      conditionally_acquire_lock_for(parent_record.id.to_s) do
+        ActiveRecord::Base.uncached do
+          Bulkrax::PendingRelationship.where(parent_id: parent_identifier, importer_run_id: @importer_run_id)
+                                      .ordered.find_each do |rel|
+            raise "#{rel} needs a child to create relationship" if rel.child_id.nil?
+            raise "#{rel} needs a parent to create relationship" if rel.parent_id.nil?
+            add_to_work(relationship: rel, parent_record: parent_record, ability: ability)
+            self.number_of_successes += 1
+            @parent_record_members_added = true
+          rescue => e
+            rel.update(status_message: e.message)
+            @number_of_failures += 1
+            @errors << e
+          end
+        end
-      if parent_record.is_a?(Bulkrax.collection_model_class)
-        add_to_collection(child_record, parent_record)
-      else
-        add_to_work(child_record, parent_record)
+        # reload and reindex the parent, and publish the membership event, if members were added
+        if @parent_record_members_added
+          reloaded_parent = Bulkrax.object_factory.find(parent_record.id)
+          Bulkrax.object_factory.update_index(resources: [reloaded_parent])
+          Bulkrax.object_factory.publish(event: 'object.membership.updated', object: reloaded_parent, user: @user)
+        end
       end
-      Bulkrax.object_factory.update_index_for_file_sets_of(resource: child_record) if update_child_records_works_file_sets?
-      relationship.destroy
     end
-    def add_to_collection(child_record, parent_record)
-      Bulkrax.object_factory.add_resource_to_collection(
-        collection: parent_record,
-        resource: child_record,
-        user: user
-      )
+    # NOTE: the child changes are saved in the object factory.
+    def add_to_collection(relationship:, parent_record:, ability:)
+      ActiveRecord::Base.uncached do
+        _child_entry, child_record = find_record(relationship.child_id, @importer_run_id)
+        raise "#{relationship} could not find child record" unless child_record
+        raise "Cannot add child collection (ID=#{relationship.child_id}) to parent work (ID=#{relationship.parent_id})" if child_record.collection? && parent_record.work?
+        ability.authorize!(:edit, child_record)
+        # We could do this outside of the loop, but that could lead to odd counter failures.
+        ability.authorize!(:edit, parent_record)
+        # It is important to lock the child records as they are the ones being saved.
+        # However, locking doesn't seem to be working so we will reload the child record before saving.
+        # This is a workaround for the fact that the lock manager doesn't seem to be working.
+        conditionally_acquire_lock_for(child_record.id.to_s) do
+          Bulkrax.object_factory.add_resource_to_collection(
+            collection: parent_record,
+            resource: child_record,
+            user: @user
+          )
+        end
+        relationship.destroy
+      end
     end
-    def add_to_work(child_record, parent_record)
-      # NOTE: The .add_child_to_parent_work should not persist changes to the
-      #       child nor parent.  We'll do that elsewhere in this loop.
-      Bulkrax.object_factory.add_child_to_parent_work(
+    # NOTE: we only update the parent's member_ids and prefer to not save the parent until all children are added.
+    # However, the locking appears to not be working, so as a workaround we will save each member as we go.
+    # This is a workaround for the fact that the lock manager doesn't seem to be working.
+    # To avoid having to reload the parent, we return the updated parent to the calling method.
+    def add_to_work(relationship:, parent_record:, ability:)
+      _child_entry, child_record = find_record(relationship.child_id, @importer_run_id)
+      raise "#{relationship} could not find child record" unless child_record
+      raise "Cannot add child collection (ID=#{relationship.child_id}) to parent work (ID=#{relationship.parent_id})" if child_record.collection? && parent_record.work?
+      ability.authorize!(:edit, child_record)
+      # We could do this outside of the loop, but that could lead to odd counter failures.
+      ability.authorize!(:edit, parent_record)
+      updated_parent = Bulkrax.object_factory.add_child_to_parent_work(
         parent: parent_record,
         child: child_record
       )
+      # default is false for this... do not typically need to index file sets of child records
+      Bulkrax.object_factory.update_index_for_file_sets_of(resource: child_record) if update_child_records_works_file_sets?
+      relationship.destroy
+      updated_parent
     end
     def reschedule(**kargs)

data/app/jobs/bulkrax/delete_job.rb CHANGED Viewed

@@ -6,6 +6,17 @@ module Bulkrax
     def perform(entry, importer_run)
       user = importer_run.importer.user
+      # When we delete, we don't go through the build process.
+      # However, we need the identifier to be set for the entry.
+      # This enables us to delete based on the ID, not just the source_identifier.
+      if entry.respond_to?(:build_metadata_for_delete) &&
+         entry.parsed_metadata.nil? &&
+         entry.raw_metadata.present?
+        entry.build_metadata_for_delete
+        entry.save!
+      end
       entry.factory.delete(user)
       # rubocop:disable Rails/SkipsModelValidations

data/app/jobs/bulkrax/importer_job.rb CHANGED Viewed

@@ -30,6 +30,7 @@ module Bulkrax
       return unless parser.file? && parser.zip?
       parser.unzip(parser.parser_fields['import_file_path'])
+      parser.remove_spaces_from_filenames
     end
     def update_current_run_counters(importer)

data/app/matchers/bulkrax/application_matcher.rb CHANGED Viewed

@@ -16,8 +16,9 @@ module Bulkrax
     def result(_parser, content)
       return nil if self.excluded == true || Bulkrax.reserved_properties.include?(self.to)
+      # rubocop:disable Style/RedundantParentheses
       return nil if self.if && (!self.if.is_a?(Array) && self.if.length != 2)
+      # rubocop:enable Style/RedundantParentheses
       if self.if
         return unless content.send(self.if[0], Regexp.new(self.if[1]))
       end

data/app/models/bulkrax/csv_entry.rb CHANGED Viewed

@@ -5,6 +5,23 @@ module Bulkrax
   # We do too much in these entry classes. We need to extract the common logic from the various
   # entry models into a module that can be shared between them.
   class CsvEntry < Entry # rubocop:disable Metrics/ClassLength
+    class CsvPathError < StandardError
+      def initialize(message)
+        super(message)
+      end
+    end
+    class RecordNotFound < StandardError
+      def initialize(message)
+        super(message)
+      end
+    end
+    class MissingMetadata < StandardError
+      def initialize(message)
+        super(message)
+      end
+    end
     serialize :raw_metadata, Bulkrax::NormalizedJson
     def self.fields_from_data(data)
@@ -16,7 +33,7 @@ module Bulkrax
     # there's a risk that this reads the whole file into memory and could cause a memory leak
     # we strip any special characters out of the headers. looking at you Excel
     def self.read_data(path)
-      raise StandardError, 'CSV path empty' if path.blank?
+      raise CsvPathError, 'CSV path empty' if path.blank?
       options = {
         headers: true,
         header_converters: ->(h) { h.to_s.gsub(/[^\w\d\. -]+/, '').strip.to_sym },
@@ -85,10 +102,18 @@ module Bulkrax
       self.parsed_metadata
     end
+    # limited metadata is needed for delete jobs
+    def build_metadata_for_delete
+      self.parsed_metadata = {}
+      establish_factory_class
+      add_ingested_metadata
+      self.parsed_metadata
+    end
     def validate_record
-      raise StandardError, 'Record not found' if record.nil?
+      raise RecordNotFound, 'Record not found' if record.nil?
       unless importerexporter.parser.required_elements?(record)
-        raise StandardError, "Missing required elements, missing element(s) are: "\
+        raise MissingMetadata, "Missing required elements, missing element(s) are: "\
 "#{importerexporter.parser.missing_elements(record).join(', ')}"
       end
     end
@@ -160,7 +185,7 @@ module Bulkrax
       source_id = source_id.to_a if source_id.is_a?(ActiveTriples::Relation)
       source_id = Array.wrap(source_id).first
       self.parsed_metadata[source_identifier] = source_id
-      model_name = hyrax_record.respond_to?(:to_rdf_representation) ? hyrax_record.to_rdf_representation : hyrax_record.has_model.first
+      model_name = Bulkrax.object_factory.model_name(resource: hyrax_record)
       self.parsed_metadata[key_for_export('model')] = model_name
     end
@@ -179,9 +204,13 @@ module Bulkrax
     def build_relationship_metadata
       # Includes all relationship methods for all exportable record types (works, Collections, FileSets)
+      # @TODO: this logic assumes that the relationships are all available via a method that can be called
+      #        on the object. With Valkyrie, this is only true for Hyrax-based models which include the
+      #        ArResource module. We need to consider reworking this logic into an object factory method
+      #        that can handle different types of models.
       relationship_methods = {
-        related_parents_parsed_mapping => %i[member_of_collection_ids member_of_work_ids in_work_ids],
-        related_children_parsed_mapping => %i[member_collection_ids member_work_ids file_set_ids]
+        related_parents_parsed_mapping => %i[member_of_collection_ids member_of_work_ids in_work_ids parent],
+        related_children_parsed_mapping => %i[member_collection_ids member_work_ids file_set_ids member_ids]
       }
       relationship_methods.each do |relationship_key, methods|
@@ -189,7 +218,9 @@ module Bulkrax
         values = []
         methods.each do |m|
-          values << hyrax_record.public_send(m) if hyrax_record.respond_to?(m)
+          value = hyrax_record.public_send(m) if hyrax_record.respond_to?(m)
+          value_id = value.try(:id)&.to_s || value # get the id if it's an object
+          values << value_id if value_id.present?
         end
         values = values.flatten.uniq
         next if values.blank?
@@ -316,11 +347,11 @@ module Bulkrax
     def build_thumbnail_files
       return unless importerexporter.include_thumbnails
+      thumbnail = Bulkrax.object_factory.thumbnail_for(resource: hyrax_record)
+      return unless thumbnail
+      filenames = map_file_sets(Array.wrap(thumbnail))
       thumbnail_mapping = 'thumbnail_file'
-      file_sets = Array.wrap(hyrax_record.thumbnail)
-      filenames = map_file_sets(file_sets)
       handle_join_on_export(thumbnail_mapping, filenames, false)
     end

data/app/models/bulkrax/importer.rb CHANGED Viewed

@@ -237,8 +237,16 @@ module Bulkrax
     # end
     # If the import data is zipped, unzip it to this path
-    def importer_unzip_path
+    def importer_unzip_path(mkdir: false)
       @importer_unzip_path ||= File.join(parser.base_path, "import_#{path_string}")
+      return @importer_unzip_path if Dir.exist?(@importer_unzip_path) || mkdir == true
+      # turns "tmp/imports/tenant/import_1_20250122035229_1" to "tmp/imports/tenant/import_1_20250122035229"
+      base_importer_unzip_path = @importer_unzip_path.split('_')[0...-1].join('_')
+      # If we don't have an existing unzip path, we'll try and find it.
+      # Just in case there are multiple paths, we sort by the number at the end of the path and get the last one
+      @importer_unzip_path = Dir.glob(base_importer_unzip_path + '*').sort_by { |path| path.split(base_importer_unzip_path).last[1..-1].to_i }.last
     end
     def errored_entries_csv_path

data/app/models/bulkrax/status.rb CHANGED Viewed

@@ -23,7 +23,7 @@ module Bulkrax
     end
     def latest?
-      # TODO: remove if statment when we stop supporting Hyrax < 4
+      # TODO: remove if statement when we stop supporting Hyrax < 4
       self.id == if Gem::Version.new(Rails::VERSION::STRING) >= Gem::Version.new('6.0.0')
                    self.class.where(statusable_id: self.statusable_id, statusable_type: self.statusable_type).order('id desc').pick(:id)
                  else

data/app/models/concerns/bulkrax/export_behavior.rb CHANGED Viewed

@@ -26,25 +26,38 @@ module Bulkrax
     # Prepend the file_set id to ensure a unique filename and also one that is not longer than 255 characters
     def filename(file_set)
-      return if file_set.original_file.blank?
-      if file_set.original_file.respond_to?(:original_filename) # valkyrie
-        fn = file_set.original_file.original_filename
-        mime = ::Marcel::MimeType.for(file_set.original_file.file.io)
-      else # original non valkyrie version
-        fn = file_set.original_file.file_name.first
-        mime = ::Marcel::MimeType.for(declared_type: file_set.original_file.mime_type)
-      end
-      ext_mime = ::Marcel::MimeType.for(name: fn)
+      # return if there are no files on the fileset
+      return if Bulkrax.object_factory.original_file(fileset: file_set).blank?
+      fn = Bulkrax.object_factory.filename_for(fileset: file_set)
+      file = Bulkrax.object_factory.original_file(fileset: file_set)
+      ext = file_extension(file: file, filename: fn)
+      # Prepend the file_set id to ensure a unique filename
+      filename = File.basename(fn, ".*")
+      # Skip modification if file already has ID or we're in metadata-only mode
       if fn.include?(file_set.id) || importerexporter.metadata_only?
-        filename = "#{fn}.#{mime.to_sym}"
-        filename = fn if mime.to_s == ext_mime.to_s
+        # keep filename as is
       else
-        filename = "#{file_set.id}_#{fn}.#{mime.to_sym}"
-        filename = "#{file_set.id}_#{fn}" if mime.to_s == ext_mime.to_s
+        filename = "#{file_set.id}_#{filename}"
       end
-      # Remove extention truncate and reattach
-      ext = File.extname(filename)
+      filename = ext.present? ? "#{filename}.#{ext}" : fn
+      # Remove extension, truncate and reattach
       "#{File.basename(filename, ext)[0...(220 - ext.length)]}#{ext}"
     end
+    ##
+    # Generate the appropriate file extension based on the mime type of the file
+    # @return [String] the file extension for the given file
+    def file_extension(file:, filename:)
+      declared_mime = ::Marcel::MimeType.for(declared_type: file.mime_type)
+      # validate the declared mime type
+      declared_mime = ::Marcel::MimeType.for(name: filename) if declared_mime.nil? || declared_mime == "application/octet-stream"
+      # convert the mime type to a file extension
+      Mime::Type.lookup(declared_mime).symbol.to_s
+    rescue Mime::Type::InvalidMimeType
+      nil
+    end
   end
 end

data/app/models/concerns/bulkrax/file_set_entry_behavior.rb CHANGED Viewed

@@ -2,6 +2,15 @@
 module Bulkrax
   module FileSetEntryBehavior
+    class FileNameError < StandardError
+    end
+    class OrphanFileSetError < StandardError
+    end
+    class FilePathError < StandardError
+    end
     extend ActiveSupport::Concern
     included do
@@ -21,11 +30,11 @@ module Bulkrax
         path_to_file = parser.path_to_files(filename: filename)
-        parsed_metadata['file'][i] = path_to_file
+        parsed_metadata['file'][i] = path_to_file if path_to_file.present?
       end
       parsed_metadata['file'].delete('')
-      raise ::StandardError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?
+      raise FilePathError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?
       parsed_metadata['file']
     end
@@ -33,13 +42,13 @@ module Bulkrax
     def validate_presence_of_filename!
       return if parsed_metadata&.[](file_reference)&.map(&:present?)&.any?
-      raise StandardError, 'File set must have a filename'
+      raise FileNameError, 'File set must have a filename'
     end
     def validate_presence_of_parent!
       return if parsed_metadata[related_parents_parsed_mapping]&.map(&:present?)&.any?
-      raise StandardError, 'File set must be related to at least one work'
+      raise OrphanFileSetError, 'File set must be related to at least one work'
     end
     def parent_jobs

data/app/models/concerns/bulkrax/importer_exporter_behavior.rb CHANGED Viewed

@@ -56,7 +56,7 @@ module Bulkrax
       returning_value = false
       File.open(filename) do |file|
-        mime_type = ::Marcel::MimeType.for(file)
+        mime_type = ::Marcel::MimeType.for(name: file)
         returning_value = mime_type.include?('application/zip') || mime_type.include?('application/gzip')
       end
       returning_value

data/app/parsers/bulkrax/application_parser.rb CHANGED Viewed

@@ -209,8 +209,11 @@ module Bulkrax
     def rebuild_entries(types_array = nil)
       index = 0
       (types_array || %w[collection work file_set relationship]).each do |type|
-        # works are not gurneteed to have Work in the type
+        # works are not guaranteed to have Work in the type
+        if type.eql?('relationship')
+          ScheduleRelationshipsJob.set(wait: 5.minutes).perform_later(importer_id: importerexporter.id)
+          next
+        end
         importer.entries.where(rebuild_entry_query(type, parser_fields['entry_statuses'])).find_each do |e|
           seen[e.identifier] = true
           e.status_info('Pending', importer.current_run)
@@ -432,7 +435,7 @@ module Bulkrax
       Zip::File.open(file_to_unzip) do |zip_file|
         zip_file.each do |entry|
-          entry_path = File.join(importer_unzip_path, entry.name)
+          entry_path = File.join(importer_unzip_path(mkdir: true), entry.name)
           FileUtils.mkdir_p(File.dirname(entry_path))
           zip_file.extract(entry, entry_path) unless File.exist?(entry_path)
         end
@@ -440,12 +443,27 @@ module Bulkrax
     end
     def untar(file_to_untar)
-      Dir.mkdir(importer_unzip_path) unless File.directory?(importer_unzip_path)
+      Dir.mkdir(importer_unzip_path(mkdir: true)) unless File.directory?(importer_unzip_path(mkdir: true))
       command = "tar -xzf #{Shellwords.escape(file_to_untar)} -C #{Shellwords.escape(importer_unzip_path)}"
       result = system(command)
       raise "Failed to extract #{file_to_untar}" unless result
     end
+    # File names referenced in CSVs have spaces replaced with underscores
+    # @see Bulkrax::CsvParser#file_paths
+    def remove_spaces_from_filenames
+      files = Dir.glob(File.join(importer_unzip_path, 'files', '*'))
+      files_with_spaces = files.select { |f| f.split('/').last.match?(' ') }
+      return if files_with_spaces.blank?
+      files_with_spaces.map! { |path| Pathname.new(path) }
+      files_with_spaces.each do |path|
+        filename = path.basename
+        filename_without_spaces = filename.to_s.tr(' ', '_')
+        path.rename(File.join(path.dirname, filename_without_spaces))
+      end
+    end
     def zip
       FileUtils.mkdir_p(exporter_export_zip_path)