RubyGems - bulk_ops - Versions diffs - 0.1.23 → 0.2.0 - Mend

bulk_ops 0.1.23 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

checksums.yaml +4 -4
data/db/migrate/20200122234235_remove_relationships_ammend_work_proxy.rb +14 -0
data/lib/bulk_ops.rb +3 -2
data/lib/bulk_ops/apply_operation_job.rb +8 -0
data/lib/bulk_ops/create_work_job.rb +1 -1
data/lib/bulk_ops/github_access.rb +1 -1
data/lib/bulk_ops/operation.rb +57 -49
data/lib/bulk_ops/parser.rb +50 -414
data/lib/bulk_ops/resolve_children_job.rb +14 -0
data/lib/bulk_ops/solr_service.rb +13 -0
data/lib/bulk_ops/update_work_job.rb +1 -1
data/lib/bulk_ops/verification.rb +2 -10
data/lib/bulk_ops/version.rb +1 -1
data/lib/bulk_ops/work_job.rb +20 -13
data/lib/bulk_ops/work_proxy.rb +18 -2
data/lib/concerns/interpret_controlled_behavior.rb +140 -0
data/lib/concerns/interpret_files_behavior.rb +82 -0
data/lib/concerns/interpret_options_behavior.rb +59 -0
data/lib/concerns/interpret_relationships_behavior.rb +123 -0
data/lib/concerns/interpret_scalar_behavior.rb +21 -0
data/lib/concerns/search_builder_behavior.rb +80 -0
metadata +12 -3
data/lib/bulk_ops/relationship.rb +0 -117

data/lib/bulk_ops/resolve_children_job.rb ADDED

@@ -0,0 +1,14 @@
+class BulkOps::ResolveChildrenJob < ActiveJob::Base
+  def perform(proxy_id)
+    proxy = BulkOps::WorkProxy.find(proxy_id)
+    if proxy.ordered_children.all?{|child| child.work_id.present?}
+      work = ActiveFedora::Base.find(proxy.work_id)
+      work.ordered_member_ids = proxy.ordered_children.map(&:work_id)
+      work.save
+    else
+      BulkOps::ResolveChildrenJob.set(wait: 30.minutes).perform_later(proxy_id)
+    end
+  end
+end

data/lib/bulk_ops/solr_service.rb ADDED

@@ -0,0 +1,13 @@
+class SolrService
+  def self.record_exists? id
+    begin
+      return true if SolrDocument.find(id)
+    rescue Blacklight::Exceptions::RecordNotFound
+      return false
+    end
+    return false
+  end
+end

data/lib/bulk_ops/update_work_job.rb CHANGED

@@ -13,7 +13,7 @@ class BulkOps::UpdateWorkJob < BulkOps::WorkJob
   def define_work workClass=nil
     # report an error if we can't find the work in solr
-    unless record_exists?(@work_proxy.work_id)
+    unless BulkOps::SolrService.record_exists?(@work_proxy.work_id)
       report_error "Could not find work to update with id: #{@work_proxy.work_id} referenced by work proxy: #{@work_proxy.id}"
       return false
     end

data/lib/bulk_ops/verification.rb CHANGED

@@ -59,14 +59,6 @@ module BulkOps
       filenames.map { |filename| File.join(BulkOps::INGEST_MEDIA_PATH, options['file_prefix'] || "", filename) }
     end
-    def record_exists? id
-      begin
-        return true if SolrDocument.find(id)
-      rescue Blacklight::Exceptions::RecordNotFound
-        return false
-      end
-    end
     private
     def verify_files
@@ -165,7 +157,7 @@ module BulkOps
       get_spreadsheet.each_with_index do |row, row_num|
         id = get_ref_id(row)
         #TODO: find by other field. for now just id
-        unless (record_exists(id))
+        unless (BulkOps::SolrService.record_exists?(id))
           @verification_errors << BulkOps::Error.new(type: :cannot_find_work, id: id)
         end
       end
@@ -205,7 +197,7 @@ module BulkOps
             end
           elsif ref_id.include?("id")
             # This is a hydra id reference. It should correspond to an object already in the repo
-            unless record_exists?(obj_id)
+            unless BulkOps::SolrService.record_exists?(obj_id)
               @verification_errors << BulkOps::Error.new({type: :bad_object_reference, object_id: obj_id, row_number: row_num + BulkOps:: ROW_OFFSET})
             end
           end

data/lib/bulk_ops/version.rb CHANGED

@@ -1,3 +1,3 @@
 module BulkOps
-  VERSION = "0.1.23"
+  VERSION = "0.2.0"
 end

data/lib/bulk_ops/work_job.rb CHANGED

@@ -13,11 +13,25 @@ class BulkOps::WorkJob < ActiveJob::Base
       update_status "error"
     else
       @work_proxy.work_id = @work.id
-      update_status "complete"
+      # If this work has a parent outside of the current operation,
+      # and this is the first sibling (we only need to do this once per parent),
+      # queue a job to resolve that work's new children
+      if @work_proxy.parent_id.present? && (parent_proxy = BulkOps::WorkProxy.find(parent_id))
+        if parent_proxy.operation_id != @work_proxy.operation_id
+          if @work_proxy.previous_sibling.nil?
+            BulkOps::ResolveChildrenJob.set(wait: 10.minutes).perform_later(parent_proxy.id)
+          end
+        end
+      end
-      # Attempt to resolve all of the relationships defined in this row
-      @work_proxy.relationships.each do |relationship|
-        relationship.resolve!
+      # Set up jobs to link child works (once they are ingested)
+      # or mark as complete otherwise
+      if (children = @work_proxy.ordered_children)
+        BulkOps::ResolveChildrenJob.perform_later(@work_proxy.id)
+        update_status "awaiting_children"
+      else
+        update_status "complete"
       end
       # Delete any UploadedFiles. These take up tons of unnecessary disk space.
@@ -52,6 +66,7 @@ class BulkOps::WorkJob < ActiveJob::Base
       report_error("Cannot find work proxy with id: #{work_proxy_id}")
       return
     end
+    return if @work_proxy.status == "complete"
     return unless (work_action = define_work(workClass))
@@ -66,7 +81,7 @@ class BulkOps::WorkJob < ActiveJob::Base
   def define_work(workClass="Work")
-    if (@work_proxy.present? && @work_proxy.work_id.present? && record_exists?(@work_proxy.work_id))
+    if (@work_proxy.present? && @work_proxy.work_id.present? && BulkOps::SolrService.record_exists?(@work_proxy.work_id))
       begin
         @work = ActiveFedora::Base.find(@work_proxy.work_id)
         return :update
@@ -80,14 +95,6 @@ class BulkOps::WorkJob < ActiveJob::Base
     end
   end
-  def record_exists? id
-    begin
-      return true if SolrDocument.find(id)
-    rescue Blacklight::Exceptions::RecordNotFound
-      return false
-    end
-  end
   def report_error message=nil
     update_status "job_error", message: message
   end

data/lib/bulk_ops/work_proxy.rb CHANGED

@@ -2,7 +2,6 @@ class BulkOps::WorkProxy < ActiveRecord::Base
   self.table_name = "bulk_ops_work_proxies"
   belongs_to :operation, class_name: "BulkOps::Operation", foreign_key: "operation_id"
-  has_many :relationships, class_name: "BulkOps::Relationship"
   attr_accessor :proxy_errors
@@ -38,5 +37,22 @@ class BulkOps::WorkProxy < ActiveRecord::Base
     @proxy_errors ||= []
   end
+  def ordered_siblings
+    return nil unless (parent = BulkOps::WorkProxy.find(parent_id))
+    parent.ordered_children - self
+  end
+  def ordered_children
+    children = BulkOps::WorkProxy.where(parent_id: id)
+    ordered_kids = []
+    previous_id = nil
+    while ordered_kids.length < children.length do
+      next_child = children.find{|child| child.previous_sibling_id == previous_id}
+      break if (next_child.nil? or ordered_kids.include?(next_child))
+      previous_id = next_child.id
+      ordered_kids << next_child
+    end
+    ordered_kids = ordered_kids + (children - ordered_kids) if (children.length > ordered_kids.length)
+    ordered_kids
+  end
 end

data/lib/concerns/interpret_controlled_behavior.rb ADDED

@@ -0,0 +1,140 @@
+module BulkOps::InterpretControlledBehavior
+  extend ActiveSupport::Concern
+  def interpret_controlled_fields
+    # The labels array tracks the contents of columns marked as labels,
+    # which may require special validation
+    labels = {}
+    # This hash is populated with relevant data as we loop through the fields
+    controlled_data = {}
+    @raw_row.each do |field_name, value|
+      next if value.blank?  or field_name.blank?
+      field_name = field_name.to_s
+      #If our CSV interpreter is feeding us the headers as a line, ignore it.
+      next if field_name == value
+      #check if they are using the 'field_name.authority' syntax
+      authority = nil
+      if ((split=field_name.split('.')).count == 2)
+        authority = split.last
+        field_name = split.first
+      end
+      # get the field name, if this column is a metadata field
+      field_name_norm = find_field_name(field_name)
+      field = schema.get_field(field_name_norm)
+      # Ignore anything that isn't a controlled field
+      next unless field.present? && field.controlled?
+      # Keep track of label fields
+      if field_name.downcase.ends_with?("label")
+        next if @options["ignore_labels"]
+        labels[field_name_norm] ||= []
+        labels[field_name_norm] += BulkOps::Parser.split_values value
+        next unless @options["import_labels"]
+      end
+      # handle multiple values
+      value_array = BulkOps::Parser.split_values(value)
+      controlled_data[field_name_norm] ||= [] unless value_array.blank?
+      value_array.each do |value|
+        # Decide of we're dealing with a label or url
+        # It's an ID if it's a URL and the name doesn't end in 'label'
+        value.strip!
+        if value =~ /^#{URI::regexp}$/ and !field_name.downcase.ends_with?("label")
+          value_id = value
+        #          label = WorkIndexer.fetch_remote_label(value)
+        #          error_message =  "cannot fetch remote label for url: #{value}"
+        #          report_error( :cannot_retrieve_label , error_message, url: value, row_number: row_number) unless label
+        else
+          # It's a label, so unescape it and get the id
+          value = BulkOps::Parser.unescape_csv(value)
+          value_id = get_remote_id(value, property: field_name_norm, authority: authority) || localAuthUrl(field_name_norm, value)
+          #          label = value
+          report_error(:cannot_retrieve_url,
+                       message: "cannot find or create url for controlled vocabulary label: #{value}",
+                       url: value,
+                       row_number: row_number) unless value_id
+        end
+        destroy? =
+        atts = {id: value_id}
+        atts[:_destroy] = true if (field_name.downcase.starts_with?("remove") or field_name.downcase.starts_with?("delete"))
+        controlled_data[field_name_norm] << atts
+      end
+    end
+    # Actually add all the data
+    controlled_data.each do |property_name, data|
+      @metadata["#{property_name}_attributes"] ||= [] unless data.blank?
+      data.uniq.each do |datum|
+        @metadata["#{property_name}_attributes"].reject!{|val| val[:id] == datum[:id]}
+        @metadata["#{property_name}_attributes"] << datum
+      end
+    end
+  end
+  private
+  def localAuthUrl(property, value)
+    return value if (auth = getLocalAuth(property)).nil?
+    url =   findAuthUrl(auth, value) ||  mintLocalAuthUrl(auth,value)
+    return url
+  end
+  def findAuthUrl(auth, value)
+    value.strip!
+    return nil if auth.nil?
+    return nil unless (entries = Qa::Authorities::Local.subauthority_for(auth).search(value))
+    entries.each do |entry|
+      #require exact match
+      next unless entry["label"].force_encoding('UTF-8') == value.force_encoding('UTF-8')
+      url = entry["url"] || entry["id"]
+#      url = localIdToUrl(url,auth) unless url =~ URI::regexp
+      return url
+    end
+    return nil
+  end
+  def localIdToUrl(id,auth_name)
+    root_urls = {'production' => "https://digitalcollections.library.ucsc.edu",
+                 'staging' => "http://digitalcollections-staging.library.ucsc.edu",
+                 'sandbox' => "http://digitalcollections-staging-sandbox.library.ucsc.edu",
+                 'development' => "http://#{Socket.gethostname}",
+                 'test' => "http://#{Socket.gethostname}"}
+    return "#{root_urls[Rails.env.to_s]}/authorities/show/local/#{auth_name}/#{id}"
+  end
+    def mintLocalAuthUrl(auth_name, value)
+    value.strip!
+    id = value.parameterize
+    auth = Qa::LocalAuthority.find_or_create_by(name: auth_name)
+    entry = Qa::LocalAuthorityEntry.create(local_authority: auth,
+                                           label: value,
+                                           uri: id)
+    return localIdToUrl(id,auth_name)
+  end
+  def getLocalAuth(field_name)
+    field =  schema.get_property(field_name)
+    # There is only ever one local authority per field, so just pick the first you find
+    if vocs = field.vocabularies
+      vocs.each do |voc|
+        return voc["subauthority"] if voc["authority"].downcase == "local"
+      end
+    end
+    return nil
+  end
+  def get_remote_id(value, authority: nil, property: nil)
+    return false
+    #TODO retrieve URL for this value from the specified remote authr
+  end
+end

data/lib/concerns/interpret_files_behavior.rb ADDED

@@ -0,0 +1,82 @@
+module BulkOps::InterpretFilesBehavior
+  extend ActiveSupport::Concern
+  # Handles the file additions and removals listed in the current spreadsheet row.
+  #
+  # Files to add are uploaded as Hyrax::UploadedFile records and their ids are
+  # collected in @metadata[:uploaded_files]; file sets to remove are handed to
+  # delete_file_set. Deletions needed because an update replaces some or all
+  # existing files are handled by the BulkOps::Operation, not here.
+  def interpret_file_fields
+    @raw_row.each do |field, value|
+      next if value.blank? || field.blank?
+      field = field.to_s
+      # If our CSV interpreter is feeding us the headers as a line, ignore it.
+      next if field == value
+      # Check if this is a file field, and whether we are removing or adding a file
+      next unless (action = BulkOps::Verification.is_file_field?(field))
+      # Move on if this field is the name of another property (e.g. masterFilename)
+      next if find_field_name(field)
+      if action == "remove"
+        get_removed_filesets(value).each { |fileset_id| delete_file_set(fileset_id) }
+      else
+        # Add a file
+        operation.get_file_paths(value).each do |filepath|
+          begin
+            # The block form closes the file handle once the upload record exists.
+            uploaded_file = File.open(filepath) do |file|
+              Hyrax::UploadedFile.create(file: file, user: operation.user)
+            end
+            (@metadata[:uploaded_files] ||= []) << uploaded_file.id unless uploaded_file.id.nil?
+          rescue StandardError => e
+            report_error(:upload_error,
+                         message: "Error opening file: #{ filepath } -- #{e}",
+                         file: filepath,
+                         row_number: row_number)
+          end
+        end
+      end
+      # Check if any of the upcoming rows are child filesets
+      # FIXME: child_row is not defined in this module. Unless the including
+      # class provides it, this loop raises NameError as soon as is_file_set?
+      # returns true. The row lookup is left as-is because its source is not
+      # visible from here.
+      i = 1
+      while self.class.is_file_set?(@metadata,row_number+i)
+        child_row.each do |child_field, child_value|
+          next if child_value.blank?
+          next unless BulkOps::Verification.is_file_field?(child_field)
+          operation.get_file_paths(child_value).each do |filepath|
+            File.open(filepath) { |file| Hyrax::UploadedFile.create(file: file, user: operation.user) }
+          end
+        end
+        i += 1
+      end
+    end
+  end
+  private
+  def get_removed_filesets(filestring)
+    file_ids = BulkOps::Parser.split_values(filestring)
+    file_ids.select{|file_id| BulkOps::SolrService.record_exists?(file_id)}
+# This part handles filenames in addition to file ids. It doesn't work yet!
+#    file_ids.map do |file_id|
+      # If the filename is the id of an existing record, keep that
+#      next(file_id) if (BulkOps::SolrService.record_exists?(file_id))
+      # If this is the label (i.e.filename) of an existing fileset, use that fileset id
+      # TODO MAKE THIS WORK!!
+#      next(filename) if (filename_exists?(filename))
+#      File.join(BulkOps::INGEST_MEDIA_PATH, filename_prefix, filename)
+#    end
+  end
+  def delete_file_set fileset_id
+    BulkOps::DeleteFileSetJob.perform_later(fileset_id, operation.user.email )
+  end
+end

data/lib/concerns/interpret_options_behavior.rb ADDED

@@ -0,0 +1,59 @@
+module BulkOps::InterpretOptionsBehavior
+  extend ActiveSupport::Concern
+  def interpret_option_fields
+    @raw_row.each do |field,value|
+      next if value.blank? or field.blank?
+      field = field.to_s
+      next if value == field
+      normfield = field.downcase.parameterize.gsub(/[_\s-]/,'')
+      if ["visibility", "public"].include?(normfield)
+        @proxy.update(visibility: format_visibility(value))
+      end
+      if ["worktype","model","type"].include?(normfield)
+        @proxy.update(work_type: format_worktype(value) )
+      end
+      if ["referenceidentifier",
+          "referenceid",
+          "refid",
+          "referenceidentifiertype",
+          "referenceidtype",
+          "refidtype",
+          "relationshipidentifier",
+          "relationshipid",
+          "relationshipidentifiertype",
+          "relationshipidtype",
+          "relid",
+          "relidtype"].include?(normfield)
+        @proxy.update(reference_identifier: format_reference_id(value))
+      end
+    end
+  end
+  private
+  def format_worktype(value)
+    # format the value like a class name
+    type = value.titleize.gsub(/[-_\s]/,'')
+    # reject it if it isn't a defined class
+    type = false unless Object.const_defined? type
+    # fall back to the work type defined by the operation, or a standard "Work"
+    return type ||= work_type || operation.work_type || "Work"
+  end
+  def format_visibility(value)
+    case value.downcase
+    when "public", "open", "true"
+      return "open"
+    when "campus", "ucsc", "institution"
+      return "ucsc"
+    when "restricted", "private", "closed", "false"
+      return "restricted"
+    end
+  end
+end