RubyGems - bulk_ops - Versions diffs - 0.1.23 → 0.2.0 - Mend

bulk_ops 0.1.23 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +4 -4
data/db/migrate/20200122234235_remove_relationships_ammend_work_proxy.rb +14 -0
data/lib/bulk_ops.rb +3 -2
data/lib/bulk_ops/apply_operation_job.rb +8 -0
data/lib/bulk_ops/create_work_job.rb +1 -1
data/lib/bulk_ops/github_access.rb +1 -1
data/lib/bulk_ops/operation.rb +57 -49
data/lib/bulk_ops/parser.rb +50 -414
data/lib/bulk_ops/resolve_children_job.rb +14 -0
data/lib/bulk_ops/solr_service.rb +13 -0
data/lib/bulk_ops/update_work_job.rb +1 -1
data/lib/bulk_ops/verification.rb +2 -10
data/lib/bulk_ops/version.rb +1 -1
data/lib/bulk_ops/work_job.rb +20 -13
data/lib/bulk_ops/work_proxy.rb +18 -2
data/lib/concerns/interpret_controlled_behavior.rb +140 -0
data/lib/concerns/interpret_files_behavior.rb +82 -0
data/lib/concerns/interpret_options_behavior.rb +59 -0
data/lib/concerns/interpret_relationships_behavior.rb +123 -0
data/lib/concerns/interpret_scalar_behavior.rb +21 -0
data/lib/concerns/search_builder_behavior.rb +80 -0
metadata +12 -3
data/lib/bulk_ops/relationship.rb +0 -117

data/lib/concerns/interpret_relationships_behavior.rb ADDED

@@ -0,0 +1,123 @@
+module BulkOps::InterpretRelationshipsBehavior
+  extend ActiveSupport::Concern
+  def interpret_relationship_fields
+    @raw_row.each do |field,value|
+      next if value.blank?  or field.blank? or value == field
+      #the default identifier type is the reference identifier of the proxy
+      id_type = reference_identifier
+      # Correctly interpret the notation "parent:id", "parent id" etc in a column header
+      if (split = field.split(/[:_\-\s]/)).count == 2
+        id_type = split.last
+        field = split.first
+      end
+      # skip to next field unless it's a known relationship field
+      next unless (relationship_type = self.class.normalize_relationship_field_name(field))
+      case relationship_type
+      when "order"
+        # If the field specifies the object's order among siblings
+        @proxy.update(order: value.to_f)
+        next
+      when "collection"
+        # If the field specifies the name or ID of a collection,
+        # find or create the collection and update the metadata to match
+        col = find_or_create_collection(value)
+        ( @metadata[:member_of_collection_ids] ||= [] ) << col.id if col
+        next
+      when "parent"
+        # Correctly interpret the notation "row:349", "id:s8df4j32w" etc in a cell
+        if (split = value.split(/[:_\\s]/)).count == 2
+          id_type = split.first
+          value = split.last
+        end
+        parent = find_parent_proxy(value, field, id_type)
+        proxy_updates =  { parent_id: parent.id}
+        siblings = parent.ordered_children
+        if siblings.present? && @proxy.previous_sibling_id.nil?
+          proxy_updates[:previous_sibling_id] = siblings.last.id
+        end
+        @proxy.update(proxy_updates)
+      end
+    end
+  end
+  private
+  def find_previous_parent_row field="parent"
+    #Return the row number of the most recent preceding row that does
+    # not itself have a parent defined
+    i = 1;
+    while (prev_row = raw_data[row_number - i])
+      return (row_number - i) if prev_row[field].blank?
+      i += 1
+    end
+  end
+  def find_parent_proxy parent_id, field, id_type
+    #The id_type determines what kind of identifier we expect in parent_id
+    case id_type.downcase
+    when "id"
+      # Expect a reference to an existing work in the DAMS
+      return false unless BulkOps::SolrService.record_exists?(parent_id.to_s)
+      # Pull the work proxy for that work, if it exists
+      parent_proxy = BulkOps::WorkProxy.find_by(work_id: parent_id.to_s, operation_id: @proxy.operation.id) || BulkOps::WorkProxy.find_by(work_id: parent_id.to_s)
+      # If no work proxy exists for this work, create one just to keep track of this task
+      return parent_proxy if proxy.present?
+      return BulkOps::WorkProxies.create(status: "awaiting_children",
+                                         operation_id: 0,
+                                         last_event: DateTime.now,
+                                         work_id: parent_id.to_s)
+    when "proxy_id"
+      return BulkOps::WorkProxy.find(parent_id)
+    when "row"
+      if parent_id =~ /\A[-+]?[0-9]+\z/
+        if parent_id.to_i < 0
+          # if given a negative integer, count backwards from the current row (remember that parent_id.to_i is negative)
+          parent_id = @proxy.row_number.to_i + parent_id.to_i
+        elsif parent_id.to_i > 0
+          # if given a positive integer, just remove the row offset
+          parent_id = parent_id.to_i - BulkOps::ROW_OFFSET
+        end
+      elsif parent_id.to_s.downcase.include?("prev")
+        # if given any variation of the word "previous", get the first preceding row with no parent of its own
+        parent_id = find_previous_parent_row(field)
+      end
+      return BulkOps::WorkProxy.find_by(operation_id: @proxy.operation_id,
+                                        row_number: parent_id.to_i)
+      #    when "title"
+      #      #          TODO clean up solr query and add work type to it
+      #      query = "{!field f=title_tesim}#{object_identifier}"
+      #      objects = ActiveFedora::SolrService.instance.conn.get(ActiveFedora::SolrService.select_path,
+      #                                                            params: { fq: query, rows: 1})["response"]["docs"]
+      #      return ActiveFedora::Base.find(objects.first["id"]) if objects.present?
+      #      return false
+      #    when "identifier"
+      #      query = "{!field f=identifier_tesim}#{object_identifier}"
+      #      objects = ActiveFedora::SolrService.instance.conn.get(ActiveFedora::SolrService.select_path,params: { fq: query, rows: 100})["response"]["docs"]
+      #      return false if objects.blank?
+      #      return ActiveFedora::Base.find(objects.first["id"])
+    end
+  end
+  def find_collection(collection)
+    puts "FINDING COLLECTION: #{collection}"
+    cols = Collection.where(title: collection)
+    cols += Collection.where(title: collection).select{|col| col.title.first == collection}
+    cols += Collection.where(id: collection)
+    puts "COLLECTION: #{cols.last}"
+    return cols.last unless cols.empty?
+    return false
+  end
+  def find_or_create_collection(collection)
+    find_collection(collection) || Collection.create(title: [collection.to_s], depositor: operation.user.email, collection_type: Hyrax::CollectionType.find_by(title:"User Collection"))
+  end
+end

data/lib/concerns/interpret_scalar_behavior.rb ADDED

@@ -0,0 +1,21 @@
+module BulkOps::InterpretScalarBehavior
+  extend ActiveSupport::Concern
+  def interpret_scalar_fields
+     @raw_row.each do |field, values|
+      next if values.blank? or field.nil? or field == values
+      # get the field name, if this column is a metadata field
+      next unless field_name = find_field_name(field.to_s)
+      field = schema.get_field(field_name)
+      # Ignore controlled fields
+      next if field.controlled?
+      BulkOps::Parser.split_values(values).each do |value|
+        next if value.blank?
+        value = value.strip.encode('utf-8', :invalid => :replace, :undef => :replace, :replace => '_') unless value.blank?
+        value = BulkOps::Parser.unescape_csv(value)
+        (@metadata[field_name] ||= []) << value
+       end
+    end
+   end
+end

data/lib/concerns/search_builder_behavior.rb ADDED

@@ -0,0 +1,80 @@
+module BulkOps::SearchBuilderBehavior
+  extend ActiveSupport::Concern
+  included do
+    attr_reader :collection,
+                :admin_set,
+                :workflow_state
+    class_attribute :collection_field,
+                    :collection_id_field,
+                    :admin_set_field,
+                    :admin_set_id_field,
+                    :workflow_state_field,
+                    :workflow_state_id_field,
+                    :keyword_field
+    self.collection_field = 'member_of_collections_ssim'
+    self.collection_id_field = 'member_of_collection_ids_ssim'
+    self.admin_set_field = 'admin_set_tesim'
+    self.admin_set_id_field = 'isPartOf_ssim'
+    self.workflow_state_field = 'workflow_state_name_ssim'
+    self.keyword_field = 'all_fields'
+    self.default_processor_chain += [:member_of_collection,
+                                   :member_of_admin_set,
+                                   :in_workflow_state,
+                                   :with_keyword_query]
+  end
+  # @param [scope] Typically the controller object
+  def initialize(scope: {},
+                 collection: nil,
+                 collection_id: nil,
+                 admin_set: nil,
+                 admin_set_id: nil,
+                 workflow_state: nil,
+                 keyword_query: nil)
+    @collection = collection unless collection.blank?
+    @admin_set = admin_set unless admin_set.blank?
+    @admin_set_id = admin_set_id unless admin_set_id.blank?
+    @workflow_state = workflow_state unless workflow_state.blank?
+    @collection_id = collection_id unless collection_id.blank?
+    @workflow_state = workflow_state unless workflow_state.blank?
+    @keyword_query = keyword_query unless keyword_query.blank?
+    super(scope)
+  end
+  def models
+    [Work,Course,Lecture]
+  end
+  # include filters into the query to only include the collection memebers
+  def member_of_collection(solr_parameters)
+    solr_parameters[:fq] ||= []
+    solr_parameters[:fq] << "#{collection_field}:#{@collection}" if @collection
+    solr_parameters[:fq] << "#{collection_id_field}:#{@collection_id}" if @collection_id
+  end
+  # include filters into the query to only include the collection memebers
+  def member_of_admin_set(solr_parameters)
+    solr_parameters[:fq] ||= []
+    solr_parameters[:fq] << "#{admin_set_field}:#{@admin_set}" if @admin_set
+    solr_parameters[:fq] << "#{admin_set_id_field}:#{@admin_set_id}" if @admin_set_id
+  end
+  # include filters into the query to only include the collection memebers
+  def in_workflow_state(solr_parameters)
+    solr_parameters[:fq] ||= []
+    solr_parameters[:fq] << "#{workflow_state_field}:#{@workflow_state}" if @workflow_state
+  end
+  def with_keyword_query(solr_parameters)
+    if @keyword_query
+      solr_parameters[:q] ||= []
+      #    solr_parameters[:q] << "#{keyword_field}:#{@keyword_query}" if @keyword_query
+      solr_parameters[:q] << @keyword_query
+      solr_parameters[:qf] = "title_tesim titleAlternative_tesim subseries_tesim creator_label_tesim contributor_label_tesim originalPublisher_tesim publisher_tesim publisherHomepage_tesim resourceType_label_tesim  rightsHolder_label_tesim scale_tesim series_tesim source_tesim staffNote_tesim coordinates_tesim subjectName_label_tesim subjectPlace_label_tesim subjectTemporal_label_tesim subjectTopic_label_tesim dateCreated_tesim dateCreatedDisplay_tesim dateDigitized_tesim datePublished_tesim description_tesim physicalFormat_label_tesim keyword_tesim language_label_tesim license_tesim masterFilename_tesim physicalDescription_tesim accessRights_tesim itemCallNumber_tesim collectionCallNumber_tesim donorProvenance_tesim genre_label_tesim boxFolder_tesim subject_label_tesim file_format_tesim all_text_timv"
+    end
+    solr_parameters
+  end
+end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bulk_ops
 version: !ruby/object:Gem::Version
-  version: 0.1.23
+  version: 0.2.0
 platform: ruby
 authors:
 - Ned Henry, UCSC Library Digital Initiatives
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-01-06 00:00:00.000000000 Z
+date: 2020-01-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rails
@@ -97,7 +97,9 @@ files:
 - config/routes.rb
 - db/migrate/20180926190757_create_github_credentials.rb
 - db/migrate/20181017180436_create_bulk_ops_tables.rb
+- db/migrate/20200122234235_remove_relationships_ammend_work_proxy.rb
 - lib/bulk_ops.rb
+- lib/bulk_ops/apply_operation_job.rb
 - lib/bulk_ops/create_spreadsheet_job.rb
 - lib/bulk_ops/create_work_job.rb
 - lib/bulk_ops/delete_file_set_job.rb
@@ -108,8 +110,9 @@ files:
 - lib/bulk_ops/operation.rb
 - lib/bulk_ops/parser.rb
 - lib/bulk_ops/queue_work_ingests_job.rb
-- lib/bulk_ops/relationship.rb
+- lib/bulk_ops/resolve_children_job.rb
 - lib/bulk_ops/search_builder_behavior.rb
+- lib/bulk_ops/solr_service.rb
 - lib/bulk_ops/templates/configuration.yml
 - lib/bulk_ops/templates/readme.md
 - lib/bulk_ops/update_work_job.rb
@@ -118,6 +121,12 @@ files:
 - lib/bulk_ops/version.rb
 - lib/bulk_ops/work_job.rb
 - lib/bulk_ops/work_proxy.rb
+- lib/concerns/interpret_controlled_behavior.rb
+- lib/concerns/interpret_files_behavior.rb
+- lib/concerns/interpret_options_behavior.rb
+- lib/concerns/interpret_relationships_behavior.rb
+- lib/concerns/interpret_scalar_behavior.rb
+- lib/concerns/search_builder_behavior.rb
 - lib/generators/bulk_ops/install/install_generator.rb
 - lib/generators/bulk_ops/install/templates/config/github.yml.example
 homepage: http://UCSCLibrary.github.org

data/lib/bulk_ops/relationship.rb DELETED

@@ -1,117 +0,0 @@
-class BulkOps::Relationship < ActiveRecord::Base # ActiveRecord model (removed in 0.2.0) describing a relationship between a work proxy's work and another object
-  RELATIONSHIP_FIELDS = ['parent','child','order','next','collection']
-  self.table_name = "bulk_ops_relationships"
-  belongs_to :work_proxy, class_name: "BulkOps::WorkProxy", foreign_key: "work_proxy_id"
-  delegate :operation, :operation_id, to: :work_proxy
-  def initialize *args # only forwards to super; the immediate resolve! call below is commented out
-    super *args
-    # Attempt to resolve the relationship immediately
-    # which might work in the case of updates
-#    resolve!
-  end
-  def findObject # looks up the related object according to identifier_type; several branches return false when nothing is found
-    case (identifier_type || "").downcase
-    when "id"
-      begin
-      object = ActiveFedora::Base.find(object_identifier)
-      rescue Ldp::Gone # a deleted object is reported as "not found"
-        return false
-      end
-      return object || false
-    when "title"
-      #          TODO clean up solr query and add work type to it
-      query = "{!field f=title_tesim}#{object_identifier}"
-      objects = ActiveFedora::SolrService.instance.conn.get(ActiveFedora::SolrService.select_path,
-                                                            params: { fq: query, rows: 100})["response"]["docs"]
-      if objects.present?
-        return ActiveFedora::Base.find(objects.first["id"])
-      elsif (relationship_type || "").downcase == "collection" # no title match: a collection relationship creates the collection
-        return Collection.create(title: [object_identifier])
-      else
-        return false
-      end
-    when "identifier"
-      query = "{!field f=identifier_tesim}#{object_identifier}"
-      objects = ActiveFedora::SolrService.instance.conn.get(ActiveFedora::SolrService.select_path,params: { fq: query, rows: 100})["response"]["docs"]
-      return false if objects.blank?
-      return ActiveFedora::Base.find(objects.first["id"])
-    when "row"
-      object_proxy = BulkOps::WorkProxy.find_by(operation_id: work_proxy.operation_id,
-                                                row_number: (object_identifier.to_i))
-      ActiveFedora::Base.find(object_proxy.work_id) # NOTE(review): no nil check on object_proxy here
-    when "proxy_id"
-      return false unless (proxy = BulkOps::WorkProxy.find(proxy_id)) # NOTE(review): reads proxy_id, not object_identifier as the other branches do
-      return false unless proxy.work_id.present?
-      ActiveFedora::Base.find(proxy.work_id)
-    end
-  end
-  def resolve! # implements the relationship when both ends resolve, otherwise marks it pending via wait!
-    unless subject = work_proxy.work and object = self.findObject
-      wait!
-      return
-    end
-    implement_relationship! relationship_type, subject, object
-  end
-  def insert_among_children(object,new_member) # returns the ordered members of object as an array with new_member inserted; nil unless this is a "parent" relationship
-    return nil unless ["parent"].include?((relationship_type || "").downcase)
-    prev_sib_id = previous_sibling
-    # This is the id of the WorkProxy associate with the most recent sibling work
-    # that might be fully ingested. If is it not fully ingested, we will move on
-    # to the preceding sibling.
-    while prev_sib_id.present?
-      prev_sib_proxy = BulkOps::WorkProxy.find(prev_sib_id)
-      # Check if the previous sibling is fully ingested
-      # and get its index among its siblings (if it has been successfully attached to the parent)
-      prev_sib_index = object.ordered_member_ids.index(prev_sib_proxy.work_id) if prev_sib_proxy.work_id.present?
-      # Insert the new member among its siblings if we found the right place
-      return object.ordered_members.to_a.insert(prev_sib_index+1, new_member) if prev_sib_index.present?
-      # Otherwise, pull up the sibling's relationship field to check if it sibling has a sibling before it
-      sib_relationship = prev_sib_proxy.relationships.find{|rel| rel.findObject.id == object.id }
-      # If we can't find an ingested sibling among the ordered members,
-      # break this loop and make this work the first member.
-      break unless sib_relationship.present?
-      prev_sib_id = sib_relationship.previous_sibling
-    end
-    #If we never found an existing previous sibling already attached, put this one at the front
-    return  [new_member]+object.ordered_members.to_a
-  end
-  def implement_relationship!(type,subject,object) # applies the relationship of the given type, then sets status to "complete" regardless of type
-    case (type || "").downcase
-    when "parent"
-      unless object.member_ids.include? subject.id
-        object.reload
-        object.save
-        object.ordered_members = insert_among_children(object, subject)
-        object.save
-      end
-    when "child"
-      #CAVEAT ordering not fully implemented in this case
-      unless subject.member_ids.include? object.id
-        subject.ordered_members << object
-        subject.save
-      end
-    when "order"
-      #TODO - implement this - related to ordering of filesets
-    end
-    update(status: "complete")
-  end
-  private
-  def fail! # sets status to "failed"
-    update(status: "failed")
-  end
-  def wait! # sets status to "pending"
-    update(status: "pending")
-  end
-end