RubyGems - bulk_ops - Versions diffs - 0.1.3 - Mend

bulk_ops 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

checksums.yaml +7 -0
data/app/assets/images/bulk_ops/github_logo.png +0 -0
data/app/assets/javascripts/bulk_ops.js +14 -0
data/app/assets/javascripts/bulk_ops/selections.js +24 -0
data/app/assets/javascripts/selections.js +38 -0
data/app/assets/javascripts/work_search.js +64 -0
data/app/assets/stylesheets/bulk_ops.scss +99 -0
data/app/controllers/bulk_ops/application_controller.rb +13 -0
data/app/controllers/bulk_ops/github_authorization_controller.rb +33 -0
data/app/controllers/bulk_ops/operations_controller.rb +481 -0
data/app/jobs/bulk_ops/application_job.rb +4 -0
data/app/mailers/bulk_ops/application_mailer.rb +6 -0
data/app/models/bulk_ops/application_record.rb +5 -0
data/app/views/bulk_ops/_bulk_ops_sidebar_widget.html.erb +15 -0
data/app/views/bulk_ops/_github_auth_widget.html.erb +13 -0
data/app/views/bulk_ops/operations/_bulk_ops_header.html.erb +4 -0
data/app/views/bulk_ops/operations/_choose_fields.html.erb +22 -0
data/app/views/bulk_ops/operations/_choose_notifications.html.erb +22 -0
data/app/views/bulk_ops/operations/_git_message.html.erb +7 -0
data/app/views/bulk_ops/operations/_ingest_options.html.erb +42 -0
data/app/views/bulk_ops/operations/_operation_options.html.erb +38 -0
data/app/views/bulk_ops/operations/_show_authorize.html.erb +13 -0
data/app/views/bulk_ops/operations/_show_complete.html.erb +31 -0
data/app/views/bulk_ops/operations/_show_draft.html.erb +20 -0
data/app/views/bulk_ops/operations/_show_new.html.erb +2 -0
data/app/views/bulk_ops/operations/_show_pending.html.erb +58 -0
data/app/views/bulk_ops/operations/_show_running.html.erb +56 -0
data/app/views/bulk_ops/operations/_show_verifying.html.erb +8 -0
data/app/views/bulk_ops/operations/_show_waiting.html.erb +9 -0
data/app/views/bulk_ops/operations/_update_draft_work_list.html.erb +45 -0
data/app/views/bulk_ops/operations/_update_draft_work_search.html.erb +59 -0
data/app/views/bulk_ops/operations/_update_options.html.erb +9 -0
data/app/views/bulk_ops/operations/index.html.erb +51 -0
data/app/views/bulk_ops/operations/new.html.erb +36 -0
data/app/views/bulk_ops/operations/show.html.erb +7 -0
data/config/routes.rb +25 -0
data/db/migrate/20180926190757_create_github_credentials.rb +13 -0
data/db/migrate/20181017180436_create_bulk_ops_tables.rb +40 -0
data/lib/bulk_ops.rb +15 -0
data/lib/bulk_ops/create_spreadsheet_job.rb +43 -0
data/lib/bulk_ops/create_work_job.rb +14 -0
data/lib/bulk_ops/delete_file_set_job.rb +15 -0
data/lib/bulk_ops/engine.rb +6 -0
data/lib/bulk_ops/error.rb +141 -0
data/lib/bulk_ops/github_access.rb +284 -0
data/lib/bulk_ops/github_credential.rb +3 -0
data/lib/bulk_ops/operation.rb +358 -0
data/lib/bulk_ops/relationship.rb +79 -0
data/lib/bulk_ops/search_builder_behavior.rb +80 -0
data/lib/bulk_ops/templates/configuration.yml +5 -0
data/lib/bulk_ops/templates/readme.md +1 -0
data/lib/bulk_ops/update_work_job.rb +14 -0
data/lib/bulk_ops/verification.rb +210 -0
data/lib/bulk_ops/verification_job.rb +23 -0
data/lib/bulk_ops/version.rb +3 -0
data/lib/bulk_ops/work_job.rb +104 -0
data/lib/bulk_ops/work_proxy.rb +466 -0
data/lib/generators/bulk_ops/install/install_generator.rb +27 -0
data/lib/generators/bulk_ops/install/templates/config/github.yml.example +28 -0
metadata +145 -0

data/lib/bulk_ops/relationship.rb ADDED

@@ -0,0 +1,79 @@
+class BulkOps::Relationship < ActiveRecord::Base
+  RELATIONSHIP_FIELDS = ['parent','child','order','next','collection']
+  self.table_name = "bulk_ops_relationships"
+  belongs_to :work_proxy, class_name: "BulkOps::WorkProxy", foreign_key: "work_proxy_id"
+  def initialize *args
+    super *args
+    # Attempt to resolve the relationship immediately
+    # which might work in the case of updates
+    resolve!
+  end
+  def findObject
+    work_type = (relationship_type.downcase == "collection") ? "Collection" : work_proxy.work_type
+    case identifier_type
+    when "id"
+      begin
+      object = ActiveFedora::Base.find(object_identifier)
+      rescue Ldp::Gone
+        return false
+      end
+      return object || false
+    when "title"
+      #          TODO clean up solr query and add work type to it
+      query = "{!field f=title_tesim}#{object_identifier}"
+      objects = ActiveFedora::SolrService.instance.conn.get(ActiveFedora::SolrService.select_path,params: { fq: query, rows: 100})["response"]["docs"].first
+      object = objects.first
+      object ||= Collection.create(title: [object_identifier]) if work_type == "Collection"
+      return object || false
+    when "identifier"
+      query = "{!field f=identifier_tesim}#{object_identifier}"
+      objects = ActiveFedora::SolrService.instance.conn.get(ActiveFedora::SolrService.select_path,params: { fq: query, rows: 100})["response"]["docs"]
+      return false if objects.blank?
+      return objects.first
+    end
+  end
+  def resolve! ()
+    unless subject = work_proxy.work and object = self.findObject
+      wait!
+      return
+    end
+    implement_relationship! relationship_type, subject, object
+  end
+  def implement_relationship!(type,subject,object)
+    case type
+    when "parent"
+      object.ordered_members << subject
+      object.save
+    when "child"
+      subject.ordered_members << object
+      subject.save
+    when "collection"
+      object.add_members([subject.id])
+      object.save
+    when "next"
+    #TODO - implement this - related to ordering of filesets
+    when "order"
+      #TODO - implement this - related to ordering of filesets
+    end
+    update(status: "complete")
+  end
+  private
+  def fail!
+    update(status: "failed")
+  end
+  def wait!
+    update(status: "pending")
+  end
+end

data/lib/bulk_ops/search_builder_behavior.rb ADDED

@@ -0,0 +1,80 @@
+module BulkOps::SearchBuilderBehavior
+  extend ActiveSupport::Concern
+  included do
+    attr_reader :collection,
+                :admin_set,
+                :workflow_state
+    class_attribute :collection_field,
+                    :collection_id_field,
+                    :admin_set_field,
+                    :admin_set_id_field,
+                    :workflow_state_field,
+                    :workflow_state_id_field,
+                    :keyword_field
+    self.collection_field = 'member_of_collections_ssim'
+    self.collection_id_field = 'member_of_collection_ids_ssim'
+    self.admin_set_field = 'admin_set_tesim'
+    self.admin_set_id_field = 'isPartOf_ssim'
+    self.workflow_state_field = 'workflow_state_name_ssim'
+    self.keyword_field = 'all_fields'
+    self.default_processor_chain += [:member_of_collection,
+                                   :member_of_admin_set,
+                                   :in_workflow_state,
+                                   :with_keyword_query]
+  end
+  # @param [scope] Typically the controller object
+  def initialize(scope: {},
+                 collection: nil,
+                 collection_id: nil,
+                 admin_set: nil,
+                 admin_set_id: nil,
+                 workflow_state: nil,
+                 keyword_query: nil)
+    @collection = collection unless collection.blank?
+    @admin_set = admin_set unless admin_set.blank?
+    @admin_set_id = admin_set_id unless admin_set_id.blank?
+    @workflow_state = workflow_state unless workflow_state.blank?
+    @collection_id = collection_id unless collection_id.blank?
+    @workflow_state = workflow_state unless workflow_state.blank?
+    @keyword_query = keyword_query unless keyword_query.blank?
+    super(scope)
+  end
+  def models
+    [Work,Course,Lecture]
+  end
+  # include filters into the query to only include the collection memebers
+  def member_of_collection(solr_parameters)
+    solr_parameters[:fq] ||= []
+    solr_parameters[:fq] << "#{collection_field}:#{@collection}" if @collection
+    solr_parameters[:fq] << "#{collection_id_field}:#{@collection_id}" if @collection_id
+  end
+  # include filters into the query to only include the collection memebers
+  def member_of_admin_set(solr_parameters)
+    solr_parameters[:fq] ||= []
+    solr_parameters[:fq] << "#{admin_set_field}:#{@admin_set}" if @admin_set
+    solr_parameters[:fq] << "#{admin_set_id_field}:#{@admin_set_id}" if @admin_set_id
+  end
+  # include filters into the query to only include the collection memebers
+  def in_workflow_state(solr_parameters)
+    solr_parameters[:fq] ||= []
+    solr_parameters[:fq] << "#{workflow_state_field}:#{@workflow_state}" if @workflow_state
+  end
+  def with_keyword_query(solr_parameters)
+    if @keyword_query
+      solr_parameters[:q] ||= []
+      #    solr_parameters[:q] << "#{keyword_field}:#{@keyword_query}" if @keyword_query
+      solr_parameters[:q] << @keyword_query
+      solr_parameters[:qf] = "title_tesim titleAlternative_tesim subseries_tesim creator_label_tesim contributor_label_tesim originalPublisher_tesim publisher_tesim publisherHomepage_tesim resourceType_label_tesim  rightsHolder_label_tesim scale_tesim series_tesim source_tesim staffNote_tesim coordinates_tesim subjectName_label_tesim subjectPlace_label_tesim subjectTemporal_label_tesim subjectTopic_label_tesim dateCreated_tesim dateCreatedDisplay_tesim dateDigitized_tesim datePublished_tesim description_tesim physicalFormat_label_tesim keyword_tesim language_label_tesim license_tesim masterFilename_tesim physicalDescription_tesim accessRights_tesim itemCallNumber_tesim collectionCallNumber_tesim donorProvenance_tesim genre_label_tesim boxFolder_tesim subject_label_tesim file_format_tesim all_text_timv"
+    end
+    solr_parameters
+  end
+end

data/lib/bulk_ops/templates/configuration.yml ADDED

@@ -0,0 +1,5 @@
+# Template configuration for a bulk operation branch.
+# notifications: e-mail addresses to notify (presumably what the operation
+# reads as options["notifications"] — confirm against the loader).
+notifications:
+  - ethenry@ucsc.edu
+name: "A Readable Branch Name (optional)"
+status: New
+type: "Update / Edit / Overlay"

data/lib/bulk_ops/templates/readme.md ADDED

	@@ -0,0 +1 @@
1	+ This is a readme file for the specific branch. The whole repository's readme file has different information. I'm not sure if we need both, since I haven't written either yet.

data/lib/bulk_ops/update_work_job.rb ADDED

@@ -0,0 +1,14 @@
+#require 'hydra/access_controls'
+#require 'hyrax/workflow/activate_object'
+require 'bulk_ops/work_job'
+# Work job variant used for updates: identical to BulkOps::WorkJob except
+# that it reports :update as its job type.
+class BulkOps::UpdateWorkJob < BulkOps::WorkJob
+  private
+  # Overrides the job type defined by the parent class.
+  # @return [Symbol] :update
+  def type
+    :update
+  end
+end

data/lib/bulk_ops/verification.rb ADDED

@@ -0,0 +1,210 @@
+module BulkOps
+  module Verification
+    extend ActiveSupport::Concern
+    def verify
+      @verification_errors ||= []
+      verify_column_headers
+      verify_remote_urls
+      verify_internal_references
+      verify_files
+      verify_works_to_update if operation_type.to_s == "update"
+      unless @verification_errors.blank?
+        error_file_name = BulkOps::Error.write_errors!(@verification_errors, git)
+        #notify everybody
+        notify(subject: "Errors verifying bulk #{operation_type} in Hycruz", message: "Hyrax ran a verification step to make sure that the spreadsheet for this bulk #{operation_type} is formatted correctly and won't create any errors. We found some problems. You can see a summary of the issues at this url: https://github.com/#{git.repo}/blob/#{git.name}/#{git.name}/errors/#{error_file_name}. Please fix these problems and run this verification again. The bulk #{operation_type} will not be allowed to move forward until all verification issues are resolved.")
+        return false
+      end
+      return true
+    end
+    def notify(subject: , message:)
+      options["notifications"].each do |email|
+        ActionMailer::Base.mail(from: "admin@digitalcollections.library.ucsc.edu",
+                                to: email,
+                                subject: subject,
+                                body: message).deliver
+      end
+    end
+    def is_file_field?(fieldname)
+      return false if fieldname.blank?
+      field_parts = fieldname.underscore.humanize.downcase.gsub(/[-_]/,' ').split(" ")
+      return false unless field_parts.any?{ |field_type| BulkOps::WorkProxy::FILE_FIELDS.include?(field_type) }
+      return "remove" if field_parts.any?{ |field_type| ['remove','delete'].include?(field_type) }
+      return "add"
+    end
+    def find_field_name(fieldname)
+      name = fieldname.dup
+      name.gsub!(/[_\s-]?[aA]ttributes$/,'')
+      name.gsub!(/[_\s-]?[lL]abel$/,'')
+      name.gsub!(/^[rR]emove[_\s-]?/,'')
+      name.gsub!(/^[dD]elete[_\s-]?/,'')
+      possible_fields = Work.attribute_names + schema.all_field_names
+      matching_fields = possible_fields.select{|pfield| pfield.gsub(/[_\s-]/,'').parameterize == name.gsub(/[_\s-]/,'').parameterize }
+      return false if matching_fields.blank?
+      #      raise Exception "Ambiguous metadata fields!" if matching_fields.uniq.count > 1
+      return matching_fields.first
+    end
+    def get_file_paths(filestring)
+      return [] if filestring.blank?
+      filenames = filestring.split(BulkOps::WorkProxy::SEPARATOR)
+      filenames.map { |filename| File.join(BulkOps::Operation::INGEST_MEDIA_PATH, options['file_prefix'] || "", filename) }
+    end
+    def record_exists? id
+      begin
+        return true if SolrDocument.find(id)
+      rescue Blacklight::Exceptions::RecordNotFound
+        return false
+      end
+    end
+    private
+    def verify_files
+      file_errors = []
+      get_spreadsheet.each_with_index do |row, row_num|
+        file_fields = row.select { |field, value| is_file_field?(field) }
+        file_fields.each do |column_name, filestring|
+          next if filestring.blank? or column_name == filestring
+          get_file_paths(filestring).each do |filepath|
+            file_errors << BulkOps::Error.new({type: :cannot_find_file, file: filepath}) unless  File.file? filepath
+          end
+        end
+      end
+      @verification_errors.concat file_errors
+      return file_errors
+    end
+    def verify_configuration
+      BulkOps::Operation::OPTION_REQUIREMENTS.each do |option_name, option_info|
+        # Make sure it's present if required
+        if (option_info["required"].to_s == "true") || (option_info["required"].to_s == type)
+          if options[option_name].blank?
+            @verification_errors << BulkOps::Error.new({type: :missing_required_option, option_name: option_name})
+          end
+        end
+        # Make sure the values are acceptable if present
+        unless (values = option_info.values).blank? || options[option_name].blank?
+          unless values.include? option[option_name]
+            values_string = values.reduce{|a,b| "#{a}, #{b}"}
+            @verification_errors << BulkOps::Error.new({type: :invalid_config_value, option_name: option_name, option_values: values_string})
+          end
+        end
+      end
+    end
+    def downcase_first_letter(str)
+      str[0].downcase + str[1..-1]
+    end
+    # Make sure the headers in the spreadsheet are matching to properties
+    def verify_column_headers
+      unless (headers = get_spreadsheet.headers)
+        # log an error if we can't get the metadata headers
+        @verification_errors << BulkOps::Error.new({type: :bad_header, field: column_name})
+      end
+      headers.each do |column_name|
+        next if column_name.blank?
+        column_name_redux = column_name.downcase.parameterize.gsub(/[_\s-]/,"")
+        # Ignore everything marked as a label
+        next if column_name_redux.ends_with? "label"
+        # Ignore any column names with special meaning in hyrax
+        next if BulkOps::Operation::SPECIAL_COLUMNS.any?{|col| col.downcase.parameterize.gsub(/[_\s-]/,"") == column_name_redux }
+        # Ignore any columns speficied to be ignored in the configuration
+        ignored = options["ignored headers"] || []
+        next if ignored.any?{|col| col.downcase.parameterize.gsub(/[_\s-]/,"") == column_name_redux }
+        # Column names corresponding to work attributes are legit
+        next if Work.attribute_names.any?{|col| col.downcase.parameterize.gsub(/[_\s-]/,"") == column_name_redux }
+        @verification_errors << BulkOps::Error.new({type: :bad_header, field: column_name})
+      end
+    end
+    def verify_remote_urls
+      get_spreadsheet.each do |row, row_num|
+        schema.controlled_field_names.each do |controlled_field_name|
+          next unless (url = row[controlled_field_name])
+          label = ::WorkIndexer.fetch_remote_label(url)
+          if !label || label.blank?
+            @verification_errors << BulkOps::Error.new({type: :cannot_retrieve_label, row: row_num + ROW_OFFSET, field: controlled_field_name, url: url})
+          end
+        end
+      end
+    end
+    def get_id_from_row row
+      ref_id = get_ref_id(row).to_sym
+      return :id if ref_id == :id
+      normrow = row.mapgsub(//,'').parameterize
+      if row.key?(ref_id)
+        # TODO if ref_id is another column
+        # TODO implement solr search
+      end
+    end
+    def verify_works_to_update
+      return [] unless operation_type == "update"
+      get_spreadsheet.each_with_index do |row, row_num|
+        id = get_ref_id(row)
+        #TODO: find by other field. for now just id
+        unless (record_exists(id))
+          @verification_errors << BulkOps::Error.new(type: :cannot_find_work, id: id)
+        end
+      end
+    end
+    def get_ref_id row
+      row.each do |field,value|
+        next if field.blank? or value.blank? or field === value
+        next unless BulkOps::WorkProxy::REFERENCE_IDENTIFIER_FIELDS.any?{ |ref_field| normalize_field(ref_field) ==  normalize_field(field) }
+        return value
+      end
+      # No reference identifier specified in the row. Use the default for the operation.
+      return reference_identifier || :id
+    end
+    def normalize_field field
+      return '' if field.nil?
+      field.downcase.parameterize.gsub(/[_\s-]/,'')
+    end
+    def verify_internal_references
+      # TODO
+      # This is sketchy. Redo it.
+      get_spreadsheet.each do |row,row_num|
+        ref_id = get_ref_id(row)
+        BulkOps::Operation::RELATIONSHIP_COLUMNS.each do |relationship|
+          next unless (obj_id = row[relationship])
+          if (split = obj_id.split(':')).count == 2
+            ref_id = split[0].downcase
+            obj_id = split[1]
+          end
+          if ref_id == "row" || (ref_id == "id/row" && obj_id.is_a?(Integer))
+            # This is a row number reference. It should be an integer in the range of possible row numbers.
+            unless obj_id.is_a? Integer && obj_id > 0 && obj_id <= metadata.count
+              @verification_errors << BulkOps::Error.new({type: :bad_object_reference, object_id: obj_id, row_number: row_num + ROW_OFFSET})
+            end
+          elsif ref_id == "id" || ref_id == "hyrax id" || (ref_id == "id/row" && (obj_id.is_a? Integer))
+            # This is a hydra id reference. It should correspond to an object already in the repo
+            unless record_exists?(obj_id)
+              @verification_errors << BulkOps::Error.new({type: :bad_object_reference, object_id: obj_id, row_number: row_num+ROW_OFFSET})
+            end
+          else
+            # This must be based on some other presumably unique field in hyrax, or a dummy field in the spreadsheet. We haven't added this functionality yet. Ignore for now.
+          end
+        end
+      end
+    end
+  end
+end

data/lib/bulk_ops/verification_job.rb ADDED

@@ -0,0 +1,23 @@
+#require 'hydra/access_controls'
+#require 'hyrax/workflow/activate_object'
+class BulkOps::VerificationJob < ActiveJob::Base
+  attr_accessor :operation
+  queue_as :default
+  def perform(operation)
+      if operation.verify
+        operation.set_stage "authorize"
+        if operation.create_pull_request
+          operation.notify(subject: "Bulk Operation Verification Successful", message: "Your bulk ingest has passed verification, and we have requested to start applying the operation. It may required one final approval from an administrator before the operation proceeds.")
+        else
+          operation.notify(subject: "Bulk Operation - Error creating Github pull request", message: "Your bulk ingest has passed verification, but we had a problem creating a pull request on Github in order to merge this operation with the master branch. Please check your github configuration.")
+        end
+      else
+        operation.set_stage "pending"
+      end
+  end
+end

data/lib/bulk_ops/version.rb ADDED

@@ -0,0 +1,3 @@
+module BulkOps
+  VERSION = "0.1.3"
+end

data/lib/bulk_ops/work_job.rb ADDED

@@ -0,0 +1,104 @@
+#require 'hydra/access_controls'
+#require 'hyrax/workflow/activate_object'
+class BulkOps::WorkJob < ActiveJob::Base
+  attr_accessor :status, :work, :type
+  queue_as :ingest
+  after_perform do |job|
+    # update BulkOperationsWorkProxy status
+    @work ||= ActiveFedora.find(@work_proxy.work_id)
+    if  @work.id.nil?
+      status = "error"
+    else
+      @work_proxy.work_id = @work.id
+      status = "complete"
+    end
+    update_status status
+    # Attempt to resolve all of the relationships defined in this row
+    @work_proxy.relationships.each do |relationship|
+      relationship.resolve!
+    end
+    # Attempt to resolve each dangling (objectless) relationships using
+    # this work as an object
+    BulkOps::Relationship.where(:status => "objectless").each do |relationship|
+      relationship.resolve! @work.id
+    end
+    # Delete any UploadedFiles. These take up tons of unnecessary disk space.
+    @work.file_sets.each do |fileset|
+      if uf = Hyrax::UploadedFile.find_by(file: fileset.label)
+        uf.destroy!
+      end
+    end
+    # Remove any edit holds placed on an item
+    @work_proxy.lift_hold
+    # Check if the parent operation is finished
+    # and do any cleanup if so
+    @work_proxy.operation.check_if_finished
+  end
+  def perform(workClass,user_email,attributes,work_proxy_id,visibility="private")
+    update_status "starting", "Initializing the job"
+    @work_proxy = BulkOps::WorkProxy.find(work_proxy_id)
+    unless @work_proxy
+      report_error("Cannot find work proxy with id: #{work_proxy_id}")
+      return
+    end
+    if record_exists?(@work_proxy.work_id)
+      # The work exists in Solr. Presumably we're updating it.
+      # Report an error if we can't retrieve the work from Fedora.
+      begin
+        @work = ActiveFedora::Base.find(@work_proxy.work_id)
+      rescue ActiveFedora::ObjectNotFoundError
+        report_error "Could not find work to update in Fedora (though it shows up in Solr). Work id: #{@work_proxy.work_id}"
+        return
+      end
+    else # The work is not found in Solr. If we're trying to update a work, we're in trouble.
+      if (type == "update")
+        report_error "Could not find work to update with id: #{@work_proxy.work_id}"
+        return
+      end
+      # Create the work we are ingesting
+      @work = workClass.capitalize.constantize.new
+    end
+    user = User.find_by_email(user_email)
+    update_status "running", "Started background task at #{DateTime.now.strftime("%d/%m/%Y %H:%M")}"
+    ability = Ability.new(user)
+    env = Hyrax::Actors::Environment.new(@work, ability, attributes)
+    update_status "complete", Hyrax::CurationConcern.actor.send(type,env)
+  end
+  private
+  def record_exists? id
+    begin
+      return true if SolrDocument.find(id)
+    rescue Blacklight::Exceptions::RecordNotFound
+      return false
+    end
+  end
+  def report_error message=nil
+    update_status "job_error", message: message
+  end
+  def type
+    #override this, setting as ingest by default
+    :create
+  end
+  def update_status status, message=false
+    return false unless @work_proxy
+    atts = {status: status}
+    atts[:message] = message if message
+    @work_proxy.update(atts)
+  end
+end