RubyGems - curation_concerns-models - Versions diffs - 0.1.0 - Mend

curation_concerns-models 0.1.0

Files changed (84) hide show

data/app/models/concerns/curation_concerns/solr_document_behavior.rb ADDED Viewed

@@ -0,0 +1,135 @@
+module CurationConcerns
+  module SolrDocumentBehavior
+    def title_or_label
+      title || label
+    end
+    ##
+    # Give our SolrDocument an ActiveModel::Naming appropriate route_key
+    def route_key
+      get(Solrizer.solr_name('has_model', :symbol)).split(':').last.downcase
+    end
+    def to_param
+      id
+    end
+    def to_s
+      title_or_label
+    end
+    ##
+    # Offer the source (ActiveFedora-based) model to Rails for some of the
+    # Rails methods (e.g. link_to).
+    # @example
+    #   link_to '...', SolrDocument(:id => 'bXXXXXX5').new => <a href="/dams_object/bXXXXXX5">...</a>
+    def to_model
+      @model ||= begin
+        m = ActiveFedora::Base.load_instance_from_solr(id, self)
+        m.class == ActiveFedora::Base ? self : m
+      end
+    end
+    def collection?
+      hydra_model == 'Collection'
+    end
+    # Method to return the ActiveFedora model
+    def hydra_model
+      self[Solrizer.solr_name('active_fedora_model', Solrizer::Descriptor.new(:string, :stored, :indexed))]
+    end
+    def human_readable_type
+      Array(self[Solrizer.solr_name('human_readable_type', :stored_searchable)]).first
+    end
+    def representative
+      Array(self[Solrizer.solr_name('representative', :stored_searchable)]).first
+    end
+    def date_uploaded
+      field = self[Solrizer.solr_name("date_uploaded", :stored_sortable, type: :date)]
+      return unless field.present?
+      begin
+        Date.parse(field).to_formatted_s(:standard)
+      rescue
+        Rails.logger.info "Unable to parse date: #{field.first.inspect} for #{self['id']}"
+      end
+    end
+    def depositor(default = '')
+      val = Array(self[Solrizer.solr_name("depositor")]).first
+      val.present? ? val : default
+    end
+    def title
+      Array(self[Solrizer.solr_name('title')]).first
+    end
+    def description
+      Array(self[Solrizer.solr_name('description')]).first
+    end
+    def label
+      Array(self[Solrizer.solr_name('label')]).first
+    end
+    def file_format
+       Array(self[Solrizer.solr_name('file_format')]).first
+    end
+    def creator
+      Array(self[Solrizer.solr_name("creator")]).first
+    end
+    def tags
+      Array(self[Solrizer.solr_name("tag")])
+    end
+    def resource_type
+      Array(self[Solrizer.solr_name("resource_type")])
+    end
+    def mime_type
+      Array(self[Solrizer.solr_name("mime_type")]).first
+    end
+    def read_groups
+      Array(self[::Ability.read_group_field])
+    end
+    def edit_groups
+      Array(self[::Ability.edit_group_field])
+    end
+    def edit_people
+      Array(self[::Ability.edit_user_field])
+    end
+    def public?
+      read_groups.include?('public')
+    end
+    def registered?
+      read_groups.include?('registered')
+    end
+    def pdf?
+      ['application/pdf'].include? self.mime_type
+    end
+    def image?
+      ['image/png','image/jpeg', 'image/jpg', 'image/jp2', 'image/bmp', 'image/gif', 'image/tiff'].include? self.mime_type
+    end
+    def video?
+      ['video/mpeg', 'video/mp4', 'video/webm', 'video/x-msvideo', 'video/avi', 'video/quicktime', 'application/mxf'].include? self.mime_type
+    end
+    def audio?
+      # audio/x-wave is the mime type that fits 0.6.0 returns for a wav file.
+      # audio/mpeg is the mime type that fits 0.6.0 returns for an mp3 file.
+      ['audio/mp3', 'audio/mpeg', 'audio/x-wave', 'audio/x-wav', 'audio/ogg'].include? self.mime_type
+    end
+  end
+end

data/app/models/concerns/curation_concerns/user.rb ADDED Viewed

@@ -0,0 +1,65 @@
+module CurationConcerns::User
+  extend ActiveSupport::Concern
+  # Copied piecemeal from the pcdm branch of sufia-models. More may yet be necessary.
+  included do
+    # Connects this user object to Blacklight's Bookmarks and Folders.
+    include Blacklight::User
+    include Hydra::User
+    delegate :can?, :cannot?, to: :ability
+    attr_accessor :update_directory
+  end
+  # Format the json for select2 which requires just an id and a field called text.
+  # If we need an alternate format we should probably look at a json template gem
+  def as_json(opts = nil)
+    { id: user_key, text: display_name ? "#{display_name} (#{user_key})" : user_key }
+  end
+  # Populate user instance with attributes from remote system (e.g., LDAP)
+  # There is no default implementation -- override this in your application
+  # def populate_attributes
+  # end
+  def email_address
+    self.email
+  end
+  def name
+    self.display_name.titleize || raise
+  rescue
+    self.user_key
+  end
+  # Redefine this for more intuitive keys in Redis
+  def to_param
+    # hack because rails doesn't like periods in urls.
+    user_key.gsub(/\./, '-dot-')
+  end
+  # The basic groups method, override or will fallback to S ufia::Ldap::User
+  # def groups
+  #   @groups ||= self.group_list ? self.group_list.split(";?;") : []
+  # end
+  def ability
+    @ability ||= ::Ability.new(self)
+  end
+  module ClassMethods
+    def current
+      Thread.current[:user]
+    end
+    def current=(user)
+      Thread.current[:user] = user
+    end
+    # def from_url_component(component)
+    #   User.find_by_user_key(component.gsub(/-dot-/, '.'))
+    # end
+  end
+end

data/app/models/concerns/curation_concerns/with_basic_metadata.rb ADDED Viewed

@@ -0,0 +1,98 @@
+module CurationConcerns
+  # Declares the descriptive metadata properties shared by including models and
+  # a schema.org item type lookup.
+  # This is a direct copy of Sufia::GenericFile::Metadata with a few modifications:
+  # * title & description are single-value instead of multivalue
+  # NOTE(review): the diff header lists this file as with_basic_metadata.rb while
+  # the module is named DefaultMetadata -- confirm the constant can be autoloaded.
+  module DefaultMetadata
+    extend ActiveSupport::Concern
+    included do
+      # NOTE(review): :label and :title (below) both use ::RDF::DC.title as
+      # their predicate -- confirm the two properties are meant to share it.
+      property :label, predicate: ::RDF::DC.title, multiple: false
+      property :depositor, predicate: ::RDF::URI.new("http://id.loc.gov/vocabulary/relators/dpt"), multiple: false do |index|
+        index.as :symbol, :stored_searchable
+      end
+      property :relative_path, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#relativePath'), multiple: false
+      property :import_url, predicate: ::RDF::URI.new('http://scholarsphere.psu.edu/ns#importUrl'), multiple: false do |index|
+        index.as :symbol
+      end
+      property :part_of, predicate: ::RDF::DC.isPartOf
+      property :resource_type, predicate: ::RDF::DC.type do |index|
+        index.as :stored_searchable, :facetable
+      end
+      property :title, predicate: ::RDF::DC.title, multiple:false do |index|
+        index.as :stored_searchable, :facetable
+      end
+      property :creator, predicate: ::RDF::DC.creator do |index|
+        index.as :stored_searchable, :facetable
+      end
+      property :contributor, predicate: ::RDF::DC.contributor do |index|
+        index.as :stored_searchable, :facetable
+      end
+      property :description, predicate: ::RDF::DC.description, multiple: false do |index|
+        index.type :text
+        index.as :stored_searchable
+      end
+      property :tag, predicate: ::RDF::DC.relation do |index|
+        index.as :stored_searchable, :facetable
+      end
+      property :rights, predicate: ::RDF::DC.rights do |index|
+        index.as :stored_searchable
+      end
+      property :publisher, predicate: ::RDF::DC.publisher do |index|
+        index.as :stored_searchable, :facetable
+      end
+      property :date_created, predicate: ::RDF::DC.created do |index|
+        index.as :stored_searchable
+      end
+      property :date_uploaded, predicate: ::RDF::DC.dateSubmitted, multiple: false do |index|
+        index.type :date
+        index.as :stored_sortable
+      end
+      property :date_modified, predicate: ::RDF::DC.modified, multiple: false do |index|
+        index.type :date
+        index.as :stored_sortable
+      end
+      property :subject, predicate: ::RDF::DC.subject do |index|
+        index.as :stored_searchable, :facetable
+      end
+      property :language, predicate: ::RDF::DC.language do |index|
+        index.as :stored_searchable, :facetable
+      end
+      property :identifier, predicate: ::RDF::DC.identifier do |index|
+        index.as :stored_searchable
+      end
+      property :based_near, predicate: ::RDF::FOAF.based_near do |index|
+        index.as :stored_searchable, :facetable
+      end
+      property :related_url, predicate: ::RDF::RDFS.seeAlso do |index|
+        index.as :stored_searchable
+      end
+      property :bibliographic_citation, predicate: ::RDF::DC.bibliographicCitation do |index|
+        index.as :stored_searchable
+      end
+      property :source, predicate: ::RDF::DC.source do |index|
+        index.as :stored_searchable
+      end
+      # TODO: Move this somewhere more appropriate
+      # Registers the local authority vocabularies for three properties. Any
+      # StandardError raised here is swallowed and a notice is printed to
+      # stdout instead, so including this module never fails at this point.
+      begin
+        LocalAuthority.register_vocabulary(self, "subject", "lc_subjects")
+        LocalAuthority.register_vocabulary(self, "language", "lexvo_languages")
+        LocalAuthority.register_vocabulary(self, "tag", "lc_genres")
+      rescue
+        puts "tables for vocabularies missing"
+      end
+    end
+    # Add a schema.org itemtype
+    # @return the configured schema type for the first non-empty resource type,
+    #   or 'http://schema.org/CreativeWork' when there is no mapping or the
+    #   lookup raises
+    def itemtype
+      # Look up the first non-empty resource type value in a hash from the config
+      CurationConcerns.config.resource_types_to_schema[resource_type.to_a.reject { |type| type.empty? }.first] || 'http://schema.org/CreativeWork'
+    rescue
+      'http://schema.org/CreativeWork'
+    end
+  end
+end

data/app/models/concerns/curation_concerns/with_generic_files.rb ADDED Viewed

@@ -0,0 +1,29 @@
+# Copied from Curate
+module CurationConcerns
+   module WithGenericFiles
+    extend ActiveSupport::Concern
+    included do
+      # The generic_files association and its accessor methods comes from Hydra::Works::AggregatesGenericFiles
+      before_destroy :before_destroy_cleanup_generic_files
+    end
+    # Stopgap unil ActiveFedora ContainerAssociation includes an *_ids accessor.
+    # At the moment, this is no more efficient than calling generic_files, but hopefully that will change in the future.
+    def generic_file_ids
+      generic_files.map { |generic_file| generic_file.id }
+    end
+    def before_destroy_cleanup_generic_files
+      generic_files.each(&:destroy)
+    end
+    def copy_visibility_to_files
+      generic_files.each do |gf|
+        gf.visibility = visibility
+        gf.save!
+      end
+    end
+  end
+end

data/app/models/curation_concerns/classify_concern.rb ADDED Viewed

@@ -0,0 +1,47 @@
+require 'active_attr'
+module CurationConcerns
+  class ClassifyConcern
+    include ActiveAttr::Model
+    attribute :curation_concern_type
+    validates(
+      :curation_concern_type,
+      presence: true,
+      inclusion: { in: lambda { |record| record.registered_curation_concern_types } }
+    )
+    def all_curation_concern_classes
+      registered_curation_concern_types.sort.map { |c| self.class.to_class(c) }
+    end
+    def registered_curation_concern_types
+      CurationConcerns.configuration.registered_curation_concern_types
+    end
+    def possible_curation_concern_types
+      registered_curation_concern_types.collect do |concern|
+        [self.class.to_class(concern).human_readable_type, concern]
+      end
+    end
+    def curation_concern_class
+      if possible_curation_concern_types.detect{|name, class_name|
+          class_name == curation_concern_type
+        }
+        self.class.to_class(curation_concern_type)
+      else
+        raise RuntimeError, "Invalid :curation_concern_type"
+      end
+    end
+    def self.to_class(type)
+      # TODO we may want to allow a different (or nil) namespace
+      type.camelize.constantize
+      # begin
+      #   "::#{type.camelize}".constantize
+      # rescue NameError
+      #   "CurationConcerns::#{type}".constantize
+      # end
+    end
+  end
+end

data/app/models/curation_concerns/quick_classification_query.rb ADDED Viewed

@@ -0,0 +1,31 @@
+module CurationConcerns
+  class QuickClassificationQuery
+    def self.each_for_context(*args, &block)
+      new(*args).all.each(&block)
+    end
+    attr_reader :user
+    def initialize(user, options = {})
+      @user = user
+      @concern_name_normalizer = options.fetch(:concern_name_normalizer, ClassifyConcern.method(:to_class))
+      @registered_curation_concern_names = options.fetch(:registered_curation_concern_names, CurationConcerns.configuration.registered_curation_concern_types)
+    end
+    def all
+      ActiveFedora::Base.logger.debug "User is #{user}"
+      ActiveFedora::Base.logger.debug "try is #{normalized_curation_concern_names.first}"
+      ActiveFedora::Base.logger.debug "can is  #{user.can?(:create, normalized_curation_concern_names.first)}"
+      normalized_curation_concern_names.select {|klass| user.can?(:create, klass)}
+    end
+    private
+    attr_reader :concern_name_normalizer, :registered_curation_concern_names
+    def normalized_curation_concern_names
+      registered_curation_concern_names.collect{|name| concern_name_normalizer.call(name) }
+    end
+  end
+end

data/app/models/datastreams/fits_datastream.rb ADDED Viewed

@@ -0,0 +1,148 @@
+# OM datastream describing FITS output XML: the terminology below maps elements
+# and attributes of the fits document to named terms, and xml_template builds
+# the empty starting document.
+class FitsDatastream < ActiveFedora::OmDatastream
+  include OM::XML::Document
+  set_terminology do |t|
+    t.root(path: "fits",
+           xmlns: "http://hul.harvard.edu/ois/xml/ns/fits/fits_output",
+           schema: "http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd")
+    # identification/identity: format label and mime type are read from attributes.
+    t.identification {
+      t.identity {
+        t.format_label(path: {attribute: "format"})
+        t.mime_type(path: {attribute: "mimetype"})
+      }
+    }
+    # fileinfo: size, timestamps, name, checksum and rights notes of the file.
+    t.fileinfo {
+      t.file_size(path: "size")
+      t.last_modified(path: "lastmodified")
+      t.filename(path: "filename")
+      t.original_checksum(path: "md5checksum")
+      t.rights_basis(path: "rightsBasis")
+      t.copyright_basis(path: "copyrightBasis")
+      t.copyright_note(path: "copyrightNote")
+    }
+    # filestatus: well-formedness / validity flags and the status message.
+    t.filestatus {
+      t.well_formed(path: "well-formed")
+      t.valid(path: "valid")
+      t.status_message(path: "message")
+    }
+    # metadata: one child node per content kind (document, image, text, audio, video).
+    t.metadata {
+      t.document {
+        t.file_title(path: "title")
+        t.file_author(path: "author")
+        t.file_language(path: "language")
+        t.page_count(path: "pageCount")
+        t.word_count(path: "wordCount")
+        t.character_count(path: "characterCount")
+        t.paragraph_count(path: "paragraphCount")
+        t.line_count(path: "lineCount")
+        t.table_count(path: "tableCount")
+        t.graphics_count(path: "graphicsCount")
+      }
+      t.image {
+        t.byte_order(path: "byteOrder")
+        t.compression(path: "compressionScheme")
+        t.width(path: "imageWidth")
+        t.height(path: "imageHeight")
+        t.color_space(path: "colorSpace")
+        t.profile_name(path: "iccProfileName")
+        t.profile_version(path: "iccProfileVersion")
+        t.orientation(path: "orientation")
+        t.color_map(path: "colorMap")
+        t.image_producer(path: "imageProducer")
+        t.capture_device(path: "captureDevice")
+        t.scanning_software(path: "scanningSoftwareName")
+        t.exif_version(path: "exifVersion")
+        t.gps_timestamp(path: "gpsTimeStamp")
+        t.latitude(path: "gpsDestLatitude")
+        t.longitude(path: "gpsDestLongitude")
+      }
+      t.text {
+        t.character_set(path: "charset")
+        t.markup_basis(path: "markupBasis")
+        t.markup_language(path: "markupLanguage")
+      }
+      t.audio {
+        t.duration(path: "duration")
+        t.bit_depth(path: "bitDepth")
+        t.sample_rate(path: "sampleRate")
+        t.channels(path: "channels")
+        t.data_format(path: "dataFormatType")
+        t.offset(path: "offset")
+      }
+      t.video {
+        t.width(path: "imageWidth")
+        t.height(path: "imageHeight")
+        t.duration(path: "duration")
+        t.sample_rate(path: "sampleRate")
+        t.frame_rate(path: "frameRate")
+      }
+    }
+    # Top-level proxy terms pointing at the nested terms declared above.
+    # width/height proxy the image node and duration/sample_rate proxy the
+    # audio node; the video equivalents carry a video_ prefix.
+    t.format_label(proxy: [:identification, :identity, :format_label])
+    t.mime_type(proxy: [:identification, :identity, :mime_type])
+    t.file_size(proxy: [:fileinfo, :file_size])
+    t.last_modified(proxy: [:fileinfo, :last_modified])
+    t.filename(proxy: [:fileinfo, :filename])
+    t.original_checksum(proxy: [:fileinfo, :original_checksum])
+    t.rights_basis(proxy: [:fileinfo, :rights_basis])
+    t.copyright_basis(proxy: [:fileinfo, :copyright_basis])
+    t.copyright_note(proxy: [:fileinfo, :copyright_note])
+    t.well_formed(proxy: [:filestatus, :well_formed])
+    t.valid(proxy: [:filestatus, :valid])
+    t.status_message(proxy: [:filestatus, :status_message])
+    t.file_title(proxy: [:metadata, :document, :file_title])
+    t.file_author(proxy: [:metadata, :document, :file_author])
+    t.page_count(proxy: [:metadata, :document, :page_count])
+    t.file_language(proxy: [:metadata, :document, :file_language])
+    t.word_count(proxy: [:metadata, :document, :word_count])
+    t.character_count(proxy: [:metadata, :document, :character_count])
+    t.paragraph_count(proxy: [:metadata, :document, :paragraph_count])
+    t.line_count(proxy: [:metadata, :document, :line_count])
+    t.table_count(proxy: [:metadata, :document, :table_count])
+    t.graphics_count(proxy: [:metadata, :document, :graphics_count])
+    t.byte_order(proxy: [:metadata, :image, :byte_order])
+    t.compression(proxy: [:metadata, :image, :compression])
+    t.width(proxy: [:metadata, :image, :width])
+    t.video_width( proxy: [:metadata, :video, :width])
+    t.height(proxy: [:metadata, :image, :height])
+    t.video_height(proxy: [:metadata, :video, :height])
+    t.color_space(proxy: [:metadata, :image, :color_space])
+    t.profile_name(proxy: [:metadata, :image, :profile_name])
+    t.profile_version(proxy: [:metadata, :image, :profile_version])
+    t.orientation(proxy: [:metadata, :image, :orientation])
+    t.color_map(proxy: [:metadata, :image, :color_map])
+    t.image_producer(proxy: [:metadata, :image, :image_producer])
+    t.capture_device(proxy: [:metadata, :image, :capture_device])
+    t.scanning_software(proxy: [:metadata, :image, :scanning_software])
+    t.exif_version(proxy: [:metadata, :image, :exif_version])
+    t.gps_timestamp(proxy: [:metadata, :image, :gps_timestamp])
+    t.latitude(proxy: [:metadata, :image, :latitude])
+    t.longitude(proxy: [:metadata, :image, :longitude])
+    t.character_set(proxy: [:metadata, :text, :character_set])
+    t.markup_basis(proxy: [:metadata, :text, :markup_basis])
+    t.markup_language(proxy: [:metadata, :text, :markup_language])
+    t.duration(proxy: [:metadata, :audio, :duration])
+    t.video_duration(proxy: [:metadata, :video, :duration])
+    t.bit_depth(proxy: [:metadata, :audio, :bit_depth])
+    t.sample_rate(proxy: [:metadata, :audio, :sample_rate])
+    t.video_sample_rate(proxy: [:metadata, :video, :sample_rate])
+    t.channels(proxy: [:metadata, :audio, :channels])
+    t.data_format(proxy: [:metadata, :audio, :data_format])
+    t.offset(proxy: [:metadata, :audio, :offset])
+    t.frame_rate(proxy: [:metadata, :video, :frame_rate])
+  end
+  # Builds the template document: a namespaced <fits> root (with fixed version
+  # and timestamp attributes) containing identification/identity with
+  # toolname 'FITS'.
+  # @return the Nokogiri document produced by the builder
+  def self.xml_template
+    builder = Nokogiri::XML::Builder.new do |xml|
+      xml.fits(xmlns: 'http://hul.harvard.edu/ois/xml/ns/fits/fits_output',
+               'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
+               'xsi:schemaLocation' =>
+    "http://hul.harvard.edu/ois/xml/ns/fits/fits_output
+    http://hul.harvard.edu/ois/xml/xsd/fits/fits_output.xsd",
+               version: "0.6.0",
+               timestamp: "1/25/12 11:04 AM") {
+        xml.identification { xml.identity(toolname: 'FITS') }
+      }
+    end
+    builder.doc
+  end
+end

data/app/models/version_committer.rb ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ class VersionCommitter < ActiveRecord::Base # ActiveRecord model with no behaviour of its own defined here
2	+ end

data/app/services/curation_concerns/characterization_service.rb ADDED Viewed

@@ -0,0 +1,71 @@
+module CurationConcerns
+  # Run FITS to gather technical metadata about the content and the full text.
+  # Store this extracted metadata in the characterization datastream.
+  class CharacterizationService
+    include Hydra::Derivatives::ExtractMetadata
+    delegate :mime_type, :uri, to: :@generic_file
+    attr_reader :generic_file
+    def self.run(generic_file)
+      new(generic_file).characterize
+    end
+    def initialize(generic_file)
+      @generic_file = generic_file
+    end
+    ## Extract the metadata from the content datastream and record it in the characterization datastream
+    def characterize
+      store_metadata(extract_metadata)
+      store_fulltext(extract_fulltext)
+      generic_file.filename = [generic_file.original_file.original_name]
+    end
+    protected
+      def store_fulltext(extracted_text)
+        if extracted_text.present?
+          extracted_text_file = generic_file.build_extracted_text
+          extracted_text_file.content = extracted_text
+        end
+      end
+      def extract_fulltext
+        FullTextExtractionService.run(generic_file)
+      end
+      def store_metadata(metadata)
+        generic_file.characterization.ng_xml = metadata if metadata.present?
+        append_metadata
+      end
+      def extract_metadata
+        return unless generic_file.original_file.has_content?
+        Hydra::FileCharacterization.characterize(generic_file.original_file.content, filename_for_characterization.join, :fits) do |config|
+          config[:fits] = Hydra::Derivatives.fits_path
+        end
+      end
+      # Populate GenericFile's properties with fields from FITS (e.g. Author from pdfs)
+      def append_metadata
+        terms = generic_file.characterization_terms
+        CurationConcerns.config.fits_to_desc_mapping.each_pair do |k, v|
+          if terms.has_key?(k)
+            # coerce to array to remove a conditional
+            terms[k] = [terms[k]] unless terms[k].is_a? Array
+            terms[k].each do |term_value|
+              proxy_term = generic_file.send(v)
+              if proxy_term.kind_of?(Array)
+                proxy_term << term_value unless proxy_term.include?(term_value)
+              else
+                # these are single-valued terms which cannot be appended to
+                generic_file.send("#{v}=", term_value)
+              end
+            end
+          end
+        end
+      end
+  end
+end

data/app/services/curation_concerns/full_text_extraction_service.rb ADDED Viewed

@@ -0,0 +1,38 @@
+module CurationConcerns
+  # Extract the full text from the content using Solr's extract handler
+  class FullTextExtractionService
+    def self.run(generic_file)
+      new(generic_file).extract
+    end
+    delegate :original_file, :logger, :mime_type, :id, to: :@generic_file
+    def initialize(generic_file)
+      @generic_file = generic_file
+    end
+    def extract
+      uri = URI("#{connection_url}/update/extract?extractOnly=true&wt=json&extractFormat=text")
+      req = Net::HTTP.new(uri.host, uri.port)
+      resp = req.post(uri.to_s, original_file.content, {
+          'Content-type' => "#{mime_type};charset=utf-8",
+          'Content-Length' => original_file.content.size.to_s
+        })
+      raise "URL '#{uri}' returned code #{resp.code}" unless resp.code == "200"
+      original_file.content.rewind if original_file.content.respond_to?(:rewind)
+      JSON.parse(resp.body)[''].rstrip
+    rescue => e
+      logger.error("Error extracting content from #{id}: #{e.inspect}")
+      return nil
+    end
+    def connection_url
+      case
+        when Blacklight.connection_config[:url] then Blacklight.connection_config[:url]
+        when Blacklight.connection_config["url"] then Blacklight.connection_config["url"]
+        when Blacklight.connection_config[:fulltext] then Blacklight.connection_config[:fulltext]["url"]
+        else Blacklight.connection_config[:default]["url"]
+      end
+    end
+  end
+end