RubyGems - rof - Versions diffs - 0.0.1.pre → 1.0.4 - Mend

rof 0.0.1.pre → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

checksums.yaml +4 -4
data/.ruby-version +1 -1
data/.travis.yml +12 -2
data/Gemfile +1 -0
data/README.md +87 -0
data/bin/.ruby-version +1 -0
data/bin/csv_to_rof +26 -0
data/bin/fedora_to_rof +57 -0
data/bin/osf_to_rof +40 -0
data/bin/rof +78 -0
data/bulk-ingest.md +242 -0
data/labels.md +111 -0
data/lib/rof.rb +20 -1
data/lib/rof/access.rb +57 -0
data/lib/rof/cli.rb +122 -0
data/lib/rof/collection.rb +109 -0
data/lib/rof/compare_rof.rb +92 -0
data/lib/rof/filters/bendo.rb +33 -0
data/lib/rof/filters/date_stamp.rb +36 -0
data/lib/rof/filters/file_to_url.rb +27 -0
data/lib/rof/filters/label.rb +153 -0
data/lib/rof/filters/work.rb +111 -0
data/lib/rof/get_from_fedora.rb +196 -0
data/lib/rof/ingest.rb +204 -0
data/lib/rof/ingesters/rels_ext_ingester.rb +78 -0
data/lib/rof/ingesters/rights_metadata_ingester.rb +68 -0
data/lib/rof/osf_context.rb +19 -0
data/lib/rof/osf_to_rof.rb +122 -0
data/lib/rof/rdf_context.rb +36 -0
data/lib/rof/translate_csv.rb +112 -0
data/lib/rof/utility.rb +84 -0
data/lib/rof/version.rb +2 -2
data/rof.gemspec +17 -0
data/spec/fixtures/a.json +4 -0
data/spec/fixtures/label.json +20 -0
data/spec/fixtures/osf/b6psa.tar.gz +0 -0
data/spec/fixtures/rof/dev0012829m.rof +45 -0
data/spec/fixtures/vcr_tests/fedora_to_rof1.yml +5274 -0
data/spec/fixtures/vecnet-citation.json +73 -0
data/spec/lib/rof/access_spec.rb +36 -0
data/spec/lib/rof/cli_spec.rb +66 -0
data/spec/lib/rof/collection_spec.rb +90 -0
data/spec/lib/rof/compare_rof_spec.rb +263 -0
data/spec/lib/rof/filters/date_stamp_spec.rb +90 -0
data/spec/lib/rof/filters/file_to_url_spec.rb +70 -0
data/spec/lib/rof/filters/label_spec.rb +94 -0
data/spec/lib/rof/filters/work_spec.rb +87 -0
data/spec/lib/rof/ingest_spec.rb +117 -0
data/spec/lib/rof/ingesters/rels_ext_ingester_spec.rb +62 -0
data/spec/lib/rof/ingesters/rights_metadata_ingester_spec.rb +114 -0
data/spec/lib/rof/osf_to_rof_spec.rb +76 -0
data/spec/lib/rof/translate_csv_spec.rb +109 -0
data/spec/lib/rof/utility_spec.rb +64 -0
data/spec/lib/rof_spec.rb +14 -0
data/spec/spec_helper.rb +11 -11
metadata +283 -18

data/lib/rof/ingest.rb ADDED

@@ -0,0 +1,204 @@
+require 'json/ld'
+require "rof/ingesters/rels_ext_ingester"
+require "rof/ingesters/rights_metadata_ingester"
+module ROF
+  class NotFobjectError < RuntimeError
+  end
+  class MissingPidError < RuntimeError
+  end
+  class TooManyIdentitiesError < RuntimeError
+  end
+  class SourceError < RuntimeError
+  end
+  # Ingest or update item in fedora
+  # if fedora is nil, then we verify that item is in the proper format
+  # Otherwise fedora is a Rubydora::Reporitory object (for now...)
+  # Returns a list of ingested datastreams, if everything is okay.
+  # Otherwise raises an exception depending on the error.
+  def self.Ingest(item, fedora=nil, search_paths=[], bendo=nil)
+    raise NotFobjectError if item["type"] != "fobject"
+    raise TooManyIdentitiesError if item.key?("id") && item.key?("pid")
+    item["pid"] = item["id"] unless item.key?("pid")
+    raise MissingPidError unless item["pid"].is_a? String
+    models = string_nil_to_array(item["model"])
+    models += string_nil_to_array(item["af-model"]).map { |m| af_model_name(m) }
+    # does it already exist in fedora? Create it otherwise
+    doc = nil
+    if fedora
+      doc = fedora.find_or_initialize(item["pid"])
+      # the addRelationship API is broken in Fedora 3.6.x.
+      # Since the `models` method in Rubydora uses that API, it
+      # also doesn't work. ActiveFedora is not affected since it
+      # serializes to RELS-EXT itself, bypassing addRelationship endpoint.
+      # models.each do |m|
+      #   doc.models << m unless doc.models.include?(m)
+      # end
+      # it seems like we need to save the document before adding datastreams?!?
+      doc.save
+    end
+    ds_touched = []
+    # update rels-ext if there is either a rels-ext present or if there
+    # is a model to set. Otherwise, don't touch it!
+    if (item.has_key?("rels-ext") || !models.empty?)
+      update_rels_ext(models, item, doc)
+      ds_touched << "rels-ext"
+    end
+    # now handle all the other datastreams
+    item.each do |key,value|
+      case key
+      # fields having special treatement
+      when "rights"
+        self.ingest_rights_metadata(item, doc)
+        ds_touched << "rightsMetadata"
+      when "metadata"
+        self.ingest_ld_metadata(item, doc)
+        ds_touched << "descMetadata"
+      # ignore these fields
+      when "type", "pid", "model", "id", "af-model", "rels-ext", "collections"
+      # datastream fields
+      when /\A(.+)-file\Z/, /\A(.+)-meta\Z/, /\A(.+)\Z/
+        # ingest a datastream
+        dsname = $1
+        next if ds_touched.include?(dsname)
+        self.ingest_datastream(dsname, item, doc, search_paths, bendo)
+        ds_touched << dsname
+      end
+    end
+    return ds_touched
+  end
+  # Validate one datastream of an ROF item and, when a fedora document is
+  # given, store it.
+  #
+  # dsname       - name of the datastream. Inline content is read from
+  #                item[dsname], a file name from item["<dsname>-file"] and
+  #                options from item["<dsname>-meta"].
+  # item         - the ROF item hash.
+  # fdoc         - fedora document to write to; when nil nothing is saved, but
+  #                the item is still validated and a "-file" entry is still
+  #                opened (so a missing file is reported).
+  # search_paths - directories searched for the "-file" entry.
+  # bendo        - when given, replaces the "bendo:" prefix of the "URL" option.
+  #
+  # Returns the result of saving the datastream, or nil when fdoc is nil.
+  # Raises SourceError when the sources conflict, when inline content is not a
+  # String, or when the "-meta" entry is not a Hash. Raises Errno::ENOENT when
+  # the "-file" entry cannot be found.
+  def self.ingest_datastream(dsname, item, fdoc, search_paths, bendo)
+    # set before anything can raise so the ensure clause sees a defined value
+    need_close = false
+    # What kind of content is there?
+    ds_content = item[dsname]
+    ds_filename = item["#{dsname}-file"]
+    ds_meta = item["#{dsname}-meta"]
+    if ds_filename && ds_content
+      raise SourceError.new("Both #{dsname} and #{dsname}-file are present.")
+    end
+    if ds_content && !ds_content.is_a?(String)
+      raise SourceError.new("Content for #{dsname} is not a string.")
+    end
+    # Report a malformed -meta entry like every other malformed source,
+    # rather than letting merge! fail with a TypeError further down.
+    if ds_meta && !ds_meta.is_a?(Hash)
+      raise SourceError.new("Metadata for #{dsname} is not a hash.")
+    end
+    # A URL, without content or file, is an R datastream
+    # A URL, with content or file, raises an error
+    ds_url = ds_meta["URL"] if ds_meta
+    if ds_url && ds_content
+      raise SourceError.new("Both #{ds_url} and #{dsname} are present.")
+    end
+    if ds_url && ds_filename
+      raise SourceError.new("Both #{ds_url} and #{dsname}-file are present.")
+    end
+    # defaults, overridden by whatever the -meta entry supplies
+    md = {"mime-type" => "text/plain",
+          "label" => "",
+          "versionable" => true,
+          "control-group" => "M",
+    }
+    md.merge!(ds_meta) if ds_meta
+    if ds_url
+      md["control-group"] = "R"
+      # If the bendo server was passed in the command line, assume that the URL is in
+      # the form "bendo:/item/<item#>/<item name> and substitute bendo: w/ the server name
+      # if no bendo provided, use whatever's there.
+      md["URL"] = md["URL"].sub("bendo:", bendo) if bendo
+    end
+    # NOTE(dbrower): this could be refactored a bit. I was trying to keep the
+    # same path for whether fdoc is nil or not as much as possible.
+    ds = nil
+    if fdoc
+      ds = fdoc[dsname]
+      # TODO(dbrower): maybe verify these options to be within bounds?
+      ds.controlGroup = md["control-group"]
+      ds.dsLabel = md["label"]
+      ds.versionable = md["versionable"]
+      ds.mimeType = md["mime-type"]
+      ds.dsLocation = md["URL"] if md["URL"]
+    end
+    if ds_filename
+      ds_content = self.find_file_and_open(ds_filename, search_paths, "rb")
+      need_close = true
+    end
+    if ds
+      ds.content = ds_content if ds_content
+      ds.save
+    end
+  ensure
+    # only close handles opened here; inline String content is left alone
+    ds_content.close if ds_content && need_close
+  end
+  def self.ingest_rights_metadata(item, fdoc)
+    Ingesters::RightsMetadataIngester.call(item: item, fedora_document: fdoc)
+  end
+  def self.ingest_ld_metadata(item, fdoc)
+    input = item['metadata']
+    # sometimes json-ld generates @graph structures when converting from fedora to ROF.
+    # in that case, don't provide an id key
+    if !input.has_key?("@graph")
+      input["@id"] = "info:fedora/#{item['pid']}" unless input["@id"]
+    end
+    graph = RDF::Graph.new << JSON::LD::API.toRdf(input)
+    content = graph.dump(:ntriples)
+    # we read the rof file as utf-8. the RDF gem seems to convert it back to
+    # the default encoding. so fix it.
+    content.force_encoding('UTF-8')
+    if fdoc
+      ds = fdoc['descMetadata']
+      ds.mimeType = "text/plain"
+      ds.content = content
+      ds.save
+    end
+    content
+  end
+  def self.update_rels_ext(models, item, fdoc)
+    Ingesters::RelsExtIngester.call(models: models, item: item, fedora_document: fdoc)
+  end
+  # find fname by looking through directories in search_path,
+  # an array of strings.
+  # Will not find any files if search_path is empty.
+  # Raises Errno::ENOENT if no file is found, otherwise
+  # opens the file and returns a fd
+  def self.find_file_and_open(fname, search_path, flags)
+    # don't search if file has an absolute path
+    if fname[0] == "/"
+      return File.open(fname, flags)
+    end
+    search_path.each do |path|
+      begin
+        f = File.open(File.join(path,fname), flags)
+        return f
+      rescue Errno::ENOENT
+      end
+    end
+    raise Errno::ENOENT.new(fname)
+  end
+  def self.af_model_name(model)
+    "info:fedora/afmodel:#{model}"
+  end
+  def self.string_nil_to_array(x)
+    return [] if x.nil?
+    return [x] unless x.is_a? Array
+    x
+  end
+end

data/lib/rof/ingesters/rels_ext_ingester.rb ADDED

@@ -0,0 +1,78 @@
+require 'rdf'
+require 'json/ld'
+require 'rdf/rdfxml'
+module ROF
+  module Ingesters
+    class RelsExtIngester
+      def self.call(attributes)
+        new(attributes).call
+      end
+      # :models is a list of fedora content models this item has
+      # :item is the hash of the ROF item
+      # :fdoc is an optional fedora document to save to
+      # :pid is the namespaced identifier of this item
+      attr_reader :models, :item, :fdoc, :pid
+      def initialize(attributes = {})
+        @models = attributes.fetch(:models)
+        @item = attributes.fetch(:item)
+        @pid = item.fetch('pid')
+        @fdoc = attributes.fetch(:fedora_document, nil)
+      end
+      def call
+        content = build_content
+        persist(content)
+        content
+      end
+      private
+      def rels_ext
+        item.fetch('rels-ext', {})
+      end
+      def build_content
+        # this is ugly to work around addRelationship bug in 3.6.x
+        # (See bugs FCREPO-1191 and FCREPO-1187)
+        # build up a json-ld object, and then persist that (into XML!)
+        input = rels_ext
+        context = input.fetch("@context", {}).merge(ROF::RelsExtRefContext)
+        input["@context"] = context
+        input["@id"] = "info:fedora/#{pid}"
+        input["hasModel"] = models
+        # RELS-EXT should only contain references to other (internal) fedora
+        # objects. Rewrite them to have prefix "info:fedora/".
+        # Also need to make sure json-ld interprets each of these object
+        # references as an IRI instead of a string.
+        # This is kinda hacky. Is there a better way?
+        input.each do |relation, targets|
+          next if relation == "@context" || relation == "@id" || relation == "hasModel"
+          targets = [targets] if targets.is_a? String
+          input[relation] = targets.map do |target|
+            target.is_a?(String) ? {"@id" => "info:fedora/#{target}"} : target
+          end
+        end
+        graph = RDF::Graph.new << JSON::LD::API.toRdf(input)
+        graph.dump(:rdfxml)
+      end
+      def persist(content)
+        if fdoc
+          ds = fdoc['RELS-EXT']
+          ds.content = content
+          ds.mimeType = "application/rdf+xml"
+          ds.save
+        else
+          true
+        end
+      end
+    end
+  end
+end

data/lib/rof/ingesters/rights_metadata_ingester.rb ADDED

@@ -0,0 +1,68 @@
+module ROF
+  module Ingesters
+    class RightsMetadataIngester
+      def self.call(attributes)
+        new(attributes).call
+      end
+      attr_reader :item, :fdoc
+      def initialize(attributes = {})
+        @item = attributes.fetch(:item)
+        @fdoc = attributes.fetch(:fedora_document, nil)
+      end
+      def call
+        rights = item["rights"]
+        return if rights.nil?
+        #
+        # we really should be building this using an xml engine.
+        #
+        content = %Q{<rightsMetadata xmlns="http://hydra-collab.stanford.edu/schemas/rightsMetadata/v1" version="0.1">\n}
+        # TODO(dbrower): Does the copyright need to be exposed in the rof?
+        content += %Q{  <copyright>\n    <human type="title"/>\n    <human type="description"/>\n    <machine type="uri"/>\n  </copyright>\n}
+        content += format_rights_section("discover", rights["discover"], rights["discover-groups"])
+        content += format_rights_section("read", rights["read"], rights["read-groups"])
+        content += format_rights_section("edit", rights["edit"], rights["edit-groups"])
+        # TODO(dbrower): expose embargo information
+        content += %Q{  <embargo>\n    <human/>\n}
+        if rights["embargo-date"]
+          content += %Q{    <machine>\n}
+          content += %Q{      <date>#{rights["embargo-date"]}</date>\n}
+          content += %Q{    </machine>\n}
+        else
+          content += %Q{    <machine/>\n}
+        end
+        content += %Q{  </embargo>\n}
+        content += %Q{</rightsMetadata>\n}
+        if fdoc
+          ds = fdoc['rightsMetadata']
+          ds.mimeType = 'text/xml'
+          ds.content = content
+          ds.save
+        end
+        content
+      end
+      def format_rights_section(section_name, people, groups)
+        people = [people] if people.is_a? String
+        groups = [groups] if groups.is_a? String
+        result = "  <access type=\"#{section_name}\">\n    <human/>\n"
+        if people || groups
+          result += "    <machine>\n"
+          (people || []).each do |person|
+            result += "      <person>#{person}</person>\n"
+          end
+          (groups || []).each do |group|
+            result += "      <group>#{group}</group>\n"
+          end
+          result += "    </machine>\n"
+        else
+          result += "    <machine/>\n"
+        end
+        result += "  </access>\n"
+        result
+      end
+    end
+  end
+end

data/lib/rof/osf_context.rb ADDED

@@ -0,0 +1,19 @@
+module ROF
+  OsfPrefixList = {
+    'dcterms' => 'http://purl.org/dc/terms/',
+    'osf-model' => 'http://www.dataconservancy.org/osf-business-object-model#'
+  }.freeze
+  OsfToNDMap = {
+    'dc:created' => 'http://purl.org/dc/terms/created',
+    'dc:description' => 'http://purl.org/dc/terms/description',
+    'dc:title' => 'http://purl.org/dc/terms/title',
+    'dc:subject' => 'http://www.dataconservancy.org/osf-business-object-model#hasTag',
+    'isPublic' => 'http://www.dataconservancy.org/osf-business-object-model#isPublic',
+    'hasContributor' => 'http://www.dataconservancy.org/osf-business-object-model#hasContributor',
+    'isBibliographic' => 'http://www.dataconservancy.org/osf-business-object-model#isBibliographic',
+    'hasFullName' => 'http://www.dataconservancy.org/osf-business-object-model#hasFullName',
+    'hasUser' => 'http://www.dataconservancy.org/osf-business-object-model#hasUser'
+  }.freeze
+end

data/lib/rof/osf_to_rof.rb ADDED

@@ -0,0 +1,122 @@
+require 'json'
+require 'zlib'
+require 'rubygems/package'
+require 'rdf/turtle'
+require 'rof/osf_context'
+require 'rof/rdf_context'
+require 'rof/utility'
+module ROF
+  # Class for managing OSF Archive data transformations
+  # It is called after the get-from-osf task, and before the work-xlat task
+  class OsfToRof
+    # Convert Osf Archive tar.gz  to ROF
+    def self.osf_to_rof(config, osf_projects = nil)
+      @osf_map = ROF::OsfToNDMap
+      rof_array = []
+      return {} if osf_projects.nil?
+      this_project = osf_projects
+      ttl_data = ttl_from_targz(config, this_project,
+                                this_project['project_identifier'] + '.ttl')
+      rof_array[0] = build_archive_record(config, this_project, ttl_data)
+      rof_array
+    end
+    # reads a ttl file and makes it a JSON-LD file that we can parse
+    def self.fetch_from_ttl(ttl_file)
+      graph = RDF::Turtle::Reader.open(ttl_file,
+                                       prefixes:  ROF::OsfPrefixList.dup)
+      JSON::LD::API.fromRdf(graph)
+    end
+    # extracts given ttl file from JHU tar.gz package
+    # - assumed to live under data/obj/root
+    def self.ttl_from_targz(config, this_project, ttl_filename)
+      id =  this_project['project_identifier']
+      ttl_path = File.join(id,
+                      'data/obj/root',
+                      ttl_filename)
+      ROF::Utility.file_from_targz(File.join(config['package_dir'], id + '.tar.gz'),
+                                   ttl_path)
+      ttl_data = fetch_from_ttl(File.join(config['package_dir'], ttl_path))
+      # this is an array- the addition elements are the contributor(s)
+      ttl_data
+    end
+    # Maps RELS-EXT
+    def self.map_rels_ext(_ttl_data)
+      rels_ext = {}
+      rels_ext['@context'] = ROF::RelsExtRefContext.dup
+      rels_ext
+    end
+    # sets metadata
+    def self.map_metadata(config, project, ttl_data)
+      metadata = {}
+      metadata['@context'] = ROF::RdfContext.dup
+      # metdata derived from project ttl file
+      metadata['dc:created'] = Time.iso8601(ttl_data[0][@osf_map['dc:created']][0]['@value']).to_date.iso8601 + "Z"
+      metadata['dc:title'] = ttl_data[0][@osf_map['dc:title']][0]['@value']
+      metadata['dc:description'] =
+        ttl_data[0][@osf_map['dc:description']][0]['@value']
+      metadata['dc:subject'] = map_subject(ttl_data[0])
+      # metadata derived from osf_projects data, passed from UI
+      metadata['dc:source'] = "https://osf.io/" +project['project_identifier']
+      metadata['dc:creator#adminstrative_unit'] = project['administrative_unit']
+      metadata['dc:creator#affiliation'] = project['affiliation']
+      metadata['dc:creator'] = map_creator(config, project, ttl_data)
+      metadata
+    end
+    # Constructs OsfArchive Record from ttl_data, data from the UI form,
+    # and task config data
+    def self.build_archive_record(config, this_project, ttl_data)
+      this_rof = {}
+      this_rof['owner'] = this_project['owner']
+      this_rof['type'] = 'OsfArchive'
+      this_rof['rights'] = map_rights(ttl_data[0])
+      this_rof['rels-ext'] = map_rels_ext(ttl_data[0])
+      this_rof['metadata'] = map_metadata(config, this_project, ttl_data)
+      this_rof['files'] = [this_project['project_identifier'] + '.tar.gz']
+      this_rof
+    end
+    # sets subject
+    def self.map_subject(ttl_data)
+      if ttl_data.key?(@osf_map['dc:subject'])
+        return ttl_data[@osf_map['dc:subject']][0]['@value']
+      end
+      ''
+    end
+    # figures out the rights
+    def self.map_rights(ttl_data)
+      rights = {}
+      if ttl_data[@osf_map['isPublic']][0]['@value'] == 'true'
+        rights['read-groups'] = ['public']
+      end
+      rights
+    end
+    # sets the creator- needs to read another ttl for the User data
+    # only contrubutors with isBibliographic true are considered
+    def self.map_creator(config, project, ttl_data)
+      creator = ''
+      contributor = ttl_data[0][@osf_map['hasContributor']][0]['@id']
+      ttl_data.each do |item|
+        next unless item['@id'] == contributor
+        if item[@osf_map['isBibliographic']][0]['@value'] == 'true'
+          creator = map_user_from_ttl(config, project,
+                                      item[@osf_map['hasUser']][0]['@id'])
+        end
+      end
+      creator
+    end
+    # read user ttl file, extract User's full name
+    def self.map_user_from_ttl(config, project, file_subpath)
+      ttl_data = ttl_from_targz(config, project, File.basename(file_subpath))
+      ttl_data[0][@osf_map['hasFullName']][0]['@value']
+    end
+  end
+end