rof 0.0.1.pre → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.ruby-version +1 -1
  3. data/.travis.yml +12 -2
  4. data/Gemfile +1 -0
  5. data/README.md +87 -0
  6. data/bin/.ruby-version +1 -0
  7. data/bin/csv_to_rof +26 -0
  8. data/bin/fedora_to_rof +57 -0
  9. data/bin/osf_to_rof +40 -0
  10. data/bin/rof +78 -0
  11. data/bulk-ingest.md +242 -0
  12. data/labels.md +111 -0
  13. data/lib/rof.rb +20 -1
  14. data/lib/rof/access.rb +57 -0
  15. data/lib/rof/cli.rb +122 -0
  16. data/lib/rof/collection.rb +109 -0
  17. data/lib/rof/compare_rof.rb +92 -0
  18. data/lib/rof/filters/bendo.rb +33 -0
  19. data/lib/rof/filters/date_stamp.rb +36 -0
  20. data/lib/rof/filters/file_to_url.rb +27 -0
  21. data/lib/rof/filters/label.rb +153 -0
  22. data/lib/rof/filters/work.rb +111 -0
  23. data/lib/rof/get_from_fedora.rb +196 -0
  24. data/lib/rof/ingest.rb +204 -0
  25. data/lib/rof/ingesters/rels_ext_ingester.rb +78 -0
  26. data/lib/rof/ingesters/rights_metadata_ingester.rb +68 -0
  27. data/lib/rof/osf_context.rb +19 -0
  28. data/lib/rof/osf_to_rof.rb +122 -0
  29. data/lib/rof/rdf_context.rb +36 -0
  30. data/lib/rof/translate_csv.rb +112 -0
  31. data/lib/rof/utility.rb +84 -0
  32. data/lib/rof/version.rb +2 -2
  33. data/rof.gemspec +17 -0
  34. data/spec/fixtures/a.json +4 -0
  35. data/spec/fixtures/label.json +20 -0
  36. data/spec/fixtures/osf/b6psa.tar.gz +0 -0
  37. data/spec/fixtures/rof/dev0012829m.rof +45 -0
  38. data/spec/fixtures/vcr_tests/fedora_to_rof1.yml +5274 -0
  39. data/spec/fixtures/vecnet-citation.json +73 -0
  40. data/spec/lib/rof/access_spec.rb +36 -0
  41. data/spec/lib/rof/cli_spec.rb +66 -0
  42. data/spec/lib/rof/collection_spec.rb +90 -0
  43. data/spec/lib/rof/compare_rof_spec.rb +263 -0
  44. data/spec/lib/rof/filters/date_stamp_spec.rb +90 -0
  45. data/spec/lib/rof/filters/file_to_url_spec.rb +70 -0
  46. data/spec/lib/rof/filters/label_spec.rb +94 -0
  47. data/spec/lib/rof/filters/work_spec.rb +87 -0
  48. data/spec/lib/rof/ingest_spec.rb +117 -0
  49. data/spec/lib/rof/ingesters/rels_ext_ingester_spec.rb +62 -0
  50. data/spec/lib/rof/ingesters/rights_metadata_ingester_spec.rb +114 -0
  51. data/spec/lib/rof/osf_to_rof_spec.rb +76 -0
  52. data/spec/lib/rof/translate_csv_spec.rb +109 -0
  53. data/spec/lib/rof/utility_spec.rb +64 -0
  54. data/spec/lib/rof_spec.rb +14 -0
  55. data/spec/spec_helper.rb +11 -11
  56. metadata +283 -18
@@ -0,0 +1,204 @@
1
+ require 'json/ld'
2
+ require "rof/ingesters/rels_ext_ingester"
3
+ require "rof/ingesters/rights_metadata_ingester"
4
+
5
+ module ROF
6
+ class NotFobjectError < RuntimeError # raised by Ingest when item["type"] is not "fobject"
7
+ end
8
+
9
+ class MissingPidError < RuntimeError # raised by Ingest when the item's pid is not a String
10
+ end
11
+
12
+ class TooManyIdentitiesError < RuntimeError # raised by Ingest when an item carries both "id" and "pid"
13
+ end
14
+
15
+ class SourceError < RuntimeError # raised by ingest_datastream for conflicting or non-String datastream sources
16
+ end
17
+
18
# Ingest a single ROF item into fedora, or update it when it already exists.
#
# item         - Hash describing the object; item["type"] must be "fobject".
#                "id" is accepted as an alias for "pid" and is copied into
#                item["pid"] in place.
# fedora       - repository handle responding to find_or_initialize (the
#                original notes describe it as a Rubydora::Repository). When
#                nil nothing is saved and the item is only checked for format.
# search_paths - Array of directories searched for "<name>-file" entries.
# bendo        - optional replacement for the "bendo:" prefix in datastream URLs.
#
# Returns the Array of datastream names that were touched.
# Raises NotFobjectError, TooManyIdentitiesError or MissingPidError when the
# item is malformed; errors from the individual ingest steps propagate.
def self.Ingest(item, fedora=nil, search_paths=[], bendo=nil)
  raise NotFobjectError unless item["type"] == "fobject"
  raise TooManyIdentitiesError if item.key?("id") && item.key?("pid")
  item["pid"] = item["id"] unless item.key?("pid")
  raise MissingPidError unless item["pid"].is_a?(String)

  models = string_nil_to_array(item["model"]) +
           string_nil_to_array(item["af-model"]).map { |name| af_model_name(name) }

  # Fetch the existing fedora object, or create it.
  # The models are not set through Rubydora here: the addRelationship API is
  # broken in Fedora 3.6.x, so they are written into RELS-EXT below instead.
  doc = nil
  if fedora
    doc = fedora.find_or_initialize(item["pid"])
    # the document apparently has to be saved before datastreams can be added
    doc.save
  end

  touched = []

  # Only rewrite RELS-EXT when the item supplies one or there is a model to set.
  if item.key?("rels-ext") || !models.empty?
    update_rels_ext(models, item, doc)
    touched << "rels-ext"
  end

  item.each_key do |key|
    case key
    when "rights"
      ingest_rights_metadata(item, doc)
      touched << "rightsMetadata"
    when "metadata"
      ingest_ld_metadata(item, doc)
      touched << "descMetadata"
    when "type", "pid", "model", "id", "af-model", "rels-ext", "collections"
      # bookkeeping fields, not datastreams
    when /\A(.+)-file\Z/, /\A(.+)-meta\Z/, /\A(.+)\Z/
      # "<name>", "<name>-file" and "<name>-meta" all describe datastream
      # <name>; it is ingested once, on the first of those keys encountered.
      dsname = Regexp.last_match(1)
      unless touched.include?(dsname)
        ingest_datastream(dsname, item, doc, search_paths, bendo)
        touched << dsname
      end
    end
  end

  touched
end
78
+
79
# Create or update one datastream of an item.
#
# dsname       - datastream name. Its sources are read from item[dsname]
#                (inline String content), item["<dsname>-file"] (a file to
#                open) and item["<dsname>-meta"] (a Hash of options:
#                "mime-type", "label", "versionable", "control-group", "URL").
# item         - the ROF item Hash.
# fdoc         - fedora document indexed by datastream name, or nil to only
#                check the item without saving anything.
# search_paths - directories searched for the "<dsname>-file" file.
# bendo        - when given, replaces the first "bendo:" in the meta URL.
#
# Returns the result of saving the datastream, or nil when fdoc is nil.
# Raises SourceError when the sources conflict, when inline content is not a
# String, or when "<dsname>-meta" is present but is not hash-like.
# Errors from opening the "<dsname>-file" file propagate.
def self.ingest_datastream(dsname, item, fdoc, search_paths, bendo)
  ds_content = item[dsname]
  ds_filename = item["#{dsname}-file"]
  ds_meta = item["#{dsname}-meta"]
  if ds_filename && ds_content
    raise SourceError.new("Both #{dsname} and #{dsname}-file are present.")
  end
  if ds_content && !ds_content.is_a?(String)
    raise SourceError.new("Content for #{dsname} is not a string.")
  end
  # A malformed meta entry used to reach Hash#merge! below and surface as a
  # bare TypeError; report it like every other problem with the source item.
  if ds_meta && !ds_meta.respond_to?(:to_hash)
    raise SourceError.new("Metadata for #{dsname} is not a hash.")
  end

  # A URL without content or file makes this an "R" control-group datastream;
  # a URL together with content or a file is an error.
  ds_url = ds_meta["URL"] if ds_meta.is_a?(Hash)
  if ds_url && ds_content
    raise SourceError.new("Both #{ds_url} and #{dsname} are present.")
  end
  if ds_url && ds_filename
    raise SourceError.new("Both #{ds_url} and #{dsname}-file are present.")
  end

  md = {"mime-type" => "text/plain",
        "label" => "",
        "versionable" => true,
        "control-group" => "M",
       }
  md.merge!(ds_meta) if ds_meta

  if ds_url
    md["control-group"] = "R"
    # When a bendo server is supplied the URL is assumed to look like
    # "bendo:/item/<item#>/<item name>" and "bendo:" is swapped for the server
    # name; without one the URL is used as given.
    md["URL"] = md["URL"].sub("bendo:", bendo) if bendo
  end

  # The same path is followed whether or not fdoc is nil, so that a check-only
  # run exercises as much of the logic as possible (including opening the file).
  ds = nil
  if fdoc
    ds = fdoc[dsname]
    ds.controlGroup = md["control-group"]
    ds.dsLabel = md["label"]
    ds.versionable = md["versionable"]
    ds.mimeType = md["mime-type"]
    ds.dsLocation = md["URL"] if md["URL"]
  end
  need_close = false
  if ds_filename
    ds_content = find_file_and_open(ds_filename, search_paths, "rb")
    need_close = true
  end
  if ds
    ds.content = ds_content if ds_content
    ds.save
  end
ensure
  # only close handles opened here, never caller-supplied inline content
  ds_content.close if ds_content && need_close
end
145
+
146
# Write the item's "rights" section by delegating to
# Ingesters::RightsMetadataIngester; fdoc may be nil for a check-only run.
# Returns whatever the ingester returns.
def self.ingest_rights_metadata(item, fdoc)
  attributes = { item: item, fedora_document: fdoc }
  Ingesters::RightsMetadataIngester.call(**attributes)
end
149
+
150
# Serialize item['metadata'] (a JSON-LD Hash) to N-Triples and, when fdoc is
# given, store it as the descMetadata datastream with mime type text/plain.
#
# Unless the metadata is an "@graph" document, a missing "@id" is filled in
# with "info:fedora/<pid>" directly on the item's metadata Hash.
#
# Returns the N-Triples String, tagged as UTF-8.
def self.ingest_ld_metadata(item, fdoc)
  metadata = item['metadata']
  # json-ld sometimes produces @graph structures when converting from fedora
  # to ROF; those get no top-level id.
  metadata["@id"] ||= "info:fedora/#{item['pid']}" unless metadata.key?("@graph")

  triples = (RDF::Graph.new << JSON::LD::API.toRdf(metadata)).dump(:ntriples)
  # The rof file is read as utf-8 but the RDF gem seems to hand the text back
  # in the default encoding, so relabel it.
  triples.force_encoding('UTF-8')

  if fdoc
    datastream = fdoc['descMetadata']
    datastream.mimeType = "text/plain"
    datastream.content = triples
    datastream.save
  end
  triples
end
170
+
171
# Write the RELS-EXT datastream for item by delegating to
# Ingesters::RelsExtIngester; fdoc may be nil for a check-only run.
# Returns whatever the ingester returns.
def self.update_rels_ext(models, item, fdoc)
  attributes = { models: models, item: item, fedora_document: fdoc }
  Ingesters::RelsExtIngester.call(**attributes)
end
174
+
175
# Open fname, looking for it in each directory of search_path in order.
#
# fname       - file name; a name starting with "/" is opened directly and
#               search_path is ignored.
# search_path - Array of directory names. With an empty array no relative
#               file can be found.
# flags       - mode passed to File.open.
#
# Returns the opened File; the caller is responsible for closing it.
# Raises Errno::ENOENT when the file is not found in any directory.
def self.find_file_and_open(fname, search_path, flags)
  return File.open(fname, flags) if fname[0] == "/"

  search_path.each do |dir|
    begin
      return File.open(File.join(dir, fname), flags)
    rescue Errno::ENOENT
      # not in this directory; try the next one
      next
    end
  end
  raise Errno::ENOENT.new(fname)
end
194
+
195
# Returns the "info:fedora/afmodel:" URI for the given af-model name.
def self.af_model_name(model)
  'info:fedora/afmodel:' + model.to_s
end
198
+
199
# Normalise a value to an Array: nil becomes [], an Array is returned as is
# (the same object), and anything else is wrapped in a one-element Array.
def self.string_nil_to_array(x)
  case x
  when nil then []
  when Array then x
  else [x]
  end
end
204
+ end
@@ -0,0 +1,78 @@
1
+ require 'rdf'
2
+ require 'json/ld'
3
+ require 'rdf/rdfxml'
4
+
5
+ module ROF
6
+ module Ingesters
7
# Builds the RELS-EXT datastream (RDF/XML) for an ROF item and, when a fedora
# document is supplied, saves it there.
class RelsExtIngester
  # Keys of the rels-ext hash that are not relations to other objects.
  RESERVED_KEYS = ["@context", "@id", "hasModel"].freeze

  # Convenience entry point: RelsExtIngester.call(models: ..., item: ...,
  # fedora_document: ...). Returns the generated RDF/XML.
  def self.call(attributes)
    new(attributes).call
  end

  # models is the list of fedora content models this item has
  # item   is the hash of the ROF item
  # fdoc   is the optional fedora document to save to
  # pid    is the identifier of this item, read from item['pid']
  attr_reader :models, :item, :fdoc, :pid

  # attributes - Hash with required keys :models and :item (which must contain
  #              'pid') and the optional key :fedora_document.
  # Raises KeyError when a required key is missing.
  def initialize(attributes = {})
    @models = attributes.fetch(:models)
    @item = attributes.fetch(:item)
    @pid = item.fetch('pid')
    @fdoc = attributes.fetch(:fedora_document, nil)
  end

  # Builds the RELS-EXT content, saves it when a fedora document was given,
  # and returns the content.
  def call
    content = build_content
    persist(content)
    content
  end

  private

  def rels_ext
    item.fetch('rels-ext', {})
  end

  # RELS-EXT is built as a json-ld object and dumped to RDF/XML. This works
  # around the addRelationship bug in Fedora 3.6.x (FCREPO-1191, FCREPO-1187).
  def build_content
    # Work on a copy so the caller's item['rels-ext'] is left untouched.
    input = rels_ext.dup
    input["@context"] = input.fetch("@context", {}).merge(ROF::RelsExtRefContext)
    input["@id"] = "info:fedora/#{pid}"
    input["hasModel"] = models

    # RELS-EXT should only reference other (internal) fedora objects, so bare
    # identifiers get the "info:fedora/" prefix and are wrapped as {"@id" => …}
    # to make json-ld read them as IRIs rather than strings.
    (input.keys - RESERVED_KEYS).each do |relation|
      input[relation] = as_references(input[relation])
    end

    graph = RDF::Graph.new << JSON::LD::API.toRdf(input)
    graph.dump(:rdfxml)
  end

  # Turns one target or a list of targets into a list of json-ld references.
  # A single non-Array target (a String or an already-built node Hash) is
  # wrapped; non-String entries are passed through unchanged.
  def as_references(targets)
    list = targets.is_a?(Array) ? targets : [targets]
    list.map do |target|
      target.is_a?(String) ? {"@id" => "info:fedora/#{target}"} : target
    end
  end

  # Saves content to the RELS-EXT datastream; a no-op returning true when
  # there is no fedora document.
  def persist(content)
    return true unless fdoc

    ds = fdoc['RELS-EXT']
    ds.content = content
    ds.mimeType = "application/rdf+xml"
    ds.save
  end
end
77
+ end
78
+ end
@@ -0,0 +1,68 @@
1
+ module ROF
2
+ module Ingesters
3
# Builds the rightsMetadata XML datastream from an ROF item's "rights" section
# and, when a fedora document is supplied, saves it there.
class RightsMetadataIngester
  # Characters that must not appear raw in XML text or attribute values.
  XML_ESCAPES = { "&" => "&amp;", "<" => "&lt;", ">" => "&gt;", '"' => "&quot;" }.freeze

  # Convenience entry point: RightsMetadataIngester.call(item: ...,
  # fedora_document: ...).
  def self.call(attributes)
    new(attributes).call
  end

  attr_reader :item, :fdoc

  # attributes - Hash with the required key :item and the optional key
  #              :fedora_document.
  # Raises KeyError when :item is missing.
  def initialize(attributes = {})
    @item = attributes.fetch(:item)
    @fdoc = attributes.fetch(:fedora_document, nil)
  end

  # Builds the XML from item["rights"] and saves it to the rightsMetadata
  # datastream when a fedora document was given.
  #
  # Recognised keys of the rights hash: "discover", "read", "edit" (people),
  # "discover-groups", "read-groups", "edit-groups" (groups) and
  # "embargo-date".
  #
  # Returns the XML String, or nil when the item has no "rights" entry.
  def call
    rights = item["rights"]
    return if rights.nil?

    # The document is still assembled by hand rather than with an xml engine,
    # so every interpolated value goes through escape_xml.
    content = %Q{<rightsMetadata xmlns="http://hydra-collab.stanford.edu/schemas/rightsMetadata/v1" version="0.1">\n}
    # TODO(dbrower): Does the copyright need to be exposed in the rof?
    content << %Q{ <copyright>\n <human type="title"/>\n <human type="description"/>\n <machine type="uri"/>\n </copyright>\n}
    %w[discover read edit].each do |section|
      content << format_rights_section(section, rights[section], rights["#{section}-groups"])
    end
    # TODO(dbrower): expose embargo information
    content << %Q{ <embargo>\n <human/>\n}
    if rights["embargo-date"]
      content << %Q{ <machine>\n}
      content << %Q{ <date>#{escape_xml(rights["embargo-date"])}</date>\n}
      content << %Q{ </machine>\n}
    else
      content << %Q{ <machine/>\n}
    end
    content << %Q{ </embargo>\n}
    content << %Q{</rightsMetadata>\n}

    if fdoc
      ds = fdoc['rightsMetadata']
      ds.mimeType = 'text/xml'
      ds.content = content
      ds.save
    end
    content
  end

  # Formats one <access> element.
  #
  # section_name - value of the type attribute ("discover", "read" or "edit").
  # people       - a String, an Array of names, or nil.
  # groups       - a String, an Array of group names, or nil.
  #
  # Returns the XML fragment as a String. When neither people nor groups is
  # given the section contains an empty <machine/> element.
  def format_rights_section(section_name, people, groups)
    people = [people] if people.is_a? String
    groups = [groups] if groups.is_a? String
    result = " <access type=\"#{escape_xml(section_name)}\">\n <human/>\n"
    if people || groups
      result << " <machine>\n"
      (people || []).each do |person|
        result << " <person>#{escape_xml(person)}</person>\n"
      end
      (groups || []).each do |group|
        result << " <group>#{escape_xml(group)}</group>\n"
      end
      result << " </machine>\n"
    else
      result << " <machine/>\n"
    end
    result << " </access>\n"
    result
  end

  private

  # Escapes markup characters so a name such as "R&D" cannot break the
  # generated document.
  def escape_xml(value)
    value.to_s.gsub(/[&<>"]/, XML_ESCAPES)
  end
end
67
+ end
68
+ end
@@ -0,0 +1,19 @@
1
+ module ROF
2
+
3
# Namespace prefixes used for OSF archive data (passed as `prefixes:` when the
# Turtle files are read).
OsfPrefixList = {
  'dcterms' => 'http://purl.org/dc/terms/',
  'osf-model' => 'http://www.dataconservancy.org/osf-business-object-model#'
}.freeze

# Short field name => full predicate IRI, derived from the prefixes above.
# Each row is [short name, prefix, term]; note that 'dc:subject' is looked up
# under the osf-model hasTag predicate.
OsfToNDMap = [
  ['dc:created',      'dcterms',   'created'],
  ['dc:description',  'dcterms',   'description'],
  ['dc:title',        'dcterms',   'title'],
  ['dc:subject',      'osf-model', 'hasTag'],
  ['isPublic',        'osf-model', 'isPublic'],
  ['hasContributor',  'osf-model', 'hasContributor'],
  ['isBibliographic', 'osf-model', 'isBibliographic'],
  ['hasFullName',     'osf-model', 'hasFullName'],
  ['hasUser',         'osf-model', 'hasUser']
].each_with_object({}) { |(name, prefix, term), map|
  map[name] = OsfPrefixList[prefix] + term
}.freeze
19
+ end
@@ -0,0 +1,122 @@
1
+ require 'json'
2
+ require 'zlib'
3
+ require 'rubygems/package'
4
+ require 'rdf/turtle'
5
+ require 'rof/osf_context'
6
+ require 'rof/rdf_context'
7
+ require 'rof/utility'
8
+
9
+ module ROF
10
# Transforms an OSF archive package (tar.gz) into ROF.
# It is called after the get-from-osf task, and before the work-xlat task.
class OsfToRof
  # Convert an OSF archive tar.gz to ROF.
  #
  # config       - Hash; 'package_dir' is the directory holding the package.
  # osf_projects - Hash describing one project; 'project_identifier',
  #                'owner', 'administrative_unit' and 'affiliation' are read.
  #
  # Returns a one-element Array holding the archive record. For backward
  # compatibility a nil osf_projects yields an empty Hash, not an Array.
  def self.osf_to_rof(config, osf_projects = nil)
    @osf_map = ROF::OsfToNDMap
    return {} if osf_projects.nil?

    ttl_data = ttl_from_targz(config, osf_projects,
                              osf_projects['project_identifier'] + '.ttl')
    [build_archive_record(config, osf_projects, ttl_data)]
  end

  # Short-name => predicate-IRI table used by all the map_* helpers.
  # Initialised on first use so the helpers also work when they are called
  # without going through osf_to_rof first.
  def self.osf_map
    @osf_map ||= ROF::OsfToNDMap
  end
  private_class_method :osf_map

  # Reads a ttl file and returns it as expanded JSON-LD that we can parse.
  def self.fetch_from_ttl(ttl_file)
    graph = RDF::Turtle::Reader.open(ttl_file,
                                     prefixes: ROF::OsfPrefixList.dup)
    JSON::LD::API.fromRdf(graph)
  end

  # Extracts the given ttl file from the JHU tar.gz package and parses it.
  # The file is assumed to live under <project id>/data/obj/root inside the
  # archive, and the archive under config['package_dir'].
  #
  # Returns an Array; the elements after the first are the contributor(s).
  def self.ttl_from_targz(config, this_project, ttl_filename)
    id = this_project['project_identifier']
    ttl_path = File.join(id, 'data/obj/root', ttl_filename)
    ROF::Utility.file_from_targz(File.join(config['package_dir'], id + '.tar.gz'),
                                 ttl_path)
    fetch_from_ttl(File.join(config['package_dir'], ttl_path))
  end

  # Maps RELS-EXT: currently only the reference context is supplied.
  def self.map_rels_ext(_ttl_data)
    { '@context' => ROF::RelsExtRefContext.dup }
  end

  # Builds the descriptive metadata Hash.
  #
  # config   - task configuration, passed on to map_creator.
  # project  - project Hash passed from the UI.
  # ttl_data - parsed project ttl; element 0 is the project itself.
  def self.map_metadata(config, project, ttl_data)
    root = ttl_data[0]
    metadata = {}
    metadata['@context'] = ROF::RdfContext.dup
    # metadata derived from the project ttl file
    metadata['dc:created'] =
      Time.iso8601(root[osf_map['dc:created']][0]['@value']).to_date.iso8601 + "Z"
    metadata['dc:title'] = root[osf_map['dc:title']][0]['@value']
    metadata['dc:description'] = root[osf_map['dc:description']][0]['@value']
    metadata['dc:subject'] = map_subject(root)
    # metadata derived from osf_projects data, passed from the UI
    metadata['dc:source'] = "https://osf.io/" + project['project_identifier']
    metadata['dc:creator#adminstrative_unit'] = project['administrative_unit']
    metadata['dc:creator#affiliation'] = project['affiliation']
    metadata['dc:creator'] = map_creator(config, project, ttl_data)
    metadata
  end

  # Constructs the OsfArchive record from ttl_data, data from the UI form,
  # and task config data.
  def self.build_archive_record(config, this_project, ttl_data)
    {
      'owner' => this_project['owner'],
      'type' => 'OsfArchive',
      'rights' => map_rights(ttl_data[0]),
      'rels-ext' => map_rels_ext(ttl_data[0]),
      'metadata' => map_metadata(config, this_project, ttl_data),
      'files' => [this_project['project_identifier'] + '.tar.gz']
    }
  end

  # Returns the first subject (tag) value of the project node, or '' when the
  # node has none.
  def self.map_subject(ttl_data)
    subject_key = osf_map['dc:subject']
    return '' unless ttl_data.key?(subject_key)

    ttl_data[subject_key][0]['@value']
  end

  # Figures out the rights: a public project is readable by the 'public'
  # group, anything else gets an empty rights Hash.
  def self.map_rights(ttl_data)
    rights = {}
    if ttl_data[osf_map['isPublic']][0]['@value'] == 'true'
      rights['read-groups'] = ['public']
    end
    rights
  end

  # Sets the creator; needs to read another ttl file for the User data.
  # Only the first contributor listed on the project is looked at, and it is
  # used only when its isBibliographic flag is 'true'.
  #
  # Returns the user's full name, or '' when there is no such contributor.
  def self.map_creator(config, project, ttl_data)
    creator = ''
    contributor = ttl_data[0][osf_map['hasContributor']][0]['@id']
    ttl_data.each do |item|
      next unless item['@id'] == contributor
      next unless item[osf_map['isBibliographic']][0]['@value'] == 'true'

      creator = map_user_from_ttl(config, project,
                                  item[osf_map['hasUser']][0]['@id'])
    end
    creator
  end

  # Reads the user's ttl file and extracts the user's full name.
  def self.map_user_from_ttl(config, project, file_subpath)
    ttl_data = ttl_from_targz(config, project, File.basename(file_subpath))
    ttl_data[0][osf_map['hasFullName']][0]['@value']
  end
end
122
+ end