rof 0.0.1.pre → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.ruby-version +1 -1
  3. data/.travis.yml +12 -2
  4. data/Gemfile +1 -0
  5. data/README.md +87 -0
  6. data/bin/.ruby-version +1 -0
  7. data/bin/csv_to_rof +26 -0
  8. data/bin/fedora_to_rof +57 -0
  9. data/bin/osf_to_rof +40 -0
  10. data/bin/rof +78 -0
  11. data/bulk-ingest.md +242 -0
  12. data/labels.md +111 -0
  13. data/lib/rof.rb +20 -1
  14. data/lib/rof/access.rb +57 -0
  15. data/lib/rof/cli.rb +122 -0
  16. data/lib/rof/collection.rb +109 -0
  17. data/lib/rof/compare_rof.rb +92 -0
  18. data/lib/rof/filters/bendo.rb +33 -0
  19. data/lib/rof/filters/date_stamp.rb +36 -0
  20. data/lib/rof/filters/file_to_url.rb +27 -0
  21. data/lib/rof/filters/label.rb +153 -0
  22. data/lib/rof/filters/work.rb +111 -0
  23. data/lib/rof/get_from_fedora.rb +196 -0
  24. data/lib/rof/ingest.rb +204 -0
  25. data/lib/rof/ingesters/rels_ext_ingester.rb +78 -0
  26. data/lib/rof/ingesters/rights_metadata_ingester.rb +68 -0
  27. data/lib/rof/osf_context.rb +19 -0
  28. data/lib/rof/osf_to_rof.rb +122 -0
  29. data/lib/rof/rdf_context.rb +36 -0
  30. data/lib/rof/translate_csv.rb +112 -0
  31. data/lib/rof/utility.rb +84 -0
  32. data/lib/rof/version.rb +2 -2
  33. data/rof.gemspec +17 -0
  34. data/spec/fixtures/a.json +4 -0
  35. data/spec/fixtures/label.json +20 -0
  36. data/spec/fixtures/osf/b6psa.tar.gz +0 -0
  37. data/spec/fixtures/rof/dev0012829m.rof +45 -0
  38. data/spec/fixtures/vcr_tests/fedora_to_rof1.yml +5274 -0
  39. data/spec/fixtures/vecnet-citation.json +73 -0
  40. data/spec/lib/rof/access_spec.rb +36 -0
  41. data/spec/lib/rof/cli_spec.rb +66 -0
  42. data/spec/lib/rof/collection_spec.rb +90 -0
  43. data/spec/lib/rof/compare_rof_spec.rb +263 -0
  44. data/spec/lib/rof/filters/date_stamp_spec.rb +90 -0
  45. data/spec/lib/rof/filters/file_to_url_spec.rb +70 -0
  46. data/spec/lib/rof/filters/label_spec.rb +94 -0
  47. data/spec/lib/rof/filters/work_spec.rb +87 -0
  48. data/spec/lib/rof/ingest_spec.rb +117 -0
  49. data/spec/lib/rof/ingesters/rels_ext_ingester_spec.rb +62 -0
  50. data/spec/lib/rof/ingesters/rights_metadata_ingester_spec.rb +114 -0
  51. data/spec/lib/rof/osf_to_rof_spec.rb +76 -0
  52. data/spec/lib/rof/translate_csv_spec.rb +109 -0
  53. data/spec/lib/rof/utility_spec.rb +64 -0
  54. data/spec/lib/rof_spec.rb +14 -0
  55. data/spec/spec_helper.rb +11 -11
  56. metadata +283 -18
@@ -0,0 +1,204 @@
1
+ require 'json/ld'
2
+ require "rof/ingesters/rels_ext_ingester"
3
+ require "rof/ingesters/rights_metadata_ingester"
4
+
5
module ROF
  # Raised by ROF.Ingest when the item's "type" is not "fobject".
  class NotFobjectError < RuntimeError
  end

  # Raised by ROF.Ingest when the item has no String pid (or id).
  class MissingPidError < RuntimeError
  end

  # Raised by ROF.Ingest when the item carries both an "id" and a "pid" key.
  class TooManyIdentitiesError < RuntimeError
  end

  # Raised when the fields describing a datastream are contradictory
  # or have the wrong type.
  class SourceError < RuntimeError
  end

  # Ingest or update item in fedora.
  # If fedora is nil, then we only verify that item is in the proper format.
  # Otherwise fedora is a Rubydora::Repository object (for now...)
  #
  # item         - the ROF item hash. NOTE: when it has no "pid" key, one is
  #                added (copied from "id"), so the caller's hash is modified.
  # fedora       - repository object, or nil for a dry run
  # search_paths - directories searched for "<name>-file" datastream files
  # bendo        - optional replacement text for the "bendo:" prefix of
  #                datastream URLs
  #
  # Returns a list of ingested datastreams, if everything is okay.
  # Otherwise raises an exception depending on the error.
  def self.Ingest(item, fedora=nil, search_paths=[], bendo=nil)
    raise NotFobjectError if item["type"] != "fobject"
    raise TooManyIdentitiesError if item.key?("id") && item.key?("pid")
    item["pid"] = item["id"] unless item.key?("pid")
    raise MissingPidError unless item["pid"].is_a? String
    models = string_nil_to_array(item["model"])
    models += string_nil_to_array(item["af-model"]).map { |m| af_model_name(m) }
    # does it already exist in fedora? Create it otherwise
    doc = nil
    if fedora
      doc = fedora.find_or_initialize(item["pid"])
      # the addRelationship API is broken in Fedora 3.6.x.
      # Since the `models` method in Rubydora uses that API, it
      # also doesn't work. ActiveFedora is not affected since it
      # serializes to RELS-EXT itself, bypassing addRelationship endpoint.
      # models.each do |m|
      #   doc.models << m unless doc.models.include?(m)
      # end

      # it seems like we need to save the document before adding datastreams?!?
      doc.save
    end

    ds_touched = []
    # update rels-ext if there is either a rels-ext present or if there
    # is a model to set. Otherwise, don't touch it!
    if item.key?("rels-ext") || !models.empty?
      update_rels_ext(models, item, doc)
      ds_touched << "rels-ext"
    end
    # now handle all the other datastreams
    item.each_key do |key|
      case key
      # fields having special treatment
      when "rights"
        ingest_rights_metadata(item, doc)
        ds_touched << "rightsMetadata"
      when "metadata"
        ingest_ld_metadata(item, doc)
        ds_touched << "descMetadata"

      # ignore these fields
      when "type", "pid", "model", "id", "af-model", "rels-ext", "collections"
        next

      # datastream fields: "<name>-file", "<name>-meta" and "<name>" all
      # describe the datastream <name>, which is ingested only once.
      when /\A(.+)-file\Z/, /\A(.+)-meta\Z/, /\A(.+)\Z/
        dsname = Regexp.last_match(1)
        next if ds_touched.include?(dsname)
        ingest_datastream(dsname, item, doc, search_paths, bendo)
        ds_touched << dsname
      end
    end
    ds_touched
  end

  # Ingest the single datastream dsname, described by the item keys
  # "<dsname>" (inline content), "<dsname>-file" (file to load) and
  # "<dsname>-meta" (hash of datastream properties).
  # When fdoc is nil nothing is saved, but the source fields are still
  # validated and a referenced file is still opened (and closed again).
  #
  # Raises SourceError if the fields are contradictory or have the wrong
  # type, and Errno::ENOENT if a referenced file cannot be found.
  def self.ingest_datastream(dsname, item, fdoc, search_paths, bendo)
    need_close = false
    # What kind of content is there?
    ds_content = item[dsname]
    ds_filename = item["#{dsname}-file"]
    ds_meta = item["#{dsname}-meta"]
    if ds_filename && ds_content
      raise SourceError.new("Both #{dsname} and #{dsname}-file are present.")
    end
    if ds_content && !ds_content.is_a?(String)
      raise SourceError.new("Content for #{dsname} is not a string.")
    end
    # Anything but a hash cannot be merged into the datastream properties
    # below; report it as a source problem instead of a bare TypeError.
    if ds_meta && !ds_meta.is_a?(Hash)
      raise SourceError.new("Metadata for #{dsname} is not a hash.")
    end
    # A URL, without content or file, is an R datastream
    # A URL, with content or file, raises an error
    ds_url = ds_meta["URL"] if ds_meta
    if ds_url && ds_content
      raise SourceError.new("Both #{ds_url} and #{dsname} are present.")
    end
    if ds_url && ds_filename
      raise SourceError.new("Both #{ds_url} and #{dsname}-file are present.")
    end

    # defaults, overridden by whatever the -meta hash provides
    md = {"mime-type" => "text/plain",
          "label" => "",
          "versionable" => true,
          "control-group" => "M",
    }
    md.merge!(ds_meta) if ds_meta

    if ds_url
      md["control-group"] = "R"

      # If the bendo server was passed in the command line, assume that the URL is in
      # the form "bendo:/item/<item#>/<item name> and substitute bendo: w/ the server name
      # if no bendo provided, use whatever's there.
      md["URL"] = md["URL"].sub("bendo:", bendo) if bendo
    end

    # NOTE(dbrower): this could be refactored a bit. I was trying to keep the
    # same path for whether fdoc is nil or not as much as possible.
    ds = nil
    if fdoc
      ds = fdoc[dsname]
      # TODO(dbrower): maybe verify these options to be within bounds?
      ds.controlGroup = md["control-group"]
      ds.dsLabel = md["label"]
      ds.versionable = md["versionable"]
      ds.mimeType = md["mime-type"]
      ds.dsLocation = md["URL"] if md["URL"]
    end
    if ds_filename
      ds_content = find_file_and_open(ds_filename, search_paths, "rb")
      need_close = true
    end
    if ds
      ds.content = ds_content if ds_content
      ds.save
    end
  ensure
    # only close what was opened here; inline content is a plain String
    ds_content.close if ds_content && need_close
  end

  # Build (and, when fdoc is given, save) the rightsMetadata datastream.
  def self.ingest_rights_metadata(item, fdoc)
    Ingesters::RightsMetadataIngester.call(item: item, fedora_document: fdoc)
  end

  # Convert the JSON-LD in item['metadata'] to n-triples and, when fdoc is
  # given, save it as the descMetadata datastream.
  # NOTE: an "@id" is written into item['metadata'] when it has none.
  # Returns the n-triples content.
  def self.ingest_ld_metadata(item, fdoc)
    input = item['metadata']
    # sometimes json-ld generates @graph structures when converting from fedora to ROF.
    # in that case, don't provide an id key
    unless input.key?("@graph")
      input["@id"] ||= "info:fedora/#{item['pid']}"
    end
    graph = RDF::Graph.new << JSON::LD::API.toRdf(input)
    content = graph.dump(:ntriples)
    # we read the rof file as utf-8. the RDF gem seems to convert it back to
    # the default encoding. so fix it.
    content.force_encoding('UTF-8')
    if fdoc
      ds = fdoc['descMetadata']
      ds.mimeType = "text/plain"
      ds.content = content
      ds.save
    end
    content
  end

  # Build (and, when fdoc is given, save) the RELS-EXT datastream.
  def self.update_rels_ext(models, item, fdoc)
    Ingesters::RelsExtIngester.call(models: models, item: item, fedora_document: fdoc)
  end

  # find fname by looking through directories in search_path,
  # an array of strings.
  # Will not find any files if search_path is empty, unless fname is an
  # absolute path, which is opened directly.
  # Raises Errno::ENOENT if no file is found, otherwise
  # opens the file and returns a fd
  def self.find_file_and_open(fname, search_path, flags)
    # don't search if file has an absolute path
    return File.open(fname, flags) if fname[0] == "/"

    search_path.each do |path|
      begin
        return File.open(File.join(path, fname), flags)
      rescue Errno::ENOENT
        # not in this directory; try the next one
        next
      end
    end
    raise Errno::ENOENT.new(fname)
  end

  # Fedora URI of the ActiveFedora content model named model.
  def self.af_model_name(model)
    "info:fedora/afmodel:#{model}"
  end

  # Normalize x to an array: nil becomes [], an array is returned as is,
  # and anything else is wrapped in a one-element array.
  def self.string_nil_to_array(x)
    return [] if x.nil?
    return [x] unless x.is_a? Array
    x
  end
end
@@ -0,0 +1,78 @@
1
+ require 'rdf'
2
+ require 'json/ld'
3
+ require 'rdf/rdfxml'
4
+
5
module ROF
  module Ingesters
    # Builds the RELS-EXT datastream (as RDF/XML) for an ROF item and, when a
    # fedora document is supplied, saves it there.
    class RelsExtIngester
      # Keys of the JSON-LD input that are not relations to other objects.
      RESERVED_KEYS = ["@context", "@id", "hasModel"].freeze

      # Convenience entry point: build an ingester from attributes and run it.
      def self.call(attributes)
        new(attributes).call
      end

      # :models is a list of fedora content models this item has
      # :item is the hash of the ROF item; it must have a 'pid' key
      # :fedora_document is an optional fedora document to save to
      #   (exposed through the fdoc reader)
      # pid is the namespaced identifier of this item, read from item['pid']
      attr_reader :models, :item, :fdoc, :pid
      def initialize(attributes = {})
        @models = attributes.fetch(:models)
        @item = attributes.fetch(:item)
        @pid = item.fetch('pid')
        @fdoc = attributes.fetch(:fedora_document, nil)
      end

      # Build the RDF/XML, save it if there is a fedora document,
      # and return the generated content.
      def call
        content = build_content
        persist(content)
        content
      end

      private

      # The item's rels-ext hash; empty when the key is missing or nil.
      def rels_ext
        item.fetch('rels-ext', {}) || {}
      end

      def build_content
        # this is ugly to work around addRelationship bug in 3.6.x
        # (See bugs FCREPO-1191 and FCREPO-1187)

        # build up a json-ld object, and then persist that (into XML!)
        graph = RDF::Graph.new << JSON::LD::API.toRdf(json_ld_input)
        graph.dump(:rdfxml)
      end

      # The JSON-LD document describing this item's RELS-EXT.
      # Works on a copy so the caller's item['rels-ext'] is left untouched.
      def json_ld_input
        source = rels_ext
        input = source.dup
        input["@context"] = source.fetch("@context", {}).merge(ROF::RelsExtRefContext)
        input["@id"] = "info:fedora/#{pid}"
        input["hasModel"] = models

        # RELS-EXT should only contain references to other (internal) fedora
        # objects. Rewrite them to have prefix "info:fedora/".
        # Also need to make sure json-ld interprets each of these object
        # references as an IRI instead of a string.
        # This is kinda hacky. Is there a better way?
        input.keys.each do |relation|
          next if RESERVED_KEYS.include?(relation)
          targets = input[relation]
          targets = [targets] if targets.is_a? String
          input[relation] = targets.map do |target|
            target.is_a?(String) ? {"@id" => "info:fedora/#{target}"} : target
          end
        end
        input
      end

      # Save content to the RELS-EXT datastream of fdoc.
      # Without a fedora document there is nothing to do, and true is returned.
      def persist(content)
        return true unless fdoc

        ds = fdoc['RELS-EXT']
        ds.content = content
        ds.mimeType = "application/rdf+xml"
        ds.save
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
module ROF
  module Ingesters
    # Builds the rightsMetadata XML for an ROF item from its "rights" hash
    # and, when a fedora document is supplied, saves it there.
    class RightsMetadataIngester
      # Convenience entry point: build an ingester from attributes and run it.
      def self.call(attributes)
        new(attributes).call
      end

      # :item is the hash of the ROF item
      # :fedora_document is an optional fedora document to save to
      #   (exposed through the fdoc reader)
      attr_reader :item, :fdoc
      def initialize(attributes = {})
        @item = attributes.fetch(:item)
        @fdoc = attributes.fetch(:fedora_document, nil)
      end

      # Returns the generated XML, or nil (and does nothing) when the item
      # has no "rights" entry.
      def call
        rights = item["rights"]
        return if rights.nil?
        #
        # we really should be building this using an xml engine.
        #
        parts = []
        parts << %Q{<rightsMetadata xmlns="http://hydra-collab.stanford.edu/schemas/rightsMetadata/v1" version="0.1">\n}
        # TODO(dbrower): Does the copyright need to be exposed in the rof?
        parts << %Q{ <copyright>\n <human type="title"/>\n <human type="description"/>\n <machine type="uri"/>\n </copyright>\n}
        %w[discover read edit].each do |section|
          parts << format_rights_section(section, rights[section], rights["#{section}-groups"])
        end
        # TODO(dbrower): expose embargo information
        parts << %Q{ <embargo>\n <human/>\n}
        if rights["embargo-date"]
          parts << %Q{ <machine>\n}
          parts << %Q{ <date>#{xml_escape(rights["embargo-date"])}</date>\n}
          parts << %Q{ </machine>\n}
        else
          parts << %Q{ <machine/>\n}
        end
        parts << %Q{ </embargo>\n}
        parts << %Q{</rightsMetadata>\n}
        content = parts.join

        if fdoc
          ds = fdoc['rightsMetadata']
          ds.mimeType = 'text/xml'
          ds.content = content
          ds.save
        end
        content
      end

      # Render one <access> section.
      # people and groups may each be nil, a single String, or a list.
      # An empty <machine/> is written only when both are nil.
      def format_rights_section(section_name, people, groups)
        people = [people] if people.is_a? String
        groups = [groups] if groups.is_a? String
        result = " <access type=\"#{section_name}\">\n <human/>\n"
        if people || groups
          result += " <machine>\n"
          (people || []).each do |person|
            result += " <person>#{xml_escape(person)}</person>\n"
          end
          (groups || []).each do |group|
            result += " <group>#{xml_escape(group)}</group>\n"
          end
          result += " </machine>\n"
        else
          result += " <machine/>\n"
        end
        result += " </access>\n"
        result
      end

      private

      # Escape &, < and > so arbitrary names and dates cannot break the XML.
      def xml_escape(value)
        value.to_s.encode(xml: :text)
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module ROF
  # Namespace prefixes handed to the Turtle reader when parsing OSF ttl files.
  OsfPrefixList = {
    'dcterms' => 'http://purl.org/dc/terms/',
    'osf-model' => 'http://www.dataconservancy.org/osf-business-object-model#'
  }.freeze

  # Locals only used to spell out the full predicate URIs below.
  dcterms = OsfPrefixList.fetch('dcterms')
  osf_model = OsfPrefixList.fetch('osf-model')

  # Short name => full predicate URI as it appears in the OSF ttl data.
  # Note that 'dc:subject' is looked up under the OSF hasTag predicate.
  OsfToNDMap = {
    'dc:created' => "#{dcterms}created",
    'dc:description' => "#{dcterms}description",
    'dc:title' => "#{dcterms}title",
    'dc:subject' => "#{osf_model}hasTag",
    'isPublic' => "#{osf_model}isPublic",
    'hasContributor' => "#{osf_model}hasContributor",
    'isBibliographic' => "#{osf_model}isBibliographic",
    'hasFullName' => "#{osf_model}hasFullName",
    'hasUser' => "#{osf_model}hasUser"
  }.freeze
end
@@ -0,0 +1,122 @@
1
+ require 'json'
2
+ require 'zlib'
3
+ require 'rubygems/package'
4
+ require 'rdf/turtle'
5
+ require 'rof/osf_context'
6
+ require 'rof/rdf_context'
7
+ require 'rof/utility'
8
+
9
module ROF
  # Class for managing OSF Archive data transformations
  # It is called after the get-from-osf task, and before the work-xlat task
  class OsfToRof
    # Convert Osf Archive tar.gz to ROF
    #
    # config       - task configuration; 'package_dir' is read from it
    # osf_projects - hash describing a single project ('project_identifier',
    #                'owner', 'administrative_unit', 'affiliation')
    #
    # Returns a one-element array holding the OsfArchive record, or an
    # empty hash when osf_projects is nil.
    def self.osf_to_rof(config, osf_projects = nil)
      return {} if osf_projects.nil?

      this_project = osf_projects
      ttl_data = ttl_from_targz(config, this_project,
                                this_project['project_identifier'] + '.ttl')
      [build_archive_record(config, this_project, ttl_data)]
    end

    # reads a ttl file and makes it a JSON-LD file that we can parse
    def self.fetch_from_ttl(ttl_file)
      graph = RDF::Turtle::Reader.open(ttl_file,
                                       prefixes: ROF::OsfPrefixList.dup)
      JSON::LD::API.fromRdf(graph)
    end

    # extracts given ttl file from JHU tar.gz package
    # - assumed to live under data/obj/root
    # Returns the parsed data as an array: the additional elements are
    # the contributor(s).
    def self.ttl_from_targz(config, this_project, ttl_filename)
      id = this_project['project_identifier']
      ttl_path = File.join(id, 'data/obj/root', ttl_filename)
      ROF::Utility.file_from_targz(File.join(config['package_dir'], id + '.tar.gz'),
                                   ttl_path)
      fetch_from_ttl(File.join(config['package_dir'], ttl_path))
    end

    # Maps RELS-EXT: only the context is set, the ttl data is not consulted.
    def self.map_rels_ext(_ttl_data)
      { '@context' => ROF::RelsExtRefContext.dup }
    end

    # sets metadata
    # NOTE: Time.iso8601 and Time#to_date come from the 'time' and 'date'
    # standard libraries, which must already be loaded.
    def self.map_metadata(config, project, ttl_data)
      root = ttl_data[0]
      metadata = {}
      metadata['@context'] = ROF::RdfContext.dup
      # metadata derived from project ttl file
      created = root[osf_map['dc:created']][0]['@value']
      metadata['dc:created'] = Time.iso8601(created).to_date.iso8601 + "Z"
      metadata['dc:title'] = root[osf_map['dc:title']][0]['@value']
      metadata['dc:description'] = root[osf_map['dc:description']][0]['@value']
      metadata['dc:subject'] = map_subject(root)
      # metadata derived from osf_projects data, passed from UI
      metadata['dc:source'] = "https://osf.io/" + project['project_identifier']
      # NOTE(review): key is spelled "adminstrative" — kept as is; confirm
      # against the RDF context before changing it.
      metadata['dc:creator#adminstrative_unit'] = project['administrative_unit']
      metadata['dc:creator#affiliation'] = project['affiliation']
      metadata['dc:creator'] = map_creator(config, project, ttl_data)
      metadata
    end

    # Constructs OsfArchive Record from ttl_data, data from the UI form,
    # and task config data
    def self.build_archive_record(config, this_project, ttl_data)
      {
        'owner' => this_project['owner'],
        'type' => 'OsfArchive',
        'rights' => map_rights(ttl_data[0]),
        'rels-ext' => map_rels_ext(ttl_data[0]),
        'metadata' => map_metadata(config, this_project, ttl_data),
        'files' => [this_project['project_identifier'] + '.tar.gz']
      }
    end

    # sets subject: the first tag value, or '' when the project has no tag
    def self.map_subject(ttl_data)
      subject_key = osf_map['dc:subject']
      return '' unless ttl_data.key?(subject_key)

      ttl_data[subject_key][0]['@value']
    end

    # figures out the rights: public projects are readable by the 'public'
    # group. A project without an isPublic statement is treated as not public.
    def self.map_rights(ttl_data)
      rights = {}
      flags = ttl_data[osf_map['isPublic']]
      if flags && flags[0] && flags[0]['@value'] == 'true'
        rights['read-groups'] = ['public']
      end
      rights
    end

    # sets the creator- needs to read another ttl for the User data
    # only contributors with isBibliographic true are considered, and only
    # the first contributor listed on the project is looked at.
    # Returns '' when that contributor is not bibliographic.
    def self.map_creator(config, project, ttl_data)
      creator = ''
      contributor = ttl_data[0][osf_map['hasContributor']][0]['@id']
      ttl_data.each do |item|
        next unless item['@id'] == contributor
        next unless item[osf_map['isBibliographic']][0]['@value'] == 'true'

        creator = map_user_from_ttl(config, project,
                                    item[osf_map['hasUser']][0]['@id'])
      end
      creator
    end

    # read user ttl file, extract User's full name
    def self.map_user_from_ttl(config, project, file_subpath)
      ttl_data = ttl_from_targz(config, project, File.basename(file_subpath))
      ttl_data[0][osf_map['hasFullName']][0]['@value']
    end

    # Short name => predicate URI table. Read through this accessor so every
    # helper works on its own, not only after osf_to_rof has run.
    def self.osf_map
      ROF::OsfToNDMap
    end
    private_class_method :osf_map
  end
end
+ end