rof 0.0.1.pre → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.ruby-version +1 -1
  3. data/.travis.yml +12 -2
  4. data/Gemfile +1 -0
  5. data/README.md +87 -0
  6. data/bin/.ruby-version +1 -0
  7. data/bin/csv_to_rof +26 -0
  8. data/bin/fedora_to_rof +57 -0
  9. data/bin/osf_to_rof +40 -0
  10. data/bin/rof +78 -0
  11. data/bulk-ingest.md +242 -0
  12. data/labels.md +111 -0
  13. data/lib/rof.rb +20 -1
  14. data/lib/rof/access.rb +57 -0
  15. data/lib/rof/cli.rb +122 -0
  16. data/lib/rof/collection.rb +109 -0
  17. data/lib/rof/compare_rof.rb +92 -0
  18. data/lib/rof/filters/bendo.rb +33 -0
  19. data/lib/rof/filters/date_stamp.rb +36 -0
  20. data/lib/rof/filters/file_to_url.rb +27 -0
  21. data/lib/rof/filters/label.rb +153 -0
  22. data/lib/rof/filters/work.rb +111 -0
  23. data/lib/rof/get_from_fedora.rb +196 -0
  24. data/lib/rof/ingest.rb +204 -0
  25. data/lib/rof/ingesters/rels_ext_ingester.rb +78 -0
  26. data/lib/rof/ingesters/rights_metadata_ingester.rb +68 -0
  27. data/lib/rof/osf_context.rb +19 -0
  28. data/lib/rof/osf_to_rof.rb +122 -0
  29. data/lib/rof/rdf_context.rb +36 -0
  30. data/lib/rof/translate_csv.rb +112 -0
  31. data/lib/rof/utility.rb +84 -0
  32. data/lib/rof/version.rb +2 -2
  33. data/rof.gemspec +17 -0
  34. data/spec/fixtures/a.json +4 -0
  35. data/spec/fixtures/label.json +20 -0
  36. data/spec/fixtures/osf/b6psa.tar.gz +0 -0
  37. data/spec/fixtures/rof/dev0012829m.rof +45 -0
  38. data/spec/fixtures/vcr_tests/fedora_to_rof1.yml +5274 -0
  39. data/spec/fixtures/vecnet-citation.json +73 -0
  40. data/spec/lib/rof/access_spec.rb +36 -0
  41. data/spec/lib/rof/cli_spec.rb +66 -0
  42. data/spec/lib/rof/collection_spec.rb +90 -0
  43. data/spec/lib/rof/compare_rof_spec.rb +263 -0
  44. data/spec/lib/rof/filters/date_stamp_spec.rb +90 -0
  45. data/spec/lib/rof/filters/file_to_url_spec.rb +70 -0
  46. data/spec/lib/rof/filters/label_spec.rb +94 -0
  47. data/spec/lib/rof/filters/work_spec.rb +87 -0
  48. data/spec/lib/rof/ingest_spec.rb +117 -0
  49. data/spec/lib/rof/ingesters/rels_ext_ingester_spec.rb +62 -0
  50. data/spec/lib/rof/ingesters/rights_metadata_ingester_spec.rb +114 -0
  51. data/spec/lib/rof/osf_to_rof_spec.rb +76 -0
  52. data/spec/lib/rof/translate_csv_spec.rb +109 -0
  53. data/spec/lib/rof/utility_spec.rb +64 -0
  54. data/spec/lib/rof_spec.rb +14 -0
  55. data/spec/spec_helper.rb +11 -11
  56. metadata +283 -18
@@ -0,0 +1,33 @@
1
+ require 'date'
2
+
3
+ module ROF
4
+ module Filters
5
+
6
+ # If a bendo server is set, add it into datastreams that contain a URL referencing bendo
7
+ #
8
# Filter that expands the "bendo:" placeholder found in the "URL" entry of
# "*-meta" sections into the address of a concrete bendo server.
class Bendo
  # Matches keys that END in "-meta" (for example "content-meta").
  # Anchored with \z so that keys such as "thumbnail-metadata" are left alone.
  META_KEY = /.+-meta\z/

  # @param bendo [String, nil] text substituted for "bendo:"; when nil the
  #   filter leaves every object untouched.
  def initialize(bendo = nil)
    @bendo = bendo
  end

  # Rewrite the objects of obj_list in place.
  #
  # @param obj_list [Array<Hash>] ROF objects; each is mutated in place
  # @param _fname [Object] unused, kept for the common filter signature
  # @return [Array<Hash>] the same obj_list instance
  def process(obj_list, _fname)
    # nothing to substitute when no server was configured
    return obj_list unless @bendo

    obj_list.each do |obj|
      obj.each do |name, meta|
        next unless name =~ META_KEY
        # only hash-shaped sections can carry a URL entry; skip nil/other values
        next unless meta.is_a?(Hash) && meta['URL']

        # assigning to an existing key of the nested hash is safe while iterating obj
        meta['URL'] = meta['URL'].sub('bendo:', @bendo)
      end
    end
    obj_list
  end
end
32
+ end
33
+ end
@@ -0,0 +1,36 @@
1
+ require 'date'
2
+
3
+ module ROF
4
+ module Filters
5
+ # Set the upload date to be the date given, provided it doesn't already exist.
6
+ # Also set the date modified to be the date given.
7
+ # If not given, the date used defaults to the local time on the computer.
8
# Stamps every object with "dc:dateSubmitted" (only when absent) and
# "dc:modified" (always), using one date fixed at construction time.
class DateStamp
  # @param date [Date, Object, nil] the date to stamp with; defaults to the
  #   current local date. Date values are rendered as "YYYY-MM-DDZ", anything
  #   else through #to_s.
  def initialize(date = nil)
    @today = date || Date.today
    @today_s = @today.is_a?(Date) ? @today.strftime('%FZ') : @today.to_s
  end

  # Mutates each object of obj_list in place and returns obj_list.
  def process(obj_list, _fname)
    obj_list.each do |item|
      # create a metadata section when the object has none
      item['metadata'] = { '@context' => ROF::RdfContext } if item['metadata'].nil?
      meta = item['metadata']

      # keep an existing submission date, but always refresh the modified date
      meta['dc:dateSubmitted'] = @today_s if meta['dc:dateSubmitted'].nil?
      meta['dc:modified'] = @today_s
    end
  end
end
35
+ end
36
+ end
@@ -0,0 +1,27 @@
1
+ module ROF
2
+ module Filters
3
+ # Convert any content datastream files into a bendo URL, and alter the rof
4
+ # to use the URL and not upload the file to fedora directly. The bendo URL
5
+ # will only exist for items having a bendo-item id set. The URL generated
6
+ # supposes the file keeps the same relative path the item originally had in
7
+ # the rof file.
8
# Replaces a "content-file" entry with a "bendo:" URL stored in
# "content-meta", for objects that carry a "bendo-item" id.
class FileToUrl
  def initialize; end

  # Mutates the objects of obj_list in place and returns obj_list.
  # Objects lacking either "bendo-item" or "content-file" are left untouched.
  def process(obj_list, _fname)
    obj_list.each do |item|
      item_id = item['bendo-item']
      path = item['content-file']
      next unless item_id && path

      # reuse an existing content-meta section when there is one
      meta = item.fetch('content-meta', {})
      meta['URL'] = "bendo:/item/#{item_id}/#{path}"
      item['content-meta'] = meta
      # the file is now referenced by URL, so drop the file entry
      item.delete('content-file')
    end
  end
end
26
+ end
27
+ end
@@ -0,0 +1,153 @@
1
+ require 'noids_client'
2
+
3
+ module ROF
4
+ module Filters
5
+ # Class Label locates in-place labels of the form
6
+ # "$(label_name)" in the ROF file, assigns each
7
+ # label a pid, then replaces the label with that pid.
8
# Locates in-place labels of the form "$(label_name)", assigns each label a
# pid, replaces label references with those pids, and stamps every object
# with a bendo item id.
class Label
  # Raised when a label inside a "rels-ext" section has no pid assigned.
  class MissingLabel < RuntimeError
  end

  # Raised by #initialize when no identifier source was supplied.
  class NoPool < RuntimeError
  end

  # Raised when the identifier source reports it is empty.
  class OutOfIdentifiers < RuntimeError
  end

  # Create a new label assigner and resolver.
  #
  # @param prefix [String, nil] when not nil, "#{prefix}:" is prepended to
  #   every identifier
  # @param options [Hash] the source of identifiers:
  #   :id_list — any object responding to #shift and #empty?; or
  #   :noid_server with :pool_name — connect to an external noid server
  # @raise [NoPool] when neither :id_list nor :noid_server is given
  def initialize(prefix, options)
    @id_list =
      if options[:id_list]
        options[:id_list]
      elsif options[:noid_server]
        NoidsPool.new(options[:noid_server], options[:pool_name])
      else
        raise NoPool, 'no identifier source given (expected :id_list or :noid_server)'
      end
    @prefix = "#{prefix}:" if prefix
    # The first match group in the RE provides the label name
    @label_re = /\$\(([^)]+)\)/
  end

  # Mutate obj_list by assigning pids and resolving labels where needed.
  #
  # @param obj_list [Array<Hash>] ROF objects, mutated in place
  # @param _fname [Object] unused, kept for the common filter signature
  # @return [Array<Hash>] the same obj_list instance
  def process(obj_list, _fname)
    labels = {}

    # Two passes, since a label can be referenced before it is defined:
    # first record label => pid while assigning pids, then resolve references.
    obj_list.each { |obj| assign_pid(obj, labels) }
    obj_list.each { |obj| replace_labels_in_obj(obj, labels) }

    # For now the first pid found, stripped of any namespace, is the bendo
    # item id of every object that does not already have one.
    bendo_item = nil
    obj_list.each do |obj|
      if bendo_item.nil?
        bendo_item = obj['pid'].gsub(/^.*:/, '') unless obj['pid'].nil?
        next if bendo_item.nil?
      end
      # don't touch if a bendo item has already been assigned
      obj['bendo-item'] = bendo_item if obj['bendo-item'].nil? || obj['bendo-item'] == ''
    end

    obj_list
  end

  # Assign a pid to obj (mutating it), recording any label found in its
  # "pid" field into labels. Only objects of type "fobject" are handled, and
  # an existing pid that is not a label is kept.
  def assign_pid(obj, labels)
    return if obj['type'] != 'fobject'

    label = nil
    unless obj['pid'].nil?
      label = find_label(obj['pid'])
      # skip if the "pid" is not a label
      return if label.nil?
    end
    pid = "#{@prefix}#{next_id}"
    obj['pid'] = pid
    labels[label] = pid unless label.nil?
  end

  # Replace any label references found in the values of obj (mutating it).
  # Only objects of type "fobject" are handled.
  def replace_labels_in_obj(obj, labels)
    return if obj['type'] != 'fobject'
    obj.each do |k, v|
      # only force labels to exist if we are looking in the rels-ext
      obj[k] = replace_labels(v, labels, k == 'rels-ext')
    end
  end

  # Recurse through obj replacing labels inside strings with the matching id
  # from the labels hash. Arrays and hashes are updated in place; hash keys
  # are not touched. If force is true, a label that does not resolve raises
  # MissingLabel.
  def replace_labels(obj, labels, force = false)
    if obj.is_a?(Array)
      obj.map! { |x| replace_labels(x, labels, force) }
    elsif obj.is_a?(Hash)
      obj.each { |k, v| obj[k] = replace_labels(v, labels, force) }
      obj
    elsif obj.is_a?(String)
      replace_match(obj, labels, force)
    else
      obj
    end
  end

  # Return a copy of the string obj with every resolvable "$(name)" replaced
  # by its pid; unresolved labels are kept verbatim unless force is true.
  #
  # @raise [MissingLabel] when force is true and a label has no pid
  def replace_match(obj, labels, force)
    obj.gsub(@label_re) do |match|
      pid = labels[Regexp.last_match(1)]
      # name the offending label so a failing ingest can be diagnosed
      raise MissingLabel, "no pid has been assigned to label #{match}" if pid.nil? && force
      pid.nil? ? match : pid
    end
  end

  # Return the name of the first label in s, or nil when there is none.
  def find_label(s)
    s[@label_re, 1]
  end

  # Take the next identifier from the identifier source.
  #
  # @raise [OutOfIdentifiers] when the source reports it is empty
  def next_id
    raise OutOfIdentifiers, 'the identifier source has no identifiers left' if @id_list.empty?
    @id_list.shift
  end

  # Encapsulates connection to Noids Server
  class NoidsPool
    def initialize(noids_server, pool_name)
      @pool = NoidsClient::Connection.new(noids_server).get_pool(pool_name)
    end

    # Mint one identifier from the pool.
    def shift
      @pool.mint.first
    end

    def empty?
      @pool.closed?
    end
  end
end
152
+ end
153
+ end
@@ -0,0 +1,111 @@
1
+ require 'mime-types'
2
+
3
+ module ROF
4
+ module Filters
5
+ # Expand objects of type "Work(-(.+))?" into a
6
+ # constellation of "fobjects".
7
+ # Makes a fobject/generic_file for each file
8
+ # adds a depositor
9
+ # turns original object into an fobject/$1
10
+ # and copies the access to each fobject.
11
# Expands objects whose work type is recognised by ROF::Utility into a
# constellation of "fobjects": one main fobject plus one GenericFile fobject
# per listed file.
class Work
  # Raised when an entry of "files" does not name any file.
  class NoFile < RuntimeError
  end

  def initialize
    @utility = ROF::Utility.new
  end

  # Replace every work in obj_list by the objects it expands to.
  #
  # @param obj_list [Array<Hash>] ROF objects; the array is rewritten in place
  # @param filename [Object] handed to the utility object's set_workdir
  # @return [Array<Hash>] the same obj_list instance, also when it is empty
  def process(obj_list, filename)
    @utility.set_workdir(filename)
    obj_list.map! { |x| process_one_work(x) }
    # flatten! returns nil when it changed nothing (e.g. an empty list), so
    # its result must not be used as the return value
    obj_list.flatten!
    obj_list
  end

  # Given a single object, return a list of new objects to replace the one
  # given. Objects that are not works are returned unchanged.
  def process_one_work(input_obj)
    model = @utility.decode_work_type(input_obj)
    return [input_obj] if model.nil?
    return [ROF::Collection.process_one_collection(input_obj, @utility)] if model == 'Collection'

    main_obj = set_main_obj(input_obj, model)

    result = [main_obj]
    result = make_thumbnail(result, main_obj, input_obj) unless input_obj['files'].nil?
    result
  end

  # Append one GenericFile fobject per entry of input_obj['files'] to result
  # and make the first file the representative thumbnail of main_obj.
  #
  # Entries are either a file name or a hash with a 'files' list; hash
  # entries are filled in with defaults taken from input_obj (in place).
  #
  # @return [Array<Hash>] result
  # @raise [NoFile] when a hash entry names no file
  def make_thumbnail(result, main_obj, input_obj)
    thumb_rep = nil
    input_obj['files'].each do |finfo|
      if finfo.is_a?(String)
        fname = finfo
        finfo = { 'files' => [fname] }
      else
        # Array() tolerates a missing 'files' key, so NoFile is raised
        # instead of a NoMethodError on nil
        fname = Array(finfo['files']).first
        raise NoFile, "file entry does not name a file: #{finfo.inspect}" if fname.nil?
      end
      finfo['rights'] ||= input_obj['rights']
      finfo['owner'] ||= input_obj['owner']
      finfo['bendo-item'] ||= input_obj['bendo-item']
      finfo['metadata'] ||= {
        '@context' => ROF::RdfContext
      }
      finfo['metadata']['dc:title'] ||= fname
      mimetype = MIME::Types.of(fname)
      mimetype = mimetype.empty? ? 'application/octet-stream' : mimetype.first.content_type
      f_obj = {
        'type' => 'fobject',
        'af-model' => 'GenericFile',
        'pid' => finfo['pid'],
        'bendo-item' => finfo['bendo-item'],
        'rights' => finfo['rights'],
        'properties' => ROF::Utility.prop_ds(finfo['owner']),
        'properties-meta' => {
          'mime-type' => 'text/xml'
        },
        'rels-ext' => {
          'isPartOf' => [main_obj['pid']]
        },
        'content-file' => fname,
        'content-meta' => {
          'label' => fname,
          'mime-type' => mimetype
        },
        'collections' => finfo['collections'],
        'metadata' => finfo['metadata']
      }
      # drop the entries that were not supplied
      f_obj.delete_if { |_k, v| v.nil? }
      if thumb_rep.nil?
        # the first file represents the work; give it a label if it has no pid
        thumb_rep = f_obj['pid']
        if thumb_rep.nil?
          thumb_rep = @utility.next_label
          f_obj['pid'] = thumb_rep
        end
        main_obj['properties'] = ROF::Utility.prop_ds(input_obj['owner'], thumb_rep)
      end
      result << f_obj
    end
    result
  end

  # Build the main fobject of a work from input_obj, using model as its
  # af-model.
  def set_main_obj(input_obj, model)
    result = {}

    result['type'] = 'fobject'
    result['af-model'] = model
    # NOTE(review): next_label is evaluated even when a pid is present, so a
    # label is requested from the utility either way. Switching to the block
    # form of fetch would presumably change which labels later objects
    # receive - confirm before changing.
    result['pid'] = input_obj.fetch('pid', @utility.next_label)
    result['bendo-item'] = input_obj['bendo-item']
    result['rights'] = input_obj['rights']
    result['properties'] = ROF::Utility.prop_ds(input_obj['owner'])
    result['properties-meta'] = { 'mime-type' => 'text/xml' }
    result['rels-ext'] = input_obj.fetch('rels-ext', {})
    result['metadata'] = input_obj['metadata']
    result
  end
end
110
+ end
111
+ end
@@ -0,0 +1,196 @@
1
require 'fileutils'
require 'json'
require 'rexml/document'

require 'rdf/ntriples'
require 'rdf/rdfxml'
require 'rubydora'
6
+
7
+ module ROF
8
# Fetches one object from a Fedora repository (through Rubydora) and
# converts it into a ROF hash.
#
# All methods are class methods. The hash being built is kept in the
# class-level instance variable @fedora_info, which GetFromFedora resets on
# every call.
class FedoraToRof
  # Datastream names (with their first "-" removed) that have a dedicated
  # handler method in this class. Anything else is dumped generically.
  # Dispatch is restricted to this list because datastream names come from
  # the repository and must not be able to select arbitrary methods.
  DATASTREAM_HANDLERS = %w(descMetadata rightsMetadata RELSEXT).freeze

  # Connect to fedora and fetch the object with the given pid.
  #
  # @param pid [String] id of the object to fetch
  # @param fedora [Object] connection parameters handed to Rubydora.connect
  # @param config [Hash] options; see readFedora and create_meta
  # @return [Hash] the ROF representation of the object. When connecting or
  #   finding the object fails, the error is printed and the process exits
  #   with status 1.
  def self.GetFromFedora(pid, fedora, config)
    @fedora_info = {}

    # Try to connect to fedora, and search for the desired item
    # If either of these actions fail, handle it, and exit.
    begin
      fedora = Rubydora.connect(fedora)
      doc = fedora.find(pid)
    rescue StandardError => e
      puts "Error: #{e}"
      exit 1
    end

    # set pid, type
    @fedora_info['pid'] = pid
    @fedora_info['type'] = 'fobject'

    readFedora(doc, config)

    @fedora_info
  end

  # Given a rubydora object, extract what we need to create our ROF object
  # into @fedora_info.
  #
  # The "DC" datastream is skipped. Datastreams listed in
  # DATASTREAM_HANDLERS are given to the like-named method; every other
  # datastream is dumped by dump_generic_datastream.
  def self.readFedora(rdora_obj, config)
    @fedora_info['af-model'] = setModel(rdora_obj)
    rdora_obj.datastreams.each do |dsname, ds|
      next if dsname == 'DC'
      method_key = dsname.sub('-', '')
      if DATASTREAM_HANDLERS.include?(method_key)
        send(method_key, ds, config)
      else
        dump_generic_datastream(dsname, ds, config)
      end
    end
  end

  # Record a datastream that has no dedicated handler.
  #
  # Its metadata (if any) is stored under "<dsname>-meta". The content is
  # stored inline under dsname when it is valid UTF-8 and either at most
  # 1024 characters long or config['inline'] is set; otherwise, when
  # config['download'] is set, it is written to a file inside
  # config['download_path'] and the file name is stored under
  # "<dsname>-file".
  #
  # TODO(dbrower): change dump algorithm:
  #   if content is short < X bytes, save as string
  #   if content is > X bytes, save as file only if config option is given
  def self.dump_generic_datastream(dsname, ds, config)
    meta = create_meta(ds, config)
    @fedora_info["#{dsname}-meta"] = meta unless meta.empty?

    # NOTE- Entire datastream being downloaded every time.
    content = ds.datastream_content
    # a datastream without content is treated as an empty string
    content_string = (content.nil? ? String.new : content.to_s).force_encoding('UTF-8')
    if (content_string.length <= 1024 || config['inline']) && content_string.valid_encoding?
      @fedora_info[dsname] = content_string
    elsif config['download']
      fname = "#{@fedora_info['pid']}-#{dsname}"
      abspath = File.join(config['download_path'], fname)
      @fedora_info["#{dsname}-file"] = fname
      if File.file?(config['download_path'])
        puts "Error: --download directory #{config['download_path']} specified is an existing file."
        exit 1
      end
      FileUtils.mkdir_p(config['download_path'])
      # binary mode: the content is arbitrary bytes and must be written unchanged
      File.open(abspath, 'wb') { |f| f.write(content_string) }
    end
  end

  # Build the "-meta" hash of a datastream from its profile.
  #
  # @return [Hash] may contain 'label' (when non-empty), 'mime-type' (when
  #   not "text/plain") and 'URL' (for control groups R and E, with
  #   config['bendo'] replaced by "bendo:" when that option is set)
  def self.create_meta(ds, config)
    result = {}

    label = ds.profile['dsLabel']
    result['label'] = label unless label.nil? || label == ''
    result['mime-type'] = ds.profile['dsMIME'] if ds.profile['dsMIME'] != 'text/plain'
    # TODO(dbrower): make sure this is working as intended
    if %w(R E).include?(ds.profile['dsControlGroup'])
      location = ds.profile['dsLocation']
      # a missing location is recorded as nil instead of raising
      location = location.sub(config['bendo'], 'bendo:') if config['bendo'] && location
      result['URL'] = location
    end
    result
  end

  # Return the af-model of the object: the text following
  # "info:fedora/afmodel:" in the first matching entry of the profile's
  # 'objModels', or nil when no entry matches.
  def self.setModel(rdora_obj)
    # only keep info:fedora/afmodel:XXXXX
    models = rdora_obj.profile['objModels'].map do |model|
      model[%r{^info:fedora/afmodel:(.*)}, 1]
    end
    models.compact.first
  end

  # The methods below are called if the like-named datastream exists in fedora

  # Set @fedora_info['metadata'] from the descMetadata datastream.
  def self.descMetadata(ds, _config)
    # descMetadata is encoded in ntriples, convert to JSON-LD using our special context
    graph = RDF::Graph.new
    data = ds.datastream_content
    # force utf-8 encoding. fedora does not store the encoding, so it defaults to ASCII-8BIT
    # see https://github.com/ruby-rdf/rdf/issues/142
    data.force_encoding('utf-8')
    graph.from_ntriples(data, format: :ntriples)
    JSON::LD::API.fromRdf(graph) do |expanded|
      result = JSON::LD::API.compact(expanded, RdfContext)
      @fedora_info['metadata'] = result
    end
  end

  # Set @fedora_info['rights'] from the rightsMetadata datastream.
  #
  # The datastream is an XML document whose <access type="read|edit">
  # elements hold a <machine> element listing <group> and <person> elements.
  # The result maps "read-groups"/"edit-groups" to group names and
  # "read"/"edit" to person names; every listed element is included.
  def self.rightsMetadata(ds, _config)
    xml_doc = REXML::Document.new(ds.datastream_content)
    root = xml_doc.root

    rights_array = {}

    %w(read edit).each do |access|
      # an empty document has no root element, hence no access entries
      this_access = root && root.elements["//access[@type=\'#{access}\']"]
      next if this_access.nil?

      machine = this_access.elements['machine']
      next if machine.nil?

      groups = machine.get_elements('group')
      rights_array["#{access}-groups"] = groups.map(&:text).compact unless groups.empty?

      people = machine.get_elements('person')
      rights_array[access.to_s] = people.map(&:text).compact unless people.empty?
    end

    @fedora_info['rights'] = rights_array
  end

  # Set @fedora_info['rels-ext'] from the RELS-EXT datastream.
  def self.RELSEXT(ds, _config)
    # RELS-EXT is RDF-XML - parse it
    ctx = ROF::RelsExtRefContext.dup
    ctx.delete('@base') # @base causes problems when converting TO json-ld (it is = "info:/fedora") but info is not a namespace
    graph = RDF::Graph.new
    graph.from_rdfxml(ds.datastream_content)
    result = nil
    JSON::LD::API.fromRdf(graph) do |expanded|
      result = JSON::LD::API.compact(expanded, ctx)
    end
    # now strip the info:fedora/ prefix from the URIs
    strip_info_fedora(result)
    # remove extra items
    result.delete('hasModel')
    @fedora_info['rels-ext'] = result
  end

  # Internal helper. The original file placed a bare `private` here, which
  # does not apply to `def self.` methods; the method stays publicly
  # callable exactly as before.
  #
  # Remove the first "info:fedora/" from every target of rels_ext, in place,
  # recursing into nested hashes. The "@context" entry is left alone. A
  # target list that ends up with a single element is stored as that element.
  #
  # @return [Hash] rels_ext
  def self.strip_info_fedora(rels_ext)
    rels_ext.each do |relation, targets|
      next if relation == '@context'
      if targets.is_a?(Hash)
        strip_info_fedora(targets)
        next
      end
      targets = [targets] if targets.is_a?(String)
      targets.map! do |target|
        if target.is_a?(Hash)
          strip_info_fedora(target)
        else
          target.sub('info:fedora/', '')
        end
      end
      # some single strings cannot be arrays in json-ld, so convert back
      # this shouldn't cause any problems with items that began as arrays
      targets = targets[0] if targets.length == 1
      rels_ext[relation] = targets
    end
  end
end
196
+ end