rof 0.0.1.pre → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.ruby-version +1 -1
  3. data/.travis.yml +12 -2
  4. data/Gemfile +1 -0
  5. data/README.md +87 -0
  6. data/bin/.ruby-version +1 -0
  7. data/bin/csv_to_rof +26 -0
  8. data/bin/fedora_to_rof +57 -0
  9. data/bin/osf_to_rof +40 -0
  10. data/bin/rof +78 -0
  11. data/bulk-ingest.md +242 -0
  12. data/labels.md +111 -0
  13. data/lib/rof.rb +20 -1
  14. data/lib/rof/access.rb +57 -0
  15. data/lib/rof/cli.rb +122 -0
  16. data/lib/rof/collection.rb +109 -0
  17. data/lib/rof/compare_rof.rb +92 -0
  18. data/lib/rof/filters/bendo.rb +33 -0
  19. data/lib/rof/filters/date_stamp.rb +36 -0
  20. data/lib/rof/filters/file_to_url.rb +27 -0
  21. data/lib/rof/filters/label.rb +153 -0
  22. data/lib/rof/filters/work.rb +111 -0
  23. data/lib/rof/get_from_fedora.rb +196 -0
  24. data/lib/rof/ingest.rb +204 -0
  25. data/lib/rof/ingesters/rels_ext_ingester.rb +78 -0
  26. data/lib/rof/ingesters/rights_metadata_ingester.rb +68 -0
  27. data/lib/rof/osf_context.rb +19 -0
  28. data/lib/rof/osf_to_rof.rb +122 -0
  29. data/lib/rof/rdf_context.rb +36 -0
  30. data/lib/rof/translate_csv.rb +112 -0
  31. data/lib/rof/utility.rb +84 -0
  32. data/lib/rof/version.rb +2 -2
  33. data/rof.gemspec +17 -0
  34. data/spec/fixtures/a.json +4 -0
  35. data/spec/fixtures/label.json +20 -0
  36. data/spec/fixtures/osf/b6psa.tar.gz +0 -0
  37. data/spec/fixtures/rof/dev0012829m.rof +45 -0
  38. data/spec/fixtures/vcr_tests/fedora_to_rof1.yml +5274 -0
  39. data/spec/fixtures/vecnet-citation.json +73 -0
  40. data/spec/lib/rof/access_spec.rb +36 -0
  41. data/spec/lib/rof/cli_spec.rb +66 -0
  42. data/spec/lib/rof/collection_spec.rb +90 -0
  43. data/spec/lib/rof/compare_rof_spec.rb +263 -0
  44. data/spec/lib/rof/filters/date_stamp_spec.rb +90 -0
  45. data/spec/lib/rof/filters/file_to_url_spec.rb +70 -0
  46. data/spec/lib/rof/filters/label_spec.rb +94 -0
  47. data/spec/lib/rof/filters/work_spec.rb +87 -0
  48. data/spec/lib/rof/ingest_spec.rb +117 -0
  49. data/spec/lib/rof/ingesters/rels_ext_ingester_spec.rb +62 -0
  50. data/spec/lib/rof/ingesters/rights_metadata_ingester_spec.rb +114 -0
  51. data/spec/lib/rof/osf_to_rof_spec.rb +76 -0
  52. data/spec/lib/rof/translate_csv_spec.rb +109 -0
  53. data/spec/lib/rof/utility_spec.rb +64 -0
  54. data/spec/lib/rof_spec.rb +14 -0
  55. data/spec/spec_helper.rb +11 -11
  56. metadata +283 -18
@@ -0,0 +1,33 @@
1
+ require 'date'
2
+
3
module ROF
  module Filters
    # Rewrites "bendo:" URL placeholders found in *-meta datastream entries.
    #
    # When a bendo server location was supplied, every top-level entry of an
    # object whose key ends in "-meta" and whose value is a hash holding a
    # string "URL" has the first occurrence of "bendo:" in that URL replaced
    # by the server location. Without a server location nothing is changed.
    class Bendo
      # Matches keys such as "content-meta". Anchored at the end so that keys
      # which merely contain "-meta" (e.g. "x-metadata") are left alone.
      META_KEY = /.+-meta\z/

      # @param bendo [String, nil] text substituted for the "bendo:" marker
      def initialize(bendo = nil)
        @bendo = bendo
      end

      # @param obj_list [Array<Hash>] objects to update; mutated in place
      # @param _fname [Object] unused
      # @return [Array<Hash>] obj_list itself
      def process(obj_list, _fname)
        return obj_list unless @bendo

        obj_list.each do |obj|
          obj.each do |name, meta|
            next unless name.to_s =~ META_KEY
            # Skip values that are not hashes: String#[] with "URL" would do
            # a substring lookup rather than a key lookup.
            next unless meta.is_a?(Hash) && meta['URL'].is_a?(String)
            # The block form inserts @bendo literally; a replacement string
            # would have its backslash sequences interpreted.
            meta['URL'] = meta['URL'].sub('bendo:') { @bendo }
          end
        end
        obj_list
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require 'date'
2
+
3
module ROF
  module Filters
    # Stamps every object with submission and modification dates.
    #
    # "dc:dateSubmitted" is only filled in when it is absent, while
    # "dc:modified" is always overwritten. The stamp is fixed when the filter
    # is constructed: the given date, or Date.today when none is given.
    class DateStamp
      # @param date [Date, #to_s, nil] date to stamp with. Date instances are
      #   rendered with strftime('%FZ'); anything else is rendered with #to_s.
      def initialize(date = nil)
        @today = date || Date.today
        @today_s = @today.is_a?(Date) ? @today.strftime('%FZ') : @today.to_s
      end

      # @param obj_list [Array<Hash>] objects to stamp; mutated in place
      # @param _fname [Object] unused
      # @return [Array<Hash>] obj_list itself
      def process(obj_list, _fname)
        obj_list.each do |obj|
          # objects without any metadata get a fresh block carrying the context
          obj['metadata'] = { '@context' => ROF::RdfContext } if obj['metadata'].nil?
          metadata = obj['metadata']
          # only save the date submitted if it is not already present
          metadata['dc:dateSubmitted'] = @today_s if metadata['dc:dateSubmitted'].nil?
          # always update the date modified
          metadata['dc:modified'] = @today_s
        end
        obj_list
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module ROF
  module Filters
    # Convert any content datastream files into a bendo URL, and alter the rof
    # to use the URL and not upload the file to fedora directly. The bendo URL
    # is only generated for items having both a 'bendo-item' and a
    # 'content-file' entry. The URL is built as
    # "bendo:/item/<bendo-item>/<content-file>", i.e. it supposes the file
    # keeps the same relative path the item originally had in the rof file.
    class FileToUrl
      # @param obj_list [Array<Hash>] objects to convert; mutated in place
      # @param _fname [Object] unused
      # @return [Array<Hash>] obj_list itself
      def process(obj_list, _fname)
        obj_list.each do |obj|
          bendo_item = obj['bendo-item']
          content_file = obj['content-file']
          next unless bendo_item && content_file

          # `|| {}` also covers a 'content-meta' key that is present but nil,
          # which fetch with a default would hand back unchanged.
          content_meta = obj['content-meta'] || {}
          content_meta['URL'] = "bendo:/item/#{bendo_item}/#{content_file}"
          obj['content-meta'] = content_meta
          obj.delete('content-file')
        end
        obj_list
      end
    end
  end
end
@@ -0,0 +1,153 @@
1
+ require 'noids_client'
2
+
3
module ROF
  module Filters
    # Class Label locates in-place labels of the form
    # "$(label_name)" in the ROF file, assigns each
    # label a pid, then replaces the label with that pid.
    class Label
      # Raised when a label inside a 'rels-ext' entry has no pid assigned.
      class MissingLabel < RuntimeError
      end

      # Raised when neither :id_list nor :noid_server is supplied.
      class NoPool < RuntimeError
      end

      # Raised when the identifier source reports that it is empty.
      class OutOfIdentifiers < RuntimeError
      end

      # The first match group provides the label name.
      LABEL_RE = /\$\(([^)]+)\)/

      # Create a new label assigner and resolver. The source of identifiers
      # is given using options.
      # Use :noid_server and :pool_name to connect to an external noid server.
      # Use :id_list to pass in a ruby object responding to #shift and #empty?
      # to generate ids. This is usually a list, to facilitate testing.
      # :id_list wins when both are given.
      #
      # If prefix is not nil, then "#{prefix}:" is prepended to
      # every identifier.
      #
      # @raise [NoPool] when no identifier source is given
      def initialize(prefix, options)
        @id_list =
          if options[:id_list]
            options[:id_list]
          elsif options[:noid_server]
            NoidsPool.new(options[:noid_server], options[:pool_name])
          else
            raise NoPool, 'either :id_list or :noid_server must be supplied'
          end
        @prefix = "#{prefix}:" if prefix
      end

      # Mutate obj_list by assigning pids and resolving labels where needed.
      # Works in passes, since labels can be referenced before being defined:
      #   1. every fobject lacking a pid (or whose pid is a label) gets a pid
      #   2. label references inside fobjects are replaced by those pids
      #   3. objects with a nil or empty 'bendo-item' receive one
      #
      # @return [Array<Hash>] obj_list itself
      def process(obj_list, _fname)
        labels = {}
        obj_list.each { |obj| assign_pid(obj, labels) }
        obj_list.each { |obj| replace_labels_in_obj(obj, labels) }
        assign_bendo_items(obj_list)
        obj_list
      end

      # Assign a pid to obj, recording the label => pid mapping in labels when
      # the original pid was a label. obj is mutated. Objects which are not
      # fobjects, or which already carry a non-label pid, are skipped.
      def assign_pid(obj, labels)
        return if obj['type'] != 'fobject'

        label = nil
        unless obj['pid'].nil?
          label = find_label(obj['pid'])
          # skip if the "pid" is not a label
          return if label.nil?
        end
        pid = "#{@prefix}#{next_id}"
        obj['pid'] = pid
        labels[label] = pid unless label.nil?
      end

      # Replace any label references found in the values of obj (fobjects
      # only). obj is mutated.
      def replace_labels_in_obj(obj, labels)
        return if obj['type'] != 'fobject'
        obj.each do |key, value|
          # only force labels to exist if we are looking in the rels-ext
          obj[key] = replace_labels(value, labels, key == 'rels-ext')
        end
      end

      # Recurse through obj replacing any labels in strings
      # with the id in labels, which is a hash.
      # Arrays and hashes are updated in place; strings are replaced by new
      # strings. Hash keys are not touched (only hash values).
      # If force is true, labels which don't resolve will raise
      # a MissingLabel error.
      def replace_labels(obj, labels, force = false)
        case obj
        when Array
          obj.map! { |item| replace_labels(item, labels, force) }
        when Hash
          obj.each { |key, value| obj[key] = replace_labels(value, labels, force) }
          obj
        when String
          replace_match(obj, labels, force)
        else
          obj
        end
      end

      # Substitute every "$(name)" in the string obj that has an entry in
      # labels. Unknown labels are left as they are unless force is set.
      #
      # @raise [MissingLabel] when force is set and a label is unknown
      def replace_match(obj, labels, force)
        obj.gsub(LABEL_RE) do |match|
          pid = labels[Regexp.last_match(1)]
          raise MissingLabel, "no pid has been assigned to label #{match}" if pid.nil? && force
          pid.nil? ? match : pid
        end
      end

      # @return [String, nil] name of the first label in s, if there is one
      def find_label(s)
        s[LABEL_RE, 1]
      end

      # @return the next identifier from the configured source
      # @raise [OutOfIdentifiers] when the source reports it is empty
      def next_id
        raise OutOfIdentifiers if @id_list.empty?
        @id_list.shift
      end

      # Encapsulates connection to Noids Server
      class NoidsPool
        def initialize(noids_server, pool_name)
          @pool = NoidsClient::Connection.new(noids_server).get_pool(pool_name)
        end

        def shift
          @pool.mint.first
        end

        def empty?
          @pool.closed?
        end
      end

      private

      # For now the first pid found, stripped of any namespace, is used as the
      # bendo item id. Objects ahead of that first pid are left alone.
      def assign_bendo_items(obj_list)
        bendo_item = nil
        obj_list.each do |obj|
          if bendo_item.nil?
            bendo_item = obj['pid'].gsub(/^.*:/, '') unless obj['pid'].nil?
            next if bendo_item.nil?
          end
          # don't touch if a bendo item has already been assigned
          obj['bendo-item'] = bendo_item if obj['bendo-item'].nil? || obj['bendo-item'] == ''
        end
      end
    end
  end
end
@@ -0,0 +1,111 @@
1
+ require 'mime-types'
2
+
3
module ROF
  module Filters
    # Expand objects of type "Work(-(.+))?" into a
    # constellation of "fobjects".
    #   Makes a fobject/generic_file for each file
    #   adds a depositor
    #   turns original object into an fobject/$1
    #   and copies the access to each fobject.
    class Work
      # Raised when a file entry does not name any file.
      class NoFile < RuntimeError
      end

      def initialize
        @utility = ROF::Utility.new
      end

      # Replace every work in obj_list by the objects it expands into.
      #
      # @param obj_list [Array<Hash>] mutated in place
      # @param filename [String] handed to the utility's set_workdir
      # @return [Array<Hash>] obj_list itself
      def process(obj_list, filename)
        @utility.set_workdir(filename)
        obj_list.map! { |x| process_one_work(x) }
        # flatten! answers nil when it changes nothing (an empty list), so the
        # list is returned explicitly.
        obj_list.flatten!
        obj_list
      end

      # Given a single object, return a list (possibly empty) of new objects
      # to replace the one given. Objects for which the utility decodes no
      # work type are returned unchanged; collections are delegated to
      # ROF::Collection.
      def process_one_work(input_obj)
        model = @utility.decode_work_type(input_obj)
        return [input_obj] if model.nil?
        return [ROF::Collection.process_one_collection(input_obj, @utility)] if model == 'Collection'

        main_obj = set_main_obj(input_obj, model)

        result = [main_obj]
        result = make_thumbnail(result, main_obj, input_obj) unless input_obj['files'].nil?
        result
      end

      # Append a GenericFile fobject to result for each entry of
      # input_obj['files'], and make the first file be the representative
      # thumbnail of main_obj.
      #
      # @return [Array<Hash>] result
      # @raise [NoFile] when a file entry names no file
      def make_thumbnail(result, main_obj, input_obj)
        thumb_rep = nil
        input_obj['files'].each do |entry|
          fname, finfo = normalize_file_entry(entry)
          f_obj = build_file_object(fname, finfo, main_obj, input_obj)
          if thumb_rep.nil?
            # the first file needs a pid so the main object can refer to it
            thumb_rep = (f_obj['pid'] ||= @utility.next_label)
            main_obj['properties'] = ROF::Utility.prop_ds(input_obj['owner'], thumb_rep)
          end
          result << f_obj
        end
        result
      end

      # Build the fobject that stands for the work itself.
      def set_main_obj(input_obj, model)
        {
          'type' => 'fobject',
          'af-model' => model,
          # NOTE: next_label is called even when input_obj already has a 'pid'
          'pid' => input_obj.fetch('pid', @utility.next_label),
          'bendo-item' => input_obj['bendo-item'],
          'rights' => input_obj['rights'],
          'properties' => ROF::Utility.prop_ds(input_obj['owner']),
          'properties-meta' => { 'mime-type' => 'text/xml' },
          'rels-ext' => input_obj.fetch('rels-ext', {}),
          'metadata' => input_obj['metadata']
        }
      end

      private

      # A file entry is either a bare file name or a hash listing 'files'.
      # Returns the file name together with the entry in hash form.
      def normalize_file_entry(entry)
        return [entry, { 'files' => [entry] }] if entry.is_a?(String)

        fname = Array(entry['files']).first
        raise NoFile, "file entry does not name a file: #{entry.inspect}" if fname.nil?
        [fname, entry]
      end

      # Build the GenericFile fobject for one file. Rights, owner and bendo
      # item are inherited from the work unless finfo sets them. finfo is
      # mutated while these defaults are filled in.
      def build_file_object(fname, finfo, main_obj, input_obj)
        finfo['rights'] ||= input_obj['rights']
        finfo['owner'] ||= input_obj['owner']
        finfo['bendo-item'] ||= input_obj['bendo-item']
        finfo['metadata'] ||= { '@context' => ROF::RdfContext }
        finfo['metadata']['dc:title'] ||= fname
        mimetype = mime_type_for(fname)
        f_obj = {
          'type' => 'fobject',
          'af-model' => 'GenericFile',
          'pid' => finfo['pid'],
          'bendo-item' => finfo['bendo-item'],
          'rights' => finfo['rights'],
          'properties' => ROF::Utility.prop_ds(finfo['owner']),
          'properties-meta' => { 'mime-type' => 'text/xml' },
          'rels-ext' => { 'isPartOf' => [main_obj['pid']] },
          'content-file' => fname,
          'content-meta' => { 'label' => fname, 'mime-type' => mimetype },
          'collections' => finfo['collections'],
          'metadata' => finfo['metadata']
        }
        f_obj.delete_if { |_k, v| v.nil? }
      end

      # First MIME type registered for the file name, with a generic fallback.
      def mime_type_for(fname)
        candidates = MIME::Types.of(fname)
        candidates.empty? ? 'application/octet-stream' : candidates.first.content_type
      end
    end
  end
end
@@ -0,0 +1,196 @@
1
require 'fileutils'
require 'json'
require 'rexml/document'
require 'rdf/ntriples'
require 'rdf/rdfxml'
require 'rubydora'
6
+
7
module ROF
  # Fetches one object from Fedora through Rubydora and converts it into an
  # ROF hash. The hash under construction is kept in the class-level
  # @fedora_info, so conversions must not overlap.
  class FedoraToRof
    # Datastreams with a dedicated converter. Every other datastream (except
    # DC, which is skipped) is dumped generically. An explicit table is used
    # rather than respond_to?/send on the datastream name, so a datastream
    # called e.g. "name" or "display" can never invoke an unrelated method.
    DATASTREAM_HANDLERS = {
      'descMetadata' => :descMetadata,
      'rightsMetadata' => :rightsMetadata,
      'RELS-EXT' => :RELSEXT
    }.freeze

    # Connect to fedora and fetch the object.
    # Try to connect to fedora, and search for the desired item.
    # If either of these actions fail, print the error and exit with status 1.
    #
    # @param pid [String] identifier of the object to fetch
    # @param fedora connection settings handed to Rubydora.connect
    # @param config [Hash] options; the keys read are 'inline', 'download',
    #   'download_path' and 'bendo'
    # @return [Hash] the ROF representation of the object
    def self.GetFromFedora(pid, fedora, config)
      @fedora_info = {}

      begin
        fedora = Rubydora.connect(fedora)
        doc = fedora.find(pid)
      rescue StandardError => e
        puts "Error: #{e}"
        exit 1
      end

      # set pid, type
      @fedora_info['pid'] = pid
      @fedora_info['type'] = 'fobject'

      readFedora(doc, config)

      @fedora_info
    end

    # Given a rubydora object, extract what we need
    # to create our ROF object in an associative array.
    def self.readFedora(rdora_obj, config)
      @fedora_info['af-model'] = setModel(rdora_obj)
      rdora_obj.datastreams.each do |dsname, ds|
        next if dsname == 'DC'
        handler = DATASTREAM_HANDLERS[dsname]
        if handler
          send(handler, ds, config)
        else
          dump_datastream(dsname, ds, config)
        end
      end
    end

    # Collect label, mime type and (for datastreams in control group R or E)
    # the location URL of a datastream.
    #
    # @return [Hash] may be empty
    def self.create_meta(ds, config)
      result = {}

      label = ds.profile['dsLabel']
      result['label'] = label unless label.nil? || label == ''
      result['mime-type'] = ds.profile['dsMIME'] if ds.profile['dsMIME'] != 'text/plain'
      # TODO(dbrower): make sure this is working as intended
      if %w(R E).include?(ds.profile['dsControlGroup'])
        url = ds.profile['dsLocation']
        # abbreviate the configured bendo server back to the "bendo:" marker
        url = url.sub(config['bendo'], 'bendo:') if url && config['bendo']
        result['URL'] = url
      end
      result
    end

    # Find the value for fedora_info['af-model'].
    # Only models of the form info:fedora/afmodel:XXXXX are considered.
    #
    # @return [String, nil] the XXXXX part of the first such model
    def self.setModel(rdora_obj)
      Array(rdora_obj.profile['objModels'])
        .map { |model| model[%r{^info:fedora/afmodel:(.*)}, 1] }
        .compact
        .first
    end

    # The methods below are called if the like-named datastream exists in fedora

    # set metadata
    #
    def self.descMetadata(ds, _config)
      # descMetadata is encoded in ntriples, convert to JSON-LD using our special context
      graph = RDF::Graph.new
      data = ds.datastream_content
      # force utf-8 encoding. fedora does not store the encoding, so it defaults to ASCII-8BIT
      # see https://github.com/ruby-rdf/rdf/issues/142
      data.force_encoding('utf-8')
      graph.from_ntriples(data, format: :ntriples)
      JSON::LD::API.fromRdf(graph) do |expanded|
        result = JSON::LD::API.compact(expanded, RdfContext)
        @fedora_info['metadata'] = result
      end
    end

    # set rights
    #
    # rights is an XML document.
    # The access elements considered are those of type read or edit;
    # each of these may contain group or person elements inside a machine
    # element. Groups end up under "<access>-groups", people under
    # "<access>", each as a list holding the text of every matching element.
    def self.rightsMetadata(ds, _config)
      xml_doc = REXML::Document.new(ds.datastream_content)
      rights = {}
      root = xml_doc.root

      unless root.nil?
        %w(read edit).each do |access|
          this_access = root.elements["//access[@type='#{access}']"]
          machine = this_access && this_access.elements['machine']
          next if machine.nil?

          # get_elements returns every match; elements['group'] would only
          # give the first one.
          groups = machine.get_elements('group')
          rights["#{access}-groups"] = groups.map(&:text).compact unless groups.empty?

          people = machine.get_elements('person')
          rights[access] = people.map(&:text).compact unless people.empty?
        end
      end

      @fedora_info['rights'] = rights
    end

    def self.RELSEXT(ds, _config)
      # RELS-EXT is RDF-XML - parse it
      ctx = ROF::RelsExtRefContext.dup
      ctx.delete('@base') # @base causes problems when converting TO json-ld (it is = "info:/fedora") but info is not a namespace
      graph = RDF::Graph.new
      graph.from_rdfxml(ds.datastream_content)
      result = nil
      JSON::LD::API.fromRdf(graph) do |expanded|
        result = JSON::LD::API.compact(expanded, ctx)
      end
      # now strip the info:fedora/ prefix from the URIs
      strip_info_fedora(result)
      # remove extra items
      result.delete('hasModel')
      @fedora_info['rels-ext'] = result
    end

    # Dump a datastream which has no dedicated converter: its metadata goes
    # under "<name>-meta", and its content is either stored inline or saved
    # to a file, depending on its size and on config.
    def self.dump_datastream(dsname, ds, config)
      meta = create_meta(ds, config)
      @fedora_info["#{dsname}-meta"] = meta unless meta.empty?

      # TODO(dbrower): change dump algorithm:
      #   if content is short < X bytes, save as string
      #   if content is > X bytes, save as file only if config option is given
      # NOTE- Entire datastream being downloaded every time.
      content = ds.datastream_content
      return if content.nil?

      content_string = content.to_s.force_encoding('UTF-8')
      if (content.length <= 1024 || config['inline']) && content_string.valid_encoding?
        @fedora_info[dsname] = content_string
      elsif config['download']
        save_datastream(dsname, content, config['download_path'])
      end
    end

    # Write content to "<pid>-<dsname>" inside download_path, and record the
    # file name under "<dsname>-file". Prints an error and exits with status
    # 1 when download_path is an existing file.
    def self.save_datastream(dsname, content, download_path)
      fname = "#{@fedora_info['pid']}-#{dsname}"
      @fedora_info["#{dsname}-file"] = fname
      if File.file?(download_path)
        puts "Error: --download directory #{download_path} specified is an existing file."
        exit 1
      end
      FileUtils.mkdir_p(download_path)
      # binary mode: the content is written out byte for byte
      File.open(File.join(download_path, fname), 'wb') { |f| f.write(content) }
    end

    # Remove the "info:fedora/" prefix from every target in rels_ext,
    # descending into nested hashes. The '@context' entry is left alone.
    # rels_ext is mutated.
    def self.strip_info_fedora(rels_ext)
      rels_ext.each do |relation, targets|
        next if relation == '@context'
        if targets.is_a?(Hash)
          strip_info_fedora(targets)
          next
        end
        targets = [targets] if targets.is_a?(String)
        targets.map! do |target|
          if target.is_a?(Hash)
            strip_info_fedora(target)
          else
            target.sub('info:fedora/', '')
          end
        end
        # some single strings cannot be arrays in json-ld, so convert back
        # this shouldn't cause any problems with items that began as arrays
        targets = targets[0] if targets.length == 1
        rels_ext[relation] = targets
      end
    end

    # A bare `private` does not apply to methods defined with `def self.`.
    private_class_method :dump_datastream, :save_datastream, :strip_info_fedora
  end
end