rof 1.0.7 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +9 -7
  3. data/LICENSE +201 -16
  4. data/Rakefile +46 -0
  5. data/bin/csv_to_rof +1 -2
  6. data/bin/fedora_to_rof +7 -1
  7. data/bin/jsonld_to_rof +26 -0
  8. data/bin/osf_to_rof +6 -2
  9. data/bin/rof +5 -19
  10. data/lib/rof.rb +2 -6
  11. data/lib/rof/access.rb +1 -1
  12. data/lib/rof/cli.rb +104 -67
  13. data/lib/rof/compare_rof.rb +68 -39
  14. data/lib/rof/filter.rb +21 -0
  15. data/lib/rof/filters.rb +38 -0
  16. data/lib/rof/filters/bendo.rb +15 -17
  17. data/lib/rof/filters/date_stamp.rb +5 -4
  18. data/lib/rof/filters/file_to_url.rb +5 -3
  19. data/lib/rof/filters/label.rb +9 -7
  20. data/lib/rof/filters/work.rb +7 -5
  21. data/lib/rof/ingest.rb +5 -0
  22. data/lib/rof/osf_context.rb +2 -2
  23. data/lib/rof/rdf_context.rb +2 -0
  24. data/lib/rof/translator.rb +18 -0
  25. data/lib/rof/translators.rb +23 -0
  26. data/lib/rof/{translate_csv.rb → translators/csv_to_rof.rb} +4 -3
  27. data/lib/rof/translators/fedora_to_rof.rb +244 -0
  28. data/lib/rof/translators/jsonld_to_rof.rb +112 -0
  29. data/lib/rof/translators/jsonld_to_rof/accumulator.rb +175 -0
  30. data/lib/rof/translators/jsonld_to_rof/predicate_handler.rb +223 -0
  31. data/lib/rof/translators/jsonld_to_rof/predicate_object_handler.rb +125 -0
  32. data/lib/rof/translators/jsonld_to_rof/statement_handler.rb +91 -0
  33. data/lib/rof/translators/osf_to_rof.rb +191 -0
  34. data/lib/rof/utility.rb +44 -1
  35. data/lib/rof/version.rb +1 -1
  36. data/rof.gemspec +10 -2
  37. data/spec/coverage_helper.rb +17 -0
  38. data/spec/fixtures/for_utility_load_items_from_json_file/multiple_items.json +8 -0
  39. data/spec/fixtures/for_utility_load_items_from_json_file/parse_error.json +3 -0
  40. data/spec/fixtures/for_utility_load_items_from_json_file/single_item.json +3 -0
  41. data/spec/fixtures/jsonld_to_rof/0g354f18610.jsonld +113 -0
  42. data/spec/fixtures/jsonld_to_rof/0g354f18610.rof +96 -0
  43. data/spec/fixtures/jsonld_to_rof/2j62s467216.jsonld +113 -0
  44. data/spec/fixtures/jsonld_to_rof/2j62s467216.rof +93 -0
  45. data/spec/fixtures/jsonld_to_rof/2v23vt16z2z.jsonld +70 -0
  46. data/spec/fixtures/jsonld_to_rof/2v23vt16z2z.rof +87 -0
  47. data/spec/fixtures/jsonld_to_rof/cr56n01253w.jsonld +84 -0
  48. data/spec/fixtures/jsonld_to_rof/cr56n01253w.rof +95 -0
  49. data/spec/fixtures/jsonld_to_rof/h989r21069m.jsonld +84 -0
  50. data/spec/fixtures/jsonld_to_rof/h989r21069m.rof +98 -0
  51. data/spec/fixtures/jsonld_to_rof/js956d59913.jsonld +79 -0
  52. data/spec/fixtures/jsonld_to_rof/js956d59913.rof +89 -0
  53. data/spec/fixtures/jsonld_to_rof/m039k358q5c.jsonld +80 -0
  54. data/spec/fixtures/jsonld_to_rof/m039k358q5c.rof +64 -0
  55. data/spec/fixtures/jsonld_to_rof/nk322b9161g.jsonld +89 -0
  56. data/spec/fixtures/jsonld_to_rof/nk322b9161g.rof +69 -0
  57. data/spec/fixtures/jsonld_to_rof/p8418k7430d.jsonld +84 -0
  58. data/spec/fixtures/jsonld_to_rof/p8418k7430d.rof +67 -0
  59. data/spec/fixtures/jsonld_to_rof/xg94hm53h0c.jsonld +98 -0
  60. data/spec/fixtures/jsonld_to_rof/xg94hm53h0c.rof +110 -0
  61. data/spec/fixtures/jsonld_to_rof/zk51vd69n1r.jsonld +94 -0
  62. data/spec/fixtures/jsonld_to_rof/zk51vd69n1r.rof +121 -0
  63. data/spec/fixtures/osf/phz6b.tar.gz +0 -0
  64. data/spec/lib/rof/access_spec.rb +30 -23
  65. data/spec/lib/rof/cli_spec.rb +83 -60
  66. data/spec/lib/rof/compare_rof_spec.rb +35 -24
  67. data/spec/lib/rof/filter_spec.rb +10 -0
  68. data/spec/lib/rof/filters/bendo_spec.rb +42 -0
  69. data/spec/lib/rof/filters/date_stamp_spec.rb +9 -5
  70. data/spec/lib/rof/filters/file_to_url_spec.rb +7 -3
  71. data/spec/lib/rof/filters/label_spec.rb +121 -77
  72. data/spec/lib/rof/filters/work_spec.rb +7 -4
  73. data/spec/lib/rof/filters_spec.rb +14 -0
  74. data/spec/lib/rof/translator_spec.rb +15 -0
  75. data/spec/lib/rof/{translate_csv_spec.rb → translators/csv_to_rof_spec.rb} +14 -14
  76. data/spec/lib/rof/translators/fedora_to_rof_spec.rb +64 -0
  77. data/spec/lib/rof/translators/jsonld_to_rof/accumulator_spec.rb +121 -0
  78. data/spec/lib/rof/translators/jsonld_to_rof/predicate_handler_spec.rb +73 -0
  79. data/spec/lib/rof/translators/jsonld_to_rof/predicate_object_handler_spec.rb +48 -0
  80. data/spec/lib/rof/translators/jsonld_to_rof/statement_handler_spec.rb +40 -0
  81. data/spec/lib/rof/translators/jsonld_to_rof_spec.rb +120 -0
  82. data/spec/lib/rof/{osf_to_rof_spec.rb → translators/osf_to_rof_spec.rb} +55 -25
  83. data/spec/lib/rof/translators_spec.rb +14 -0
  84. data/spec/lib/rof/utility_spec.rb +47 -1
  85. data/spec/spec_helper.rb +1 -1
  86. data/spec/support/an_rof_filter.rb +10 -0
  87. metadata +186 -15
  88. data/lib/rof/get_from_fedora.rb +0 -211
  89. data/lib/rof/osf_to_rof.rb +0 -123
  90. data/spec/lib/rof/get_from_fedora_spec.rb +0 -22
@@ -1,211 +0,0 @@
1
- require 'json'
2
- require 'rexml/document'
3
- require 'rdf/ntriples'
4
- require 'rdf/rdfxml'
5
- require 'rubydora'
6
-
7
- module ROF
8
- class FedoraToRof
9
- # connect to fedora and fetch object
10
- # returns a hash of fedora attributes (exits the process if the fedora connection or lookup fails)
11
- def self.GetFromFedora(pid, fedora, config)
12
- @fedora_info = {}
13
-
14
- # Try to connect to fedora, and search for the desired item
15
- # If either of these actions fail, handle it, and exit.
16
- begin
17
- fedora = Rubydora.connect(fedora)
18
- doc = fedora.find(pid)
19
- rescue StandardError => e
20
- puts "Error: #{e}"
21
- exit 1
22
- end
23
-
24
- # set pid, type
25
- @fedora_info['pid'] = pid
26
- @fedora_info['type'] = 'fobject'
27
-
28
- readFedora(doc, config)
29
-
30
- @fedora_info
31
- end
32
-
33
- # Given a rubydora object, extract what we need
34
- # to create our ROF object in an associative array
35
- #
36
- def self.readFedora(rdora_obj, config)
37
- @fedora_info['af-model'] = setModel(rdora_obj)
38
- # iterate through the data streams that are present.
39
- # use reflection to call appropriate method for each
40
- rdora_obj.datastreams.each do |dsname, ds|
41
- next if dsname == 'DC'
42
- method_key = dsname.sub('-', '')
43
- if respond_to?(method_key)
44
- send(method_key, ds, config)
45
- else
46
- # dump generic datastream
47
- meta = create_meta(ds, config)
48
- @fedora_info["#{dsname}-meta"] = meta unless meta.empty?
49
-
50
- # if content is short < X bytes and valid utf-8, save as string
51
- # if content is > X bytes or is not utf-8, save as file only if config option is given
52
- content = ds.datastream_content
53
- if content.length <= 1024 || config['inline']
54
- # this downloads the contents of the datastream into memory
55
- content_string = content.to_s.force_encoding('UTF-8')
56
- if content_string.valid_encoding?
57
- @fedora_info[dsname] = content_string
58
- next # we're done! move on to next datastream
59
- end
60
- # not utf-8, so keep going and see if download option was given
61
- end
62
- next unless config['download']
63
- # download option was given, so save this datastream as a file
64
- fname = "#{@fedora_info['pid']}-#{dsname}"
65
- abspath = File.join(config['download_path'], fname)
66
- @fedora_info["#{dsname}-file"] = fname
67
- if File.file?(config['download_path'])
68
- puts "Error: --download directory #{config['download_path']} specified is an existing file."
69
- exit 1
70
- end
71
- FileUtils.mkdir_p(config['download_path'])
72
- File.open(abspath, 'w') do |f|
73
- f.write(content)
74
- end
75
- end
76
- end
77
- end
78
-
79
- def self.create_meta(ds, config)
80
- result = {}
81
-
82
- label = ds.profile['dsLabel']
83
- result['label'] = label unless label.nil? || label == ''
84
- result['mime-type'] = ds.profile['dsMIME'] if ds.profile['dsMIME'] != 'text/plain'
85
- # TODO(dbrower): make sure this is working as intended
86
- if %w(R E).include?(ds.profile['dsControlGroup'])
87
- s = result['URL'] = ds.profile['dsLocation']
88
- s = s.sub(config['bendo'], 'bendo:') if config['bendo']
89
- result['URL'] = s
90
- end
91
- result
92
- end
93
-
94
- # set fedora_info['af-model']
95
- #
96
- def self.setModel(rdora_obj)
97
- # only keep info:fedora/afmodel:XXXXX
98
- models = rdora_obj.profile['objModels'].map do |model|
99
- Regexp.last_match(1) if model =~ /^info:fedora\/afmodel:(.*)/
100
- end.compact
101
- models[0]
102
- end
103
-
104
- # The methods below are called if the like-named datastream exists in fedora
105
-
106
- # set metadata
107
- #
108
- def self.descMetadata(ds, _config)
109
- # descMetadata is encoded in ntriples, convert to JSON-LD using our special context
110
- graph = RDF::Graph.new
111
- data = ds.datastream_content
112
- # force utf-8 encoding. fedora does not store the encoding, so it defaults to ASCII-8BIT
113
- # see https://github.com/ruby-rdf/rdf/issues/142
114
- data.force_encoding('utf-8')
115
- graph.from_ntriples(data, format: :ntriples)
116
- JSON::LD::API.fromRdf(graph) do |expanded|
117
- result = JSON::LD::API.compact(expanded, RdfContext)
118
- @fedora_info['metadata'] = result
119
- end
120
- end
121
-
122
- # set rights
123
- #
124
- def self.rightsMetadata(ds, _config)
125
- # rights is an XML document
126
- # the access array may have read or edit elements
127
- # each of these elements may contain group or person elements
128
- xml_doc = REXML::Document.new(ds.datastream_content)
129
-
130
- rights_array = {}
131
-
132
- root = xml_doc.root
133
-
134
- # check for optional embargo date - set if present
135
- this_embargo = root.elements['embargo']
136
- rights_array['embargo-date'] = this_embargo.elements['machine'].elements['date'][0] if has_embargo_date(this_embargo)
137
-
138
- %w(read edit).each do |access|
139
- this_access = root.elements["//access[@type=\'#{access}\']"]
140
-
141
- next if this_access.nil?
142
-
143
- unless this_access.elements['machine'].elements['group'].nil?
144
- group_array = []
145
- this_access.elements['machine'].elements['group'].each do |this_group|
146
- group_array << this_group
147
- end
148
- rights_array["#{access}-groups"] = group_array
149
- end
150
-
151
- next if this_access.elements['machine'].elements['person'].nil?
152
- person_array = []
153
-
154
- this_access.elements['machine'].elements['person'].each do |this_person|
155
- person_array << this_person
156
- end
157
- rights_array[access.to_s] = person_array
158
- end
159
-
160
- @fedora_info['rights'] = rights_array
161
- end
162
-
163
- # test for embargo xml cases
164
- def self.has_embargo_date(embargo_xml)
165
- return false if embargo_xml == '' || embargo_xml.nil?
166
- return false unless embargo_xml.elements['machine'].has_elements? && embargo_xml.elements['machine'].elements['date'].has_text?
167
- true
168
- end
169
-
170
- def self.RELSEXT(ds, _config)
171
- # RELS-EXT is RDF-XML - parse it
172
- ctx = ROF::RelsExtRefContext.dup
173
- ctx.delete('@base') # @base causes problems when converting TO json-ld (it is = "info:/fedora") but info is not a namespace
174
- graph = RDF::Graph.new
175
- graph.from_rdfxml(ds.datastream_content)
176
- result = nil
177
- JSON::LD::API.fromRdf(graph) do |expanded|
178
- result = JSON::LD::API.compact(expanded, ctx)
179
- end
180
- # now strip the info:fedora/ prefix from the URIs
181
- strip_info_fedora(result)
182
- # remove extra items
183
- result.delete('hasModel')
184
- @fedora_info['rels-ext'] = result
185
- end
186
-
187
- private
188
-
189
- def self.strip_info_fedora(rels_ext)
190
- rels_ext.each do |relation, targets|
191
- next if relation == '@context'
192
- if targets.is_a?(Hash)
193
- strip_info_fedora(targets)
194
- next
195
- end
196
- targets = [targets] if targets.is_a?(String)
197
- targets.map! do |target|
198
- if target.is_a?(Hash)
199
- strip_info_fedora(target)
200
- else
201
- target.sub('info:fedora/', '')
202
- end
203
- end
204
- # some single strings cannot be arrays in json-ld, so convert back
205
- # this shouldn't cause any problems with items that began as arrays
206
- targets = targets[0] if targets.length == 1
207
- rels_ext[relation] = targets
208
- end
209
- end
210
- end
211
- end
@@ -1,123 +0,0 @@
1
- require 'json'
2
- require 'zlib'
3
- require 'rubygems/package'
4
- require 'rdf/turtle'
5
- require 'rof/osf_context'
6
- require 'rof/rdf_context'
7
- require 'rof/utility'
8
-
9
- module ROF
10
- # Class for managing OSF Archive data transformations
11
- # It is called after the get-from-osf task, and before the work-xlat task
12
- class OsfToRof
13
- # Convert Osf Archive tar.gz to ROF
14
- def self.osf_to_rof(config, osf_projects = nil)
15
- @osf_map = ROF::OsfToNDMap
16
- rof_array = []
17
- return {} if osf_projects.nil?
18
- this_project = osf_projects
19
- ttl_data = ttl_from_targz(config, this_project,
20
- this_project['project_identifier'] + '.ttl')
21
- rof_array[0] = build_archive_record(config, this_project, ttl_data)
22
- rof_array
23
- end
24
-
25
- # reads a ttl file and makes it a JSON-LD file that we can parse
26
- def self.fetch_from_ttl(ttl_file)
27
- graph = RDF::Turtle::Reader.open(ttl_file,
28
- prefixes: ROF::OsfPrefixList.dup)
29
- JSON::LD::API.fromRdf(graph)
30
- end
31
-
32
- # extracts given ttl file from JHU tar.gz package
33
- # - assumed to live under data/obj/root
34
- def self.ttl_from_targz(config, this_project, ttl_filename)
35
- id = this_project['project_identifier']
36
- ttl_path = File.join(id,
37
- 'data/obj/root',
38
- ttl_filename)
39
- ROF::Utility.file_from_targz(File.join(config['package_dir'], id + '.tar.gz'),
40
- ttl_path)
41
- ttl_data = fetch_from_ttl(File.join(config['package_dir'], ttl_path))
42
- # this is an array - the additional elements are the contributor(s)
43
- ttl_data
44
- end
45
-
46
- # Maps RELS-EXT
47
- def self.map_rels_ext(_ttl_data)
48
- rels_ext = {}
49
- rels_ext['@context'] = ROF::RelsExtRefContext.dup
50
- rels_ext
51
- end
52
-
53
- # sets metadata
54
- def self.map_metadata(config, project, ttl_data)
55
- metadata = {}
56
- metadata['@context'] = ROF::RdfContext.dup
57
- # metadata derived from project ttl file
58
- metadata['dc:created'] = Time.iso8601(ttl_data[0][@osf_map['dc:created']][0]['@value']).to_date.iso8601 + 'Z'
59
- metadata['dc:title'] = ttl_data[0][@osf_map['dc:title']][0]['@value']
60
- metadata['dc:description'] =
61
- ttl_data[0][@osf_map['dc:description']][0]['@value']
62
- metadata['dc:subject'] = map_subject(ttl_data[0])
63
- # metadata derived from osf_projects data, passed from UI
64
- metadata['dc:source'] = 'https://osf.io/' + project['project_identifier']
65
- metadata['dc:creator#adminstrative_unit'] = project['administrative_unit']
66
- metadata['dc:creator#affiliation'] = project['affiliation']
67
- metadata['dc:creator'] = map_creator(config, project, ttl_data)
68
- metadata
69
- end
70
-
71
- # Constructs OsfArchive Record from ttl_data, data from the UI form,
72
- # and task config data
73
- def self.build_archive_record(config, this_project, ttl_data)
74
- this_rof = {}
75
- this_rof['owner'] = this_project['owner']
76
- this_rof['type'] = 'OsfArchive'
77
- this_rof['rights'] = map_rights(ttl_data[0])
78
- this_rof['rels-ext'] = map_rels_ext(ttl_data[0])
79
- this_rof['metadata'] = map_metadata(config, this_project, ttl_data)
80
- this_rof['files'] = [this_project['project_identifier'] + '.tar.gz']
81
- this_rof
82
- end
83
-
84
- # sets subject
85
- def self.map_subject(ttl_data)
86
- if ttl_data.key?(@osf_map['dc:subject'])
87
- return ttl_data[@osf_map['dc:subject']][0]['@value']
88
- end
89
- ''
90
- end
91
-
92
- # figures out the rights
93
- def self.map_rights(ttl_data)
94
- rights = {}
95
- if ttl_data[@osf_map['isPublic']][0]['@value'] == 'true'
96
- rights['read-groups'] = ['public']
97
- end
98
- rights
99
- end
100
-
101
- # sets the creator- needs to read another ttl for the User data
102
- # only contrubutors with isBibliographic true are considered
103
- def self.map_creator(config, project, ttl_data)
104
- creator = []
105
- ttl_data[0][@osf_map['hasContributor']].each do |contributor|
106
- ttl_data.each do |item|
107
- next unless item['@id'] == contributor['@id']
108
- if item[@osf_map['isBibliographic']][0]['@value'] == 'true'
109
- creator.push map_user_from_ttl(config, project,
110
- item[@osf_map['hasUser']][0]['@id'])
111
- end
112
- end
113
- end
114
- creator
115
- end
116
-
117
- # read user ttl file, extract User's full name
118
- def self.map_user_from_ttl(config, project, file_subpath)
119
- ttl_data = ttl_from_targz(config, project, File.basename(file_subpath))
120
- ttl_data[0][@osf_map['hasFullName']][0]['@value']
121
- end
122
- end
123
- end
@@ -1,22 +0,0 @@
1
- require 'spec_helper'
2
-
3
- RSpec.describe ROF::FedoraToRof do
4
- it 'handles embargo presence or absence' do
5
-
6
- rights_tests = [
7
- ['<embargo> <human/> <machine> <date>2017-08-01</date> </machine> </embargo>', true],
8
- ['<embargo> <human/> <machine> <date></date> </machine> </embargo>', false],
9
- ['<embargo> <human/> <machine/> </embargo>', false]
10
- ]
11
-
12
- begin
13
-
14
- rights_tests.each do |this_test|
15
- xml_doc = REXML::Document.new(this_test[0])
16
- root = xml_doc.root
17
- rights = ROF::FedoraToRof.has_embargo_date(root)
18
- expect(rights).to eq(this_test[1])
19
- end
20
- end
21
- end
22
- end