rof 1.0.7 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +9 -7
  3. data/LICENSE +201 -16
  4. data/Rakefile +46 -0
  5. data/bin/csv_to_rof +1 -2
  6. data/bin/fedora_to_rof +7 -1
  7. data/bin/jsonld_to_rof +26 -0
  8. data/bin/osf_to_rof +6 -2
  9. data/bin/rof +5 -19
  10. data/lib/rof.rb +2 -6
  11. data/lib/rof/access.rb +1 -1
  12. data/lib/rof/cli.rb +104 -67
  13. data/lib/rof/compare_rof.rb +68 -39
  14. data/lib/rof/filter.rb +21 -0
  15. data/lib/rof/filters.rb +38 -0
  16. data/lib/rof/filters/bendo.rb +15 -17
  17. data/lib/rof/filters/date_stamp.rb +5 -4
  18. data/lib/rof/filters/file_to_url.rb +5 -3
  19. data/lib/rof/filters/label.rb +9 -7
  20. data/lib/rof/filters/work.rb +7 -5
  21. data/lib/rof/ingest.rb +5 -0
  22. data/lib/rof/osf_context.rb +2 -2
  23. data/lib/rof/rdf_context.rb +2 -0
  24. data/lib/rof/translator.rb +18 -0
  25. data/lib/rof/translators.rb +23 -0
  26. data/lib/rof/{translate_csv.rb → translators/csv_to_rof.rb} +4 -3
  27. data/lib/rof/translators/fedora_to_rof.rb +244 -0
  28. data/lib/rof/translators/jsonld_to_rof.rb +112 -0
  29. data/lib/rof/translators/jsonld_to_rof/accumulator.rb +175 -0
  30. data/lib/rof/translators/jsonld_to_rof/predicate_handler.rb +223 -0
  31. data/lib/rof/translators/jsonld_to_rof/predicate_object_handler.rb +125 -0
  32. data/lib/rof/translators/jsonld_to_rof/statement_handler.rb +91 -0
  33. data/lib/rof/translators/osf_to_rof.rb +191 -0
  34. data/lib/rof/utility.rb +44 -1
  35. data/lib/rof/version.rb +1 -1
  36. data/rof.gemspec +10 -2
  37. data/spec/coverage_helper.rb +17 -0
  38. data/spec/fixtures/for_utility_load_items_from_json_file/multiple_items.json +8 -0
  39. data/spec/fixtures/for_utility_load_items_from_json_file/parse_error.json +3 -0
  40. data/spec/fixtures/for_utility_load_items_from_json_file/single_item.json +3 -0
  41. data/spec/fixtures/jsonld_to_rof/0g354f18610.jsonld +113 -0
  42. data/spec/fixtures/jsonld_to_rof/0g354f18610.rof +96 -0
  43. data/spec/fixtures/jsonld_to_rof/2j62s467216.jsonld +113 -0
  44. data/spec/fixtures/jsonld_to_rof/2j62s467216.rof +93 -0
  45. data/spec/fixtures/jsonld_to_rof/2v23vt16z2z.jsonld +70 -0
  46. data/spec/fixtures/jsonld_to_rof/2v23vt16z2z.rof +87 -0
  47. data/spec/fixtures/jsonld_to_rof/cr56n01253w.jsonld +84 -0
  48. data/spec/fixtures/jsonld_to_rof/cr56n01253w.rof +95 -0
  49. data/spec/fixtures/jsonld_to_rof/h989r21069m.jsonld +84 -0
  50. data/spec/fixtures/jsonld_to_rof/h989r21069m.rof +98 -0
  51. data/spec/fixtures/jsonld_to_rof/js956d59913.jsonld +79 -0
  52. data/spec/fixtures/jsonld_to_rof/js956d59913.rof +89 -0
  53. data/spec/fixtures/jsonld_to_rof/m039k358q5c.jsonld +80 -0
  54. data/spec/fixtures/jsonld_to_rof/m039k358q5c.rof +64 -0
  55. data/spec/fixtures/jsonld_to_rof/nk322b9161g.jsonld +89 -0
  56. data/spec/fixtures/jsonld_to_rof/nk322b9161g.rof +69 -0
  57. data/spec/fixtures/jsonld_to_rof/p8418k7430d.jsonld +84 -0
  58. data/spec/fixtures/jsonld_to_rof/p8418k7430d.rof +67 -0
  59. data/spec/fixtures/jsonld_to_rof/xg94hm53h0c.jsonld +98 -0
  60. data/spec/fixtures/jsonld_to_rof/xg94hm53h0c.rof +110 -0
  61. data/spec/fixtures/jsonld_to_rof/zk51vd69n1r.jsonld +94 -0
  62. data/spec/fixtures/jsonld_to_rof/zk51vd69n1r.rof +121 -0
  63. data/spec/fixtures/osf/phz6b.tar.gz +0 -0
  64. data/spec/lib/rof/access_spec.rb +30 -23
  65. data/spec/lib/rof/cli_spec.rb +83 -60
  66. data/spec/lib/rof/compare_rof_spec.rb +35 -24
  67. data/spec/lib/rof/filter_spec.rb +10 -0
  68. data/spec/lib/rof/filters/bendo_spec.rb +42 -0
  69. data/spec/lib/rof/filters/date_stamp_spec.rb +9 -5
  70. data/spec/lib/rof/filters/file_to_url_spec.rb +7 -3
  71. data/spec/lib/rof/filters/label_spec.rb +121 -77
  72. data/spec/lib/rof/filters/work_spec.rb +7 -4
  73. data/spec/lib/rof/filters_spec.rb +14 -0
  74. data/spec/lib/rof/translator_spec.rb +15 -0
  75. data/spec/lib/rof/{translate_csv_spec.rb → translators/csv_to_rof_spec.rb} +14 -14
  76. data/spec/lib/rof/translators/fedora_to_rof_spec.rb +64 -0
  77. data/spec/lib/rof/translators/jsonld_to_rof/accumulator_spec.rb +121 -0
  78. data/spec/lib/rof/translators/jsonld_to_rof/predicate_handler_spec.rb +73 -0
  79. data/spec/lib/rof/translators/jsonld_to_rof/predicate_object_handler_spec.rb +48 -0
  80. data/spec/lib/rof/translators/jsonld_to_rof/statement_handler_spec.rb +40 -0
  81. data/spec/lib/rof/translators/jsonld_to_rof_spec.rb +120 -0
  82. data/spec/lib/rof/{osf_to_rof_spec.rb → translators/osf_to_rof_spec.rb} +55 -25
  83. data/spec/lib/rof/translators_spec.rb +14 -0
  84. data/spec/lib/rof/utility_spec.rb +47 -1
  85. data/spec/spec_helper.rb +1 -1
  86. data/spec/support/an_rof_filter.rb +10 -0
  87. metadata +186 -15
  88. data/lib/rof/get_from_fedora.rb +0 -211
  89. data/lib/rof/osf_to_rof.rb +0 -123
  90. data/spec/lib/rof/get_from_fedora_spec.rb +0 -22
@@ -1,211 +0,0 @@
1
- require 'json'
2
- require 'rexml/document'
3
- require 'rdf/ntriples'
4
- require 'rdf/rdfxml'
5
- require 'rubydora'
6
-
7
- module ROF
8
- class FedoraToRof
9
- # connect to fedora and fetch object
10
- # returns array of fedora attributes or nil
11
- def self.GetFromFedora(pid, fedora, config)
12
- @fedora_info = {}
13
-
14
- # Try to connect to fedora, and search for the desired item
15
- # If either of these actions fail, handle it, and exit.
16
- begin
17
- fedora = Rubydora.connect(fedora)
18
- doc = fedora.find(pid)
19
- rescue StandardError => e
20
- puts "Error: #{e}"
21
- exit 1
22
- end
23
-
24
- # set pid, type
25
- @fedora_info['pid'] = pid
26
- @fedora_info['type'] = 'fobject'
27
-
28
- readFedora(doc, config)
29
-
30
- @fedora_info
31
- end
32
-
33
- # Given a rubydora object, extract what we need
34
- # to create our ROF object in an associative array
35
- #
36
- def self.readFedora(rdora_obj, config)
37
- @fedora_info['af-model'] = setModel(rdora_obj)
38
- # iterate through the data streams that are present.
39
- # use reflection to call appropriate method for each
40
- rdora_obj.datastreams.each do |dsname, ds|
41
- next if dsname == 'DC'
42
- method_key = dsname.sub('-', '')
43
- if respond_to?(method_key)
44
- send(method_key, ds, config)
45
- else
46
- # dump generic datastream
47
- meta = create_meta(ds, config)
48
- @fedora_info["#{dsname}-meta"] = meta unless meta.empty?
49
-
50
- # if content is short < X bytes and valid utf-8, save as string
51
- # if content is > X bytes or is not utf-8, save as file only if config option is given
52
- content = ds.datastream_content
53
- if content.length <= 1024 || config['inline']
54
- # this downloads the contents of the datastream into memory
55
- content_string = content.to_s.force_encoding('UTF-8')
56
- if content_string.valid_encoding?
57
- @fedora_info[dsname] = content_string
58
- next # we're done! move on to next datastream
59
- end
60
- # not utf-8, so keep going and see if download option was given
61
- end
62
- next unless config['download']
63
- # download option was given, so save this datastream as a file
64
- fname = "#{@fedora_info['pid']}-#{dsname}"
65
- abspath = File.join(config['download_path'], fname)
66
- @fedora_info["#{dsname}-file"] = fname
67
- if File.file?(config['download_path'])
68
- puts "Error: --download directory #{config['download_path']} specified is an existing file."
69
- exit 1
70
- end
71
- FileUtils.mkdir_p(config['download_path'])
72
- File.open(abspath, 'w') do |f|
73
- f.write(content)
74
- end
75
- end
76
- end
77
- end
78
-
79
- def self.create_meta(ds, config)
80
- result = {}
81
-
82
- label = ds.profile['dsLabel']
83
- result['label'] = label unless label.nil? || label == ''
84
- result['mime-type'] = ds.profile['dsMIME'] if ds.profile['dsMIME'] != 'text/plain'
85
- # TODO(dbrower): make sure this is working as intended
86
- if %w(R E).include?(ds.profile['dsControlGroup'])
87
- s = result['URL'] = ds.profile['dsLocation']
88
- s = s.sub(config['bendo'], 'bendo:') if config['bendo']
89
- result['URL'] = s
90
- end
91
- result
92
- end
93
-
94
- # set fedora_indo['af-model']
95
- #
96
- def self.setModel(rdora_obj)
97
- # only keep info:fedora/afmodel:XXXXX
98
- models = rdora_obj.profile['objModels'].map do |model|
99
- Regexp.last_match(1) if model =~ /^info:fedora\/afmodel:(.*)/
100
- end.compact
101
- models[0]
102
- end
103
-
104
- # The methods below are called if the like-named datastream exists in fedora
105
-
106
- # set metadata
107
- #
108
- def self.descMetadata(ds, _config)
109
- # desMetadata is encoded in ntriples, convert to JSON-LD using our special context
110
- graph = RDF::Graph.new
111
- data = ds.datastream_content
112
- # force utf-8 encoding. fedora does not store the encoding, so it defaults to ASCII-8BIT
113
- # see https://github.com/ruby-rdf/rdf/issues/142
114
- data.force_encoding('utf-8')
115
- graph.from_ntriples(data, format: :ntriples)
116
- JSON::LD::API.fromRdf(graph) do |expanded|
117
- result = JSON::LD::API.compact(expanded, RdfContext)
118
- @fedora_info['metadata'] = result
119
- end
120
- end
121
-
122
- # set rights
123
- #
124
- def self.rightsMetadata(ds, _config)
125
- # rights is an XML document
126
- # the access array may have read or edit elements
127
- # each of these elements may contain group or person elements
128
- xml_doc = REXML::Document.new(ds.datastream_content)
129
-
130
- rights_array = {}
131
-
132
- root = xml_doc.root
133
-
134
- # check for optional embargo date - set if present
135
- this_embargo = root.elements['embargo']
136
- rights_array['embargo-date'] = this_embargo.elements['machine'].elements['date'][0] if has_embargo_date(this_embargo)
137
-
138
- %w(read edit).each do |access|
139
- this_access = root.elements["//access[@type=\'#{access}\']"]
140
-
141
- next if this_access.nil?
142
-
143
- unless this_access.elements['machine'].elements['group'].nil?
144
- group_array = []
145
- this_access.elements['machine'].elements['group'].each do |this_group|
146
- group_array << this_group
147
- end
148
- rights_array["#{access}-groups"] = group_array
149
- end
150
-
151
- next if this_access.elements['machine'].elements['person'].nil?
152
- person_array = []
153
-
154
- this_access.elements['machine'].elements['person'].each do |this_person|
155
- person_array << this_person
156
- end
157
- rights_array[access.to_s] = person_array
158
- end
159
-
160
- @fedora_info['rights'] = rights_array
161
- end
162
-
163
- # test for embargo xml cases
164
- def self.has_embargo_date(embargo_xml)
165
- return false if embargo_xml == '' || embargo_xml.nil?
166
- return false unless embargo_xml.elements['machine'].has_elements? && embargo_xml.elements['machine'].elements['date'].has_text?
167
- true
168
- end
169
-
170
- def self.RELSEXT(ds, _config)
171
- # RELS-EXT is RDF-XML - parse it
172
- ctx = ROF::RelsExtRefContext.dup
173
- ctx.delete('@base') # @base causes problems when converting TO json-ld (it is = "info:/fedora") but info is not a namespace
174
- graph = RDF::Graph.new
175
- graph.from_rdfxml(ds.datastream_content)
176
- result = nil
177
- JSON::LD::API.fromRdf(graph) do |expanded|
178
- result = JSON::LD::API.compact(expanded, ctx)
179
- end
180
- # now strip the info:fedora/ prefix from the URIs
181
- strip_info_fedora(result)
182
- # remove extra items
183
- result.delete('hasModel')
184
- @fedora_info['rels-ext'] = result
185
- end
186
-
187
- private
188
-
189
- def self.strip_info_fedora(rels_ext)
190
- rels_ext.each do |relation, targets|
191
- next if relation == '@context'
192
- if targets.is_a?(Hash)
193
- strip_info_fedora(targets)
194
- next
195
- end
196
- targets = [targets] if targets.is_a?(String)
197
- targets.map! do |target|
198
- if target.is_a?(Hash)
199
- strip_info_fedora(target)
200
- else
201
- target.sub('info:fedora/', '')
202
- end
203
- end
204
- # some single strings cannot be arrays in json-ld, so convert back
205
- # this shouldn't cause any problems with items that began as arrays
206
- targets = targets[0] if targets.length == 1
207
- rels_ext[relation] = targets
208
- end
209
- end
210
- end
211
- end
@@ -1,123 +0,0 @@
1
- require 'json'
2
- require 'zlib'
3
- require 'rubygems/package'
4
- require 'rdf/turtle'
5
- require 'rof/osf_context'
6
- require 'rof/rdf_context'
7
- require 'rof/utility'
8
-
9
- module ROF
10
- # Class for managing OSF Archive data transformations
11
- # It is called after the get-from-osf task, and before the work-xlat task
12
- class OsfToRof
13
- # Convert Osf Archive tar.gz to ROF
14
- def self.osf_to_rof(config, osf_projects = nil)
15
- @osf_map = ROF::OsfToNDMap
16
- rof_array = []
17
- return {} if osf_projects.nil?
18
- this_project = osf_projects
19
- ttl_data = ttl_from_targz(config, this_project,
20
- this_project['project_identifier'] + '.ttl')
21
- rof_array[0] = build_archive_record(config, this_project, ttl_data)
22
- rof_array
23
- end
24
-
25
- # reads a ttl file and makes it a JSON-LD file that we can parse
26
- def self.fetch_from_ttl(ttl_file)
27
- graph = RDF::Turtle::Reader.open(ttl_file,
28
- prefixes: ROF::OsfPrefixList.dup)
29
- JSON::LD::API.fromRdf(graph)
30
- end
31
-
32
- # extracts given ttl file from JHU tar.gz package
33
- # - assumed to live under data/obj/root
34
- def self.ttl_from_targz(config, this_project, ttl_filename)
35
- id = this_project['project_identifier']
36
- ttl_path = File.join(id,
37
- 'data/obj/root',
38
- ttl_filename)
39
- ROF::Utility.file_from_targz(File.join(config['package_dir'], id + '.tar.gz'),
40
- ttl_path)
41
- ttl_data = fetch_from_ttl(File.join(config['package_dir'], ttl_path))
42
- # this is an array- the addition elements are the contributor(s)
43
- ttl_data
44
- end
45
-
46
- # Maps RELS-EXT
47
- def self.map_rels_ext(_ttl_data)
48
- rels_ext = {}
49
- rels_ext['@context'] = ROF::RelsExtRefContext.dup
50
- rels_ext
51
- end
52
-
53
- # sets metadata
54
- def self.map_metadata(config, project, ttl_data)
55
- metadata = {}
56
- metadata['@context'] = ROF::RdfContext.dup
57
- # metdata derived from project ttl file
58
- metadata['dc:created'] = Time.iso8601(ttl_data[0][@osf_map['dc:created']][0]['@value']).to_date.iso8601 + 'Z'
59
- metadata['dc:title'] = ttl_data[0][@osf_map['dc:title']][0]['@value']
60
- metadata['dc:description'] =
61
- ttl_data[0][@osf_map['dc:description']][0]['@value']
62
- metadata['dc:subject'] = map_subject(ttl_data[0])
63
- # metadata derived from osf_projects data, passed from UI
64
- metadata['dc:source'] = 'https://osf.io/' + project['project_identifier']
65
- metadata['dc:creator#adminstrative_unit'] = project['administrative_unit']
66
- metadata['dc:creator#affiliation'] = project['affiliation']
67
- metadata['dc:creator'] = map_creator(config, project, ttl_data)
68
- metadata
69
- end
70
-
71
- # Constructs OsfArchive Record from ttl_data, data from the UI form,
72
- # and task config data
73
- def self.build_archive_record(config, this_project, ttl_data)
74
- this_rof = {}
75
- this_rof['owner'] = this_project['owner']
76
- this_rof['type'] = 'OsfArchive'
77
- this_rof['rights'] = map_rights(ttl_data[0])
78
- this_rof['rels-ext'] = map_rels_ext(ttl_data[0])
79
- this_rof['metadata'] = map_metadata(config, this_project, ttl_data)
80
- this_rof['files'] = [this_project['project_identifier'] + '.tar.gz']
81
- this_rof
82
- end
83
-
84
- # sets subject
85
- def self.map_subject(ttl_data)
86
- if ttl_data.key?(@osf_map['dc:subject'])
87
- return ttl_data[@osf_map['dc:subject']][0]['@value']
88
- end
89
- ''
90
- end
91
-
92
- # figures out the rights
93
- def self.map_rights(ttl_data)
94
- rights = {}
95
- if ttl_data[@osf_map['isPublic']][0]['@value'] == 'true'
96
- rights['read-groups'] = ['public']
97
- end
98
- rights
99
- end
100
-
101
- # sets the creator- needs to read another ttl for the User data
102
- # only contrubutors with isBibliographic true are considered
103
- def self.map_creator(config, project, ttl_data)
104
- creator = []
105
- ttl_data[0][@osf_map['hasContributor']].each do |contributor|
106
- ttl_data.each do |item|
107
- next unless item['@id'] == contributor['@id']
108
- if item[@osf_map['isBibliographic']][0]['@value'] == 'true'
109
- creator.push map_user_from_ttl(config, project,
110
- item[@osf_map['hasUser']][0]['@id'])
111
- end
112
- end
113
- end
114
- creator
115
- end
116
-
117
- # read user ttl file, extract User's full name
118
- def self.map_user_from_ttl(config, project, file_subpath)
119
- ttl_data = ttl_from_targz(config, project, File.basename(file_subpath))
120
- ttl_data[0][@osf_map['hasFullName']][0]['@value']
121
- end
122
- end
123
- end
@@ -1,22 +0,0 @@
1
- require 'spec_helper'
2
-
3
- RSpec.describe ROF::FedoraToRof do
4
- it 'handles embargo presence or absence' do
5
-
6
- rights_tests = [
7
- ['<embargo> <human/> <machine> <date>2017-08-01</date> </machine> </embargo>', true],
8
- ['<embargo> <human/> <machine> <date></date> </machine> </embargo>', false],
9
- ['<embargo> <human/> <machine/> </embargo>', false]
10
- ]
11
-
12
- begin
13
-
14
- rights_tests.each do |this_test|
15
- xml_doc = REXML::Document.new(this_test[0])
16
- root = xml_doc.root
17
- rights = ROF::FedoraToRof.has_embargo_date(root)
18
- expect(rights).to eq(this_test[1])
19
- end
20
- end
21
- end
22
- end