rof 0.0.1.pre → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.ruby-version +1 -1
  3. data/.travis.yml +12 -2
  4. data/Gemfile +1 -0
  5. data/README.md +87 -0
  6. data/bin/.ruby-version +1 -0
  7. data/bin/csv_to_rof +26 -0
  8. data/bin/fedora_to_rof +57 -0
  9. data/bin/osf_to_rof +40 -0
  10. data/bin/rof +78 -0
  11. data/bulk-ingest.md +242 -0
  12. data/labels.md +111 -0
  13. data/lib/rof.rb +20 -1
  14. data/lib/rof/access.rb +57 -0
  15. data/lib/rof/cli.rb +122 -0
  16. data/lib/rof/collection.rb +109 -0
  17. data/lib/rof/compare_rof.rb +92 -0
  18. data/lib/rof/filters/bendo.rb +33 -0
  19. data/lib/rof/filters/date_stamp.rb +36 -0
  20. data/lib/rof/filters/file_to_url.rb +27 -0
  21. data/lib/rof/filters/label.rb +153 -0
  22. data/lib/rof/filters/work.rb +111 -0
  23. data/lib/rof/get_from_fedora.rb +196 -0
  24. data/lib/rof/ingest.rb +204 -0
  25. data/lib/rof/ingesters/rels_ext_ingester.rb +78 -0
  26. data/lib/rof/ingesters/rights_metadata_ingester.rb +68 -0
  27. data/lib/rof/osf_context.rb +19 -0
  28. data/lib/rof/osf_to_rof.rb +122 -0
  29. data/lib/rof/rdf_context.rb +36 -0
  30. data/lib/rof/translate_csv.rb +112 -0
  31. data/lib/rof/utility.rb +84 -0
  32. data/lib/rof/version.rb +2 -2
  33. data/rof.gemspec +17 -0
  34. data/spec/fixtures/a.json +4 -0
  35. data/spec/fixtures/label.json +20 -0
  36. data/spec/fixtures/osf/b6psa.tar.gz +0 -0
  37. data/spec/fixtures/rof/dev0012829m.rof +45 -0
  38. data/spec/fixtures/vcr_tests/fedora_to_rof1.yml +5274 -0
  39. data/spec/fixtures/vecnet-citation.json +73 -0
  40. data/spec/lib/rof/access_spec.rb +36 -0
  41. data/spec/lib/rof/cli_spec.rb +66 -0
  42. data/spec/lib/rof/collection_spec.rb +90 -0
  43. data/spec/lib/rof/compare_rof_spec.rb +263 -0
  44. data/spec/lib/rof/filters/date_stamp_spec.rb +90 -0
  45. data/spec/lib/rof/filters/file_to_url_spec.rb +70 -0
  46. data/spec/lib/rof/filters/label_spec.rb +94 -0
  47. data/spec/lib/rof/filters/work_spec.rb +87 -0
  48. data/spec/lib/rof/ingest_spec.rb +117 -0
  49. data/spec/lib/rof/ingesters/rels_ext_ingester_spec.rb +62 -0
  50. data/spec/lib/rof/ingesters/rights_metadata_ingester_spec.rb +114 -0
  51. data/spec/lib/rof/osf_to_rof_spec.rb +76 -0
  52. data/spec/lib/rof/translate_csv_spec.rb +109 -0
  53. data/spec/lib/rof/utility_spec.rb +64 -0
  54. data/spec/lib/rof_spec.rb +14 -0
  55. data/spec/spec_helper.rb +11 -11
  56. metadata +283 -18
@@ -0,0 +1,111 @@
1
+ # Pid Assigner Filter (aka Labels)
2
+
3
+ This filter will assign pids to objects which don't have one, and provides a labeling system so items without pids can still be linked together using RELS-EXT relationships.
4
+
5
+ More thought is needed on how to provide linking in RDF since RDF requires a full URI for subjects and objects.
6
+
7
+ ## Parameters
8
+
9
+ The filter requires the following parameters to work
10
+
11
+ - A namespace to use
12
+ - A server and pool of a noids service to use
13
+
14
+ ## Output
15
+
16
+ The filter does two things: it assigns identifiers to objects, and it provides identifiers for linking in the RELS-EXT section.
17
+ The filter uses the following rules to assign identifiers to objects:
18
+
19
+ 1. Any item not of type `fobject` is skipped.
20
+
21
+ 1. Any item of type `fobject` which does not have a `pid` field will be assigned one using the provided namespace and an identifier generated by the noids service.
22
+
23
+ 1. Any item of type `fobject` which has a label in its `pid` field will be assigned a pid using the provided namespace and an identifier generated by the noids service. The label will be **defined** to be the generated identifier.
24
+
25
+ 1. Any item of type `fobject` which otherwise has a `pid` is skipped.
26
+
27
+ The filter uses the following rules to provide for linking in the RELS-EXT section:
28
+
29
+ 1. Any item not of type `fobject` is skipped.
30
+
31
+ 1. Any item of type `fobject` with no `rels-ext` section is skipped.
32
+
33
+ 1. Any item of type `fobject` with a `rels-ext` section has all identifiers which are labels **substituted** with the pid assigned to the label. It is an error to use a label which cannot be resolved. See Resolution.
34
+
35
+ ## Labels
36
+
37
+ A label has the same textual form whether it is used to assign a pid or to reference an assigned pid.
38
+ A label begins with a dollar sign and an open parenthesis, contains a label name, and ends with a close parenthesis.
39
+ For example, `$(label-name)`
40
+ A label name may contain any character except close parenthesis `)`.
41
+ Labels which end with the following strings are reserved:
42
+
43
+ -noid
44
+ -ns
45
+ -info
46
+
47
+ These suffixes are reserved for future use, to provide alternate forms of an identifier.
48
+
49
+ I see the replacement of labels in text blocks as a possible way to adapt this to use in the `descMetadata` field, e.g.
50
+
51
+ https://curate.nd.edu/show/$(label-noid)
52
+
53
+ ## Resolution
54
+
55
+ Identifiers do not need to be defined before they are referenced; they merely need to be defined eventually.
56
+ This property allows for complicated object constellations such as cycles, and even self-references.
57
+ It also makes it easier to create input files since ROF creators do not need to make sure objects which define a label are listed before any references to that label.
58
+
59
+ ## Example
60
+
61
+ The following code shows how a list of objects with labels is transformed.
62
+ The input is
63
+
64
+ ```
65
+ [
66
+ {
67
+ "type" : "fobject",
68
+ "pid" : "$(first)"
69
+ },
70
+ {
71
+ "type" : "fobject",
72
+ "rels-ext" : {
73
+ "memberOf" : ["$(first)", "$(second)"]
74
+ }
75
+ },
76
+ {
77
+ "type" : "fobject",
78
+ "pid" : "$(second)"
79
+ }
80
+ ]
81
+ ```
82
+
83
+ Results in the following output after applying the filter.
84
+
85
+ ```
86
+ [
87
+ {
88
+ "type": "fobject",
89
+ "pid": "temp:001"
90
+ },
91
+ {
92
+ "type": "fobject",
93
+ "rels-ext": {
94
+ "memberOf": [
95
+ "temp:001",
96
+ "temp:003"
97
+ ]
98
+ },
99
+ "pid": "temp:002"
100
+ },
101
+ {
102
+ "type": "fobject",
103
+ "pid": "temp:003"
104
+ }
105
+ ]
106
+ ```
107
+
108
+
109
+
110
+
111
+
data/lib/rof.rb CHANGED
@@ -1,5 +1,24 @@
1
+ require "rof/ingest"
1
2
  require "rof/version"
3
+ require "rof/cli"
4
+ require "rof/access"
5
+ require "rof/collection"
6
+ require "rof/utility"
7
+ require "rof/rdf_context"
8
+ require "rof/translate_csv"
9
+ require "rof/filters/date_stamp"
10
+ require "rof/filters/file_to_url"
11
+ require "rof/filters/label"
12
+ require "rof/filters/work"
13
+ require "rof/filters/bendo"
2
14
 
3
15
  module ROF
16
+ end
4
17
 
5
- end
18
+ # work around Rubydora expecting a logger
19
+ unless defined?(logger)
20
+ def logger
21
+ require 'logger'
22
+ @logger ||= Logger.new(STDOUT)
23
+ end
24
+ end
@@ -0,0 +1,57 @@
1
module ROF
  # Translates between access strings and the ROF access hash, e.g.
  #   decode("public", "dbrower")
  #   # => {"read-groups" => ["public"], "edit" => ["dbrower"]}
  #
  # An access string is a list of clauses separated by ";". Each clause is
  # either one of the shorthands "public", "restricted", "private", or has the
  # form "<name>=<value>[,<value>...]".
  class Access
    # Raised by decode when a clause is not in a recognized form.
    class DecodeError < RuntimeError
    end

    # Convert an access string into an access hash.
    #
    # access_string - String of ";"-separated clauses.
    # owner         - value placed in the "edit" list by the shorthand clauses
    #                 ("public", "restricted", "private"). Defaults to nil.
    #
    # List-valued keys that occur in several clauses are merged and
    # de-duplicated; scalar values (the embargo date) are overwritten by the
    # last clause that sets them.
    #
    # Returns a Hash with String keys.
    # Raises DecodeError if any clause is not recognized.
    def self.decode(access_string, owner = nil)
      result = {}
      access_string.split(";").each do |clause|
        decode_clause(clause, owner).each do |key, value|
          if value.is_a?(Array)
            result[key] = (result.fetch(key, []) + value).uniq
          else
            result[key] = value
          end
        end
      end
      result
    end

    # Convert an access hash back into an access string.
    # Simple because we do not try to recover the shorthands "public", et al.
    #
    # access_hash - Hash with String keys, as produced by decode.
    #
    # Returns a String of ";"-separated "name=value" clauses.
    def self.encode(access_hash)
      clauses = access_hash.map do |key, value|
        name = key.gsub("-groups", "group").gsub("embargo-date", "embargo")
        # Scalar values (e.g. the embargo date) must be emitted as-is;
        # only list values are comma-joined.
        text = value.is_a?(Array) ? value.join(",") : value
        "#{name}=#{text}"
      end
      clauses.join(";")
    end

    # Decode a single clause into a (partial) access hash.
    #
    # access - one clause of an access string.
    # owner  - see decode.
    #
    # Returns a Hash with String keys.
    # Raises DecodeError if the clause is not recognized.
    def self.decode_clause(access, owner)
      case access
      when "public"
        { "read-groups" => ["public"], "edit" => [owner] }
      when "restricted"
        { "read-groups" => ["registered"], "edit" => [owner] }
      when "private"
        { "edit" => [owner] }
      when /^embargo=(.+)/
        { "embargo-date" => $1 }
      when /^(read|readgroup|edit|editgroup|discover|discovergroup)=(.+)/
        which = $1
        who = $2.split(",")
        # "readgroup" --> "read-groups", etc.
        { which.gsub("group", "-groups") => who }
      else
        raise DecodeError, "unrecognized access clause: #{access.inspect}"
      end
    end
  end
end
@@ -0,0 +1,122 @@
1
+ require 'benchmark'
2
+ require 'rof/compare_rof'
3
+ require 'json'
4
+ require 'rubydora'
5
+ require 'rof/ingest'
6
+ require 'rof/collection'
7
+ require 'rof/get_from_fedora'
8
+ require 'rof/osf_to_rof'
9
module ROF
  # Entry points used by the command line tools: ingesting, filtering,
  # converting and comparing ROF files.
  module CLI
    # Ingest the file `fname` that is a level 0 rof file. It may contain any
    # number of fedora objects; they will be dealt with in the order they appear
    # in the file. Any external files (except fname) are searched for using the
    # `search_paths` array of directories. If `fedora` is present, it is a hash
    # having the keys `url`, `user`, and `password`. Omitting `fedora` has the
    # effect of verifying the format of `fname`.
    #
    # All output is sent to `outfile`.
    #
    # Returns the number of errors.
    def self.ingest_file(fname, search_paths = [], outfile = STDOUT, fedora = nil, bendo = nil)
      items = load_items_from_file(fname, outfile)
      ingest_array(items, search_paths, outfile, fedora, bendo)
    end

    # Ingest (or, when `fedora` is nil, verify) each item of `items` in order.
    #
    # A progress line, timing and any error (with backtrace) is written to
    # `outfile` for every item. Passing nil for `outfile` discards the output.
    # An error on one item does not stop processing of the remaining items.
    #
    # Returns the number of items which raised an error.
    def self.ingest_array(items, search_paths = [], outfile = STDOUT, fedora = nil, bendo = nil)
      need_close = false
      if outfile.nil?
        # File::NULL is the platform's null device ("/dev/null", "NUL", ...)
        outfile = File.open(File::NULL, 'w')
        need_close = true
      end
      fedora = Rubydora.connect(fedora) if fedora
      error_count = 0
      verb = fedora.nil? ? 'Verifying' : 'Ingesting'
      overall_benchmark = Benchmark.measure do
        items.each.with_index(1) do |item, item_count|
          begin
            outfile.write("#{item_count}. #{verb} #{item['pid']} ...")
            individual_benchmark = Benchmark.measure do
              ROF.Ingest(item, fedora, search_paths, bendo)
            end
            outfile.write("ok. %0.3fs\n" % individual_benchmark.real)
          rescue StandardError, ScriptError => e
            # Deliberately not `rescue Exception`: Interrupt and SystemExit
            # must propagate so a long running ingest can be stopped.
            error_count += 1
            outfile.write("error. #{e}\n")
            # TODO(dbrower): add option to toggle displaying backtraces
            if e.backtrace
              outfile.write(e.backtrace.join("\n\t"))
              outfile.write("\n")
            end
          end
        end
      end
      outfile.write("Total time %0.3fs\n" % overall_benchmark.real)
      outfile.write("#{error_count} errors\n")
      error_count
    ensure
      outfile.close if outfile && need_close
    end

    # Load the items in `fname`, run them through `filter`, and write the
    # result as pretty-printed JSON to `outfile`. Errors reading the file are
    # reported on STDERR.
    def self.filter_file(filter, fname, outfile = STDOUT)
      items = load_items_from_file(fname, STDERR)
      filter_array(filter, items, fname, outfile)
    end

    # Run `items` through `filter.process(items, fname)` and write the result
    # as pretty-printed JSON to `outfile`.
    def self.filter_array(filter, items, fname, outfile = STDOUT)
      # filter will transform the items array in place
      result = filter.process(items, fname)
      outfile.write(JSON.pretty_generate(result))
    end

    # Retrieve each fedora object named in `pids`, convert it to ROF, and write
    # the list as pretty-printed JSON.
    #
    # outfile - an IO to write to, or a String naming a file to create. A file
    #           opened here is closed before returning.
    def self.convert_to_rof(pids, fedora = nil, outfile = STDOUT, config = {})
      need_close = false
      if outfile.is_a?(String)
        outfile = File.open(outfile, 'w')
        need_close = true
      end

      # wrap the objects inside a JSON list
      result = pids.map do |pid|
        ROF::FedoraToRof.GetFromFedora(pid, fedora, config)
      end
      outfile.write(JSON.pretty_generate(result))
    ensure
      outfile.close if outfile && need_close
    end

    # Convert an OSF archive tar.gz to a rof file.
    #
    # If `config` has a 'project_file' entry, the first item of that file is
    # handed to the converter; otherwise nil is passed.
    # NOTE(review): the original code raised NoMethodError on nil when
    # 'project_file' was absent; confirm ROF::OsfToRof.osf_to_rof accepts nil.
    def self.osf_to_rof(config, outfile = STDOUT)
      osf_project = nil
      if config.key?('project_file')
        osf_project = load_items_from_file(config['project_file'], outfile)[0]
      end
      rof_data = ROF::OsfToRof.osf_to_rof(config, osf_project)
      outfile.write(JSON.pretty_generate(rof_data))
    end

    # Compare two rof files. `_fedora` and `_bendo` are accepted but unused.
    #
    # Returns whatever ROF::CompareRof.fedora_vs_bendo returns.
    def self.compare_files(file1, file2, outfile = STDOUT, _fedora, _bendo)
      fedora_rof = load_items_from_file(file1, outfile)
      bendo_rof = load_items_from_file(file2, outfile)

      ROF::CompareRof.fedora_vs_bendo(fedora_rof, bendo_rof, outfile)
    end

    # NOTE: the original file placed `protected` here, but that keyword has no
    # effect on methods defined with `def self.`, so this method is (and
    # always was) publicly callable.

    # Read and parse the JSON file `fname` (as UTF-8).
    #
    # Returns an Array; a file holding a single value is wrapped in a list.
    # On a JSON syntax error a message is written to `outfile` (STDERR when
    # `outfile` is nil) and the process exits immediately with status 1.
    def self.load_items_from_file(fname, outfile)
      items = JSON.parse(File.read(fname, encoding: 'UTF-8'))
      items.is_a?(Array) ? items : [items]
    rescue JSON::ParserError => e
      (outfile || STDERR).puts("Error reading #{fname}:#{e}")
      exit!(1)
    end
  end
end
@@ -0,0 +1,109 @@
1
+ require 'mime-types'
2
+
3
module ROF
  # Called from ROF::Work.process_one_work
  # Can assume type fobject, af-model Collection
  class Collection
    # Raised when the collection's source image is missing or when one of the
    # derived images could not be created.
    class NoFile < RuntimeError
    end

    # Reported when the mime type cannot be determined from the file name.
    FALLBACK_MIME_TYPE = 'application/octet-stream'.freeze

    # Build the ROF object for one collection.
    #
    # input_obj - Hash describing the collection.
    # utility   - object responding to `next_label` and `workdir`.
    #
    # Returns the new ROF Hash.
    # Raises NoFile if a listed image is missing or cannot be converted.
    def self.process_one_collection(input_obj, utility)
      result = set_required_fields(input_obj, utility)
      make_images(result, input_obj, utility)
    end

    # Set the fields that must be there
    def self.set_required_fields(obj, utility)
      result = {}
      result['type'] = 'fobject'
      result['af-model'] = 'Collection'
      result['rights'] = obj['rights']
      result['metadata'] = obj['metadata']
      # NOTE: utility.next_label is evaluated even when obj already has a pid.
      result['pid'] = obj.fetch('pid', utility.next_label)
      result['rels-ext'] = obj.fetch('rels-ext', {})
      result['properties'] = ROF::Utility.prop_ds(obj['owner'])
      result['properties-meta'] = { 'mime-type' => 'text/xml' }
      result
    end

    # If the collection included a file, create the launch image and thumbnail
    # from the first listed file and record them on `subtotal`.
    #
    # Returns `subtotal` (unchanged when the collection lists no files).
    # Raises NoFile if the source image does not exist in the work directory
    # or either conversion fails.
    def self.make_images(subtotal, obj, utility)
      files = obj['files']
      return subtotal if files.nil? || files.empty?

      # verify source image is present in job dir
      image_source = File.join(utility.workdir, files[0])
      unless File.exist?(image_source)
        STDERR.print("ROF:Collection.make_images: file ", image_source, " does not exist.\n")
        raise NoFile
      end
      create_images(subtotal, image_source)
    end

    # Create both derived images and store their names and mime types on `obj`.
    #
    # Returns `obj`. Raises NoFile if either image could not be made.
    def self.create_images(obj, image_source)
      launch_img = make_launch(image_source)
      thumb_img = make_thumb(image_source)
      raise NoFile if launch_img.nil? || thumb_img.nil?
      obj['content-file'] = File.basename(launch_img)
      obj['content-meta'] = { 'mime-type' => find_file_mime(launch_img) }
      obj['thumbnail-file'] = File.basename(thumb_img)
      obj['thumbnail-meta'] = { 'mime-type' => find_file_mime(thumb_img) }
      obj
    end

    # make collections launch page image
    # Returns the name of the new image, or nil if the conversion failed.
    def self.make_launch(src_image)
      options = ' -resize 350x350 '

      dest_image = mk_dest_img_name(src_image, '-launch')
      unless run_convert(src_image, dest_image, options)
        STDERR.print("ROF:Collection.mk_launch: failed on file ", src_image, ".\n")
        return nil
      end
      dest_image
    end

    # make thumbnail
    # Returns the name of the new image, or nil if the conversion failed.
    def self.make_thumb(src_image)
      options = ' -resize 256x256 '

      dest_image = mk_dest_img_name(src_image, '-thumb')
      unless run_convert(src_image, dest_image, options)
        STDERR.print("ROF:Collection.mk_thumb: failed on file ", src_image, ".\n")
        return nil
      end
      dest_image
    end

    # Run ImageMagick's convert on `src_image`, writing `dest_image`.
    #
    # options - String of whitespace-separated convert options.
    #
    # The arguments are handed to the program as a list rather than as one
    # shell command line, so file names containing spaces or shell
    # metacharacters are passed through verbatim.
    #
    # Returns what Kernel.system returns: true on success, false on a non-zero
    # exit, nil if the program could not be run.
    def self.run_convert(src_image, dest_image, options)
      Kernel.system(set_convert_path, src_image, *options.split, dest_image)
    end

    # figure out where ImageMagick is installed
    # (assumes brew path on MacOS, binary RPM path on Linux).
    def self.set_convert_path
      host_os = RbConfig::CONFIG['sitearch']

      return '/usr/local/bin/convert' if host_os.include? 'darwin'
      '/usr/bin/convert'
    end

    # given source image, create destination name for conversion:
    # `dest_name` is inserted before the file extension (or appended when
    # there is no extension), so the extension is preserved.
    # Only the file name is examined; dots in directory names are left alone.
    #
    #   mk_dest_img_name('/job.1/cover.jpg', '-thumb') # => '/job.1/cover-thumb.jpg'
    def self.mk_dest_img_name(src_img, dest_name)
      extension = File.extname(src_img)
      stem = extension.empty? ? src_img : src_img[0...-extension.length]
      stem + dest_name + extension
    end

    # Determine the mime type from the file extension.
    # Returns FALLBACK_MIME_TYPE when the extension is missing or unknown.
    def self.find_file_mime(filename)
      mime_type = MIME::Types.of(filename).first
      mime_type ? mime_type.content_type : FALLBACK_MIME_TYPE
    end
  end
end
@@ -0,0 +1,92 @@
1
+ require 'json'
2
+ require 'rdf/ntriples'
3
+ require 'rdf/rdfxml'
4
+ require 'rof/rdf_context'
5
+ require 'rdf/isomorphic'
6
+
7
module ROF
  # Compares the ROF representation of an object taken from fedora with the
  # one taken from bendo. Every comparison returns an error count: 0 means
  # "the same", anything greater means "different". None of the methods
  # modify the objects passed in.
  class CompareRof
    # Rights fields which are compared, in order.
    RIGHTS_ATTRIBUTES = ['read', 'read-groups', 'edit', 'edit-groups', 'edit-users', 'embargo-date'].freeze

    # Top level keys which have dedicated comparisons (or, for
    # 'thumbnail-file', are ignored) and so are left out of the final
    # catch-all comparison.
    HANDLED_KEYS = ['rights', 'rels-ext', 'metadata', 'thumbnail-file'].freeze

    # compare fedora rof to bendo_rof
    # Only the first element of each list is compared.
    # Returns the number of sections which differ (0 if equivalent).
    def self.fedora_vs_bendo(fedora_rof, bendo_rof, output)
      fedora = fedora_rof[0]
      bendo = bendo_rof[0]

      compare_rights(fedora, bendo, output) +
        compare_rels_ext(fedora, bendo) +
        compare_metadata(fedora, bendo) +
        compare_everything_else(fedora, bendo, output)
    end

    # do rights comparison
    # return 0 if the same, >0 if different
    # `output` is accepted but unused.
    def self.compare_rights(fedora_rof, bendo_rof, output)
      # Use same comparison scheme on all rights; stop at the first mismatch
      mismatch = RIGHTS_ATTRIBUTES.any? do |attribute|
        rights_equal(attribute, fedora_rof, bendo_rof) != 0
      end
      mismatch ? 1 : 0
    end

    # compare array or element for equivalence
    # Values which can be sorted are sorted first, so list order is ignored.
    # A missing 'rights' section or attribute counts as an empty list.
    # Returns 0 if equal, 1 otherwise.
    def self.rights_equal(rights_attr, fedora, bendo)
      f_rights = fedora.fetch('rights', {}).fetch(rights_attr, [])
      b_rights = bendo.fetch('rights', {}).fetch(rights_attr, [])

      f_rights = f_rights.sort if f_rights.respond_to?(:sort)
      b_rights = b_rights.sort if b_rights.respond_to?(:sort)

      f_rights == b_rights ? 0 : 1
    end

    # convert RELS-EXT sections to RDF::graph and compare w/ rdf-isomorphic
    # Returns 0 if the graphs are isomorphic, 1 otherwise.
    def self.compare_rels_ext(fedora, bendo)
      bendo_rdf = jsonld_to_rdf(bendo['rels-ext'], ROF::RelsExtRefContext)
      fedora_rdf = jsonld_to_rdf(fedora['rels-ext'], ROF::RelsExtRefContext)
      bendo_rdf.isomorphic_with?(fedora_rdf) ? 0 : 1
    end

    # Turn a JSON-LD document into an RDF::Graph.
    #
    # doc             - Hash holding the JSON-LD, or nil for a missing section
    #                   (treated as an empty document).
    # default_context - used as "@context" when the document has none.
    #
    # The caller's hash is not modified; the context is added to a copy.
    def self.jsonld_to_rdf(doc, default_context)
      doc ||= {}
      doc = doc.merge("@context" => default_context) unless doc.key?("@context")
      RDF::Graph.new << JSON::LD::API.toRdf(doc)
    end

    # convert metadata sections to RDF::graph and compare w/ rdf-isomorphic
    # Returns 0 if the graphs are isomorphic, 1 otherwise.
    def self.compare_metadata(fedora, bendo)
      bendo_rdf = jsonld_to_rdf(bendo['metadata'], ROF::RdfContext)
      fedora_rdf = jsonld_to_rdf(fedora['metadata'], ROF::RdfContext)
      bendo_rdf.isomorphic_with?(fedora_rdf) ? 0 : 1
    end

    # compare what remains
    # Returns 0 if everything not covered by the other comparisons is equal,
    # 1 otherwise. `output` is accepted but unused.
    def self.compare_everything_else(fedora, bendo, output)
      # comparison using builtin equivalency operation
      remove_others(bendo) == remove_others(fedora) ? 0 : 1
    end

    # remove elements we've dealt with already
    # Returns a new Hash; `rof_object` itself is left untouched.
    def self.remove_others(rof_object)
      rof_object.reject { |key, _value| HANDLED_KEYS.include?(key) }
    end
  end
end
+ end