rof 0.0.1.pre → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/.ruby-version +1 -1
  3. data/.travis.yml +12 -2
  4. data/Gemfile +1 -0
  5. data/README.md +87 -0
  6. data/bin/.ruby-version +1 -0
  7. data/bin/csv_to_rof +26 -0
  8. data/bin/fedora_to_rof +57 -0
  9. data/bin/osf_to_rof +40 -0
  10. data/bin/rof +78 -0
  11. data/bulk-ingest.md +242 -0
  12. data/labels.md +111 -0
  13. data/lib/rof.rb +20 -1
  14. data/lib/rof/access.rb +57 -0
  15. data/lib/rof/cli.rb +122 -0
  16. data/lib/rof/collection.rb +109 -0
  17. data/lib/rof/compare_rof.rb +92 -0
  18. data/lib/rof/filters/bendo.rb +33 -0
  19. data/lib/rof/filters/date_stamp.rb +36 -0
  20. data/lib/rof/filters/file_to_url.rb +27 -0
  21. data/lib/rof/filters/label.rb +153 -0
  22. data/lib/rof/filters/work.rb +111 -0
  23. data/lib/rof/get_from_fedora.rb +196 -0
  24. data/lib/rof/ingest.rb +204 -0
  25. data/lib/rof/ingesters/rels_ext_ingester.rb +78 -0
  26. data/lib/rof/ingesters/rights_metadata_ingester.rb +68 -0
  27. data/lib/rof/osf_context.rb +19 -0
  28. data/lib/rof/osf_to_rof.rb +122 -0
  29. data/lib/rof/rdf_context.rb +36 -0
  30. data/lib/rof/translate_csv.rb +112 -0
  31. data/lib/rof/utility.rb +84 -0
  32. data/lib/rof/version.rb +2 -2
  33. data/rof.gemspec +17 -0
  34. data/spec/fixtures/a.json +4 -0
  35. data/spec/fixtures/label.json +20 -0
  36. data/spec/fixtures/osf/b6psa.tar.gz +0 -0
  37. data/spec/fixtures/rof/dev0012829m.rof +45 -0
  38. data/spec/fixtures/vcr_tests/fedora_to_rof1.yml +5274 -0
  39. data/spec/fixtures/vecnet-citation.json +73 -0
  40. data/spec/lib/rof/access_spec.rb +36 -0
  41. data/spec/lib/rof/cli_spec.rb +66 -0
  42. data/spec/lib/rof/collection_spec.rb +90 -0
  43. data/spec/lib/rof/compare_rof_spec.rb +263 -0
  44. data/spec/lib/rof/filters/date_stamp_spec.rb +90 -0
  45. data/spec/lib/rof/filters/file_to_url_spec.rb +70 -0
  46. data/spec/lib/rof/filters/label_spec.rb +94 -0
  47. data/spec/lib/rof/filters/work_spec.rb +87 -0
  48. data/spec/lib/rof/ingest_spec.rb +117 -0
  49. data/spec/lib/rof/ingesters/rels_ext_ingester_spec.rb +62 -0
  50. data/spec/lib/rof/ingesters/rights_metadata_ingester_spec.rb +114 -0
  51. data/spec/lib/rof/osf_to_rof_spec.rb +76 -0
  52. data/spec/lib/rof/translate_csv_spec.rb +109 -0
  53. data/spec/lib/rof/utility_spec.rb +64 -0
  54. data/spec/lib/rof_spec.rb +14 -0
  55. data/spec/spec_helper.rb +11 -11
  56. metadata +283 -18
@@ -0,0 +1,111 @@
1
+ # Pid Assigner Filter (aka Labels)
2
+
3
+ This filter will assign pids to objects which don't have one, and provides a labeling system so items without pids can still be linked together using RELS-EXT relationships.
4
+
5
+ More thought is needed on how to provide linking in RDF since RDF requires a full URI for subjects and objects.
6
+
7
+ ## Parameters
8
+
9
+ The filter requires the following parameters to work
10
+
11
+ - A namespace to use
12
+ - A server and pool of a noids service to use
13
+
14
+ ## Output
15
+
16
+ The service does two things: it assigns identifiers to objects, and it provides identifiers for linking in the RELS-EXT section.
17
+ The filter uses the following rules to assign identifiers to objects:
18
+
19
+ 1. Any item not of type `fobject` is skipped.
20
+
21
+ 1. Any item of type `fobject` which does not have a `pid` field will be assigned one using the provided namespace and an identifier generated by the noids service.
22
+
23
+ 1. Any item of type `fobject` which has a label in its `pid` field will be assigned a pid using the provided namespace and an identifier generated by the noids service. The label will be **defined** to be the generated identifier.
24
+
25
+ 1. Any item of type `fobject` which otherwise has a `pid` is skipped.
26
+
27
+ The filter uses the following rules to provide for linking in the RELS-EXT section:
28
+
29
+ 1. Any item not of type `fobject` is skipped.
30
+
31
+ 1. Any item of type `fobject` with no `rels-ext` section is skipped.
32
+
33
+ 1. Any item of type `fobject` with a `rels-ext` section has all identifiers which are labels **substituted** with the pid assigned to the label. It is an error to use a label which cannot be resolved. See Resolution.
34
+
35
+ ## Labels
36
+
37
+ A label has the same textual form whether it is used to assign a pid or to reference an assigned pid.
38
+ A label begins with a dollar sign and an open parenthesis, contains a label name, and ends with a close parenthesis.
39
+ For example, `$(label-name)`
40
+ A label name may contain any character except close parenthesis `)`.
41
+ Labels which end with the following strings are reserved:
42
+
43
+ -noid
44
+ -ns
45
+ -info
46
+
47
+ These suffixes are reserved for future use in providing alternate forms of an identifier.
48
+
49
+ I see the replacement of labels in text blocks as a possible way to adapt this to use in the `descMetadata` field, e.g.
50
+
51
+ https://curate.nd.edu/show/$(label-noid)
52
+
53
+ ## Resolution
54
+
55
+ Identifiers do not need to be defined before they are referenced; they merely need to be defined eventually.
56
+ This property allows for complicated object constellations such as cycles, and even self-references.
57
+ It also makes it easier to create input files since ROF creators do not need to make sure objects which define a label are listed before any references to that label.
58
+
59
+ ## Example
60
+
61
+ The following code shows how a list of objects with labels is transformed.
62
+ The input is
63
+
64
+ ```
65
+ [
66
+ {
67
+ "type" : "fobject",
68
+ "pid" : "$(first)"
69
+ },
70
+ {
71
+ "type" : "fobject",
72
+ "rels-ext" : {
73
+ "memberOf" : ["$(first)", "$(second)"]
74
+ }
75
+ },
76
+ {
77
+ "type" : "fobject",
78
+ "pid" : "$(second)"
79
+ }
80
+ ]
81
+ ```
82
+
83
+ Results in the following output after applying the filter.
84
+
85
+ ```
86
+ [
87
+ {
88
+ "type": "fobject",
89
+ "pid": "temp:001"
90
+ },
91
+ {
92
+ "type": "fobject",
93
+ "rels-ext": {
94
+ "memberOf": [
95
+ "temp:001",
96
+ "temp:003"
97
+ ]
98
+ },
99
+ "pid": "temp:002"
100
+ },
101
+ {
102
+ "type": "fobject",
103
+ "pid": "temp:003"
104
+ }
105
+ ]
106
+ ```
107
+
108
+
109
+
110
+
111
+
data/lib/rof.rb CHANGED
@@ -1,5 +1,24 @@
1
+ require "rof/ingest"
1
2
  require "rof/version"
3
+ require "rof/cli"
4
+ require "rof/access"
5
+ require "rof/collection"
6
+ require "rof/utility"
7
+ require "rof/rdf_context"
8
+ require "rof/translate_csv"
9
+ require "rof/filters/date_stamp"
10
+ require "rof/filters/file_to_url"
11
+ require "rof/filters/label"
12
+ require "rof/filters/work"
13
+ require "rof/filters/bendo"
2
14
 
3
15
  module ROF
16
+ end
4
17
 
5
- end
18
+ # work around Rubydora expecting a logger
19
+ unless defined?(logger)
20
+ def logger
21
+ require 'logger'
22
+ @logger ||= Logger.new(STDOUT)
23
+ end
24
+ end
@@ -0,0 +1,57 @@
1
+ module ROF
2
+ # provide translation between access strings and the ROF access hash
3
+ # e.g. ("public", owner=dbrower) --> {"read-groups" => ["public"], "edit" => ["dbrower"]}
4
+ class Access
5
+ class DecodeError < RuntimeError
6
+ end
7
+
8
+ # convert from a string to a hash
9
+ def self.decode(access_string, owner=nil)
10
+ result = {}
11
+ access_string.split(";").each do |clause|
12
+ t = self.decode_clause(clause, owner)
13
+ t.each do |k,v|
14
+ if v.is_a?(Array)
15
+ result[k] = (result.fetch(k, []) + v).uniq
16
+ else
17
+ result[k] = v
18
+ end
19
+ end
20
+ end
21
+
22
+ result
23
+ end
24
+
25
+ # convert from a hash to a string
26
+ # simple because we do not try to recover "public", et al.
27
+ def self.encode(access_hash)
28
+ result = []
29
+ access_hash.each do |k,v|
30
+ xk = k.gsub("-groups", "group").gsub("embargo-date","embargo")
31
+ xv = v.join(',') if v.is_a?(Array)
32
+ result << "#{xk}=#{xv}"
33
+ end
34
+ result.join(";")
35
+ end
36
+
37
+ def self.decode_clause(access, owner)
38
+ case access
39
+ when "public"
40
+ {"read-groups" => ["public"], "edit" => [owner]}
41
+ when "restricted"
42
+ {"read-groups" => ["registered"], "edit" => [owner]}
43
+ when "private"
44
+ {"edit" => [owner]}
45
+ when /^embargo=(.+)/
46
+ {"embargo-date" => $1}
47
+ when /^(read|readgroup|edit|editgroup|discover|discovergroup)=(.+)/
48
+ which = $1
49
+ who = $2.split(",")
50
+ xwhich = which.gsub("group", "-groups")
51
+ Hash[xwhich, who]
52
+ else
53
+ raise DecodeError
54
+ end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,122 @@
1
+ require 'benchmark'
2
+ require 'rof/compare_rof'
3
+ require 'json'
4
+ require 'rubydora'
5
+ require 'rof/ingest'
6
+ require 'rof/collection'
7
+ require 'rof/get_from_fedora'
8
+ require 'rof/osf_to_rof'
9
+ module ROF
10
+ module CLI
11
+ # Ingest the file `fname` that is a level 0 rof file. It may contain any
12
+ # number of fedora objects; they will be dealt with in the order they appear
13
+ # in the file. Any external files (except fname) are searched for using the
14
+ # `search_paths` array of directories. If `fedora` is present, it is a hash
15
+ # having the keys `url`, `user`, and `password`. Omitting `fedora` has the
16
+ # effect of verifying the format of `fname`.
17
+ #
18
+ # All output is sent to `outfile`.
19
+ #
20
+ # Returns the number of errors.
21
+ def self.ingest_file(fname, search_paths = [], outfile = STDOUT, fedora = nil, bendo = nil)
22
+ items = load_items_from_file(fname, outfile)
23
+ ingest_array(items, search_paths, outfile, fedora, bendo)
24
+ end
25
+
26
+ def self.ingest_array(items, search_paths = [], outfile = STDOUT, fedora = nil, bendo = nil)
27
+ need_close = false
28
+ if outfile.nil?
29
+ outfile = File.open('/dev/null', 'w')
30
+ need_close = true
31
+ end
32
+ fedora = Rubydora.connect(fedora) if fedora
33
+ item_count = 1
34
+ error_count = 0
35
+ verb = fedora.nil? ? 'Verifying' : 'Ingesting'
36
+ overall_benchmark = Benchmark.measure do
37
+ items.each do |item|
38
+ begin
39
+ outfile.write("#{item_count}. #{verb} #{item['pid']} ...")
40
+ item_count += 1
41
+ individual_benchmark = Benchmark.measure do
42
+ ROF.Ingest(item, fedora, search_paths, bendo)
43
+ end
44
+ outfile.write("ok. %0.3fs\n" % individual_benchmark.real)
45
+ rescue Exception => e
46
+ error_count += 1
47
+ outfile.write("error. #{e}\n")
48
+ # TODO(dbrower): add option to toggle displaying backtraces
49
+ if e.backtrace
50
+ outfile.write(e.backtrace.join("\n\t"))
51
+ outfile.write("\n")
52
+ end
53
+ end
54
+ end
55
+ end
56
+ outfile.write("Total time %0.3fs\n" % overall_benchmark.real)
57
+ outfile.write("#{error_count} errors\n")
58
+ error_count
59
+ ensure
60
+ outfile.close if outfile && need_close
61
+ end
62
+
63
+ def self.filter_file(filter, fname, outfile = STDOUT)
64
+ items = load_items_from_file(fname, STDERR)
65
+ filter_array(filter, items, fname, outfile)
66
+ end
67
+
68
+ def self.filter_array(filter, items, fname, outfile = STDOUT)
69
+ # filter will transform the items array in place
70
+ result = filter.process(items, fname)
71
+ outfile.write(JSON.pretty_generate(result))
72
+ end
73
+
74
+ # retrieve fedora object and convert to ROF
75
+ def self.convert_to_rof(pids, fedora = nil, outfile = STDOUT, config = {})
76
+ need_close = false
77
+ # if outfile is a String, treat it as a file path and open it for writing
78
+ if outfile.is_a?(String)
79
+ outfile = File.open(outfile, 'w')
80
+ need_close = true
81
+ end
82
+
83
+ # wrap the objects inside a JSON list
84
+ result = []
85
+ pids.each do |pid|
86
+ result << ROF::FedoraToRof.GetFromFedora(pid, fedora, config)
87
+ end
88
+ outfile.write(JSON.pretty_generate(result))
89
+ ensure
90
+ outfile.close if outfile && need_close
91
+ end
92
+
93
+ # convert OSF archive tar.gz to rof file
94
+ def self.osf_to_rof(config, outfile = STDOUT)
95
+ osf_projects = load_items_from_file(config['project_file'], outfile) if config.key?('project_file')
96
+ rof_data = ROF::OsfToRof.osf_to_rof(config, osf_projects[0])
97
+ outfile.write(JSON.pretty_generate(rof_data))
98
+ end
99
+
100
+ # compare two rofs
101
+ def self.compare_files(file1, file2, outfile = STDOUT, _fedora, _bendo)
102
+ fedora_rof = load_items_from_file(file1, outfile)
103
+ bendo_rof = load_items_from_file(file2, outfile)
104
+
105
+ ROF::CompareRof.fedora_vs_bendo(fedora_rof, bendo_rof, outfile)
106
+ end
107
+
108
+ protected
109
+
110
+ def self.load_items_from_file(fname, outfile)
111
+ items = nil
112
+ File.open(fname, 'r:UTF-8') do |f|
113
+ items = JSON.parse(f.read)
114
+ end
115
+ items = [items] unless items.is_a? Array
116
+ items
117
+ rescue JSON::ParserError => e
118
+ outfile.puts("Error reading #{fname}:#{e}")
119
+ exit!(1)
120
+ end
121
+ end
122
+ end
@@ -0,0 +1,109 @@
1
+ require 'mime-types'
2
+
3
+ module ROF
4
+ # Called from ROF::Work.process_one_work
5
+ # Can assume type fobject, af-model Collection
6
+ class Collection
7
+ class NoFile < RuntimeError
8
+ end
9
+ def self.process_one_collection(input_obj, utility)
10
+ # set the required fields
11
+ result = set_required_fields(input_obj, utility)
12
+ result = make_images(result, input_obj, utility)
13
+ result
14
+ end
15
+
16
+ # Set the fields that must be there
17
+ def self.set_required_fields(obj, utility)
18
+ result = {}
19
+ result['type'] = 'fobject'
20
+ result['af-model'] = 'Collection'
21
+ result['rights'] = obj['rights']
22
+ result['metadata'] = obj['metadata']
23
+ result['pid'] = obj.fetch('pid', utility.next_label)
24
+ result['rels-ext'] = obj.fetch('rels-ext', {})
25
+ result['properties'] = ROF::Utility.prop_ds(obj['owner'])
26
+ result['properties-meta'] = { 'mime-type' => 'text/xml' }
27
+ result
28
+ end
29
+
30
+ # If collection included a file, create launch image and thumbnail
31
+ def self.make_images(subtotal, obj, utility)
32
+ return subtotal if obj['files'].nil?
33
+
34
+ # verify source image is present in job dir
35
+ image_source = File.join(utility.workdir, obj['files'][0])
36
+
37
+ # attempt to create a launch page image and thumbnail
38
+ # exit if either fails
39
+ unless File.exist?(image_source)
40
+ STDERR.print("ROF:Collection.make_images: file ", image_source, " does not exist.\n")
41
+ raise NoFile
42
+ end
43
+ create_images(subtotal, image_source)
44
+ end
45
+
46
+ def self.create_images(obj, image_source)
47
+ launch_img = make_launch(image_source)
48
+ thumb_img = make_thumb(image_source)
49
+ raise NoFile if launch_img.nil? || thumb_img.nil?
50
+ obj['content-file'] = File.basename(launch_img)
51
+ obj['content-meta'] = { 'mime-type' => find_file_mime(launch_img) }
52
+ obj['thumbnail-file'] = File.basename(thumb_img)
53
+ obj['thumbnail-meta'] = { 'mime-type' => find_file_mime(thumb_img) }
54
+ obj
55
+ end
56
+
57
+ # make collections launch page image
58
+ def self.make_launch(src_image)
59
+ options = ' -resize 350x350 '
60
+
61
+ dest_image = mk_dest_img_name(src_image, '-launch')
62
+ unless run_convert(src_image, dest_image, options)
63
+ STDERR.print("ROF:Collection.mk_launch: failed on file ", src_image, ".\n")
64
+ return nil
65
+ end
66
+ dest_image
67
+ end
68
+
69
+ # make thumbnail
70
+ def self.make_thumb(src_image)
71
+ options = ' -resize 256x256 '
72
+
73
+ dest_image = mk_dest_img_name(src_image, '-thumb')
74
+ unless run_convert(src_image, dest_image, options)
75
+ STDERR.print("ROF:Collection.mk_thumb: failed on file ", src_image, ".\n")
76
+ return nil
77
+ end
78
+ dest_image
79
+ end
80
+
81
+ def self.run_convert(src_image, dest_image, options)
82
+ command = set_convert_path + ' ' + src_image + options + ' ' + dest_image
83
+ Kernel.system(command)
84
+ end
85
+
86
+ # figure out where ImageMagick is installed
87
+ # (assumes brew path on MacOS, binary RPM path on Linux).
88
+ def self.set_convert_path
89
+ host_os = RbConfig::CONFIG['sitearch']
90
+
91
+ return '/usr/local/bin/convert' if host_os.include? 'darwin'
92
+ '/usr/bin/convert'
93
+ end
94
+
95
+ # given source image, create destination name for conversion
96
+ # keep same mime type - use dumb mime type determination
97
+ def self.mk_dest_img_name(src_img, dest_name)
98
+ dest_part = src_img.split('.')
99
+ dest_img = dest_part[0] + dest_name
100
+ dest_img = dest_img + '.' + dest_part[1] if dest_part.length == 2
101
+ dest_img
102
+ end
103
+
104
+ # extract file extension and determine mime/type.
105
+ def self.find_file_mime(filename)
106
+ MIME::Types.of(filename).first.content_type
107
+ end
108
+ end
109
+ end
@@ -0,0 +1,92 @@
1
+ require 'json'
2
+ require 'rdf/ntriples'
3
+ require 'rdf/rdfxml'
4
+ require 'rof/rdf_context'
5
+ require 'rdf/isomorphic'
6
+
7
+ module ROF
8
+ class CompareRof
9
+
10
+ # compare fedora rof to bendo_rof
11
+ # return the number of differences found (0 if equivalent)
12
+ def self.fedora_vs_bendo( fedora_rof, bendo_rof, output)
13
+
14
+ error_count = 0
15
+ # dereferencing an array of one element with [0]. Oh, the horror of it.
16
+ error_count += compare_rights( fedora_rof[0], bendo_rof[0], output)
17
+ error_count += compare_rels_ext(fedora_rof[0], bendo_rof[0])
18
+ error_count += compare_metadata(fedora_rof[0], bendo_rof[0])
19
+ error_count += compare_everything_else(fedora_rof[0], bendo_rof[0], output)
20
+ error_count
21
+ end
22
+
23
+ # do rights comparison
24
+ # return 0 if the same, >0 if different
25
+ def self.compare_rights( fedora_rof, bendo_rof, output )
26
+
27
+ error_count =0
28
+
29
+ # Use same comparison scheme on all rights
30
+ [ 'read' , 'read-groups', 'edit', 'edit-groups', 'edit-users', 'embargo-date'].each do |attribute|
31
+ error_count += rights_equal(attribute, fedora_rof, bendo_rof)
32
+ break if error_count != 0
33
+ end
34
+
35
+ error_count
36
+ end
37
+
38
+ # compare array or element for equivalence
39
+ def self.rights_equal(rights_attr, fedora, bendo)
40
+ f_rights = fedora.fetch('rights', {}).fetch(rights_attr, [])
41
+ b_rights = bendo.fetch('rights', {}).fetch(rights_attr, [])
42
+
43
+ f_rights = f_rights.sort if f_rights.respond_to?(:"sort")
44
+ b_rights = b_rights.sort if b_rights.respond_to?(:"sort")
45
+
46
+ return 0 if f_rights == b_rights
47
+ 1
48
+ end
49
+
50
+ # convert RELS-EXT sections to RDF::graph and compare w/ rdf-isomorphic
51
+ def self.compare_rels_ext(fedora, bendo)
52
+ error_count = 0
53
+ bendo_rdf = jsonld_to_rdf(bendo['rels-ext'], ROF::RelsExtRefContext)
54
+ fedora_rdf = jsonld_to_rdf(fedora['rels-ext'], ROF::RelsExtRefContext)
55
+ error_count +=1 if ! bendo_rdf.isomorphic_with? fedora_rdf
56
+ error_count
57
+ end
58
+
59
+ def self.jsonld_to_rdf(doc, default_context)
60
+ doc["@context"] = default_context unless doc.has_key?("@context")
61
+ RDF::Graph.new << JSON::LD::API.toRdf(doc)
62
+ end
63
+
64
+ # convert metadata sections to RDF::graph and compare w/ rdf-isomorphic
65
+ def self.compare_metadata(fedora, bendo)
66
+ error_count = 0
67
+ bendo_rdf = jsonld_to_rdf(bendo['metadata'], ROF::RdfContext)
68
+ fedora_rdf = jsonld_to_rdf(fedora['metadata'], ROF::RdfContext)
69
+ error_count +=1 if ! bendo_rdf.isomorphic_with? fedora_rdf
70
+ error_count
71
+ end
72
+
73
+ # compare what remains
74
+ def self.compare_everything_else( fedora, bendo, output)
75
+ error_count =0
76
+ fedora = remove_others(fedora)
77
+ bendo = remove_others(bendo)
78
+ # comparison using builtin equivalency operation
79
+ error_count = 1 if bendo != fedora
80
+ error_count
81
+ end
82
+
83
+ # remove elements we've dealt with already
84
+ def self.remove_others( rof_object)
85
+ rof_object.delete('rights')
86
+ rof_object.delete('rels-ext')
87
+ rof_object.delete('metadata')
88
+ rof_object.delete('thumbnail-file')
89
+ rof_object
90
+ end
91
+ end
92
+ end