bio-publisci 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/.travis.yml +1 -1
  3. data/Gemfile +1 -1
  4. data/Rakefile +4 -6
  5. data/features/integration_steps.rb +1 -1
  6. data/features/metadata.feature +24 -0
  7. data/features/metadata_steps.rb +21 -0
  8. data/lib/bio-publisci/dataset/ORM/data_cube_orm.rb +36 -14
  9. data/lib/bio-publisci/dataset/ORM/observation.rb +1 -1
  10. data/lib/bio-publisci/dataset/data_cube.rb +192 -131
  11. data/lib/bio-publisci/dataset/dataset_for.rb +150 -0
  12. data/lib/bio-publisci/dataset/interactive.rb +70 -55
  13. data/lib/bio-publisci/metadata/metadata.rb +81 -8
  14. data/lib/bio-publisci/parser.rb +76 -1
  15. data/lib/bio-publisci/readers/big_cross.rb +118 -117
  16. data/lib/bio-publisci/readers/csv.rb +37 -2
  17. data/lib/bio-publisci/readers/r_matrix.rb +1 -1
  18. data/lib/bio-publisci/store.rb +31 -31
  19. data/lib/bio-publisci/writers/arff.rb +48 -49
  20. data/lib/bio-publisci.rb +3 -0
  21. data/resources/queries/code_resources.rq +10 -0
  22. data/resources/queries/dimension_ranges.rq +3 -3
  23. data/resources/queries/dimensions.rq +3 -3
  24. data/resources/queries/measures.rq +3 -3
  25. data/resources/queries/observation_labels.rq +8 -0
  26. data/resources/queries/properties.rq +8 -0
  27. data/scripts/islet_mlratio.rb +6 -0
  28. data/scripts/scan_islet.rb +6 -0
  29. data/scripts/update_reference.rb +20 -0
  30. data/spec/ORM/data_cube_orm_spec.rb +12 -0
  31. data/spec/data_cube_spec.rb +1 -1
  32. data/spec/generators/dataframe_spec.rb +1 -1
  33. data/spec/generators/r_matrix_spec.rb +1 -1
  34. data/spec/r_builder_spec.rb +6 -6
  35. data/spec/resource/.RData +0 -0
  36. data/spec/resource/example.Rhistory +3 -0
  37. data/spec/turtle/bacon +4 -22
  38. data/spec/turtle/reference +9 -27
  39. metadata +37 -56
  40. data/lib/bio-publisci/loader.rb +0 -36
  41. data/spec/bio-publisci_spec.rb +0 -7
@@ -0,0 +1,150 @@
1
module R2RDF
  # Chooses a reader for an input (a file path or an Rserve object) and runs
  # it, prompting on standard input for anything left unspecified when
  # +ask_on_ambiguous+ is true.
  class Dataset
    extend R2RDF::Interactive

    # Dispatch on the kind of +object+.
    #
    # object           - path to an existing file (String) or an Rserve::REXP.
    # options          - reader options; :dimensions / :measures chosen
    #                    interactively are written back into this hash.
    # ask_on_ambiguous - when false, no prompts are issued.
    #
    # Returns whatever the selected reader returns.
    # Raises RuntimeError when the file is missing, its type cannot be
    # inferred, no reader handles its extension, or the object type is
    # not supported.
    def self.for(object, options={}, ask_on_ambiguous=true)
      if object.is_a? String
        raise "Unable to find reader for File or String" unless File.exist? object

        extension = infer_extension(object)
        case extension
        when ".RData"
          r_object(object, options, ask_on_ambiguous)
        when /\A\.csv\z/i
          # Anchored with an escaped dot: the previous /.csv/i matched any
          # extension merely containing "csv" preceded by some character.
          R2RDF::Reader::CSV.new.automatic(object,nil,options,ask_on_ambiguous)
        else
          # Previously fell through the case and silently returned nil.
          raise "Unable to find reader for file #{object} (extension #{extension})"
        end
      elsif object.is_a? Rserve::REXP
        r_object(object, options, ask_on_ambiguous)
      else
        raise "not recognize Ruby objects of this type yet (#{object})"
      end
    end

    # Convert an R object to n3 using the matching reader.
    #
    # object - path of an R workspace file (String) to load through a new
    #          Rserve connection, or an Rserve::REXP already in hand.
    #
    # Raises RuntimeError when +object+ is neither, or no reader matches.
    def self.r_object(object, options={}, ask_on_ambiguous=true)
      if object.is_a? String
        r_object_from_file(object, options, ask_on_ambiguous)
      elsif object.is_a? Rserve::REXP
        r_object_from_rexp(object, options, ask_on_ambiguous)
      else
        raise "#{object} is not an R object"
      end
    end

    # Extension used for reader selection: File.extname, or the whole
    # basename for a dotfile with a single dot such as ".RData".
    def self.infer_extension(path)
      extension = File.extname(path)
      return extension unless extension.empty?

      basename = File.basename(path)
      return basename if basename[0] == '.' && basename.count('.') == 1

      raise "Can't load file #{path}; type inference not yet implemented"
    end

    # Load a workspace file into R, pick a variable, and hand it to the
    # reader matching its R class (data.frame, cross or matrix).
    def self.r_object_from_file(path, options, ask_on_ambiguous)
      con = Rserve::Connection.new
      # Array() guards against to_ruby returning a bare String for a
      # single-element vector -- NOTE(review): confirm against rserve-client.
      vars = Array(con.eval("load('#{r_escape(File.absolute_path(path))}')").to_ruby)
      if vars.size > 1 && ask_on_ambiguous
        puts "Which variable? #{vars}"
        var = vars[read_line.to_i]
      else
        var = vars[0]
      end
      raise "no variable selected from #{path}" unless var

      r_classes = Array(con.eval("class(#{var})").to_ruby)

      if r_classes.include? "data.frame"
        df = R2RDF::Reader::Dataframe.new
        if ask_on_ambiguous
          unless options[:dimensions]
            dims = Array(con.eval("names(#{var})").to_ruby)
            chosen = choose_by_index("Which dimensions? #{dims}", dims)
            options[:dimensions] = chosen if chosen
          end
          unless options[:measures]
            meas = Array(con.eval("names(#{var})").to_ruby)
            chosen = choose_by_index("Which measures? #{meas} ", meas)
            options[:measures] = chosen if chosen
          end
        end

        df.generate_n3(con.eval(var),var,options)

      elsif r_classes.include? "cross"
        bc = R2RDF::Reader::BigCross.new

        if ask_on_ambiguous && !options[:measures]
          pheno_names = Array(con.eval("names(#{var}$pheno)").to_ruby)
          chosen = choose_by_index("Which phenotype traits? #{pheno_names}", pheno_names)
          options[:measures] = chosen if chosen
        end

        base = ask_on_ambiguous ? ask_label("Output file base?", var) : var
        bc.generate_n3(con, var, base, options)

      elsif r_classes.include? "matrix"
        mat = R2RDF::Reader::RMatrix.new

        if ask_on_ambiguous && !options[:measures]
          rows = ask_label("Row label", "row")
          cols = ask_label("Column label", "column")
          vals = ask_label("Entry label", "value")
          # Stored column-first, as before, although rows are asked first.
          options[:measures] = [cols,rows,vals]
        end

        base = ask_on_ambiguous ? ask_label("Output file base?", var) : var
        mat.generate_n3(con, var, base, options)
      else
        raise "no R2RDF::Reader found for #{r_classes}"
      end
    end

    # Convert an Rserve::REXP directly with the Dataframe reader.
    def self.r_object_from_rexp(object, options, ask_on_ambiguous)
      # NOTE(review): this only checks that a class attribute exists, not
      # that it is "data.frame" -- confirm whether that is intended.
      unless object.attr.payload["class"].payload.first
        raise "support for other Rserve objects coming shortly"
      end

      df = R2RDF::Reader::Dataframe.new
      var = ask_on_ambiguous ? interact("Dataset name?",nil) : nil

      if ask_on_ambiguous && !options[:dimensions]
        dims = object.payload.names
        selection = interact("Which dimensions?","row",dims){|s| puts s; nil}
        options[:dimensions] = selection if selection
      end

      if ask_on_ambiguous && !options[:measures]
        meas = object.payload.names
        options[:measures] = interact("Which measures?",meas,meas)
      end

      df.generate_n3(object,var,options)
    end

    # One line from standard input without its newline; "" at end of input.
    # Reads $stdin explicitly because bare Kernel#gets reads from files
    # named in ARGV when the program was started with arguments.
    def self.read_line
      $stdin.gets.to_s.chomp
    end

    # Print +prompt+, read a comma-separated list of indices and return the
    # corresponding entries of +names+; nil when the answer is empty.
    def self.choose_by_index(prompt, names)
      puts prompt
      selection = read_line
      return nil if selection.empty?

      selection.split(',').map(&:to_i).map { |i| names[i] }
    end

    # Print +prompt+ and return the typed answer, or +default+ when empty.
    def self.ask_label(prompt, default)
      puts prompt
      answer = read_line
      answer.empty? ? default : answer
    end

    # Escape backslashes and single quotes so +text+ can be embedded in a
    # single-quoted R string literal.
    def self.r_escape(text)
      text.gsub(/[\\']/) { |char| "\\#{char}" }
    end

    private_class_method :infer_extension, :r_object_from_file,
                         :r_object_from_rexp, :read_line,
                         :choose_by_index, :ask_label, :r_escape
  end
end
@@ -1,57 +1,72 @@
1
1
  module R2RDF
2
- module Dataset
3
- module Interactive
4
- #to be called by other classes if user input is required
5
- def defaults
6
- {
7
- load_from_file: false
8
- }
9
- end
10
-
11
- def interactive(options={})
12
- options = defaults.merge(options)
13
- qb = {}
14
-
15
- puts "load config from file? [y/N]"
16
- if gets.chomp == "y"
17
- #use yaml or DSL file to configure
18
- else
19
- qb[:dimensions] = dimensions()
20
- qb[:measures] = measures()
21
- end
22
-
23
- puts "load data from file? [y/N]"
24
- if gets.chomp == "y"
25
- #attempt to load dataset from file, ask user to resolve problems or ambiguity
26
- else
27
- end
28
- qb
29
- end
30
-
31
- def dimensions
32
- puts "Enter a list of dimensions, separated by commas"
33
- arr = gets.chomp.split(",")
34
- dims = {}
35
-
36
- arr.map{|dim|
37
- puts "What is the range of #{dim.chomp.strip}? [:coded]"
38
- type = gets.chomp
39
- type = :coded if type == ":coded" || type == ""
40
- dims[dim.chomp.strip] = {type: type}
41
- }
42
-
43
- dims
44
- end
45
-
46
- def measures
47
- puts "Enter a list of measures, separated by commas"
48
- arr = gets.chomp.split(",")
49
- meas = []
50
-
51
- arr.map{|m| meas << m.chomp.strip}
52
-
53
- meas
54
- end
55
- end
56
- end
2
+ module Interactive
3
+ #to be called by other classes if user input is required
4
+
5
# Prompt on standard output and read one line of input from $stdin.
#
# message - prompt text.
# default - value returned when the answer is empty and no block is given.
# options - optional list of permitted choices. The answer is then a
#           comma-separated list of either indices into +options+ or
#           option values.
#
# With an empty answer and a block, the block is called with the empty
# string and its result is returned.
#
# Returns the typed String (no +options+), an Array of selected options,
# the block result, or +default+.
# Raises RuntimeError for an index or value that is not in +options+.
def interact(message, default, options=nil)
  puts "#{message} (#{default})\n[#{options}]"
  # $stdin.gets rather than bare gets: Kernel#gets reads from files named in
  # ARGV when present; to_s covers end of input (nil).
  str = $stdin.gets.to_s.chomp

  if str.empty?
    return yield(str) if block_given?
    return default
  end
  return str unless options

  picks = str.split(',')
  indices = picks.map do |pick|
    begin
      Integer(pick, 10)
    rescue ArgumentError
      nil
    end
  end

  if indices.all?
    # fetch raises instead of quietly returning nil for a bad index.
    indices.map { |i| options.fetch(i) { raise "unknown selection #{i}" } }
  else
    picks.map(&:strip).each do |pick|
      raise "unknown selection #{pick}" unless options.include? pick
    end
  end
end
26
+
27
+ # def interactive(options={})
28
+ # options = defaults.merge(options)
29
+ # qb = {}
30
+
31
+ # puts "load config from file? [y/N]"
32
+ # if gets.chomp == "y"
33
+ # #use yaml or DSL file to configure
34
+ # else
35
+ # qb[:dimensions] = dimensions()
36
+ # qb[:measures] = measures()
37
+ # end
38
+
39
+ # puts "load data from file? [y/N]"
40
+ # if gets.chomp == "y"
41
+ # #attempt to load dataset from file, ask user to resolve problems or ambiguity
42
+ # else
43
+ # end
44
+ # qb
45
+ # end
46
+
47
+ # def dimensions
48
+ # puts "Enter a list of dimensions, separated by commas"
49
+ # arr = gets.chomp.split(",")
50
+ # dims = {}
51
+
52
+ # arr.map{|dim|
53
+ # puts "What is the range of #{dim.chomp.strip}? [:coded]"
54
+ # type = gets.chomp
55
+ # type = :coded if type == ":coded" || type == ""
56
+ # dims[dim.chomp.strip] = {type: type}
57
+ # }
58
+
59
+ # dims
60
+ # end
61
+
62
+ # def measures
63
+ # puts "Enter a list of measures, separated by commas"
64
+ # arr = gets.chomp.split(",")
65
+ # meas = []
66
+
67
+ # arr.map{|m| meas << m.chomp.strip}
68
+
69
+ # meas
70
+ # end
71
+ end
57
72
  end
@@ -6,6 +6,8 @@ end
6
6
 
7
7
  module R2RDF
8
8
  module Metadata
9
+ include R2RDF::Parser
10
+
9
11
  def defaults
10
12
  {
11
13
  encode_nulls: false,
@@ -18,7 +20,20 @@ module R2RDF
18
20
  #make it just "var", and try to make that clear to calling classes
19
21
 
20
22
  fields[:var] = sanitize([fields[:var]]).first
23
+
24
+ unless fields[:creator]
25
+ if ENV['USER']
26
+ fields[:creator] = ENV['USER']
27
+ elsif ENV['USERNAME']
28
+ fields[:creator] = ENV['USERNAME']
29
+ end
30
+ end
31
+
32
+ fields[:date] = Time.now.strftime("%Y-%m-%d") unless fields[:date]
33
+
21
34
  options = defaults().merge(options)
35
+
36
+ #TODO some of these should probably be resources, eg dct:creator, or put under DC namespace
22
37
  str = <<-EOF.unindent
23
38
  ns:dataset-#{fields[:var]} rdfs:label "#{fields[:title]}";
24
39
  dct:title "#{fields[:title]}";
@@ -52,19 +67,77 @@ module R2RDF
52
67
  def provenance(fields, options={})
53
68
  #TODO: should either add a prefixes method or replace some with full URIs
54
69
  var = sanitize([fields[:var]]).first
70
+ creator = fields[:creator] if fields[:creator] #should be URI
71
+ org = fields[:organization] if fields[:organization] #should be URI
55
72
  source_software = fields[:software] # software name, object type, optionally steps list for, eg, R
73
+ str = "ns:dataset-#{var} a prov:Entity.\n\n"
74
+ assoc_id = Time.now.nsec.to_s(32)
75
+ endstr = <<-EOF.unindent
76
+ </ns/R2RDF> a prov:Agent .
77
+ ns:dataset-#{var} prov:wasGeneratredBy ns:activity-0 .
78
+
79
+ ns:activity-0 a prov:Activity ;
80
+ prov:qualifiedAssociation ns:assoc-0_#{assoc_id};
81
+ prov:generated ns:dataset-#{var} .
82
+
83
+ ns:assoc-0_#{assoc_id} a prov:Assocation ;
84
+ prov:entity </ns/R2RDF>;
85
+ prov:hadPlan ns:plan-0.
86
+
87
+ ns:plan-0 a prov:Plan ;
88
+ rdfs:comment "generation of dataset-#{var} by R2RDF gem".
89
+
90
+ EOF
91
+
92
+ if creator
93
+ str << "<#{creator}> a prov:Agent, prov:Person .\n"
94
+ str << "</ns/R2RDF> prov:actedOnBehalfOf <#{creator}> .\n\n"
95
+
96
+ if org
97
+ str << "<#{org}> a prov:Agent, prov:Organization .\n"
98
+ str << "<#{creator}> prov:actedOnBehalfOf <#{org}> .\n"
99
+ end
100
+ end
56
101
 
57
- str = "qb:dataset-#{var} a prov:Entity.\n"
58
- endstr = "qb:dataset-#{var} prov:wasGeneratredBy <#{options[:base_url]}/ns/R2RDF>\n" #replace once gem has an actual name
59
102
  if source_software
60
- source_software = [source_software] unless source_software.respond_to? :map
61
- source_software.map{|soft|
62
- str << "<#{options[:base_url]}/ns/prov/software/#{soft}> a prov:Entity .\n"
103
+ source_software = [source_software] unless source_software.is_a? Array
104
+ source_software.each_with_index.map{|soft,i|
105
+ str << "</ns/prov/software/#{soft[:name]}> a prov:Agent .\n"
63
106
 
64
- #Note: probably should say derived from the software object, then software object from software.
65
- endstr << "qb:dataset-#{var} prov:wasDerivedFrom <#{options[:base_url]}/ns/prov/#{soft}> .\n"
107
+ endstr << "ns:activity-0 prov:used </ns/dataset/#{var}#var> .\n"
108
+ endstr << "ns:dataset-#{var} prov:wasDerivedFrom </ns/dataset/#{var}#var> .\n\n"
109
+
110
+ if soft[:process]
111
+ if File.exist? soft[:process]
112
+ soft[:process] = IO.read(soft[:process])
113
+ end
114
+ endstr << "</ns/dataset/#{var}#var> prov:wasGeneratredBy ns:activity-#{i+1} .\n"
115
+ endstr << process(i+1, soft[:process],"/ns/prov/software/#{soft[:name]}", var)
116
+ end
66
117
  }
67
118
  end
119
+ str + "\n" + endstr
120
+ end
121
+
122
# Build the Turtle text for one software processing activity.
#
# id                - activity number, used in the activity, association
#                     and plan names.
# step_string       - newline-separated steps; each line becomes one quoted
#                     member of the rdfs:comment list.
# software_resource - resource path of the software agent, written as <...>.
# software_var      - dataset variable name used in </ns/dataset/NAME#var>.
# options           - accepted for interface compatibility; not read here.
#
# Returns the Turtle String.
def process(id, step_string, software_resource, software_var, options={})
  #TODO a better predicate for the steplist than rdfs:comment
  # and make sure it looks good.
  # Backslashes and double quotes are escaped so each step stays a valid
  # quoted literal; \r is dropped along with \n.
  quoted_steps = step_string.split(/\r?\n/).map do |step|
    '"' + step.gsub(/["\\]/) { |char| "\\#{char}" } + '"'
  end
  steps = quoted_steps.empty? ? '""' : quoted_steps.join(' ')
  assoc_id = Time.now.nsec.to_s(32)
  # The association is referenced and defined under the same name
  # (ns:assoc-ID_SUFFIX), and the final statement is closed with ".".
  <<-EOF.unindent
    ns:activity-#{id} a prov:Activity ;
      prov:qualifiedAssociation ns:assoc-#{id}_#{assoc_id} ;
      prov:used </ns/dataset/#{software_var}#var>.

    ns:assoc-#{id}_#{assoc_id} a prov:Association ;
      prov:agent <#{software_resource}>;
      prov:hadPlan ns:plan-#{id}.

    ns:plan-#{id} a prov:Plan ;
      rdfs:comment (#{steps}) .

  EOF
end
69
142
 
70
143
  def r2rdf_metadata
@@ -77,7 +150,7 @@ module R2RDF
77
150
 
78
151
  def org_metadata
79
152
  str <<-EOF.unindent
80
- <http://sciruby.com/> a org:Organization;
153
+ <http://sciruby.com/> a org:Organization, prov:Organization;
81
154
  skos:prefLabel "SciRuby";
82
155
  rdfs:description "A Project to Build and Improve Tools for Scientific Computing in Ruby".
83
156
  EOF
@@ -1,5 +1,34 @@
1
1
  module R2RDF
2
2
  module Parser
3
+
4
# Return a new array in which every String entry has each whitespace
# character and each "." replaced by "_"; other entries pass through as is.
def sanitize(array)
  array.map do |entry|
    entry.is_a?(String) ? entry.gsub(/[\s\.]/, '_') : entry
  end
end
16
+
17
# Rename, in place, every String key of +h+ that contains a space so that
# spaces become underscores. Keys that need no change are left where they
# are (they are no longer deleted and reinserted). If the new name already
# exists, its value is overwritten.
#
# Returns the same Hash object +h+.
def sanitize_hash(h)
  # Iterate over a snapshot of the keys because the hash is modified inside.
  h.keys.each do |key|
    next unless key.is_a?(String) && key.include?(' ')

    h[key.gsub(' ', '_')] = h.delete(key)
  end
  h
end
31
+
3
32
  def create_graph(string)
4
33
  f = Tempfile.new('graph')
5
34
  f.write(string)
@@ -57,8 +86,54 @@ module R2RDF
57
86
  end
58
87
  end
59
88
 
89
# Convert a value to the text used for it as a resource.
#
# obj     - String, Numeric, nil or anything else.
# options - Hash; only :encode_nulls is read, and only when obj is nil.
#
# Returns:
#   String  - wrapped in <...> when it starts with http:// or https://,
#             "NA" when empty; spaces become "_" and "?" is removed
#   nil     - '"NA"' when options[:encode_nulls] is set, otherwise nil
#   Numeric - "n" followed by the number
#   other   - returned unchanged
def to_resource(obj, options)
  if obj.is_a? String
    # \A anchors to the start of the whole string (^ matches after any
    # newline); https URLs are wrapped like http ones.
    obj = "<#{obj}>" if obj =~ /\Ahttps?:\/\//

    #TODO decide the right way to handle missing values, since RDF has no null
    #probably throw an error here since a missing resource is a bigger problem
    obj = "NA" if obj.empty?

    #TODO remove special characters (faster) as well (eg '?')
    # NOTE(review): this also strips "?" from wrapped URLs -- confirm intent.
    obj.tr(' ', '_').delete('?')
  elsif obj.nil? && options[:encode_nulls]
    '"NA"'
  elsif obj.is_a? Numeric
    #resources cannot be referred to purely by integer (?)
    "n#{obj}"
  else
    obj
  end
end
108
+
109
# Convert a value to the form used for it as a literal.
#
# obj     - String, nil or anything else.
# options - Hash; only :encode_nulls is read, and only when obj is nil.
#
# Returns:
#   String - an Integer when it is a base-10 integer, a Float when it is a
#            decimal float, otherwise the text in double quotes with
#            backslashes and double quotes escaped
#   nil    - '"NA"' when options[:encode_nulls] is set, otherwise nil
#   other  - returned unchanged
def to_literal(obj, options)
  if obj.is_a? String
    # Base 10 is forced: Integer("010") would otherwise be read as octal 8
    # and "0x1A" as hexadecimal 26.
    begin
      return Integer(obj, 10)
    rescue ArgumentError
      # not an integer; try float next
    end

    # Float() also accepts hexadecimal text, so that form is excluded.
    unless obj =~ /\A\s*[+-]?0x/i
      begin
        return Float(obj)
      rescue ArgumentError
        # not a number; fall through to a quoted string
      end
    end

    '"' + obj.gsub(/["\\]/) { |char| "\\#{char}" } + '"'
  elsif obj.nil? && options[:encode_nulls]
    #TODO decide the right way to handle missing values, since RDF has no null
    '"NA"'
  else
    obj
  end
end
127
+
60
128
  def strip_uri(uri)
61
- uri.to_s.split('/').last.split('#').last
129
+ uri = uri.to_s.dup
130
+ uri[-1] = '' if uri[-1] == '>'
131
+ uri.to_s.split('/').last.split('#').last
132
+ end
133
+
134
# Return the text after the last ":" of +string+ (converted with to_s),
# or the whole text when it has no ":".
#
# Always returns a String; "" for empty or nil input, where the previous
# version returned nil.
def strip_prefixes(string)
  string.to_s.split(':').last.to_s
end
137
+
63
138
  end
64
139
  end