bio-publisci 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. data/.travis.yml +1 -1
  3. data/Gemfile +1 -1
  4. data/Rakefile +4 -6
  5. data/features/integration_steps.rb +1 -1
  6. data/features/metadata.feature +24 -0
  7. data/features/metadata_steps.rb +21 -0
  8. data/lib/bio-publisci/dataset/ORM/data_cube_orm.rb +36 -14
  9. data/lib/bio-publisci/dataset/ORM/observation.rb +1 -1
  10. data/lib/bio-publisci/dataset/data_cube.rb +192 -131
  11. data/lib/bio-publisci/dataset/dataset_for.rb +150 -0
  12. data/lib/bio-publisci/dataset/interactive.rb +70 -55
  13. data/lib/bio-publisci/metadata/metadata.rb +81 -8
  14. data/lib/bio-publisci/parser.rb +76 -1
  15. data/lib/bio-publisci/readers/big_cross.rb +118 -117
  16. data/lib/bio-publisci/readers/csv.rb +37 -2
  17. data/lib/bio-publisci/readers/r_matrix.rb +1 -1
  18. data/lib/bio-publisci/store.rb +31 -31
  19. data/lib/bio-publisci/writers/arff.rb +48 -49
  20. data/lib/bio-publisci.rb +3 -0
  21. data/resources/queries/code_resources.rq +10 -0
  22. data/resources/queries/dimension_ranges.rq +3 -3
  23. data/resources/queries/dimensions.rq +3 -3
  24. data/resources/queries/measures.rq +3 -3
  25. data/resources/queries/observation_labels.rq +8 -0
  26. data/resources/queries/properties.rq +8 -0
  27. data/scripts/islet_mlratio.rb +6 -0
  28. data/scripts/scan_islet.rb +6 -0
  29. data/scripts/update_reference.rb +20 -0
  30. data/spec/ORM/data_cube_orm_spec.rb +12 -0
  31. data/spec/data_cube_spec.rb +1 -1
  32. data/spec/generators/dataframe_spec.rb +1 -1
  33. data/spec/generators/r_matrix_spec.rb +1 -1
  34. data/spec/r_builder_spec.rb +6 -6
  35. data/spec/resource/.RData +0 -0
  36. data/spec/resource/example.Rhistory +3 -0
  37. data/spec/turtle/bacon +4 -22
  38. data/spec/turtle/reference +9 -27
  39. metadata +37 -56
  40. data/lib/bio-publisci/loader.rb +0 -36
  41. data/spec/bio-publisci_spec.rb +0 -7
@@ -0,0 +1,150 @@
1
module R2RDF
  # Chooses a reader for an input object (a file path or an Rserve::REXP) and
  # hands the object to it, optionally prompting on STDIN for anything the
  # caller did not supply in +options+.
  class Dataset
    extend R2RDF::Interactive

    # Dispatches +object+ to a reader based on its type / file extension.
    #
    # @param object [String, Rserve::REXP] path to an existing file, or an R object
    # @param options [Hash] reader options; may be filled in from user answers
    # @param ask_on_ambiguous [Boolean] prompt on STDIN for missing choices
    # @return whatever the selected reader returns
    # @raise [RuntimeError] if the file is missing, its type cannot be
    #   determined, or no reader handles the object
    def self.for(object, options={}, ask_on_ambiguous=true)
      if object.is_a? String
        raise "Unable to find reader for File or String" unless File.exist? object

        case file_extension(object)
        when ".RData"
          r_object(object, options, ask_on_ambiguous)
        when /\A\.csv\z/i
          # Anchored and dot-escaped: the earlier /.csv/i also matched
          # extensions such as ".xcsv" or ".csvx".
          R2RDF::Reader::CSV.new.automatic(object, nil, options, ask_on_ambiguous)
        else
          # Previously fell through and returned nil without any diagnostic.
          raise "Unable to find reader for file #{object}"
        end
      elsif object.is_a? Rserve::REXP
        r_object(object, options, ask_on_ambiguous)
      else
        raise "not recognize Ruby objects of this type yet (#{object})"
      end
    end

    # Converts an R object, given either as the path of a saved workspace
    # (String) or as an Rserve::REXP, using the matching reader.
    #
    # @param object [String, Rserve::REXP]
    # @param options [Hash] may be mutated with :dimensions / :measures answers
    # @param ask_on_ambiguous [Boolean] prompt on STDIN for missing choices
    # @return whatever the reader's generate_n3 returns
    # @raise [RuntimeError] if +object+ is neither a String nor an Rserve::REXP,
    #   or no reader exists for its R class
    def self.r_object(object, options={}, ask_on_ambiguous=true)
      if object.is_a? String
        r_object_from_file(object, options, ask_on_ambiguous)
      elsif object.is_a? Rserve::REXP
        r_object_from_rexp(object, options, ask_on_ambiguous)
      else
        raise "#{object} is not an R object"
      end
    end

    # Returns the extension used for dispatch; a dotfile such as ".RData"
    # (single leading dot, no other dots) counts as its own extension.
    def self.file_extension(path)
      extension = File.extname(path)
      return extension if extension.size > 0

      basename = File.basename(path)
      return basename if basename[0] == '.' && basename.count('.') == 1

      raise "Can't load file #{path}; type inference not yet implemented"
    end

    # Loads the workspace at +path+ over a new Rserve connection, picks one of
    # the loaded variables and converts it according to its R class.
    def self.r_object_from_file(path, options, ask_on_ambiguous)
      con = Rserve::Connection.new
      # Escape backslashes and single quotes so the path cannot terminate the
      # R string literal it is embedded in.
      r_path = File.absolute_path(path).gsub(/[\\']/) { |c| "\\#{c}" }
      names = con.eval("load('#{r_path}')").to_ruby

      if names.size > 1 && ask_on_ambiguous
        puts "Which variable? #{names}"
        var = names[gets.to_i]
      else
        var = names[0]
      end

      r_classes = con.eval("class(#{var})").to_ruby

      if r_classes.include? "data.frame"
        df = R2RDF::Reader::Dataframe.new
        if ask_on_ambiguous && !options[:dimensions]
          dims = con.eval("names(#{var})").to_ruby
          chosen = choose_by_index("Which dimensions? #{dims}", dims)
          options[:dimensions] = chosen if chosen
        end
        if ask_on_ambiguous && !options[:measures]
          meas = con.eval("names(#{var})").to_ruby
          chosen = choose_by_index("Which measures? #{meas} ", meas)
          options[:measures] = chosen if chosen
        end

        df.generate_n3(con.eval(var), var, options)
      elsif r_classes.include? "cross"
        bc = R2RDF::Reader::BigCross.new
        if ask_on_ambiguous && !options[:measures]
          pheno_names = con.eval("names(#{var}$pheno)").to_ruby
          chosen = choose_by_index("Which phenotype traits? #{pheno_names}", pheno_names)
          options[:measures] = chosen if chosen
        end
        base = ask_on_ambiguous ? ask_with_default("Output file base?", var) : var

        bc.generate_n3(con, var, base, options)
      elsif r_classes.include? "matrix"
        mat = R2RDF::Reader::RMatrix.new
        if ask_on_ambiguous && !options[:measures]
          rows = ask_with_default("Row label", "row")
          cols = ask_with_default("Column label", "column")
          vals = ask_with_default("Entry label", "value")
          options[:measures] = [cols, rows, vals]
        end
        base = ask_on_ambiguous ? ask_with_default("Output file base?", var) : var

        mat.generate_n3(con, var, base, options)
      else
        raise "no R2RDF::Reader found for #{r_classes}"
      end
    end

    # Converts an in-memory Rserve object with the Dataframe reader.
    def self.r_object_from_rexp(object, options, ask_on_ambiguous)
      # NOTE(review): this only checks that a first class attribute exists, not
      # that it is "data.frame" -- confirm whether other classes should be rejected.
      unless object.attr.payload["class"].payload.first
        raise "support for other Rserve objects coming shortly"
      end

      df = R2RDF::Reader::Dataframe.new
      var = ask_on_ambiguous ? interact("Dataset name?", nil) : nil

      if ask_on_ambiguous && !options[:dimensions]
        dims = object.payload.names
        selection = interact("Which dimensions?", "row", dims) { |s| puts s; nil }
        options[:dimensions] = selection if selection
      end

      if ask_on_ambiguous && !options[:measures]
        meas = object.payload.names
        options[:measures] = interact("Which measures?", meas, meas)
      end

      df.generate_n3(object, var, options)
    end

    # Prints +prompt+, reads a comma-separated list of integer indices and
    # returns the matching entries of +names+; nil when the answer is empty.
    def self.choose_by_index(prompt, names)
      puts prompt
      selection = gets.to_s.chomp
      return nil if selection.empty?

      selection.split(',').map { |index| names[index.to_i] }
    end

    # Prints +prompt+ and returns the typed line, or +default+ when it is empty.
    def self.ask_with_default(prompt, default)
      puts prompt
      answer = gets.to_s.chomp
      answer.size > 0 ? answer : default
    end

    private_class_method :file_extension, :r_object_from_file,
                         :r_object_from_rexp, :choose_by_index, :ask_with_default
  end
end
@@ -1,57 +1,72 @@
1
1
  module R2RDF
2
- module Dataset
3
- module Interactive
4
- #to be called by other classes if user input is required
5
- def defaults
6
- {
7
- load_from_file: false
8
- }
9
- end
10
-
11
- def interactive(options={})
12
- options = defaults.merge(options)
13
- qb = {}
14
-
15
- puts "load config from file? [y/N]"
16
- if gets.chomp == "y"
17
- #use yaml or DSL file to configure
18
- else
19
- qb[:dimensions] = dimensions()
20
- qb[:measures] = measures()
21
- end
22
-
23
- puts "load data from file? [y/N]"
24
- if gets.chomp == "y"
25
- #attempt to load dataset from file, ask user to resolve problems or ambiguity
26
- else
27
- end
28
- qb
29
- end
30
-
31
- def dimensions
32
- puts "Enter a list of dimensions, separated by commas"
33
- arr = gets.chomp.split(",")
34
- dims = {}
35
-
36
- arr.map{|dim|
37
- puts "What is the range of #{dim.chomp.strip}? [:coded]"
38
- type = gets.chomp
39
- type = :coded if type == ":coded" || type == ""
40
- dims[dim.chomp.strip] = {type: type}
41
- }
42
-
43
- dims
44
- end
45
-
46
- def measures
47
- puts "Enter a list of measures, separated by commas"
48
- arr = gets.chomp.split(",")
49
- meas = []
50
-
51
- arr.map{|m| meas << m.chomp.strip}
52
-
53
- meas
54
- end
55
- end
56
- end
2
+ module Interactive
3
+ #to be called by other classes if user input is required
4
+
5
+ #take message, options, defaults. can be passed block to handle default as well
6
# Prompts on STDOUT and reads one line of input.
#
# @param message [String] the question; printed together with +default+ and +options+
# @param default [Object] returned when the answer is empty and no block is given
# @param options [Array, nil] allowed choices; when given, the answer is a
#   comma-separated list of either indices into +options+ or option names
# @yield [String] the (empty) answer, when the user just pressed return
# @return [String] the raw answer when +options+ is nil;
#   [Array] the selected options / validated names when +options+ is given;
#   otherwise the block's value or +default+
# @raise [RuntimeError] if a named or indexed selection is not in +options+
def interact(message, default, options=nil)
  puts message + " (#{default})\n[#{options}]"
  # gets returns nil at end of input; treat that like an empty answer.
  str = gets.to_s.chomp

  if str.size > 0
    return str unless options

    picks = str.split(',').map(&:strip)
    if picks.all? { |pick| pick =~ /\A[+-]?\d+\z/ }
      # fetch (rather than []) so an index outside the list is reported
      # instead of silently yielding nil.
      picks.map { |pick| options.fetch(pick.to_i) { raise "unknown selection #{pick}" } }
    else
      picks.each { |pick| raise "unknown selection #{pick}" unless options.include? pick }
    end
  elsif block_given?
    yield str
  else
    default
  end
end
26
+
27
+ # def interactive(options={})
28
+ # options = defaults.merge(options)
29
+ # qb = {}
30
+
31
+ # puts "load config from file? [y/N]"
32
+ # if gets.chomp == "y"
33
+ # #use yaml or DSL file to configure
34
+ # else
35
+ # qb[:dimensions] = dimensions()
36
+ # qb[:measures] = measures()
37
+ # end
38
+
39
+ # puts "load data from file? [y/N]"
40
+ # if gets.chomp == "y"
41
+ # #attempt to load dataset from file, ask user to resolve problems or ambiguity
42
+ # else
43
+ # end
44
+ # qb
45
+ # end
46
+
47
+ # def dimensions
48
+ # puts "Enter a list of dimensions, separated by commas"
49
+ # arr = gets.chomp.split(",")
50
+ # dims = {}
51
+
52
+ # arr.map{|dim|
53
+ # puts "What is the range of #{dim.chomp.strip}? [:coded]"
54
+ # type = gets.chomp
55
+ # type = :coded if type == ":coded" || type == ""
56
+ # dims[dim.chomp.strip] = {type: type}
57
+ # }
58
+
59
+ # dims
60
+ # end
61
+
62
+ # def measures
63
+ # puts "Enter a list of measures, separated by commas"
64
+ # arr = gets.chomp.split(",")
65
+ # meas = []
66
+
67
+ # arr.map{|m| meas << m.chomp.strip}
68
+
69
+ # meas
70
+ # end
71
+ end
57
72
  end
@@ -6,6 +6,8 @@ end
6
6
 
7
7
  module R2RDF
8
8
  module Metadata
9
+ include R2RDF::Parser
10
+
9
11
  def defaults
10
12
  {
11
13
  encode_nulls: false,
@@ -18,7 +20,20 @@ module R2RDF
18
20
  #make it just "var", and try to make that clear to calling classes
19
21
 
20
22
  fields[:var] = sanitize([fields[:var]]).first
23
+
24
+ unless fields[:creator]
25
+ if ENV['USER']
26
+ fields[:creator] = ENV['USER']
27
+ elsif ENV['USERNAME']
28
+ fields[:creator] = ENV['USERNAME']
29
+ end
30
+ end
31
+
32
+ fields[:date] = Time.now.strftime("%Y-%m-%d") unless fields[:date]
33
+
21
34
  options = defaults().merge(options)
35
+
36
+ #TODO some of these should probably be resources, eg dct:creator, or put under DC namespace
22
37
  str = <<-EOF.unindent
23
38
  ns:dataset-#{fields[:var]} rdfs:label "#{fields[:title]}";
24
39
  dct:title "#{fields[:title]}";
@@ -52,19 +67,77 @@ module R2RDF
52
67
  def provenance(fields, options={})
53
68
  #TODO: should either add a prefixes method or replace some with full URIs
54
69
  var = sanitize([fields[:var]]).first
70
+ creator = fields[:creator] if fields[:creator] #should be URI
71
+ org = fields[:organization] if fields[:organization] #should be URI
55
72
  source_software = fields[:software] # software name, object type, optionally steps list for, eg, R
73
+ str = "ns:dataset-#{var} a prov:Entity.\n\n"
74
+ assoc_id = Time.now.nsec.to_s(32)
75
+ endstr = <<-EOF.unindent
76
+ </ns/R2RDF> a prov:Agent .
77
+ ns:dataset-#{var} prov:wasGeneratredBy ns:activity-0 .
78
+
79
+ ns:activity-0 a prov:Activity ;
80
+ prov:qualifiedAssociation ns:assoc-0_#{assoc_id};
81
+ prov:generated ns:dataset-#{var} .
82
+
83
+ ns:assoc-0_#{assoc_id} a prov:Assocation ;
84
+ prov:entity </ns/R2RDF>;
85
+ prov:hadPlan ns:plan-0.
86
+
87
+ ns:plan-0 a prov:Plan ;
88
+ rdfs:comment "generation of dataset-#{var} by R2RDF gem".
89
+
90
+ EOF
91
+
92
+ if creator
93
+ str << "<#{creator}> a prov:Agent, prov:Person .\n"
94
+ str << "</ns/R2RDF> prov:actedOnBehalfOf <#{creator}> .\n\n"
95
+
96
+ if org
97
+ str << "<#{org}> a prov:Agent, prov:Organization .\n"
98
+ str << "<#{creator}> prov:actedOnBehalfOf <#{org}> .\n"
99
+ end
100
+ end
56
101
 
57
- str = "qb:dataset-#{var} a prov:Entity.\n"
58
- endstr = "qb:dataset-#{var} prov:wasGeneratredBy <#{options[:base_url]}/ns/R2RDF>\n" #replace once gem has an actual name
59
102
  if source_software
60
- source_software = [source_software] unless source_software.respond_to? :map
61
- source_software.map{|soft|
62
- str << "<#{options[:base_url]}/ns/prov/software/#{soft}> a prov:Entity .\n"
103
+ source_software = [source_software] unless source_software.is_a? Array
104
+ source_software.each_with_index.map{|soft,i|
105
+ str << "</ns/prov/software/#{soft[:name]}> a prov:Agent .\n"
63
106
 
64
- #Note: probably should say derived from the software object, then software object from software.
65
- endstr << "qb:dataset-#{var} prov:wasDerivedFrom <#{options[:base_url]}/ns/prov/#{soft}> .\n"
107
+ endstr << "ns:activity-0 prov:used </ns/dataset/#{var}#var> .\n"
108
+ endstr << "ns:dataset-#{var} prov:wasDerivedFrom </ns/dataset/#{var}#var> .\n\n"
109
+
110
+ if soft[:process]
111
+ if File.exist? soft[:process]
112
+ soft[:process] = IO.read(soft[:process])
113
+ end
114
+ endstr << "</ns/dataset/#{var}#var> prov:wasGeneratredBy ns:activity-#{i+1} .\n"
115
+ endstr << process(i+1, soft[:process],"/ns/prov/software/#{soft[:name]}", var)
116
+ end
66
117
  }
67
118
  end
119
+ str + "\n" + endstr
120
+ end
121
+
122
# Builds the Turtle statements for one software activity, its qualified
# association and its plan (the list of steps that were run).
#
# @param id [Integer, String] suffix used for the activity / plan node names
# @param step_string [String] newline-separated steps; each line becomes one
#   string in the plan's rdfs:comment list
# @param software_resource [String] resource path of the software agent
# @param software_var [String] dataset name used in the </ns/dataset/...> URI
# @param options [Hash] currently unused
# @return [String] the generated statements
def process(id, step_string, software_resource, software_var, options={})
  #TODO a better predicate for the steplist than rdfs:comment
  # and make sure it looks good.

  # Backslashes and double quotes must be escaped inside a Turtle string
  # literal; step lines (e.g. R code) commonly contain both.
  escaped = step_string.split("\n").map { |step| step.gsub(/["\\]/) { |c| "\\#{c}" } }
  steps = '"' + escaped.join('" "') + '"'

  # One node name used for both the reference and the declaration, so the
  # activity actually points at the association described below it.
  assoc = "ns:assoc-#{id}_#{Time.now.nsec.to_s(32)}"

  # NOTE(review): PROV-O uses prov:agent on an Association; prov:entity is
  # left unchanged pending confirmation.
  <<-EOF.unindent
    ns:activity-#{id} a prov:Activity ;
      prov:qualifiedAssociation #{assoc} ;
      prov:used </ns/dataset/#{software_var}#var>.

    #{assoc} a prov:Association ;
      prov:entity <#{software_resource}>;
      prov:hadPlan ns:plan-#{id}.

    ns:plan-#{id} a prov:Plan ;
      rdfs:comment (#{steps}).

  EOF
end
69
142
 
70
143
  def r2rdf_metadata
@@ -77,7 +150,7 @@ module R2RDF
77
150
 
78
151
  def org_metadata
79
152
  str <<-EOF.unindent
80
- <http://sciruby.com/> a org:Organization;
153
+ <http://sciruby.com/> a org:Organization, prov:Organization;
81
154
  skos:prefLabel "SciRuby";
82
155
  rdfs:description "A Project to Build and Improve Tools for Scientific Computing in Ruby".
83
156
  EOF
@@ -1,5 +1,34 @@
1
1
  module R2RDF
2
2
  module Parser
3
+
4
# Returns a copy of +array+ in which every String entry has each whitespace
# character and each '.' replaced by '_'. Non-String entries are passed
# through unchanged.
#
# @param array [Array] values to clean
# @return [Array] a new array; +array+ itself is not modified
def sanitize(array)
  array.map do |entry|
    entry.is_a?(String) ? entry.gsub(/[\s\.]/, '_') : entry
  end
end
16
+
17
# Renames, in place, every String key of +h+ that contains a space so that
# spaces become underscores. Non-String keys and keys without spaces are left
# where they are (they are no longer deleted and re-inserted, which used to
# move them to the end of the hash).
#
# If the renamed key already exists, its value is overwritten by the value of
# the key that contained the space.
#
# @param h [Hash] hash to modify
# @return [Hash] the same +h+ object
def sanitize_hash(h)
  # Iterate over a snapshot of the keys because the hash is modified inside.
  h.keys.each do |key|
    next unless key.is_a? String

    clean = key.gsub(' ', '_')
    next if clean == key

    h[clean] = h.delete(key)
  end

  h
end
31
+
3
32
  def create_graph(string)
4
33
  f = Tempfile.new('graph')
5
34
  f.write(string)
@@ -57,8 +86,54 @@ module R2RDF
57
86
  end
58
87
  end
59
88
 
89
# Converts a value into the text used for it in resource position.
#
# * String: wrapped in <...> when it starts with "http://"; an empty string
#   becomes "NA"; spaces are replaced by '_' and '?' is removed.
# * nil: '"NA"' when options[:encode_nulls] is set, otherwise returned as is.
# * Numeric: prefixed with "n".
# * anything else: returned unchanged.
#
# @param obj [Object] value to convert
# @param options [Hash] reads :encode_nulls
# @return [String, Object]
def to_resource(obj, options)
  if obj.is_a? String
    # \A instead of ^: ^ also matches after a newline, so a multi-line label
    # whose second line began with "http://" was wrapped as if it were a URI.
    obj = "<#{obj}>" if obj =~ %r{\Ahttp://}

    #TODO decide the right way to handle missing values, since RDF has no null
    #probably throw an error here since a missing resource is a bigger problem
    obj = "NA" if obj.empty?

    #TODO remove special characters (faster) as well (eg '?')
    obj.tr(' ', '_').delete('?')
  elsif obj.nil? && options[:encode_nulls]
    '"NA"'
  elsif obj.is_a? Numeric
    #resources cannot be referred to purely by integer (?)
    "n" + obj.to_s
  else
    obj
  end
end
108
+
109
# Converts a value into the form used for it in literal position.
#
# * String holding a decimal integer or float: the parsed number.
# * any other String: the string wrapped in double quotes.
# * nil: '"NA"' when options[:encode_nulls] is set, otherwise returned as is.
# * anything else: returned unchanged.
#
# Numbers are read as decimal only. Integer() without a base honours radix
# prefixes, which turned "010" into 8 and "0x1A" into 26; such values are now
# read as 10 and as the plain string "0x1A" respectively.
#
# @param obj [Object] value to convert
# @param options [Hash] reads :encode_nulls
# @return [Integer, Float, String, Object]
def to_literal(obj, options)
  if obj.is_a? String
    quoted = '"' + obj + '"'
    # Float() would otherwise accept hexadecimal notation.
    return quoted if obj =~ /\A\s*[+-]?0[xX]/

    begin
      Integer(obj, 10)
    rescue ArgumentError
      begin
        Float(obj)
      rescue ArgumentError
        quoted
      end
    end
  elsif obj.nil? && options[:encode_nulls]
    #TODO decide the right way to handle missing values, since RDF has no null
    '"NA"'
  else
    obj
  end
end
127
+
60
128
  def strip_uri(uri)
61
- uri.to_s.split('/').last.split('#').last
129
+ uri = uri.to_s.dup
130
+ uri[-1] = '' if uri[-1] == '>'
131
+ uri.to_s.split('/').last.split('#').last
132
+ end
133
+
134
# Returns the text after the last ':' of +string+ (the whole text when there
# is no ':'). Always returns a String: empty or nil input yields "" rather
# than nil.
#
# @param string [#to_s] prefixed name such as "qb:dimension"
# @return [String]
def strip_prefixes(string)
  string.to_s.split(':').last.to_s
end
137
+
63
138
  end
64
139
  end