bio-publisci 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +1 -1
- data/Gemfile +1 -1
- data/Rakefile +4 -6
- data/features/integration_steps.rb +1 -1
- data/features/metadata.feature +24 -0
- data/features/metadata_steps.rb +21 -0
- data/lib/bio-publisci/dataset/ORM/data_cube_orm.rb +36 -14
- data/lib/bio-publisci/dataset/ORM/observation.rb +1 -1
- data/lib/bio-publisci/dataset/data_cube.rb +192 -131
- data/lib/bio-publisci/dataset/dataset_for.rb +150 -0
- data/lib/bio-publisci/dataset/interactive.rb +70 -55
- data/lib/bio-publisci/metadata/metadata.rb +81 -8
- data/lib/bio-publisci/parser.rb +76 -1
- data/lib/bio-publisci/readers/big_cross.rb +118 -117
- data/lib/bio-publisci/readers/csv.rb +37 -2
- data/lib/bio-publisci/readers/r_matrix.rb +1 -1
- data/lib/bio-publisci/store.rb +31 -31
- data/lib/bio-publisci/writers/arff.rb +48 -49
- data/lib/bio-publisci.rb +3 -0
- data/resources/queries/code_resources.rq +10 -0
- data/resources/queries/dimension_ranges.rq +3 -3
- data/resources/queries/dimensions.rq +3 -3
- data/resources/queries/measures.rq +3 -3
- data/resources/queries/observation_labels.rq +8 -0
- data/resources/queries/properties.rq +8 -0
- data/scripts/islet_mlratio.rb +6 -0
- data/scripts/scan_islet.rb +6 -0
- data/scripts/update_reference.rb +20 -0
- data/spec/ORM/data_cube_orm_spec.rb +12 -0
- data/spec/data_cube_spec.rb +1 -1
- data/spec/generators/dataframe_spec.rb +1 -1
- data/spec/generators/r_matrix_spec.rb +1 -1
- data/spec/r_builder_spec.rb +6 -6
- data/spec/resource/.RData +0 -0
- data/spec/resource/example.Rhistory +3 -0
- data/spec/turtle/bacon +4 -22
- data/spec/turtle/reference +9 -27
- metadata +37 -56
- data/lib/bio-publisci/loader.rb +0 -36
- data/spec/bio-publisci_spec.rb +0 -7
@@ -0,0 +1,150 @@
|
|
1
|
+
module R2RDF
|
2
|
+
class Dataset
|
3
|
+
extend R2RDF::Interactive
|
4
|
+
|
5
|
+
# Builds a dataset from +object+ by dispatching to the reader that matches
# its type.
#
# object           - path of an existing file (String) or an Rserve::REXP.
# options          - Hash handed unchanged to the selected reader.
# ask_on_ambiguous - handed unchanged to the selected reader.
#
# Returns whatever the selected reader returns.
# Raises RuntimeError when the file does not exist, when its type cannot be
# determined, when no reader handles its extension, or when +object+ is
# neither a String nor an Rserve::REXP.
def self.for(object, options={}, ask_on_ambiguous=true)
  if object.is_a? String
    raise "Unable to find reader for File or String" unless File.exist? object

    extension = File.extname(object)
    if extension.empty?
      # File.extname is empty for dotfiles such as ".RData"; accept the bare
      # file name as the extension when it is a single leading-dot name.
      base = File.basename(object)
      unless base[0] == '.' && base.count('.') == 1
        raise "Can't load file #{object}; type inference not yet implemented"
      end
      extension = base
    end

    case extension
    when ".RData"
      r_object(object, options, ask_on_ambiguous)
    when /\A\.csv\z/i
      # Anchored and escaped so that only a real ".csv" extension (any case)
      # is sent to the CSV reader.
      R2RDF::Reader::CSV.new.automatic(object,nil,options,ask_on_ambiguous)
    else
      # Fail loudly instead of returning nil for an unsupported file type.
      raise "Can't load file #{object}; no reader for #{extension} files"
    end
  elsif object.is_a? Rserve::REXP
    r_object(object, options, ask_on_ambiguous)
  else
    raise "not recognize Ruby objects of this type yet (#{object})"
  end
end
|
31
|
+
|
32
|
+
# Converts an R object into a dataset by handing it to the matching
# R2RDF::Reader.
#
# object           - path of an .RData file (String) or an Rserve::REXP.
# options          - Hash of reader options. :dimensions and :measures are
#                    filled in from console answers when they are missing
#                    and +ask_on_ambiguous+ is true (the Hash is modified).
# ask_on_ambiguous - when true, prompts on the console for anything that is
#                    not given in +options+.
#
# Returns the result of the chosen reader's generate_n3.
# Raises RuntimeError when no variable can be selected, when no reader
# exists for the object's R class, or when +object+ is not an R object.
def self.r_object(object, options={}, ask_on_ambiguous=true)
  # Prints +prompt+ and returns the typed line, or +default+ when the answer
  # is empty. End of input (gets returning nil) counts as an empty answer.
  ask = lambda do |prompt, default|
    puts prompt
    answer = gets.to_s.chomp
    answer.size > 0 ? answer : default
  end

  # Prints +prompt+ and maps the comma-separated indexes typed by the user
  # onto +names+. Returns nil when the answer is empty.
  pick = lambda do |prompt, names|
    answer = ask.call(prompt, nil)
    answer && answer.split(',').map(&:to_i).map{|i| names[i]}
  end

  # Asks for the output file base, falling back to +default+.
  output_base = lambda do |default|
    ask_on_ambiguous ? ask.call("Output file base?", default) : default
  end

  if object.is_a? String
    con = Rserve::Connection.new

    # Escape backslashes and single quotes so the path cannot terminate the
    # single-quoted R string literal it is embedded in.
    r_path = File.absolute_path(object).gsub(/[\\']/) { |c| "\\#{c}" }

    # Array() keeps a lone variable name intact should to_ruby hand back a
    # bare String for a one-element result (assumption about Rserve --
    # TODO confirm); it is a no-op when to_ruby already returns an Array.
    vars = Array(con.eval("load('#{r_path}')").to_ruby)
    if vars.size > 1 && ask_on_ambiguous
      puts "Which variable? #{vars}"
      var = vars[gets.to_i]
    else
      var = vars[0]
    end
    raise "no R variable selected from #{object}" if var.nil?

    r_classes = con.eval("class(#{var})").to_ruby

    if r_classes.include? "data.frame"
      df = R2RDF::Reader::Dataframe.new

      if ask_on_ambiguous && !options[:dimensions]
        dims = con.eval("names(#{var})").to_ruby
        chosen = pick.call("Which dimensions? #{dims}", dims)
        options[:dimensions] = chosen if chosen
      end

      if ask_on_ambiguous && !options[:measures]
        meas = con.eval("names(#{var})").to_ruby
        chosen = pick.call("Which measures? #{meas} ", meas)
        options[:measures] = chosen if chosen
      end

      df.generate_n3(con.eval(var),var,options)

    elsif r_classes.include? "cross"
      bc = R2RDF::Reader::BigCross.new

      if ask_on_ambiguous && !options[:measures]
        pheno_names = con.eval("names(#{var}$pheno)").to_ruby
        chosen = pick.call("Which phenotype traits? #{pheno_names}", pheno_names)
        options[:measures] = chosen if chosen
      end

      bc.generate_n3(con, var, output_base.call(var), options)

    elsif r_classes.include? "matrix"
      mat = R2RDF::Reader::RMatrix.new

      if ask_on_ambiguous && !options[:measures]
        rows = ask.call("Row label", "row")
        cols = ask.call("Column label", "column")
        vals = ask.call("Entry label", "value")

        options[:measures] = [cols,rows,vals]
      end

      mat.generate_n3(con, var, output_base.call(var), options)
    else
      raise "no R2RDF::Reader found for #{r_classes}"
    end

  elsif object.is_a? Rserve::REXP
    # Any object carrying a class attribute is handed to the Dataframe reader.
    if object.attr.payload["class"].payload.first
      df = R2RDF::Reader::Dataframe.new

      var = nil
      var = interact("Dataset name?",nil) if ask_on_ambiguous

      if ask_on_ambiguous && !options[:dimensions]
        dims = object.payload.names
        selection = interact("Which dimensions?","row",dims){|s| puts s; nil}
        options[:dimensions] = selection if selection
      end

      if ask_on_ambiguous && !options[:measures]
        meas = object.payload.names
        options[:measures] = interact("Which measures?",meas,meas)
      end

      df.generate_n3(object,var,options)
    else
      raise "support for other Rserve objects coming shortly"
    end

  else
    raise "#{object} is not an R object"
  end
end
|
149
|
+
end
|
150
|
+
end
|
@@ -1,57 +1,72 @@
|
|
1
1
|
module R2RDF
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
2
|
+
module Interactive
|
3
|
+
#to be called by other classes if user input is required
|
4
|
+
|
5
|
+
#take message, options, defaults. can be passed block to handle default as well
|
6
|
+
# Prompts on the console and returns the user's answer.
#
# message - text shown to the user.
# default - value returned when the answer is empty and no block is given.
# options - optional list of allowed choices. The answer is then read as a
#           comma-separated list of either indexes into +options+ or choice
#           names, and an Array of the chosen entries is returned.
#
# When the answer is empty (or input has ended) and a block is given, the
# block is called with the empty answer and its value is returned.
#
# Raises RuntimeError when a selection is not one of +options+.
def interact(message, default, options=nil)
  puts message + " (#{default})\n[#{options}]"
  # gets returns nil at end of input; treat that like an empty answer.
  str = gets.to_s.chomp

  if str.empty?
    return block_given? ? yield(str) : default
  end
  return str unless options

  entries = str.split(',')
  # Parse once, in base 10, so validation and conversion cannot disagree
  # (Integer("010") is 8 while "010".to_i is 10).
  indexes = entries.map do |entry|
    begin
      Integer(entry, 10)
    rescue ArgumentError
      nil
    end
  end

  if indexes.all?
    indexes.zip(entries).map do |index, entry|
      choice = options[index]
      # An index outside the list used to produce a silent nil entry.
      raise "unknown selection #{entry}" if choice.nil?
      choice
    end
  else
    entries.map do |entry|
      name = entry.strip
      raise "unknown selection #{entry}" unless options.include? name
      name
    end
  end
end
|
26
|
+
|
27
|
+
# def interactive(options={})
|
28
|
+
# options = defaults.merge(options)
|
29
|
+
# qb = {}
|
30
|
+
|
31
|
+
# puts "load config from file? [y/N]"
|
32
|
+
# if gets.chomp == "y"
|
33
|
+
# #use yaml or DSL file to configure
|
34
|
+
# else
|
35
|
+
# qb[:dimensions] = dimensions()
|
36
|
+
# qb[:measures] = measures()
|
37
|
+
# end
|
38
|
+
|
39
|
+
# puts "load data from file? [y/N]"
|
40
|
+
# if gets.chomp == "y"
|
41
|
+
# #attempt to load dataset from file, ask user to resolve problems or ambiguity
|
42
|
+
# else
|
43
|
+
# end
|
44
|
+
# qb
|
45
|
+
# end
|
46
|
+
|
47
|
+
# def dimensions
|
48
|
+
# puts "Enter a list of dimensions, separated by commas"
|
49
|
+
# arr = gets.chomp.split(",")
|
50
|
+
# dims = {}
|
51
|
+
|
52
|
+
# arr.map{|dim|
|
53
|
+
# puts "What is the range of #{dim.chomp.strip}? [:coded]"
|
54
|
+
# type = gets.chomp
|
55
|
+
# type = :coded if type == ":coded" || type == ""
|
56
|
+
# dims[dim.chomp.strip] = {type: type}
|
57
|
+
# }
|
58
|
+
|
59
|
+
# dims
|
60
|
+
# end
|
61
|
+
|
62
|
+
# def measures
|
63
|
+
# puts "Enter a list of measures, separated by commas"
|
64
|
+
# arr = gets.chomp.split(",")
|
65
|
+
# meas = []
|
66
|
+
|
67
|
+
# arr.map{|m| meas << m.chomp.strip}
|
68
|
+
|
69
|
+
# meas
|
70
|
+
# end
|
71
|
+
end
|
57
72
|
end
|
@@ -6,6 +6,8 @@ end
|
|
6
6
|
|
7
7
|
module R2RDF
|
8
8
|
module Metadata
|
9
|
+
include R2RDF::Parser
|
10
|
+
|
9
11
|
def defaults
|
10
12
|
{
|
11
13
|
encode_nulls: false,
|
@@ -18,7 +20,20 @@ module R2RDF
|
|
18
20
|
#make it just "var", and try to make that clear to calling classes
|
19
21
|
|
20
22
|
fields[:var] = sanitize([fields[:var]]).first
|
23
|
+
|
24
|
+
unless fields[:creator]
|
25
|
+
if ENV['USER']
|
26
|
+
fields[:creator] = ENV['USER']
|
27
|
+
elsif ENV['USERNAME']
|
28
|
+
fields[:creator] = ENV['USERNAME']
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
fields[:date] = Time.now.strftime("%Y-%m-%d") unless fields[:date]
|
33
|
+
|
21
34
|
options = defaults().merge(options)
|
35
|
+
|
36
|
+
#TODO some of these should probably be resources, eg dct:creator, or put under DC namespace
|
22
37
|
str = <<-EOF.unindent
|
23
38
|
ns:dataset-#{fields[:var]} rdfs:label "#{fields[:title]}";
|
24
39
|
dct:title "#{fields[:title]}";
|
@@ -52,19 +67,77 @@ module R2RDF
|
|
52
67
|
# Builds the PROV description of how a dataset was generated.
#
# fields  - Hash with
#           :var          - dataset name (sanitized before use),
#           :creator      - optional creator, should be a URI,
#           :organization - optional organization, should be a URI (only
#                           used when :creator is given),
#           :software     - optional Hash, or Array of Hashes, each with
#                           :name and optionally :process. :process is
#                           either the step list itself or the path of a
#                           file containing it.
# options - currently unused.
#
# Returns the Turtle statements as a String. The +fields+ Hash and the
# software Hashes inside it are left unmodified.
def provenance(fields, options={})
  #TODO: should either add a prefixes method or replace some with full URIs
  var = sanitize([fields[:var]]).first
  creator = fields[:creator] #should be URI
  org = fields[:organization] #should be URI
  source_software = fields[:software] # software name, object type, optionally steps list for, eg, R
  str = "ns:dataset-#{var} a prov:Entity.\n\n"
  # Timestamp-derived suffix used in the association's identifier.
  assoc_id = Time.now.nsec.to_s(32)
  endstr = <<-EOF.unindent
    </ns/R2RDF> a prov:Agent .
    ns:dataset-#{var} prov:wasGeneratedBy ns:activity-0 .

    ns:activity-0 a prov:Activity ;
      prov:qualifiedAssociation ns:assoc-0_#{assoc_id};
      prov:generated ns:dataset-#{var} .

    ns:assoc-0_#{assoc_id} a prov:Association ;
      prov:agent </ns/R2RDF>;
      prov:hadPlan ns:plan-0.

    ns:plan-0 a prov:Plan ;
      rdfs:comment "generation of dataset-#{var} by R2RDF gem".

  EOF

  if creator
    str << "<#{creator}> a prov:Agent, prov:Person .\n"
    str << "</ns/R2RDF> prov:actedOnBehalfOf <#{creator}> .\n\n"

    if org
      str << "<#{org}> a prov:Agent, prov:Organization .\n"
      str << "<#{creator}> prov:actedOnBehalfOf <#{org}> .\n"
    end
  end

  if source_software
    # A single software Hash is treated as a one-element list. Array() is
    # not used because it would split a Hash into key/value pairs.
    source_software = [source_software] unless source_software.is_a? Array
    source_software.each_with_index do |soft, i|
      str << "</ns/prov/software/#{soft[:name]}> a prov:Agent .\n"

      endstr << "ns:activity-0 prov:used </ns/dataset/#{var}#var> .\n"
      endstr << "ns:dataset-#{var} prov:wasDerivedFrom </ns/dataset/#{var}#var> .\n\n"

      steps = soft[:process]
      next unless steps

      # A path to an existing file is replaced by that file's content; the
      # caller's hash is deliberately not overwritten.
      steps = IO.read(steps) if File.exist? steps
      endstr << "</ns/dataset/#{var}#var> prov:wasGeneratedBy ns:activity-#{i+1} .\n"
      endstr << process(i+1, steps,"/ns/prov/software/#{soft[:name]}", var)
    end
  end
  str + "\n" + endstr
end
|
121
|
+
|
122
|
+
# Describes one software processing step as a PROV activity together with
# its qualified association and plan.
#
# id                - number used in the activity, association and plan names.
# step_string       - the steps that were run, one per line.
# software_resource - resource of the software agent that ran the steps.
# software_var      - dataset variable name the activity used.
# options           - currently unused.
#
# Returns the Turtle statements as a String.
def process(id, step_string, software_resource, software_var, options={})
  #TODO a better predicate for the steplist than rdfs:comment
  # and make sure it looks good.

  # Each line becomes one quoted string of an RDF collection; backslashes
  # and double quotes are escaped so the literals stay well formed.
  escaped = step_string.split("\n").map { |line| line.gsub(/[\\"]/) { |c| "\\#{c}" } }
  steps = '"' + escaped.join('" "') + '"'

  # The same identifier must be used where the association is referenced
  # and where it is declared.
  assoc = "ns:assoc-#{id}_#{Time.now.nsec.to_s(32)}"

  <<-EOF.unindent
    ns:activity-#{id} a prov:Activity ;
      prov:qualifiedAssociation #{assoc} ;
      prov:used </ns/dataset/#{software_var}#var>.

    #{assoc} a prov:Association ;
      prov:agent <#{software_resource}>;
      prov:hadPlan ns:plan-#{id}.

    ns:plan-#{id} a prov:Plan ;
      rdfs:comment (#{steps}).

  EOF
end
|
69
142
|
|
70
143
|
def r2rdf_metadata
|
@@ -77,7 +150,7 @@ module R2RDF
|
|
77
150
|
|
78
151
|
def org_metadata
|
79
152
|
str <<-EOF.unindent
|
80
|
-
<http://sciruby.com/> a org:Organization;
|
153
|
+
<http://sciruby.com/> a org:Organization, prov:Organization;
|
81
154
|
skos:prefLabel "SciRuby";
|
82
155
|
rdfs:description "A Project to Build and Improve Tools for Scientific Computing in Ruby".
|
83
156
|
EOF
|
data/lib/bio-publisci/parser.rb
CHANGED
@@ -1,5 +1,34 @@
|
|
1
1
|
module R2RDF
|
2
2
|
module Parser
|
3
|
+
|
4
|
+
# Returns a new Array in which every String entry has its whitespace and
# dots replaced by underscores; entries of any other type are passed
# through untouched. The input Array and its Strings are not modified.
def sanitize(array)
  array.map do |entry|
    entry.is_a?(String) ? entry.gsub(/[\s\.]/,'_') : entry
  end
end
|
16
|
+
|
17
|
+
# Replaces the spaces in every String key of +h+ with underscores.
#
# The Hash is modified in place and returned. Every String key is removed
# and re-inserted, so String keys end up after the non-String keys; keys of
# other types are left alone.
def sanitize_hash(h)
  # Iterate over a snapshot of the keys because the Hash changes as we go.
  h.keys.each do |key|
    h[key.gsub(' ','_')] = h.delete(key) if key.is_a? String
  end

  h
end
|
31
|
+
|
3
32
|
def create_graph(string)
|
4
33
|
f = Tempfile.new('graph')
|
5
34
|
f.write(string)
|
@@ -57,8 +86,54 @@ module R2RDF
|
|
57
86
|
end
|
58
87
|
end
|
59
88
|
|
89
|
+
# Converts a value into the text used to refer to it as a resource.
#
# obj     - the value to convert.
# options - Hash; :encode_nulls decides whether nil becomes '"NA"'.
#
# Strings that start with http:// or https:// are wrapped in angle
# brackets, empty Strings become "NA", and in every String spaces are
# replaced by underscores and question marks are removed. Numbers are
# prefixed with "n". Anything else (including nil when :encode_nulls is
# off) is returned unchanged.
def to_resource(obj, options)
  if obj.is_a? String
    # \A anchors at the start of the whole string; ^ would also match at
    # the start of any later line of a multi-line value.
    obj = "<#{obj}>" if obj =~ %r{\Ahttps?://}

    #TODO decide the right way to handle missing values, since RDF has no null
    #probably throw an error here since a missing resource is a bigger problem
    obj = "NA" if obj.empty?

    #TODO remove special characters (faster) as well (eg '?')
    # NOTE(review): this also strips '?' from wrapped URIs, which drops the
    # query-string marker -- confirm whether that is intended.
    obj.gsub(' ','_').gsub('?','')
  elsif obj == nil && options[:encode_nulls]
    '"NA"'
  elsif obj.is_a? Numeric
    #resources cannot be referred to purely by integer (?)
    "n"+obj.to_s
  else
    obj
  end
end
|
108
|
+
|
109
|
+
# Converts a value into a literal.
#
# obj     - the value to convert.
# options - Hash; :encode_nulls decides whether nil becomes '"NA"'.
#
# A String that reads as a number is returned as an Integer or Float; any
# other String is returned wrapped in double quotes. nil becomes '"NA"'
# when :encode_nulls is set. Everything else is returned unchanged.
def to_literal(obj, options)
  if obj.is_a? String
    # Base 10 is tried first so that zero-padded values such as "010" keep
    # their decimal value instead of being read as octal. Prefixed forms
    # ("0x1A") are still accepted by the second parser.
    parsers = [
      lambda { Integer(obj, 10) },
      lambda { Integer(obj) },
      lambda { Float(obj) }
    ]
    parsers.each do |parse|
      begin
        return parse.call
      rescue ArgumentError
        next
      end
    end

    '"'+obj+'"'
  elsif obj == nil && options[:encode_nulls]
    #TODO decide the right way to handle missing values, since RDF has no null
    '"NA"'
  else
    obj
  end
end
|
127
|
+
|
60
128
|
# Returns the last component of a URI: the text after the final '/' and,
# within that, after the final '#'. One trailing '>' is dropped first.
#
# uri - anything that responds to to_s; the argument is never modified.
#
# Returns a String; it is empty when there is no component to extract
# (for example for "" or "/").
def strip_uri(uri)
  # chomp returns a copy, so no dup and no in-place edit is needed.
  bare = uri.to_s.chomp('>')
  # to_s guards against split(...).last being nil for empty segments.
  bare.split('/').last.to_s.split('#').last.to_s
end
|
133
|
+
|
134
|
+
# Returns the part of +string+ (converted with to_s) that follows its last
# ':' separator, or nil when splitting on ':' leaves no segments.
def strip_prefixes(string)
  segments = string.to_s.split(':')
  segments[-1]
end
|
137
|
+
|
63
138
|
end
|
64
139
|
end
|