bio-publisci 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +1 -1
- data/Gemfile +1 -1
- data/Rakefile +4 -6
- data/features/integration_steps.rb +1 -1
- data/features/metadata.feature +24 -0
- data/features/metadata_steps.rb +21 -0
- data/lib/bio-publisci/dataset/ORM/data_cube_orm.rb +36 -14
- data/lib/bio-publisci/dataset/ORM/observation.rb +1 -1
- data/lib/bio-publisci/dataset/data_cube.rb +192 -131
- data/lib/bio-publisci/dataset/dataset_for.rb +150 -0
- data/lib/bio-publisci/dataset/interactive.rb +70 -55
- data/lib/bio-publisci/metadata/metadata.rb +81 -8
- data/lib/bio-publisci/parser.rb +76 -1
- data/lib/bio-publisci/readers/big_cross.rb +118 -117
- data/lib/bio-publisci/readers/csv.rb +37 -2
- data/lib/bio-publisci/readers/r_matrix.rb +1 -1
- data/lib/bio-publisci/store.rb +31 -31
- data/lib/bio-publisci/writers/arff.rb +48 -49
- data/lib/bio-publisci.rb +3 -0
- data/resources/queries/code_resources.rq +10 -0
- data/resources/queries/dimension_ranges.rq +3 -3
- data/resources/queries/dimensions.rq +3 -3
- data/resources/queries/measures.rq +3 -3
- data/resources/queries/observation_labels.rq +8 -0
- data/resources/queries/properties.rq +8 -0
- data/scripts/islet_mlratio.rb +6 -0
- data/scripts/scan_islet.rb +6 -0
- data/scripts/update_reference.rb +20 -0
- data/spec/ORM/data_cube_orm_spec.rb +12 -0
- data/spec/data_cube_spec.rb +1 -1
- data/spec/generators/dataframe_spec.rb +1 -1
- data/spec/generators/r_matrix_spec.rb +1 -1
- data/spec/r_builder_spec.rb +6 -6
- data/spec/resource/.RData +0 -0
- data/spec/resource/example.Rhistory +3 -0
- data/spec/turtle/bacon +4 -22
- data/spec/turtle/reference +9 -27
- metadata +37 -56
- data/lib/bio-publisci/loader.rb +0 -36
- data/spec/bio-publisci_spec.rb +0 -7
@@ -0,0 +1,150 @@
|
|
1
|
+
module R2RDF
|
2
|
+
class Dataset
|
3
|
+
extend R2RDF::Interactive
|
4
|
+
|
5
|
+
def self.for(object, options={}, ask_on_ambiguous=true)
|
6
|
+
if object.is_a? String
|
7
|
+
if File.exist? object
|
8
|
+
if File.extname(object).size > 0
|
9
|
+
extension = File.extname(object)
|
10
|
+
elsif File.basename(object)[0] == '.' && File.basename(object).count('.') == 1
|
11
|
+
extension = File.basename(object)
|
12
|
+
else
|
13
|
+
raise "Can't load file #{object}; type inference not yet implemented"
|
14
|
+
end
|
15
|
+
|
16
|
+
case extension
|
17
|
+
when ".RData"
|
18
|
+
r_object(object, options, ask_on_ambiguous)
|
19
|
+
when /.csv/i
|
20
|
+
R2RDF::Reader::CSV.new.automatic(object,nil,options,ask_on_ambiguous)
|
21
|
+
end
|
22
|
+
else
|
23
|
+
raise "Unable to find reader for File or String"
|
24
|
+
end
|
25
|
+
elsif object.is_a? Rserve::REXP
|
26
|
+
r_object(object, options, ask_on_ambiguous)
|
27
|
+
else
|
28
|
+
raise "not recognize Ruby objects of this type yet (#{object})"
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def self.r_object(object, options={}, ask_on_ambiguous=true)
|
33
|
+
if object.is_a? String
|
34
|
+
con = Rserve::Connection.new
|
35
|
+
vars = con.eval("load('#{File.absolute_path object}')")
|
36
|
+
if vars.to_ruby.size > 1 && ask_on_ambiguous
|
37
|
+
puts "Which variable? #{vars.to_ruby}"
|
38
|
+
var = vars.to_ruby[gets.to_i]
|
39
|
+
else
|
40
|
+
var = vars.to_ruby[0]
|
41
|
+
end
|
42
|
+
|
43
|
+
r_classes = con.eval("class(#{var})").to_ruby
|
44
|
+
|
45
|
+
if r_classes.include? "data.frame"
|
46
|
+
df = R2RDF::Reader::Dataframe.new
|
47
|
+
unless options[:dimensions] || !ask_on_ambiguous
|
48
|
+
dims = con.eval("names(#{var})").to_ruby
|
49
|
+
puts "Which dimensions? #{dims}"
|
50
|
+
selection = gets.chomp
|
51
|
+
if selection.size > 0
|
52
|
+
options[:dimensions] = selection.split(',').map(&:to_i).map{|i| dims[i]}
|
53
|
+
end
|
54
|
+
end
|
55
|
+
unless options[:measures] || !ask_on_ambiguous
|
56
|
+
meas = con.eval("names(#{var})").to_ruby
|
57
|
+
puts "Which measures? #{meas} "
|
58
|
+
selection = gets.chomp
|
59
|
+
if selection.size > 0
|
60
|
+
options[:measures] = selection.split(',').map(&:to_i).map{|i| meas[i]}
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
df.generate_n3(con.eval(var),var,options)
|
65
|
+
|
66
|
+
elsif r_classes.include? "cross"
|
67
|
+
bc = R2RDF::Reader::BigCross.new
|
68
|
+
|
69
|
+
unless options[:measures] || !ask_on_ambiguous
|
70
|
+
pheno_names = con.eval("names(#{var}$pheno)").to_ruby
|
71
|
+
puts "Which phenotype traits? #{pheno_names}"
|
72
|
+
selection = gets.chomp
|
73
|
+
if selection.size > 0
|
74
|
+
options[:measures] = selection.split(',').map(&:to_i).map{|i| pheno_names[i]}
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
base = var
|
79
|
+
if ask_on_ambiguous
|
80
|
+
puts "Output file base?"
|
81
|
+
base = gets.chomp
|
82
|
+
base = var unless base.size > 0
|
83
|
+
end
|
84
|
+
|
85
|
+
bc.generate_n3(con, var, base, options)
|
86
|
+
|
87
|
+
elsif r_classes.include? "matrix"
|
88
|
+
mat = R2RDF::Reader::RMatrix.new
|
89
|
+
|
90
|
+
unless options[:measures] || !ask_on_ambiguous
|
91
|
+
puts "Row label"
|
92
|
+
rows = gets.chomp
|
93
|
+
rows = "row" unless rows.size > 0
|
94
|
+
|
95
|
+
puts "Column label"
|
96
|
+
cols = gets.chomp
|
97
|
+
cols = "column" unless cols.size > 0
|
98
|
+
|
99
|
+
puts "Entry label"
|
100
|
+
vals = gets.chomp
|
101
|
+
vals = "value" unless vals.size > 0
|
102
|
+
|
103
|
+
options[:measures] = [cols,rows,vals]
|
104
|
+
end
|
105
|
+
|
106
|
+
base = var
|
107
|
+
if ask_on_ambiguous
|
108
|
+
puts "Output file base?"
|
109
|
+
base = gets.chomp
|
110
|
+
base = var unless base.size > 0
|
111
|
+
end
|
112
|
+
|
113
|
+
mat.generate_n3(con, var, base, options)
|
114
|
+
else
|
115
|
+
raise "no R2RDF::Reader found for #{r_classes}"
|
116
|
+
end
|
117
|
+
|
118
|
+
elsif object.is_a? Rserve::REXP
|
119
|
+
if object.attr.payload["class"].payload.first
|
120
|
+
|
121
|
+
df = R2RDF::Reader::Dataframe.new
|
122
|
+
|
123
|
+
var = nil
|
124
|
+
|
125
|
+
if ask_on_ambiguous
|
126
|
+
var = interact("Dataset name?",nil)
|
127
|
+
end
|
128
|
+
|
129
|
+
unless options[:dimensions] || !ask_on_ambiguous
|
130
|
+
dims = object.payload.names
|
131
|
+
selection = interact("Which dimensions?","row",dims){|s| puts s; nil}
|
132
|
+
options[:dimensions] = selection if selection
|
133
|
+
end
|
134
|
+
|
135
|
+
unless options[:measures] || !ask_on_ambiguous
|
136
|
+
meas = object.payload.names
|
137
|
+
options[:measures] = interact("Which measures?",meas,meas)
|
138
|
+
end
|
139
|
+
|
140
|
+
df.generate_n3(object,var,options)
|
141
|
+
else
|
142
|
+
raise "support for other Rserve objects coming shortly"
|
143
|
+
end
|
144
|
+
|
145
|
+
else
|
146
|
+
raise "#{object} is not an R object"
|
147
|
+
end
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
@@ -1,57 +1,72 @@
|
|
1
1
|
module R2RDF
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
2
|
+
module Interactive
|
3
|
+
#to be called by other classes if user input is required
|
4
|
+
|
5
|
+
#take message, options, defaults. can be passed block to handle default as well
|
6
|
+
def interact(message, default, options=nil)
|
7
|
+
puts message + " (#{default})\n[#{options}]"
|
8
|
+
str = gets.chomp
|
9
|
+
if str.size > 0
|
10
|
+
if options
|
11
|
+
if str.split(',').all?{|s| Integer(s) rescue nil}
|
12
|
+
str.split(',').map(&:to_i).map{|i| options[i]}
|
13
|
+
else
|
14
|
+
str.split(',').each{|s| raise "unkown selection #{s}" unless options.include? s.strip}
|
15
|
+
str.split(',').map(&:strip)
|
16
|
+
end
|
17
|
+
else
|
18
|
+
str
|
19
|
+
end
|
20
|
+
elsif block_given?
|
21
|
+
yield str
|
22
|
+
else
|
23
|
+
default
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# def interactive(options={})
|
28
|
+
# options = defaults.merge(options)
|
29
|
+
# qb = {}
|
30
|
+
|
31
|
+
# puts "load config from file? [y/N]"
|
32
|
+
# if gets.chomp == "y"
|
33
|
+
# #use yaml or DSL file to configure
|
34
|
+
# else
|
35
|
+
# qb[:dimensions] = dimensions()
|
36
|
+
# qb[:measures] = measures()
|
37
|
+
# end
|
38
|
+
|
39
|
+
# puts "load data from file? [y/N]"
|
40
|
+
# if gets.chomp == "y"
|
41
|
+
# #attempt to load dataset from file, ask user to resolve problems or ambiguity
|
42
|
+
# else
|
43
|
+
# end
|
44
|
+
# qb
|
45
|
+
# end
|
46
|
+
|
47
|
+
# def dimensions
|
48
|
+
# puts "Enter a list of dimensions, separated by commas"
|
49
|
+
# arr = gets.chomp.split(",")
|
50
|
+
# dims = {}
|
51
|
+
|
52
|
+
# arr.map{|dim|
|
53
|
+
# puts "What is the range of #{dim.chomp.strip}? [:coded]"
|
54
|
+
# type = gets.chomp
|
55
|
+
# type = :coded if type == ":coded" || type == ""
|
56
|
+
# dims[dim.chomp.strip] = {type: type}
|
57
|
+
# }
|
58
|
+
|
59
|
+
# dims
|
60
|
+
# end
|
61
|
+
|
62
|
+
# def measures
|
63
|
+
# puts "Enter a list of measures, separated by commas"
|
64
|
+
# arr = gets.chomp.split(",")
|
65
|
+
# meas = []
|
66
|
+
|
67
|
+
# arr.map{|m| meas << m.chomp.strip}
|
68
|
+
|
69
|
+
# meas
|
70
|
+
# end
|
71
|
+
end
|
57
72
|
end
|
@@ -6,6 +6,8 @@ end
|
|
6
6
|
|
7
7
|
module R2RDF
|
8
8
|
module Metadata
|
9
|
+
include R2RDF::Parser
|
10
|
+
|
9
11
|
def defaults
|
10
12
|
{
|
11
13
|
encode_nulls: false,
|
@@ -18,7 +20,20 @@ module R2RDF
|
|
18
20
|
#make it just "var", and try to make that clear to calling classes
|
19
21
|
|
20
22
|
fields[:var] = sanitize([fields[:var]]).first
|
23
|
+
|
24
|
+
unless fields[:creator]
|
25
|
+
if ENV['USER']
|
26
|
+
fields[:creator] = ENV['USER']
|
27
|
+
elsif ENV['USERNAME']
|
28
|
+
fields[:creator] = ENV['USERNAME']
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
fields[:date] = Time.now.strftime("%Y-%m-%d") unless fields[:date]
|
33
|
+
|
21
34
|
options = defaults().merge(options)
|
35
|
+
|
36
|
+
#TODO some of these should probably be resources, eg dct:creator, or put under DC namespace
|
22
37
|
str = <<-EOF.unindent
|
23
38
|
ns:dataset-#{fields[:var]} rdfs:label "#{fields[:title]}";
|
24
39
|
dct:title "#{fields[:title]}";
|
@@ -52,19 +67,77 @@ module R2RDF
|
|
52
67
|
def provenance(fields, options={})
|
53
68
|
#TODO: should either add a prefixes method or replace some with full URIs
|
54
69
|
var = sanitize([fields[:var]]).first
|
70
|
+
creator = fields[:creator] if fields[:creator] #should be URI
|
71
|
+
org = fields[:organization] if fields[:organization] #should be URI
|
55
72
|
source_software = fields[:software] # software name, object type, optionally steps list for, eg, R
|
73
|
+
str = "ns:dataset-#{var} a prov:Entity.\n\n"
|
74
|
+
assoc_id = Time.now.nsec.to_s(32)
|
75
|
+
endstr = <<-EOF.unindent
|
76
|
+
</ns/R2RDF> a prov:Agent .
|
77
|
+
ns:dataset-#{var} prov:wasGeneratredBy ns:activity-0 .
|
78
|
+
|
79
|
+
ns:activity-0 a prov:Activity ;
|
80
|
+
prov:qualifiedAssociation ns:assoc-0_#{assoc_id};
|
81
|
+
prov:generated ns:dataset-#{var} .
|
82
|
+
|
83
|
+
ns:assoc-0_#{assoc_id} a prov:Assocation ;
|
84
|
+
prov:entity </ns/R2RDF>;
|
85
|
+
prov:hadPlan ns:plan-0.
|
86
|
+
|
87
|
+
ns:plan-0 a prov:Plan ;
|
88
|
+
rdfs:comment "generation of dataset-#{var} by R2RDF gem".
|
89
|
+
|
90
|
+
EOF
|
91
|
+
|
92
|
+
if creator
|
93
|
+
str << "<#{creator}> a prov:Agent, prov:Person .\n"
|
94
|
+
str << "</ns/R2RDF> prov:actedOnBehalfOf <#{creator}> .\n\n"
|
95
|
+
|
96
|
+
if org
|
97
|
+
str << "<#{org}> a prov:Agent, prov:Organization .\n"
|
98
|
+
str << "<#{creator}> prov:actedOnBehalfOf <#{org}> .\n"
|
99
|
+
end
|
100
|
+
end
|
56
101
|
|
57
|
-
str = "qb:dataset-#{var} a prov:Entity.\n"
|
58
|
-
endstr = "qb:dataset-#{var} prov:wasGeneratredBy <#{options[:base_url]}/ns/R2RDF>\n" #replace once gem has an actual name
|
59
102
|
if source_software
|
60
|
-
source_software = [source_software] unless source_software.
|
61
|
-
source_software.map{|soft|
|
62
|
-
str << "
|
103
|
+
source_software = [source_software] unless source_software.is_a? Array
|
104
|
+
source_software.each_with_index.map{|soft,i|
|
105
|
+
str << "</ns/prov/software/#{soft[:name]}> a prov:Agent .\n"
|
63
106
|
|
64
|
-
|
65
|
-
endstr << "
|
107
|
+
endstr << "ns:activity-0 prov:used </ns/dataset/#{var}#var> .\n"
|
108
|
+
endstr << "ns:dataset-#{var} prov:wasDerivedFrom </ns/dataset/#{var}#var> .\n\n"
|
109
|
+
|
110
|
+
if soft[:process]
|
111
|
+
if File.exist? soft[:process]
|
112
|
+
soft[:process] = IO.read(soft[:process])
|
113
|
+
end
|
114
|
+
endstr << "</ns/dataset/#{var}#var> prov:wasGeneratredBy ns:activity-#{i+1} .\n"
|
115
|
+
endstr << process(i+1, soft[:process],"/ns/prov/software/#{soft[:name]}", var)
|
116
|
+
end
|
66
117
|
}
|
67
118
|
end
|
119
|
+
str + "\n" + endstr
|
120
|
+
end
|
121
|
+
|
122
|
+
def process(id, step_string, software_resource, software_var, options={})
|
123
|
+
#TODO a better predicate for the steplist than rdfs:comment
|
124
|
+
# and make sure it looks good.
|
125
|
+
steps = '"' + step_string.split("\n").join('" "') + '"'
|
126
|
+
assoc_id = Time.now.nsec.to_s(32)
|
127
|
+
str = <<-EOF.unindent
|
128
|
+
ns:activity-#{id} a prov:Activity ;
|
129
|
+
prov:qualifiedAssociation ns:assoc-#{assoc_id} ;
|
130
|
+
prov:used </ns/dataset/#{software_var}#var>.
|
131
|
+
|
132
|
+
ns:assoc-#{id}_#{assoc_id} a prov:Assocation ;
|
133
|
+
prov:entity <#{software_resource}>;
|
134
|
+
prov:hadPlan ns:plan-#{id}.
|
135
|
+
|
136
|
+
ns:plan-#{id} a prov:Plan ;
|
137
|
+
rdfs:comment (#{steps});
|
138
|
+
|
139
|
+
EOF
|
140
|
+
|
68
141
|
end
|
69
142
|
|
70
143
|
def r2rdf_metadata
|
@@ -77,7 +150,7 @@ module R2RDF
|
|
77
150
|
|
78
151
|
def org_metadata
|
79
152
|
str <<-EOF.unindent
|
80
|
-
<http://sciruby.com/> a org:Organization;
|
153
|
+
<http://sciruby.com/> a org:Organization, prov:Organization;
|
81
154
|
skos:prefLabel "SciRuby";
|
82
155
|
rdfs:description "A Project to Build and Improve Tools for Scientific Computing in Ruby".
|
83
156
|
EOF
|
data/lib/bio-publisci/parser.rb
CHANGED
@@ -1,5 +1,34 @@
|
|
1
1
|
module R2RDF
|
2
2
|
module Parser
|
3
|
+
|
4
|
+
def sanitize(array)
|
5
|
+
#remove spaces and other special characters
|
6
|
+
processed = []
|
7
|
+
array.map{|entry|
|
8
|
+
if entry.is_a? String
|
9
|
+
processed << entry.gsub(/[\s\.]/,'_')
|
10
|
+
else
|
11
|
+
processed << entry
|
12
|
+
end
|
13
|
+
}
|
14
|
+
processed
|
15
|
+
end
|
16
|
+
|
17
|
+
def sanitize_hash(h)
|
18
|
+
mappings = {}
|
19
|
+
h.keys.map{|k|
|
20
|
+
if(k.is_a? String)
|
21
|
+
mappings[k] = k.gsub(' ','_')
|
22
|
+
end
|
23
|
+
}
|
24
|
+
|
25
|
+
h.keys.map{|k|
|
26
|
+
h[mappings[k]] = h.delete(k) if mappings[k]
|
27
|
+
}
|
28
|
+
|
29
|
+
h
|
30
|
+
end
|
31
|
+
|
3
32
|
def create_graph(string)
|
4
33
|
f = Tempfile.new('graph')
|
5
34
|
f.write(string)
|
@@ -57,8 +86,54 @@ module R2RDF
|
|
57
86
|
end
|
58
87
|
end
|
59
88
|
|
89
|
+
def to_resource(obj, options)
|
90
|
+
if obj.is_a? String
|
91
|
+
obj = "<#{obj}>" if obj =~ /^http:\/\//
|
92
|
+
|
93
|
+
#TODO decide the right way to handle missing values, since RDF has no null
|
94
|
+
#probably throw an error here since a missing resource is a bigger problem
|
95
|
+
obj = "NA" if obj.empty?
|
96
|
+
|
97
|
+
#TODO remove special characters (faster) as well (eg '?')
|
98
|
+
obj.gsub(' ','_').gsub('?','')
|
99
|
+
elsif obj == nil && options[:encode_nulls]
|
100
|
+
'"NA"'
|
101
|
+
elsif obj.is_a? Numeric
|
102
|
+
#resources cannot be referred to purely by integer (?)
|
103
|
+
"n"+obj.to_s
|
104
|
+
else
|
105
|
+
obj
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
def to_literal(obj, options)
|
110
|
+
if obj.is_a? String
|
111
|
+
# Depressing that there's no more elegant way to check if a string is
|
112
|
+
# a number...
|
113
|
+
if val = Integer(obj) rescue nil
|
114
|
+
val
|
115
|
+
elsif val = Float(obj) rescue nil
|
116
|
+
val
|
117
|
+
else
|
118
|
+
'"'+obj+'"'
|
119
|
+
end
|
120
|
+
elsif obj == nil && options[:encode_nulls]
|
121
|
+
#TODO decide the right way to handle missing values, since RDF has no null
|
122
|
+
'"NA"'
|
123
|
+
else
|
124
|
+
obj
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
60
128
|
def strip_uri(uri)
|
61
|
-
|
129
|
+
uri = uri.to_s.dup
|
130
|
+
uri[-1] = '' if uri[-1] == '>'
|
131
|
+
uri.to_s.split('/').last.split('#').last
|
132
|
+
end
|
133
|
+
|
134
|
+
def strip_prefixes(string)
|
135
|
+
string.to_s.split(':').last
|
62
136
|
end
|
137
|
+
|
63
138
|
end
|
64
139
|
end
|