bio-publisci 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -1
- data/Rakefile +1 -1
- data/features/orm_steps.rb +4 -4
- data/features/reader.feature +3 -3
- data/features/reader_steps.rb +1 -0
- data/features/writer.feature +7 -2
- data/features/writer_steps.rb +8 -1
- data/lib/bio-publisci.rb +3 -1
- data/lib/bio-publisci/datacube_model.rb +46 -20
- data/lib/bio-publisci/dataset/ORM/data_cube_orm.rb +196 -194
- data/lib/bio-publisci/dataset/ORM/observation.rb +15 -13
- data/lib/bio-publisci/dataset/data_cube.rb +3 -3
- data/lib/bio-publisci/dataset/dataset_for.rb +25 -4
- data/lib/bio-publisci/dsl/dataset_dsl.rb +4 -2
- data/lib/bio-publisci/dsl/dsl.rb +3 -0
- data/lib/bio-publisci/metadata/generator.rb +1 -1
- data/lib/bio-publisci/metadata/metadata_model.rb +27 -0
- data/lib/bio-publisci/metadata/prov/activity.rb +1 -0
- data/lib/bio-publisci/metadata/prov/model/prov_models.rb +33 -2
- data/lib/bio-publisci/query/query_helper.rb +5 -1
- data/lib/bio-publisci/readers/arff.rb +2 -40
- data/lib/bio-publisci/readers/dataframe.rb +1 -1
- data/lib/bio-publisci/writers/arff.rb +42 -16
- data/lib/bio-publisci/writers/base.rb +77 -0
- data/lib/bio-publisci/writers/csv.rb +31 -0
- data/lib/bio-publisci/writers/dataframe.rb +2 -2
- data/resources/queries/codes.rq +10 -5
- data/resources/queries/dimensions.rq +9 -4
- data/resources/queries/measures.rq +7 -2
- data/resources/queries/observations.rq +5 -4
- data/resources/weather.numeric.arff +26 -21
- data/spec/ORM/data_cube_orm_spec.rb +23 -3
- data/spec/ORM/prov_model_spec.rb +53 -0
- data/spec/dataset_for_spec.rb +21 -0
- data/spec/dsl_spec.rb +5 -2
- data/spec/metadata/metadata_dsl_spec.rb +1 -1
- data/spec/r_builder_spec.rb +2 -2
- data/spec/turtle/bacon +1 -1
- data/spec/turtle/reference +1 -1
- data/spec/turtle/weather +275 -0
- data/spec/writer_spec.rb +61 -0
- metadata +66 -28
- checksums.yaml +0 -7
@@ -1,18 +1,20 @@
|
|
1
1
|
module PubliSci
  module DataSet
    module ORM
      # Lightweight wrapper around a hash of observation values.
      #
      # Entries of the wrapped hash can be read as if they were methods:
      # `obs.temperature` returns `obs.data[:temperature]` (or
      # `obs.data["temperature"]` when only the String key is present).
      class Observation
        attr_accessor :data

        # @param data [Hash] values backing this observation (defaults to an empty hash)
        def initialize(data={})
          @data = data
        end

        # Reads the entry of the data hash whose key matches the called
        # method name. Calls that carry arguments, or that name no existing
        # key, are passed on to the default handler (NoMethodError).
        def method_missing(name, *args, &block)
          key = data_key_for(name)
          return super if key.nil? || !args.empty?

          @data[key]
        end

        # Keeps `respond_to?` in agreement with `method_missing`.
        def respond_to_missing?(method, include_private = false)
          !data_key_for(method).nil? || super
        end

        private

        # Returns the key (Symbol form first, then String form) under which
        # `name` is stored in the data hash, or nil when there is none or
        # when `data` is not hash-like.
        def data_key_for(name)
          return nil unless @data.respond_to?(:key?)

          [name, name.to_s].find { |candidate| @data.key?(candidate) }
        end
      end
    end
  end
end
|
@@ -121,7 +121,7 @@ module PubliSci
|
|
121
121
|
base = options[:base_url]
|
122
122
|
<<-EOF.unindent
|
123
123
|
@base <#{base}/ns/dc/> .
|
124
|
-
@prefix ns: <#{base}/ns/dataset/#{var}
|
124
|
+
@prefix ns: <#{base}/ns/dataset/#{var}/> .
|
125
125
|
@prefix qb: <http://purl.org/linked-data/cube#> .
|
126
126
|
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
|
127
127
|
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
|
@@ -342,7 +342,6 @@ module PubliSci
|
|
342
342
|
lists << str
|
343
343
|
}
|
344
344
|
|
345
|
-
|
346
345
|
lists
|
347
346
|
end
|
348
347
|
|
@@ -357,12 +356,13 @@ module PubliSci
|
|
357
356
|
else
|
358
357
|
refcode = code[0]
|
359
358
|
end
|
359
|
+
# puts data[refcode].uniq
|
360
360
|
data[refcode].uniq.each_with_index{|value,i|
|
361
361
|
unless value == nil && !options[:encode_nulls]
|
362
362
|
concepts << <<-EOF.unindent
|
363
363
|
#{to_resource(value,options)} a skos:Concept, #{code[2]};
|
364
364
|
skos:topConceptOf #{code[1]} ;
|
365
|
-
skos:prefLabel "#{strip_uri(
|
365
|
+
skos:prefLabel "#{strip_uri(value)}" ;
|
366
366
|
skos:inScheme #{code[1]} .
|
367
367
|
|
368
368
|
EOF
|
@@ -17,7 +17,7 @@ module PubliSci
|
|
17
17
|
elsif File.basename(object)[0] == '.' && File.basename(object).count('.') == 1
|
18
18
|
extension = File.basename(object)
|
19
19
|
else
|
20
|
-
raise "Can't load file #{object}; type inference not yet implemented"
|
20
|
+
raise "Can't load file #{object}; file type inference not yet implemented"
|
21
21
|
end
|
22
22
|
|
23
23
|
case extension
|
@@ -25,11 +25,17 @@ module PubliSci
|
|
25
25
|
r_object(object, options, ask_on_ambiguous)
|
26
26
|
when /.csv/i
|
27
27
|
PubliSci::Reader::CSV.new.automatic(object,nil,options,ask_on_ambiguous)
|
28
|
+
else
|
29
|
+
false
|
28
30
|
end
|
29
|
-
elsif object =~ %r{
|
30
|
-
self.for(download(object).path, options, ask_on_ambiguous)
|
31
|
+
elsif object =~ %r{htt(p|ps)://.+}
|
32
|
+
self.for(download(object).path, options, ask_on_ambiguous) || RDF::Statement.new(RDF::URI(object), RDF::URI('http://semanticscience.org/resource/hasValue'), IO.read(download(object).path)).to_s
|
33
|
+
# raise res
|
34
|
+
# self.for_remote(object)
|
31
35
|
else
|
32
|
-
raise "Unable to find reader for
|
36
|
+
raise "Unable to find reader for String '#{object}'"
|
37
|
+
# TODO: better handling of missing readers; need this way for raw strings for now
|
38
|
+
# false
|
33
39
|
end
|
34
40
|
elsif object.is_a? Rserve::REXP
|
35
41
|
r_object(object, options, ask_on_ambiguous)
|
@@ -38,6 +44,12 @@ module PubliSci
|
|
38
44
|
end
|
39
45
|
end
|
40
46
|
|
47
|
+
# def for_remote
|
48
|
+
# addr = object
|
49
|
+
# tmp = download(object)
|
50
|
+
# self.for(tmp.path) || "#{addr} <http://semanticscience.org/resource/"
|
51
|
+
# end
|
52
|
+
|
41
53
|
def self.download(uri)
|
42
54
|
out = Tempfile.new(uri.split('/').last)
|
43
55
|
out.write open(uri).read
|
@@ -45,6 +57,15 @@ module PubliSci
|
|
45
57
|
out
|
46
58
|
end
|
47
59
|
|
60
|
+
# private
|
61
|
+
# def self.reader_exists?(object)
|
62
|
+
# if object.is_a? String
|
63
|
+
# if File.exist? object
|
64
|
+
|
65
|
+
# elsif
|
66
|
+
# end
|
67
|
+
# end
|
68
|
+
|
48
69
|
def self.r_object(object, options={}, ask_on_ambiguous=true)
|
49
70
|
if object.is_a? String
|
50
71
|
con = Rserve::Connection.new
|
@@ -15,7 +15,7 @@ module PubliSci
|
|
15
15
|
# end
|
16
16
|
|
17
17
|
def object(file=nil)
|
18
|
-
|
18
|
+
add_or_get('object',file)
|
19
19
|
end
|
20
20
|
alias_method :source, :object
|
21
21
|
|
@@ -61,7 +61,9 @@ module PubliSci
|
|
61
61
|
if options
|
62
62
|
opts = opts.merge(options)
|
63
63
|
end
|
64
|
-
|
64
|
+
object().map{|obj|
|
65
|
+
Dataset.for(obj,opts,interact)
|
66
|
+
}.join("\n")
|
65
67
|
end
|
66
68
|
|
67
69
|
private
|
data/lib/bio-publisci/dsl/dsl.rb
CHANGED
@@ -0,0 +1,27 @@
|
|
1
|
+
module PubliSci
|
2
|
+
class Metadata
|
3
|
+
module Model
|
4
|
+
PROV ||= RDF::Vocabulary.new(RDF::URI.new('http://www.w3.org/ns/prov#'))
|
5
|
+
QB ||= RDF::Vocabulary.new(RDF::URI.new('http://purl.org/linked-data/cube#'))
|
6
|
+
DCT ||= RDF::Vocabulary.new(RDF::URI.new('http://purl.org/dc/terms/'))
|
7
|
+
# dct:title "#{fields[:title]}";
|
8
|
+
# dct:creator "#{fields[:creator]}";
|
9
|
+
# rdfs:comment "#{fields[:description]}";
|
10
|
+
# dct:description "#{fields[:description]}";
|
11
|
+
# dct:issued "#{fields[:date]}"^^xsd:date.
|
12
|
+
|
13
|
+
class Meta < Spira::Base
|
14
|
+
type PROV.Entity
|
15
|
+
type QB.DataSet
|
16
|
+
property :label, predicate: RDF::RDFS.label
|
17
|
+
property :comment, predicate: RDF::RDFS.comment
|
18
|
+
property :description, predicate: DCT.description
|
19
|
+
property :creator, predicate: DCT.creator
|
20
|
+
property :issued, predicate: DCT.issued
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
# rescue LoadError
|
26
|
+
# puts "spira not installed, ORM unavailable"
|
27
|
+
# end
|
@@ -35,6 +35,7 @@ class Prov
|
|
35
35
|
def associated_with(agent=nil, &block)
|
36
36
|
block_list(:associated,:associations,Association,Associations,agent,&block)
|
37
37
|
end
|
38
|
+
alias_method :wasAssociatedWith, :associated_with
|
38
39
|
|
39
40
|
def used(entity=nil, &block)
|
40
41
|
block_list(:use,:usages,Usage,Usages,entity, &block)
|
@@ -23,6 +23,20 @@ module PubliSci
|
|
23
23
|
end
|
24
24
|
}
|
25
25
|
end
|
26
|
+
|
27
|
+
def all_types
|
28
|
+
me = self.subject
|
29
|
+
type_query = RDF::Query.new do
|
30
|
+
pattern [me, RDF.type, :type]
|
31
|
+
end
|
32
|
+
|
33
|
+
type_query.execute(self.class.repository).map{|t| t[:type]}
|
34
|
+
end
|
35
|
+
|
36
|
+
def has_data?
|
37
|
+
all_types.include?('http://purl.org/linked-data/cube#DataSet')
|
38
|
+
end
|
39
|
+
|
26
40
|
end
|
27
41
|
|
28
42
|
class Agent < Spira::Base
|
@@ -31,15 +45,28 @@ module PubliSci
|
|
31
45
|
type PROV.SoftwareAgent
|
32
46
|
type PROV.Person
|
33
47
|
property :label, predicate: RDF::RDFS.label
|
34
|
-
property :wasGeneratedBy, predicate: PROV.wasGeneratedBy
|
35
48
|
property :foaf_name, predicate: RDF::FOAF.name
|
36
49
|
property :foaf_given, predicate: RDF::FOAF.givenName
|
37
|
-
property :name, predicate: PROV.actedOnBehalfOf
|
38
50
|
property :actedOnBehalfOf, predicate: PROV.actedOnBehalfOf
|
39
51
|
|
52
|
+
|
40
53
|
def name
|
41
54
|
foaf_given || foaf_name
|
42
55
|
end
|
56
|
+
|
57
|
+
# Sets the agent's name by writing it to both FOAF properties that the
# `name` reader consults (foaf_given, then foaf_name).
#
# @param name [Object] the value to store
def name=(name)
  # The explicit receiver is required: a bare `foaf_given = name` only
  # creates a local variable and never calls the property setter.
  self.foaf_given = name
  self.foaf_name = name
end
|
62
|
+
|
63
|
+
# Returns every Activity whose wasAssociatedWith list contains this
# record's subject.
#
# should do this in a SPARQL query instead
def activities
  # Looked up once; it does not change between activities.
  subj = subject
  Activity.enum_for.select { |act|
    act.wasAssociatedWith.any? { |assoc| assoc == subj }
  }
end
|
43
70
|
end
|
44
71
|
|
45
72
|
class Activity < Spira::Base
|
@@ -56,6 +83,10 @@ module PubliSci
|
|
56
83
|
property :label, predicate: RDF::RDFS.label
|
57
84
|
property :agent, predicate: PROV.agent
|
58
85
|
property :hadPlan, predicate: PROV.hadPlan
|
86
|
+
|
87
|
+
# Returns the activities whose qualifiedAssociation list includes this
# record (an Array; empty when none match).
def activity
  # Filtering the enumerator directly avoids building a throwaway array first.
  Activity.each.select { |act| act.qualifiedAssociation.include?(self) }
end
|
59
90
|
end
|
60
91
|
|
61
92
|
class Derivation < Spira::Base
|
@@ -47,7 +47,7 @@ module PubliSci
|
|
47
47
|
sparql.query(string)
|
48
48
|
end
|
49
49
|
|
50
|
-
def execute_from_file(file,store,type=:fourstore)
|
50
|
+
def execute_from_file(file,store,type=:fourstore,substitutions={})
|
51
51
|
if File.exist?(file)
|
52
52
|
string = IO.read(file)
|
53
53
|
elsif File.exist?(File.dirname(__FILE__) + '/../../../resources/queries/' + file)
|
@@ -57,6 +57,10 @@ module PubliSci
|
|
57
57
|
else
|
58
58
|
raise "couldn't find query for #{file}"
|
59
59
|
end
|
60
|
+
|
61
|
+
substitutions.map{|k,v|
|
62
|
+
string = string.gsub(k,v)
|
63
|
+
}
|
60
64
|
execute(string, store, type)
|
61
65
|
end
|
62
66
|
|
@@ -2,14 +2,14 @@ module PubliSci
|
|
2
2
|
module Reader
|
3
3
|
class ARFF
|
4
4
|
include PubliSci::Dataset::DataCube
|
5
|
+
|
5
6
|
def generate_n3(arff, options={})
|
6
7
|
arff = IO.read(arff) if File.exist? arff
|
7
8
|
options[:no_labels] = true # unless options[:no_labels] == nil
|
8
9
|
@options = options
|
9
10
|
comps = components(arff)
|
10
11
|
obs = data(arff, comps.keys)
|
11
|
-
|
12
|
-
|
12
|
+
generate(comps.reject{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, comps.select{|c| comps[c][:codes]}.keys, obs, (1..obs.first[1].size).to_a, relation(arff), options)
|
13
13
|
end
|
14
14
|
|
15
15
|
def relation(arff)
|
@@ -44,44 +44,6 @@ module PubliSci
|
|
44
44
|
}
|
45
45
|
h
|
46
46
|
end
|
47
|
-
|
48
|
-
# def coded_dimensions
|
49
|
-
# if @options[:codes]
|
50
|
-
# @options[:codes]
|
51
|
-
# elsif @options[:row_label]
|
52
|
-
# [@options[:row_label]]
|
53
|
-
# else
|
54
|
-
# ["refRow"]
|
55
|
-
# end
|
56
|
-
# end
|
57
|
-
|
58
|
-
# def measures
|
59
|
-
# if @options[:dimensions]
|
60
|
-
# if @options[:measures]
|
61
|
-
# @options[:measures] - @options[:dimensions]
|
62
|
-
# else
|
63
|
-
# # @rexp.payload.names - @options[:dimensions]
|
64
|
-
# end
|
65
|
-
# else
|
66
|
-
# @options[:measures] # || @rexp.payload.names
|
67
|
-
# end
|
68
|
-
# end
|
69
|
-
|
70
|
-
# def observation_labels
|
71
|
-
# # row_names = @rexp.attr.payload["row.names"].to_ruby
|
72
|
-
# # row_names = (1..@rexp.payload.first.to_ruby.size).to_a unless row_names.first
|
73
|
-
# # row_names
|
74
|
-
# end
|
75
|
-
|
76
|
-
# def observation_data
|
77
|
-
|
78
|
-
# # data = {}
|
79
|
-
# # @rexp.payload.names.map{|name|
|
80
|
-
# # data[name] = @rexp.payload[name].to_ruby
|
81
|
-
# # }
|
82
|
-
# # data[@options[:row_label] || "refRow"] = observation_labels()
|
83
|
-
# # data
|
84
|
-
# end
|
85
47
|
end
|
86
48
|
end
|
87
49
|
end
|
@@ -1,9 +1,9 @@
|
|
1
1
|
module PubliSci
|
2
|
-
module
|
3
|
-
class ARFF
|
4
|
-
include PubliSci::Query
|
5
|
-
include PubliSci::Parser
|
6
|
-
include PubliSci::Analyzer
|
2
|
+
module Writers
|
3
|
+
class ARFF < Base
|
4
|
+
# include PubliSci::Query
|
5
|
+
# include PubliSci::Parser
|
6
|
+
# include PubliSci::Analyzer
|
7
7
|
|
8
8
|
def build_arff(relation, attributes, data, source)
|
9
9
|
str = <<-EOS
|
@@ -31,34 +31,60 @@ EOS
|
|
31
31
|
repo = RDF::Repository.load(turtle_file)
|
32
32
|
puts "loaded #{repo.size} statements into temporary repo" if verbose
|
33
33
|
|
34
|
-
dims =
|
35
|
-
meas =
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
34
|
+
dims = dimensions(repo)
|
35
|
+
meas = measures(repo)
|
36
|
+
data = observations(repo)
|
37
|
+
|
38
|
+
relation = dataSet(repo)
|
39
|
+
codes = codes(repo)
|
40
40
|
|
41
|
-
data = observation_hash(execute_from_file("observations.rq",repo,:graph), true)
|
42
41
|
attributes = {}
|
42
|
+
|
43
43
|
(dims | meas).map{|component|
|
44
|
-
attributes[component
|
44
|
+
attributes[component] = case recommend_range(data.map{|o| o[1][component]})
|
45
45
|
when "xsd:int"
|
46
46
|
"integer"
|
47
47
|
when "xsd:double"
|
48
48
|
"real"
|
49
49
|
when :coded
|
50
50
|
if dims.include? component
|
51
|
-
"{#{codes[component
|
51
|
+
"{#{codes[component].join(', ')}}"
|
52
52
|
else
|
53
53
|
"string"
|
54
54
|
end
|
55
55
|
end
|
56
56
|
}
|
57
|
+
|
57
58
|
build_arff(relation, attributes, data, turtle_file)
|
58
59
|
end
|
59
60
|
|
60
|
-
def from_store(
|
61
|
-
|
61
|
+
def from_store(repo, dataset=nil, title=nil, verbose=false)
|
62
|
+
# data = observation_hash(execute_from_file("observations.rq",repo,:graph,{"%{dataSet}"=>"<#{dataSet}>"}), true)
|
63
|
+
|
64
|
+
dims = dimensions(repo,dataset)
|
65
|
+
meas = measures(repo,dataset)
|
66
|
+
data = observations(repo,dataset)
|
67
|
+
codes = codes(repo,dataset)
|
68
|
+
attributes = {}
|
69
|
+
|
70
|
+
(dims | meas).map{|component|
|
71
|
+
attributes[component] = case recommend_range(data.map{|o| o[1][component]})
|
72
|
+
when "xsd:int"
|
73
|
+
"integer"
|
74
|
+
when "xsd:double"
|
75
|
+
"real"
|
76
|
+
when :coded
|
77
|
+
if dims.include? component
|
78
|
+
"{#{codes[component].join(', ')}}"
|
79
|
+
else
|
80
|
+
"string"
|
81
|
+
end
|
82
|
+
end
|
83
|
+
}
|
84
|
+
|
85
|
+
dataset = dataSet(repo) unless dataset
|
86
|
+
title = dataset unless title
|
87
|
+
build_arff(title,attributes,data,dataset)
|
62
88
|
end
|
63
89
|
end
|
64
90
|
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module PubliSci
  module Writers
    # Common base for writers. Resolves the caller's input to an
    # RDF::Repository and runs the bundled query files (dimensions.rq,
    # measures.rq, observations.rq, dataset.rq, codes.rq) against it.
    #
    # NOTE(review): execute_from_file and observation_hash are not defined
    # here; presumably they come from the included Query/Parser/Analyzer
    # modules - confirm.
    class Base
      include PubliSci::Query
      include PubliSci::Parser
      include PubliSci::Analyzer

      # Turns the writer input into a repository.
      #
      # @param input [String, RDF::Repository] path of an existing file to
      #   load, or an already loaded repository
      # @return [RDF::Repository]
      # @raise [RuntimeError] for a String that is not an existing file, or
      #   for any other kind of input
      def handle_input(input)
        case input
        when String
          raise "UnkownStringInput: #{input}" unless File.exist?(input)

          RDF::Repository.load(input)
        when RDF::Repository
          input
        else
          raise "UnkownInput: #{input}, #{input.class}"
        end
      end

      # Runs dimensions.rq and returns the `select` binding of each row as a String.
      #
      # @param input [String, RDF::Repository] see #handle_input
      # @param data_set [String, nil] when given, substituted for ?dataSet in the query
      # @param select [Symbol] binding to extract from each result row
      # @return [Array<String>]
      def dimensions(input, data_set=nil, select=:label)
        execute_component_query("dimensions.rq", input, data_set).to_h.map { |row| row[select].to_s }
      end

      # Runs measures.rq and returns the `select` binding of each row as a String.
      #
      # @param input [String, RDF::Repository] see #handle_input
      # @param data_set [String, nil] when given, substituted for ?dataSet in the query
      # @param select [Symbol] binding to extract from each result row
      # @return [Array<String>]
      def measures(input, data_set=nil, select=:label)
        execute_component_query("measures.rq", input, data_set).to_h.map { |row| row[select].to_s }
      end

      # Runs observations.rq and passes the result through observation_hash.
      #
      # @param input [String, RDF::Repository] see #handle_input
      # @param data_set [String, nil] when given, substituted for ?dataSet in the query
      # @param shorten_url [Boolean] forwarded unchanged to observation_hash
      def observations(input, data_set = nil, shorten_url = true)
        observation_hash(execute_component_query("observations.rq", input, data_set), shorten_url)
      end

      # Runs dataset.rq and returns the `select` binding of the first row as a String.
      # Fails with NoMethodError when the query yields no rows (`first` is nil).
      #
      # @param input [String, RDF::Repository] see #handle_input
      # @param select [Symbol] binding to extract from the first result row
      # @return [String]
      def dataSet(input, select = :label)
        execute_component_query("dataset.rq", input, nil).to_h.first[select].to_s
      end

      # Runs codes.rq and groups the rows into a Hash: the first value of
      # each row (as a String) maps to the list of that row's last values.
      #
      # @param input [String, RDF::Repository] see #handle_input
      # @param data_set [String, nil] when given, substituted for ?dataSet in the query
      # @param select [Symbol] accepted for signature parity with the other
      #   readers; not used by this method
      # @return [Hash{String => Array<String>}]
      def codes(input, data_set = nil, select = :label)
        rows = execute_component_query("codes.rq", input, data_set).to_h
        rows.each_with_object({}) do |row, grouped|
          values = row.values.map(&:to_s)
          (grouped[values.first] ||= []) << values.last
        end
      end

      private

      # Resolves `input` to a repository and executes `query_file` against
      # it as a :graph query. When `data_set` is given, "?dataSet" in the
      # query text is replaced by "<data_set>".
      def execute_component_query(query_file, input, data_set)
        repo = handle_input(input)

        if data_set
          execute_from_file(query_file, repo, :graph, {"?dataSet"=>"<#{data_set}>"})
        else
          execute_from_file(query_file, repo, :graph)
        end
      end
    end
  end
end
|