bio-publisci 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (95) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/.travis.yml +13 -0
  4. data/Gemfile +24 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.md +47 -0
  7. data/README.rdoc +48 -0
  8. data/Rakefile +70 -0
  9. data/bin/bio-publisci +83 -0
  10. data/features/create_generator.feature +25 -0
  11. data/features/integration.feature +12 -0
  12. data/features/integration_steps.rb +10 -0
  13. data/features/orm.feature +60 -0
  14. data/features/orm_steps.rb +74 -0
  15. data/features/reader.feature +25 -0
  16. data/features/reader_steps.rb +60 -0
  17. data/features/step_definitions/bio-publisci_steps.rb +0 -0
  18. data/features/store.feature +27 -0
  19. data/features/store_steps.rb +42 -0
  20. data/features/support/env.rb +13 -0
  21. data/features/writer.feature +9 -0
  22. data/features/writer_steps.rb +17 -0
  23. data/lib/bio-publisci/analyzer.rb +57 -0
  24. data/lib/bio-publisci/dataset/ORM/data_cube_orm.rb +219 -0
  25. data/lib/bio-publisci/dataset/ORM/observation.rb +20 -0
  26. data/lib/bio-publisci/dataset/data_cube.rb +308 -0
  27. data/lib/bio-publisci/dataset/interactive.rb +57 -0
  28. data/lib/bio-publisci/loader.rb +36 -0
  29. data/lib/bio-publisci/metadata/metadata.rb +105 -0
  30. data/lib/bio-publisci/parser.rb +64 -0
  31. data/lib/bio-publisci/query/query_helper.rb +114 -0
  32. data/lib/bio-publisci/r_client.rb +54 -0
  33. data/lib/bio-publisci/readers/arff.rb +87 -0
  34. data/lib/bio-publisci/readers/big_cross.rb +119 -0
  35. data/lib/bio-publisci/readers/cross.rb +72 -0
  36. data/lib/bio-publisci/readers/csv.rb +54 -0
  37. data/lib/bio-publisci/readers/dataframe.rb +66 -0
  38. data/lib/bio-publisci/readers/r_matrix.rb +152 -0
  39. data/lib/bio-publisci/store.rb +56 -0
  40. data/lib/bio-publisci/writers/arff.rb +66 -0
  41. data/lib/bio-publisci/writers/dataframe.rb +81 -0
  42. data/lib/bio-publisci.rb +36 -0
  43. data/lib/r2rdf.rb +226 -0
  44. data/lib/template_bak/publisci.rb +3 -0
  45. data/lib/template_bak.rb +12 -0
  46. data/lib/vocabs/cc.rb +18 -0
  47. data/lib/vocabs/cert.rb +13 -0
  48. data/lib/vocabs/dc.rb +63 -0
  49. data/lib/vocabs/dc11.rb +23 -0
  50. data/lib/vocabs/doap.rb +45 -0
  51. data/lib/vocabs/exif.rb +168 -0
  52. data/lib/vocabs/foaf.rb +69 -0
  53. data/lib/vocabs/geo.rb +13 -0
  54. data/lib/vocabs/http.rb +26 -0
  55. data/lib/vocabs/ma.rb +78 -0
  56. data/lib/vocabs/owl.rb +59 -0
  57. data/lib/vocabs/rdfs.rb +17 -0
  58. data/lib/vocabs/rsa.rb +12 -0
  59. data/lib/vocabs/rss.rb +14 -0
  60. data/lib/vocabs/sioc.rb +93 -0
  61. data/lib/vocabs/skos.rb +36 -0
  62. data/lib/vocabs/wot.rb +21 -0
  63. data/lib/vocabs/xhtml.rb +9 -0
  64. data/lib/vocabs/xsd.rb +58 -0
  65. data/resources/queries/codes.rq +13 -0
  66. data/resources/queries/dataset.rq +7 -0
  67. data/resources/queries/dimension_ranges.rq +8 -0
  68. data/resources/queries/dimensions.rq +7 -0
  69. data/resources/queries/measures.rq +7 -0
  70. data/resources/queries/observations.rq +12 -0
  71. data/resources/queries/test.rq +3 -0
  72. data/resources/weather.numeric.arff +23 -0
  73. data/spec/analyzer_spec.rb +36 -0
  74. data/spec/bio-publisci_spec.rb +7 -0
  75. data/spec/csv/bacon.csv +4 -0
  76. data/spec/csv/moar_bacon.csv +11 -0
  77. data/spec/data_cube_spec.rb +166 -0
  78. data/spec/generators/csv_spec.rb +44 -0
  79. data/spec/generators/dataframe_spec.rb +44 -0
  80. data/spec/generators/r_matrix_spec.rb +35 -0
  81. data/spec/queries/integrity/1.rq +21 -0
  82. data/spec/queries/integrity/11.rq +29 -0
  83. data/spec/queries/integrity/12.rq +37 -0
  84. data/spec/queries/integrity/14.rq +25 -0
  85. data/spec/queries/integrity/19_1.rq +21 -0
  86. data/spec/queries/integrity/19_2.rq +15 -0
  87. data/spec/queries/integrity/2.rq +22 -0
  88. data/spec/queries/integrity/3.rq +19 -0
  89. data/spec/queries/integrity/4.rq +13 -0
  90. data/spec/queries/integrity/5.rq +14 -0
  91. data/spec/r_builder_spec.rb +33 -0
  92. data/spec/spec_helper.rb +17 -0
  93. data/spec/turtle/bacon +149 -0
  94. data/spec/turtle/reference +2066 -0
  95. metadata +259 -0
module R2RDF
  module Reader
    # Reads a CSV file and serializes it as an RDF Data Cube dataset.
    #
    # The first row of the CSV is treated as the header; by default the
    # first column is the (single) dimension and every other column is a
    # measure.
    class CSV
      include R2RDF::Dataset::DataCube

      # Parse +file+ and emit a Data Cube serialization named +dataset_name+.
      #
      # Recognized +options+ keys: :dimensions, :codes, :measures,
      # :label_column (integer index of the column holding row labels).
      def generate_n3(file, dataset_name, options = {})
        @data    = ::CSV.read(file)
        @options = options
        generate(measures, dimensions, codes, observation_data,
                 observation_labels, dataset_name, options)
      end

      # Dimension components; defaults to the first column header.
      def dimensions
        @options[:dimensions] || [@data[0][0]]
      end

      # Coded components; defaults to the dimension list.
      def codes
        @options[:codes] || dimensions
      end

      # Measure components; defaults to every header that is not a dimension.
      def measures
        @options[:measures] || (@data[0] - dimensions)
      end

      # One label per data row: the value found in :label_column when given,
      # otherwise the 1-based row index.
      def observation_labels
        rows = @data.drop(1)
        if @options[:label_column]
          rows.map { |row| row[@options[:label_column]] }
        else
          (1..rows.size).to_a
        end
      end

      # Column-oriented hash {header => [values...]} built from all data rows.
      def observation_data
        headers = @data[0]
        obs = headers.each_with_object({}) { |label, h| h[label] = [] }
        @data.drop(1).each do |row|
          row.each_with_index { |entry, i| obs[headers[i]] << entry }
        end
        obs
      end
    end
  end
end
module R2RDF
  module Reader
    # Serializes an Rserve data.frame REXP as an RDF Data Cube dataset.
    class Dataframe
      include R2RDF::Dataset::DataCube

      # Generate a Data Cube serialization for +rexp+ (an Rserve REXP holding
      # a data.frame), published under the variable name +var+.
      #
      # Recognized +options+ keys: :dimensions, :codes, :measures, :row_label.
      def generate_n3(rexp, var, options = {})
        @rexp    = rexp
        @options = options
        generate(measures, dimensions, codes, observation_data,
                 observation_labels, var, options)
      end

      # Dimension components: explicit :dimensions, else the :row_label
      # column, else the synthetic "refRow" column.
      def dimensions
        @options[:dimensions] || [@options[:row_label] || "refRow"]
      end

      # Coded components: explicit :codes, else the same fallback chain as
      # #dimensions.
      def codes
        @options[:codes] || [@options[:row_label] || "refRow"]
      end

      # Measure components: the frame's column names (or explicit :measures),
      # minus any declared dimensions.
      def measures
        declared = @options[:measures] || @rexp.payload.names
        dims = @options[:dimensions]
        dims ? declared - dims : declared
      end

      # Row labels from the frame's "row.names" attribute; falls back to a
      # 1-based index when R reports no usable names.
      def observation_labels
        labels = @rexp.attr.payload["row.names"].to_ruby
        labels = (1..@rexp.payload.first.to_ruby.size).to_a unless labels.first
        labels
      end

      # Column-oriented hash {column => [values...]}, plus the row-label
      # column under :row_label (or "refRow").
      def observation_data
        data = @rexp.payload.names.each_with_object({}) do |name, h|
          h[name] = @rexp.payload[name].to_ruby
        end
        data[@options[:row_label] || "refRow"] = observation_labels
        data
      end
    end
  end
end
module R2RDF
  module Reader
    # Writes an R matrix (or data.frame) of LOD scores as Data Cube RDF,
    # chunked into multiple turtle files plus one structure file.
    class RMatrix
      include R2RDF::Dataset::DataCube

      #NOTE; this is pretty much hard coded for Karl's application right now, and doesn't
      # do any dimension or code generation. Since its a set of LOD scores indexed by dimension
      # and marker the usual datacube generator wont work (I think). In the future adding an option
      # to specify this kind of a dataset would probably be useful

      # Generate turtle output for the R variable +var+ via the Rserve
      # +client+, writing <outfile_base>_structure.ttl plus one
      # <outfile_base>_<chunk>.ttl per :probes_per_file probes (default 100).
      #
      # Recognized +options+ keys: :probes_per_file, :type (:dataframe),
      # :measures, :quiet.
      def generate_n3(client, var, outfile_base, options={})
        meas = measures(client,var,options)
        dim = dimensions(client,var,options)
        codes = codes(client,var,options)

        outvar = sanitize([var]).first

        probes_per_file = options[:probes_per_file] || 100
        col_select = options[:type] == :dataframe ? "names" : "colnames"

        #write structure
        open(outfile_base+'_structure.ttl','w'){|f| f.write structure(client,var,outvar,options)}

        probes = client.eval("#{col_select}(#{var})").to_ruby
        if probes == nil
          # unnamed columns: give them numeric names so they can be addressed
          client.eval("colnames(#{var})=1:ncol(#{var})")
          probes = client.eval("#{col_select}(#{var})").to_ruby
        end
        markers = rows(client,var,options)

        probes.each_with_index{|probe,i|
          chunk = i / probes_per_file
          #write prefixes and erase old file on the first probe of each chunk
          open(outfile_base+"_#{chunk}.ttl",'w'){|f| f.write prefixes(var,options)} if i % probes_per_file == 0

          # R indexing is 1-based, so the probe number passed to R is i+1.
          # BUG FIX: the original mutated `i` itself (i+=1), which also shifted
          # the chunk index — the last probe of every chunk was appended to the
          # NEXT chunk's file, which was then truncated with 'w' on the
          # following iteration, silently dropping those observations.
          r_index = i + 1
          obs_data = observation_data(client,var,r_index,markers,options)
          labels = labels_for(client,var,probe)

          open(outfile_base+"_#{chunk}.ttl",'a'){|f| observations(meas,dim,codes,obs_data,labels,outvar,options).map{|obs| f.write obs}}
          puts "#{r_index}/#{probes.size}" unless options[:quiet]
        }
      end

      # Emit the dataset's structural triples (DSD, dataset, component
      # specifications, measure properties) as one turtle string.
      def structure(client,var,outvar,options={})
        meas = measures(client,var,options)
        dim = dimensions(client,var,options)
        codes = codes(client,var,options)

        str = prefixes(var, options)
        str << data_structure_definition(meas,outvar,options)
        str << dataset(outvar,options)
        component_specifications(meas, dim, var, options).map{ |c| str << c }
        measure_properties(meas,var,options).map{|m| str << m}

        str
      end

      #for now just make everything a measure
      def measures(client, var, options={})
        options[:measures] || ["probe","marker","value"]
      end

      # No dimension properties are generated for this dataset type.
      def dimensions(client, var, options={})
        []
      end

      # No code lists are generated for this dataset type.
      def codes(client, var, options={})
        []
      end

      # Observation labels "<probe_id>_<row>" for every row of +var+.
      def labels_for(connection,var,probe_id,options={})
        row_names = connection.eval("row.names(#{var})")
        # rserve's .to_ruby doesn't fully handle NULL, so compare REXPs directly
        if row_names == connection.eval('NULL')
          row_names = (1..connection.eval("nrow(#{var})").payload.first).to_a
        else
          row_names = row_names.payload
        end

        (1..row_names.size).map { |n| "#{probe_id}_#{n}" }
      end

      # Row names of +var+, or a 1-based index when R reports none.
      def rows(connection,var,options={})
        row_names = connection.eval("row.names(#{var})")
        #hacky solution because rserve client's .to_ruby method doesn't fully work
        if row_names == connection.eval('NULL')
          row_names = (1..connection.eval("nrow(#{var})").payload.first).to_a
        else
          row_names = row_names.payload
        end
        row_names
      end

      # Column-oriented observation data for one probe (1-based
      # +probe_number+): parallel arrays of probe id, marker name, and value.
      # Labels default to probe/marker/value and may be overridden via
      # options[:measures].
      def observation_data(client, var, probe_number, row_names, options={})
        col_label, row_label, val_label = "probe", "marker", "value"
        if options[:measures]
          col_label = options[:measures][0] || "probe"
          row_label = options[:measures][1] || "marker"
          val_label = options[:measures][2] || "value"
        end

        data = { col_label => [], row_label => [], val_label => [] }

        col_select = options[:type] == :dataframe ? "names" : "colnames"

        probe_obj = if options[:type] == :dataframe
          client.eval("#{var}[[#{probe_number}]]").to_ruby
        else
          client.eval("#{var}[,#{probe_number}]").to_ruby
        end

        probe_id = client.eval("#{col_select}(#{var})[[#{probe_number}]]").to_ruby
        data[col_label] = (1..probe_obj.size).to_a.fill(probe_id)
        probe_obj.each_with_index{|lod,i|
          data[row_label] << row_names[i]
          data[val_label] << lod
        }

        data.each_value(&:flatten!)
        data
      end
    end
  end
end
module R2RDF
  # Handles connection and messaging to/from the triple store: either an
  # in-memory RDF::Repository (:graph) or a 4store HTTP endpoint (:fourstore).
  class Store
    include R2RDF::Query

    # @param options [Hash] merged over #defaults; see #defaults for keys.
    def initialize(options={})
      @options = defaults.merge(options)
    end

    # Default configuration: a local 4store endpoint, no graph replacement.
    def defaults
      {
        type: :fourstore,
        url: "http://localhost:8080", #TODO port etc should eventually be extracted from URI if given
        replace: false
      }
    end

    # Load a turtle +file+ into the named +graph+.
    #
    # For :graph stores, +graph+ must be an RDF::Repository, which becomes
    # the backing store. For :fourstore, the file is PUT (replace) or POSTed
    # (append) to the endpoint via curl.
    #
    # @raise [ArgumentError] if a :graph store is given a non-Repository.
    def add(file, graph)
      if @options[:type] == :graph
        # BUG FIX: was `throw`, which outside a catch block raises
        # UncaughtThrowError; raising an ArgumentError is what was intended.
        raise ArgumentError, "please provide an RDF::Repository" unless graph.is_a? RDF::Repository
        graph.load(file)
        @store = graph
        @store
      elsif @options[:type] == :fourstore
        # NOTE(review): file/graph/url are interpolated straight into a shell
        # command — callers must not pass untrusted values here.
        if @options[:replace]
          `curl -T #{file} -H 'Content-Type: application/x-turtle' #{@options[:url]}/data/http%3A%2F%2Frqtl.org%2F#{graph}`
        else
          `curl --data-urlencode data@#{file} -d 'graph=http%3A%2F%2Frqtl.org%2F#{graph}' -d 'mime-type=application/x-turtle' #{@options[:url]}/data/`
        end
      end
    end

    # Load every matching file in +dir+ into +graph+, printing progress.
    # +pattern+ may be a Regexp, or :turtle/:ttl as shorthand for /.+\.ttl/;
    # when nil, every file is loaded.
    def add_all(dir, graph, pattern=nil)
      pattern = /.+\.ttl/ if pattern == :turtle || pattern == :ttl

      files = Dir.entries(dir) - %w(. ..)
      files = files.grep(pattern) if pattern.is_a? Regexp
      nfiles = files.size
      n = 0
      files.each{|file| puts file + " #{n+=1}/#{nfiles} files"; puts add(file,graph)}
    end

    # Execute a SPARQL query +string+ against whichever backend is configured.
    def query(string)
      if @options[:type] == :graph
        execute(string, @store, :graph)
      elsif @options[:type] == :fourstore
        execute(string, @options[:url], :fourstore)
      end
    end

    # The configured endpoint URL.
    def url
      @options[:url]
    end
  end
end
module R2RDF
  module Writer
    # Exports an RDF Data Cube dataset as a Weka ARFF file.
    class ARFF
      include R2RDF::Query
      include R2RDF::Parser
      include R2RDF::Analyzer

      # Assemble the ARFF text: a header comment block, @RELATION, one
      # @ATTRIBUTE per component (sorted by name), then the @DATA rows
      # (each observation's values sorted by attribute name so columns
      # line up with the attribute declarations).
      #
      # attributes is {name => arff_type}; data is {key => {attr => value}}.
      def build_arff(relation, attributes, data, source)
        str = <<-EOS
% 1. Title: #{relation.capitalize} Database
%
% 2. Sources:
%     (a) Generated from RDF source #{source}
%
@RELATION #{relation}

        EOS

        Hash[attributes.sort].map{|attribute,type|
          str << "@ATTRIBUTE #{attribute} #{type}\n"
        }

        str << "\n@DATA\n"
        data.map { |d| str << Hash[d[1].sort].values.join(',') + "\n" }

        str
      end

      # Build an ARFF document from a turtle file: loads it into a temporary
      # repository, queries out dimensions, measures, dataset label, code
      # lists and observations, infers an ARFF type for each component via
      # recommend_range, and delegates to #build_arff.
      def from_turtle(turtle_file, verbose=false)
        puts "loading #{turtle_file}" if verbose
        repo = RDF::Repository.load(turtle_file)
        puts "loaded #{repo.size} statements into temporary repo" if verbose

        dims = get_ary(execute_from_file("dimensions.rq",repo,:graph)).flatten
        meas = get_ary(execute_from_file("measures.rq",repo,:graph)).flatten
        relation = execute_from_file("dataset.rq",repo,:graph).to_h.first[:label].to_s
        # {component => [code, ...]} built from (component, code) result pairs
        codes = execute_from_file("codes.rq",repo,:graph).to_h.map{|e| e.values.map(&:to_s)}.inject({}){|h,el|
          (h[el.first]||=[]) << el.last; h
        }

        data = observation_hash(execute_from_file("observations.rq",repo,:graph), true)
        attributes = {}
        (dims | meas).map{|component|
          # map the inferred RDF range onto an ARFF attribute type; coded
          # dimensions become a nominal {a,b,...} spec, coded measures strings
          attributes[component] = case recommend_range(data.map{|o| o[1][component]})
          when "xsd:int"
            "integer"
          when "xsd:double"
            "real"
          when :coded
            if dims.include? component
              "{#{codes[component].join(',')}}"
            else
              "string"
            end
          end
        }

        build_arff(relation, attributes, data, turtle_file)
      end

      # Build an ARFF document directly from a SPARQL endpoint.
      # Not implemented yet; always raises.
      def from_store(endpoint_url,variable_in=nil, variable_out=nil, verbose=false)
        raise "not implemented yet"
      end
    end
  end
end
module R2RDF
  module Writer
    # Helpers for rebuilding an R data.frame from Data Cube RDF via an
    # Rserve connection.
    module Dataframe

      # Build the R expression that assembles +name+ from per-column vectors
      # already assigned in the R session, e.g. "df = data.frame(a=a,b=b)".
      # BUG FIX: the original truncated the last character of the argument
      # list in place and produced invalid R ("df = data.frame)") when
      # +vectors+ was empty.
      def framestring(name, vectors)
        "#{name} = data.frame(#{vectors.keys.map { |k| "#{k}=#{k}" }.join(',')})"
      end

      # Query +repo+ for the column vectors of +variable_name+, returning
      # {column_name => [floats...]}. The synthetic "refRow" column is
      # skipped — row labels are handled by #get_rownames.
      def get_vectors(variable_name, helper, repo)
        column_names = helper.get_ary(helper.execute(helper.property_names(variable_name), repo)).flatten.map{|n| n.gsub(' Component','')}
        vectors = {}
        column_names.each do |n|
          next if n == "refRow"
          vectors[n] = helper.get_ary(helper.execute(helper.property_values(variable_name,n),repo),'to_f').flatten
        end
        vectors
      end

      # Assign +rows+ and each column vector into the R session, assemble
      # the data.frame +name+, set its row names, and return it.
      def create_dataframe(name, connection, rows, vectors)
        connection.assign('rows', rows)
        vectors.each { |k, v| connection.assign(k, v) }
        connection.eval(framestring(name, vectors))
        connection.eval("row.names(#{name}) <- rows")
        connection.eval(name)
      end

      # Persist the current R workspace to the file at +loc+.
      # BUG FIX: the path must reach R as a quoted string literal — the
      # original interpolated it bare (save.image(/some/path)), which is an
      # R syntax error for any real path.
      def save_workspace(connection, loc)
        connection.eval %{save.image("#{loc}")}
      end

      # Row labels for +variable+ from the repository.
      def get_rownames(variable, helper, repo)
        helper.get_ary(helper.execute(helper.row_names(variable), repo)).flatten
      end

    end

    # Drives the Dataframe helpers: turtle file or SPARQL endpoint in,
    # R data.frame out.
    class Builder
      include R2RDF::Writer::Dataframe

      # Load +turtle_file+ into a temporary repository and materialize the
      # dataset +variable_in+ as an R data.frame named +variable_out+ on
      # +connection+, optionally saving the workspace afterwards.
      def from_turtle(turtle_file, connection, variable_in=nil, variable_out=nil, verbose=true, save=true)
        unless variable_in && variable_out
          puts "no variable specified. Simple inference coming soon" if verbose
          return
        end
        puts "loading #{turtle_file}" if verbose
        repo = RDF::Repository.load(turtle_file)
        puts "loaded #{repo.size} statements into temporary repo" if verbose
        query = R2RDF::QueryHelper.new
        rows = get_rownames(variable_in, query, repo)
        puts "frame has #{rows.size} rows" if verbose

        vectors = get_vectors(variable_in, query, repo)
        puts "got vectors of size #{vectors.first.last.size}" if verbose && vectors.first

        create_dataframe(variable_out, connection, rows, vectors)
        save_workspace(connection, connection.eval('getwd()').to_ruby) if save
      end

      # Endpoint-backed variant of #from_turtle.
      # TODO(review): incomplete — only fetches row names and returns them;
      # never builds the data.frame.
      def from_store(endpoint_url,connection,variable_in=nil, variable_out=nil, verbose=true, save=true)
        unless variable_in && variable_out
          puts "no variable specified. Simple inference coming soon" if verbose
          return
        end
        puts "connecting to endpoint at #{endpoint_url}" if verbose
        sparql = SPARQL::Client.new(endpoint_url)
        query = R2RDF::QueryHelper.new

        rows = query.get_ary(sparql.query(query.row_names(variable_in))).flatten
      end

    end
  end
end
# Temporary bootstrap, kept until r2rdf.rb is restructured as a standard gem.
# Uses `load` rather than `require` so files can be re-read during development.
require 'tempfile'
require 'rdf'
require 'csv'
require 'rserve'
require 'sparql'
require 'sparql/client'
require 'rdf/turtle'

# (Re)load every file inside +folder+, relative to this file's directory.
def load_folder(folder)
  base = File.join(File.dirname(__FILE__), folder)
  Dir.foreach(base) do |entry|
    next if entry == '.' || entry == '..'
    load File.join(base, entry)
  end
end

# Core files are loaded individually, in dependency order, before the
# folder sweeps below.
%w[
  bio-publisci/dataset/interactive.rb
  bio-publisci/query/query_helper.rb
  bio-publisci/parser.rb
  bio-publisci/r_client.rb
  bio-publisci/analyzer.rb
  bio-publisci/store.rb
  bio-publisci/dataset/data_cube.rb
].each { |path| load File.join(File.dirname(__FILE__), path) }

%w[
  bio-publisci/metadata
  bio-publisci/readers
  bio-publisci/writers
  bio-publisci/dataset/ORM
].each { |folder| load_folder(folder) }