bio-publisci 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/.travis.yml +13 -0
  4. data/Gemfile +24 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.md +47 -0
  7. data/README.rdoc +48 -0
  8. data/Rakefile +70 -0
  9. data/bin/bio-publisci +83 -0
  10. data/features/create_generator.feature +25 -0
  11. data/features/integration.feature +12 -0
  12. data/features/integration_steps.rb +10 -0
  13. data/features/orm.feature +60 -0
  14. data/features/orm_steps.rb +74 -0
  15. data/features/reader.feature +25 -0
  16. data/features/reader_steps.rb +60 -0
  17. data/features/step_definitions/bio-publisci_steps.rb +0 -0
  18. data/features/store.feature +27 -0
  19. data/features/store_steps.rb +42 -0
  20. data/features/support/env.rb +13 -0
  21. data/features/writer.feature +9 -0
  22. data/features/writer_steps.rb +17 -0
  23. data/lib/bio-publisci/analyzer.rb +57 -0
  24. data/lib/bio-publisci/dataset/ORM/data_cube_orm.rb +219 -0
  25. data/lib/bio-publisci/dataset/ORM/observation.rb +20 -0
  26. data/lib/bio-publisci/dataset/data_cube.rb +308 -0
  27. data/lib/bio-publisci/dataset/interactive.rb +57 -0
  28. data/lib/bio-publisci/loader.rb +36 -0
  29. data/lib/bio-publisci/metadata/metadata.rb +105 -0
  30. data/lib/bio-publisci/parser.rb +64 -0
  31. data/lib/bio-publisci/query/query_helper.rb +114 -0
  32. data/lib/bio-publisci/r_client.rb +54 -0
  33. data/lib/bio-publisci/readers/arff.rb +87 -0
  34. data/lib/bio-publisci/readers/big_cross.rb +119 -0
  35. data/lib/bio-publisci/readers/cross.rb +72 -0
  36. data/lib/bio-publisci/readers/csv.rb +54 -0
  37. data/lib/bio-publisci/readers/dataframe.rb +66 -0
  38. data/lib/bio-publisci/readers/r_matrix.rb +152 -0
  39. data/lib/bio-publisci/store.rb +56 -0
  40. data/lib/bio-publisci/writers/arff.rb +66 -0
  41. data/lib/bio-publisci/writers/dataframe.rb +81 -0
  42. data/lib/bio-publisci.rb +36 -0
  43. data/lib/r2rdf.rb +226 -0
  44. data/lib/template_bak/publisci.rb +3 -0
  45. data/lib/template_bak.rb +12 -0
  46. data/lib/vocabs/cc.rb +18 -0
  47. data/lib/vocabs/cert.rb +13 -0
  48. data/lib/vocabs/dc.rb +63 -0
  49. data/lib/vocabs/dc11.rb +23 -0
  50. data/lib/vocabs/doap.rb +45 -0
  51. data/lib/vocabs/exif.rb +168 -0
  52. data/lib/vocabs/foaf.rb +69 -0
  53. data/lib/vocabs/geo.rb +13 -0
  54. data/lib/vocabs/http.rb +26 -0
  55. data/lib/vocabs/ma.rb +78 -0
  56. data/lib/vocabs/owl.rb +59 -0
  57. data/lib/vocabs/rdfs.rb +17 -0
  58. data/lib/vocabs/rsa.rb +12 -0
  59. data/lib/vocabs/rss.rb +14 -0
  60. data/lib/vocabs/sioc.rb +93 -0
  61. data/lib/vocabs/skos.rb +36 -0
  62. data/lib/vocabs/wot.rb +21 -0
  63. data/lib/vocabs/xhtml.rb +9 -0
  64. data/lib/vocabs/xsd.rb +58 -0
  65. data/resources/queries/codes.rq +13 -0
  66. data/resources/queries/dataset.rq +7 -0
  67. data/resources/queries/dimension_ranges.rq +8 -0
  68. data/resources/queries/dimensions.rq +7 -0
  69. data/resources/queries/measures.rq +7 -0
  70. data/resources/queries/observations.rq +12 -0
  71. data/resources/queries/test.rq +3 -0
  72. data/resources/weather.numeric.arff +23 -0
  73. data/spec/analyzer_spec.rb +36 -0
  74. data/spec/bio-publisci_spec.rb +7 -0
  75. data/spec/csv/bacon.csv +4 -0
  76. data/spec/csv/moar_bacon.csv +11 -0
  77. data/spec/data_cube_spec.rb +166 -0
  78. data/spec/generators/csv_spec.rb +44 -0
  79. data/spec/generators/dataframe_spec.rb +44 -0
  80. data/spec/generators/r_matrix_spec.rb +35 -0
  81. data/spec/queries/integrity/1.rq +21 -0
  82. data/spec/queries/integrity/11.rq +29 -0
  83. data/spec/queries/integrity/12.rq +37 -0
  84. data/spec/queries/integrity/14.rq +25 -0
  85. data/spec/queries/integrity/19_1.rq +21 -0
  86. data/spec/queries/integrity/19_2.rq +15 -0
  87. data/spec/queries/integrity/2.rq +22 -0
  88. data/spec/queries/integrity/3.rq +19 -0
  89. data/spec/queries/integrity/4.rq +13 -0
  90. data/spec/queries/integrity/5.rq +14 -0
  91. data/spec/r_builder_spec.rb +33 -0
  92. data/spec/spec_helper.rb +17 -0
  93. data/spec/turtle/bacon +149 -0
  94. data/spec/turtle/reference +2066 -0
  95. metadata +259 -0
@@ -0,0 +1,54 @@
1
module R2RDF
  module Reader
    # Reads a CSV file and emits an RDF Data Cube serialization via the
    # R2RDF::Dataset::DataCube mixin. The first CSV row is treated as the
    # header; by default the first header cell is the sole dimension and
    # every other column is a measure.
    class CSV
      include R2RDF::Dataset::DataCube

      # Generate a Data Cube (n3/turtle) string for +file+.
      #
      # file::         path to a CSV file with a header row
      # dataset_name:: name used for the generated dataset
      # options::      :dimensions, :codes, :measures, :label_column
      def generate_n3(file, dataset_name, options={})
        @data = ::CSV.read(file)
        @options = options
        generate(measures, dimensions, codes, observation_data, observation_labels, dataset_name, options)
      end

      # Dimension columns; defaults to the first header cell.
      def dimensions
        @options[:dimensions] || [@data[0][0]]
      end

      # Coded components; defaults to the dimensions.
      def codes
        @options[:codes] || dimensions
      end

      # Measure columns; defaults to every header cell that is not a dimension.
      def measures
        @options[:measures] || @data[0] - dimensions
      end

      # One label per observation (data row): the value of :label_column if
      # given, otherwise the 1-based row index.
      def observation_labels
        if @options[:label_column]
          @data.drop(1).map { |row| row[@options[:label_column]] }
        else
          (1..@data.size - 1).to_a
        end
      end

      # Column-oriented view of the data: header cell => array of that
      # column's values, in row order.
      def observation_data
        # each_with_object replaces the original's side-effecting map;
        # drop(1) replaces dup + shift.
        obs = @data[0].each_with_object({}) { |label, h| h[label] = [] }
        @data.drop(1).each do |row|
          row.each_with_index { |entry, i| obs[@data[0][i]] << entry }
        end
        obs
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
module R2RDF
  module Reader
    # Reads an R dataframe (as an Rserve REXP object) and emits an RDF Data
    # Cube serialization via the R2RDF::Dataset::DataCube mixin.
    class Dataframe
      include R2RDF::Dataset::DataCube

      # Generate a Data Cube (n3/turtle) string for the dataframe +rexp+.
      #
      # rexp::    Rserve REXP holding the dataframe
      # var::     dataset/variable name used in the output
      # options:: :dimensions, :codes, :measures, :row_label
      def generate_n3(rexp, var, options={})
        @rexp = rexp
        @options = options

        generate(measures, dimensions, codes, observation_data, observation_labels, var, options)
      end

      # Dimension names; defaults to the synthetic row-name column.
      def dimensions
        @options[:dimensions] || [row_dimension]
      end

      # Coded components; same default as dimensions.
      def codes
        @options[:codes] || [row_dimension]
      end

      # Measure names: explicit :measures (minus any declared dimensions),
      # otherwise every dataframe column (minus any declared dimensions).
      def measures
        if @options[:dimensions]
          (@options[:measures] || @rexp.payload.names) - @options[:dimensions]
        else
          @options[:measures] || @rexp.payload.names
        end
      end

      # R row names if present, else 1-based indices sized by the first column.
      def observation_labels
        row_names = @rexp.attr.payload["row.names"].to_ruby
        row_names = (1..@rexp.payload.first.to_ruby.size).to_a unless row_names.first
        row_names
      end

      # Column-oriented hash: column name => values, plus the row-name column.
      def observation_data
        data = @rexp.payload.names.each_with_object({}) do |name, h|
          h[name] = @rexp.payload[name].to_ruby
        end
        data[row_dimension] = observation_labels
        data
      end

      private

      # Name of the synthetic dimension holding row labels.
      def row_dimension
        @options[:row_label] || "refRow"
      end
    end
  end
end
@@ -0,0 +1,152 @@
1
module R2RDF
  module Reader
    # Streams a large matrix (or dataframe) held in a live R session (via an
    # Rserve client) out as Data Cube turtle, split into chunk files of
    # options[:probes_per_file] probes each, plus a separate
    # "<outfile_base>_structure.ttl" holding the cube structure.
    class RMatrix
      include R2RDF::Dataset::DataCube

      #NOTE; this is pretty much hard coded for Karl's application right now, and doesn't
      # do any dimension or code generation. Since its a set of LOD scores indexed by dimension
      # and marker the usual datacube generator wont work (I think). In the future adding an option
      # to specify this kind of a dataset would probably be useful

      # Write the dataset to "<outfile_base>_<chunk>.ttl" files.
      #
      # client::       Rserve connection
      # var::          name of the matrix/dataframe variable in R
      # outfile_base:: path prefix for the emitted .ttl files
      # options::      :probes_per_file (default 100), :type (:dataframe),
      #                :measures, :quiet
      def generate_n3(client, var, outfile_base, options={})
        meas = measures(client,var,options)
        dim = dimensions(client,var,options)
        codes = codes(client,var,options)

        outvar = sanitize([var]).first

        probes_per_file = options[:probes_per_file] || 100
        # dataframes are column-indexed with names(); matrices with colnames()
        col_select = "colnames"
        col_select = "names" if options[:type] == :dataframe

        #write structure
        open(outfile_base+'_structure.ttl','w'){|f| f.write structure(client,var,outvar,options)}

        probes=client.eval("#{col_select}(#{var})").to_ruby
        if probes == nil
          # unnamed matrix: assign numeric column names in R so they can be listed
          client.eval("colnames(#{var})=1:ncol(#{var})")
          probes=client.eval("#{col_select}(#{var})").to_ruby
        end
        markers = rows(client,var,options)

        probes.each_with_index{|probe,i|
          #write prefixes and erase old file on first probe of each chunk
          open(outfile_base+"_#{i/probes_per_file}.ttl",'w'){|f| f.write prefixes(var,options)} if i % probes_per_file == 0
          # R is 1-indexed; use a separate counter instead of mutating i.
          # (Previously `i+=1` made the append below target chunk
          # (i+1)/probes_per_file, so the truncating prefix-write of the next
          # chunk erased the last observation written to each file.)
          probe_number = i + 1
          obs_data = observation_data(client,var,probe_number,markers,options)
          labels = labels_for(client,var,probe)

          open(outfile_base+"_#{i/probes_per_file}.ttl",'a'){|f| observations(meas,dim,codes,obs_data,labels,outvar,options).map{|obs| f.write obs}}
          puts "#{probe_number}/#{probes.size}" unless options[:quiet]
        }
      end

      # Build the turtle string describing the cube structure (prefixes, DSD,
      # dataset, component specifications and measure properties).
      def structure(client,var,outvar,options={})
        meas = measures(client,var,options)
        dim = dimensions(client,var,options)
        codes = codes(client,var,options)

        str = prefixes(var, options)
        str << data_structure_definition(meas,outvar,options)
        str << dataset(outvar,options)
        component_specifications(meas, dim, var, options).map{ |c| str << c }
        measure_properties(meas,var,options).map{|m| str << m}

        str
      end

      #for now just make everything a measure
      def measures(client, var, options={})
        if options[:measures]
          options[:measures]
        else
          ["probe","marker","value"]
        end
      end

      # No dimensions are generated for this hard-coded layout (see NOTE above).
      def dimensions(client, var, options={})
        []
      end

      # No coded components are generated for this hard-coded layout.
      def codes(client, var, options={})
        []
      end

      # Observation labels for a single probe: "<probe_id>_<row index>" for
      # each row of the matrix.
      def labels_for(connection,var,probe_id,options={})
        row_names = connection.eval("row.names(#{var})")
        if row_names == connection.eval('NULL')
          row_names = (1..connection.eval("nrow(#{var})").payload.first).to_a
        else
          row_names = row_names.payload
        end

        labels = (1..(row_names.size)).to_a.map(&:to_s)
        labels = labels.map{|l|
          l.insert(0,probe_id.to_s + "_")
        }

        labels
      end

      # Row names of the R variable, falling back to 1..nrow when unnamed.
      def rows(connection,var,options={})
        row_names = connection.eval("row.names(#{var})")
        #hacky solution because rserve client's .to_ruby method doesn't fully work
        if row_names == connection.eval('NULL')
          row_names = (1..connection.eval("nrow(#{var})").payload.first).to_a
        else
          row_names = row_names.payload
        end
        row_names
      end

      # Column-oriented data for one probe (1-based +probe_number+):
      # probe id repeated per row, the marker (row) names, and the values.
      # Measure labels can be overridden via options[:measures].
      def observation_data(client, var, probe_number, row_names, options={})
        data = {}
        col_label = "probe"
        row_label = "marker"
        val_label = "value"

        if options[:measures]
          col_label = options[:measures][0] || "probe"
          row_label = options[:measures][1] || "marker"
          val_label = options[:measures][2] || "value"
        end

        data["#{col_label}"] = []
        data["#{row_label}"] = []
        data["#{val_label}"] = []

        col_select = "colnames"
        col_select = "names" if options[:type] == :dataframe

        # dataframe columns are list elements; matrix columns are slices
        if options[:type] == :dataframe
          probe_obj = client.eval("#{var}[[#{probe_number}]]").to_ruby
        else
          probe_obj = client.eval("#{var}[,#{probe_number}]").to_ruby
        end
        probe_id = client.eval("#{col_select}(#{var})[[#{probe_number}]]").to_ruby
        data["#{col_label}"] = (1..(probe_obj.size)).to_a.fill(probe_id)
        probe_obj.each_with_index{|lod,i|
          data["#{row_label}"] << row_names[i]
          data["#{val_label}"] << lod
        }

        # each_value replaces the original's side-effecting map
        data.each_value { |v| v.flatten! }
        data
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
module R2RDF
  # handles connection and messaging to/from the triple store
  # (either an in-memory RDF::Repository or a 4store HTTP endpoint)
  class Store
    include R2RDF::Query

    # Default configuration, merged under any user-supplied options.
    def defaults
      {
        type: :fourstore,
        url: "http://localhost:8080", #TODO port etc should eventually be extracted from URI if given
        replace: false
      }
    end

    def initialize(options={})
      @options = defaults.merge(options)
    end

    # Load +file+ (turtle) into the store under +graph+.
    # For :graph stores, +graph+ must be an RDF::Repository.
    # For :fourstore, shells out to curl; :replace controls PUT vs POST.
    def add(file,graph)
      if @options[:type] == :graph
        # was `throw`, which is catch/throw control flow, not error signaling;
        # with no matching catch it raised UncaughtThrowError
        raise ArgumentError, "please provide an RDF::Repository" unless graph.is_a? RDF::Repository
        graph.load(file)
        @store = graph
        @store
      elsif @options[:type] == :fourstore
        # SECURITY NOTE(review): file, graph and url are interpolated directly
        # into a shell command — do not pass untrusted values here.
        if @options[:replace]
          `curl -T #{file} -H 'Content-Type: application/x-turtle' #{@options[:url]}/data/http%3A%2F%2Frqtl.org%2F#{graph}`
        else
          `curl --data-urlencode data@#{file} -d 'graph=http%3A%2F%2Frqtl.org%2F#{graph}' -d 'mime-type=application/x-turtle' #{@options[:url]}/data/`
        end
      end
    end

    # Add every file in +dir+ (optionally filtered by a Regexp +pattern+,
    # or :turtle/:ttl for *.ttl) to +graph+, logging progress.
    def add_all(dir, graph, pattern=nil)
      pattern = /.+\.ttl/ if pattern == :turtle || pattern == :ttl

      files = Dir.entries(dir) - %w(. ..)
      files = files.grep(pattern) if pattern.is_a? Regexp
      nfiles = files.size
      n = 0
      files.each{|file| puts file + " #{n+=1}/#{nfiles} files"; puts add(file,graph)}
    end

    # Execute a SPARQL query string against the configured backend.
    def query(string)
      if @options[:type] == :graph
        execute(string, @store, :graph)
      elsif @options[:type] == :fourstore
        execute(string, @options[:url], :fourstore)
      end
    end

    # Endpoint URL for the :fourstore backend.
    def url
      @options[:url]
    end
  end
end
@@ -0,0 +1,66 @@
1
module R2RDF
  module Writer
    # Converts an RDF Data Cube dataset (from a turtle file, or eventually a
    # SPARQL endpoint) into Weka's ARFF text format.
    class ARFF
      include R2RDF::Query
      include R2RDF::Parser
      include R2RDF::Analyzer

      # Assemble the ARFF document string from its parts.
      #
      # relation::   dataset name used for @RELATION and the title comment
      # attributes:: hash of attribute name => ARFF type string
      # data::       observation hash; each value's hash is sorted by key so
      #              rows line up with the sorted @ATTRIBUTE declarations
      # source::     provenance string written into the header comment
      def build_arff(relation, attributes, data, source)
        str = <<-EOS
% 1. Title: #{relation.capitalize} Database
%
% 2. Sources:
% (a) Generated from RDF source #{source}
%
@RELATION #{relation}

        EOS

        # sorted so attribute order is deterministic and matches the data rows
        Hash[attributes.sort].map{|attribute,type|
          str << "@ATTRIBUTE #{attribute} #{type}\n"
        }

        str << "\n@DATA\n"
        data.map { |d| str << Hash[d[1].sort].values.join(',') + "\n" }

        str
      end

      # Build an ARFF document from a turtle file by loading it into a
      # temporary in-memory repository and querying out the cube's
      # dimensions, measures, dataset label, code lists and observations.
      def from_turtle(turtle_file, verbose=false)
        puts "loading #{turtle_file}" if verbose
        repo = RDF::Repository.load(turtle_file)
        puts "loaded #{repo.size} statements into temporary repo" if verbose

        dims = get_ary(execute_from_file("dimensions.rq",repo,:graph)).flatten
        meas = get_ary(execute_from_file("measures.rq",repo,:graph)).flatten
        relation = execute_from_file("dataset.rq",repo,:graph).to_h.first[:label].to_s
        # fold [component, code] result rows into component => [codes...]
        codes = execute_from_file("codes.rq",repo,:graph).to_h.map{|e| e.values.map(&:to_s)}.inject({}){|h,el|
          (h[el.first]||=[]) << el.last; h
        }

        data = observation_hash(execute_from_file("observations.rq",repo,:graph), true)
        attributes = {}
        # map each component to an ARFF type based on the recommended XSD
        # range of its observed values; coded dimensions become nominal
        # {a,b,...} specifications, coded measures fall back to string
        (dims | meas).map{|component|
          attributes[component] = case recommend_range(data.map{|o| o[1][component]})
          when "xsd:int"
            "integer"
          when "xsd:double"
            "real"
          when :coded
            if dims.include? component
              "{#{codes[component].join(',')}}"
            else
              "string"
            end
          end
        }

        build_arff(relation, attributes, data, turtle_file)
      end

      # Build an ARFF document directly from a SPARQL endpoint.
      # Not implemented yet; always raises.
      def from_store(endpoint_url,variable_in=nil, variable_out=nil, verbose=false)
        raise "not implemented yet"
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
module R2RDF
  module Writer
    # Helpers for reconstructing an R dataframe (in a live Rserve session)
    # from Data Cube RDF.
    module Dataframe

      # Build the R statement "name = data.frame(col=col,...)" from the
      # vector names. The trailing comma is overwritten with ')'.
      # NOTE(review): if +vectors+ is empty the '(' itself is overwritten,
      # producing an invalid statement — presumably never called that way;
      # confirm with callers.
      def framestring(name,vectors)
        framestr = "#{name} = data.frame("
        vectors.map{ |k,v| framestr << k + '=' + k +','}
        framestr[-1] = ')'
        framestr
      end

      # Query the repo for the variable's component (column) names and fetch
      # each column's values as a float array. The synthetic "refRow"
      # dimension is skipped — it is handled via row names instead.
      def get_vectors(variable_name, helper, repo)
        column_names = helper.get_ary(helper.execute(helper.property_names(variable_name), repo)).flatten.map{|n| n.gsub(' Component','')}
        vectors = {}
        column_names.map{|n|
          vectors[n] = helper.get_ary(helper.execute(helper.property_values(variable_name,n),repo),'to_f').flatten unless n == "refRow"
        }
        vectors
      end

      # Assign the row names and column vectors into the R session, then
      # evaluate the data.frame construction and set row.names.
      # Returns the REXP for the new dataframe.
      def create_dataframe(name, connection, rows, vectors)
        connection.assign('rows', rows)
        vectors.map{ |k,v|
          connection.assign(k,v)
        }
        connection.eval(framestring(name,vectors))
        connection.eval("row.names(#{name}) <- rows")
        connection.eval(name)
      end

      # Persist the R session's workspace image to +loc+.
      def save_workspace(connection, loc)
        connection.eval "save.image(#{loc})"
      end

      # Query the repo for the variable's observation row names.
      def get_rownames(variable, helper, repo)
        rows = helper.get_ary(helper.execute(helper.row_names(variable), repo)).flatten
      end

    end

    # Drives the Dataframe helpers: loads cube RDF from a turtle file or
    # SPARQL endpoint and materializes it as a dataframe in an R session.
    class Builder
      include R2RDF::Writer::Dataframe


      # Recreate +variable_in+ from +turtle_file+ as R variable
      # +variable_out+ over the given Rserve +connection+; optionally saves
      # the workspace afterwards. Returns early if either variable name is
      # missing.
      def from_turtle(turtle_file, connection, variable_in=nil, variable_out=nil, verbose=true, save=true)
        unless variable_in && variable_out
          puts "no variable specified. Simple inference coming soon" if verbose
          return
        end
        puts "loading #{turtle_file}" if verbose
        repo = RDF::Repository.load(turtle_file)
        puts "loaded #{repo.size} statements into temporary repo" if verbose
        query = R2RDF::QueryHelper.new
        rows = get_rownames(variable_in, query, repo)
        puts "frame has #{rows.size} rows" if verbose

        vectors = get_vectors(variable_in, query, repo)
        puts "got vectors of size #{vectors.first.last.size}" if verbose && vectors.first

        create_dataframe(variable_out, connection, rows, vectors)
        save_workspace(connection, connection.eval('getwd()').to_ruby) if save
      end

      # Endpoint-backed variant of from_turtle.
      # NOTE(review): incomplete — fetches row names from the endpoint but
      # never builds the dataframe; `rows` is computed and discarded.
      def from_store(endpoint_url,connection,variable_in=nil, variable_out=nil, verbose=true, save=true)
        unless variable_in && variable_out
          puts "no variable specified. Simple inference coming soon" if verbose
          return
        end
        puts "connecting to endpoint at #{endpoint_url}" if verbose
        sparql = SPARQL::Client.new(endpoint_url)
        query = R2RDF::QueryHelper.new

        rows = query.get_ary(sparql.query(query.row_names(variable_in))).flatten

      end

    end
  end
end
@@ -0,0 +1,36 @@
1
# Temporary bootstrap, just to help with development so r2rdf.rb doesn't have
# to be rewritten as a standard gem base yet. Files are brought in with `load`
# rather than `require` so they can be reloaded easily.
require 'tempfile'
require 'rdf'
require 'csv'
require 'rserve'
require 'sparql'
require 'sparql/client'
require 'rdf/turtle'

# Load every file in +folder+ (relative to this file's directory).
def load_folder(folder)
  base = File.dirname(__FILE__) + "/#{folder}"
  Dir.foreach(base) do |entry|
    next if entry == '.' || entry == '..'
    load "#{base}/#{entry}"
  end
end

# Core files, loaded individually and in dependency order.
[
  'bio-publisci/dataset/interactive.rb',
  'bio-publisci/query/query_helper.rb',
  'bio-publisci/parser.rb',
  'bio-publisci/r_client.rb',
  'bio-publisci/analyzer.rb',
  'bio-publisci/store.rb',
  'bio-publisci/dataset/data_cube.rb',
].each { |path| load "#{File.dirname(__FILE__)}/#{path}" }

# Whole-folder loads for the pluggable pieces.
load_folder('bio-publisci/metadata')
load_folder('bio-publisci/readers')
load_folder('bio-publisci/writers')
load_folder('bio-publisci/dataset/ORM')