opentox-dataset 5.0.0pre1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md ADDED
@@ -0,0 +1,47 @@
1
+ OpenTox Dataset
2
+ ===============
3
+
4
+ * An OpenTox REST Webservice
5
+ * Stores associations between compounds and features in datasets
6
+ * Implements a subset of the [OpenTox dataset API 1.2](http://opentox.org/dev/apis/api-1.2/dataset)
7
+ * Supports the internal YAML representation of opentox-ruby
8
+
9
+ REST operations
10
+ ---------------
11
+
12
+ Get a list of datasets GET / - List of dataset URIs 200,400,404
13
+ Get a dataset GET /{id} - Dataset representation 200,400,404
14
+ Upload a dataset POST / Dataset representation Dataset URI 200,400,404
15
+ Delete a dataset DELETE /{id} - - 200,404
16
+ Delete all datasets DELETE / - - 200,404
17
+
18
+ Supported MIME formats (http://chemical-mime.sourceforge.net/)
19
+ --------------------------------------------------------------
20
+
21
+ * application/rdf+xml (default): read/write OWL-DL
22
+ * application/x-yaml: read/write YAML
23
+
24
+ Examples
25
+ --------
26
+
27
+ Get a list of all datasets
28
+
29
+ curl http://webservices.in-silico.ch/dataset
30
+
31
+ Upload a dataset
32
+
33
+ curl -X POST -H "Content-Type:application/rdf+xml" --data-binary @{my_rdf_file} http://webservices.in-silico.ch/dataset
34
+
35
+ Get a dataset representation
36
+
37
+ curl http://webservices.in-silico.ch/dataset/{id}
38
+
39
+ Delete a dataset
40
+
41
+ curl -X DELETE http://webservices.in-silico.ch/dataset/{id}
42
+
43
+ [API documentation](http://rdoc.info/github/opentox/dataset)
44
+ ------------------------------------------------------------
45
+
46
+ Copyright (c) 2009-2011 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
47
+
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 5.0.0pre1
data/application.rb ADDED
@@ -0,0 +1,452 @@
1
+ require 'roo'
2
+ #require 'profiler'
3
+
4
+ module OpenTox
5
+ class Application < Service
6
+
7
+ @warnings = []
8
+
9
+ helpers do
10
+
11
# Parse a CSV string into rows and hand them to #from_table.
#
# @param csv [String] raw CSV content
# @return the result of #from_table for the parsed rows
def from_csv(csv)
  rows = CSV.parse(csv)
  from_table(rows)
end
14
+
15
# Convert the uploaded spreadsheet (params[:file][:tempfile]) via CSV into the
# representation produced by #from_csv.
#
# The upload is renamed so that it carries the extension belonging to the given
# roo class, converted to a CSV file next to it, and parsed. Afterwards @body
# holds the converted data and @content_type is "text/plain". The renamed upload
# and the intermediate CSV file are removed again.
#
# @param spreadsheet [Class] roo spreadsheet class (Excel, Excelx or Openoffice)
def from_spreadsheet spreadsheet
  extensions = { Excel => ".xls", Excelx => ".xlsx", Openoffice => ".ods" }
  upload = params[:file][:tempfile].path
  # classes missing from the map keep the former hard-coded ".xls" suffix
  input = upload + extensions.fetch(spreadsheet, ".xls")
  csv_file = upload + ".csv"
  File.rename upload, input # roo needs "correct" extensions
  begin
    spreadsheet.new(input).to_csv csv_file # roo cannot write to strings
    @body = from_csv File.read(csv_file)
    @content_type = "text/plain"
  ensure
    # the intermediate files are not needed once the table has been parsed
    [input, csv_file].each { |path| File.delete(path) if File.exist?(path) }
  end
end
24
+
25
+ =begin
26
+ def from_sdf(sdf)
27
+
28
+ #obconversion = OpenBabel::OBConversion.new
29
+ #obmol = OpenBabel::OBMol.new
30
+ #obconversion.set_in_and_out_formats "sdf", "inchi"
31
+
32
+ table = []
33
+
34
+ properties = []
35
+ sdf.each_line { |l| properties << l.to_s if l.match(/</) }
36
+ properties.sort!
37
+ properties.uniq!
38
+ properties.collect!{ |p| p.gsub(/<|>/,'').strip.chomp }
39
+ properties.insert 0, "InChI"
40
+ table[0] = properties
41
+
42
+ rec = 0
43
+ sdf.split(/\$\$\$\$\r*\n/).each do |s|
44
+ rec += 1
45
+ table << []
46
+ begin
47
+ # TODO: use compound service
48
+ compound = OpenTox::Compound.from_sdf sdf
49
+ #obconversion.read_string obmol, s
50
+ table.last << obconversion.write_string(obmol).gsub(/\s/,'').chomp
51
+ rescue
52
+ # TODO: Fix, will lead to follow up errors
53
+ table.last << "Could not convert structure at record #{rec}) have been ignored! \n#{s}"
54
+ end
55
+ obmol.get_data.each { |d| table.last[table.first.index(d.get_attribute)] = d.get_value }
56
+ end
57
+ from_table table
58
+ end
59
+ =end
60
+
61
# Convert a parsed table (array of rows) into an N-Triples dataset representation.
#
# The first row is the header: its first cell names the compound format
# (URI/URL, SMILES or InChI), the remaining cells are feature titles. Every
# following row holds one compound followed by one value per feature.
# A feature resource is created (and stored with #put) for every header column.
# Rows that cannot be used are skipped; the reason is collected in @warnings,
# which is written into the result as an OT.Warnings literal.
#
# @param table [Array<Array<String, nil>>] rows, e.g. from CSV.parse; the header
#   row is removed from it and the cells of the data rows are stripped in place
# @return [String] N-Triples describing the dataset identified by @uri
def from_table table

  # N-Triples literals must not contain raw quotes, backslashes or line breaks
  escapes = { "\\" => "\\\\", "\"" => "\\\"", "\n" => "\\n", "\r" => "\\r" }
  escape = lambda { |text| text.to_s.gsub(/[\\"\n\r]/, escapes) }

  @warnings = []
  ntriples = ["<#{@uri}> <#{RDF.type}> <#{RDF::OT.Dataset}>."]
  ntriples << "<#{@uri}> <#{RDF.type}> <#{RDF::OT.OrderedDataset}>."

  # features
  # a missing header row or an empty first cell ends in the "not supported" error below
  feature_names = (table.shift || []).collect{ |f| f.to_s.strip }
  @warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
  compound_format = feature_names.shift.to_s
  bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: URI, SMILES, InChI." unless compound_format =~ /URI|URL|SMILES|InChI/i
  features = []
  feature_names.each_with_index do |f,i|
    feature = OpenTox::Feature.new File.join($feature[:uri], SecureRandom.uuid)
    feature[RDF::DC.title] = f
    features << feature
    values = table.collect{ |row| row[i+1] }.compact.collect{ |v| v.strip }.uniq # skip compound column
    if values.size <= 3 # max classes
      feature.append RDF.type, RDF::OT.NominalFeature
      feature.append RDF.type, RDF::OT.StringFeature
      feature[RDF::OT.acceptValue] = values
    else
      types = values.collect{ |v| feature_type(v) }
      if types.include?(RDF::OT.NominalFeature)
        @warnings << "Feature #{f} contains nominal and numeric values."
      else
        feature.append RDF.type, RDF::OT.NumericFeature
      end
    end
    feature.put
    ntriples << "<#{feature.uri}> <#{RDF.type}> <#{RDF::OT.Feature}>."
    ntriples << "<#{feature.uri}> <#{RDF::OLO.index}> #{i} ."
  end

  # compounds and values
  compound_uris = []
  table.each_with_index do |values,j|
    values.collect!{ |v| v && v.strip }
    compound = values.shift
    begin
      case compound_format
      when /URI|URL/i
        compound_uri = compound
      when /SMILES/i
        compound_uri = OpenTox::Compound.from_smiles($compound[:uri], compound).uri
      when /InChI/i
        compound_uri = OpenTox::Compound.from_inchi($compound[:uri], URI.decode_www_form_component(compound)).uri
      end
    rescue
      @warnings << "Cannot parse compound \"#{compound}\" at position #{j+2}, all entries are ignored."
      next
    end
    # to_s: an empty compound cell yields nil and must be reported, not crash
    unless compound_uri.to_s.match(/InChI=/)
      @warnings << "Cannot parse compound \"#{compound}\" at position #{j+2}, all entries are ignored."
      next
    end
    compound_uris << compound_uri
    unless values.size == features.size
      @warnings << "Number of values at position #{j+2} (#{values.size}) is different than header size (#{features.size}), all entries are ignored."
      next
    end
    ntriples << "<#{compound_uri}> <#{RDF.type}> <#{RDF::OT.Compound}>."
    ntriples << "<#{compound_uri}> <#{RDF::OLO.index}> #{j} ."

    values.each_with_index do |v,i|
      data_entry_node = "_:dataentry"+ j.to_s
      value_node = data_entry_node+ "_value"+ i.to_s
      ntriples << "<#{@uri}> <#{RDF::OT.dataEntry}> #{data_entry_node} ."
      ntriples << "#{data_entry_node} <#{RDF.type}> <#{RDF::OT.DataEntry}> ."
      ntriples << "#{data_entry_node} <#{RDF::OLO.index}> #{j} ."
      ntriples << "#{data_entry_node} <#{RDF::OT.compound}> <#{compound_uri}> ."
      ntriples << "#{data_entry_node} <#{RDF::OT.values}> #{value_node} ."
      ntriples << "#{value_node} <#{RDF::OT.feature}> <#{features[i].uri}> ."
      ntriples << "#{value_node} <#{RDF::OT.value}> \"#{escape.call(v)}\" ."
    end

  end
  compound_uris.duplicates.each do |uri|
    positions = []
    compound_uris.each_with_index{ |c,i| positions << i+1 if c == uri }
    @warnings << "Duplicated compound #{uri} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
  end

  # warnings are separated by the N-Triples escape sequence \n (single quotes on purpose)
  warning_text = @warnings.collect{ |w| escape.call(w) }.join('\n')
  ntriples << "<#{@uri}> <#{RDF::OT.Warnings}> \"#{warning_text}\" ."
  ntriples.join("\n")
end
225
+
226
+ =begin
227
+ def to_xlsx
228
+
229
+ # both simple_xlsx and axlsx create empty documents with OLE2 errors
230
+ xlsx = @uri.split("/").last+".xlsx"
231
+ p = Axlsx::Package.new
232
+ wb = p.workbook
233
+ wb.add_worksheet(:name => "test") do |sheet|
234
+ to_table.each { |row| sheet.add_row row; puts row }
235
+ end
236
+ p.serialize("test.xlsx")
237
+
238
+ p.to_stream
239
+ #```
240
+ #Tempfile.open(@uri.split("/").last+".xlsx") do |xlsx|
241
+ SimpleXlsx::Serializer.new(xlsx) do |doc|
242
+ doc.add_sheet("People") do |sheet|
243
+ to_table.each { |row| sheet.add_row row }
244
+ end
245
+ end
246
+ send_file xlsx
247
+ #end
248
+ end
249
+ =end
250
+
251
# Render the rows returned by #to_table as CSV.
#
# @return [String] CSV text with one line per table row
def to_csv
  CSV.generate do |out|
    to_table.each do |cells|
      out << cells
    end
  end
end
256
+
257
# Build a table (array of rows) for the dataset stored under @uri.
#
# The first row is the header: "SMILES" followed by the DC.title of every
# feature. Each following row starts with the SMILES of a compound followed
# by its feature values.
#
# For ordered datasets (see #ordered?) features and data entries are read in
# the order of their OLO.index and short rows are padded with "" at the end.
# Otherwise one query is issued per compound/feature pair and each value is
# placed in the column of its feature; missing values stay nil.
#
# @return [Array<Array>] header row followed by one row per data entry
def to_table
  accept = "text/uri-list"
  table = []
  if ordered?
    sparql = "SELECT DISTINCT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}> . ?s <#{RDF::OLO.index}> ?i} ORDER BY ?i"
    features = FourStore.query(sparql, accept).split("\n").collect{|uri| OpenTox::Feature.new uri}
    table << ["SMILES"] + features.collect{ |f| f.get; f[RDF::DC.title] }
    sparql = "SELECT DISTINCT ?i FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.DataEntry}> . ?s <#{RDF::OLO.index}> ?i} ORDER BY ?i"
    FourStore.query(sparql, accept).split("\n").each do |data_entry_idx|
      sparql = "SELECT DISTINCT ?compound FROM <#{@uri}> WHERE {
        ?data_entry <#{RDF::OLO.index}> #{data_entry_idx} ;
          <#{RDF::OT.compound}> ?compound. }"
      compound = OpenTox::Compound.new FourStore.query(sparql, accept).strip
      sparql = "SELECT ?value FROM <#{@uri}> WHERE {
        ?data_entry <#{RDF::OLO.index}> #{data_entry_idx} ;
          <#{RDF::OT.values}> ?v .
        ?v <#{RDF::OT.feature}> ?f;
          <#{RDF::OT.value}> ?value .
        ?f <#{RDF::OLO.index}> ?i.

        } ORDER BY ?i"
      values = FourStore.query(sparql,accept).split("\n")
      # Fill up trailing empty cells
      table << [compound.smiles] + values.fill("",values.size,features.size-values.size)
    end
  else
    sparql = "SELECT DISTINCT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}>}"
    features = FourStore.query(sparql, accept).split("\n").collect{|uri| OpenTox::Feature.new uri}
    table << ["SMILES"] + features.collect{ |f| f.get; f[RDF::DC.title] }
    sparql = "SELECT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Compound}>. }"
    compounds = FourStore.query(sparql, accept).split("\n").collect{|uri| OpenTox::Compound.new uri}
    compounds.each do |compound|
      data_entries = []
      features.each_with_index do |feature, column|
        sparql = "SELECT ?value FROM <#{@uri}> WHERE {
          ?data_entry <#{RDF::OT.compound}> <#{compound.uri}>;
            <#{RDF::OT.values}> ?v .
          ?v <#{RDF::OT.feature}> <#{feature.uri}>;
            <#{RDF::OT.value}> ?value.
          } ORDER BY ?data_entry"
        FourStore.query(sparql, accept).split("\n").each_with_index do |value,i|
          # one row per data entry, pre-sized to the number of features;
          # the value goes into the column of its feature (not appended behind the row)
          data_entries[i] ||= Array.new(features.size)
          data_entries[i][column] = value
        end
      end
      data_entries.each{|data_entry| table << [compound.smiles] + data_entry}
    end
  end
  table
end
319
+
320
# Classify a single cell value.
#
# @param value the cell content; must respond to blank? and numeric?
# @return nil for blank values, RDF::OT.NumericFeature for numeric ones,
#   RDF::OT.NominalFeature for everything else
def feature_type(value)
  return nil if value.blank?
  value.numeric? ? RDF::OT.NumericFeature : RDF::OT.NominalFeature
end
329
+
330
# Tell whether the dataset under @uri is typed as OT.OrderedDataset.
#
# @return [Boolean] true if the type query returns at least one line
def ordered?
  query = "SELECT DISTINCT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.OrderedDataset}>}"
  matches = FourStore.query(query, "text/uri-list").split("\n")
  !matches.empty?
end
334
+
335
# Store the request body as the dataset @uri inside an OpenTox task and answer
# with 202 and the task URI.
#
# Inside the task the body is converted according to @content_type (CSV and
# spreadsheet uploads become "text/plain" data via #from_csv / #from_spreadsheet),
# written with FourStore.put and, for file uploads, annotated with the file name
# as DC.title and OT.hasSource of the dataset. Unsupported content types are
# rejected with bad_request_error. The task block returns @uri.
def parse_put
  task = OpenTox::Task.create $task[:uri], nil, RDF::DC.description => "Dataset upload: #{@uri}" do
    case @content_type
    when "text/plain", "text/turtle", "application/rdf+xml" # no conversion needed
    when "text/csv"
      @body = from_csv @body
      @content_type = "text/plain"
    when "application/vnd.ms-excel"
      from_spreadsheet Excel
    when "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
      from_spreadsheet Excelx
    when "application/vnd.oasis.opendocument.spreadsheet"
      from_spreadsheet Openoffice
    # when "chemical/x-mdl-sdfile"
    #   @body = parse_sdf @body
    #   @content_type = "text/plain"
    else
      bad_request_error "#{@content_type} is not a supported content type."
    end
    FourStore.put @uri, @body, @content_type
    if params[:file]
      # file names are user input: escape them for use inside an N-Triples literal
      filename = params[:file][:filename].to_s.gsub(/[\\"\n\r]/, "\\" => "\\\\", "\"" => "\\\"", "\n" => "\\n", "\r" => "\\r")
      # both triples describe the dataset itself, so both use @uri as subject
      nt = "<#{@uri}> <#{RDF::DC.title}> \"#{filename}\".\n<#{@uri}> <#{RDF::OT.hasSource}> \"#{filename}\"."
      FourStore.post(@uri, nt, "text/plain")
    end
    @uri
  end
  response['Content-Type'] = "text/uri-list"
  halt 202, task.uri
end
367
+ end
368
+
369
# For sub-resource requests (/:id/:property) point @uri at the parent dataset.
before "/#{SERVICE}/:id/:property" do
  dataset_path = "/#{SERVICE}/#{params[:id]}"
  @uri = uri(dataset_path)
end
372
+
373
# Create a new resource: a fresh UUID-based dataset URI is assigned to @uri
# and the request is processed by parse_put.
post "/dataset/?" do
  new_id = SecureRandom.uuid
  @uri = uri("/#{SERVICE}/#{new_id}")
  parse_put
end
378
+
379
# Return a dataset representation in the format requested by @accept.
# A wildcard Accept header is treated as text/html. RDF/XML, Turtle, plain text
# and HTML-like types are fetched with FourStore.get; CSV is built by to_csv;
# every other type is rejected with bad_request_error.
get "/dataset/:id/?" do
  @accept = "text/html" if @accept == '*/*'
  case @accept
  when "application/rdf+xml", "text/turtle", "text/plain", /html/
    representation = FourStore.get(@uri, @accept)
  when "text/csv"
    representation = to_csv
  # spreadsheet (xls, xlsx, ods) and chemical/x-mdl-sdfile output are not implemented
  else
    bad_request_error "'#{@accept}' is not a supported content type."
  end
  representation
end
404
+
405
# Create or update a resource; all of the work is done by parse_put.
put("/dataset/:id/?") { parse_put }
409
+
410
# Get metadata of the dataset: all triples that have the dataset URI as subject.
# @param [Header] Accept one of `application/rdf+xml, text/turtle, text/plain`
# @return [application/rdf+xml, text/turtle, text/plain] Metadata
get '/dataset/:id/metadata' do
  case @accept
  when "application/rdf+xml", "text/turtle", "text/plain"
    # The CONSTRUCT template must only use variables bound in WHERE; the subject
    # is the dataset itself (a template with an unbound ?s produces no triples).
    sparql = "CONSTRUCT {<#{@uri}> ?p ?o.} FROM <#{@uri}> WHERE {<#{@uri}> ?p ?o. }"
    FourStore.query sparql, @accept
  else
    bad_request_error "'#{@accept}' is not a supported content type."
  end
end
421
+
422
# List the features of a dataset.
# RDF formats receive the full description of every OT.Feature resource,
# text/uri-list only the distinct feature URIs.
# @param [Header] Accept one of `application/rdf+xml, text/turtle, text/plain, text/uri-list` (default application/rdf+xml)
# @return [application/rdf+xml, text/turtle, text/plain, text/uri-list] Feature list
get '/dataset/:id/features' do
  if ["application/rdf+xml", "text/turtle", "text/plain"].include?(@accept)
    sparql = "CONSTRUCT {?s ?p ?o.} FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}>; ?p ?o. }"
  elsif @accept == "text/uri-list"
    sparql = "SELECT DISTINCT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}>. }"
  else
    bad_request_error "'#{@accept}' is not a supported content type."
  end
  FourStore.query sparql, @accept
end
436
+
437
# List the compounds of a dataset.
# RDF formats receive the full description of every OT.Compound resource,
# text/uri-list only the distinct compound URIs.
# @param [Header] Accept one of `application/rdf+xml, text/turtle, text/plain, text/uri-list`
# @return [application/rdf+xml, text/turtle, text/plain, text/uri-list] Compound list
get '/dataset/:id/compounds' do
  if ["application/rdf+xml", "text/turtle", "text/plain"].include?(@accept)
    sparql = "CONSTRUCT {?s ?p ?o.} FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Compound}>; ?p ?o. }"
  elsif @accept == "text/uri-list"
    sparql = "SELECT DISTINCT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Compound}>. }"
  else
    bad_request_error "'#{@accept}' is not a supported content type."
  end
  FourStore.query sparql, @accept
end
450
+ end
451
+ end
452
+