opentox-dataset 5.0.0pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md ADDED
@@ -0,0 +1,47 @@
1
+ OpenTox Dataset
2
+ ===============
3
+
4
+ * An OpenTox REST Webservice
5
+ * Stores associations between compounds and features in datasets
6
+ * Implements a subset of the [OpenTox dataset API 1.2](http://opentox.org/dev/apis/api-1.2/dataset)
7
+ * Supports the internal YAML representation of opentox-ruby
8
+
9
+ REST operations
10
+ ---------------
11
+
12
+ Get a list of datasets GET / - List of dataset URIs 200,400,404
13
+ Get a dataset GET /{id} - Dataset representation 200,400,404
14
+ Upload a dataset POST / Dataset representation Dataset URI 200,400,404
15
+ Delete a dataset DELETE /{id} - - 200,404
16
+ Delete all datasets DELETE / - - 200,404
17
+
18
+ Supported MIME formats (http://chemical-mime.sourceforge.net/)
19
+ --------------------------------------------------------------
20
+
21
+ * application/rdf+xml (default): read/write OWL-DL
22
+ * application/x-yaml: read/write YAML
23
+
24
+ Examples
25
+ --------
26
+
27
+ Get a list of all datasets
28
+
29
+ curl http://webservices.in-silico.ch/dataset
30
+
31
+ Upload a dataset
32
+
33
+ curl -X POST -H "Content-Type:application/rdf+xml" --data-binary @{my_rdf_file} http://webservices.in-silico.ch/dataset
34
+
35
+ Get a dataset representation
36
+
37
+ curl http://webservices.in-silico.ch/dataset/{id}
38
+
39
+ Delete a dataset
40
+
41
+ curl -X DELETE http://webservices.in-silico.ch/dataset/{id}
42
+
43
+ [API documentation](http://rdoc.info/github/opentox/dataset)
44
+ ------------------------------------------------------------
45
+
46
+ Copyright (c) 2009-2011 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
47
+
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 5.0.0pre1
data/application.rb ADDED
@@ -0,0 +1,452 @@
1
+ require 'roo'
2
+ #require 'profiler'
3
+
4
+ module OpenTox
5
+ class Application < Service
6
+
7
+ @warnings = []
8
+
9
+ helpers do
10
+
11
# Parses CSV text into rows and hands them to from_table.
#
# @param csv [String] CSV document; the first row is treated as header by from_table
# @return whatever from_table returns for the parsed rows
def from_csv(csv)
  rows = CSV.parse(csv)
  from_table(rows)
end
14
+
15
# Converts an uploaded spreadsheet (params[:file]) to N-Triples.
#
# The upload is renamed so that it carries the extension belonging to the
# given reader class, dumped to a CSV file and then parsed with from_csv.
# The result is stored in @body and @content_type is set to "text/plain".
#
# @param spreadsheet [Class] reader class (Excel, Excelx or Openoffice)
# @return [String] the new content type ("text/plain")
def from_spreadsheet(spreadsheet)
  extensions = { Excel => ".xls", Excelx => ".xlsx", Openoffice => ".ods" }
  upload_path = params[:file][:tempfile].path
  # roo needs "correct" extensions; unknown readers keep the historic ".xls"
  input = upload_path + extensions.fetch(spreadsheet, ".xls")
  csv_file = upload_path + ".csv"
  File.rename upload_path, input
  spreadsheet.new(input).to_csv csv_file # roo cannot write to strings
  @body = from_csv File.read(csv_file)
  @content_type = "text/plain"
ensure
  # the renamed upload and the intermediate CSV are no longer needed
  [input, csv_file].compact.each { |path| File.delete(path) if File.exist?(path) }
end
24
+
25
+ =begin
26
+ def from_sdf(sdf)
27
+
28
+ #obconversion = OpenBabel::OBConversion.new
29
+ #obmol = OpenBabel::OBMol.new
30
+ #obconversion.set_in_and_out_formats "sdf", "inchi"
31
+
32
+ table = []
33
+
34
+ properties = []
35
+ sdf.each_line { |l| properties << l.to_s if l.match(/</) }
36
+ properties.sort!
37
+ properties.uniq!
38
+ properties.collect!{ |p| p.gsub(/<|>/,'').strip.chomp }
39
+ properties.insert 0, "InChI"
40
+ table[0] = properties
41
+
42
+ rec = 0
43
+ sdf.split(/\$\$\$\$\r*\n/).each do |s|
44
+ rec += 1
45
+ table << []
46
+ begin
47
+ # TODO: use compound service
48
+ compound = OpenTox::Compound.from_sdf sdf
49
+ #obconversion.read_string obmol, s
50
+ table.last << obconversion.write_string(obmol).gsub(/\s/,'').chomp
51
+ rescue
52
+ # TODO: Fix, will lead to follow up errors
53
+ table.last << "Could not convert structure at record #{rec}) have been ignored! \n#{s}"
54
+ end
55
+ obmol.get_data.each { |d| table.last[table.first.index(d.get_attribute)] = d.get_value }
56
+ end
57
+ from_table table
58
+ end
59
+ =end
60
+
61
# Converts a parsed table (array of rows) into an N-Triples document that
# describes an ordered dataset for @uri.
#
# The first row is the header: its first cell names the compound format
# (URI/URL, SMILES or InChI), the remaining cells are feature names. Every
# following row holds a compound identifier followed by one value per feature.
# A feature resource is created and stored (feature.put) for each header
# column. Problems that do not abort the import are collected in @warnings
# and attached to the dataset as a single RDF::OT.Warnings literal.
#
# @param table [Array<Array>] rows, e.g. from CSV.parse; the header row is
#   removed from it and the cells of the data rows are stripped in place
# @return [String] N-Triples statements joined by newlines
def from_table(table)
  @warnings = []
  ntriples = ["<#{@uri}> <#{RDF.type}> <#{RDF::OT.Dataset}>."]
  ntriples << "<#{@uri}> <#{RDF.type}> <#{RDF::OT.OrderedDataset}>."

  # features
  feature_names = table.shift.collect { |f| f.strip }
  @warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
  compound_format = feature_names.shift.strip
  bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: URI, SMILES, InChI." unless compound_format =~ /URI|URL|SMILES|InChI/i
  features = []
  feature_names.each_with_index do |f, i|
    feature = OpenTox::Feature.new File.join($feature[:uri], SecureRandom.uuid)
    feature[RDF::DC.title] = f
    features << feature
    values = table.collect { |row| row[i + 1].strip unless row[i + 1].nil? }.uniq.compact # skip compound column
    if values.size <= 3 # max classes
      feature.append RDF.type, RDF::OT.NominalFeature
      feature.append RDF.type, RDF::OT.StringFeature
      feature[RDF::OT.acceptValue] = values
    else
      types = values.collect { |v| feature_type(v) }
      if types.include?(RDF::OT.NominalFeature)
        @warnings << "Feature #{f} contains nominal and numeric values."
      else
        feature.append RDF.type, RDF::OT.NumericFeature
      end
    end
    feature.put
    ntriples << "<#{feature.uri}> <#{RDF.type}> <#{RDF::OT.Feature}>."
    ntriples << "<#{feature.uri}> <#{RDF::OLO.index}> #{i} ."
  end

  # compounds and values
  compound_uris = []
  table.each_with_index do |values, j|
    values.collect! { |v| v.nil? ? nil : v.strip }
    compound = values.shift
    begin
      case compound_format
      when /URI|URL/i
        compound_uri = compound
      when /SMILES/i
        compound_uri = OpenTox::Compound.from_smiles($compound[:uri], compound).uri
      when /InChI/i
        compound_uri = OpenTox::Compound.from_inchi($compound[:uri], URI.decode_www_form_component(compound)).uri
      end
    rescue
      @warnings << "Cannot parse compound \"#{compound}\" at position #{j+2}, all entries are ignored."
      next
    end
    # to_s: an empty compound cell yields nil and must end up as a warning, not a NoMethodError
    unless compound_uri.to_s.include?("InChI=")
      @warnings << "Cannot parse compound \"#{compound}\" at position #{j+2}, all entries are ignored."
      next
    end
    compound_uris << compound_uri
    unless values.size == features.size
      @warnings << "Number of values at position #{j+2} (#{values.size}) is different than header size (#{features.size}), all entries are ignored."
      next
    end
    ntriples << "<#{compound_uri}> <#{RDF.type}> <#{RDF::OT.Compound}>."
    ntriples << "<#{compound_uri}> <#{RDF::OLO.index}> #{j} ."
    next if values.empty? # rows without values never had a data entry

    # statements about the row itself are emitted once, not once per value
    data_entry_node = "_:dataentry#{j}"
    ntriples << "<#{@uri}> <#{RDF::OT.dataEntry}> #{data_entry_node} ."
    ntriples << "#{data_entry_node} <#{RDF.type}> <#{RDF::OT.DataEntry}> ."
    ntriples << "#{data_entry_node} <#{RDF::OLO.index}> #{j} ."
    ntriples << "#{data_entry_node} <#{RDF::OT.compound}> <#{compound_uri}> ."
    values.each_with_index do |v, i|
      value_node = "#{data_entry_node}_value#{i}"
      ntriples << "#{data_entry_node} <#{RDF::OT.values}> #{value_node} ."
      ntriples << "#{value_node} <#{RDF::OT.feature}> <#{features[i].uri}> ."
      ntriples << "#{value_node} <#{RDF::OT.value}> \"#{escape_ntriples_literal(v)}\" ."
    end
  end

  # NOTE(review): positions count accepted compounds (1-based), not table rows
  # like the "position" of the other warnings - confirm which one is intended.
  compound_uris.duplicates.each do |duplicate|
    positions = compound_uris.each_index.select { |i| compound_uris[i] == duplicate }.collect { |i| i + 1 }
    @warnings << "Duplicated compound #{duplicate} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
  end

  # warnings contain double quotes, so each one is escaped before it is
  # joined with the N-Triples line break escape (a literal backslash + n)
  warning_text = @warnings.collect { |w| escape_ntriples_literal(w) }.join('\n')
  ntriples << "<#{@uri}> <#{RDF::OT.Warnings}> \"#{warning_text}\" ."
  ntriples.join("\n")
end

# Escapes backslashes, double quotes, line breaks and tabs so that the text
# can be embedded in a double-quoted N-Triples literal.
#
# @param text [#to_s] raw text (nil becomes the empty string)
# @return [String] escaped text
def escape_ntriples_literal(text)
  replacements = { "\\" => "\\\\", "\"" => "\\\"", "\n" => "\\n", "\r" => "\\r", "\t" => "\\t" }
  text.to_s.gsub(/[\\"\n\r\t]/, replacements)
end
225
+
226
+ =begin
227
+ def to_xlsx
228
+
229
+ # both simple_xlsx and axlsx create empty documents with OLE2 errors
230
+ xlsx = @uri.split("/").last+".xlsx"
231
+ p = Axlsx::Package.new
232
+ wb = p.workbook
233
+ wb.add_worksheet(:name => "test") do |sheet|
234
+ to_table.each { |row| sheet.add_row row; puts row }
235
+ end
236
+ p.serialize("test.xlsx")
237
+
238
+ p.to_stream
239
+ #```
240
+ #Tempfile.open(@uri.split("/").last+".xlsx") do |xlsx|
241
+ SimpleXlsx::Serializer.new(xlsx) do |doc|
242
+ doc.add_sheet("People") do |sheet|
243
+ to_table.each { |row| sheet.add_row row }
244
+ end
245
+ end
246
+ send_file xlsx
247
+ #end
248
+ end
249
+ =end
250
+
251
# Serialises the rows returned by to_table as CSV.
#
# @return [String] CSV document, one line per table row
def to_csv
  CSV.generate do |out|
    to_table.each { |cells| out << cells }
  end
end
256
+
257
# Builds a table (array of rows) from the triples stored for @uri.
#
# The first row is the header: "SMILES" followed by the DC title of every
# feature. Each following row starts with the SMILES of a compound followed
# by its values.
#
# * Ordered datasets (see ordered?): features and data entries are read in
#   OLO index order; rows with fewer values than features are padded with "".
# * Other datasets: values are queried per compound and feature; the n-th
#   value of every feature goes into the n-th row of that compound, in the
#   column of its feature. Cells without a value stay nil.
#
# @return [Array<Array>] header row followed by the data rows
def to_table
  accept = "text/uri-list"
  table = []
  if ordered?
    sparql = "SELECT DISTINCT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}> . ?s <#{RDF::OLO.index}> ?i} ORDER BY ?i"
    features = FourStore.query(sparql, accept).split("\n").collect { |feature_uri| OpenTox::Feature.new feature_uri }
    table << ["SMILES"] + features.collect { |f| f.get; f[RDF::DC.title] }
    sparql = "SELECT DISTINCT ?i FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.DataEntry}> . ?s <#{RDF::OLO.index}> ?i} ORDER BY ?i"
    FourStore.query(sparql, accept).split("\n").each do |data_entry_idx|
      sparql = "SELECT DISTINCT ?compound FROM <#{@uri}> WHERE {
        ?data_entry <#{RDF::OLO.index}> #{data_entry_idx} ;
          <#{RDF::OT.compound}> ?compound. }"
      compound = OpenTox::Compound.new FourStore.query(sparql, accept).strip
      sparql = "SELECT ?value FROM <#{@uri}> WHERE {
        ?data_entry <#{RDF::OLO.index}> #{data_entry_idx} ;
          <#{RDF::OT.values}> ?v .
        ?v <#{RDF::OT.feature}> ?f;
          <#{RDF::OT.value}> ?value .
        ?f <#{RDF::OLO.index}> ?i.

        } ORDER BY ?i"
      values = FourStore.query(sparql, accept).split("\n")
      # Fill up trailing empty cells
      table << [compound.smiles] + values.fill("", values.size, features.size - values.size)
    end
  else
    sparql = "SELECT DISTINCT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}>}"
    features = FourStore.query(sparql, accept).split("\n").collect { |feature_uri| OpenTox::Feature.new feature_uri }
    table << ["SMILES"] + features.collect { |f| f.get; f[RDF::DC.title] }
    sparql = "SELECT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Compound}>. }"
    compounds = FourStore.query(sparql, accept).split("\n").collect { |compound_uri| OpenTox::Compound.new compound_uri }
    compounds.each do |compound|
      data_entries = []
      features.each_with_index do |feature, column|
        sparql = "SELECT ?value FROM <#{@uri}> WHERE {
          ?data_entry <#{RDF::OT.compound}> <#{compound.uri}>;
            <#{RDF::OT.values}> ?v .
          ?v <#{RDF::OT.feature}> <#{feature.uri}>;
            <#{RDF::OT.value}> ?value.
          } ORDER BY ?data_entry"
        FourStore.query(sparql, accept).split("\n").each_with_index do |value, i|
          data_entries[i] ||= Array.new(features.size)
          # store the value in the column of its feature (appending would put
          # it behind the nil padding and misalign the row with the header)
          data_entries[i][column] = value
        end
      end
      data_entries.each { |data_entry| table << [compound.smiles] + data_entry }
    end
  end
  table
end
319
+
320
# Classifies a single cell value.
#
# @param value [String] cell content (must respond to blank? and numeric?)
# @return [nil] for blank values, otherwise RDF::OT.NumericFeature for
#   numeric values and RDF::OT.NominalFeature for everything else
def feature_type(value)
  return nil if value.blank?
  value.numeric? ? RDF::OT.NumericFeature : RDF::OT.NominalFeature
end
329
+
330
# Checks whether the graph of @uri contains a resource typed as
# RDF::OT.OrderedDataset.
#
# @return [Boolean] true if the query returns at least one line
def ordered?
  query = "SELECT DISTINCT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.OrderedDataset}>}"
  matches = FourStore.query(query, "text/uri-list").split("\n")
  !matches.empty?
end
334
+
335
# Stores the request body as the dataset @uri.
#
# The work is wrapped in an OpenTox::Task: CSV and spreadsheet uploads are
# converted to N-Triples first, the result is written with FourStore.put and,
# for file uploads, the file name is added as DC title and OT hasSource of
# the dataset. The block passed to the task evaluates to @uri.
# The request itself is answered with status 202 and the task URI
# (Content-Type text/uri-list).
#
# Unsupported content types are rejected with bad_request_error.
def parse_put
  task = OpenTox::Task.create $task[:uri], nil, RDF::DC.description => "Dataset upload: #{@uri}" do
    case @content_type
    when "text/plain", "text/turtle", "application/rdf+xml" # no conversion needed
    when "text/csv"
      @body = from_csv @body
      @content_type = "text/plain"
    when "application/vnd.ms-excel"
      from_spreadsheet Excel
    when "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
      from_spreadsheet Excelx
    when "application/vnd.oasis.opendocument.spreadsheet"
      from_spreadsheet Openoffice
    # TODO: "chemical/x-mdl-sdfile" (SDF upload) is not supported yet
    else
      bad_request_error "#{@content_type} is not a supported content type."
    end
    FourStore.put @uri, @body, @content_type
    if params[:file]
      # the file name is client supplied: escape it for use in an N-Triples literal
      filename = params[:file][:filename].to_s.gsub(/[\\"]/) { |ch| "\\#{ch}" }
      # both statements describe the dataset (@uri), not the request URL
      nt = "<#{@uri}> <#{RDF::DC.title}> \"#{filename}\".\n<#{@uri}> <#{RDF::OT.hasSource}> \"#{filename}\"."
      FourStore.post(@uri, nt, "text/plain")
    end
    @uri
  end
  response['Content-Type'] = "text/uri-list"
  halt 202, task.uri
end
367
+ end
368
+
369
# For requests to a sub-resource of a dataset (/:id/:property), point @uri at
# the dataset itself rather than at the sub-resource.
before("/#{SERVICE}/:id/:property") { @uri = uri("/#{SERVICE}/#{params[:id]}") }
372
+
373
# Create a new dataset under a freshly generated UUID and store the request
# body in it (see parse_put).
post "/dataset/?" do
  dataset_id = SecureRandom.uuid
  @uri = uri("/#{SERVICE}/#{dataset_id}")
  parse_put
end
378
+
379
# Get a dataset representation.
# RDF/XML, Turtle, N-Triples (text/plain) and HTML are fetched from the
# triple store, text/csv is generated with to_csv. An Accept header of */*
# is treated as text/html; any other type is rejected with bad_request_error.
get "/dataset/:id/?" do
  @accept = "text/html" if @accept == '*/*'
  case @accept
  when "application/rdf+xml", "text/turtle", "text/plain", /html/
    representation = FourStore.get(@uri, @accept)
  when "text/csv"
    representation = to_csv
  else
    bad_request_error "'#{@accept}' is not a supported content type."
  end
  representation
end
404
+
405
# Create or update the dataset with the given id from the request body
# (see parse_put).
put("/dataset/:id/?") { parse_put }
409
+
410
# Get metadata of the dataset, i.e. all statements that have the dataset
# URI as subject.
# @param [Header] Accept one of `application/rdf+xml, text/turtle, text/plain`
# @return [application/rdf+xml, text/turtle, text/plain] Metadata
get '/dataset/:id/metadata' do
  case @accept
  when "application/rdf+xml", "text/turtle", "text/plain"
    # The template must use the dataset URI as subject: a template variable
    # that is not bound in the WHERE clause produces no triples at all.
    sparql = "CONSTRUCT {<#{@uri}> ?p ?o.} FROM <#{@uri}> WHERE {<#{@uri}> ?p ?o. }"
    FourStore.query sparql, @accept
  else
    bad_request_error "'#{@accept}' is not a supported content type."
  end
end
421
+
422
# Get a list of all features of the dataset
# @param [Header] Accept one of `application/rdf+xml, text/turtle, text/plain, text/uri-list`
# @return [application/rdf+xml, text/turtle, text/plain, text/uri-list] Feature list
get '/dataset/:id/features' do
  graph_formats = ["application/rdf+xml", "text/turtle", "text/plain"]
  if graph_formats.include?(@accept)
    # full description of every feature
    sparql = "CONSTRUCT {?s ?p ?o.} FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}>; ?p ?o. }"
  elsif @accept == "text/uri-list"
    # feature URIs only
    sparql = "SELECT DISTINCT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}>. }"
  else
    bad_request_error "'#{@accept}' is not a supported content type."
  end
  FourStore.query sparql, @accept
end
436
+
437
# Get a list of all compounds of the dataset
# @param [Header] Accept one of `application/rdf+xml, text/turtle, text/plain, text/uri-list`
# @return [application/rdf+xml, text/turtle, text/plain, text/uri-list] Compound list
get '/dataset/:id/compounds' do
  graph_formats = ["application/rdf+xml", "text/turtle", "text/plain"]
  if graph_formats.include?(@accept)
    # full description of every compound
    sparql = "CONSTRUCT {?s ?p ?o.} FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Compound}>; ?p ?o. }"
  elsif @accept == "text/uri-list"
    # compound URIs only
    sparql = "SELECT DISTINCT ?s FROM <#{@uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Compound}>. }"
  else
    bad_request_error "'#{@accept}' is not a supported content type."
  end
  FourStore.query sparql, @accept
end
450
+ end
451
+ end
452
+