biointerchange 0.1.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +17 -0
- data/VERSION +1 -1
- data/generators/GOxrefify.rb +41 -0
- data/generators/rdfxml.rb +6 -4
- data/lib/biointerchange/core.rb +94 -20
- data/lib/biointerchange/genomics/gff3_feature_set.rb +11 -3
- data/lib/biointerchange/genomics/gff3_pragmas.rb +3 -3
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +217 -12
- data/lib/biointerchange/genomics/gff3_reader.rb +78 -20
- data/lib/biointerchange/genomics/gvf_reader.rb +9 -3
- data/lib/biointerchange/gff3o.rb +69 -55
- data/lib/biointerchange/goxref.rb +867 -0
- data/lib/biointerchange/gvf1o.rb +546 -82
- data/lib/biointerchange/textmining/text_mining_reader.rb +9 -0
- data/spec/gff3_rdfwriter_spec.rb +1 -1
- data/spec/gvf_rdfwriter_spec.rb +1 -1
- data/spec/text_mining_pdfx_xml_reader_spec.rb +3 -0
- data/spec/text_mining_pubannos_json_reader_spec.rb +4 -1
- data/supplemental/java/biointerchange/pom.xml +1 -1
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GFF3O.java +93 -125
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GVF1O.java +304 -205
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SIO.java +4044 -4290
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SOFA.java +3 -3
- data/supplemental/python/biointerchange/gff3o.py +1 -89
- data/supplemental/python/biointerchange/gvf1o.py +129 -147
- data/supplemental/python/biointerchange/sio.py +817 -46
- data/supplemental/python/biointerchange/sofa.py +543 -543
- data/supplemental/python/setup.py +1 -1
- data/web/ontologies.html +1 -3
- metadata +7 -2
data/README.md
CHANGED
@@ -274,6 +274,12 @@ Building a new version of the Ruby vocabulary classes for GFF3, SIO, SOFA (requi
|
|
274
274
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-sofa> SOFA >> lib/biointerchange/sofa.rb
|
275
275
|
echo -e "\nend" >> lib/biointerchange/sofa.rb
|
276
276
|
|
277
|
+
A Gene Ontology external reference (GOxref) vocabulary can be created by directly downloading the latest version of `GO.xrf_abbs`:
|
278
|
+
|
279
|
+
echo -e "module BioInterchange\n" > lib/biointerchange/goxref.rb
|
280
|
+
curl ftp://ftp.geneontology.org/pub/go/doc/GO.xrf_abbs | ruby generators/GOxrefify.rb
|
281
|
+
echo -e "\nend" >> lib/biointerchange/goxref.rb
|
282
|
+
|
277
283
|
#### Python Vocabulary Classes
|
278
284
|
|
279
285
|
The source-code generation can be skipped, if none of the ontologies that are used by BioInterchange have been changed. Otherwise, the existing Python vocabulary class wrappers can be generated as follows:
|
@@ -282,6 +288,7 @@ The source-code generation can be skipped, if none of the ontologies that are us
|
|
282
288
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-gvf1o> GVF1O | ruby generators/pythonify.rb > supplemental/python/biointerchange/gvf1o.py
|
283
289
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-sio> SIO | ruby generators/pythonify.rb > supplemental/python/biointerchange/sio.py
|
284
290
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-sofa> SOFA | ruby generators/pythonify.rb > supplemental/python/biointerchange/sofa.py
|
291
|
+
curl ftp://ftp.geneontology.org/pub/go/doc/GO.xrf_abbs | ruby generators/GOxrefify.rb | ruby generators/pythonify.rb > supplemental/python/biointerchange/goxref.py
|
285
292
|
|
286
293
|
Generate the BioInterchange Python vocabulary egg:
|
287
294
|
|
@@ -302,6 +309,7 @@ The source-code generation can be skipped, if none of the ontologies that are us
|
|
302
309
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-gvf1o> GVF1O | ruby generators/javaify.rb > supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GVF1O.java
|
303
310
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-sio> SIO | ruby generators/javaify.rb > supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SIO.java
|
304
311
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-sofa> SOFA | ruby generators/javaify.rb "http://purl.obolibrary.org/obo/" > supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SOFA.java
|
312
|
+
curl ftp://ftp.geneontology.org/pub/go/doc/GO.xrf_abbs | ruby generators/GOxrefify.rb | ruby generators/javaify.rb > supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GOXRef.java
|
305
313
|
|
306
314
|
Generate the BioInterchange Java vocabulary artifact:
|
307
315
|
|
@@ -346,6 +354,15 @@ A more verbose is produced by calling `rspec` directly:
|
|
346
354
|
|
347
355
|
bundle exec rake rdoc
|
348
356
|
|
357
|
+
### Deploying on Rubygems
|
358
|
+
|
359
|
+
_Note:_ Only BioInterchange package maintainers can deploy the 'biointerchange' gem on Rubygems.
|
360
|
+
|
361
|
+
bundle exec rake version:bump:(major | minor | patch)
|
362
|
+
bundle exec rake gemspec
|
363
|
+
bundle exec gem build biointerchange.gemspec
|
364
|
+
bundle exec gem push biointerchange-VERSION.gem
|
365
|
+
|
349
366
|
### Troubleshooting
|
350
367
|
|
351
368
|
#### GCC: No such file or directory
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
def record(id, description, uri)
|
4
|
+
puts " # Returns the link-out URI for objects of \"#{description}\"."
|
5
|
+
puts " def self.#{id}"
|
6
|
+
puts " RDF::URI.new(\"#{uri}\")"
|
7
|
+
puts ' end'
|
8
|
+
puts ''
|
9
|
+
end
|
10
|
+
|
11
|
+
puts 'class GOXRef'
|
12
|
+
puts ''
|
13
|
+
|
14
|
+
in_record = false
|
15
|
+
|
16
|
+
id = nil
|
17
|
+
description = nil
|
18
|
+
uri = nil
|
19
|
+
|
20
|
+
STDIN.each { |line|
|
21
|
+
line.chomp!
|
22
|
+
|
23
|
+
if line.empty? then
|
24
|
+
record(id, description, uri) if uri and not uri.match(/\[.*\]/)
|
25
|
+
uri = nil
|
26
|
+
in_record = false
|
27
|
+
end
|
28
|
+
|
29
|
+
if line.start_with?('abbreviation:') and not in_record then
|
30
|
+
id = line.sub(/^abbreviation: /, '').gsub(/[-\/]/, '_')
|
31
|
+
in_record = true
|
32
|
+
end
|
33
|
+
|
34
|
+
description = line.sub(/^database: /, '') if line.start_with?('database:') and in_record
|
35
|
+
uri = line.sub(/^url_syntax: /, '').sub(/\[example_id\]$/, '') if line.start_with?('url_syntax:') and in_record
|
36
|
+
}
|
37
|
+
|
38
|
+
record(id, description, uri) if uri
|
39
|
+
|
40
|
+
puts 'end'
|
41
|
+
|
data/generators/rdfxml.rb
CHANGED
@@ -16,8 +16,10 @@ OBO_DEF = RDF::URI.new('http://purl.obolibrary.org/obo/def')
|
|
16
16
|
# For handling synonyms in SIO:
|
17
17
|
SIO_SYN = RDF::URI.new('http://semanticscience.org/resource/synonym')
|
18
18
|
|
19
|
-
|
20
|
-
|
19
|
+
# This label conversion also appears in:
|
20
|
+
# +lib/biointerchange/core.rb+
|
21
|
+
def make_safe_label(label)
|
22
|
+
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/, "a_#{$1}").gsub(/^_+|_+$/, '').gsub(/_+/, '_')
|
21
23
|
end
|
22
24
|
|
23
25
|
reader = RDF::RDFXML::Reader.open(ARGV[0])
|
@@ -55,7 +57,7 @@ model.keys.each { |key|
|
|
55
57
|
next unless type
|
56
58
|
|
57
59
|
label = entry[RDF::RDFS.label].to_s
|
58
|
-
next if
|
60
|
+
next if make_safe_label(label).empty?
|
59
61
|
uri = key.to_s
|
60
62
|
|
61
63
|
# Only deal with URI sub-classes/sub-properties, whilst ignoring restrictions, etc.
|
@@ -84,7 +86,7 @@ model.keys.each { |key|
|
|
84
86
|
set = combined_uris[label_or_synonym]
|
85
87
|
set = [] unless set
|
86
88
|
combined_uris[label_or_synonym] = set | [ uri ]
|
87
|
-
generated_labels[label_or_synonym] =
|
89
|
+
generated_labels[label_or_synonym] = make_safe_label(label_or_synonym)
|
88
90
|
}
|
89
91
|
|
90
92
|
object_properties[uri] = true if type == RDF::OWL.ObjectProperty
|
data/lib/biointerchange/core.rb
CHANGED
@@ -5,11 +5,23 @@
|
|
5
5
|
# of it as a gem in your own Ruby implementation.
|
6
6
|
module BioInterchange
|
7
7
|
|
8
|
+
### Global behaviour settings, which can be altered programmatically or via the CLI:
|
9
|
+
|
10
|
+
# If true, then RDF::Graph's "insert" function will be overwritten so that it
|
11
|
+
# immediately outputs N-Triples. This reduces memory requirements (since no RDF
|
12
|
+
# graph is kept in memory) and improves performance (since no looping through an RDF graph
|
13
|
+
# is necessary).
|
14
|
+
@@skip_rdf_graph = true
|
15
|
+
def self.skip_rdf_graph
|
16
|
+
@@skip_rdf_graph
|
17
|
+
end
|
18
|
+
|
8
19
|
# Custom Exceptions and Errors
|
9
20
|
require 'biointerchange/exceptions'
|
10
21
|
|
11
22
|
# Ontologies (besides the ones from the 'rdf' gem)
|
12
23
|
require 'biointerchange/gff3o'
|
24
|
+
require 'biointerchange/goxref'
|
13
25
|
require 'biointerchange/gvf1o'
|
14
26
|
require 'biointerchange/sio'
|
15
27
|
require 'biointerchange/sofa'
|
@@ -78,14 +90,17 @@ module BioInterchange
|
|
78
90
|
opt = Getopt::Long.getopts(
|
79
91
|
["--help", "-h", Getopt::BOOLEAN],
|
80
92
|
["--debug", "-d", Getopt::BOOLEAN], # set debug mode => print stack traces
|
93
|
+
["--no_rdf_graph_optimization", "-n", Getopt::BOOLEAN], # set self.skip_rdf_graph to false
|
94
|
+
["--batchsize", "-b", Getopt::OPTIONAL], # batchsize for readers/writers that support +postpone?+
|
81
95
|
["--input", "-i", Getopt::REQUIRED], # input file format
|
82
96
|
["--rdf", "-r", Getopt::REQUIRED], # output file format
|
83
|
-
["--
|
84
|
-
["--
|
85
|
-
["--
|
86
|
-
["--
|
97
|
+
["--annotate_name", Getopt::OPTIONAL], # name of resource/tool/person
|
98
|
+
["--annotate_name_id", Getopt::OPTIONAL], # uri of resource/tool/person
|
99
|
+
["--annotate_date", Getopt::OPTIONAL], # date of processing/annotation
|
100
|
+
["--annotate_version", Getopt::OPTIONAL], # version number of resource
|
87
101
|
["--file", "-f", Getopt::OPTIONAL], # file to read, will read from STDIN if not supplied
|
88
|
-
["--out", "-o", Getopt::OPTIONAL] # output file, will out to STDOUT if not supplied
|
102
|
+
["--out", "-o", Getopt::OPTIONAL], # output file, will out to STDOUT if not supplied
|
103
|
+
["--version", "-v", Getopt::OPTIONAL] # output the version number of the gem and exit
|
89
104
|
)
|
90
105
|
|
91
106
|
if opt['help'] or not opt['input'] or not opt['rdf'] then
|
@@ -115,26 +130,38 @@ module BioInterchange
|
|
115
130
|
puts ' Input: dbcls.catanns.json, uk.ac.man.pdfx'
|
116
131
|
puts ' Output: rdf.bh12.sio'
|
117
132
|
puts ' Options:'
|
118
|
-
puts '
|
119
|
-
puts '
|
120
|
-
puts ' --
|
121
|
-
puts ' --
|
133
|
+
puts ' --annotate_date <date> : date of processing/annotation (optional)'
|
134
|
+
puts ' --annotate_version <version> : version number of resource (optional)'
|
135
|
+
puts ' --annotate_name <name> : name of resource/tool/person (required)'
|
136
|
+
puts ' --annotate_name_id <id> : URI of resource/tool/person (required)'
|
122
137
|
puts ''
|
123
138
|
puts 'Input-/RDF-format specific options:'
|
124
139
|
puts ' Input: biointerchange.gff3 or biointerchange.gvf'
|
125
140
|
puts ' Output: rdf.biointerchange.gff3 or rdf.biointerchange.gvf'
|
126
141
|
puts ' Options:'
|
142
|
+
puts ' -b <size>/--batchsize <size> : process features in batches of the given size (optional)'
|
127
143
|
puts ' -t <date>/--date <date> : date when the GFF3/GVF file was created (optional)'
|
128
144
|
puts ' --name <name> : name of the GFF3/GVF file creator (optional)'
|
129
145
|
puts ' --name_id <id> : email address of the GFF3/GVF file creator (optional)'
|
130
146
|
puts ''
|
131
147
|
puts 'Other options:'
|
148
|
+
puts ' -v / --version : print the Gem\'s version number and exit'
|
132
149
|
puts ' -d / --debug : turn on debugging output (for stacktraces)'
|
133
150
|
puts ' -h --help : this message'
|
134
151
|
|
135
152
|
exit 1
|
136
153
|
end
|
137
154
|
|
155
|
+
# Print version number and exit:
|
156
|
+
if opt['version'] then
|
157
|
+
puts 'BioInterchange 0.1.4'
|
158
|
+
exit
|
159
|
+
end
|
160
|
+
|
161
|
+
# Turn off optimization, if requested. This will generate an RDF graph in memory and
|
162
|
+
# at least double memory requirements and runtime.
|
163
|
+
@@skip_rdf_graph = false if opt['no_rdf_graph_optimization']
|
164
|
+
|
138
165
|
# Check if the input/rdf options are supported:
|
139
166
|
if opt['input'] == 'dbcls.catanns.json' or opt['input'] == 'uk.ac.man.pdfx' then
|
140
167
|
if opt['rdf'] == 'rdf.bh12.sio' then
|
@@ -158,27 +185,27 @@ module BioInterchange
|
|
158
185
|
unsupported_combination
|
159
186
|
end
|
160
187
|
|
161
|
-
opt['
|
162
|
-
|
163
|
-
|
188
|
+
wrong_type('batchsize', 'a positive integer') if opt['batchsize'] and not opt['batchsize'].match(/^[1-9][0-9]*$/)
|
189
|
+
|
190
|
+
opt['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
|
191
|
+
|
164
192
|
# Generate model from file (deserialization).
|
165
193
|
# Note: if-clauses are lexicographically ordered.
|
166
194
|
reader = nil
|
167
195
|
if opt['input'] == 'biointerchange.gff3' then
|
168
|
-
reader = BioInterchange::Genomics::GFF3Reader.new(opt['
|
196
|
+
reader = BioInterchange::Genomics::GFF3Reader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
|
169
197
|
elsif opt['input'] == 'biointerchange.gvf' then
|
170
|
-
reader = BioInterchange::Genomics::GVFReader.new(opt['
|
198
|
+
reader = BioInterchange::Genomics::GVFReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
|
171
199
|
elsif opt['input'] == 'dbcls.catanns.json' then
|
172
|
-
reader = BioInterchange::TextMining::PubannosJsonReader.new(opt['
|
200
|
+
reader = BioInterchange::TextMining::PubannosJsonReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
|
173
201
|
elsif opt['input'] == 'uk.ac.man.pdfx' then
|
174
|
-
reader = BioInterchange::TextMining::PdfxXmlReader.new(opt['
|
202
|
+
reader = BioInterchange::TextMining::PdfxXmlReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['annotate_version'])
|
175
203
|
end
|
176
204
|
|
177
|
-
model = nil
|
178
205
|
if opt["file"]
|
179
|
-
|
206
|
+
input_source = File.new(opt["file"],'r')
|
180
207
|
else
|
181
|
-
|
208
|
+
input_source = STDIN
|
182
209
|
end
|
183
210
|
|
184
211
|
# Generate rdf from model (serialization).
|
@@ -193,7 +220,10 @@ module BioInterchange
|
|
193
220
|
writer = BioInterchange::Genomics::RDFWriter.new(STDOUT) unless opt['out']
|
194
221
|
end
|
195
222
|
|
196
|
-
|
223
|
+
begin
|
224
|
+
model = reader.deserialize(input_source)
|
225
|
+
writer.serialize(model)
|
226
|
+
end while reader.postponed?
|
197
227
|
|
198
228
|
rescue ArgumentError => e
|
199
229
|
$stderr.puts e.message
|
@@ -228,11 +258,55 @@ module BioInterchange
|
|
228
258
|
}
|
229
259
|
end
|
230
260
|
|
261
|
+
# Returns a "safe" version of a label that can be used as a Ruby method name.
|
262
|
+
#
|
263
|
+
# +label+:: string that should be converted into a "safe" string that can be used as a Ruby method name
|
264
|
+
def self.make_safe_label(label)
|
265
|
+
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/, "a_#{$1}").gsub(/^_+|_+$/, '').gsub(/_+/, '_')
|
266
|
+
end
|
267
|
+
|
231
268
|
private
|
232
269
|
|
233
270
|
def self.unsupported_combination
|
234
271
|
raise ArgumentError, 'This input/output format combination is not supported.'
|
235
272
|
end
|
236
273
|
|
274
|
+
def self.wrong_type(parameter, expected_type)
|
275
|
+
raise ArgumentError, "The parameter '#{parameter}' needs to be #{expected_type}."
|
276
|
+
end
|
277
|
+
|
278
|
+
end
|
279
|
+
|
280
|
+
# Overwrite RDF::Graph implementation, in case we do not want to keep
|
281
|
+
# the complete graph in memory. If the implementing writer does not
|
282
|
+
# set an output stream via +fast_ostream+, then fall back to the original
|
283
|
+
# implementation.
|
284
|
+
module RDF
|
285
|
+
|
286
|
+
class Graph
|
287
|
+
# DO NOT keep old insert implementation due to infinite recursion caused by module loading dependencies!
|
288
|
+
# alias_method :graph_building_insert, :insert
|
289
|
+
|
290
|
+
# Set an output stream for writing in +insert+.
|
291
|
+
#
|
292
|
+
# +ostream+:: Output stream that is populated by +insert+, if optimization can be carried out.
|
293
|
+
def fast_ostream(ostream)
|
294
|
+
@ostream = ostream
|
295
|
+
end
|
296
|
+
|
297
|
+
# Alternative implementation to +insert+, which can immediately output N-Triples instead
|
298
|
+
# of building an in-memory graph first.
|
299
|
+
#
|
300
|
+
# +statement+:: RDF statement that should be serialized.
|
301
|
+
def insert(statement)
|
302
|
+
if BioInterchange::skip_rdf_graph and @ostream then
|
303
|
+
@ostream.puts(statement.to_ntriples)
|
304
|
+
else
|
305
|
+
insert_statement(statement)
|
306
|
+
end
|
307
|
+
end
|
308
|
+
|
309
|
+
end
|
310
|
+
|
237
311
|
end
|
238
312
|
|
@@ -38,11 +38,11 @@ class GFF3FeatureSet
|
|
38
38
|
@pragmas.keys
|
39
39
|
end
|
40
40
|
|
41
|
-
# Returns an URI for this particular feature set, which is a SHA1 hash over the
|
41
|
+
# Returns a URI for this particular feature set, which contains a SHA1 hash over the pragmas' concatenated properties.
|
42
42
|
def uri
|
43
43
|
clob = ''
|
44
|
-
|
45
|
-
clob << "#{
|
44
|
+
pragmas.each { |pragma_name|
|
45
|
+
clob << "#{pragma_name}\t#{pragma(pragma_name).to_s}\n"
|
46
46
|
}
|
47
47
|
"biointerchange://gff3/featureset/self/#{Digest::SHA1.hexdigest(clob)}"
|
48
48
|
end
|
@@ -62,6 +62,14 @@ class GFF3FeatureSet
|
|
62
62
|
# TODO Should throw exception if name is not a string.
|
63
63
|
@pragmas[name] = value
|
64
64
|
end
|
65
|
+
|
66
|
+
# Removes all features from the set, but keeps the pragmas. This enables
|
67
|
+
# batched processing, since the URI for the set is only determined by the
|
68
|
+
# pragma statement contents.
|
69
|
+
def prune
|
70
|
+
@set.clear
|
71
|
+
end
|
72
|
+
|
65
73
|
end
|
66
74
|
|
67
75
|
end
|
@@ -1,15 +1,15 @@
|
|
1
1
|
|
2
2
|
module BioInterchange::Genomics
|
3
3
|
|
4
|
-
# Represents a named region, which is defined by the pragma statement 'sequence-region'.
|
5
|
-
class
|
4
|
+
# Represents a named region, a.k.a. landmark, which is defined by the pragma statement 'sequence-region'.
|
5
|
+
class GFF3Landmark
|
6
6
|
|
7
7
|
# Create a new instance of a named region.
|
8
8
|
#
|
9
9
|
# +seqid+:: unique identifier (in the GFF3 file context) that identifies this region
|
10
10
|
# +start_coordinate+:: genomic start coordinate of the region
|
11
11
|
# +end_coordinate+:: genomic end coordinate of the region
|
12
|
-
def initialize(seqid, start_coordinate, end_coordinate)
|
12
|
+
def initialize(seqid, start_coordinate = nil, end_coordinate = nil)
|
13
13
|
@seqid = seqid
|
14
14
|
@start_coordinate = start_coordinate
|
15
15
|
@end_coordinate = end_coordinate
|
@@ -46,7 +46,16 @@ protected
|
|
46
46
|
#
|
47
47
|
# +model+:: an instance of +BioInterchange::Genomics::GFF3FeatureSet+
|
48
48
|
def serialize_model(model)
|
49
|
+
# We record landmarks, because they can either be written when their "##sequence-region"
|
50
|
+
# pragma statement appears, or otherwise, when the first feature with said landmark is
|
51
|
+
# being serialized.
|
52
|
+
@landmarks = {}
|
53
|
+
|
54
|
+
# Record written variants in order to avoid writing out RDF.type multiple times.
|
55
|
+
@variants = {}
|
56
|
+
|
49
57
|
graph = RDF::Graph.new
|
58
|
+
graph.fast_ostream(@ostream) if BioInterchange::skip_rdf_graph
|
50
59
|
set_uri = RDF::URI.new(model.uri)
|
51
60
|
graph.insert(RDF::Statement.new(set_uri, RDF.type, @base.Set))
|
52
61
|
model.pragmas.each { |pragma_name|
|
@@ -57,21 +66,32 @@ protected
|
|
57
66
|
}
|
58
67
|
RDF::NTriples::Writer.dump(graph, @ostream)
|
59
68
|
# TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
|
69
|
+
# Having said that, Jena's rdfcat is very good for converting formats
|
70
|
+
# anyway, so perhaps it is not worth investigating the following.
|
60
71
|
# RDF::RDFXML::Writer.dump(graph, @ostream)
|
61
72
|
end
|
62
73
|
|
63
74
|
# Serializes pragmas for a given feature set URI.
|
75
|
+
#
|
64
76
|
# +graph+:: RDF graph to which the pragmas are added
|
65
77
|
# +set_uri+:: the feature set URI to which the pragmas belong
|
66
78
|
# +pragma+:: an object representing a pragma statement
|
67
79
|
def serialize_pragma(graph, set_uri, pragma)
|
68
80
|
if pragma.kind_of?(Hash) then
|
69
|
-
if pragma.has_key?('
|
81
|
+
if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform')) and @base == BioInterchange::GVF1O then
|
82
|
+
serialize_structured_attribute(graph, set_uri, pragma)
|
83
|
+
elsif pragma.has_key?('gff-version') and @base == BioInterchange::GFF3O then
|
70
84
|
graph.insert(RDF::Statement.new(set_uri, @base.version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
|
71
85
|
elsif pragma.has_key?('gff-version') and @base == BioInterchange::GVF1O then
|
72
86
|
graph.insert(RDF::Statement.new(set_uri, @base.gff_version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
|
73
87
|
elsif pragma.has_key?('gvf-version') and @base == BioInterchange::GVF1O then
|
74
88
|
graph.insert(RDF::Statement.new(set_uri, @base.gvf_version, RDF::Literal.new(pragma['gvf-version'], :datatype => RDF::XSD.float )))
|
89
|
+
elsif pragma.has_key?('sequence-region') then
|
90
|
+
pragma['sequence-region'].keys.each { |seqid|
|
91
|
+
serialize_landmark(graph, set_uri, pragma['sequence-region'][seqid])
|
92
|
+
}
|
93
|
+
elsif pragma.has_key?('species') then
|
94
|
+
graph.insert(RDF::Statement.new(set_uri, @base.species, RDF::URI.new(pragma['species'])))
|
75
95
|
end
|
76
96
|
else
|
77
97
|
# TODO
|
@@ -87,17 +107,18 @@ protected
|
|
87
107
|
# TODO Make sure there is only one value in the 'ID' list.
|
88
108
|
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}") unless feature.attributes.has_key?('ID')
|
89
109
|
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}") if feature.attributes.has_key?('ID')
|
90
|
-
|
110
|
+
feature_datatype_properties = @base.feature_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
|
111
|
+
feature_object_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
91
112
|
graph.insert(RDF::Statement.new(set_uri, @base.contains, feature_uri))
|
92
113
|
graph.insert(RDF::Statement.new(feature_uri, RDF.type, @base.Feature))
|
93
|
-
graph
|
114
|
+
serialize_landmark(graph, set_uri, GFF3Landmark.new(feature.sequence_id)) unless @landmarks.has_key?(feature.sequence_id)
|
115
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.seqid ].flatten, feature_object_properties)[0], @landmarks[feature.sequence_id]))
|
94
116
|
graph.insert(RDF::Statement.new(feature_uri, @base.source, RDF::Literal.new(feature.source)))
|
95
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.type,
|
96
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.start,
|
97
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.end,
|
117
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.type, feature.type))
|
118
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.start, feature_datatype_properties)[0], RDF::Literal.new(feature.start_coordinate)))
|
119
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.end, feature_datatype_properties)[0], RDF::Literal.new(feature.end_coordinate)))
|
98
120
|
graph.insert(RDF::Statement.new(feature_uri, @base.score, RDF::Literal.new(feature.score))) if feature.score
|
99
|
-
|
100
|
-
strand_uri = @base.with_parent(@base.strand, feature_properties)[0]
|
121
|
+
strand_uri = @base.with_parent(@base.strand, feature_object_properties)[0]
|
101
122
|
case feature.strand
|
102
123
|
when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
|
103
124
|
graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.NotStranded))
|
@@ -108,13 +129,27 @@ protected
|
|
108
129
|
when BioInterchange::Genomics::GFF3Feature::NEGATIVE
|
109
130
|
graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.Negative))
|
110
131
|
else
|
111
|
-
raise
|
132
|
+
raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
|
112
133
|
end
|
113
134
|
graph.insert(RDF::Statement.new(feature_uri, @base.phase, RDF::Literal.new(feature.phase))) if feature.phase
|
114
135
|
|
115
136
|
serialize_attributes(graph, set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
|
116
137
|
end
|
117
138
|
|
139
|
+
# Serializes a genomic feature landmark ("seqid").
|
140
|
+
#
|
141
|
+
# +graph+:: RDF graph to which the landmark is added
|
142
|
+
# +set_uri+:: the feature set URI to which the landmark belongs
|
143
|
+
# +landmark+:: encapsulated landmark data
|
144
|
+
def serialize_landmark(graph, set_uri, landmark)
|
145
|
+
return if @landmarks.has_key?(landmark.seqid)
|
146
|
+
landmark_uri = RDF::URI.new("#{set_uri.to_s}/landmark/#{landmark.seqid}")
|
147
|
+
@landmarks[landmark.seqid] = landmark_uri
|
148
|
+
graph.insert(RDF::Statement.new(landmark_uri, @base.with_parent([ @base.id ].flatten, @base.landmark_properties)[0], RDF::Literal.new(landmark.seqid)))
|
149
|
+
graph.insert(RDF::Statement.new(landmark_uri, @base.with_parent([ @base.start ].flatten, @base.landmark_properties)[0], RDF::Literal.new(landmark.start_coordinate))) if landmark.start_coordinate
|
150
|
+
graph.insert(RDF::Statement.new(landmark_uri, @base.with_parent([ @base.end ].flatten, @base.landmark_properties)[0], RDF::Literal.new(landmark.end_coordinate))) if landmark.end_coordinate
|
151
|
+
end
|
152
|
+
|
118
153
|
# Serializes the attributes of a feature.
|
119
154
|
#
|
120
155
|
# +graph+:: RDF graph to which the feature is added
|
@@ -123,24 +158,194 @@ protected
|
|
123
158
|
# +attributes+:: a map of tag/value pairs
|
124
159
|
def serialize_attributes(graph, set_uri, feature_uri, attributes)
|
125
160
|
attributes.each_pair { |tag, list|
|
126
|
-
if
|
161
|
+
# Check for defined tags (in alphabetical order), if not matched, serialize as generic Attribute:
|
162
|
+
if tag == 'Alias' then
|
163
|
+
list.each { |value|
|
164
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.alias, RDF::Literal.new(value)))
|
165
|
+
}
|
166
|
+
elsif tag == 'Dbxref' then
|
167
|
+
feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
168
|
+
list.each { |value|
|
169
|
+
begin
|
170
|
+
if value.match(/^dbSNP(_\d+)?:rs\d+$/) then
|
171
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::URI.new("http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=#{value.split(/:/)[1].sub(/^rs/, '')}")))
|
172
|
+
elsif value.match(/^COSMIC(_\d+)?:COSM\d+$/) then
|
173
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::URI.new("http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=#{value.split(/:/)[1].sub(/^COSM/, '')}")))
|
174
|
+
else
|
175
|
+
abbreviation, id = value.split(':', 2)
|
176
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::URI.new(BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + id)))
|
177
|
+
end
|
178
|
+
rescue NoMethodError
|
179
|
+
raise BioInterchange::Exceptions::InputFormatError, 'Attribute Dbxref link-out is not resolvable, i.e. the name cannot be turned into an URL.'
|
180
|
+
end
|
181
|
+
}
|
182
|
+
elsif tag == 'Derives_from' and @base == BioInterchange::GFF3O then
|
183
|
+
list.each { |value|
|
184
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.derives_from, RDF::URI.new("#{set_uri.to_s}/feature/#{value}")))
|
185
|
+
}
|
186
|
+
elsif tag == 'Gap' and @base == BioInterchange::GFF3O then
|
187
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.gap, RDF::Literal.new(list.join(','))))
|
188
|
+
elsif tag == 'ID' then
|
189
|
+
# Do nothing. The feature ID is the URI of the feature. It is not relevant as information anymore.
|
190
|
+
elsif tag == 'Is_circular' and @base == BioInterchange::GFF3O then
|
191
|
+
value = list.join(',')
|
192
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.is_circular, true)) if value == 'true'
|
193
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.is_circular, false)) if value == 'false'
|
194
|
+
# TODO Report invalid value.
|
195
|
+
elsif tag == 'Name' and @base == BioInterchange::GFF3O then
|
196
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.name, RDF::Literal.new(list.join(','))))
|
197
|
+
elsif tag == 'Note' and @base == BioInterchange::GFF3O then
|
198
|
+
list.each { |value|
|
199
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.note, RDF::Literal.new(value)))
|
200
|
+
}
|
201
|
+
elsif tag == 'Ontology_term' and @base == BioInterchange::GFF3O then
|
202
|
+
list.each { |value|
|
203
|
+
# TODO Sanitize values that are either not in GO xrf_abbs or need conversion to match
|
204
|
+
# their associated Ruby method.
|
205
|
+
namespace, accession = value.split(/:/, 2)
|
206
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.ontology_term, RDF::URI.new("#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}")))
|
207
|
+
}
|
208
|
+
elsif tag == 'Parent' then
|
127
209
|
list.each { |parent_id|
|
128
210
|
graph.insert(RDF::Statement.new(feature_uri, @base.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}")))
|
129
211
|
}
|
212
|
+
elsif tag == 'Reference_seq' then
|
213
|
+
list.each { |value|
|
214
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.reference_seq, RDF::Literal.new(value)))
|
215
|
+
}
|
216
|
+
elsif tag == 'Target' then
|
217
|
+
target_id, start_coordinate, end_coordinate, strand = list.join(',').split(/\s+/, 4)
|
218
|
+
target_datatype_properties = @base.target_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
|
219
|
+
target_object_properties = @base.target_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
220
|
+
target_uri = RDF::URI.new("#{feature_uri.to_s}/target/#{target_id}")
|
221
|
+
graph.insert(RDF::Statement.new(target_uri, RDF.type, @base.Target))
|
222
|
+
graph.insert(RDF::Statement.new(target_uri, @base.target_id, RDF::Literal.new(target_id)))
|
223
|
+
graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.start ].flatten, target_datatype_properties)[0], RDF::Literal.new(start_coordinate.to_i)))
|
224
|
+
graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.end ].flatten, target_datatype_properties)[0], RDF::Literal.new(end_coordinate.to_i)))
|
225
|
+
graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.end ].flatten, target_object_properties)[0], @base.Positive)) if strand and strand == '+'
|
226
|
+
graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.end ].flatten, target_object_properties)[0], @base.Negative)) if strand and strand == '-'
|
227
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.target, target_uri))
|
228
|
+
elsif tag == 'Variant_seq' and @base == BioInterchange::GVF1O then
|
229
|
+
serialize_variant_seqs(graph, set_uri, feature_uri, list)
|
130
230
|
else
|
231
|
+
# TODO Report unknown upper case letters here? That would be a spec. validation...
|
232
|
+
# Well, or it would show that this implementation is incomplete. Could be either.
|
233
|
+
attribute_properties = @base.attribute_properties
|
234
|
+
attribute_properties = attribute_properties.select { |uri| @base.is_datatype_property?(uri) }[0] if attribute_properties.kind_of?(Array)
|
235
|
+
feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
131
236
|
list.each_index { |index|
|
132
237
|
value = list[index]
|
133
238
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
|
134
239
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
|
135
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.attributes, attribute_uri))
|
240
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.attributes ].flatten, feature_properties)[0], attribute_uri))
|
136
241
|
graph.insert(RDF::Statement.new(attribute_uri, RDF.type, @base.Attribute))
|
137
|
-
graph.insert(RDF::Statement.new(attribute_uri, @base.tag, RDF::Literal.new("#{tag}")))
|
242
|
+
graph.insert(RDF::Statement.new(attribute_uri, @base.with_parent([ @base.tag ].flatten, attribute_properties)[0], RDF::Literal.new("#{tag}")))
|
138
243
|
graph.insert(RDF::Statement.new(attribute_uri, RDF.value, RDF::Literal.new(value)))
|
139
244
|
}
|
140
245
|
end
|
141
246
|
}
|
142
247
|
end
|
143
248
|
|
249
|
+
# Serializes a structured attribute (given as a pragma statement), which later
|
250
|
+
# can be referred to from feature instances.
|
251
|
+
#
|
252
|
+
# +graph+:: RDF graph to which the structured attribute is added
|
253
|
+
# +set_uri+:: the feature set URI to which the structured attribute belongs
|
254
|
+
# +pragma+:: a map that encapsulates the structured attribute data
|
255
|
+
def serialize_structured_attribute(graph, set_uri, pragma)
|
256
|
+
attribute_uri = RDF::URI.new("#{set_uri.to_s}/structured_attribute/#{pragma.object_id}")
|
257
|
+
attributes = nil
|
258
|
+
class_type = nil
|
259
|
+
if pragma.has_key?('attribute-method') then
|
260
|
+
attributes = pragma['attribute-method'][0]
|
261
|
+
class_type = @base.Method
|
262
|
+
elsif pragma.has_key?('data-source') then
|
263
|
+
attributes = pragma['data-source'][0]
|
264
|
+
class_type = @base.DataSource
|
265
|
+
elsif pragma.has_key?('score-method') then
|
266
|
+
attributes = pragma['score-method'][0]
|
267
|
+
class_type = @base.Method
|
268
|
+
elsif pragma.has_key?('source-method') then
|
269
|
+
attributes = pragma['source-method'][0]
|
270
|
+
class_type = @base.Method
|
271
|
+
elsif pragma.has_key?('technology-platform') then
|
272
|
+
attributes = pragma['technology-platform'][0]
|
273
|
+
class_type = @base.TechnologyPlatform
|
274
|
+
else
|
275
|
+
# TODO Error.
|
276
|
+
end
|
277
|
+
graph.insert(RDF::Statement.new(attribute_uri, RDF.type, class_type))
|
278
|
+
if class_type == @base.DataSource and attributes.has_key?('Data_type') then
|
279
|
+
data_type_individual = nil
|
280
|
+
attributes['Data_type'] = attributes['Data_type'][0] # TODO Make sure array is of length 1.
|
281
|
+
if attributes['Data_type'] == 'Array_CGH' then
|
282
|
+
data_type_individual = @base.ArrayComparativeGenomicHybridization
|
283
|
+
elsif attributes['Data_type'] == 'DNA_microarray' then
|
284
|
+
data_type_individual = @base.DNAMicroarray
|
285
|
+
elsif attributes['Data_type'] == 'DNA_sequence' then
|
286
|
+
data_type_individual = @base.DNASequence
|
287
|
+
elsif attributes['Data_type'] == 'RNA_sequence' then
|
288
|
+
data_type_individual = @base.RNASequence
|
289
|
+
else
|
290
|
+
# TODO Error.
|
291
|
+
end
|
292
|
+
graph.insert(RDF::Statement.new(attribute_uri, @base.data_type, data_type_individual))
|
293
|
+
elsif class_type == @base.TechnologyPlatform then
|
294
|
+
if attributes.has_key?('Average_coverage') then
|
295
|
+
graph.insert(RDF::Statement.new(attribute_uri, @base.average_coverage, RDF::Literal.new(attributes['Average_coverage'][0].to_i)))
|
296
|
+
end
|
297
|
+
if attributes.has_key?('Platform_class') then
|
298
|
+
graph.insert(RDF::Statement.new(attribute_uri, @base.platform_class, RDF::Literal.new(attributes['Platform_class'][0])))
|
299
|
+
end
|
300
|
+
if attributes.has_key?('Platform_name') then
|
301
|
+
graph.insert(RDF::Statement.new(attribute_uri, @base.platform_name, RDF::Literal.new(attributes['Platform_name'][0])))
|
302
|
+
end
|
303
|
+
if attributes.has_key?('Read_length') then
|
304
|
+
graph.insert(RDF::Statement.new(attribute_uri, @base.read_length, RDF::Literal.new(attributes['Read_length'][0].to_i)))
|
305
|
+
end
|
306
|
+
if attributes.has_key?('Read_pair_span') then
|
307
|
+
graph.insert(RDF::Statement.new(attribute_uri, @base.read_pair_span, RDF::Literal.new(attributes['Read_pair_span'][0].to_i)))
|
308
|
+
end
|
309
|
+
if attributes.has_key?('Read_type') then
|
310
|
+
read_type_individual = nil
|
311
|
+
attributes['Read_type'] = attributes['Read_type'][0] # TODO Make sure array is of length 1.
|
312
|
+
if attributes['Read_type'] == 'fragment' then
|
313
|
+
read_type_individual = @base.Fragment
|
314
|
+
elsif attributes['Read_type'] == 'pair' then
|
315
|
+
read_type_individual = @base.Pair
|
316
|
+
else
|
317
|
+
# TODO Error.
|
318
|
+
end
|
319
|
+
graph.insert(RDF::Statement.new(attribute_uri, @base.read_type, read_type_individual))
|
320
|
+
end
|
321
|
+
end
|
322
|
+
structuredpragma_properties = @base.structuredpragma_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
323
|
+
attributes.keys.each { |tag|
|
324
|
+
if tag.match(/^[a-z]/) then
|
325
|
+
custom_attribute_uri = RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}")
|
326
|
+
graph.insert(RDF::Statement.new(custom_attribute_uri, RDF.type, @base.StructuredAttribute))
|
327
|
+
graph.insert(RDF::Statement.new(custom_attribute_uri, @base.with_parent([ @base.tag ].flatten, @base.structuredattribute_properties)[0], tag))
|
328
|
+
graph.insert(RDF::Statement.new(custom_attribute_uri, RDF.value, RDF::Literal.new(attributes[tag].join(','))))
|
329
|
+
graph.insert(RDF::Statement.new(attribute_uri, @base.with_parent([ @base.attributes ].flatten, structuredpragma_properties)[0], custom_attribute_uri))
|
330
|
+
end
|
331
|
+
}
|
332
|
+
end
|
333
|
+
|
334
|
+
# Serializes the variant sequences of a feature. Each sequence is attached to
# its own variant resource, whose URI is built from the feature URI and the
# position of the sequence within the list.
#
# +graph+:: RDF graph to which the variant statements are added
# +set_uri+:: the feature set URI to which the feature belongs (not used by this method)
# +feature_uri+:: the URI of the feature that is annotated with variant data
# +list+:: list of variant values
def serialize_variant_seqs(graph, set_uri, feature_uri, list)
  list.each_with_index do |sequence, position|
    variant_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{position}")
    variant_key = variant_uri.to_s
    # @variants records the variant URIs that have been typed already, so the
    # type statement is written only once per variant.
    unless @variants.has_key?(variant_key)
      graph.insert(RDF::Statement.new(variant_uri, RDF.type, @base.Variant))
    end
    @variants[variant_key] = true
    graph.insert(RDF::Statement.new(variant_uri, @base.variant_seq, RDF::Literal.new(sequence)))
  end
end
|
144
349
|
end
|
145
350
|
|
146
351
|
end
|