biointerchange 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +17 -0
- data/VERSION +1 -1
- data/generators/GOxrefify.rb +41 -0
- data/generators/rdfxml.rb +6 -4
- data/lib/biointerchange/core.rb +94 -20
- data/lib/biointerchange/genomics/gff3_feature_set.rb +11 -3
- data/lib/biointerchange/genomics/gff3_pragmas.rb +3 -3
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +217 -12
- data/lib/biointerchange/genomics/gff3_reader.rb +78 -20
- data/lib/biointerchange/genomics/gvf_reader.rb +9 -3
- data/lib/biointerchange/gff3o.rb +69 -55
- data/lib/biointerchange/goxref.rb +867 -0
- data/lib/biointerchange/gvf1o.rb +546 -82
- data/lib/biointerchange/textmining/text_mining_reader.rb +9 -0
- data/spec/gff3_rdfwriter_spec.rb +1 -1
- data/spec/gvf_rdfwriter_spec.rb +1 -1
- data/spec/text_mining_pdfx_xml_reader_spec.rb +3 -0
- data/spec/text_mining_pubannos_json_reader_spec.rb +4 -1
- data/supplemental/java/biointerchange/pom.xml +1 -1
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GFF3O.java +93 -125
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GVF1O.java +304 -205
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SIO.java +4044 -4290
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SOFA.java +3 -3
- data/supplemental/python/biointerchange/gff3o.py +1 -89
- data/supplemental/python/biointerchange/gvf1o.py +129 -147
- data/supplemental/python/biointerchange/sio.py +817 -46
- data/supplemental/python/biointerchange/sofa.py +543 -543
- data/supplemental/python/setup.py +1 -1
- data/web/ontologies.html +1 -3
- metadata +7 -2
data/README.md
CHANGED
@@ -274,6 +274,12 @@ Building a new version of the Ruby vocabulary classes for GFF3, SIO, SOFA (requi
|
|
274
274
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-sofa> SOFA >> lib/biointerchange/sofa.rb
|
275
275
|
echo -e "\nend" >> lib/biointerchange/sofa.rb
|
276
276
|
|
277
|
+
A Gene Ontology external reference (GOxref) vocabulary can be created by directly downloading the latest version of `GO.xrf_abbs`:
|
278
|
+
|
279
|
+
echo -e "module BioInterchange\n" > lib/biointerchange/goxref.rb
|
280
|
+
curl ftp://ftp.geneontology.org/pub/go/doc/GO.xrf_abbs | ruby generators/GOxrefify.rb
|
281
|
+
echo -e "\nend" >> lib/biointerchange/goxref.rb
|
282
|
+
|
277
283
|
#### Python Vocabulary Classes
|
278
284
|
|
279
285
|
The source-code generation can be skipped, if none of the ontologies that are used by BioInterchange have been changed. Otherwise, the existing Python vocabulary class wrappers can be generated as follows:
|
@@ -282,6 +288,7 @@ The source-code generation can be skipped, if none of the ontologies that are us
|
|
282
288
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-gvf1o> GVF1O | ruby generators/pythonify.rb > supplemental/python/biointerchange/gvf1o.py
|
283
289
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-sio> SIO | ruby generators/pythonify.rb > supplemental/python/biointerchange/sio.py
|
284
290
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-sofa> SOFA | ruby generators/pythonify.rb > supplemental/python/biointerchange/sofa.py
|
291
|
+
curl ftp://ftp.geneontology.org/pub/go/doc/GO.xrf_abbs | ruby generators/GOxrefify.rb | ruby generators/pythonify.rb > supplemental/python/biointerchange/goxref.py
|
285
292
|
|
286
293
|
Generate the BioInterchange Python vocabulary egg:
|
287
294
|
|
@@ -302,6 +309,7 @@ The source-code generation can be skipped, if none of the ontologies that are us
|
|
302
309
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-gvf1o> GVF1O | ruby generators/javaify.rb > supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GVF1O.java
|
303
310
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-sio> SIO | ruby generators/javaify.rb > supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SIO.java
|
304
311
|
ruby generators/rdfxml.rb <path-to-rdf/xml-version-of-sofa> SOFA | ruby generators/javaify.rb "http://purl.obolibrary.org/obo/" > supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SOFA.java
|
312
|
+
curl ftp://ftp.geneontology.org/pub/go/doc/GO.xrf_abbs | ruby generators/GOxrefify.rb | ruby generators/javaify.rb > supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GOXRef.java
|
305
313
|
|
306
314
|
Generate the BioInterchange Java vocabulary artifact:
|
307
315
|
|
@@ -346,6 +354,15 @@ A more verbose is produced by calling `rspec` directly:
|
|
346
354
|
|
347
355
|
bundle exec rake rdoc
|
348
356
|
|
357
|
+
### Deploying on Rubygems
|
358
|
+
|
359
|
+
_Note:_ Only BioInterchange package maintainers can deploy the 'biointerchange' gem on Rubygems.
|
360
|
+
|
361
|
+
bundle exec rake version:bump:(major | minor | patch)
|
362
|
+
bundle exec rake gemspec
|
363
|
+
bundle exec gem build biointerchange.gemspec
|
364
|
+
bundle exec gem push biointerchange-VERSION.gem
|
365
|
+
|
349
366
|
### Troubleshooting
|
350
367
|
|
351
368
|
#### GCC: No such file or directory
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
def record(id, description, uri)
|
4
|
+
puts " # Returns the link-out URI for objects of \"#{description}\"."
|
5
|
+
puts " def self.#{id}"
|
6
|
+
puts " RDF::URI.new(\"#{uri}\")"
|
7
|
+
puts ' end'
|
8
|
+
puts ''
|
9
|
+
end
|
10
|
+
|
11
|
+
puts 'class GOXRef'
|
12
|
+
puts ''
|
13
|
+
|
14
|
+
in_record = false
|
15
|
+
|
16
|
+
id = nil
|
17
|
+
description = nil
|
18
|
+
uri = nil
|
19
|
+
|
20
|
+
STDIN.each { |line|
|
21
|
+
line.chomp!
|
22
|
+
|
23
|
+
if line.empty? then
|
24
|
+
record(id, description, uri) if uri and not uri.match(/\[.*\]/)
|
25
|
+
uri = nil
|
26
|
+
in_record = false
|
27
|
+
end
|
28
|
+
|
29
|
+
if line.start_with?('abbreviation:') and not in_record then
|
30
|
+
id = line.sub(/^abbreviation: /, '').gsub(/[-\/]/, '_')
|
31
|
+
in_record = true
|
32
|
+
end
|
33
|
+
|
34
|
+
description = line.sub(/^database: /, '') if line.start_with?('database:') and in_record
|
35
|
+
uri = line.sub(/^url_syntax: /, '').sub(/\[example_id\]$/, '') if line.start_with?('url_syntax:') and in_record
|
36
|
+
}
|
37
|
+
|
38
|
+
record(id, description, uri) if uri
|
39
|
+
|
40
|
+
puts 'end'
|
41
|
+
|
data/generators/rdfxml.rb
CHANGED
@@ -16,8 +16,10 @@ OBO_DEF = RDF::URI.new('http://purl.obolibrary.org/obo/def')
|
|
16
16
|
# For handling synonyms in SIO:
|
17
17
|
SIO_SYN = RDF::URI.new('http://semanticscience.org/resource/synonym')
|
18
18
|
|
19
|
-
|
20
|
-
|
19
|
+
# This label conversion also appears in:
|
20
|
+
# +lib/biointerchange/core.rb+
|
21
|
+
def make_safe_label(label)
|
22
|
+
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/, "a_#{$1}").gsub(/^_+|_+$/, '').gsub(/_+/, '_')
|
21
23
|
end
|
22
24
|
|
23
25
|
reader = RDF::RDFXML::Reader.open(ARGV[0])
|
@@ -55,7 +57,7 @@ model.keys.each { |key|
|
|
55
57
|
next unless type
|
56
58
|
|
57
59
|
label = entry[RDF::RDFS.label].to_s
|
58
|
-
next if
|
60
|
+
next if make_safe_label(label).empty?
|
59
61
|
uri = key.to_s
|
60
62
|
|
61
63
|
# Only deal with URI sub-classes/sub-properties, whilst ignoring restrictions, etc.
|
@@ -84,7 +86,7 @@ model.keys.each { |key|
|
|
84
86
|
set = combined_uris[label_or_synonym]
|
85
87
|
set = [] unless set
|
86
88
|
combined_uris[label_or_synonym] = set | [ uri ]
|
87
|
-
generated_labels[label_or_synonym] =
|
89
|
+
generated_labels[label_or_synonym] = make_safe_label(label_or_synonym)
|
88
90
|
}
|
89
91
|
|
90
92
|
object_properties[uri] = true if type == RDF::OWL.ObjectProperty
|
data/lib/biointerchange/core.rb
CHANGED
@@ -5,11 +5,23 @@
|
|
5
5
|
# of it as a gem in your own Ruby implementation.
|
6
6
|
module BioInterchange
|
7
7
|
|
8
|
+
### Global behaviour settings, which can be altered programmatically or via the CLI:
|
9
|
+
|
10
|
+
# If true, then RDF::Graph's "insert" function will be overwritten so that it
|
11
|
+
# immediately outputs N-Triples. This reduces memory requirements (since no RDF
|
12
|
+
# graph is kept in memory) and improves performance (since no looping through an RDF graph
|
13
|
+
# is necessary).
|
14
|
+
@@skip_rdf_graph = true
|
15
|
+
def self.skip_rdf_graph
|
16
|
+
@@skip_rdf_graph
|
17
|
+
end
|
18
|
+
|
8
19
|
# Custom Exceptions and Errors
|
9
20
|
require 'biointerchange/exceptions'
|
10
21
|
|
11
22
|
# Ontologies (besides the ones from the 'rdf' gem)
|
12
23
|
require 'biointerchange/gff3o'
|
24
|
+
require 'biointerchange/goxref'
|
13
25
|
require 'biointerchange/gvf1o'
|
14
26
|
require 'biointerchange/sio'
|
15
27
|
require 'biointerchange/sofa'
|
@@ -78,14 +90,17 @@ module BioInterchange
|
|
78
90
|
opt = Getopt::Long.getopts(
|
79
91
|
["--help", "-h", Getopt::BOOLEAN],
|
80
92
|
["--debug", "-d", Getopt::BOOLEAN], # set debug mode => print stack traces
|
93
|
+
["--no_rdf_graph_optimization", "-n", Getopt::BOOLEAN], # set self.skip_rdf_graph to false
|
94
|
+
["--batchsize", "-b", Getopt::OPTIONAL], # batchsize for readers/writers that support +postpone?+
|
81
95
|
["--input", "-i", Getopt::REQUIRED], # input file format
|
82
96
|
["--rdf", "-r", Getopt::REQUIRED], # output file format
|
83
|
-
["--
|
84
|
-
["--
|
85
|
-
["--
|
86
|
-
["--
|
97
|
+
["--annotate_name", Getopt::OPTIONAL], # name of resourcce/tool/person
|
98
|
+
["--annotate_name_id", Getopt::OPTIONAL], # uri of resource/tool/person
|
99
|
+
["--annotate_date", Getopt::OPTIONAL], # date of processing/annotation
|
100
|
+
["--annotate_version", Getopt::OPTIONAL], # version number of resource
|
87
101
|
["--file", "-f", Getopt::OPTIONAL], # file to read, will read from STDIN if not supplied
|
88
|
-
["--out", "-o", Getopt::OPTIONAL] # output file, will out to STDOUT if not supplied
|
102
|
+
["--out", "-o", Getopt::OPTIONAL], # output file, will out to STDOUT if not supplied
|
103
|
+
["--version", "-v", Getopt::OPTIONAL] # output the version number of the gem and exit
|
89
104
|
)
|
90
105
|
|
91
106
|
if opt['help'] or not opt['input'] or not opt['rdf'] then
|
@@ -115,26 +130,38 @@ module BioInterchange
|
|
115
130
|
puts ' Input: dbcls.catanns.json, uk.ac.man.pdfx'
|
116
131
|
puts ' Output: rdf.bh12.sio'
|
117
132
|
puts ' Options:'
|
118
|
-
puts '
|
119
|
-
puts '
|
120
|
-
puts ' --
|
121
|
-
puts ' --
|
133
|
+
puts ' --annotate_date <date> : date of processing/annotation (optional)'
|
134
|
+
puts ' --annotate_version <version> : version number of resource (optional)'
|
135
|
+
puts ' --annotate_name <name> : name of resource/tool/person (required)'
|
136
|
+
puts ' --annotate_name_id <id> : URI of resource/tool/person (required)'
|
122
137
|
puts ''
|
123
138
|
puts 'Input-/RDF-format specific options:'
|
124
139
|
puts ' Input: biointerchange.gff3 or biointerchange.gvf'
|
125
140
|
puts ' Output: rdf.biointerchange.gff3 or rdf.biointerchange.gvf'
|
126
141
|
puts ' Options:'
|
142
|
+
puts ' -b <size>/--batchsize <size> : process features in batches of the given size (optional)'
|
127
143
|
puts ' -t <date>/--date <date> : date when the GFF3/GVF file was created (optional)'
|
128
144
|
puts ' --name <name> : name of the GFF3/GVF file creator (optional)'
|
129
145
|
puts ' --name_id <id> : email address of the GFF3/GVF file creator (optional)'
|
130
146
|
puts ''
|
131
147
|
puts 'Other options:'
|
148
|
+
puts ' -v / --version : print the Gem\'s version number and exit'
|
132
149
|
puts ' -d / --debug : turn on debugging output (for stacktraces)'
|
133
150
|
puts ' -h --help : this message'
|
134
151
|
|
135
152
|
exit 1
|
136
153
|
end
|
137
154
|
|
155
|
+
# Print version number and exit:
|
156
|
+
if opt['version'] then
|
157
|
+
puts 'BioInterchange 0.1.4'
|
158
|
+
exit
|
159
|
+
end
|
160
|
+
|
161
|
+
# Turn off optimization, if requested. This will generate an RDF graph in memory and
|
162
|
+
# at least double memory requirements and runtime.
|
163
|
+
@@skip_rdf_graph = false if opt['no_rdf_graph_optimization']
|
164
|
+
|
138
165
|
# Check if the input/rdf options are supported:
|
139
166
|
if opt['input'] == 'dbcls.catanns.json' or opt['input'] == 'uk.ac.man.pdfx' then
|
140
167
|
if opt['rdf'] == 'rdf.bh12.sio' then
|
@@ -158,27 +185,27 @@ module BioInterchange
|
|
158
185
|
unsupported_combination
|
159
186
|
end
|
160
187
|
|
161
|
-
opt['
|
162
|
-
|
163
|
-
|
188
|
+
wrong_type('batchsize', 'a positive integer') if opt['batchsize'] and not opt['batchsize'].match(/^[1-9][0-9]*$/)
|
189
|
+
|
190
|
+
opt['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
|
191
|
+
|
164
192
|
# Generate model from file (deserialization).
|
165
193
|
# Note: if-clauses are lexicographically ordered.
|
166
194
|
reader = nil
|
167
195
|
if opt['input'] == 'biointerchange.gff3' then
|
168
|
-
reader = BioInterchange::Genomics::GFF3Reader.new(opt['
|
196
|
+
reader = BioInterchange::Genomics::GFF3Reader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
|
169
197
|
elsif opt['input'] == 'biointerchange.gvf' then
|
170
|
-
reader = BioInterchange::Genomics::GVFReader.new(opt['
|
198
|
+
reader = BioInterchange::Genomics::GVFReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
|
171
199
|
elsif opt['input'] == 'dbcls.catanns.json' then
|
172
|
-
reader = BioInterchange::TextMining::PubannosJsonReader.new(opt['
|
200
|
+
reader = BioInterchange::TextMining::PubannosJsonReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
|
173
201
|
elsif opt['input'] == 'uk.ac.man.pdfx' then
|
174
|
-
reader = BioInterchange::TextMining::PdfxXmlReader.new(opt['
|
202
|
+
reader = BioInterchange::TextMining::PdfxXmlReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['annotate_version'])
|
175
203
|
end
|
176
204
|
|
177
|
-
model = nil
|
178
205
|
if opt["file"]
|
179
|
-
|
206
|
+
input_source = File.new(opt["file"],'r')
|
180
207
|
else
|
181
|
-
|
208
|
+
input_source = STDIN
|
182
209
|
end
|
183
210
|
|
184
211
|
# Generate rdf from model (serialization).
|
@@ -193,7 +220,10 @@ module BioInterchange
|
|
193
220
|
writer = BioInterchange::Genomics::RDFWriter.new(STDOUT) unless opt['out']
|
194
221
|
end
|
195
222
|
|
196
|
-
|
223
|
+
begin
|
224
|
+
model = reader.deserialize(input_source)
|
225
|
+
writer.serialize(model)
|
226
|
+
end while reader.postponed?
|
197
227
|
|
198
228
|
rescue ArgumentError => e
|
199
229
|
$stderr.puts e.message
|
@@ -228,11 +258,55 @@ module BioInterchange
|
|
228
258
|
}
|
229
259
|
end
|
230
260
|
|
261
|
+
# Returns a "safe" version of a label that can be used as a Ruby method name.
|
262
|
+
#
|
263
|
+
# +label+:: string that should be converted into a "safe" string that can be used as a Ruby method name
|
264
|
+
def self.make_safe_label(label)
|
265
|
+
label.gsub(/[ '-.<>\/]/, '_').gsub(/\([^\)]*?\)/, '').sub(/^(\d+)/, "a_#{$1}").gsub(/^_+|_+$/, '').gsub(/_+/, '_')
|
266
|
+
end
|
267
|
+
|
231
268
|
private
|
232
269
|
|
233
270
|
def self.unsupported_combination
|
234
271
|
raise ArgumentError, 'This input/output format combination is not supported.'
|
235
272
|
end
|
236
273
|
|
274
|
+
def self.wrong_type(parameter, expected_type)
|
275
|
+
raise ArgumentError, "The parameter '#{parameter}' needs to be #{expected_type}."
|
276
|
+
end
|
277
|
+
|
278
|
+
end
|
279
|
+
|
280
|
+
# Overwrite RDF::Graph implementation, in case we do not want to keep
|
281
|
+
# the complete graph in memory. If the implementing writer does not
|
282
|
+
# set an output stream via +fast_ostream+, then fall back to the original
|
283
|
+
# implementation.
|
284
|
+
module RDF
|
285
|
+
|
286
|
+
class Graph
|
287
|
+
# DO NOT keep old insert implementation due to infinite recursion caused by module loading dependencies!
|
288
|
+
# alias_method :graph_building_insert, :insert
|
289
|
+
|
290
|
+
# Set an output stream for writing in +insert+.
|
291
|
+
#
|
292
|
+
# +ostream+:: Output stream that is populated by +insert+, if optimization can be carried out.
|
293
|
+
def fast_ostream(ostream)
|
294
|
+
@ostream = ostream
|
295
|
+
end
|
296
|
+
|
297
|
+
# Alternative implementation to +insert+, which can immediately output N-Triples instead
|
298
|
+
# of building an in-memory graph first.
|
299
|
+
#
|
300
|
+
# +statement+:: RDF statement that should be serialized.
|
301
|
+
def insert(statement)
|
302
|
+
if BioInterchange::skip_rdf_graph and @ostream then
|
303
|
+
@ostream.puts(statement.to_ntriples)
|
304
|
+
else
|
305
|
+
insert_statement(statement)
|
306
|
+
end
|
307
|
+
end
|
308
|
+
|
309
|
+
end
|
310
|
+
|
237
311
|
end
|
238
312
|
|
@@ -38,11 +38,11 @@ class GFF3FeatureSet
|
|
38
38
|
@pragmas.keys
|
39
39
|
end
|
40
40
|
|
41
|
-
# Returns an URI for this particular feature set, which is a SHA1 hash over the
|
41
|
+
# Returns a URI for this particular feature set, which contains a SHA1 hash over the concatenated pragma names and values.
|
42
42
|
def uri
|
43
43
|
clob = ''
|
44
|
-
|
45
|
-
clob << "#{
|
44
|
+
pragmas.each { |pragma_name|
|
45
|
+
clob << "#{pragma_name}\t#{pragma(pragma_name).to_s}\n"
|
46
46
|
}
|
47
47
|
"biointerchange://gff3/featureset/self/#{Digest::SHA1.hexdigest(clob)}"
|
48
48
|
end
|
@@ -62,6 +62,14 @@ class GFF3FeatureSet
|
|
62
62
|
# TODO Should throw exception if name is not a string.
|
63
63
|
@pragmas[name] = value
|
64
64
|
end
|
65
|
+
|
66
|
+
# Removes all features from the set, but keeps the pragmas. This enables
|
67
|
+
# batched processing, since the URI for the set is only determined by the
|
68
|
+
# pragma statement contents.
|
69
|
+
def prune
|
70
|
+
@set.clear
|
71
|
+
end
|
72
|
+
|
65
73
|
end
|
66
74
|
|
67
75
|
end
|
@@ -1,15 +1,15 @@
|
|
1
1
|
|
2
2
|
module BioInterchange::Genomics
|
3
3
|
|
4
|
-
# Represents a named region, which is defined by the pragma statement 'sequence-region'.
|
5
|
-
class
|
4
|
+
# Represents a named region, a.k.a. landmark, which is defined by the pragma statement 'sequence-region'.
|
5
|
+
class GFF3Landmark
|
6
6
|
|
7
7
|
# Create a new instance of a named region.
|
8
8
|
#
|
9
9
|
# +seqid+:: unique identifier (in the GFF3 file context) that identifies this region
|
10
10
|
# +start_coordinate+:: genomic start coordinate of the region
|
11
11
|
# +end_coordinate+:: genomic end coordinate of the region
|
12
|
-
def initialize(seqid, start_coordinate, end_coordinate)
|
12
|
+
def initialize(seqid, start_coordinate = nil, end_coordinate = nil)
|
13
13
|
@seqid = seqid
|
14
14
|
@start_coordinate = start_coordinate
|
15
15
|
@end_coordinate = end_coordinate
|
@@ -46,7 +46,16 @@ protected
|
|
46
46
|
#
|
47
47
|
# +model+:: an instance of +BioInterchange::Genomics::GFF3FeatureSet+
|
48
48
|
def serialize_model(model)
|
49
|
+
# We record landmarks, because they can either be written when their "##sequence-region"
|
50
|
+
# pragma statement appears, or otherwise, when the first feature with said landmark is
|
51
|
+
# being serialized.
|
52
|
+
@landmarks = {}
|
53
|
+
|
54
|
+
# Record written variants in order to avoid writing out RDF.type multiple times.
|
55
|
+
@variants = {}
|
56
|
+
|
49
57
|
graph = RDF::Graph.new
|
58
|
+
graph.fast_ostream(@ostream) if BioInterchange::skip_rdf_graph
|
50
59
|
set_uri = RDF::URI.new(model.uri)
|
51
60
|
graph.insert(RDF::Statement.new(set_uri, RDF.type, @base.Set))
|
52
61
|
model.pragmas.each { |pragma_name|
|
@@ -57,21 +66,32 @@ protected
|
|
57
66
|
}
|
58
67
|
RDF::NTriples::Writer.dump(graph, @ostream)
|
59
68
|
# TODO Figure out why the following is very slow. Use with 'rdf-raptor'.
|
69
|
+
# Having said that, Jena's rdfcat is very good for converting formats
|
70
|
+
# anyway, so perhaps it is not worth investigating the following.
|
60
71
|
# RDF::RDFXML::Writer.dump(graph, @ostream)
|
61
72
|
end
|
62
73
|
|
63
74
|
# Serializes pragmas for a given feature set URI.
|
75
|
+
#
|
64
76
|
# +graph+:: RDF graph to which the pragmas are added
|
65
77
|
# +set_uri+:: the feature set URI to which the pragmas belong
|
66
78
|
# +pragma+:: an object representing a pragma statement
|
67
79
|
def serialize_pragma(graph, set_uri, pragma)
|
68
80
|
if pragma.kind_of?(Hash) then
|
69
|
-
if pragma.has_key?('
|
81
|
+
if (pragma.has_key?('attribute-method') or pragma.has_key?('data-source') or pragma.has_key?('score-method') or pragma.has_key?('source-method') or pragma.has_key?('technology-platform')) and @base == BioInterchange::GVF1O then
|
82
|
+
serialize_structured_attribute(graph, set_uri, pragma)
|
83
|
+
elsif pragma.has_key?('gff-version') and @base == BioInterchange::GFF3O then
|
70
84
|
graph.insert(RDF::Statement.new(set_uri, @base.version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
|
71
85
|
elsif pragma.has_key?('gff-version') and @base == BioInterchange::GVF1O then
|
72
86
|
graph.insert(RDF::Statement.new(set_uri, @base.gff_version, RDF::Literal.new(pragma['gff-version'], :datatype => RDF::XSD.float )))
|
73
87
|
elsif pragma.has_key?('gvf-version') and @base == BioInterchange::GVF1O then
|
74
88
|
graph.insert(RDF::Statement.new(set_uri, @base.gvf_version, RDF::Literal.new(pragma['gvf-version'], :datatype => RDF::XSD.float )))
|
89
|
+
elsif pragma.has_key?('sequence-region') then
|
90
|
+
pragma['sequence-region'].keys.each { |seqid|
|
91
|
+
serialize_landmark(graph, set_uri, pragma['sequence-region'][seqid])
|
92
|
+
}
|
93
|
+
elsif pragma.has_key?('species') then
|
94
|
+
graph.insert(RDF::Statement.new(set_uri, @base.species, RDF::URI.new(pragma['species'])))
|
75
95
|
end
|
76
96
|
else
|
77
97
|
# TODO
|
@@ -87,17 +107,18 @@ protected
|
|
87
107
|
# TODO Make sure there is only one value in the 'ID' list.
|
88
108
|
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.sequence_id},#{feature.source},#{feature.type.to_s.sub(/^[^:]+:\/\//, '')},#{feature.start_coordinate},#{feature.end_coordinate},#{feature.strand},#{feature.phase}") unless feature.attributes.has_key?('ID')
|
89
109
|
feature_uri = RDF::URI.new("#{set_uri.to_s}/feature/#{feature.attributes['ID'][0]}") if feature.attributes.has_key?('ID')
|
90
|
-
|
110
|
+
feature_datatype_properties = @base.feature_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
|
111
|
+
feature_object_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
91
112
|
graph.insert(RDF::Statement.new(set_uri, @base.contains, feature_uri))
|
92
113
|
graph.insert(RDF::Statement.new(feature_uri, RDF.type, @base.Feature))
|
93
|
-
graph
|
114
|
+
serialize_landmark(graph, set_uri, GFF3Landmark.new(feature.sequence_id)) unless @landmarks.has_key?(feature.sequence_id)
|
115
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.seqid ].flatten, feature_object_properties)[0], @landmarks[feature.sequence_id]))
|
94
116
|
graph.insert(RDF::Statement.new(feature_uri, @base.source, RDF::Literal.new(feature.source)))
|
95
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.type,
|
96
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.start,
|
97
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.end,
|
117
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.type, feature.type))
|
118
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.start, feature_datatype_properties)[0], RDF::Literal.new(feature.start_coordinate)))
|
119
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent(@base.end, feature_datatype_properties)[0], RDF::Literal.new(feature.end_coordinate)))
|
98
120
|
graph.insert(RDF::Statement.new(feature_uri, @base.score, RDF::Literal.new(feature.score))) if feature.score
|
99
|
-
|
100
|
-
strand_uri = @base.with_parent(@base.strand, feature_properties)[0]
|
121
|
+
strand_uri = @base.with_parent(@base.strand, feature_object_properties)[0]
|
101
122
|
case feature.strand
|
102
123
|
when BioInterchange::Genomics::GFF3Feature::NOT_STRANDED
|
103
124
|
graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.NotStranded))
|
@@ -108,13 +129,27 @@ protected
|
|
108
129
|
when BioInterchange::Genomics::GFF3Feature::NEGATIVE
|
109
130
|
graph.insert(RDF::Statement.new(feature_uri, strand_uri, @base.Negative))
|
110
131
|
else
|
111
|
-
raise
|
132
|
+
raise BioInterchange::Exceptions::InputFormatError, 'Strand of feature is set to an unknown constant.'
|
112
133
|
end
|
113
134
|
graph.insert(RDF::Statement.new(feature_uri, @base.phase, RDF::Literal.new(feature.phase))) if feature.phase
|
114
135
|
|
115
136
|
serialize_attributes(graph, set_uri, feature_uri, feature.attributes) unless feature.attributes.keys.empty?
|
116
137
|
end
|
117
138
|
|
139
|
+
# Serializes a genomic feature landmark ("seqid").
|
140
|
+
#
|
141
|
+
# +graph+:: RDF graph to which the landmark is added
|
142
|
+
# +set_uri+:: the feature set URI to which the landmark belongs
|
143
|
+
# +landmark+:: encapsulated landmark data
|
144
|
+
def serialize_landmark(graph, set_uri, landmark)
|
145
|
+
return if @landmarks.has_key?(landmark.seqid)
|
146
|
+
landmark_uri = RDF::URI.new("#{set_uri.to_s}/landmark/#{landmark.seqid}")
|
147
|
+
@landmarks[landmark.seqid] = landmark_uri
|
148
|
+
graph.insert(RDF::Statement.new(landmark_uri, @base.with_parent([ @base.id ].flatten, @base.landmark_properties)[0], RDF::Literal.new(landmark.seqid)))
|
149
|
+
graph.insert(RDF::Statement.new(landmark_uri, @base.with_parent([ @base.start ].flatten, @base.landmark_properties)[0], RDF::Literal.new(landmark.start_coordinate))) if landmark.start_coordinate
|
150
|
+
graph.insert(RDF::Statement.new(landmark_uri, @base.with_parent([ @base.end ].flatten, @base.landmark_properties)[0], RDF::Literal.new(landmark.end_coordinate))) if landmark.end_coordinate
|
151
|
+
end
|
152
|
+
|
118
153
|
# Serializes the attributes of a feature.
|
119
154
|
#
|
120
155
|
# +graph+:: RDF graph to which the feature is added
|
@@ -123,24 +158,194 @@ protected
|
|
123
158
|
# +attributes+:: a map of tag/value pairs
|
124
159
|
def serialize_attributes(graph, set_uri, feature_uri, attributes)
|
125
160
|
attributes.each_pair { |tag, list|
|
126
|
-
if
|
161
|
+
# Check for defined tags (in alphabetical order), if not matched, serialize as generic Attribute:
|
162
|
+
if tag == 'Alias' then
|
163
|
+
list.each { |value|
|
164
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.alias, RDF::Literal.new(value)))
|
165
|
+
}
|
166
|
+
elsif tag == 'Dbxref' then
|
167
|
+
feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
168
|
+
list.each { |value|
|
169
|
+
begin
|
170
|
+
if value.match(/^dbSNP(_\d+)?:rs\d+$/) then
|
171
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::URI.new("http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=#{value.split(/:/)[1].sub(/^rs/, '')}")))
|
172
|
+
elsif value.match(/^COSMIC(_\d+)?:COSM\d+$/) then
|
173
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::URI.new("http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=#{value.split(/:/)[1].sub(/^COSM/, '')}")))
|
174
|
+
else
|
175
|
+
abbreviation, id = value.split(':', 2)
|
176
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.dbxref ].flatten, feature_properties)[0], RDF::URI.new(BioInterchange::GOXRef.send(BioInterchange.make_safe_label(abbreviation)).to_s + id)))
|
177
|
+
end
|
178
|
+
rescue NoMethodError
|
179
|
+
raise BioInterchange::Exceptions::InputFormatError, 'Attribute Dbxref link-out is not resolvable, i.e. the name cannot be turned into an URL.'
|
180
|
+
end
|
181
|
+
}
|
182
|
+
elsif tag == 'Derives_from' and @base == BioInterchange::GFF3O then
|
183
|
+
list.each { |value|
|
184
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.derives_from, RDF::URI.new("#{set_uri.to_s}/feature/#{value}")))
|
185
|
+
}
|
186
|
+
elsif tag == 'Gap' and @base == BioInterchange::GFF3O then
|
187
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.gap, RDF::Literal.new(list.join(','))))
|
188
|
+
elsif tag == 'ID' then
|
189
|
+
# Do nothing. The feature ID is the URI of the feature. It is not relevant as information anymore.
|
190
|
+
elsif tag == 'Is_circular' and @base == BioInterchange::GFF3O then
|
191
|
+
value = list.join(',')
|
192
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.is_circular, true)) if value == 'true'
|
193
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.is_circular, false)) if value == 'false'
|
194
|
+
# TODO Report invalid value.
|
195
|
+
elsif tag == 'Name' and @base == BioInterchange::GFF3O then
|
196
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.name, RDF::Literal.new(list.join(','))))
|
197
|
+
elsif tag == 'Note' and @base == BioInterchange::GFF3O then
|
198
|
+
list.each { |value|
|
199
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.note, RDF::Literal.new(value)))
|
200
|
+
}
|
201
|
+
elsif tag == 'Ontology_term' and @base == BioInterchange::GFF3O then
|
202
|
+
list.each { |value|
|
203
|
+
# TODO Sanitize values that are either not in GO xrf_abbs or need conversion to match
|
204
|
+
# their associated Ruby method.
|
205
|
+
namespace, accession = value.split(/:/, 2)
|
206
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.ontology_term, RDF::URI.new("#{BioInterchange::GOXRef.send(namespace).to_s}#{accession}")))
|
207
|
+
}
|
208
|
+
elsif tag == 'Parent' then
|
127
209
|
list.each { |parent_id|
|
128
210
|
graph.insert(RDF::Statement.new(feature_uri, @base.parent, RDF::URI.new("#{set_uri.to_s}/feature/#{parent_id}")))
|
129
211
|
}
|
212
|
+
elsif tag == 'Reference_seq' then
|
213
|
+
list.each { |value|
|
214
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.reference_seq, RDF::Literal.new(value)))
|
215
|
+
}
|
216
|
+
elsif tag == 'Target' then
|
217
|
+
target_id, start_coordinate, end_coordinate, strand = list.join(',').split(/\s+/, 4)
|
218
|
+
target_datatype_properties = @base.target_properties.select { |uri| @base.is_datatype_property?(uri) }[0]
|
219
|
+
target_object_properties = @base.target_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
220
|
+
target_uri = RDF::URI.new("#{feature_uri.to_s}/target/#{target_id}")
|
221
|
+
graph.insert(RDF::Statement.new(target_uri, RDF.type, @base.Target))
|
222
|
+
graph.insert(RDF::Statement.new(target_uri, @base.target_id, RDF::Literal.new(target_id)))
|
223
|
+
graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.start ].flatten, target_datatype_properties)[0], RDF::Literal.new(start_coordinate.to_i)))
|
224
|
+
graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.end ].flatten, target_datatype_properties)[0], RDF::Literal.new(end_coordinate.to_i)))
|
225
|
+
graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.end ].flatten, target_object_properties)[0], @base.Positive)) if strand and strand == '+'
|
226
|
+
graph.insert(RDF::Statement.new(target_uri, @base.with_parent([ @base.end ].flatten, target_object_properties)[0], @base.Negative)) if strand and strand == '-'
|
227
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.target, target_uri))
|
228
|
+
elsif tag == 'Variant_seq' and @base == BioInterchange::GVF1O then
|
229
|
+
serialize_variant_seqs(graph, set_uri, feature_uri, list)
|
130
230
|
else
|
231
|
+
# TODO Report unknown upper case letters here? That would be a spec. validation...
|
232
|
+
# Well, or it would show that this implementation is incomplete. Could be either.
|
233
|
+
attribute_properties = @base.attribute_properties
|
234
|
+
attribute_properties = attribute_properties.select { |uri| @base.is_datatype_property?(uri) }[0] if attribute_properties.kind_of?(Array)
|
235
|
+
feature_properties = @base.feature_properties.select { |uri| @base.is_object_property?(uri) }[0]
|
131
236
|
list.each_index { |index|
|
132
237
|
value = list[index]
|
133
238
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}") if list.size == 1
|
134
239
|
attribute_uri = RDF::URI.new("#{feature_uri.to_s}/attribute/#{tag}-#{index + 1}") unless list.size == 1
|
135
|
-
graph.insert(RDF::Statement.new(feature_uri, @base.attributes, attribute_uri))
|
240
|
+
graph.insert(RDF::Statement.new(feature_uri, @base.with_parent([ @base.attributes ].flatten, feature_properties)[0], attribute_uri))
|
136
241
|
graph.insert(RDF::Statement.new(attribute_uri, RDF.type, @base.Attribute))
|
137
|
-
graph.insert(RDF::Statement.new(attribute_uri, @base.tag, RDF::Literal.new("#{tag}")))
|
242
|
+
graph.insert(RDF::Statement.new(attribute_uri, @base.with_parent([ @base.tag ].flatten, attribute_properties)[0], RDF::Literal.new("#{tag}")))
|
138
243
|
graph.insert(RDF::Statement.new(attribute_uri, RDF.value, RDF::Literal.new(value)))
|
139
244
|
}
|
140
245
|
end
|
141
246
|
}
|
142
247
|
end
|
143
248
|
|
249
|
+
# Serializes a structured attribute (given as a pragma statement), which later
|
250
|
+
# can be referred to from feature instances.
|
251
|
+
#
|
252
|
+
# +graph+:: RDF graph to which the structured attribute is added
|
253
|
+
# +set_uri+:: the feature set URI to which the structured attribute belongs
|
254
|
+
# +pragma+:: a map that encapsulates the structured attribute data
|
255
|
+
def serialize_structured_attribute(graph, set_uri, pragma)
  # Writes one structured pragma (method, data source or technology platform)
  # into +graph+ as a typed resource, followed by its recognised fields and any
  # custom (lower-case initial) attributes.
  #
  # Raises ArgumentError when +pragma+ contains none of the supported keys.
  # Before, that case inserted a statement with a nil object and then failed
  # with a NoMethodError on nil.
  #
  # The +pragma+ map is only read; it is no longer modified in place.

  # Same precedence as the former if/elsif chain: the first key present wins.
  pragma_name = [ 'attribute-method', 'data-source', 'score-method', 'source-method', 'technology-platform' ].find { |name| pragma.has_key?(name) }
  raise ArgumentError, "Unsupported structured pragma: #{pragma.keys.join(', ')}" unless pragma_name

  # NOTE(review): the URI is keyed on pragma.object_id, which is only meaningful
  # within the running process -- confirm that is what referring features expect.
  attribute_uri = RDF::URI.new("#{set_uri.to_s}/structured_attribute/#{pragma.object_id}")
  attributes = pragma[pragma_name][0]
  class_type = case pragma_name
               when 'data-source' then @base.DataSource
               when 'technology-platform' then @base.TechnologyPlatform
               else @base.Method
               end
  graph.insert(RDF::Statement.new(attribute_uri, RDF.type, class_type))

  case pragma_name
  when 'data-source'
    if attributes.has_key?('Data_type') then
      # TODO Make sure array is of length 1.
      data_type_individual = case Array(attributes['Data_type']).first
                             when 'Array_CGH' then @base.ArrayComparativeGenomicHybridization
                             when 'DNA_microarray' then @base.DNAMicroarray
                             when 'DNA_sequence' then @base.DNASequence
                             when 'RNA_sequence' then @base.RNASequence
                             end
      # An unrecognised data type is skipped rather than written with a nil object.
      graph.insert(RDF::Statement.new(attribute_uri, @base.data_type, data_type_individual)) if data_type_individual
    end
  when 'technology-platform'
    # [ pragma field, ontology property, convert to integer? ] -- the property is
    # looked up on @base only when the field is actually present.
    [
      [ 'Average_coverage', :average_coverage, true ],
      [ 'Platform_class', :platform_class, false ],
      [ 'Platform_name', :platform_name, false ],
      [ 'Read_length', :read_length, true ],
      [ 'Read_pair_span', :read_pair_span, true ]
    ].each { |key, property, integer|
      next unless attributes.has_key?(key)
      value = attributes[key][0]
      value = value.to_i if integer
      graph.insert(RDF::Statement.new(attribute_uri, @base.send(property), RDF::Literal.new(value)))
    }
    if attributes.has_key?('Read_type') then
      # TODO Make sure array is of length 1.
      read_type_individual = case Array(attributes['Read_type']).first
                             when 'fragment' then @base.Fragment
                             when 'pair' then @base.Pair
                             end
      # An unrecognised read type is skipped rather than written with a nil object.
      graph.insert(RDF::Statement.new(attribute_uri, @base.read_type, read_type_individual)) if read_type_individual
    end
  end

  # Tags starting with a lower-case letter are serialized as custom attributes
  # that hang off the structured pragma resource.
  structuredpragma_properties = @base.structuredpragma_properties.select { |uri| @base.is_object_property?(uri) }[0]
  attributes.keys.each { |tag|
    next unless tag.match(/\A[a-z]/)
    custom_attribute_uri = RDF::URI.new("#{attribute_uri.to_s}/attribute/#{tag}")
    graph.insert(RDF::Statement.new(custom_attribute_uri, RDF.type, @base.StructuredAttribute))
    # The tag is wrapped in a literal, as is done for plain feature attributes.
    graph.insert(RDF::Statement.new(custom_attribute_uri, @base.with_parent([ @base.tag ].flatten, @base.structuredattribute_properties)[0], RDF::Literal.new(tag)))
    graph.insert(RDF::Statement.new(custom_attribute_uri, RDF.value, RDF::Literal.new(attributes[tag].join(','))))
    graph.insert(RDF::Statement.new(attribute_uri, @base.with_parent([ @base.attributes ].flatten, structuredpragma_properties)[0], custom_attribute_uri))
  }
end
|
333
|
+
|
334
|
+
# Serializes a list of variant sequences.
|
335
|
+
#
|
336
|
+
# +graph+:: RDF graph to which the variant sequences are added
|
337
|
+
# +set_uri+:: the feature set URI to which the feature belongs
|
338
|
+
# +feature_uri+:: the feature URI to the feature that is annotated with variant data
|
339
|
+
# +list+:: list of variant values
|
340
|
+
def serialize_variant_seqs(graph, set_uri, feature_uri, list)
  # Emits one variant resource per entry of +list+, addressed by the entry's
  # zero-based position below the feature URI. +set_uri+ is accepted but not
  # used here.
  list.each_with_index { |sequence, position|
    variant_uri = RDF::URI.new("#{feature_uri.to_s}/variant/#{position}")
    uri_key = variant_uri.to_s
    # The type statement is written only the first time a variant URI is seen;
    # @variants records the URIs that have been typed already.
    unless @variants.has_key?(uri_key)
      graph.insert(RDF::Statement.new(variant_uri, RDF.type, @base.Variant))
    end
    @variants[uri_key] = true
    graph.insert(RDF::Statement.new(variant_uri, @base.variant_seq, RDF::Literal.new(sequence)))
  }
end
|
144
349
|
end
|
145
350
|
|
146
351
|
end
|