biointerchange 0.2.2 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/README.md +269 -19
- data/VERSION +1 -1
- data/examples/bininda_emonds_mammals.new +1 -0
- data/examples/rdfization.rb +17 -0
- data/examples/tree1.new +1 -0
- data/examples/tree2.new +1 -0
- data/examples/vocabulary.rb +26 -5
- data/generators/javaify.rb +12 -18
- data/generators/make_supplement_releases.rb +2 -0
- data/generators/pythonify.rb +21 -8
- data/generators/rdfxml.rb +15 -1
- data/lib/biointerchange/cdao.rb +2014 -0
- data/lib/biointerchange/core.rb +70 -77
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +16 -0
- data/lib/biointerchange/genomics/gff3_reader.rb +18 -4
- data/lib/biointerchange/genomics/gvf_reader.rb +14 -0
- data/lib/biointerchange/phylogenetics/cdao_rdf_ntriples.rb +108 -0
- data/lib/biointerchange/phylogenetics/newick_reader.rb +81 -0
- data/lib/biointerchange/phylogenetics/tree_set.rb +50 -0
- data/lib/biointerchange/registry.rb +50 -8
- data/lib/biointerchange/so.rb +150 -0
- data/lib/biointerchange/textmining/pdfx_xml_reader.rb +21 -2
- data/lib/biointerchange/textmining/pubannos_json_reader.rb +24 -1
- data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +9 -0
- data/lib/biointerchange/textmining/text_mining_reader.rb +5 -5
- data/spec/phylogenetics_spec.rb +79 -0
- data/supplemental/java/biointerchange/pom.xml +1 -1
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/CDAO.java +2602 -0
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/FALDO.java +30 -28
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GFF3O.java +136 -104
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GVF1O.java +367 -278
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SIO.java +4388 -3127
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SO.java +5970 -4351
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SOFA.java +733 -544
- data/supplemental/java/biointerchange/src/test/java/org/biointerchange/AppTest.java +3 -1
- data/supplemental/python/biointerchange/cdao.py +2021 -0
- data/supplemental/python/biointerchange/faldo.py +37 -38
- data/supplemental/python/biointerchange/gff3o.py +156 -157
- data/supplemental/python/biointerchange/goxref.py +172 -172
- data/supplemental/python/biointerchange/gvf1o.py +428 -429
- data/supplemental/python/biointerchange/sio.py +3133 -3134
- data/supplemental/python/biointerchange/so.py +6626 -6527
- data/supplemental/python/biointerchange/sofa.py +790 -791
- data/supplemental/python/example.py +23 -5
- data/supplemental/python/setup.py +2 -2
- data/web/about.html +1 -0
- data/web/api.html +223 -15
- data/web/biointerchange.js +27 -6
- data/web/cli.html +8 -3
- data/web/index.html +6 -2
- data/web/ontologies.html +3 -0
- data/web/service/rdfizer.fcgi +7 -15
- data/web/webservices.html +6 -2
- metadata +30 -3
data/lib/biointerchange/core.rb
CHANGED
@@ -20,6 +20,7 @@ module BioInterchange
|
|
20
20
|
require 'biointerchange/exceptions'
|
21
21
|
|
22
22
|
# Ontologies (besides the ones from the 'rdf' gem)
|
23
|
+
require 'biointerchange/cdao'
|
23
24
|
require 'biointerchange/faldo'
|
24
25
|
require 'biointerchange/gff3o'
|
25
26
|
require 'biointerchange/goxref'
|
@@ -28,6 +29,9 @@ module BioInterchange
|
|
28
29
|
require 'biointerchange/so'
|
29
30
|
require 'biointerchange/sofa'
|
30
31
|
|
32
|
+
# Registry for reader/writer management:
|
33
|
+
require 'biointerchange/registry'
|
34
|
+
|
31
35
|
# Reader/writer interfaces
|
32
36
|
require 'biointerchange/reader'
|
33
37
|
require 'biointerchange/model'
|
@@ -81,6 +85,19 @@ module BioInterchange
|
|
81
85
|
# Writer
|
82
86
|
# ...same GFF3 writer
|
83
87
|
|
88
|
+
#
|
89
|
+
# PHYLOGENETICS
|
90
|
+
#
|
91
|
+
|
92
|
+
# Reader
|
93
|
+
require 'biointerchange/phylogenetics/newick_reader'
|
94
|
+
|
95
|
+
# Model
|
96
|
+
require 'biointerchange/phylogenetics/tree_set'
|
97
|
+
|
98
|
+
# Writer
|
99
|
+
require 'biointerchange/phylogenetics/cdao_rdf_ntriples'
|
100
|
+
|
84
101
|
#
|
85
102
|
# ACTUAL COMMAND LINE IMPLEMENTATION
|
86
103
|
#
|
@@ -97,7 +114,7 @@ module BioInterchange
|
|
97
114
|
["--batchsize", "-b", Getopt::OPTIONAL], # batchsize for readers/writers that support +postpone?+
|
98
115
|
["--input", "-i", Getopt::REQUIRED], # input file format
|
99
116
|
["--rdf", "-r", Getopt::REQUIRED], # output file format
|
100
|
-
["--annotate_name", Getopt::OPTIONAL], # name of
|
117
|
+
["--annotate_name", Getopt::OPTIONAL], # name of resource/tool/person
|
101
118
|
["--annotate_name_id", Getopt::OPTIONAL], # uri of resource/tool/person
|
102
119
|
["--annotate_date", Getopt::OPTIONAL], # date of processing/annotation
|
103
120
|
["--annotate_version", Getopt::OPTIONAL], # version number of resource
|
@@ -110,47 +127,38 @@ module BioInterchange
|
|
110
127
|
puts "Usage: ruby #{$0} -i <format> -r <format> [options]"
|
111
128
|
puts ''
|
112
129
|
puts 'Supported input formats (--input <format>/-i <format>):'
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
puts ' uk.ac.man.pdfx : PDFx XML'
|
130
|
+
Registry.reader_descriptions.each_pair { |reader_id, description|
|
131
|
+
puts " #{reader_id}#{' ' * (34 - reader_id.length)} : #{description}"
|
132
|
+
}
|
117
133
|
puts ''
|
118
134
|
puts 'Supported output formats (--rdf <format>/-r <format>)'
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
puts ' biointerchange.gff3'
|
123
|
-
puts ' biointerchange.gvf'
|
124
|
-
puts ' rdf.bh12.sio : RDF N-Triples for the following inputs'
|
125
|
-
puts ' dbcls.catanns.json'
|
126
|
-
puts ' uk.ac.man.pdfx'
|
135
|
+
Registry.writer_descriptions.each_pair { |writer_id, description|
|
136
|
+
puts " #{writer_id}#{' ' * (34 - writer_id.length)} : #{description}"
|
137
|
+
}
|
127
138
|
puts ''
|
128
139
|
puts 'I/O options:'
|
140
|
+
puts ' -b <size>/--batchsize <size> : process input in batches of the given size'
|
141
|
+
puts ' (if supported, see below for valid input/rdf pairs)'
|
129
142
|
puts ' -f <file>/--file <file> : file to read; STDIN used if not supplied'
|
130
143
|
puts ' -o <file>/--out <file> : output file; STDOUT used if not supplied'
|
131
144
|
puts ''
|
132
|
-
puts 'Input-/RDF-format specific options:'
|
133
|
-
puts ' Input: dbcls.catanns.json, uk.ac.man.pdfx'
|
134
|
-
puts ' Output: rdf.bh12.sio'
|
135
|
-
puts ' Options:'
|
136
|
-
puts ' --annotate_date <date> : date of processing/annotation (optional)'
|
137
|
-
puts ' --annotate_version <version> : version number of resource (optional)'
|
138
|
-
puts ' --annotate_name <name> : name of resource/tool/person (required)'
|
139
|
-
puts ' --annotate_name_id <id> : URI of resource/tool/person (required)'
|
140
|
-
puts ''
|
141
|
-
puts 'Input-/RDF-format specific options:'
|
142
|
-
puts ' Input: biointerchange.gff3 or biointerchange.gvf'
|
143
|
-
puts ' Output: rdf.biointerchange.gff3 or rdf.biointerchange.gvf'
|
144
|
-
puts ' Options:'
|
145
|
-
puts ' -b <size>/--batchsize <size> : process features in batches of the given size (optional)'
|
146
|
-
puts ' -t <date>/--date <date> : date when the GFF3/GVF file was created (optional)'
|
147
|
-
puts ' --name <name> : name of the GFF3/GVF file creator (optional)'
|
148
|
-
puts ' --name_id <id> : email address of the GFF3/GVF file creator (optional)'
|
149
|
-
puts ''
|
150
145
|
puts 'Other options:'
|
151
146
|
puts ' -v / --version : print the Gem\'s version number and exit'
|
152
147
|
puts ' -d / --debug : turn on debugging output (for stacktraces)'
|
153
148
|
puts ' -h --help : this message'
|
149
|
+
puts ''
|
150
|
+
puts 'Input-/RDF-format specific options:'
|
151
|
+
reader_writer_pairs = Registry.reader_writer_pairs
|
152
|
+
reader_writer_pairs.each_index { |reader_writer_pair_index|
|
153
|
+
reader_id, writer_id = reader_writer_pairs[reader_writer_pair_index]
|
154
|
+
puts " Input format : #{reader_id}"
|
155
|
+
puts " Output format : #{writer_id}"
|
156
|
+
Registry.options_help(reader_id).each { |option_description|
|
157
|
+
option, description = option_description
|
158
|
+
puts " --annotate_#{option}#{' ' * (21 - option.length)} : #{description}"
|
159
|
+
}
|
160
|
+
puts '' if reader_writer_pair_index + 1 < reader_writer_pairs.length
|
161
|
+
}
|
154
162
|
|
155
163
|
exit 1
|
156
164
|
end
|
@@ -166,62 +174,43 @@ module BioInterchange
|
|
166
174
|
@@skip_rdf_graph = false if opt['no_rdf_graph_optimization']
|
167
175
|
|
168
176
|
# Check if the input/rdf options are supported:
|
169
|
-
|
170
|
-
if opt['rdf'] == 'rdf.bh12.sio' then
|
171
|
-
raise ArgumentError, 'Require --name and --name_id options to specify source of annotations (e.g., a manual annotators name, or software tool name) and their associated URI (e.g., email address, or webaddress).' unless opt['name'] and opt['name_id']
|
172
|
-
else
|
173
|
-
unsupported_combination
|
174
|
-
end
|
175
|
-
elsif opt['input'] == 'biointerchange.gff3' then
|
176
|
-
if opt['rdf'] == 'rdf.biointerchange.gff3' then
|
177
|
-
# Okay. No further arguments required.
|
178
|
-
else
|
179
|
-
unsupported_combination
|
180
|
-
end
|
181
|
-
elsif opt['input'] == 'biointerchange.gvf' then
|
182
|
-
if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
|
183
|
-
# Okay. No further arguments required.
|
184
|
-
else
|
185
|
-
unsupported_combination
|
186
|
-
end
|
187
|
-
else
|
188
|
-
unsupported_combination
|
189
|
-
end
|
177
|
+
unsupported_combination unless Registry.is_supported?(opt['input'], opt['rdf'])
|
190
178
|
|
191
|
-
|
179
|
+
if opt['batchsize'] then
|
180
|
+
batching_not_supported unless Registry.is_supporting_batch_processing?(opt['input'], opt['rdf'])
|
181
|
+
wrong_type('batchsize', 'a positive integer') unless opt['batchsize'].match(/^[1-9][0-9]*$/)
|
182
|
+
end
|
192
183
|
|
193
|
-
|
184
|
+
# Create a parameter map that can be passed along to Reader implementations:
|
185
|
+
map = {
|
186
|
+
'input' => opt['input'],
|
187
|
+
'output' => opt['output']
|
188
|
+
}
|
189
|
+
map['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
|
190
|
+
opt.each_key { |key|
|
191
|
+
map[key.sub(/^annotate_/, '')] = opt[key] if key.start_with?('annotate_')
|
192
|
+
}
|
194
193
|
|
195
194
|
# Generate model from file (deserialization).
|
196
|
-
|
197
|
-
reader =
|
198
|
-
if opt['input'] == 'biointerchange.gff3' then
|
199
|
-
reader = BioInterchange::Genomics::GFF3Reader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
|
200
|
-
elsif opt['input'] == 'biointerchange.gvf' then
|
201
|
-
reader = BioInterchange::Genomics::GVFReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
|
202
|
-
elsif opt['input'] == 'dbcls.catanns.json' then
|
203
|
-
reader = BioInterchange::TextMining::PubAnnosJSONReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
|
204
|
-
elsif opt['input'] == 'uk.ac.man.pdfx' then
|
205
|
-
reader = BioInterchange::TextMining::PDFxXMLReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['annotate_version'])
|
206
|
-
end
|
195
|
+
reader_class, *args = Registry.reader(opt['input'])
|
196
|
+
reader = reader_class.new(*BioInterchange::get_parameters(map, args))
|
207
197
|
|
208
|
-
|
209
|
-
|
198
|
+
input_source = nil
|
199
|
+
if opt['file'] then
|
200
|
+
input_source = File.new(opt['file'], 'r')
|
210
201
|
else
|
211
202
|
input_source = STDIN
|
212
203
|
end
|
213
204
|
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
writer = BioInterchange::TextMining::RDFWriter.new(STDOUT) unless opt['out']
|
220
|
-
end
|
221
|
-
if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
|
222
|
-
writer = BioInterchange::Genomics::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
|
223
|
-
writer = BioInterchange::Genomics::RDFWriter.new(STDOUT) unless opt['out']
|
205
|
+
output_source = nil
|
206
|
+
if opt['out'] then
|
207
|
+
output_source = File.new(opt['out'], 'w')
|
208
|
+
else
|
209
|
+
output_source = STDOUT
|
224
210
|
end
|
211
|
+
|
212
|
+
# Generate rdf from model (serialization).
|
213
|
+
writer = Registry.writer(opt['rdf']).new(output_source)
|
225
214
|
|
226
215
|
begin
|
227
216
|
model = reader.deserialize(input_source)
|
@@ -270,6 +259,10 @@ module BioInterchange
|
|
270
259
|
|
271
260
|
private
|
272
261
|
|
262
|
+
def self.batching_not_supported
|
263
|
+
raise ArgumentError, 'Batching is not supported for this input/output format combination.'
|
264
|
+
end
|
265
|
+
|
273
266
|
def self.unsupported_combination
|
274
267
|
raise ArgumentError, 'This input/output format combination is not supported.'
|
275
268
|
end
|
@@ -15,6 +15,22 @@ module BioInterchange::Genomics
|
|
15
15
|
# - rdf.biointerchange.gvf
|
16
16
|
class RDFWriter < BioInterchange::Writer
|
17
17
|
|
18
|
+
# Register writers:
|
19
|
+
BioInterchange::Registry.register_writer(
|
20
|
+
'rdf.biointerchange.gff3',
|
21
|
+
BioInterchange::Genomics::RDFWriter,
|
22
|
+
[ 'biointerchange.gff3' ],
|
23
|
+
true,
|
24
|
+
'Generic Feature Format Version 3 Ontology (GFF3O) based RDFization'
|
25
|
+
)
|
26
|
+
BioInterchange::Registry.register_writer(
|
27
|
+
'rdf.biointerchange.gvf',
|
28
|
+
BioInterchange::Genomics::RDFWriter,
|
29
|
+
[ 'biointerchange.gvf' ],
|
30
|
+
true,
|
31
|
+
'Genome Variation Format Version 1 Ontology (GVF1O) based RDFization'
|
32
|
+
)
|
33
|
+
|
18
34
|
# Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
|
19
35
|
#
|
20
36
|
# +ostream+:: instance of an IO class or derivative that is used for RDF serialization
|
@@ -4,6 +4,20 @@ module BioInterchange::Genomics
|
|
4
4
|
|
5
5
|
class GFF3Reader < BioInterchange::Reader
|
6
6
|
|
7
|
+
# Register reader:
|
8
|
+
BioInterchange::Registry.register_reader(
|
9
|
+
'biointerchange.gff3',
|
10
|
+
GFF3Reader,
|
11
|
+
[ 'name', 'name_uri', 'date' ],
|
12
|
+
true,
|
13
|
+
'Generic Feature Format Version 3 (GFF3) reader',
|
14
|
+
[
|
15
|
+
[ 'date <date>', 'date when the GFF3 file was created (optional)' ],
|
16
|
+
[ 'name <name>', 'name of the GFF3 file creator (optional)' ],
|
17
|
+
[ 'name_id <id>', 'email address of the GFF3 file creator (optional)' ]
|
18
|
+
]
|
19
|
+
)
|
20
|
+
|
7
21
|
# Creates a new instance of a Generic Feature Format Version 3 (GFF3) reader.
|
8
22
|
#
|
9
23
|
# The reader supports batch processing.
|
@@ -94,7 +108,7 @@ protected
|
|
94
108
|
if type.match(/^SO:\d{7}$/) then
|
95
109
|
type = RDF::URI.new("http://www.sequenceontology.org/miso/current_release/term/#{feature.type}")
|
96
110
|
else
|
97
|
-
type = BioInterchange::
|
111
|
+
type = BioInterchange::SO.send(BioInterchange.make_safe_label(type))
|
98
112
|
end
|
99
113
|
rescue NoMethodError
|
100
114
|
raise BioInterchange::Exceptions::InputFormatError, "Type of feature is set to an unknown SOFA term: \"#{type}\""
|
@@ -154,15 +168,15 @@ protected
|
|
154
168
|
feature_set.set_pragma(name, { name => value.to_f })
|
155
169
|
elsif name == 'sequence-region' then
|
156
170
|
regions = feature_set.pragma(name)
|
157
|
-
regions = {} unless regions
|
171
|
+
regions = { name => {} } unless regions
|
158
172
|
seqid, start_coordinate, end_coordinate = value.split(/\s+/, 3)
|
159
|
-
regions[seqid] = BioInterchange::Genomics::GFF3Landmark.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
|
173
|
+
regions[name][seqid] = BioInterchange::Genomics::GFF3Landmark.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
|
160
174
|
feature_set.set_pragma(name, regions)
|
161
175
|
elsif name == 'species' then
|
162
176
|
feature_set.set_pragma(name, { name => value })
|
163
177
|
else
|
164
178
|
# Unhandled pragma. Just save the value in its string form.
|
165
|
-
feature_set.set_pragma(name, value)
|
179
|
+
feature_set.set_pragma(name, { name => value })
|
166
180
|
end
|
167
181
|
end
|
168
182
|
|
@@ -2,6 +2,20 @@ module BioInterchange::Genomics
|
|
2
2
|
|
3
3
|
class GVFReader < GFF3Reader
|
4
4
|
|
5
|
+
# Register reader:
|
6
|
+
BioInterchange::Registry.register_reader(
|
7
|
+
'biointerchange.gvf',
|
8
|
+
GVFReader,
|
9
|
+
[ 'name', 'name_uri', 'date' ],
|
10
|
+
true,
|
11
|
+
'Genome Variation Format Version 1 (GVF) reader',
|
12
|
+
[
|
13
|
+
[ 'date <date>', 'date when the GVF file was created (optional)' ],
|
14
|
+
[ 'name <name>', 'name of the GVF file creator (optional)' ],
|
15
|
+
[ 'name_id <id>', 'email address of the GVF file creator (optional)' ]
|
16
|
+
]
|
17
|
+
)
|
18
|
+
|
5
19
|
# Creates a new instance of a Genome Variation Format (GVF) reader.
|
6
20
|
#
|
7
21
|
# +name+:: Optional name of the person who generated the GVF file.
|
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'rdf'
|
2
|
+
require 'rdf/ntriples'
|
3
|
+
|
4
|
+
module BioInterchange::Phylogenetics
|
5
|
+
|
6
|
+
# Serializes phylogenetic tree models based on BioRuby's phylogenetic tree implementation.
class CDAORDFWriter < BioInterchange::Writer

  # Register writers:
  BioInterchange::Registry.register_writer(
    'rdf.phylotastic.newick',
    CDAORDFWriter,
    [ 'phylotastic.newick' ],
    true,
    'Comparative Data Analysis Ontology (CDAO) based RDFization'
  )

  # Creates a new instance of a CDAORDFWriter that will use the provided output stream to serialize RDF.
  #
  # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
  def initialize(ostream)
    @ostream = ostream
  end

  # Serialize a model as RDF.
  #
  # Every tree in the model is serialized on its own as N-Triples and written to the output stream.
  #
  # +model+:: a generic representation of input data that is an instance of BioInterchange::Phylogenetics::TreeSet
  def serialize(model)
    model.contents.each { |tree|
      serialize_model(model, tree)
    }
  end

protected

  # Serializes a single tree of the model into a fresh graph and dumps the graph as N-Triples.
  #
  # NOTE(review): all trees of a set are described under the URI returned by +model.uri+;
  # confirm whether each tree should get a URI of its own.
  #
  # +model+:: tree set that provides the URI and the optional date
  # +tree+:: tree of the set that is serialized
  def serialize_model(model, tree)
    graph = RDF::Graph.new
    graph.fast_ostream(@ostream) if BioInterchange::skip_rdf_graph
    tree_uri = RDF::URI.new(model.uri)
    if model.date then
      graph.insert(RDF::Statement.new(tree_uri, RDF::DC.date, RDF::Literal.new(model.date)))
    end
    serialize_tree(graph, tree, tree_uri, tree.root, true)
    RDF::NTriples::Writer.dump(graph, @ostream)
  end

  # Recursively serializes +node+ and all of its descendents.
  #
  # +graph+:: graph that receives the generated statements
  # +tree+:: tree that is being traversed
  # +tree_uri+:: URI that represents the tree
  # +node+:: node that is serialized by this call (may be nil for the root call of a tree without a designated root)
  # +is_root+:: true only for the initial call of the traversal
  def serialize_tree(graph, tree, tree_uri, node, is_root)
    if is_root then
      graph.insert(RDF::Statement.new(tree_uri, RDF.type, BioInterchange::CDAO.NewickTree))
      # Whether a tree is rooted or unrooted appears not to be determinable for Newick
      # trees, so no rootedtree/unrootedtree type statement is generated here.
      unless tree.root then
        # Pick the first node available to permit serialization of the tree:
        tree.root = node = tree.nodes.first
      end
      # A tree without any nodes has nothing else to serialize.
      return unless node
    end

    # The node URI must be derived after the fallback root has been chosen, or it
    # would be built from the object ID of nil.
    node_uri = RDF::URI.new("#{tree_uri.to_s}/node/#{node.object_id}")

    if node.name and not node.name.empty? then
      taxonomic_unit_uri = RDF::URI.new("#{tree_uri.to_s}/taxonomic_unit/#{node.object_id}")
      graph.insert(RDF::Statement.new(taxonomic_unit_uri, RDF.type, BioInterchange::CDAO.TU))
      graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO::represents_TU, taxonomic_unit_uri))
      graph.insert(RDF::Statement.new(taxonomic_unit_uri, RDF::RDFS.label, RDF::Literal.new(node.name.gsub('_', ' '))))
    end

    # Looked up once; used for the node type and for the traversal below.
    descendents = tree.descendents(node)
    if descendents.empty? then
      graph.insert(RDF::Statement.new(node_uri, RDF.type, BioInterchange::CDAO.TerminalNode))
    else
      graph.insert(RDF::Statement.new(node_uri, RDF.type, BioInterchange::CDAO.AncestralNode))
    end

    # Parent and edge are looked up once instead of once per statement.
    parent = (tree.root == node) ? nil : tree.parent(node)
    if parent then
      edge = tree.get_edge(parent, node)
      parent_uri = RDF::URI.new("#{tree_uri.to_s}/node/#{parent.object_id}")
      edge_uri = RDF::URI.new("#{tree_uri.to_s}/edge/#{edge.object_id}")
      graph.insert(RDF::Statement.new(edge_uri, RDF.type, BioInterchange::CDAO.DirectedEdge))
      graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.belongs_to_Tree, tree_uri))
      graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Parent_Node, parent_uri))
      graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Child_Node, node_uri))
      graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.belongs_to_Edge_as_Child, edge_uri))
      graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.has_Parent, parent_uri))
      graph.insert(RDF::Statement.new(parent_uri, BioInterchange::CDAO.belongs_to_Edge_as_Parent, edge_uri))

      # Support values (node.distance) are deliberately not serialized at the moment.

      # Only annotate the edge with a length when one is available; a decimal
      # literal cannot be built from nil.
      distance = edge.distance
      unless distance.nil? then
        annotation_uri = RDF::URI.new("#{tree_uri.to_s}/edge/#{edge.object_id}/annotation")
        graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Annotation, annotation_uri))
        graph.insert(RDF::Statement.new(annotation_uri, RDF.type, BioInterchange::CDAO.EdgeLength))
        graph.insert(RDF::Statement.new(annotation_uri, BioInterchange::CDAO.has_Value, RDF::Literal.new(distance, :datatype => RDF::URI.new('http://www.w3.org/2001/XMLSchema#decimal'))))
      end
    end

    # Only the node that the traversal started from is stated to be the root of the tree.
    graph.insert(RDF::Statement.new(tree_uri, BioInterchange::CDAO.has_Root, node_uri)) if is_root
    graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.belongs_to_Tree, tree_uri))

    # Now, continue traversing the tree by visiting the current node's descendents:
    descendents.each { |descendent_node|
      serialize_tree(graph, tree, tree_uri, descendent_node, false)
    }
  end

end
|
106
|
+
|
107
|
+
end
|
108
|
+
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'bio'
|
2
|
+
require 'date'
|
3
|
+
|
4
|
+
module BioInterchange::Phylogenetics
|
5
|
+
|
6
|
+
# Reader for phylogenetic trees in Newick format. Trees are collected in a
# BioInterchange::Phylogenetics::TreeSet model.
class NewickReader < BioInterchange::Reader

  # Register reader:
  BioInterchange::Registry.register_reader(
    'phylotastic.newick',
    NewickReader,
    [ 'date' ],
    true,
    'Newick Tree File Format reader',
    [
      [ 'date <date>', 'date when the Newick file was created (optional)' ]
    ]
  )

  # Creates a new instance of a Newick file format reader.
  #
  # The reader supports batch processing.
  #
  # +date+:: Optional date of when the Newick file was produced, annotated, etc.
  # +batch_size+:: Optional integer that determines the number of trees that
  #                should be processed in one go.
  def initialize(date = nil, batch_size = nil)
    @date = date
    @batch_size = batch_size
    @postponed = false
    @trees = nil
    @tree_io = nil
  end

  # Reads a Newick file from the input stream and returns an associated model.
  #
  # If this method is called when +postponed?+ returns true, then the reading will
  # continue from where it has been interrupted beforehand.
  #
  # +inputstream+:: an instance of class IO or String that holds the contents of a Newick file
  def deserialize(inputstream)
    if inputstream.kind_of?(IO)
      create_model(inputstream)
    elsif inputstream.kind_of?(String) then
      create_model(StringIO.new(inputstream))
    else
      raise BioInterchange::Exceptions::ImplementationReaderError, 'The provided input stream needs to be either of type IO or String.'
    end
  end

  # Returns true if the reading of the input was postponed due to a full batch.
  def postponed?
    @postponed
  end

protected

  # Reads trees from the input into the tree set and returns the tree set.
  #
  # When the previous call stopped because a batch was full, the tree set is
  # emptied (its date is kept) and reading continues on the flat-file parser
  # of that previous call, so the +newick+ argument is not consulted again.
  #
  # +newick+:: stream that holds the contents of a Newick file
  def create_model(newick)
    if @postponed then
      @postponed = false
      @trees.prune
    else
      @trees = BioInterchange::Phylogenetics::TreeSet.new()
      @trees.set_date(Date.parse(@date)) if @date
      # A parser is only created for a fresh read. Re-wrapping the stream when
      # resuming would restart String input from its beginning.
      @tree_io = Bio::FlatFile.open(Bio::Newick, newick)
    end

    # Number of trees that were added during this call, i.e. in the current batch.
    trees_in_batch = 0
    while newick_tree = @tree_io.next_entry
      newick_tree.options[:bootstrap_style] = :disabled
      @trees.add(newick_tree.tree)
      trees_in_batch += 1

      if @batch_size and trees_in_batch >= @batch_size then
        @postponed = true
        break
      end
    end

    @trees
  end

end
|
79
|
+
|
80
|
+
end
|
81
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
|
3
|
+
module BioInterchange::Phylogenetics
|
4
|
+
|
5
|
+
# A phylogenetic tree set that can contain multiple phylogenetic trees.
class TreeSet < BioInterchange::Model

  # Create a new instance of a tree set. A tree set can contain multiple phylogenetic trees.
  def initialize
    # Trees are stored as the keys of a hash map to increase performance:
    @set = {}
    @date = nil
    @uri = nil
  end

  # Returns the contents of the tree set.
  def contents
    @set.keys
  end

  # If a date was provided, then this method returns its value.
  def date
    @date
  end

  # Sets a date that is associated with the trees in this model (e.g., annotation date, creation date, etc.).
  #
  # +date+:: an instance of Date that is associated with all trees in the model
  def set_date(date)
    @date = date
  end

  # Returns an URI for this particular tree set, which is a not necessarily globally unique SHA1 hash.
  #
  # The URI is generated on first use and then kept, so that repeated calls on
  # the same tree set always return the same URI.
  def uri
    unless @uri then
      # Sub-second time plus the object ID keeps sets that are created within
      # the same second apart.
      seed = "#{Time.now.to_f}/#{object_id}"
      @uri = "biointerchange://phylogenetics/treeset/self/#{Digest::SHA1.hexdigest(seed)}"
    end
    @uri
  end

  # Add a tree to the tree set.
  #
  # +tree+:: BioRuby tree instance that is added to the contents of this tree set
  def add(tree)
    @set[tree] = true
  end

  # Removes all trees from the set, but keeps additional data (e.g., the date).
  def prune
    @set.clear
  end

end
|
49
|
+
|
50
|
+
end
|
@@ -1,27 +1,69 @@
|
|
1
1
|
module BioInterchange
|
2
2
|
|
3
|
+
# A registry of Reader and Writer subclasses that also keeps track of which
# Reader/Writer combinations can be used together. The registry makes it
# possible to implement readers and writers without the need to modify
# other BioInterchange framework code.
class Registry

  # Lookup tables, all keyed by reader/writer identifier (the pair table is
  # keyed by "<reader id> <writer id>").
  @@readers = {}
  @@writers = {}
  @@reader_batch_processors = {}
  @@writer_batch_processors = {}
  @@reader_descriptions = {}
  @@writer_descriptions = {}
  @@reader_help_texts = {}
  @@compatible_reader_writer_pairs = {}

  # Registers a reader under the given identifier. Registering an identifier
  # again replaces all information of the previous registration.
  #
  # +reader_id+:: identifier of the reader (the input format name)
  # +reader_class+:: class that implements the reader
  # +parameters+:: array that is stored behind the class in the entry returned by +reader+
  # +supports_batch_processing+:: whether the reader can process its input in batches
  # +descriptive_name+:: short human readable description of the reader
  # +options_help+:: array of option/description pairs that document the reader's options
  def self.register_reader(reader_id, reader_class, parameters, supports_batch_processing, descriptive_name, options_help)
    @@readers[reader_id] = [ reader_class ] + parameters
    # Always store the flag, so that a stale +true+ cannot survive a re-registration.
    @@reader_batch_processors[reader_id] = supports_batch_processing ? true : false
    @@reader_descriptions[reader_id] = descriptive_name
    @@reader_help_texts[reader_id] = options_help
  end

  # Registers a writer under the given identifier and notes which readers it
  # can be combined with.
  #
  # +writer_id+:: identifier of the writer (the output format name)
  # +writer_class+:: class that implements the writer
  # +compatible_reader_ids+:: array of reader identifiers whose models the writer can serialize
  # +supports_batch_processing+:: whether the writer can process models in batches
  # +descriptive_name+:: short human readable description of the writer
  def self.register_writer(writer_id, writer_class, compatible_reader_ids, supports_batch_processing, descriptive_name)
    @@writers[writer_id] = writer_class
    # Always store the flag, so that a stale +true+ cannot survive a re-registration.
    @@writer_batch_processors[writer_id] = supports_batch_processing ? true : false
    @@writer_descriptions[writer_id] = descriptive_name
    compatible_reader_ids.each { |reader_id|
      @@compatible_reader_writer_pairs["#{reader_id} #{writer_id}"] = true
    }
  end

  # Returns true if the reader/writer combination has been registered, false otherwise.
  def self.is_supported?(reader_id, writer_id)
    @@compatible_reader_writer_pairs["#{reader_id} #{writer_id}"] == true
  end

  # Returns true if both the reader and the writer were registered as
  # supporting batch processing, false otherwise (also for unknown identifiers).
  def self.is_supporting_batch_processing?(reader_id, writer_id)
    @@reader_batch_processors[reader_id] == true and @@writer_batch_processors[writer_id] == true
  end

  # Returns an array whose first element is the reader class, followed by the
  # parameters given at registration; nil if the reader is unknown.
  #
  # A copy is returned, so that callers cannot alter the registry's entry.
  def self.reader(reader_id)
    entry = @@readers[reader_id]
    entry ? entry.dup : nil
  end

  # Returns the writer class that was registered for the identifier; nil if the writer is unknown.
  def self.writer(writer_id)
    @@writers[writer_id]
  end

  # Returns a frozen copy of the reader identifier to description mapping.
  def self.reader_descriptions
    @@reader_descriptions.clone.freeze
  end

  # Returns a frozen copy of the writer identifier to description mapping.
  def self.writer_descriptions
    @@writer_descriptions.clone.freeze
  end

  # Returns a frozen copy of the option/description pairs that were registered
  # for the reader; an empty array if the reader is unknown or has no help texts.
  def self.options_help(reader_id)
    (@@reader_help_texts[reader_id] || []).clone.freeze
  end

  # Returns a frozen, sorted array of [ reader_id, writer_id ] pairs of all
  # registered reader/writer combinations.
  def self.reader_writer_pairs
    @@compatible_reader_writer_pairs.keys.sort.map { |reader_writer_pair| reader_writer_pair.split(/ /, 2) }.freeze
  end

end
|
27
69
|
|