biointerchange 0.2.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -0
- data/README.md +269 -19
- data/VERSION +1 -1
- data/examples/bininda_emonds_mammals.new +1 -0
- data/examples/rdfization.rb +17 -0
- data/examples/tree1.new +1 -0
- data/examples/tree2.new +1 -0
- data/examples/vocabulary.rb +26 -5
- data/generators/javaify.rb +12 -18
- data/generators/make_supplement_releases.rb +2 -0
- data/generators/pythonify.rb +21 -8
- data/generators/rdfxml.rb +15 -1
- data/lib/biointerchange/cdao.rb +2014 -0
- data/lib/biointerchange/core.rb +70 -77
- data/lib/biointerchange/genomics/gff3_rdf_ntriples.rb +16 -0
- data/lib/biointerchange/genomics/gff3_reader.rb +18 -4
- data/lib/biointerchange/genomics/gvf_reader.rb +14 -0
- data/lib/biointerchange/phylogenetics/cdao_rdf_ntriples.rb +108 -0
- data/lib/biointerchange/phylogenetics/newick_reader.rb +81 -0
- data/lib/biointerchange/phylogenetics/tree_set.rb +50 -0
- data/lib/biointerchange/registry.rb +50 -8
- data/lib/biointerchange/so.rb +150 -0
- data/lib/biointerchange/textmining/pdfx_xml_reader.rb +21 -2
- data/lib/biointerchange/textmining/pubannos_json_reader.rb +24 -1
- data/lib/biointerchange/textmining/text_mining_rdf_ntriples.rb +9 -0
- data/lib/biointerchange/textmining/text_mining_reader.rb +5 -5
- data/spec/phylogenetics_spec.rb +79 -0
- data/supplemental/java/biointerchange/pom.xml +1 -1
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/CDAO.java +2602 -0
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/FALDO.java +30 -28
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GFF3O.java +136 -104
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/GVF1O.java +367 -278
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SIO.java +4388 -3127
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SO.java +5970 -4351
- data/supplemental/java/biointerchange/src/main/java/org/biointerchange/vocabulary/SOFA.java +733 -544
- data/supplemental/java/biointerchange/src/test/java/org/biointerchange/AppTest.java +3 -1
- data/supplemental/python/biointerchange/cdao.py +2021 -0
- data/supplemental/python/biointerchange/faldo.py +37 -38
- data/supplemental/python/biointerchange/gff3o.py +156 -157
- data/supplemental/python/biointerchange/goxref.py +172 -172
- data/supplemental/python/biointerchange/gvf1o.py +428 -429
- data/supplemental/python/biointerchange/sio.py +3133 -3134
- data/supplemental/python/biointerchange/so.py +6626 -6527
- data/supplemental/python/biointerchange/sofa.py +790 -791
- data/supplemental/python/example.py +23 -5
- data/supplemental/python/setup.py +2 -2
- data/web/about.html +1 -0
- data/web/api.html +223 -15
- data/web/biointerchange.js +27 -6
- data/web/cli.html +8 -3
- data/web/index.html +6 -2
- data/web/ontologies.html +3 -0
- data/web/service/rdfizer.fcgi +7 -15
- data/web/webservices.html +6 -2
- metadata +30 -3
data/lib/biointerchange/core.rb
CHANGED
@@ -20,6 +20,7 @@ module BioInterchange
|
|
20
20
|
require 'biointerchange/exceptions'
|
21
21
|
|
22
22
|
# Ontologies (besides the ones from the 'rdf' gem)
|
23
|
+
require 'biointerchange/cdao'
|
23
24
|
require 'biointerchange/faldo'
|
24
25
|
require 'biointerchange/gff3o'
|
25
26
|
require 'biointerchange/goxref'
|
@@ -28,6 +29,9 @@ module BioInterchange
|
|
28
29
|
require 'biointerchange/so'
|
29
30
|
require 'biointerchange/sofa'
|
30
31
|
|
32
|
+
# Registry for reader/writer management:
|
33
|
+
require 'biointerchange/registry'
|
34
|
+
|
31
35
|
# Reader/writer interfaces
|
32
36
|
require 'biointerchange/reader'
|
33
37
|
require 'biointerchange/model'
|
@@ -81,6 +85,19 @@ module BioInterchange
|
|
81
85
|
# Writer
|
82
86
|
# ...same GFF3 writer
|
83
87
|
|
88
|
+
#
|
89
|
+
# PHYLOGENETICS
|
90
|
+
#
|
91
|
+
|
92
|
+
# Reader
|
93
|
+
require 'biointerchange/phylogenetics/newick_reader'
|
94
|
+
|
95
|
+
# Model
|
96
|
+
require 'biointerchange/phylogenetics/tree_set'
|
97
|
+
|
98
|
+
# Writer
|
99
|
+
require 'biointerchange/phylogenetics/cdao_rdf_ntriples'
|
100
|
+
|
84
101
|
#
|
85
102
|
# ACTUAL COMMAND LINE IMPLEMENTATION
|
86
103
|
#
|
@@ -97,7 +114,7 @@ module BioInterchange
|
|
97
114
|
["--batchsize", "-b", Getopt::OPTIONAL], # batchsize for readers/writers that support +postpone?+
|
98
115
|
["--input", "-i", Getopt::REQUIRED], # input file format
|
99
116
|
["--rdf", "-r", Getopt::REQUIRED], # output file format
|
100
|
-
["--annotate_name", Getopt::OPTIONAL], # name of
|
117
|
+
["--annotate_name", Getopt::OPTIONAL], # name of resource/tool/person
|
101
118
|
["--annotate_name_id", Getopt::OPTIONAL], # uri of resource/tool/person
|
102
119
|
["--annotate_date", Getopt::OPTIONAL], # date of processing/annotation
|
103
120
|
["--annotate_version", Getopt::OPTIONAL], # version number of resource
|
@@ -110,47 +127,38 @@ module BioInterchange
|
|
110
127
|
puts "Usage: ruby #{$0} -i <format> -r <format> [options]"
|
111
128
|
puts ''
|
112
129
|
puts 'Supported input formats (--input <format>/-i <format>):'
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
puts ' uk.ac.man.pdfx : PDFx XML'
|
130
|
+
Registry.reader_descriptions.each_pair { |reader_id, description|
|
131
|
+
puts " #{reader_id}#{' ' * (34 - reader_id.length)} : #{description}"
|
132
|
+
}
|
117
133
|
puts ''
|
118
134
|
puts 'Supported output formats (--rdf <format>/-r <format>)'
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
puts ' biointerchange.gff3'
|
123
|
-
puts ' biointerchange.gvf'
|
124
|
-
puts ' rdf.bh12.sio : RDF N-Triples for the following inputs'
|
125
|
-
puts ' dbcls.catanns.json'
|
126
|
-
puts ' uk.ac.man.pdfx'
|
135
|
+
Registry.writer_descriptions.each_pair { |writer_id, description|
|
136
|
+
puts " #{writer_id}#{' ' * (34 - writer_id.length)} : #{description}"
|
137
|
+
}
|
127
138
|
puts ''
|
128
139
|
puts 'I/O options:'
|
140
|
+
puts ' -b <size>/--batchsize <size> : process input in batches of the given size'
|
141
|
+
puts ' (if supported, see below for valid input/rdf pairs)'
|
129
142
|
puts ' -f <file>/--file <file> : file to read; STDIN used if not supplied'
|
130
143
|
puts ' -o <file>/--out <file> : output file; STDOUT used if not supplied'
|
131
144
|
puts ''
|
132
|
-
puts 'Input-/RDF-format specific options:'
|
133
|
-
puts ' Input: dbcls.catanns.json, uk.ac.man.pdfx'
|
134
|
-
puts ' Output: rdf.bh12.sio'
|
135
|
-
puts ' Options:'
|
136
|
-
puts ' --annotate_date <date> : date of processing/annotation (optional)'
|
137
|
-
puts ' --annotate_version <version> : version number of resource (optional)'
|
138
|
-
puts ' --annotate_name <name> : name of resource/tool/person (required)'
|
139
|
-
puts ' --annotate_name_id <id> : URI of resource/tool/person (required)'
|
140
|
-
puts ''
|
141
|
-
puts 'Input-/RDF-format specific options:'
|
142
|
-
puts ' Input: biointerchange.gff3 or biointerchange.gvf'
|
143
|
-
puts ' Output: rdf.biointerchange.gff3 or rdf.biointerchange.gvf'
|
144
|
-
puts ' Options:'
|
145
|
-
puts ' -b <size>/--batchsize <size> : process features in batches of the given size (optional)'
|
146
|
-
puts ' -t <date>/--date <date> : date when the GFF3/GVF file was created (optional)'
|
147
|
-
puts ' --name <name> : name of the GFF3/GVF file creator (optional)'
|
148
|
-
puts ' --name_id <id> : email address of the GFF3/GVF file creator (optional)'
|
149
|
-
puts ''
|
150
145
|
puts 'Other options:'
|
151
146
|
puts ' -v / --version : print the Gem\'s version number and exit'
|
152
147
|
puts ' -d / --debug : turn on debugging output (for stacktraces)'
|
153
148
|
puts ' -h --help : this message'
|
149
|
+
puts ''
|
150
|
+
puts 'Input-/RDF-format specific options:'
|
151
|
+
reader_writer_pairs = Registry.reader_writer_pairs
|
152
|
+
reader_writer_pairs.each_index { |reader_writer_pair_index|
|
153
|
+
reader_id, writer_id = reader_writer_pairs[reader_writer_pair_index]
|
154
|
+
puts " Input format : #{reader_id}"
|
155
|
+
puts " Output format : #{writer_id}"
|
156
|
+
Registry.options_help(reader_id).each { |option_description|
|
157
|
+
option, description = option_description
|
158
|
+
puts " --annotate_#{option}#{' ' * (21 - option.length)} : #{description}"
|
159
|
+
}
|
160
|
+
puts '' if reader_writer_pair_index + 1 < reader_writer_pairs.length
|
161
|
+
}
|
154
162
|
|
155
163
|
exit 1
|
156
164
|
end
|
@@ -166,62 +174,43 @@ module BioInterchange
|
|
166
174
|
@@skip_rdf_graph = false if opt['no_rdf_graph_optimization']
|
167
175
|
|
168
176
|
# Check if the input/rdf options are supported:
|
169
|
-
|
170
|
-
if opt['rdf'] == 'rdf.bh12.sio' then
|
171
|
-
raise ArgumentError, 'Require --name and --name_id options to specify source of annotations (e.g., a manual annotators name, or software tool name) and their associated URI (e.g., email address, or webaddress).' unless opt['name'] and opt['name_id']
|
172
|
-
else
|
173
|
-
unsupported_combination
|
174
|
-
end
|
175
|
-
elsif opt['input'] == 'biointerchange.gff3' then
|
176
|
-
if opt['rdf'] == 'rdf.biointerchange.gff3' then
|
177
|
-
# Okay. No further arguments required.
|
178
|
-
else
|
179
|
-
unsupported_combination
|
180
|
-
end
|
181
|
-
elsif opt['input'] == 'biointerchange.gvf' then
|
182
|
-
if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
|
183
|
-
# Okay. No further arguments required.
|
184
|
-
else
|
185
|
-
unsupported_combination
|
186
|
-
end
|
187
|
-
else
|
188
|
-
unsupported_combination
|
189
|
-
end
|
177
|
+
unsupported_combination unless Registry.is_supported?(opt['input'], opt['rdf'])
|
190
178
|
|
191
|
-
|
179
|
+
if opt['batchsize'] then
|
180
|
+
batching_not_supported unless Registry.is_supporting_batch_processing?(opt['input'], opt['rdf'])
|
181
|
+
wrong_type('batchsize', 'a positive integer') unless opt['batchsize'].match(/^[1-9][0-9]*$/)
|
182
|
+
end
|
192
183
|
|
193
|
-
|
184
|
+
# Create a parameter map that can be passed along to Reader implementations:
|
185
|
+
map = {
|
186
|
+
'input' => opt['input'],
|
187
|
+
'output' => opt['output']
|
188
|
+
}
|
189
|
+
map['batchsize'] = opt['batchsize'].to_i if opt['batchsize']
|
190
|
+
opt.each_key { |key|
|
191
|
+
map[key.sub(/^annotate_/, '')] = opt[key] if key.start_with?('annotate_')
|
192
|
+
}
|
194
193
|
|
195
194
|
# Generate model from file (deserialization).
|
196
|
-
|
197
|
-
reader =
|
198
|
-
if opt['input'] == 'biointerchange.gff3' then
|
199
|
-
reader = BioInterchange::Genomics::GFF3Reader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
|
200
|
-
elsif opt['input'] == 'biointerchange.gvf' then
|
201
|
-
reader = BioInterchange::Genomics::GVFReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], opt['batchsize'])
|
202
|
-
elsif opt['input'] == 'dbcls.catanns.json' then
|
203
|
-
reader = BioInterchange::TextMining::PubAnnosJSONReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['version'])
|
204
|
-
elsif opt['input'] == 'uk.ac.man.pdfx' then
|
205
|
-
reader = BioInterchange::TextMining::PDFxXMLReader.new(opt['annotate_name'], opt['annotate_name_id'], opt['annotate_date'], BioInterchange::TextMining::Process::UNSPECIFIED, opt['annotate_version'])
|
206
|
-
end
|
195
|
+
reader_class, *args = Registry.reader(opt['input'])
|
196
|
+
reader = reader_class.new(*BioInterchange::get_parameters(map, args))
|
207
197
|
|
208
|
-
|
209
|
-
|
198
|
+
input_source = nil
|
199
|
+
if opt['file'] then
|
200
|
+
input_source = File.new(opt['file'], 'r')
|
210
201
|
else
|
211
202
|
input_source = STDIN
|
212
203
|
end
|
213
204
|
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
writer = BioInterchange::TextMining::RDFWriter.new(STDOUT) unless opt['out']
|
220
|
-
end
|
221
|
-
if opt['rdf'] == 'rdf.biointerchange.gff3' or opt['rdf'] == 'rdf.biointerchange.gvf' then
|
222
|
-
writer = BioInterchange::Genomics::RDFWriter.new(File.new(opt['out'], 'w')) if opt['out']
|
223
|
-
writer = BioInterchange::Genomics::RDFWriter.new(STDOUT) unless opt['out']
|
205
|
+
output_source = nil
|
206
|
+
if opt['out'] then
|
207
|
+
output_source = File.new(opt['out'], 'w')
|
208
|
+
else
|
209
|
+
output_source = STDOUT
|
224
210
|
end
|
211
|
+
|
212
|
+
# Generate rdf from model (serialization).
|
213
|
+
writer = Registry.writer(opt['rdf']).new(output_source)
|
225
214
|
|
226
215
|
begin
|
227
216
|
model = reader.deserialize(input_source)
|
@@ -270,6 +259,10 @@ module BioInterchange
|
|
270
259
|
|
271
260
|
private
|
272
261
|
|
262
|
+
def self.batching_not_supported
|
263
|
+
raise ArgumentError, 'Batching is not supported for this input/output format combination.'
|
264
|
+
end
|
265
|
+
|
273
266
|
def self.unsupported_combination
|
274
267
|
raise ArgumentError, 'This input/output format combination is not supported.'
|
275
268
|
end
|
@@ -15,6 +15,22 @@ module BioInterchange::Genomics
|
|
15
15
|
# - rdf.biointerchange.gvf
|
16
16
|
class RDFWriter < BioInterchange::Writer
|
17
17
|
|
18
|
+
# Register writers:
|
19
|
+
BioInterchange::Registry.register_writer(
|
20
|
+
'rdf.biointerchange.gff3',
|
21
|
+
BioInterchange::Genomics::RDFWriter,
|
22
|
+
[ 'biointerchange.gff3' ],
|
23
|
+
true,
|
24
|
+
'Generic Feature Format Version 3 Ontology (GFF3O) based RDFization'
|
25
|
+
)
|
26
|
+
BioInterchange::Registry.register_writer(
|
27
|
+
'rdf.biointerchange.gvf',
|
28
|
+
BioInterchange::Genomics::RDFWriter,
|
29
|
+
[ 'biointerchange.gvf' ],
|
30
|
+
true,
|
31
|
+
'Genome Variation Format Version 1 Ontology (GVF1O) based RDFization'
|
32
|
+
)
|
33
|
+
|
18
34
|
# Creates a new instance of a RDFWriter that will use the provided output stream to serialize RDF.
|
19
35
|
#
|
20
36
|
# +ostream+:: instance of an IO class or derivative that is used for RDF serialization
|
@@ -4,6 +4,20 @@ module BioInterchange::Genomics
|
|
4
4
|
|
5
5
|
class GFF3Reader < BioInterchange::Reader
|
6
6
|
|
7
|
+
# Register reader:
|
8
|
+
BioInterchange::Registry.register_reader(
|
9
|
+
'biointerchange.gff3',
|
10
|
+
GFF3Reader,
|
11
|
+
[ 'name', 'name_uri', 'date' ],
|
12
|
+
true,
|
13
|
+
'Generic Feature Format Version 3 (GFF3) reader',
|
14
|
+
[
|
15
|
+
[ 'date <date>', 'date when the GFF3 file was created (optional)' ],
|
16
|
+
[ 'name <name>', 'name of the GFF3 file creator (optional)' ],
|
17
|
+
[ 'name_id <id>', 'email address of the GFF3 file creator (optional)' ]
|
18
|
+
]
|
19
|
+
)
|
20
|
+
|
7
21
|
# Creates a new instance of a Generic Feature Format Version 3 (GFF3) reader.
|
8
22
|
#
|
9
23
|
# The reader supports batch processing.
|
@@ -94,7 +108,7 @@ protected
|
|
94
108
|
if type.match(/^SO:\d{7}$/) then
|
95
109
|
type = RDF::URI.new("http://www.sequenceontology.org/miso/current_release/term/#{feature.type}")
|
96
110
|
else
|
97
|
-
type = BioInterchange::
|
111
|
+
type = BioInterchange::SO.send(BioInterchange.make_safe_label(type))
|
98
112
|
end
|
99
113
|
rescue NoMethodError
|
100
114
|
raise BioInterchange::Exceptions::InputFormatError, "Type of feature is set to an unknown SOFA term: \"#{type}\""
|
@@ -154,15 +168,15 @@ protected
|
|
154
168
|
feature_set.set_pragma(name, { name => value.to_f })
|
155
169
|
elsif name == 'sequence-region' then
|
156
170
|
regions = feature_set.pragma(name)
|
157
|
-
regions = {} unless regions
|
171
|
+
regions = { name => {} } unless regions
|
158
172
|
seqid, start_coordinate, end_coordinate = value.split(/\s+/, 3)
|
159
|
-
regions[seqid] = BioInterchange::Genomics::GFF3Landmark.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
|
173
|
+
regions[name][seqid] = BioInterchange::Genomics::GFF3Landmark.new(seqid, start_coordinate.to_i, end_coordinate.to_i)
|
160
174
|
feature_set.set_pragma(name, regions)
|
161
175
|
elsif name == 'species' then
|
162
176
|
feature_set.set_pragma(name, { name => value })
|
163
177
|
else
|
164
178
|
# Unhandled pragma. Just save the value in its string form.
|
165
|
-
feature_set.set_pragma(name, value)
|
179
|
+
feature_set.set_pragma(name, { name => value })
|
166
180
|
end
|
167
181
|
end
|
168
182
|
|
@@ -2,6 +2,20 @@ module BioInterchange::Genomics
|
|
2
2
|
|
3
3
|
class GVFReader < GFF3Reader
|
4
4
|
|
5
|
+
# Register reader:
|
6
|
+
BioInterchange::Registry.register_reader(
|
7
|
+
'biointerchange.gvf',
|
8
|
+
GVFReader,
|
9
|
+
[ 'name', 'name_uri', 'date' ],
|
10
|
+
true,
|
11
|
+
'Genome Variation Format Version 1 (GVF) reader',
|
12
|
+
[
|
13
|
+
[ 'date <date>', 'date when the GVF file was created (optional)' ],
|
14
|
+
[ 'name <name>', 'name of the GVF file creator (optional)' ],
|
15
|
+
[ 'name_id <id>', 'email address of the GVF file creator (optional)' ]
|
16
|
+
]
|
17
|
+
)
|
18
|
+
|
5
19
|
# Creates a new instance of a Genome Variation Format (GVF) reader.
|
6
20
|
#
|
7
21
|
# +name+:: Optional name of the person who generated the GVF file.
|
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'rdf'
|
2
|
+
require 'rdf/ntriples'
|
3
|
+
|
4
|
+
module BioInterchange::Phylogenetics
|
5
|
+
|
6
|
+
# Serializes phylogenetic tree models (based on BioRuby's phylogenetic tree
# implementation) as RDF N-Triples, using terms of the Comparative Data
# Analysis Ontology (CDAO).
class CDAORDFWriter < BioInterchange::Writer

  # Register writers:
  BioInterchange::Registry.register_writer(
    'rdf.phylotastic.newick',
    CDAORDFWriter,
    [ 'phylotastic.newick' ],
    true,
    'Comparative Data Analysis Ontology (CDAO) based RDFization'
  )

  # Creates a new instance of a CDAORDFWriter that will use the provided output stream to serialize RDF.
  #
  # +ostream+:: instance of an IO class or derivative that is used for RDF serialization
  def initialize(ostream)
    @ostream = ostream
  end

  # Serialize a model as RDF. Every tree of the model is written as its own
  # RDF graph to the output stream.
  #
  # +model+:: a generic representation of input data that is an instance of BioInterchange::Phylogenetics::TreeSet
  def serialize(model)
    model.contents.each { |tree|
      serialize_model(model, tree)
    }
  end

protected

  # Serializes a single tree of the model and dumps the resulting graph as
  # N-Triples to the output stream.
  #
  # +model+:: tree set that provides the URI and the optional date for the serialization
  # +tree+:: tree of the model that is to be serialized
  def serialize_model(model, tree)
    graph = RDF::Graph.new
    graph.fast_ostream(@ostream) if BioInterchange::skip_rdf_graph
    tree_uri = RDF::URI.new(model.uri)
    if model.date then
      graph.insert(RDF::Statement.new(tree_uri, RDF::DC.date, RDF::Literal.new(model.date)))
    end
    serialize_tree(graph, tree, tree_uri, tree.root, true)
    RDF::NTriples::Writer.dump(graph, @ostream)
  end

  # Recursively serializes +node+ and all of its descendents.
  #
  # +graph+:: RDF graph that receives the generated statements
  # +tree+:: tree that is being traversed
  # +tree_uri+:: RDF::URI that identifies the tree
  # +node+:: node that is serialized by this call (may be nil for the root call when the tree has no root set)
  # +is_root+:: true only for the initial call that starts the traversal
  def serialize_tree(graph, tree, tree_uri, node, is_root)
    if is_root then
      graph.insert(RDF::Statement.new(tree_uri, RDF.type, BioInterchange::CDAO.NewickTree))
      # Whether the tree is rooted or unrooted is not stated here, since it
      # appears not to be determinable for Newick trees. If no root is set,
      # then pick the first node available to permit serialization of the tree:
      tree.root = node = tree.nodes.first unless tree.root
    end

    # An empty tree has no nodes that could be serialized.
    return unless node

    # The node URI is created only after a fallback root may have been chosen
    # above, so that it always refers to the node that is actually serialized.
    node_uri = RDF::URI.new("#{tree_uri.to_s}/node/#{node.object_id}")

    if node.name and not node.name.empty? then
      taxonomic_unit_uri = RDF::URI.new("#{tree_uri.to_s}/taxonomic_unit/#{node.object_id}")
      graph.insert(RDF::Statement.new(taxonomic_unit_uri, RDF.type, BioInterchange::CDAO.TU))
      graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.represents_TU, taxonomic_unit_uri))
      graph.insert(RDF::Statement.new(taxonomic_unit_uri, RDF::RDFS.label, RDF::Literal.new(node.name.gsub('_', ' '))))
    end

    descendents = tree.descendents(node)
    if descendents.empty? then
      graph.insert(RDF::Statement.new(node_uri, RDF.type, BioInterchange::CDAO.TerminalNode))
    else
      graph.insert(RDF::Statement.new(node_uri, RDF.type, BioInterchange::CDAO.AncestralNode))
    end

    # The parent is only looked up for non-root nodes.
    parent = tree.root == node ? nil : tree.parent(node)
    if parent then
      edge = tree.get_edge(parent, node)
      parent_uri = RDF::URI.new("#{tree_uri.to_s}/node/#{parent.object_id}")
      edge_uri = RDF::URI.new("#{tree_uri.to_s}/edge/#{edge.object_id}")
      annotation_uri = RDF::URI.new("#{tree_uri.to_s}/edge/#{edge.object_id}/annotation")
      graph.insert(RDF::Statement.new(edge_uri, RDF.type, BioInterchange::CDAO.DirectedEdge))
      graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.belongs_to_Tree, tree_uri))
      graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Parent_Node, parent_uri))
      graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Child_Node, node_uri))
      graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.belongs_to_Edge_as_Child, edge_uri))
      graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.has_Parent, parent_uri))
      graph.insert(RDF::Statement.new(parent_uri, BioInterchange::CDAO.belongs_to_Edge_as_Parent, edge_uri))

      # Support values (node distances) are deliberately not serialized; only
      # the edge length is attached to the edge as an annotation.
      graph.insert(RDF::Statement.new(edge_uri, BioInterchange::CDAO.has_Annotation, annotation_uri))
      graph.insert(RDF::Statement.new(annotation_uri, RDF.type, BioInterchange::CDAO.EdgeLength))
      graph.insert(RDF::Statement.new(annotation_uri, BioInterchange::CDAO.has_Value, RDF::Literal.new(edge.distance, :datatype => RDF::URI.new('http://www.w3.org/2001/XMLSchema#decimal'))))
    end

    # Only the node that the traversal started from is the root of the tree.
    graph.insert(RDF::Statement.new(tree_uri, BioInterchange::CDAO.has_Root, node_uri)) if is_root
    graph.insert(RDF::Statement.new(node_uri, BioInterchange::CDAO.belongs_to_Tree, tree_uri))

    # Now, continue traversing the tree by visiting the current node's descendents:
    descendents.each { |descendent_node|
      serialize_tree(graph, tree, tree_uri, descendent_node, false)
    }
  end

end
|
106
|
+
|
107
|
+
end
|
108
|
+
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'bio'
|
2
|
+
require 'date'
|
3
|
+
|
4
|
+
module BioInterchange::Phylogenetics
|
5
|
+
|
6
|
+
# Reader for phylogenetic trees in the Newick file format. Trees are parsed
# via BioRuby and collected in a BioInterchange::Phylogenetics::TreeSet.
class NewickReader < BioInterchange::Reader

  # Register reader:
  BioInterchange::Registry.register_reader(
    'phylotastic.newick',
    NewickReader,
    [ 'date' ],
    true,
    'Newick Tree File Format reader',
    [
      [ 'date <date>', 'date when the Newick file was created (optional)' ]
    ]
  )

  # Creates a new instance of a Newick file format reader.
  #
  # The reader supports batch processing.
  #
  # +date+:: Optional date of when the Newick file was produced, annotated, etc.
  # +batch_size+:: Optional integer that determines the number of trees that
  #                should be processed in one go.
  def initialize(date = nil, batch_size = nil)
    @date = date
    @batch_size = batch_size
    # Explicitly start out as "not postponed", so that +postponed?+ never
    # reads an uninitialized instance variable.
    @postponed = false
  end

  # Reads a Newick file from the input stream and returns an associated model.
  #
  # If this method is called when +postponed?+ returns true, then the reading will
  # continue from where it has been interrupted beforehand.
  #
  # +inputstream+:: an instance of class IO or String that holds the contents of a Newick file
  #
  # Raises BioInterchange::Exceptions::ImplementationReaderError if the input
  # is neither an IO nor a String.
  def deserialize(inputstream)
    if inputstream.kind_of?(IO)
      create_model(inputstream)
    elsif inputstream.kind_of?(String) then
      create_model(StringIO.new(inputstream))
    else
      raise BioInterchange::Exceptions::ImplementationReaderError, 'The provided input stream needs to be either of type IO or String.'
    end
  end

  # Returns true if the reading of the input was postponed due to a full batch.
  def postponed?
    @postponed
  end

protected

  # Builds (or, after a postponed read, refills) the tree set from the given
  # Newick input and returns it.
  #
  # +newick+:: IO-like object that provides the Newick formatted trees
  def create_model(newick)
    if @postponed then
      # Continue a previously interrupted read: keep the model (and its
      # date), but drop the trees of the previous batch.
      @postponed = false
      @trees.prune
    else
      @trees = BioInterchange::Phylogenetics::TreeSet.new()
      @trees.set_date(Date.parse(@date)) if @date
    end

    # Number of trees that were added to the model during this call.
    trees_in_batch = 0

    # NOTE(review): a resumed call wraps the stream in a new Bio::FlatFile;
    # confirm that no buffered input is lost between batches.
    tree_io = Bio::FlatFile.open(Bio::Newick, newick)
    while newick_tree = tree_io.next_entry
      newick_tree.options[:bootstrap_style] = :disabled
      @trees.add(newick_tree.tree)
      trees_in_batch += 1

      # Stop reading once the batch is full; the caller can resume later.
      if @batch_size && trees_in_batch >= @batch_size then
        @postponed = true
        break
      end
    end

    @trees
  end

end
|
79
|
+
|
80
|
+
end
|
81
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
|
3
|
+
module BioInterchange::Phylogenetics
|
4
|
+
|
5
|
+
# Model of a set of phylogenetic trees; one set may hold several trees.
class TreeSet < BioInterchange::Model

  # Date that is associated with the trees of this set, if one was provided
  # via +set_date+.
  attr_reader :date

  # Creates an empty tree set.
  def initialize
    # The trees themselves serve as hash keys (the values carry no meaning).
    @set = {}
  end

  # Returns the trees that are currently part of this set.
  def contents
    @set.keys
  end

  # Associates a date with the trees in this model (e.g., annotation date,
  # creation date, etc.).
  #
  # +date+:: an instance of Date that is associated with all trees in the model
  def set_date(date)
    @date = date
  end

  # Returns a URI for this tree set. The URI ends in a SHA1 hash of the
  # current time, which is not necessarily globally unique.
  def uri
    timestamp_digest = Digest::SHA1.hexdigest(Time.now.to_s)
    "biointerchange://phylogenetics/treeset/self/#{timestamp_digest}"
  end

  # Adds a tree to this set.
  #
  # +tree+:: BioRuby tree instance that becomes part of the contents of this tree set
  def add(tree)
    @set.store(tree, true)
  end

  # Drops all trees from the set, but retains additional data (e.g., the date).
  def prune
    @set.clear
  end

end
|
49
|
+
|
50
|
+
end
|
@@ -1,27 +1,69 @@
|
|
1
1
|
module BioInterchange
|
2
2
|
|
3
|
+
# A registry of Reader and Writer subclasses that also keeps track which
# Reader/Writer combinations can be used together. The registry makes it
# possible to implement readers and writers without the need to modify
# other BioInterchange framework code.
class Registry

  # Registry state. Initialized before any method definition, so that it is
  # obvious that the tables exist as soon as the class has been loaded.
  @@readers = {}
  @@writers = {}
  @@reader_batch_processors = {}
  @@writer_batch_processors = {}
  @@reader_descriptions = {}
  @@writer_descriptions = {}
  @@reader_help_texts = {}
  @@compatible_reader_writer_pairs = {}

  # Registers a reader implementation.
  #
  # +reader_id+:: identifier of the input format (e.g., 'biointerchange.gff3')
  # +reader_class+:: class that implements the reader
  # +parameters+:: array of parameter names that are stored alongside the reader class
  # +supports_batch_processing+:: true if the reader can process its input in batches
  # +descriptive_name+:: human readable description of the reader
  # +options_help+:: array of [ option, description ] pairs for the reader's options
  def self.register_reader(reader_id, reader_class, parameters, supports_batch_processing, descriptive_name, options_help)
    @@readers[reader_id] = [ reader_class ] + parameters
    # Always store the flag, so that a re-registration cannot leave a stale value behind.
    @@reader_batch_processors[reader_id] = supports_batch_processing ? true : false
    @@reader_descriptions[reader_id] = descriptive_name
    @@reader_help_texts[reader_id] = options_help
  end

  # Registers a writer implementation and the readers it is compatible with.
  #
  # +writer_id+:: identifier of the output format (e.g., 'rdf.biointerchange.gff3')
  # +writer_class+:: class that implements the writer
  # +compatible_reader_ids+:: array of reader identifiers whose models the writer can serialize
  # +supports_batch_processing+:: true if the writer can serialize models in batches
  # +descriptive_name+:: human readable description of the writer
  def self.register_writer(writer_id, writer_class, compatible_reader_ids, supports_batch_processing, descriptive_name)
    @@writers[writer_id] = writer_class
    # Always store the flag, so that a re-registration cannot leave a stale value behind.
    @@writer_batch_processors[writer_id] = supports_batch_processing ? true : false
    @@writer_descriptions[writer_id] = descriptive_name
    compatible_reader_ids.each { |reader_id|
      @@compatible_reader_writer_pairs["#{reader_id} #{writer_id}"] = true
    }
  end

  # Returns true if the given reader/writer combination has been registered
  # as compatible, false otherwise.
  def self.is_supported?(reader_id, writer_id)
    @@compatible_reader_writer_pairs["#{reader_id} #{writer_id}"] == true
  end

  # Returns true if both the reader and the writer support batch processing,
  # false otherwise (including unknown identifiers).
  def self.is_supporting_batch_processing?(reader_id, writer_id)
    @@reader_batch_processors[reader_id] == true && @@writer_batch_processors[writer_id] == true
  end

  # Returns an array whose first element is the reader class, followed by the
  # parameter names given at registration; nil for an unknown reader. A copy
  # is returned, so that callers cannot alter the registry's internal state.
  def self.reader(reader_id)
    entry = @@readers[reader_id]
    entry && entry.dup
  end

  # Returns the writer class that was registered for the identifier, or nil
  # for an unknown writer.
  def self.writer(writer_id)
    @@writers[writer_id]
  end

  # Returns a frozen copy of the reader identifier to description mapping.
  def self.reader_descriptions
    @@reader_descriptions.clone.freeze
  end

  # Returns a frozen copy of the writer identifier to description mapping.
  def self.writer_descriptions
    @@writer_descriptions.clone.freeze
  end

  # Returns a frozen copy of the option help entries of a reader. For readers
  # that are unknown, or that were registered without help entries, an empty
  # array is returned, so that callers can iterate over the result safely.
  def self.options_help(reader_id)
    (@@reader_help_texts[reader_id] || []).clone.freeze
  end

  # Returns a frozen, sorted array of [ reader_id, writer_id ] pairs of all
  # combinations that were registered as compatible.
  def self.reader_writer_pairs
    @@compatible_reader_writer_pairs.keys.sort.map { |reader_writer_pair| reader_writer_pair.split(/ /, 2) }.freeze
  end

end
|
27
69
|
|