bio 1.1.0 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/bioruby +4 -3
- data/lib/bio.rb +3 -3
- data/lib/bio/appl/blast/format0.rb +3 -2
- data/lib/bio/appl/blast/format8.rb +5 -3
- data/lib/bio/db/kegg/compound.rb +6 -1
- data/lib/bio/db/kegg/enzyme.rb +3 -3
- data/lib/bio/db/kegg/genes.rb +2 -2
- data/lib/bio/db/kegg/glycan.rb +5 -5
- data/lib/bio/db/kegg/orthology.rb +27 -3
- data/lib/bio/db/newick.rb +203 -55
- data/lib/bio/io/flatfile.rb +2 -2
- data/lib/bio/io/flatfile/indexer.rb +2 -2
- data/lib/bio/io/keggapi.rb +2 -1
- data/lib/bio/io/pubmed.rb +223 -81
- data/lib/bio/sequence/common.rb +6 -3
- data/lib/bio/shell/interface.rb +2 -2
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/bioruby_generator.rb +5 -5
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/templates/bioruby.css +7 -8
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/templates/bioruby.rhtml +1 -1
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/templates/index.rhtml +21 -17
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/templates/spinner.gif +0 -0
- data/test/functional/bio/io/test_ensembl.rb +87 -4
- data/test/unit/bio/db/test_newick.rb +238 -1
- data/test/unit/bio/sequence/test_aa.rb +3 -2
- data/test/unit/bio/sequence/test_common.rb +11 -2
- data/test/unit/bio/sequence/test_na.rb +63 -1
- metadata +4 -4
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/templates/bioruby-console.png +0 -0
data/bin/bioruby
CHANGED
@@ -2,17 +2,18 @@
|
|
2
2
|
#
|
3
3
|
# = BioRuby shell - command line interface for the BioRuby library
|
4
4
|
#
|
5
|
-
# Copyright:: Copyright (C) 2005, 2006
|
5
|
+
# Copyright:: Copyright (C) 2005, 2006, 2007
|
6
6
|
# Toshiaki Katayama <k@bioruby.org>
|
7
7
|
# License:: The Ruby License
|
8
8
|
#
|
9
|
-
# $Id: bioruby,v 1.
|
9
|
+
# $Id: bioruby,v 1.21 2007/07/26 10:46:46 k Exp $
|
10
10
|
#
|
11
11
|
|
12
12
|
begin
|
13
13
|
require 'rubygems'
|
14
|
-
|
14
|
+
gem 'bio', '>= 1.1.0'
|
15
15
|
rescue LoadError
|
16
|
+
require 'bio'
|
16
17
|
end
|
17
18
|
require 'bio/shell'
|
18
19
|
|
data/lib/bio.rb
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
#
|
2
2
|
# = bio.rb - Loading all BioRuby modules
|
3
3
|
#
|
4
|
-
# Copyright:: Copyright (C) 2001-
|
4
|
+
# Copyright:: Copyright (C) 2001-2007
|
5
5
|
# Toshiaki Katayama <k@bioruby.org>
|
6
6
|
# License:: The Ruby License
|
7
7
|
#
|
8
|
-
# $Id: bio.rb,v 1.
|
8
|
+
# $Id: bio.rb,v 1.87 2007/12/14 16:04:54 k Exp $
|
9
9
|
#
|
10
10
|
|
11
11
|
module Bio
|
12
12
|
|
13
|
-
BIORUBY_VERSION = [1,
|
13
|
+
BIORUBY_VERSION = [1, 2, 0].extend(Comparable)
|
14
14
|
|
15
15
|
### Basic data types
|
16
16
|
|
@@ -4,7 +4,7 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2003-2006 GOTO Naohisa <ng@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: format0.rb,v 1.
|
7
|
+
# $Id: format0.rb,v 1.24 2007/12/14 16:12:17 k Exp $
|
8
8
|
#
|
9
9
|
# == Description
|
10
10
|
#
|
@@ -860,9 +860,10 @@ module Bio
|
|
860
860
|
# Returns definition of the hit.
|
861
861
|
def definition; parse_hitname; @definition; end
|
862
862
|
|
863
|
+
def target_id; definition[/^\s*(\S+)/, 1]; end
|
864
|
+
|
863
865
|
#--
|
864
866
|
# Aliases to keep compatibility with Bio::Fasta::Report::Hit.
|
865
|
-
#alias target_id accession
|
866
867
|
alias target_def definition
|
867
868
|
alias target_len len
|
868
869
|
#++
|
@@ -1,10 +1,10 @@
|
|
1
1
|
#
|
2
2
|
# = bio/appl/blast/format8.rb - BLAST tab-delimited output (-m 8) parser
|
3
3
|
#
|
4
|
-
# Copyright:: Copyright (C) 2002, 2003 Toshiaki Katayama <k@bioruby.org>
|
4
|
+
# Copyright:: Copyright (C) 2002, 2003, 2007 Toshiaki Katayama <k@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: format8.rb,v 1.
|
7
|
+
# $Id: format8.rb,v 1.8 2007/12/14 16:15:20 k Exp $
|
8
8
|
#
|
9
9
|
# == Note
|
10
10
|
#
|
@@ -22,6 +22,7 @@ module Bio
|
|
22
22
|
@iterations.push(iteration)
|
23
23
|
@query_id = @query_def = data[/\S+/]
|
24
24
|
|
25
|
+
query_prev = ''
|
25
26
|
target_prev = ''
|
26
27
|
hit_num = 1
|
27
28
|
hsp_num = 1
|
@@ -29,7 +30,7 @@ module Bio
|
|
29
30
|
data.each do |line|
|
30
31
|
ary = line.chomp.split("\t")
|
31
32
|
query_id, target_id, hsp = tab_parse_hsp(ary)
|
32
|
-
if target_prev != target_id
|
33
|
+
if query_prev != query_id or target_prev != target_id
|
33
34
|
hit = Hit.new
|
34
35
|
hit.num = hit_num
|
35
36
|
hit_num += 1
|
@@ -41,6 +42,7 @@ module Bio
|
|
41
42
|
hsp.num = hsp_num
|
42
43
|
hsp_num += 1
|
43
44
|
hit.hsps.push(hsp)
|
45
|
+
query_prev = query_id
|
44
46
|
target_prev = target_id
|
45
47
|
end
|
46
48
|
end
|
data/lib/bio/db/kegg/compound.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2001, 2002, 2004, 2007 Toshiaki Katayama <k@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: compound.rb,v 0.
|
7
|
+
# $Id: compound.rb,v 0.17 2007/11/27 07:09:43 k Exp $
|
8
8
|
#
|
9
9
|
|
10
10
|
require 'bio/db'
|
@@ -45,6 +45,11 @@ class COMPOUND < KEGGDB
|
|
45
45
|
field_fetch('MASS').to_f
|
46
46
|
end
|
47
47
|
|
48
|
+
# REMARK
|
49
|
+
def remark
|
50
|
+
field_fetch('REMARK')
|
51
|
+
end
|
52
|
+
|
48
53
|
# GLYCAN
|
49
54
|
def glycans
|
50
55
|
unless @data['GLYCAN']
|
data/lib/bio/db/kegg/enzyme.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2001, 2002, 2007 Toshiaki Katayama <k@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: enzyme.rb,v 0.
|
7
|
+
# $Id: enzyme.rb,v 0.12 2007/12/14 16:20:38 k Exp $
|
8
8
|
#
|
9
9
|
|
10
10
|
require 'bio/db'
|
@@ -106,9 +106,9 @@ class ENZYME < KEGGDB
|
|
106
106
|
lines_fetch('PATHWAY')
|
107
107
|
end
|
108
108
|
|
109
|
-
#
|
109
|
+
# ORTHOLOGY
|
110
110
|
def orthologs
|
111
|
-
lines_fetch('
|
111
|
+
lines_fetch('ORTHOLOGY')
|
112
112
|
end
|
113
113
|
|
114
114
|
# GENES
|
data/lib/bio/db/kegg/genes.rb
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
# Toshiaki Katayama <k@bioruby.org>
|
6
6
|
# License:: The Ruby License
|
7
7
|
#
|
8
|
-
# $Id: genes.rb,v 0.
|
8
|
+
# $Id: genes.rb,v 0.26 2007/12/14 16:20:38 k Exp $
|
9
9
|
#
|
10
10
|
#
|
11
11
|
# == KEGG GENES parser
|
@@ -137,7 +137,7 @@ class GENES < KEGGDB
|
|
137
137
|
end
|
138
138
|
|
139
139
|
def orthologs
|
140
|
-
lines_fetch('
|
140
|
+
lines_fetch('ORTHOLOGY')
|
141
141
|
end
|
142
142
|
|
143
143
|
def pathway
|
data/lib/bio/db/kegg/glycan.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2004 Toshiaki Katayama <k@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: glycan.rb,v 1.
|
7
|
+
# $Id: glycan.rb,v 1.7 2007/12/14 16:20:38 k Exp $
|
8
8
|
#
|
9
9
|
|
10
10
|
require 'bio/db'
|
@@ -94,12 +94,12 @@ class GLYCAN < KEGGDB
|
|
94
94
|
@data['ENZYME']
|
95
95
|
end
|
96
96
|
|
97
|
-
#
|
97
|
+
# ORTHOLOGY
|
98
98
|
def orthologs
|
99
|
-
unless @data['
|
100
|
-
@data['
|
99
|
+
unless @data['ORTHOLOGY']
|
100
|
+
@data['ORTHOLOGY'] = lines_fetch('ORTHOLOGY')
|
101
101
|
end
|
102
|
-
@data['
|
102
|
+
@data['ORTHOLOGY']
|
103
103
|
end
|
104
104
|
|
105
105
|
# COMMENT
|
@@ -5,7 +5,7 @@
|
|
5
5
|
# Copyright:: Copyright (C) 2003 Masumi Itoh <m@bioruby.org>
|
6
6
|
# License:: The Ruby License
|
7
7
|
#
|
8
|
-
# $Id: orthology.rb,v 1.
|
8
|
+
# $Id: orthology.rb,v 1.10 2007/12/14 16:19:54 k Exp $
|
9
9
|
#
|
10
10
|
|
11
11
|
require 'bio/db'
|
@@ -67,7 +67,7 @@ class ORTHOLOGY < KEGGDB
|
|
67
67
|
keggclass.scan(/\[PATH:(.*?)\]/).flatten
|
68
68
|
end
|
69
69
|
|
70
|
-
# Returns
|
70
|
+
# Returns an Array of a database name and entry IDs in DBLINKS field.
|
71
71
|
def dblinks
|
72
72
|
unless @data['DBLINKS']
|
73
73
|
@data['DBLINKS'] = lines_fetch('DBLINKS')
|
@@ -75,13 +75,37 @@ class ORTHOLOGY < KEGGDB
|
|
75
75
|
@data['DBLINKS']
|
76
76
|
end
|
77
77
|
|
78
|
-
# Returns a Hash of
|
78
|
+
# Returns a Hash of the DB name and an Array of entry IDs in DBLINKS field.
|
79
|
+
def dblinks_as_hash
|
80
|
+
hash = {}
|
81
|
+
dblinks.each do |line|
|
82
|
+
name, *list = line.split(/\s+/)
|
83
|
+
db = name.downcase.sub(/:/, '')
|
84
|
+
hash[db] = list
|
85
|
+
end
|
86
|
+
return hash
|
87
|
+
end
|
88
|
+
|
89
|
+
# Returns an Array of the organism ID and entry IDs in GENES field.
|
79
90
|
def genes
|
80
91
|
unless @data['GENES']
|
81
92
|
@data['GENES'] = lines_fetch('GENES')
|
82
93
|
end
|
83
94
|
@data['GENES']
|
84
95
|
end
|
96
|
+
|
97
|
+
# Returns a Hash of the organism ID and an Array of entry IDs in GENES field.
|
98
|
+
def genes_as_hash
|
99
|
+
hash = {}
|
100
|
+
genes.each do |line|
|
101
|
+
name, *list = line.split(/\s+/)
|
102
|
+
org = name.downcase.sub(/:/, '')
|
103
|
+
genes = list.map {|x| x.sub(/\(.*\)/, '')}
|
104
|
+
#names = list.map {|x| x.scan(/.*\((.*)\)/)}
|
105
|
+
hash[org] = genes
|
106
|
+
end
|
107
|
+
return hash
|
108
|
+
end
|
85
109
|
|
86
110
|
end # ORTHOLOGY
|
87
111
|
|
data/lib/bio/db/newick.rb
CHANGED
@@ -6,9 +6,19 @@
|
|
6
6
|
# Daniel Amelang <dan@amelang.net>
|
7
7
|
# License:: The Ruby License
|
8
8
|
#
|
9
|
-
# $Id: newick.rb,v 1.
|
9
|
+
# $Id: newick.rb,v 1.8 2007/12/12 16:06:22 ngoto Exp $
|
10
|
+
#
|
11
|
+
# == Description
|
12
|
+
#
|
13
|
+
# This file contains parser and formatter of Newick and NHX.
|
14
|
+
#
|
15
|
+
# == References
|
16
|
+
#
|
17
|
+
# * http://evolution.genetics.washington.edu/phylip/newick_doc.html
|
18
|
+
# * http://www.phylosoft.org/forester/NHX.html
|
10
19
|
#
|
11
20
|
|
21
|
+
require 'strscan'
|
12
22
|
require 'bio/tree'
|
13
23
|
|
14
24
|
module Bio
|
@@ -18,6 +28,7 @@ module Bio
|
|
18
28
|
# newick output
|
19
29
|
#+++
|
20
30
|
|
31
|
+
# default options
|
21
32
|
DEFAULT_OPTIONS =
|
22
33
|
{ :indent => ' ' }
|
23
34
|
|
@@ -32,10 +43,26 @@ module Bio
|
|
32
43
|
end
|
33
44
|
private :__get_option
|
34
45
|
|
46
|
+
|
47
|
+
# formats Newick label (unquoted_label or quoted_label)
|
48
|
+
def __to_newick_format_label(str, options)
|
49
|
+
if __get_option(:parser, options) == :naive then
|
50
|
+
return str.to_s
|
51
|
+
end
|
52
|
+
str = str.to_s
|
53
|
+
if /([\(\)\,\:\[\]\_\'\x00-\x1f\x7f])/ =~ str then
|
54
|
+
# quoted_label
|
55
|
+
return "\'" + str.gsub(/\'/, "\'\'") + "\'"
|
56
|
+
end
|
57
|
+
# unquoted_label
|
58
|
+
return str.gsub(/ /, '_')
|
59
|
+
end
|
60
|
+
private :__to_newick_format_label
|
61
|
+
|
35
62
|
# formats leaf
|
36
63
|
def __to_newick_format_leaf(node, edge, options)
|
37
64
|
|
38
|
-
label = get_node_name(node)
|
65
|
+
label = __to_newick_format_label(get_node_name(node), options)
|
39
66
|
|
40
67
|
dist = get_edge_distance_string(edge)
|
41
68
|
|
@@ -62,7 +89,7 @@ module Bio
|
|
62
89
|
# formats leaf for NHX
|
63
90
|
def __to_newick_format_leaf_NHX(node, edge, options)
|
64
91
|
|
65
|
-
label = get_node_name(node)
|
92
|
+
label = __to_newick_format_label(get_node_name(node), options)
|
66
93
|
|
67
94
|
dist = get_edge_distance_string(edge)
|
68
95
|
|
@@ -165,11 +192,14 @@ module Bio
|
|
165
192
|
# Returns a newick formatted string.
|
166
193
|
# If block is given, the order of the node is sorted
|
167
194
|
# (in the same manner as Enumerable#sort).
|
168
|
-
#
|
169
|
-
#
|
170
|
-
#
|
171
|
-
#
|
172
|
-
#
|
195
|
+
#
|
196
|
+
# Available options:
|
197
|
+
# <tt>:indent</tt>::
|
198
|
+
# indent string; set false to disable (default: ' ')
|
199
|
+
# <tt>:bootstrap_style</tt>::
|
200
|
+
# <tt>:disabled</tt> disables bootstrap representations.
|
201
|
+
# <tt>:traditional</tt> for traditional style.
|
202
|
+
# <tt>:molphy</tt> for Molphy style (default).
|
173
203
|
def output_newick(options = {}, &block) #:yields: node1, node2
|
174
204
|
root = @root
|
175
205
|
root ||= self.nodes.first
|
@@ -185,8 +215,11 @@ module Bio
|
|
185
215
|
# Returns a NHX (New Hampshire eXtended) formatted string.
|
186
216
|
# If block is given, the order of the node is sorted
|
187
217
|
# (in the same manner as Enumerable#sort).
|
188
|
-
#
|
189
|
-
#
|
218
|
+
#
|
219
|
+
# Available options:
|
220
|
+
# <tt>:indent</tt>::
|
221
|
+
# indent string; set false to disable (default: ' ')
|
222
|
+
#
|
190
223
|
def output_nhx(options = {}, &block) #:yields: node1, node2
|
191
224
|
root = @root
|
192
225
|
root ||= self.nodes.first
|
@@ -257,13 +290,28 @@ module Bio
|
|
257
290
|
# Creates a new Newick object.
|
258
291
|
# _options_ for parsing can be set.
|
259
292
|
#
|
260
|
-
#
|
261
|
-
#
|
262
|
-
#
|
293
|
+
# Available options:
|
294
|
+
# <tt>:bootstrap_style</tt>::
|
295
|
+
# <tt>:traditional</tt> for traditional bootstrap style,
|
296
|
+
# <tt>:molphy</tt> for molphy style,
|
297
|
+
# <tt>:disabled</tt> to ignore bootstrap strings.
|
298
|
+
# For details of default actions, please read the notes below.
|
299
|
+
# <tt>:parser</tt>::
|
300
|
+
# <tt>:naive</tt> for using naive parser, compatible with
|
301
|
+
# BioRuby 1.1.0, which ignores quoted strings and
|
302
|
+
# does not convert underscores to spaces.
|
303
|
+
#
|
304
|
+
# Notes for bootstrap style:
|
305
|
+
# Molphy-style bootstrap values may always be parsed, even if
|
306
|
+
# the <tt>options[:bootstrap_style]</tt> is set to
|
307
|
+
# <tt>:traditional</tt> or <tt>:disabled</tt>.
|
308
|
+
#
|
309
|
+
# Note for default or traditional bootstrap style:
|
310
|
+
# By default, if all of the internal node's names are numeric
|
263
311
|
# and there are no NHX and no molphy-style bootstrap values,
|
264
312
|
# the names of internal nodes are regarded as bootstrap values.
|
265
|
-
# options[:bootstrap_style] = :disabled or
|
266
|
-
# (or at least one NHX tag exists).
|
313
|
+
# <tt>options[:bootstrap_style] = :disabled</tt> or <tt>:molphy</tt>
|
314
|
+
# to disable the feature (or at least one NHX tag exists).
|
267
315
|
def initialize(str, options = nil)
|
268
316
|
str = str.sub(/\;(.*)/m, ';')
|
269
317
|
@original_string = str
|
@@ -308,57 +356,66 @@ module Bio
|
|
308
356
|
end
|
309
357
|
|
310
358
|
# Parses newick formatted leaf (or internal node) name.
|
311
|
-
def __parse_newick_leaf(
|
312
|
-
|
313
|
-
|
314
|
-
node.name =
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
edge.distance_string = $2 if $2 and !($2.strip.empty?)
|
325
|
-
else
|
326
|
-
node.name = str
|
359
|
+
def __parse_newick_leaf(leaf_tokens, node, edge, options)
|
360
|
+
t = leaf_tokens.shift
|
361
|
+
if !t.kind_of?(Symbol) then
|
362
|
+
node.name = t
|
363
|
+
t = leaf_tokens.shift
|
364
|
+
end
|
365
|
+
|
366
|
+
if t == :':' then
|
367
|
+
t = leaf_tokens.shift
|
368
|
+
if !t.kind_of?(Symbol) then
|
369
|
+
edge.distance_string = t if t and !(t.strip.empty?)
|
370
|
+
t = leaf_tokens.shift
|
371
|
+
end
|
327
372
|
end
|
328
373
|
|
329
|
-
|
330
|
-
|
374
|
+
if t == :'[' then
|
375
|
+
btokens = leaf_tokens
|
331
376
|
case __get_option(:original_format, options)
|
332
377
|
when :nhx
|
333
378
|
# regarded as NHX string which might be broken
|
334
|
-
__parse_nhx(
|
379
|
+
__parse_nhx(btokens, node, edge)
|
335
380
|
when :traditional
|
336
381
|
# simply ignored
|
337
382
|
else
|
338
|
-
case
|
383
|
+
case btokens[0].to_s.strip
|
384
|
+
when ''
|
385
|
+
# not automatically determined
|
339
386
|
when /\A\&\&NHX/
|
340
387
|
# NHX string
|
341
388
|
# force to set NHX mode
|
342
389
|
@options[:original_format] = :nhx
|
343
|
-
__parse_nhx(
|
390
|
+
__parse_nhx(btokens, node, edge)
|
344
391
|
else
|
345
392
|
# Molphy-style bootstrap values
|
346
393
|
# fall back to molphy mode if nothing has been determined yet
|
347
394
|
@options[:original_format] ||= :molphy
|
395
|
+
bstr = ''
|
396
|
+
while t = btokens.shift and t != :']'
|
397
|
+
bstr.concat t.to_s
|
398
|
+
end
|
348
399
|
node.bootstrap_string = bstr
|
349
|
-
end #case
|
400
|
+
end #case btokens[0]
|
350
401
|
end
|
351
402
|
end
|
352
403
|
|
404
|
+
if !btokens and !leaf_tokens.empty? then
|
405
|
+
# syntax error?
|
406
|
+
end
|
407
|
+
node.name ||= '' # compatibility for older BioRuby
|
408
|
+
|
353
409
|
# returns true
|
354
410
|
true
|
355
411
|
end
|
356
412
|
|
357
413
|
# Parses NHX (New Hampshire eXtended) string
|
358
|
-
def __parse_nhx(
|
359
|
-
|
360
|
-
|
361
|
-
|
414
|
+
def __parse_nhx(btokens, node, edge)
|
415
|
+
btokens.shift if btokens[0] == '&&NHX'
|
416
|
+
btokens.each do |str|
|
417
|
+
break if str == :']'
|
418
|
+
next if str.kind_of?(Symbol)
|
362
419
|
tag, val = str.split(/\=/, 2)
|
363
420
|
case tag
|
364
421
|
when 'B'
|
@@ -391,6 +448,97 @@ module Bio
|
|
391
448
|
true
|
392
449
|
end
|
393
450
|
|
451
|
+
# splits string to tokens
|
452
|
+
def __parse_newick_tokenize(str, options)
|
453
|
+
str = str.chop if str[-1..-1] == ';'
|
454
|
+
# http://evolution.genetics.washington.edu/phylip/newick_doc.html
|
455
|
+
# quoted_label ==> ' string_of_printing_characters '
|
456
|
+
# single quote in quoted_label is '' (two single quotes)
|
457
|
+
#
|
458
|
+
|
459
|
+
if __get_option(:parser, options) == :naive then
|
460
|
+
ary = str.split(/([\(\)\,\:\[\]])/)
|
461
|
+
ary.collect! { |x| x.strip!; x.empty? ? nil : x }
|
462
|
+
ary.compact!
|
463
|
+
ary.collect! do |x|
|
464
|
+
if /\A([\(\)\,\:\[\]])\z/ =~ x then
|
465
|
+
x.intern
|
466
|
+
else
|
467
|
+
x
|
468
|
+
end
|
469
|
+
end
|
470
|
+
return ary
|
471
|
+
end
|
472
|
+
|
473
|
+
tokens = []
|
474
|
+
ss = StringScanner.new(str)
|
475
|
+
|
476
|
+
while !(ss.eos?)
|
477
|
+
if ss.scan(/\s+/) then
|
478
|
+
# do nothing
|
479
|
+
|
480
|
+
elsif ss.scan(/[\(\)\,\:\[\]]/) then
|
481
|
+
# '(' or ')' or ',' or ':' or '[' or ']'
|
482
|
+
t = ss.matched
|
483
|
+
tokens.push t.intern
|
484
|
+
|
485
|
+
elsif ss.scan(/\'/) then
|
486
|
+
# quoted_label
|
487
|
+
t = ''
|
488
|
+
while true
|
489
|
+
if ss.scan(/([^\']*)\'/) then
|
490
|
+
t.concat ss[1]
|
491
|
+
if ss.scan(/\'/) then
|
492
|
+
# single quote in quoted_label
|
493
|
+
t.concat ss.matched
|
494
|
+
else
|
495
|
+
break
|
496
|
+
end
|
497
|
+
else
|
498
|
+
# incomplete quoted_label?
|
499
|
+
break
|
500
|
+
end
|
501
|
+
end #while true
|
502
|
+
unless ss.match?(/\s*[\(\)\,\:\[\]]/) or ss.match?(/\s*\z/) then
|
503
|
+
# label continues? (illegal, but try to rescue)
|
504
|
+
if ss.scan(/[^\(\)\,\:\[\]]+/) then
|
505
|
+
t.concat ss.matched.lstrip
|
506
|
+
end
|
507
|
+
end
|
508
|
+
tokens.push t
|
509
|
+
|
510
|
+
elsif ss.scan(/[^\(\)\,\:\[\]]+/) then
|
511
|
+
# unquoted_label
|
512
|
+
t = ss.matched.strip
|
513
|
+
t.gsub!(/[\r\n]/, '')
|
514
|
+
# unquoted underscore should be converted to blank
|
515
|
+
t.gsub!(/\_/, ' ')
|
516
|
+
tokens.push t unless t.empty?
|
517
|
+
|
518
|
+
else
|
519
|
+
# unquoted_label in end of string
|
520
|
+
t = ss.rest.strip
|
521
|
+
t.gsub!(/[\r\n]/, '')
|
522
|
+
# unquoted underscore should be converted to blank
|
523
|
+
t.gsub!(/\_/, ' ')
|
524
|
+
tokens.push t unless t.empty?
|
525
|
+
ss.terminate
|
526
|
+
|
527
|
+
end
|
528
|
+
end #while !(ss.eos?)
|
529
|
+
|
530
|
+
tokens
|
531
|
+
end
|
532
|
+
|
533
|
+
# get tokens for a leaf
|
534
|
+
def __parse_newick_get_tokens_for_leaf(ary)
|
535
|
+
r = []
|
536
|
+
while t = ary[0] and t != :',' and t != :')' and t != :'('
|
537
|
+
r.push ary.shift
|
538
|
+
end
|
539
|
+
r
|
540
|
+
end
|
541
|
+
|
394
542
|
# Parses newick formatted string.
|
395
543
|
def __parse_newick(str, options = {})
|
396
544
|
# initializing
|
@@ -401,40 +549,37 @@ module Bio
|
|
401
549
|
internal_nodes = []
|
402
550
|
node_stack = []
|
403
551
|
# preparation of tokens
|
404
|
-
|
405
|
-
ary = str.split(/([\(\)\,])/)
|
406
|
-
ary.collect! { |x| x.strip!; x.empty? ? nil : x }
|
407
|
-
ary.compact!
|
552
|
+
ary = __parse_newick_tokenize(str, options)
|
408
553
|
previous_token = nil
|
409
554
|
# main loop
|
410
555
|
while token = ary.shift
|
411
556
|
#p token
|
412
557
|
case token
|
413
|
-
when ','
|
414
|
-
if previous_token == ',' or previous_token == '(' then
|
558
|
+
when :','
|
559
|
+
if previous_token == :',' or previous_token == :'(' then
|
415
560
|
# there is a leaf whose name is empty.
|
416
561
|
ary.unshift(token)
|
417
562
|
ary.unshift('')
|
418
563
|
token = nil
|
419
564
|
end
|
420
|
-
when '('
|
565
|
+
when :'('
|
421
566
|
node = Node.new
|
422
567
|
nodes << node
|
423
568
|
internal_nodes << node
|
424
569
|
node_stack.push(cur_node)
|
425
570
|
cur_node = node
|
426
|
-
when ')'
|
427
|
-
if previous_token == ',' or previous_token == '(' then
|
571
|
+
when :')'
|
572
|
+
if previous_token == :',' or previous_token == :'(' then
|
428
573
|
# there is a leaf whose name is empty.
|
429
574
|
ary.unshift(token)
|
430
575
|
ary.unshift('')
|
431
576
|
token = nil
|
432
577
|
else
|
433
578
|
edge = Edge.new
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
579
|
+
leaf_tokens = __parse_newick_get_tokens_for_leaf(ary)
|
580
|
+
token = nil
|
581
|
+
if leaf_tokens.size > 0 then
|
582
|
+
__parse_newick_leaf(leaf_tokens, cur_node, edge, options)
|
438
583
|
end
|
439
584
|
parent = node_stack.pop
|
440
585
|
raise ParseError, 'unmatched parentheses' unless parent
|
@@ -444,7 +589,10 @@ module Bio
|
|
444
589
|
else
|
445
590
|
leaf = Node.new
|
446
591
|
edge = Edge.new
|
447
|
-
|
592
|
+
ary.unshift(token)
|
593
|
+
leaf_tokens = __parse_newick_get_tokens_for_leaf(ary)
|
594
|
+
token = nil
|
595
|
+
__parse_newick_leaf(leaf_tokens, leaf, edge, options)
|
448
596
|
nodes << leaf
|
449
597
|
edges << Bio::Relation.new(cur_node, leaf, edge)
|
450
598
|
end #case
|