bio 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/bioruby +4 -3
- data/lib/bio.rb +3 -3
- data/lib/bio/appl/blast/format0.rb +3 -2
- data/lib/bio/appl/blast/format8.rb +5 -3
- data/lib/bio/db/kegg/compound.rb +6 -1
- data/lib/bio/db/kegg/enzyme.rb +3 -3
- data/lib/bio/db/kegg/genes.rb +2 -2
- data/lib/bio/db/kegg/glycan.rb +5 -5
- data/lib/bio/db/kegg/orthology.rb +27 -3
- data/lib/bio/db/newick.rb +203 -55
- data/lib/bio/io/flatfile.rb +2 -2
- data/lib/bio/io/flatfile/indexer.rb +2 -2
- data/lib/bio/io/keggapi.rb +2 -1
- data/lib/bio/io/pubmed.rb +223 -81
- data/lib/bio/sequence/common.rb +6 -3
- data/lib/bio/shell/interface.rb +2 -2
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/bioruby_generator.rb +5 -5
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/templates/bioruby.css +7 -8
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/templates/bioruby.rhtml +1 -1
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/templates/index.rhtml +21 -17
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/templates/spinner.gif +0 -0
- data/test/functional/bio/io/test_ensembl.rb +87 -4
- data/test/unit/bio/db/test_newick.rb +238 -1
- data/test/unit/bio/sequence/test_aa.rb +3 -2
- data/test/unit/bio/sequence/test_common.rb +11 -2
- data/test/unit/bio/sequence/test_na.rb +63 -1
- metadata +4 -4
- data/lib/bio/shell/rails/vendor/plugins/generators/bioruby/templates/bioruby-console.png +0 -0
data/bin/bioruby
CHANGED
@@ -2,17 +2,18 @@
|
|
2
2
|
#
|
3
3
|
# = BioRuby shell - command line interface for the BioRuby library
|
4
4
|
#
|
5
|
-
# Copyright:: Copyright (C) 2005, 2006
|
5
|
+
# Copyright:: Copyright (C) 2005, 2006, 2007
|
6
6
|
# Toshiaki Katayama <k@bioruby.org>
|
7
7
|
# License:: The Ruby License
|
8
8
|
#
|
9
|
-
# $Id: bioruby,v 1.
|
9
|
+
# $Id: bioruby,v 1.21 2007/07/26 10:46:46 k Exp $
|
10
10
|
#
|
11
11
|
|
12
12
|
begin
|
13
13
|
require 'rubygems'
|
14
|
-
|
14
|
+
gem 'bio', '>= 1.1.0'
|
15
15
|
rescue LoadError
|
16
|
+
require 'bio'
|
16
17
|
end
|
17
18
|
require 'bio/shell'
|
18
19
|
|
data/lib/bio.rb
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
#
|
2
2
|
# = bio.rb - Loading all BioRuby modules
|
3
3
|
#
|
4
|
-
# Copyright:: Copyright (C) 2001-
|
4
|
+
# Copyright:: Copyright (C) 2001-2007
|
5
5
|
# Toshiaki Katayama <k@bioruby.org>
|
6
6
|
# License:: The Ruby License
|
7
7
|
#
|
8
|
-
# $Id: bio.rb,v 1.
|
8
|
+
# $Id: bio.rb,v 1.87 2007/12/14 16:04:54 k Exp $
|
9
9
|
#
|
10
10
|
|
11
11
|
module Bio
|
12
12
|
|
13
|
-
BIORUBY_VERSION = [1,
|
13
|
+
BIORUBY_VERSION = [1, 2, 0].extend(Comparable)
|
14
14
|
|
15
15
|
### Basic data types
|
16
16
|
|
@@ -4,7 +4,7 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2003-2006 GOTO Naohisa <ng@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: format0.rb,v 1.
|
7
|
+
# $Id: format0.rb,v 1.24 2007/12/14 16:12:17 k Exp $
|
8
8
|
#
|
9
9
|
# == Description
|
10
10
|
#
|
@@ -860,9 +860,10 @@ module Bio
|
|
860
860
|
# Returns definition of the hit.
|
861
861
|
def definition; parse_hitname; @definition; end
|
862
862
|
|
863
|
+
def target_id; definition[/^\s*(\S+)/, 1]; end
|
864
|
+
|
863
865
|
#--
|
864
866
|
# Aliases to keep compatibility with Bio::Fasta::Report::Hit.
|
865
|
-
#alias target_id accession
|
866
867
|
alias target_def definition
|
867
868
|
alias target_len len
|
868
869
|
#++
|
@@ -1,10 +1,10 @@
|
|
1
1
|
#
|
2
2
|
# = bio/appl/blast/format8.rb - BLAST tab-delimited output (-m 8) parser
|
3
3
|
#
|
4
|
-
# Copyright:: Copyright (C) 2002, 2003 Toshiaki Katayama <k@bioruby.org>
|
4
|
+
# Copyright:: Copyright (C) 2002, 2003, 2007 Toshiaki Katayama <k@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: format8.rb,v 1.
|
7
|
+
# $Id: format8.rb,v 1.8 2007/12/14 16:15:20 k Exp $
|
8
8
|
#
|
9
9
|
# == Note
|
10
10
|
#
|
@@ -22,6 +22,7 @@ module Bio
|
|
22
22
|
@iterations.push(iteration)
|
23
23
|
@query_id = @query_def = data[/\S+/]
|
24
24
|
|
25
|
+
query_prev = ''
|
25
26
|
target_prev = ''
|
26
27
|
hit_num = 1
|
27
28
|
hsp_num = 1
|
@@ -29,7 +30,7 @@ module Bio
|
|
29
30
|
data.each do |line|
|
30
31
|
ary = line.chomp.split("\t")
|
31
32
|
query_id, target_id, hsp = tab_parse_hsp(ary)
|
32
|
-
if target_prev != target_id
|
33
|
+
if query_prev != query_id or target_prev != target_id
|
33
34
|
hit = Hit.new
|
34
35
|
hit.num = hit_num
|
35
36
|
hit_num += 1
|
@@ -41,6 +42,7 @@ module Bio
|
|
41
42
|
hsp.num = hsp_num
|
42
43
|
hsp_num += 1
|
43
44
|
hit.hsps.push(hsp)
|
45
|
+
query_prev = query_id
|
44
46
|
target_prev = target_id
|
45
47
|
end
|
46
48
|
end
|
data/lib/bio/db/kegg/compound.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2001, 2002, 2004, 2007 Toshiaki Katayama <k@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: compound.rb,v 0.
|
7
|
+
# $Id: compound.rb,v 0.17 2007/11/27 07:09:43 k Exp $
|
8
8
|
#
|
9
9
|
|
10
10
|
require 'bio/db'
|
@@ -45,6 +45,11 @@ class COMPOUND < KEGGDB
|
|
45
45
|
field_fetch('MASS').to_f
|
46
46
|
end
|
47
47
|
|
48
|
+
# REMARK
|
49
|
+
def remark
|
50
|
+
field_fetch('REMARK')
|
51
|
+
end
|
52
|
+
|
48
53
|
# GLYCAN
|
49
54
|
def glycans
|
50
55
|
unless @data['GLYCAN']
|
data/lib/bio/db/kegg/enzyme.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2001, 2002, 2007 Toshiaki Katayama <k@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: enzyme.rb,v 0.
|
7
|
+
# $Id: enzyme.rb,v 0.12 2007/12/14 16:20:38 k Exp $
|
8
8
|
#
|
9
9
|
|
10
10
|
require 'bio/db'
|
@@ -106,9 +106,9 @@ class ENZYME < KEGGDB
|
|
106
106
|
lines_fetch('PATHWAY')
|
107
107
|
end
|
108
108
|
|
109
|
-
#
|
109
|
+
# ORTHOLOGY
|
110
110
|
def orthologs
|
111
|
-
lines_fetch('
|
111
|
+
lines_fetch('ORTHOLOGY')
|
112
112
|
end
|
113
113
|
|
114
114
|
# GENES
|
data/lib/bio/db/kegg/genes.rb
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
# Toshiaki Katayama <k@bioruby.org>
|
6
6
|
# License:: The Ruby License
|
7
7
|
#
|
8
|
-
# $Id: genes.rb,v 0.
|
8
|
+
# $Id: genes.rb,v 0.26 2007/12/14 16:20:38 k Exp $
|
9
9
|
#
|
10
10
|
#
|
11
11
|
# == KEGG GENES parser
|
@@ -137,7 +137,7 @@ class GENES < KEGGDB
|
|
137
137
|
end
|
138
138
|
|
139
139
|
def orthologs
|
140
|
-
lines_fetch('
|
140
|
+
lines_fetch('ORTHOLOGY')
|
141
141
|
end
|
142
142
|
|
143
143
|
def pathway
|
data/lib/bio/db/kegg/glycan.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
# Copyright:: Copyright (C) 2004 Toshiaki Katayama <k@bioruby.org>
|
5
5
|
# License:: The Ruby License
|
6
6
|
#
|
7
|
-
# $Id: glycan.rb,v 1.
|
7
|
+
# $Id: glycan.rb,v 1.7 2007/12/14 16:20:38 k Exp $
|
8
8
|
#
|
9
9
|
|
10
10
|
require 'bio/db'
|
@@ -94,12 +94,12 @@ class GLYCAN < KEGGDB
|
|
94
94
|
@data['ENZYME']
|
95
95
|
end
|
96
96
|
|
97
|
-
#
|
97
|
+
# ORTHOLOGY
|
98
98
|
def orthologs
|
99
|
-
unless @data['
|
100
|
-
@data['
|
99
|
+
unless @data['ORTHOLOGY']
|
100
|
+
@data['ORTHOLOGY'] = lines_fetch('ORTHOLOGY')
|
101
101
|
end
|
102
|
-
@data['
|
102
|
+
@data['ORTHOLOGY']
|
103
103
|
end
|
104
104
|
|
105
105
|
# COMMENT
|
@@ -5,7 +5,7 @@
|
|
5
5
|
# Copyright:: Copyright (C) 2003 Masumi Itoh <m@bioruby.org>
|
6
6
|
# License:: The Ruby License
|
7
7
|
#
|
8
|
-
# $Id: orthology.rb,v 1.
|
8
|
+
# $Id: orthology.rb,v 1.10 2007/12/14 16:19:54 k Exp $
|
9
9
|
#
|
10
10
|
|
11
11
|
require 'bio/db'
|
@@ -67,7 +67,7 @@ class ORTHOLOGY < KEGGDB
|
|
67
67
|
keggclass.scan(/\[PATH:(.*?)\]/).flatten
|
68
68
|
end
|
69
69
|
|
70
|
-
# Returns
|
70
|
+
# Returns an Array of a database name and entry IDs in DBLINKS field.
|
71
71
|
def dblinks
|
72
72
|
unless @data['DBLINKS']
|
73
73
|
@data['DBLINKS'] = lines_fetch('DBLINKS')
|
@@ -75,13 +75,37 @@ class ORTHOLOGY < KEGGDB
|
|
75
75
|
@data['DBLINKS']
|
76
76
|
end
|
77
77
|
|
78
|
-
# Returns a Hash of
|
78
|
+
# Returns a Hash of the DB name and an Array of entry IDs in DBLINKS field.
|
79
|
+
def dblinks_as_hash
|
80
|
+
hash = {}
|
81
|
+
dblinks.each do |line|
|
82
|
+
name, *list = line.split(/\s+/)
|
83
|
+
db = name.downcase.sub(/:/, '')
|
84
|
+
hash[db] = list
|
85
|
+
end
|
86
|
+
return hash
|
87
|
+
end
|
88
|
+
|
89
|
+
# Returns an Array of the organism ID and entry IDs in GENES field.
|
79
90
|
def genes
|
80
91
|
unless @data['GENES']
|
81
92
|
@data['GENES'] = lines_fetch('GENES')
|
82
93
|
end
|
83
94
|
@data['GENES']
|
84
95
|
end
|
96
|
+
|
97
|
+
# Returns a Hash of the organism ID and an Array of entry IDs in GENES field.
|
98
|
+
def genes_as_hash
|
99
|
+
hash = {}
|
100
|
+
genes.each do |line|
|
101
|
+
name, *list = line.split(/\s+/)
|
102
|
+
org = name.downcase.sub(/:/, '')
|
103
|
+
genes = list.map {|x| x.sub(/\(.*\)/, '')}
|
104
|
+
#names = list.map {|x| x.scan(/.*\((.*)\)/)}
|
105
|
+
hash[org] = genes
|
106
|
+
end
|
107
|
+
return hash
|
108
|
+
end
|
85
109
|
|
86
110
|
end # ORTHOLOGY
|
87
111
|
|
data/lib/bio/db/newick.rb
CHANGED
@@ -6,9 +6,19 @@
|
|
6
6
|
# Daniel Amelang <dan@amelang.net>
|
7
7
|
# License:: The Ruby License
|
8
8
|
#
|
9
|
-
# $Id: newick.rb,v 1.
|
9
|
+
# $Id: newick.rb,v 1.8 2007/12/12 16:06:22 ngoto Exp $
|
10
|
+
#
|
11
|
+
# == Description
|
12
|
+
#
|
13
|
+
# This file contains a parser and a formatter for the Newick and NHX formats.
|
14
|
+
#
|
15
|
+
# == References
|
16
|
+
#
|
17
|
+
# * http://evolution.genetics.washington.edu/phylip/newick_doc.html
|
18
|
+
# * http://www.phylosoft.org/forester/NHX.html
|
10
19
|
#
|
11
20
|
|
21
|
+
require 'strscan'
|
12
22
|
require 'bio/tree'
|
13
23
|
|
14
24
|
module Bio
|
@@ -18,6 +28,7 @@ module Bio
|
|
18
28
|
# newick output
|
19
29
|
#+++
|
20
30
|
|
31
|
+
# default options
|
21
32
|
DEFAULT_OPTIONS =
|
22
33
|
{ :indent => ' ' }
|
23
34
|
|
@@ -32,10 +43,26 @@ module Bio
|
|
32
43
|
end
|
33
44
|
private :__get_option
|
34
45
|
|
46
|
+
|
47
|
+
# formats Newick label (unquoted_label or quoted_label)
|
48
|
+
def __to_newick_format_label(str, options)
|
49
|
+
if __get_option(:parser, options) == :naive then
|
50
|
+
return str.to_s
|
51
|
+
end
|
52
|
+
str = str.to_s
|
53
|
+
if /([\(\)\,\:\[\]\_\'\x00-\x1f\x7f])/ =~ str then
|
54
|
+
# quoted_label
|
55
|
+
return "\'" + str.gsub(/\'/, "\'\'") + "\'"
|
56
|
+
end
|
57
|
+
# unquoted_label
|
58
|
+
return str.gsub(/ /, '_')
|
59
|
+
end
|
60
|
+
private :__to_newick_format_label
|
61
|
+
|
35
62
|
# formats leaf
|
36
63
|
def __to_newick_format_leaf(node, edge, options)
|
37
64
|
|
38
|
-
label = get_node_name(node)
|
65
|
+
label = __to_newick_format_label(get_node_name(node), options)
|
39
66
|
|
40
67
|
dist = get_edge_distance_string(edge)
|
41
68
|
|
@@ -62,7 +89,7 @@ module Bio
|
|
62
89
|
# formats leaf for NHX
|
63
90
|
def __to_newick_format_leaf_NHX(node, edge, options)
|
64
91
|
|
65
|
-
label = get_node_name(node)
|
92
|
+
label = __to_newick_format_label(get_node_name(node), options)
|
66
93
|
|
67
94
|
dist = get_edge_distance_string(edge)
|
68
95
|
|
@@ -165,11 +192,14 @@ module Bio
|
|
165
192
|
# Returns a newick formatted string.
|
166
193
|
# If block is given, the order of the node is sorted
|
167
194
|
# (in the same manner as Enumerable#sort).
|
168
|
-
#
|
169
|
-
#
|
170
|
-
#
|
171
|
-
#
|
172
|
-
#
|
195
|
+
#
|
196
|
+
# Available options:
|
197
|
+
# <tt>:indent</tt>::
|
198
|
+
# indent string; set false to disable (default: ' ')
|
199
|
+
# <tt>:bootstrap_style</tt>::
|
200
|
+
# <tt>:disabled</tt> disables bootstrap representations.
|
201
|
+
# <tt>:traditional</tt> for traditional style.
|
202
|
+
# <tt>:molphy</tt> for Molphy style (default).
|
173
203
|
def output_newick(options = {}, &block) #:yields: node1, node2
|
174
204
|
root = @root
|
175
205
|
root ||= self.nodes.first
|
@@ -185,8 +215,11 @@ module Bio
|
|
185
215
|
# Returns a NHX (New Hampshire eXtended) formatted string.
|
186
216
|
# If block is given, the order of the node is sorted
|
187
217
|
# (in the same manner as Enumerable#sort).
|
188
|
-
#
|
189
|
-
#
|
218
|
+
#
|
219
|
+
# Available options:
|
220
|
+
# <tt>:indent</tt>::
|
221
|
+
# indent string; set false to disable (default: ' ')
|
222
|
+
#
|
190
223
|
def output_nhx(options = {}, &block) #:yields: node1, node2
|
191
224
|
root = @root
|
192
225
|
root ||= self.nodes.first
|
@@ -257,13 +290,28 @@ module Bio
|
|
257
290
|
# Creates a new Newick object.
|
258
291
|
# _options_ for parsing can be set.
|
259
292
|
#
|
260
|
-
#
|
261
|
-
#
|
262
|
-
#
|
293
|
+
# Available options:
|
294
|
+
# <tt>:bootstrap_style</tt>::
|
295
|
+
# <tt>:traditional</tt> for traditional bootstrap style,
|
296
|
+
# <tt>:molphy</tt> for molphy style,
|
297
|
+
# <tt>:disabled</tt> to ignore bootstrap strings.
|
298
|
+
# For details of default actions, please read the notes below.
|
299
|
+
# <tt>:parser</tt>::
|
300
|
+
# <tt>:naive</tt> for using naive parser, compatible with
|
301
|
+
# BioRuby 1.1.0, which ignores quoted strings and
|
302
|
+
# does not convert underscores to spaces.
|
303
|
+
#
|
304
|
+
# Notes for bootstrap style:
|
305
|
+
# Molphy-style bootstrap values may always be parsed, even if
|
306
|
+
# the <tt>options[:bootstrap_style]</tt> is set to
|
307
|
+
# <tt>:traditional</tt> or <tt>:disabled</tt>.
|
308
|
+
#
|
309
|
+
# Note for default or traditional bootstrap style:
|
310
|
+
# By default, if all of the internal node's names are numeric
|
263
311
|
# and there are no NHX and no molphy-style bootstrap values,
|
264
312
|
# the names of internal nodes are regarded as bootstrap values.
|
265
|
-
# options[:bootstrap_style] = :disabled or
|
266
|
-
# (or at least one NHX tag exists).
|
313
|
+
# <tt>options[:bootstrap_style] = :disabled</tt> or <tt>:molphy</tt>
|
314
|
+
# to disable the feature (or at least one NHX tag exists).
|
267
315
|
def initialize(str, options = nil)
|
268
316
|
str = str.sub(/\;(.*)/m, ';')
|
269
317
|
@original_string = str
|
@@ -308,57 +356,66 @@ module Bio
|
|
308
356
|
end
|
309
357
|
|
310
358
|
# Parses newick formatted leaf (or internal node) name.
|
311
|
-
def __parse_newick_leaf(
|
312
|
-
|
313
|
-
|
314
|
-
node.name =
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
edge.distance_string = $2 if $2 and !($2.strip.empty?)
|
325
|
-
else
|
326
|
-
node.name = str
|
359
|
+
def __parse_newick_leaf(leaf_tokens, node, edge, options)
|
360
|
+
t = leaf_tokens.shift
|
361
|
+
if !t.kind_of?(Symbol) then
|
362
|
+
node.name = t
|
363
|
+
t = leaf_tokens.shift
|
364
|
+
end
|
365
|
+
|
366
|
+
if t == :':' then
|
367
|
+
t = leaf_tokens.shift
|
368
|
+
if !t.kind_of?(Symbol) then
|
369
|
+
edge.distance_string = t if t and !(t.strip.empty?)
|
370
|
+
t = leaf_tokens.shift
|
371
|
+
end
|
327
372
|
end
|
328
373
|
|
329
|
-
|
330
|
-
|
374
|
+
if t == :'[' then
|
375
|
+
btokens = leaf_tokens
|
331
376
|
case __get_option(:original_format, options)
|
332
377
|
when :nhx
|
333
378
|
# regarded as NHX string which might be broken
|
334
|
-
__parse_nhx(
|
379
|
+
__parse_nhx(btokens, node, edge)
|
335
380
|
when :traditional
|
336
381
|
# simply ignored
|
337
382
|
else
|
338
|
-
case
|
383
|
+
case btokens[0].to_s.strip
|
384
|
+
when ''
|
385
|
+
# not automatically determined
|
339
386
|
when /\A\&\&NHX/
|
340
387
|
# NHX string
|
341
388
|
# force to set NHX mode
|
342
389
|
@options[:original_format] = :nhx
|
343
|
-
__parse_nhx(
|
390
|
+
__parse_nhx(btokens, node, edge)
|
344
391
|
else
|
345
392
|
# Molphy-style bootstrap values
|
346
393
|
# fall back to molphy mode if no format has been determined yet
|
347
394
|
@options[:original_format] ||= :molphy
|
395
|
+
bstr = ''
|
396
|
+
while t = btokens.shift and t != :']'
|
397
|
+
bstr.concat t.to_s
|
398
|
+
end
|
348
399
|
node.bootstrap_string = bstr
|
349
|
-
end #case
|
400
|
+
end #case btokens[0]
|
350
401
|
end
|
351
402
|
end
|
352
403
|
|
404
|
+
if !btokens and !leaf_tokens.empty? then
|
405
|
+
# syntax error?
|
406
|
+
end
|
407
|
+
node.name ||= '' # compatibility for older BioRuby
|
408
|
+
|
353
409
|
# returns true
|
354
410
|
true
|
355
411
|
end
|
356
412
|
|
357
413
|
# Parses NHX (New Hampshire eXtended) string
|
358
|
-
def __parse_nhx(
|
359
|
-
|
360
|
-
|
361
|
-
|
414
|
+
def __parse_nhx(btokens, node, edge)
|
415
|
+
btokens.shift if btokens[0] == '&&NHX'
|
416
|
+
btokens.each do |str|
|
417
|
+
break if str == :']'
|
418
|
+
next if str.kind_of?(Symbol)
|
362
419
|
tag, val = str.split(/\=/, 2)
|
363
420
|
case tag
|
364
421
|
when 'B'
|
@@ -391,6 +448,97 @@ module Bio
|
|
391
448
|
true
|
392
449
|
end
|
393
450
|
|
451
|
+
# splits string to tokens
|
452
|
+
def __parse_newick_tokenize(str, options)
|
453
|
+
str = str.chop if str[-1..-1] == ';'
|
454
|
+
# http://evolution.genetics.washington.edu/phylip/newick_doc.html
|
455
|
+
# quoted_label ==> ' string_of_printing_characters '
|
456
|
+
# single quote in quoted_label is '' (two single quotes)
|
457
|
+
#
|
458
|
+
|
459
|
+
if __get_option(:parser, options) == :naive then
|
460
|
+
ary = str.split(/([\(\)\,\:\[\]])/)
|
461
|
+
ary.collect! { |x| x.strip!; x.empty? ? nil : x }
|
462
|
+
ary.compact!
|
463
|
+
ary.collect! do |x|
|
464
|
+
if /\A([\(\)\,\:\[\]])\z/ =~ x then
|
465
|
+
x.intern
|
466
|
+
else
|
467
|
+
x
|
468
|
+
end
|
469
|
+
end
|
470
|
+
return ary
|
471
|
+
end
|
472
|
+
|
473
|
+
tokens = []
|
474
|
+
ss = StringScanner.new(str)
|
475
|
+
|
476
|
+
while !(ss.eos?)
|
477
|
+
if ss.scan(/\s+/) then
|
478
|
+
# do nothing
|
479
|
+
|
480
|
+
elsif ss.scan(/[\(\)\,\:\[\]]/) then
|
481
|
+
# '(' or ')' or ',' or ':' or '[' or ']'
|
482
|
+
t = ss.matched
|
483
|
+
tokens.push t.intern
|
484
|
+
|
485
|
+
elsif ss.scan(/\'/) then
|
486
|
+
# quoted_label
|
487
|
+
t = ''
|
488
|
+
while true
|
489
|
+
if ss.scan(/([^\']*)\'/) then
|
490
|
+
t.concat ss[1]
|
491
|
+
if ss.scan(/\'/) then
|
492
|
+
# single quote in quoted_label
|
493
|
+
t.concat ss.matched
|
494
|
+
else
|
495
|
+
break
|
496
|
+
end
|
497
|
+
else
|
498
|
+
# incomplete quoted_label?
|
499
|
+
break
|
500
|
+
end
|
501
|
+
end #while true
|
502
|
+
unless ss.match?(/\s*[\(\)\,\:\[\]]/) or ss.match?(/\s*\z/) then
|
503
|
+
# label continues? (illegal, but try to rescue)
|
504
|
+
if ss.scan(/[^\(\)\,\:\[\]]+/) then
|
505
|
+
t.concat ss.matched.lstrip
|
506
|
+
end
|
507
|
+
end
|
508
|
+
tokens.push t
|
509
|
+
|
510
|
+
elsif ss.scan(/[^\(\)\,\:\[\]]+/) then
|
511
|
+
# unquoted_label
|
512
|
+
t = ss.matched.strip
|
513
|
+
t.gsub!(/[\r\n]/, '')
|
514
|
+
# unquoted underscore should be converted to blank
|
515
|
+
t.gsub!(/\_/, ' ')
|
516
|
+
tokens.push t unless t.empty?
|
517
|
+
|
518
|
+
else
|
519
|
+
# unquoted_label in end of string
|
520
|
+
t = ss.rest.strip
|
521
|
+
t.gsub!(/[\r\n]/, '')
|
522
|
+
# unquoted underscore should be converted to blank
|
523
|
+
t.gsub!(/\_/, ' ')
|
524
|
+
tokens.push t unless t.empty?
|
525
|
+
ss.terminate
|
526
|
+
|
527
|
+
end
|
528
|
+
end #while !(ss.eos?)
|
529
|
+
|
530
|
+
tokens
|
531
|
+
end
|
532
|
+
|
533
|
+
# get tokens for a leaf
|
534
|
+
def __parse_newick_get_tokens_for_leaf(ary)
|
535
|
+
r = []
|
536
|
+
while t = ary[0] and t != :',' and t != :')' and t != :'('
|
537
|
+
r.push ary.shift
|
538
|
+
end
|
539
|
+
r
|
540
|
+
end
|
541
|
+
|
394
542
|
# Parses newick formatted string.
|
395
543
|
def __parse_newick(str, options = {})
|
396
544
|
# initializing
|
@@ -401,40 +549,37 @@ module Bio
|
|
401
549
|
internal_nodes = []
|
402
550
|
node_stack = []
|
403
551
|
# preparation of tokens
|
404
|
-
|
405
|
-
ary = str.split(/([\(\)\,])/)
|
406
|
-
ary.collect! { |x| x.strip!; x.empty? ? nil : x }
|
407
|
-
ary.compact!
|
552
|
+
ary = __parse_newick_tokenize(str, options)
|
408
553
|
previous_token = nil
|
409
554
|
# main loop
|
410
555
|
while token = ary.shift
|
411
556
|
#p token
|
412
557
|
case token
|
413
|
-
when ','
|
414
|
-
if previous_token == ',' or previous_token == '(' then
|
558
|
+
when :','
|
559
|
+
if previous_token == :',' or previous_token == :'(' then
|
415
560
|
# there is a leaf whose name is empty.
|
416
561
|
ary.unshift(token)
|
417
562
|
ary.unshift('')
|
418
563
|
token = nil
|
419
564
|
end
|
420
|
-
when '('
|
565
|
+
when :'('
|
421
566
|
node = Node.new
|
422
567
|
nodes << node
|
423
568
|
internal_nodes << node
|
424
569
|
node_stack.push(cur_node)
|
425
570
|
cur_node = node
|
426
|
-
when ')'
|
427
|
-
if previous_token == ',' or previous_token == '(' then
|
571
|
+
when :')'
|
572
|
+
if previous_token == :',' or previous_token == :'(' then
|
428
573
|
# there is a leaf whose name is empty.
|
429
574
|
ary.unshift(token)
|
430
575
|
ary.unshift('')
|
431
576
|
token = nil
|
432
577
|
else
|
433
578
|
edge = Edge.new
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
579
|
+
leaf_tokens = __parse_newick_get_tokens_for_leaf(ary)
|
580
|
+
token = nil
|
581
|
+
if leaf_tokens.size > 0 then
|
582
|
+
__parse_newick_leaf(leaf_tokens, cur_node, edge, options)
|
438
583
|
end
|
439
584
|
parent = node_stack.pop
|
440
585
|
raise ParseError, 'unmatched parentheses' unless parent
|
@@ -444,7 +589,10 @@ module Bio
|
|
444
589
|
else
|
445
590
|
leaf = Node.new
|
446
591
|
edge = Edge.new
|
447
|
-
|
592
|
+
ary.unshift(token)
|
593
|
+
leaf_tokens = __parse_newick_get_tokens_for_leaf(ary)
|
594
|
+
token = nil
|
595
|
+
__parse_newick_leaf(leaf_tokens, leaf, edge, options)
|
448
596
|
nodes << leaf
|
449
597
|
edges << Bio::Relation.new(cur_node, leaf, edge)
|
450
598
|
end #case
|