nexus_parser 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/MIT-LICENSE +20 -0
- data/README +13 -0
- data/README.rdoc +17 -0
- data/Rakefile +53 -0
- data/VERSION +1 -0
- data/init.rb +1 -0
- data/install.rb +1 -0
- data/lib/lexer.rb +66 -0
- data/lib/nexus_file.rb +282 -0
- data/lib/parser.rb +334 -0
- data/lib/tokens.rb +269 -0
- data/tasks/nexus_parser_tasks.rake +4 -0
- data/test/MX_test_03.nex +234 -0
- data/test/test.nex +382 -0
- data/test/test_nexus_parser.rb +937 -0
- data/uninstall.rb +1 -0
- metadata +82 -0
data/lib/parser.rb
ADDED
@@ -0,0 +1,334 @@
|
|
1
|
+
|
2
|
+
class NexusFile::Parser
|
3
|
+
|
4
|
+
# Creates a parser bound to a token source and a result builder.
#
# lexer   - object the parse_* methods call #peek(token_class) and
#           #pop(token_class) on.
# builder - object the parse_* methods hand parsed values to (add_var,
#           update_taxon, update_chr, code_row, add_note, ...).
def initialize(lexer, builder)
  @lexer, @builder = lexer, builder
end
|
8
|
+
|
9
|
+
# Parses a whole NEXUS file from @lexer.
#
# Pops the NexusStart token, then handles one block per BeginBlk token for as
# long as BeginBlk is the next token:
#   * Authors, Taxa, Characters and Notes blocks go to their parse_*_blk method;
#   * Sets, Labels, MesquiteCharModels, Assumptions and Codons blocks are
#     popped and discarded;
#   * a Trees block is popped and its raw value is kept in @foo.
# A BeginBlk followed by anything else is left alone, and the loop then stops
# at the first position where BeginBlk is not the next token.
#
# Returns nil.
def parse_file
  @lexer.pop(NexusFile::Tokens::NexusStart)

  while @lexer.peek(NexusFile::Tokens::BeginBlk)
    @lexer.pop(NexusFile::Tokens::BeginBlk)

    if @lexer.peek(NexusFile::Tokens::AuthorsBlk)
      parse_authors_blk # pops the AuthorsBlk token itself

    # blocks we parse
    elsif @lexer.peek(NexusFile::Tokens::TaxaBlk)
      @lexer.pop(NexusFile::Tokens::TaxaBlk)
      parse_taxa_blk

    elsif @lexer.peek(NexusFile::Tokens::ChrsBlk)
      @lexer.pop(NexusFile::Tokens::ChrsBlk)
      parse_characters_blk

    elsif @lexer.peek(NexusFile::Tokens::NotesBlk)
      @lexer.pop(NexusFile::Tokens::NotesBlk)
      parse_notes_blk

    # TODO: this block should eventually be parsed rather than dropped
    elsif @lexer.peek(NexusFile::Tokens::SetsBlk)
      @lexer.pop(NexusFile::Tokens::SetsBlk)

    # blocks we deliberately do not parse
    elsif @lexer.peek(NexusFile::Tokens::TreesBlk)
      # NOTE(review): @foo is not read anywhere in this file; kept in case
      # something outside this file inspects it.
      @foo = @lexer.pop(NexusFile::Tokens::TreesBlk).value

    elsif @lexer.peek(NexusFile::Tokens::LabelsBlk)
      @lexer.pop(NexusFile::Tokens::LabelsBlk)

    elsif @lexer.peek(NexusFile::Tokens::MqCharModelsBlk)
      @lexer.pop(NexusFile::Tokens::MqCharModelsBlk)

    elsif @lexer.peek(NexusFile::Tokens::AssumptionsBlk)
      @lexer.pop(NexusFile::Tokens::AssumptionsBlk)

    elsif @lexer.peek(NexusFile::Tokens::CodonsBlk)
      @lexer.pop(NexusFile::Tokens::CodonsBlk)
    end
  end
end
|
58
|
+
|
59
|
+
# just removes it for the time being
|
60
|
+
# Discards the Authors block.
#
# The block is popped as a single AuthorsBlk token and nothing is passed to
# the builder. An earlier draft (disabled) walked the block's ValuePair tokens
# into @builder.add_var; it was dropped because the block uses keys that are
# not single words, like "AUTHOR NAME".
#
# Returns the popped AuthorsBlk token.
def parse_authors_blk
  @lexer.pop(NexusFile::Tokens::AuthorsBlk)
end
|
80
|
+
|
81
|
+
# Parses the body of a Taxa block; the caller has already popped TaxaBlk.
#
# An optional Title is discarded and an optional Dimensions statement is
# handed to parse_dimensions. After that, until EndBlk:
#   * Taxlabels: every following Label is sent to @builder.update_taxon with a
#     zero-based :index, then an optional SemiColon is popped;
#   * MesquiteIDs / MesquiteBlockID: popped and discarded.
#
# Returns nil.
# Raises RuntimeError when the next token is none of the above. Previously
# this spun forever, because nothing was consumed on such a pass.
def parse_taxa_blk
  @lexer.pop(NexusFile::Tokens::Title) if @lexer.peek(NexusFile::Tokens::Title)

  # need to not ignore to test against
  parse_dimensions if @lexer.peek(NexusFile::Tokens::Dimensions)

  until @lexer.peek(NexusFile::Tokens::EndBlk)
    if @lexer.peek(NexusFile::Tokens::Taxlabels)
      @lexer.pop(NexusFile::Tokens::Taxlabels)
      i = 0
      while @lexer.peek(NexusFile::Tokens::Label)
        @builder.update_taxon(:index => i, :name => @lexer.pop(NexusFile::Tokens::Label).value)
        i += 1
      end
      # close of tax labels; placement of this seems dubious, but tests are working
      @lexer.pop(NexusFile::Tokens::SemiColon) if @lexer.peek(NexusFile::Tokens::SemiColon)

    elsif @lexer.peek(NexusFile::Tokens::MesquiteIDs)
      @lexer.pop(NexusFile::Tokens::MesquiteIDs) # trashing these for now

    elsif @lexer.peek(NexusFile::Tokens::MesquiteBlockID)
      @lexer.pop(NexusFile::Tokens::MesquiteBlockID)

    else
      raise "Unexpected token inside taxa block (expected TAXLABELS, IDS, BLOCKID or END;), parser cannot make progress in parse_taxa_blk"
    end
  end

  @lexer.pop(NexusFile::Tokens::EndBlk)
  nil
end
|
114
|
+
|
115
|
+
# Parses the body of a Characters block; the caller has already popped ChrsBlk.
#
# Until EndBlk is the next token, each pass tries these in order: Title
# (discarded), Dimensions, Format, CharStateLabels, Matrix (each handed to its
# parse_* method), MesquiteIDs and MesquiteBlockID (both discarded).
#
# Returns the popped EndBlk token.
# Raises RuntimeError when a full pass consumes nothing, i.e. the next token
# is not one this method handles. Previously that case looped forever.
#
# TODO: handle "\s*OPTIONS MSTAXA = UNCERTAIN;\s\n" within a characters block;
# the original note says it "sticks in an infinite loop right now".
def parse_characters_blk
  # [token class, parser method] in the order they are tried on every pass;
  # a nil method means "pop the token and throw it away".
  steps = [
    [NexusFile::Tokens::Title,           nil], # not used at present
    [NexusFile::Tokens::Dimensions,      :parse_dimensions],
    [NexusFile::Tokens::Format,          :parse_format],
    [NexusFile::Tokens::CharStateLabels, :parse_chr_state_labels],
    [NexusFile::Tokens::Matrix,          :parse_matrix],
    [NexusFile::Tokens::MesquiteIDs,     nil], # trashing these for now
    [NexusFile::Tokens::MesquiteBlockID, nil]  # trashing these for now
  ]

  until @lexer.peek(NexusFile::Tokens::EndBlk)
    progressed = false

    steps.each do |token, handler|
      next unless @lexer.peek(token)
      handler ? send(handler) : @lexer.pop(token)
      progressed = true
    end

    unless progressed
      raise "Unexpected token inside characters block, parser cannot make progress in parse_characters_blk"
    end
  end

  @lexer.pop(NexusFile::Tokens::EndBlk)
end
|
139
|
+
|
140
|
+
# prolly pop header then fuse with parse_dimensions
|
141
|
+
# Parses a FORMAT statement.
#
# Pops the Format keyword, passes the value of every following ValuePair token
# to @builder.add_var, then runs check_initialization_of_ntax_nchar.
#
# Returns whatever check_initialization_of_ntax_nchar returns.
def parse_format
  @lexer.pop(NexusFile::Tokens::Format)

  @builder.add_var(@lexer.pop(NexusFile::Tokens::ValuePair).value) while @lexer.peek(NexusFile::Tokens::ValuePair)

  check_initialization_of_ntax_nchar
end
|
149
|
+
|
150
|
+
# Parses a DIMENSIONS statement.
#
# Pops the Dimensions keyword, passes the value of every following ValuePair
# token to @builder.add_var, then runs check_initialization_of_ntax_nchar.
# The closing ";" is not popped here: the ValuePair pattern already consumes
# a trailing ";" along with the last pair.
#
# Returns whatever check_initialization_of_ntax_nchar returns.
def parse_dimensions
  @lexer.pop(NexusFile::Tokens::Dimensions)

  @builder.add_var(@lexer.pop(NexusFile::Tokens::ValuePair).value) while @lexer.peek(NexusFile::Tokens::ValuePair)

  check_initialization_of_ntax_nchar
end
|
159
|
+
|
160
|
+
# Creates placeholder characters and taxa once their counts are known.
#
# If vars[:nchar] is set and the nexus file's characters equal [],
# @builder.stub_chr is called nchar times. Likewise, if vars[:ntax] is set and
# taxa equal [], @builder.stub_taxon is called ntax times. Counts are read
# with to_i.
def check_initialization_of_ntax_nchar
  # check for character dimensions, if otherwise not set generate them
  if @builder.nexus_file.vars[:nchar] && @builder.nexus_file.characters == []
    (0..(@builder.nexus_file.vars[:nchar].to_i - 1)).each { @builder.stub_chr }
  end

  # check for taxa dimensions, if otherwise not set generate them
  if @builder.nexus_file.vars[:ntax] && @builder.nexus_file.taxa == []
    (0..(@builder.nexus_file.vars[:ntax].to_i - 1)).each { @builder.stub_taxon }
  end
end
|
171
|
+
|
172
|
+
# Parses a CHARSTATELABELS statement.
#
# After the keyword, entries are read until a SemiColon is next. Each entry is
#   <Number> [<Label name>] ["/"] [<state> ...] [","]
# and produces one @builder.update_chr(opts) call, where opts holds
#   :index => Number - 1 (zero-based), :name => the name ("" when absent),
#   "0", "1", ... => each state label, as a String, in reading order.
#
# A state may lex as either a Label or a Number. Each token now gets its own
# position; previously a Label directly followed by a Number was stored under
# the same key, so the Number silently overwrote the Label.
#
# Returns the popped closing SemiColon token.
def parse_chr_state_labels
  @lexer.pop(NexusFile::Tokens::CharStateLabels)

  until @lexer.peek(NexusFile::Tokens::SemiColon)
    opts = {}

    name = ""
    index = @lexer.pop(NexusFile::Tokens::Number).value.to_i
    name = @lexer.pop(NexusFile::Tokens::Label).value if @lexer.peek(NexusFile::Tokens::Label) # not always given a letter

    @lexer.pop(NexusFile::Tokens::BckSlash) if @lexer.peek(NexusFile::Tokens::BckSlash)

    # State labels. The old `!Comma || !SemiColon` guard around this loop was
    # always true and is dropped; the loop simply does nothing when the next
    # token is neither a Label nor a Number.
    i = 0
    while true
      if @lexer.peek(NexusFile::Tokens::Label)
        opts[i.to_s] = @lexer.pop(NexusFile::Tokens::Label).value
      elsif @lexer.peek(NexusFile::Tokens::Number)
        opts[i.to_s] = @lexer.pop(NexusFile::Tokens::Number).value.to_s
      else
        break
      end
      i += 1
    end

    @lexer.pop(NexusFile::Tokens::Comma) if @lexer.peek(NexusFile::Tokens::Comma) # we may also have hit semicolon

    opts.update({:index => (index - 1), :name => name})

    # name starts out as "", so with the visible Label implementation this
    # guard cannot currently trigger.
    raise(ParserError, "Error parsing character state labels for (or around) character #{index -1}.") unless opts[:name]
    @builder.update_chr(opts)
  end

  @lexer.pop(NexusFile::Tokens::SemiColon)
end
|
210
|
+
|
211
|
+
# Parses a MATRIX statement.
#
# After the keyword, rows are read until a SemiColon is next. Each row is a
# Label (the taxon name) followed by a RowVec (the coded states). For row n
# (zero-based) the builder receives update_taxon(:index => n, :name => label)
# and then code_row(n, row_vector_value).
#
# Returns the popped closing SemiColon token.
def parse_matrix
  @lexer.pop(NexusFile::Tokens::Matrix)

  row = 0
  until @lexer.peek(NexusFile::Tokens::SemiColon)
    taxon_name = @lexer.pop(NexusFile::Tokens::Label).value

    @builder.update_taxon(:index => row, :name => taxon_name) # if it exists its not re-added
    @builder.code_row(row, @lexer.pop(NexusFile::Tokens::RowVec).value)

    row += 1
  end

  @lexer.pop(NexusFile::Tokens::SemiColon) # pop the semicolon
end
|
229
|
+
|
230
|
+
# this suck(s/ed), it needs work when a better API for Mesquite comes out
|
231
|
+
# Parses the body of a Notes block; the caller has already popped NotesBlk.
#
# Notes are accumulated in @vars and handed to @builder.add_note:
#   * ValuePair - its key/value is merged into the current note;
#   * Label     - if the current note has no :type yet, the label becomes its
#                 :type; otherwise the current note is flushed to the builder
#                 and a new one is started (the label is left for the next
#                 pass);
#   * FileLbl   - popped, and :file => 'file' is recorded so the presence of
#                 the :file key can be checked later;
#   * EndBlk    - popped, the note in progress is flushed, and parsing stops.
#
# IMPORTANT - the "(CM <note>)" construct is not parsed here; the original
# note says the "(CM" ... ")" wrapper is stripped in NexusFile::Note.
#
# Returns nil.
# Raises RuntimeError after 100000 passes, as a guard against looping on a
# token none of the branches consume.
def parse_notes_blk
  @vars = {}
  passes = 0

  while true
    passes += 1
    raise "Either you have a gazillion notes or more likely parser is caught in an infinite loop inside parse_notes_block" if passes > 100000

    if @lexer.peek(NexusFile::Tokens::EndBlk)
      @lexer.pop(NexusFile::Tokens::EndBlk)
      @builder.add_note(@vars) # one still left to add
      break
    end

    if @lexer.peek(NexusFile::Tokens::ValuePair)
      @vars.update(@lexer.pop(NexusFile::Tokens::ValuePair).value)

    elsif @lexer.peek(NexusFile::Tokens::Label)
      if @vars[:type]
        # we have the data for this row: write it, and start a new one
        @builder.add_note(@vars)
        @vars = {}
      else
        @vars.update(:type => @lexer.pop(NexusFile::Tokens::Label).value)
      end

    elsif @lexer.peek(NexusFile::Tokens::FileLbl)
      @lexer.pop(NexusFile::Tokens::FileLbl)
      @vars.update(:file => 'file') # we check for whether :file key is present and handle conditionally
    end
  end
end
|
263
|
+
|
264
|
+
#@vars = {}
|
265
|
+
#while true
|
266
|
+
|
267
|
+
# break if @lexer.peek(NexusFile::Tokens::EndBlk)
|
268
|
+
|
269
|
+
# @vars.update(:type => @lexer.pop(NexusFile::Tokens::Label).value)
|
270
|
+
|
271
|
+
# kludge to get around the funny construct that references file
|
272
|
+
# if @lexer.peek(NexusFile::Tokens::FileLbl)
|
273
|
+
# @lexer.pop(NexusFile::Tokens::FileLbl)
|
274
|
+
# vars.update(:file => 'file') # we check for whether :file key is present and handle conditionally
|
275
|
+
# end
|
276
|
+
|
277
|
+
# while true
|
278
|
+
|
279
|
+
# meh = @lexer.pop(NexusFile::Tokens::ValuePair)
|
280
|
+
# @vars.update(meh.value)
|
281
|
+
# break if !@lexer.peek(NexusFile::Tokens::ValuePair)
|
282
|
+
# end
|
283
|
+
#
|
284
|
+
# @builder.add_note(@vars)
|
285
|
+
# @vars = {}
|
286
|
+
#end
|
287
|
+
# @lexer.pop(NexusFile::Tokens::EndBlk)
|
288
|
+
|
289
|
+
|
290
|
+
# Placeholder: nothing in this file calls it; parse_file pops Trees blocks as
# a single token. Returns true.
def parse_trees_blk
  true
end

# Placeholder: does nothing and returns nil. parse_file discards Labels blocks.
def parse_labels_blk
end

# Placeholder: does nothing and returns nil. parse_file discards Sets blocks.
def parse_sets_blk
end

# Placeholder: does nothing and returns nil. parse_file discards Assumptions blocks.
def parse_assumptions_blk
end

# Placeholder: does nothing and returns nil. Not likely to be implemented.
def parse_codens_blk
  # not likely
end

# Placeholder: does nothing and returns nil. Not likely to be implemented either.
def parse_mesquitecharmodels_blk
  # nor this
end

# Placeholder: does nothing and returns nil.
def parse_mesquite_blk
end
|
316
|
+
|
317
|
+
|
318
|
+
|
319
|
+
# def parse_children(parent)
|
320
|
+
# parse a comma-separated list of nodes
|
321
|
+
# while true
|
322
|
+
# parse_node(parent)
|
323
|
+
# if @lexer.peek(NexusFile::Tokens::Comma)
|
324
|
+
# @lexer.pop(NexusFile::Tokens::Comma)
|
325
|
+
# else
|
326
|
+
# break
|
327
|
+
# end
|
328
|
+
# end
|
329
|
+
# end
|
330
|
+
|
331
|
+
end
|
332
|
+
|
333
|
+
|
334
|
+
|
data/lib/tokens.rb
ADDED
@@ -0,0 +1,269 @@
|
|
1
|
+
module NexusFile::Tokens
|
2
|
+
|
3
|
+
# Base class for all lexer tokens.
#
# Each subclass stores its pattern in a class-level instance variable @regexp;
# an instance stores the text (or a value derived from it) in @value.
class Token
  # This allows access to the class attribute regexp without using a class
  # variable. It is nil on Token itself and on any subclass that sets none.
  class << self
    attr_reader :regexp
  end

  # The token's payload as stored by initialize.
  attr_reader :value

  # str - the text for this token; stored unchanged.
  def initialize(str)
    @value = str
  end
end
|
11
|
+
|
12
|
+
# in ruby, \A is needed if you want to only match at the beginning of the string, we need this everywhere, as we're
|
13
|
+
# moving along popping off
|
14
|
+
|
15
|
+
# The "#NEXUS" file marker (case-insensitive). Any text before it on the same
# line, and whitespace after it, is consumed with it.
class NexusStart < Token
  @regexp = /\A.*(\#nexus)\s*/i
end
|
18
|
+
|
19
|
+
# at present we strip comments pre-parser initialization, because they can be placed anywhere it gets tricky to parse otherwise, and besides, they are non-standard
|
20
|
+
# class NexusComment < Token
|
21
|
+
# @regexp = Regexp.new(/\A\s*(\[[^\]]*\])\s*/i)
|
22
|
+
# def initialize(str)
|
23
|
+
# str = str[1..-2] # strip the []
|
24
|
+
# str.strip!
|
25
|
+
# @value = str
|
26
|
+
# end
|
27
|
+
# end
|
28
|
+
|
29
|
+
# "Begin" keyword that opens a block (case-insensitive).
class BeginBlk < Token
  @regexp = /\A\s*(\s*Begin\s*)/i
end

# "End ;" that closes a block; whitespace may appear around "End" and ";".
class EndBlk < Token
  @regexp = /\A\s*([\s\n]*End[\s\n]*;[\s\n]*)/i
end

# A whole Authors block as one token: "Authors;" through the first "END;".
class AuthorsBlk < Token
  @regexp = /\A\s*(Authors;.*?END;)\s*/im
end

# "Taxa;" block label.
class TaxaBlk < Token
  @regexp = /\A\s*(\s*Taxa\s*;)\s*/i
end

# "Notes;" block label.
class NotesBlk < Token
  @regexp = /\A\s*(\s*Notes\s*;)\s*/i
end

# "File" keyword (used inside Notes blocks).
class FileLbl < Token
  @regexp = /\A\s*(\s*File\s*)\s*/i
end

# "title ...;" statement: the keyword and everything up to the next ";".
class Title < Token
  @regexp = /\A\s*(title[^\;]*;)\s*/i
end

# "DIMENSIONS" keyword.
class Dimensions < Token
  @regexp = /\A\s*(DIMENSIONS)\s*/i
end

# "format" keyword.
class Format < Token
  @regexp = /\A\s*(format)\s*/i
end

# "taxlabels" keyword.
class Taxlabels < Token
  @regexp = /\A\s*(\s*taxlabels\s*)\s*/i
end
|
73
|
+
|
74
|
+
# same as ID
|
75
|
+
# A label: a quoted phrase or a bare word.
#
# Matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT
# ""foo" ". Original note: choking on 'Foo_stuff_things'.
class Label < Token
  @regexp = Regexp.new('\A\s*((\'+[^\']+\'+)|(\"+[^\"]+\"+)|(\w[^,:(); \t\n]*|_)+)\s*')

  # str - the matched text. Surrounding whitespace is removed, then one layer
  #       of single quotes and one layer of double quotes (when the text
  #       starts with one), then whitespace again.
  #
  # Works on a copy: the caller's string is no longer modified in place, so a
  # frozen string is accepted too.
  def initialize(str)
    text = str.strip
    text = text[1..-2] if text[0..0] == "'" # get rid of quote marks
    text = text[1..-2] if text[0..0] == '"'
    @value = text.strip
  end
end
|
85
|
+
|
86
|
+
# "characters;" block label.
class ChrsBlk < Token
  @regexp = /\A\s*(characters\s*;)\s*/i
end
|
89
|
+
|
90
|
+
# note we grab EOL and ; here
|
91
|
+
# A "key = value" pair such as foo=bar or foo = 'b a ar'.
#
# The value may be single-quoted, double-quoted, parenthesised or a bare word.
# The pattern also consumes trailing whitespace and ";" after the pair (they
# are outside the capture group).
class ValuePair < Token
  @regexp = /\A\s*([\w\d\_\&]+\s*=\s*((\'[^\']+\')|(\(.*\))|(\"[^\"]+\")|([^\s\n\t;]+)))[\s\n\t;]+/i

  # str - the matched "key = value" text.
  #
  # Sets @value to a one-entry Hash: the key is stripped, downcased and turned
  # into a Symbol; the value is stripped and has one layer of surrounding
  # single or double quotes removed.
  #
  # The text is split on the first "=" only, so a value containing "=" (e.g.
  # symbols = "a=b") is kept whole instead of being cut at the second "=".
  # The caller's string is not modified.
  def initialize(str)
    key, val = str.strip.split(/=/, 2)
    val = val.strip
    val = val[1..-2] if val[0..0] == "'"
    val = val[1..-2] if val[0..0] == "\""
    @value = {key.strip.downcase.to_sym => val.strip}
  end
end
|
102
|
+
|
103
|
+
# "matrix" keyword.
class Matrix < Token
  @regexp = /\A\s*(matrix)\s*/i
end
|
106
|
+
|
107
|
+
# One row of coded character states: the rest of the current line.
class RowVec < Token
  @regexp = /\A\s*(.+)\s*\n/i

  # str - the row text.
  #
  # Sets @value to an Array built as follows (handles both () and {} style
  # multistates):
  #   1. the text is split on "(", ")", "{" and "}";
  #   2. a piece containing a comma, "|" or whitespace is split on those
  #      characters and, minus empty strings, appended as ONE nested Array;
  #   3. any other piece is stripped and appended character by character.
  def initialize(str)
    pieces = str.split(/\(|\)|\}|\{/).collect do |piece|
      piece =~ /[\,|\s]/ ? piece.split(/[\,|\s]/) : piece
    end

    @value = pieces.inject([]) do |row, piece|
      if piece.class == Array
        row << piece.delete_if { |state| state == "" }
      else
        row + piece.strip.split(//)
      end
    end
  end
end
|
116
|
+
|
117
|
+
# "CHARSTATELABELS" keyword.
class CharStateLabels < Token
  @regexp = /\A\s*(CHARSTATELABELS)\s*/i
end

# "IDS ...;" statement: the keyword and everything up to the next ";".
class MesquiteIDs < Token
  @regexp = /\A\s*(IDS[^;]*;)\s*/i
end

# "BLOCKID ...;" statement: the keyword and everything up to the next ";".
class MesquiteBlockID < Token
  @regexp = /\A\s*(BLOCKID[^;]*;)\s*/i
end
|
128
|
+
|
129
|
+
# unparsed blocks
|
130
|
+
|
131
|
+
# Unparsed blocks: each pattern below swallows a whole block, from its label
# through the first following "END;", as a single token.

# Whole Trees block.
class TreesBlk < Token
  @regexp = /\A\s*(trees;.*?END;)\s*/im # note the multi-line /m
end

# Whole Sets block.
class SetsBlk < Token
  @regexp = /\A\s*(sets;.*?END;)\s*/im
end

# Whole MESQUITECHARMODELS block.
class MqCharModelsBlk < Token
  @regexp = /\A\s*(MESQUITECHARMODELS;.*?END;)\s*/im
end

# Whole Labels block.
class LabelsBlk < Token
  @regexp = /\A\s*(LABELS;.*?END;)\s*/im
end

# Whole Assumptions block.
class AssumptionsBlk < Token
  @regexp = /\A\s*(ASSUMPTIONS;.*?END;)\s*/im
end

# Whole Codons block.
class CodonsBlk < Token
  @regexp = /\A\s*(CODONS;.*?END;)\s*/im
end

# Whole Mesquite block.
class MesquiteBlk < Token
  @regexp = /\A\s*(Mesquite;.*?END;)\s*/im
end

# A bare "END;" (no whitespace allowed between END and ";", unlike EndBlk).
class BlkEnd < Token
  @regexp = /\A[\s\n]*(END;)\s*/i
end
|
162
|
+
|
163
|
+
# "[" with optional surrounding whitespace.
class LBracket < Token
  @regexp = Regexp.new('\A\s*(\[)\s*')
end

# "]" with optional surrounding whitespace.
class RBracket < Token
  @regexp = Regexp.new('\A\s*(\])\s*')
end

# "(" with optional surrounding whitespace.
class LParen < Token
  @regexp = Regexp.new('\A\s*(\()\s*')
end

# ")" with optional surrounding whitespace.
class RParen < Token
  @regexp = Regexp.new('\A\s*(\))\s*')
end

# "=" with optional surrounding whitespace.
class Equals < Token
  @regexp = Regexp.new('\A\s*(=)\s*')
end

# "/" with optional surrounding whitespace. Despite the class name, the
# pattern matches a forward slash.
class BckSlash < Token
  @regexp = Regexp.new('\A\s*(\/)\s*')
end
|
186
|
+
|
187
|
+
# labels
|
188
|
+
# An identifier: a single-quoted phrase or a bare word. Per the original notes
# this is a lesser Label and is slated for removal.
class ID < Token
  @regexp = Regexp.new('\A\s*((\'[^\']+\')|(\w[^,:(); \t\n]*|_)+)\s*')

  # str - the matched text. Surrounding whitespace and one layer of single
  #       quotes are removed.
  #
  # Works on a copy: the caller's string is no longer modified in place.
  def initialize(str)
    text = str.strip
    text = text[1..-2] if text[0..0] == "'" # get rid of quote marks
    @value = text
  end
end
|
196
|
+
|
197
|
+
# ":" with optional surrounding whitespace.
class Colon < Token
  @regexp = Regexp.new('\A\s*(:)\s*')
end

# ";" with optional surrounding whitespace.
class SemiColon < Token
  @regexp = Regexp.new('\A\s*(;)\s*')
end

# "," with optional surrounding whitespace.
class Comma < Token
  @regexp = Regexp.new('\A\s*(\,)\s*')
end
|
208
|
+
|
209
|
+
# A number: optional minus sign, digits, optional fraction, optional exponent.
class Number < Token
  @regexp = Regexp.new('\A\s*(-?\d+(\.\d+)?([eE][+-]?\d+)?)\s*')

  # str - the matched numeric text.
  #
  # Sets @value to a Float when the text has a decimal point or an exponent,
  # otherwise to an Integer. A little oddness here: in some cases we don't
  # want to include the ".0" (see issues with numbers as labels), which is why
  # plain digit runs stay Integers.
  #
  # The exponent check matters because the pattern accepts text like "1e5"
  # with no decimal point, and "1e5".to_i is 1.
  def initialize(str)
    @value = str =~ /[.eE]/ ? str.to_f : str.to_i
  end
end
|
222
|
+
|
223
|
+
# NexusFile::Tokens::NexusComment
|
224
|
+
|
225
|
+
# this list also defines priority, i.e. if tokens have overlap (which they shouldn't!!) then the earlier indexed token will match first
|
226
|
+
# Returns a new Array of every token class, in matching priority order.
#
# This list also defines priority: if tokens overlap (which they shouldn't!!)
# then the earlier indexed token will match first. Known ordering notes from
# the original list:
#   * ValuePair has bad overlap with Label and likely ID;
#   * Label must come before RowVec;
#   * ID is a lesser Label and needs to be trashed.
def self.nexus_file_token_list
  t = NexusFile::Tokens

  [
    t::NexusStart,
    t::BeginBlk,
    t::EndBlk,
    t::AuthorsBlk,
    t::SetsBlk,
    t::MqCharModelsBlk,
    t::AssumptionsBlk,
    t::CodonsBlk,
    t::MesquiteBlk,
    t::TreesBlk,
    t::LabelsBlk,
    t::TaxaBlk,
    t::NotesBlk,
    t::Title,
    t::Taxlabels,
    t::Dimensions,
    t::FileLbl,
    t::Format,
    t::Equals,
    t::ValuePair, # bad overlap with Label and likely ID
    t::CharStateLabels,
    t::ChrsBlk,
    t::Number,
    t::Matrix,
    t::SemiColon,
    t::MesquiteIDs,
    t::MesquiteBlockID,
    t::BlkEnd,
    t::Colon,
    t::BckSlash,
    t::Comma,
    t::LParen,
    t::RParen,
    t::LBracket,
    t::RBracket,
    t::Label, # must be before RowVec
    t::RowVec,
    t::ID # need to trash this
  ]
end
|
267
|
+
|
268
|
+
end
|
269
|
+
|