nexus_parser 1.2.1 → 1.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: be7f8d6dc4a222f456df1bb18dc3d63182cfb83b88ee036c227a93883c5ff70a
4
- data.tar.gz: 1ab8785c3ca791476efe19d290ef25f20dc790792cf82fdee4ab1a0cd7468347
3
+ metadata.gz: 4229e2b23de12e3ef92bc88a83aa04805d3884ca09019aaab843846f58fef964
4
+ data.tar.gz: 7973b5f04b84eea945ce632e5b20844a82c02a9a90c3c18a5ae4bbdaa97376c8
5
5
  SHA512:
6
- metadata.gz: e2c206656a9c8a1760e158641923b47f789ef42156fd9486dd75f70f92db670f31308adf739355eca6192ae3c702f6868b04fcab8ab87e7e44590371b3838bf7
7
- data.tar.gz: fb4a0f18b0430dc04aa4feebd9d9ea46fb91b8557bc61efe7d75d4ad4427da1a0fa9f0632a3afe074fe5e47c17b18f9cd6a58786833b6c07a7181f819cece0d8
6
+ metadata.gz: cd2739e8dcf4b84287f325a6443227b0b669a45f38c23a20f32bf30cfe88ac7eb34b5a6af6b0929c9af7b55c21b9096e683543528858527920dccfadff10d425
7
+ data.tar.gz: 40780dadb8ddc80554ca199e6ea9f0ffb5672db51c66c1b41390a25cf4f4c39e2b27799f644a293135b848b9feb3af0fc4cab338e5fe7f40ba670dcaad384965
@@ -33,18 +33,8 @@ class NexusParser::Lexer
33
33
  if @next_token
34
34
  return @next_token
35
35
  else
36
- # check for a match on the specified class first
37
36
  if match(token_class)
38
37
  return @next_token
39
- else
40
- # now check all the tokens for a match
41
- NexusParser::Tokens.nexus_file_token_list.each {|t|
42
- return @next_token if match(t)
43
- }
44
- end
45
- # no match, either end of string or lex-error
46
- if @input != ''
47
- raise( NexusParser::ParseError, "Lex Error, unknown token at #{@input[0..10]}...", caller)
48
38
  else
49
39
  return nil
50
40
  end
@@ -1,6 +1,6 @@
1
1
 
2
2
  class NexusParser::Parser
3
-
3
+
4
4
  def initialize(lexer, builder)
5
5
  @lexer = lexer
6
6
  @builder = builder
@@ -10,41 +10,41 @@ class NexusParser::Parser
10
10
  # nf = @builder.new_nexus_file # create new local NexusParser instance, nf
11
11
  # blks = []
12
12
  @lexer.pop(NexusParser::Tokens::NexusStart)
13
-
13
+
14
14
  while @lexer.peek(NexusParser::Tokens::BeginBlk)
15
-
15
+
16
16
  @lexer.pop(NexusParser::Tokens::BeginBlk) # pop it
17
-
17
+
18
18
  if @lexer.peek(NexusParser::Tokens::AuthorsBlk)
19
19
  parse_authors_blk
20
-
21
- # we parse these below
20
+
21
+ # we parse these below
22
22
  elsif @lexer.peek(NexusParser::Tokens::TaxaBlk)
23
-
23
+
24
24
  @lexer.pop(NexusParser::Tokens::TaxaBlk )
25
25
  parse_taxa_blk
26
-
26
+
27
27
  elsif @lexer.peek(NexusParser::Tokens::ChrsBlk)
28
28
  @lexer.pop(NexusParser::Tokens::ChrsBlk)
29
29
  parse_characters_blk
30
30
 
31
31
  elsif @lexer.peek(NexusParser::Tokens::NotesBlk)
32
- @lexer.pop(NexusParser::Tokens::NotesBlk)
32
+ @lexer.pop(NexusParser::Tokens::NotesBlk)
33
33
  parse_notes_blk
34
34
 
35
35
  # we should parse this
36
36
  elsif @lexer.peek(NexusParser::Tokens::SetsBlk)
37
37
  @lexer.pop(NexusParser::Tokens::SetsBlk)
38
38
 
39
- # we don't parse these
39
+ # we don't parse these
40
40
  elsif @lexer.peek(NexusParser::Tokens::TreesBlk)
41
41
  @foo = @lexer.pop(NexusParser::Tokens::TreesBlk).value
42
-
42
+
43
43
  elsif @lexer.peek(NexusParser::Tokens::LabelsBlk)
44
44
  @lexer.pop(NexusParser::Tokens::LabelsBlk)
45
-
45
+
46
46
  elsif @lexer.peek(NexusParser::Tokens::MqCharModelsBlk)
47
- @lexer.pop(NexusParser::Tokens::MqCharModelsBlk)
47
+ @lexer.pop(NexusParser::Tokens::MqCharModelsBlk)
48
48
 
49
49
  elsif @lexer.peek(NexusParser::Tokens::AssumptionsBlk)
50
50
  @lexer.pop(NexusParser::Tokens::AssumptionsBlk)
@@ -52,7 +52,7 @@ class NexusParser::Parser
52
52
  elsif @lexer.peek(NexusParser::Tokens::CodonsBlk)
53
53
  @lexer.pop(NexusParser::Tokens::CodonsBlk)
54
54
  end
55
-
55
+
56
56
  end
57
57
  end
58
58
 
@@ -70,15 +70,15 @@ class NexusParser::Parser
70
70
 
71
71
  # while @lexer.peek(NexusParser::Tokens::ValuePair)
72
72
  # # IMPORTANT, these are going to a general hash, there may ultimately be overlap of keys used in different blocks, this is ignored at present
73
- # @builder.add_var(@lexer.pop(NexusParser::Tokens::ValuePair).value)
73
+ # @builder.add_var(@lexer.pop(NexusParser::Tokens::ValuePair).value)
74
74
  # end
75
-
75
+
76
76
  #@lexer.pop(NexusParser::Tokens::ID) if @lexer.peek(NexusParser::Tokens::ID)
77
77
  # end
78
78
  #end
79
79
  end
80
80
 
81
- def parse_taxa_blk
81
+ def parse_taxa_blk
82
82
  @lexer.pop(NexusParser::Tokens::Title) if @lexer.peek(NexusParser::Tokens::Title)
83
83
 
84
84
  # need to not ignore to test against
@@ -88,7 +88,7 @@ class NexusParser::Parser
88
88
  while true
89
89
  inf += 1
90
90
  raise(NexusParser::ParseError,"Either you have a gazillion taxa or more likely the parser is caught in an infinite loop trying to parser taxon labels. Check for double single quotes in this block.") if inf > 100000
91
-
91
+
92
92
  if @lexer.peek(NexusParser::Tokens::EndBlk)
93
93
  @lexer.pop(NexusParser::Tokens::EndBlk)
94
94
  break
@@ -98,51 +98,53 @@ class NexusParser::Parser
98
98
  @lexer.pop(NexusParser::Tokens::Taxlabels) if @lexer.peek(NexusParser::Tokens::Taxlabels)
99
99
  i = 0
100
100
  while @lexer.peek(NexusParser::Tokens::Label)
101
- @builder.update_taxon(:index => i, :name => @lexer.pop(NexusParser::Tokens::Label).value)
101
+ @builder.update_taxon(:index => i, :name => @lexer.pop(NexusParser::Tokens::Label).value)
102
102
  i += 1
103
- end
103
+ end
104
104
  @lexer.pop(NexusParser::Tokens::SemiColon) if @lexer.peek(NexusParser::Tokens::SemiColon) # close of tax labels, placement of this seems dubious... but tests are working
105
-
105
+
106
106
  elsif @lexer.peek(NexusParser::Tokens::MesquiteIDs)
107
107
 
108
108
  @lexer.pop(NexusParser::Tokens::MesquiteIDs) # trashing these for now
109
109
  elsif @lexer.peek(NexusParser::Tokens::MesquiteBlockID)
110
- @lexer.pop(NexusParser::Tokens::MesquiteBlockID)
110
+ @lexer.pop(NexusParser::Tokens::MesquiteBlockID)
111
111
  end
112
-
112
+
113
113
  end
114
114
  end
115
115
 
116
116
 
117
117
  end
118
118
 
119
- def parse_characters_blk
120
-
121
- inf = 0
119
+ def parse_characters_blk
120
+
121
+ inf = 0
122
122
  while true
123
123
  inf += 1
124
124
  raise(NexusParser::ParseError,"Either you have a gazillion characters or more likely the parser is caught in an infinite loop trying to parser character data. Check for double single quotes in this block.") if inf > 100000
125
125
 
126
126
  if @lexer.peek(NexusParser::Tokens::EndBlk) # we're at the end of the block, exit after geting rid of the semi-colon
127
- break
127
+ break
128
128
  else
129
129
  @lexer.pop(NexusParser::Tokens::Title) if @lexer.peek(NexusParser::Tokens::Title) # not used at present
130
130
  @lexer.pop(NexusParser::Tokens::LinkLine) if @lexer.peek(NexusParser::Tokens::LinkLine) # trashing these for now
131
-
131
+
132
132
  parse_dimensions if @lexer.peek(NexusParser::Tokens::Dimensions)
133
- parse_format if @lexer.peek(NexusParser::Tokens::Format)
134
-
133
+ parse_format if @lexer.peek(NexusParser::Tokens::Format)
134
+
135
135
  parse_chr_state_labels if @lexer.peek(NexusParser::Tokens::CharStateLabels)
136
136
 
137
- parse_matrix if @lexer.peek(NexusParser::Tokens::Matrix)
138
-
137
+ parse_chr_labels if @lexer.peek(NexusParser::Tokens::CharLabels)
138
+
139
+ parse_state_labels if @lexer.peek(NexusParser::Tokens::StateLabels)
140
+
141
+ parse_matrix if @lexer.peek(NexusParser::Tokens::Matrix)
142
+
139
143
  # handle "\s*OPTIONS MSTAXA = UNCERTAIN;\s\n" within a characters block (sticks in an infinite loop right now)
140
144
 
141
145
 
142
146
  @lexer.pop(NexusParser::Tokens::MesquiteIDs) if @lexer.peek(NexusParser::Tokens::MesquiteIDs) # trashing these for now
143
147
  @lexer.pop(NexusParser::Tokens::MesquiteBlockID) if @lexer.peek(NexusParser::Tokens::MesquiteBlockID) # trashing these for now
144
-
145
- false
146
148
  end
147
149
  end
148
150
  @lexer.pop(NexusParser::Tokens::EndBlk)
@@ -150,7 +152,7 @@ class NexusParser::Parser
150
152
 
151
153
  # prolly pop header then fuse with parse_dimensions
152
154
  def parse_format
153
- @lexer.pop(NexusParser::Tokens::Format)
155
+ @lexer.pop(NexusParser::Tokens::Format)
154
156
 
155
157
  while @lexer.peek(NexusParser::Tokens::ValuePair) || @lexer.peek(NexusParser::Tokens::RespectCase)
156
158
  @lexer.pop(NexusParser::Tokens::RespectCase) if @lexer.peek(NexusParser::Tokens::RespectCase) # !! TODO: nothing is set, respect case is ignored
@@ -160,13 +162,13 @@ class NexusParser::Parser
160
162
  check_initialization_of_ntax_nchar
161
163
  end
162
164
 
163
- def parse_dimensions
165
+ def parse_dimensions
164
166
  @lexer.pop(NexusParser::Tokens::Dimensions)
165
167
  while @lexer.peek(NexusParser::Tokens::ValuePair)
166
168
  @builder.add_var(@lexer.pop(NexusParser::Tokens::ValuePair).value)
167
169
  end
168
170
  # the last value pair with a ; is automagically handled, don't try popping it again
169
-
171
+
170
172
  check_initialization_of_ntax_nchar
171
173
  end
172
174
 
@@ -175,7 +177,7 @@ class NexusParser::Parser
175
177
  if @builder.nexus_file.vars[:nchar] && @builder.nexus_file.characters == []
176
178
  (0..(@builder.nexus_file.vars[:nchar].to_i - 1)).each {|i| @builder.stub_chr }
177
179
  end
178
-
180
+
179
181
  # check for taxa dimensions, if otherwise not set generate them
180
182
  if @builder.nexus_file.vars[:ntax] && @builder.nexus_file.taxa == []
181
183
  (0..(@builder.nexus_file.vars[:ntax].to_i - 1)).each {|i| @builder.stub_taxon }
@@ -184,45 +186,108 @@ class NexusParser::Parser
184
186
 
185
187
  def parse_chr_state_labels
186
188
  @lexer.pop(NexusParser::Tokens::CharStateLabels)
187
-
188
- inf = 0
189
+
190
+ inf = 0
189
191
  while true
190
192
  inf += 1
191
193
  raise(NexusParser::ParseError,"Either you have a gazillion character state labels or more likely the parser is caught in an infinite loop while trying to parser character state labels. Check for double single quotes in this block.") if inf > 100000
192
194
 
193
- if @lexer.peek(NexusParser::Tokens::SemiColon)
194
- break
195
+ if @lexer.peek(NexusParser::Tokens::SemiColon)
196
+ break
195
197
  else
196
198
  opts = {}
197
-
198
199
  name = ""
199
- index = @lexer.pop(NexusParser::Tokens::Number).value.to_i
200
- (name = @lexer.pop(NexusParser::Tokens::Label).value) if @lexer.peek(NexusParser::Tokens::Label) # not always given a letter
200
+
201
+ index = @lexer.pop(NexusParser::Tokens::PositiveInteger).value.to_i
202
+
203
+ (name = @lexer.pop(NexusParser::Tokens::CharacterLabel).value) if @lexer.peek(NexusParser::Tokens::CharacterLabel) # not always given a letter
201
204
 
202
205
  @lexer.pop(NexusParser::Tokens::BckSlash) if @lexer.peek(NexusParser::Tokens::BckSlash)
203
206
 
204
207
  if !@lexer.peek(NexusParser::Tokens::Comma) || !@lexer.peek(NexusParser::Tokens::SemiColon)
205
208
  i = 0
206
209
 
207
- # three kludge lines, need to figure out the label/number priority, could be issue in list order w/in tokens
208
- while @lexer.peek(NexusParser::Tokens::Label) || @lexer.peek(NexusParser::Tokens::Number)
209
- opts.update({i.to_s => @lexer.pop(NexusParser::Tokens::Label).value}) if @lexer.peek(NexusParser::Tokens::Label)
210
- opts.update({i.to_s => @lexer.pop(NexusParser::Tokens::Number).value.to_s}) if @lexer.peek(NexusParser::Tokens::Number)
210
+ while @lexer.peek(NexusParser::Tokens::CharacterLabel)
211
+ opts.update({
212
+ i.to_s => @lexer.pop(NexusParser::Tokens::CharacterLabel).value
213
+ })
211
214
 
212
215
  i += 1
213
- end
216
+ end
214
217
  end
215
218
 
216
219
  @lexer.pop(NexusParser::Tokens::Comma) if @lexer.peek(NexusParser::Tokens::Comma) # we may also have hit semicolon
217
-
220
+
218
221
  opts.update({:index => (index - 1), :name => name})
219
-
222
+
220
223
  raise(NexusParser::ParseError, "Error parsing character state labels for (or around) character #{index - 1}.") if !opts[:name]
221
224
  @builder.update_chr(opts)
222
- end
225
+ end
223
226
 
224
227
  end
225
- @lexer.pop(NexusParser::Tokens::SemiColon)
228
+ @lexer.pop(NexusParser::Tokens::SemiColon)
229
+ end
230
+
231
+ def parse_chr_labels
232
+ @lexer.pop(NexusParser::Tokens::CharLabels)
233
+
234
+ inf = 0
235
+ while true
236
+ inf += 1
237
+ raise(NexusParser::ParseError,"Either you have a gazillion character labels or more likely the parser is caught in an infinite loop while trying to parse character labels. Check for double single quotes in this block.") if inf > 100000
238
+
239
+ if @lexer.peek(NexusParser::Tokens::SemiColon)
240
+ break
241
+ else
242
+ i = 0
243
+ while @lexer.peek(NexusParser::Tokens::CharacterLabel)
244
+ @builder.update_chr_name(
245
+ i, @lexer.pop(NexusParser::Tokens::CharacterLabel).value
246
+ )
247
+
248
+ i += 1
249
+ end
250
+ end
251
+ end
252
+ @lexer.pop(NexusParser::Tokens::SemiColon)
253
+ end
254
+
255
+ def parse_state_labels
256
+ @lexer.pop(NexusParser::Tokens::StateLabels)
257
+
258
+ inf = 0
259
+ while true
260
+ inf += 1
261
+ raise(NexusParser::ParseError,"Either you have a gazillion state labels or more likely the parser is caught in an infinite loop while trying to parse state labels. Check for double single quotes in this block.") if inf > 100000
262
+
263
+ if @lexer.peek(NexusParser::Tokens::SemiColon)
264
+ break
265
+ else
266
+ opts = {}
267
+
268
+ index = @lexer.pop(NexusParser::Tokens::PositiveInteger).value.to_i
269
+
270
+ if !@lexer.peek(NexusParser::Tokens::Comma) && !@lexer.peek(NexusParser::Tokens::SemiColon)
271
+ i = 0
272
+
273
+ while @lexer.peek(NexusParser::Tokens::CharacterLabel)
274
+ opts.update({
275
+ i.to_s => @lexer.pop(NexusParser::Tokens::CharacterLabel).value
276
+ })
277
+
278
+ i += 1
279
+ end
280
+ end
281
+
282
+ @lexer.pop(NexusParser::Tokens::Comma) if @lexer.peek(NexusParser::Tokens::Comma) # we may also have hit semicolon
283
+
284
+ opts.update({:index => (index - 1)})
285
+
286
+ @builder.update_chr_states(opts)
287
+ end
288
+
289
+ end
290
+ @lexer.pop(NexusParser::Tokens::SemiColon)
226
291
  end
227
292
 
228
293
  def parse_matrix
@@ -230,25 +295,25 @@ class NexusParser::Parser
230
295
  i = 0
231
296
  while true
232
297
  if @lexer.peek(NexusParser::Tokens::SemiColon)
233
- break
298
+ break
234
299
  else
235
300
  t = @lexer.pop(NexusParser::Tokens::Label).value
236
301
 
237
302
  @builder.update_taxon(:index => i, :name => t) # if it exists its not re-added
238
303
 
239
304
  @builder.code_row(i, @lexer.pop(NexusParser::Tokens::RowVec).value)
240
-
305
+
241
306
  i += 1
242
307
  end
243
308
  end
244
- @lexer.pop(NexusParser::Tokens::SemiColon) # pop the semicolon
309
+ @lexer.pop(NexusParser::Tokens::SemiColon) # pop the semicolon
245
310
  end
246
311
 
247
312
  # this suck(s/ed), it needs work when a better API for Mesquite comes out
248
313
  def parse_notes_blk
249
314
  # IMPORTANT - we don't parse the (CM <note>), we just strip the "(CM" ... ")" bit for now in NexusParser::Note
250
315
 
251
- @vars = {}
316
+ @vars = {}
252
317
  inf = 0 # a crude iteration checker
253
318
  while true
254
319
  inf += 1
@@ -261,18 +326,20 @@ class NexusParser::Parser
261
326
 
262
327
  if @lexer.peek(NexusParser::Tokens::ValuePair)
263
328
  @vars.update(@lexer.pop(NexusParser::Tokens::ValuePair).value)
264
-
265
- elsif @lexer.peek(NexusParser::Tokens::Label)
266
- if @vars[:type] # we have the data for this row write it, and start a new one
267
-
329
+
330
+ elsif @lexer.peek(NexusParser::Tokens::FileLbl)
331
+ @lexer.pop(NexusParser::Tokens::FileLbl)
332
+ @vars.update(:file => 'file') # we check for whether :file key is present and handle conditionally
333
+
334
+ else @lexer.peek(NexusParser::Tokens::Label)
335
+ # If we already have a :type set then the Label we just peeked starts a
336
+ # new row, so write the current one and then start a new one.
337
+ if @vars[:type]
268
338
  @builder.add_note(@vars)
269
339
  @vars = {}
270
- else
271
- @vars.update(:type => @lexer.pop(NexusParser::Tokens::Label).value)
272
340
  end
273
- elsif @lexer.peek(NexusParser::Tokens::FileLbl)
274
- @lexer.pop(NexusParser::Tokens::FileLbl)
275
- @vars.update(:file => 'file') # we check for whether :file key is present and handle conditionally
341
+
342
+ @vars.update(:type => @lexer.pop(NexusParser::Tokens::Label).value)
276
343
  end
277
344
  end
278
345
  end
@@ -280,9 +347,9 @@ class NexusParser::Parser
280
347
 
281
348
  #@vars = {}
282
349
  #while true
283
-
284
- # break if @lexer.peek(NexusParser::Tokens::EndBlk)
285
-
350
+
351
+ # break if @lexer.peek(NexusParser::Tokens::EndBlk)
352
+
286
353
  # @vars.update(:type => @lexer.pop(NexusParser::Tokens::Label).value)
287
354
 
288
355
  # kludge to get around the funny construct that references file
@@ -293,11 +360,11 @@ class NexusParser::Parser
293
360
 
294
361
  # while true
295
362
 
296
- # meh = @lexer.pop(NexusParser::Tokens::ValuePair)
363
+ # meh = @lexer.pop(NexusParser::Tokens::ValuePair)
297
364
  # @vars.update(meh.value)
298
365
  # break if !@lexer.peek(NexusParser::Tokens::ValuePair)
299
366
  # end
300
- #
367
+ #
301
368
  # @builder.add_note(@vars)
302
369
  # @vars = {}
303
370
  #end
@@ -326,7 +393,7 @@ class NexusParser::Parser
326
393
  # nor this
327
394
  end
328
395
 
329
-
396
+
330
397
  def parse_mesquite_blk
331
398
 
332
399
  end
@@ -335,7 +402,7 @@ class NexusParser::Parser
335
402
 
336
403
  # def parse_children(parent)
337
404
  # parse a comma-separated list of nodes
338
- # while true
405
+ # while true
339
406
  # parse_node(parent)
340
407
  # if @lexer.peek(NexusParser::Tokens::Comma)
341
408
  # @lexer.pop(NexusParser::Tokens::Comma)
@@ -344,7 +411,7 @@ class NexusParser::Parser
344
411
  # end
345
412
  # end
346
413
  # end
347
-
414
+
348
415
  end
349
416
 
350
417
 
@@ -1,6 +1,7 @@
1
1
  module NexusParser::Tokens
2
2
 
3
3
  ENDBLKSTR = '(end|endblock)'.freeze
4
+ QUOTEDLABEL = '(\'+[^\']+\'+)|(\"+[^\"]+\"+)'
4
5
 
5
6
  class Token
6
7
  # this allows access the the class attribute regexp, without using a class variable
@@ -78,9 +79,7 @@ module NexusParser::Tokens
78
79
  @regexp = Regexp.new(/\A\s*(\s*taxlabels\s*)\s*/i)
79
80
  end
80
81
 
81
- # same as ID
82
- class Label < Token
83
- @regexp = Regexp.new('\A\s*((\'+[^\']+\'+)|(\"+[^\"]+\"+)|(\w[^,:(); \t\n]*|_)+)\s*') # matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" " # choking on 'Foo_stuff_things'
82
+ class LabelBase < Token
84
83
  def initialize(str)
85
84
  str.strip!
86
85
  str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
@@ -90,6 +89,20 @@ module NexusParser::Tokens
90
89
  end
91
90
  end
92
91
 
92
+ class Label < LabelBase
93
+ @regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|(\w[^,:(); \t\n]*)+)\s*/) # matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" "
94
+ def initialize(str)
95
+ super(str)
96
+ end
97
+ end
98
+
99
+ class CharacterLabel < LabelBase
100
+ @regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|[^ \t\n\/\'\",;]+)\s*/)
101
+ def initialize(str)
102
+ super(str)
103
+ end
104
+ end
105
+
93
106
  class ChrsBlk < Token
94
107
  @regexp = Regexp.new(/\A\s*(characters\s*;)\s*/i)
95
108
  end
@@ -118,10 +131,50 @@ module NexusParser::Tokens
118
131
  class RowVec < Token
119
132
  @regexp = Regexp.new(/\A\s*(.+)\s*\n/i)
120
133
  def initialize(str)
121
- # meh! Ruby is simpler to read than Perl?
122
- # handles both () and {} style multistates
123
- s = str.split(/\(|\)|\}|\{/).collect{|s| s=~ /[\,|\s]/ ? s.split(/[\,|\s]/) : s}.inject([]){|sum, x| x.class == Array ? sum << x.delete_if {|y| y == "" } : sum + x.strip.split(//)}
124
- @value = s
134
+ # We ignore commas outside (and inside) of groupings, it's fine.
135
+ str.gsub!(/[\, \t]/, '')
136
+
137
+ groupers = ['(', ')', '{', '}']
138
+ openers = ['(', '{']
139
+ closers = [')', '}']
140
+ closer_for = { '(' => ')', '{' => '}' }
141
+
142
+ a = []
143
+ group = nil
144
+ group_closer = nil
145
+ str.each_char { |c|
146
+ if groupers.include? c
147
+ if ((openers.include?(c) && !group.nil?) ||
148
+ (closers.include?(c) && (group.nil? || c != group_closer)))
149
+ raise(NexusParser::ParseError,
150
+ "Mismatched grouping in matrix row '#{str}'")
151
+ end
152
+
153
+ if openers.include? c
154
+ group = []
155
+ group_closer = closer_for[c]
156
+ else # c is a closer
157
+ if group.count == 1
158
+ a << group.first
159
+ elsif group.count > 1
160
+ a << group
161
+ end
162
+ group = nil
163
+ group_closer = nil
164
+ end
165
+ else
166
+ if group.nil?
167
+ a << c
168
+ else
169
+ group << c
170
+ end
171
+ end
172
+ }
173
+
174
+ raise(NexusParser::ParseError,
175
+ "Unclosed grouping in matrix row '#{str}'") if !group.nil?
176
+
177
+ @value = a
125
178
  end
126
179
  end
127
180
 
@@ -129,6 +182,14 @@ module NexusParser::Tokens
129
182
  @regexp = Regexp.new(/\A\s*(CHARSTATELABELS)\s*/i)
130
183
  end
131
184
 
185
+ class CharLabels < Token
186
+ @regexp = Regexp.new(/\A\s*(CHARLABELS)\s*/i)
187
+ end
188
+
189
+ class StateLabels < Token
190
+ @regexp = Regexp.new(/\A\s*(STATELABELS)\s*/i)
191
+ end
192
+
132
193
  class MesquiteIDs < Token
133
194
  @regexp = Regexp.new(/\A\s*(IDS[^;]*;)\s*/i)
134
195
  end
@@ -195,16 +256,6 @@ module NexusParser::Tokens
195
256
  @regexp = Regexp.new('\A\s*(\/)\s*')
196
257
  end
197
258
 
198
- # labels
199
- class ID < Token
200
- @regexp = Regexp.new('\A\s*((\'[^\']+\')|(\w[^,:(); \t\n]*|_)+)\s*')
201
- def initialize(str)
202
- str.strip!
203
- str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
204
- @value = str
205
- end
206
- end
207
-
208
259
  class Colon < Token
209
260
  @regexp = Regexp.new('\A\s*(:)\s*')
210
261
  end
@@ -217,66 +268,10 @@ module NexusParser::Tokens
217
268
  @regexp = Regexp.new('\A\s*(\,)\s*')
218
269
  end
219
270
 
220
- class Number < Token
221
- @regexp = Regexp.new('\A\s*(-?\d+(\.\d+)?([eE][+-]?\d+)?)\s*')
222
- def initialize(str)
223
- # a little oddness here, in some case we don't want to include the .0
224
- # see issues with numbers as labels
225
- if str =~ /\./
226
- @value = str.to_f
227
- else
228
- @value = str.to_i
229
- end
230
-
231
- end
271
+ class PositiveInteger < Token
272
+ @regexp = Regexp.new('\A\s*(\d+)\s*')
232
273
  end
233
274
 
234
275
  # NexusParser::Tokens::NexusComment
235
276
 
236
- # this list also defines priority, i.e. if tokens have overlap (which they shouldn't!!) then the earlier indexed token will match first
237
- def self.nexus_file_token_list
238
- [ NexusParser::Tokens::NexusStart,
239
- NexusParser::Tokens::BeginBlk,
240
- NexusParser::Tokens::EndBlk,
241
- NexusParser::Tokens::AuthorsBlk,
242
- NexusParser::Tokens::SetsBlk,
243
- NexusParser::Tokens::MqCharModelsBlk,
244
- NexusParser::Tokens::AssumptionsBlk,
245
- NexusParser::Tokens::CodonsBlk,
246
- NexusParser::Tokens::MesquiteBlk,
247
- NexusParser::Tokens::TreesBlk,
248
- NexusParser::Tokens::LabelsBlk,
249
- NexusParser::Tokens::TaxaBlk,
250
- NexusParser::Tokens::NotesBlk,
251
- NexusParser::Tokens::Title,
252
- NexusParser::Tokens::Taxlabels,
253
- NexusParser::Tokens::Dimensions,
254
- NexusParser::Tokens::FileLbl,
255
- NexusParser::Tokens::Format,
256
- NexusParser::Tokens::RespectCase,
257
- NexusParser::Tokens::Equals,
258
- NexusParser::Tokens::ValuePair, # this has bad overlap with Label and likely IDs (need to kill the latter, its a lesser Label)
259
- NexusParser::Tokens::CharStateLabels,
260
- NexusParser::Tokens::ChrsBlk,
261
- NexusParser::Tokens::Number,
262
- NexusParser::Tokens::Matrix,
263
- NexusParser::Tokens::SemiColon,
264
- NexusParser::Tokens::MesquiteIDs,
265
- NexusParser::Tokens::MesquiteBlockID,
266
- NexusParser::Tokens::BlkEnd,
267
- NexusParser::Tokens::Colon,
268
- NexusParser::Tokens::BckSlash,
269
- NexusParser::Tokens::Comma,
270
- NexusParser::Tokens::LParen,
271
- NexusParser::Tokens::RParen,
272
- NexusParser::Tokens::LBracket,
273
- NexusParser::Tokens::RBracket,
274
- NexusParser::Tokens::Label, # must be before RowVec
275
- NexusParser::Tokens::RowVec,
276
- NexusParser::Tokens::LinkLine,
277
- NexusParser::Tokens::ID # need to trash this
278
- ]
279
- end
280
-
281
277
  end
282
-
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module NexusParser
4
- VERSION = "1.2.1"
4
+ VERSION = "1.2.2"
5
5
  end
data/lib/nexus_parser.rb CHANGED
@@ -3,9 +3,6 @@
3
3
  # uses the PhyloTree parser/lexer engine by Krishna Dole which in turn was based on
4
4
  # Thomas Mailund's <mailund@birc.dk> 'newick-1.0.5' Python library
5
5
 
6
- # outstanding issues:
7
- ## need to resolve Tokens Labels, ValuePair, IDs
8
-
9
6
  module NexusParser
10
7
 
11
8
  require File.expand_path(File.join(File.dirname(__FILE__), 'nexus_parser', 'tokens'))
@@ -118,7 +115,7 @@ class NexusParser
118
115
  end
119
116
  end
120
117
 
121
- end
118
+ end # end NexusParser
122
119
 
123
120
 
124
121
  # constructs the NexusParser
@@ -141,6 +138,9 @@ class Builder
141
138
  def code_row(taxon_index, rowvector)
142
139
 
143
140
  @nf.characters.each_with_index do |c, i|
141
+ raise(ParseError,
142
+ "Row #{taxon_index} of the matrix is too short") if rowvector[i].nil?
143
+
144
144
  @nf.codings[taxon_index.to_i] = [] if !@nf.codings[taxon_index.to_i]
145
145
  @nf.codings[taxon_index.to_i][i] = NexusParser::Coding.new(:states => rowvector[i])
146
146
 
@@ -185,7 +185,7 @@ class Builder
185
185
 
186
186
  # need to create the characters
187
187
 
188
- raise(NexusParser::ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
188
+ raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
189
189
 
190
190
  (@nf.characters[@index].name = @opt[:name]) if @opt[:name]
191
191
 
@@ -193,18 +193,45 @@ class Builder
193
193
  @opt.delete(:name)
194
194
 
195
195
  # the rest have states
196
- @opt.keys.each do |k|
196
+ create_or_update_states_for_character(@index, @opt)
197
+ end
198
+
199
+ def update_chr_name(i, name)
200
+ raise(ParseError, "There are #{@nf.characters.count} characters but we're trying to update from row #{i + 1} of the CHARLABELS list - check your NCHAR and/or the length of your list.") if !@nf.characters[i]
197
201
 
198
- if (@nf.characters[@index].states != {}) && @nf.characters[@index].states[k] # state exists
202
+ # The CHARLABELS list is unindexed, so users are allowed to use '_' to
203
+ # indicate that a character name is unspecified.
204
+ @nf.characters[i].name = (name == '_' ? '' : name)
205
+ end
206
+
207
+ # legal hash keys are :index and integers that point to state labels
208
+ def update_chr_states(options = {})
209
+ return false if !options[:index]
210
+
211
+ @opt = options
212
+
213
+ @index = @opt[:index].to_i
214
+
215
+ raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the STATELABELS. Check the indices.") if !@nf.characters[@index]
216
+
217
+ @opt.delete(:index)
218
+
219
+ # the rest have states
220
+ create_or_update_states_for_character(@index, @opt)
221
+ end
222
+
223
+ def create_or_update_states_for_character(i, options)
224
+ options.keys.each do |k|
225
+
226
+ if (@nf.characters[i].states != {}) && @nf.characters[i].states[k] # state exists
199
227
 
200
228
  ## !! ONLY HANDLES NAME, UPDATE TO HANDLE notes etc. when we get them ##
201
- update_state(@index, :index => k, :name => @opt[k])
229
+ update_state(i, :index => k, :name => options[k])
202
230
 
203
231
  else # doesn't, create it
204
- @nf.characters[@index].add_state(:label => k.to_s, :name => @opt[k])
232
+ @nf.characters[i].add_state(:label => k.to_s, :name => options[k])
205
233
  end
206
234
  end
207
-
208
235
  end
209
236
 
210
237
  def update_state(chr_index, options = {})
@@ -256,7 +283,7 @@ class Builder
256
283
  @nf
257
284
  end
258
285
 
259
- end # end file
286
+ end # end Builder
260
287
 
261
288
  # NexusParser::ParseError
262
289
  class ParseError < StandardError
@@ -35,18 +35,18 @@ class Test_Lexer < Test::Unit::TestCase
35
35
  def test_lexer
36
36
  lexer = NexusParser::Lexer.new("[ foo ] BEGIN taxa; BLORF end;")
37
37
  assert lexer.pop(NexusParser::Tokens::LBracket)
38
- assert id = lexer.pop(NexusParser::Tokens::ID)
38
+ assert id = lexer.pop(NexusParser::Tokens::Label)
39
39
  assert_equal(id.value, "foo")
40
40
  assert lexer.pop(NexusParser::Tokens::RBracket)
41
41
  assert lexer.pop(NexusParser::Tokens::BeginBlk)
42
42
  assert lexer.pop(NexusParser::Tokens::TaxaBlk)
43
- assert foo = lexer.pop(NexusParser::Tokens::ID)
43
+ assert foo = lexer.pop(NexusParser::Tokens::Label)
44
44
  assert_equal("BLORF", foo.value) # truncating whitespace
45
45
  assert lexer.pop(NexusParser::Tokens::BlkEnd)
46
46
 
47
47
  lexer2 = NexusParser::Lexer.new("[ foo ] begin authors; BLORF end; [] () some crud here")
48
48
  assert lexer2.pop(NexusParser::Tokens::LBracket)
49
- assert id = lexer2.pop(NexusParser::Tokens::ID)
49
+ assert id = lexer2.pop(NexusParser::Tokens::Label)
50
50
  assert_equal(id.value, "foo")
51
51
  assert lexer2.pop(NexusParser::Tokens::RBracket)
52
52
  assert lexer2.pop(NexusParser::Tokens::BeginBlk)
@@ -64,44 +64,44 @@ class Test_Lexer < Test::Unit::TestCase
64
64
 
65
65
  lexer3 = NexusParser::Lexer.new("[ foo ] Begin Characters; BLORF end; [] () some crud here")
66
66
  assert lexer3.pop(NexusParser::Tokens::LBracket)
67
- assert id = lexer3.pop(NexusParser::Tokens::ID)
67
+ assert id = lexer3.pop(NexusParser::Tokens::Label)
68
68
  assert_equal(id.value, "foo")
69
69
  assert lexer3.pop(NexusParser::Tokens::RBracket)
70
70
  assert lexer3.pop(NexusParser::Tokens::BeginBlk)
71
71
  assert lexer3.pop(NexusParser::Tokens::ChrsBlk)
72
- assert foo = lexer3.pop(NexusParser::Tokens::ID)
72
+ assert foo = lexer3.pop(NexusParser::Tokens::Label)
73
73
  assert_equal("BLORF", foo.value)
74
74
  assert lexer3.pop(NexusParser::Tokens::BlkEnd)
75
75
 
76
76
  lexer4 = NexusParser::Lexer.new("Begin Characters; 123123123 end; [] () some crud here")
77
77
  assert lexer4.pop(NexusParser::Tokens::BeginBlk)
78
78
  assert lexer4.pop(NexusParser::Tokens::ChrsBlk)
79
- assert foo = lexer4.pop(NexusParser::Tokens::Number)
80
- assert_equal(123123123, foo.value)
79
+ assert foo = lexer4.pop(NexusParser::Tokens::PositiveInteger)
80
+ assert_equal('123123123', foo.value)
81
81
  assert lexer4.pop(NexusParser::Tokens::BlkEnd)
82
82
 
83
83
  lexer5 = NexusParser::Lexer.new("(0,1)")
84
84
  assert lexer5.pop(NexusParser::Tokens::LParen)
85
- assert foo = lexer5.pop(NexusParser::Tokens::Number)
86
- assert_equal(0, foo.value)
85
+ assert foo = lexer5.pop(NexusParser::Tokens::PositiveInteger)
86
+ assert_equal('0', foo.value)
87
87
  assert lexer5.pop(NexusParser::Tokens::Comma)
88
- assert foo = lexer5.pop(NexusParser::Tokens::Number)
89
- assert_equal(1, foo.value)
88
+ assert foo = lexer5.pop(NexusParser::Tokens::PositiveInteger)
89
+ assert_equal('1', foo.value)
90
90
  assert lexer5.pop(NexusParser::Tokens::RParen)
91
91
 
92
92
  lexer6 = NexusParser::Lexer.new(" 210(0,1)10A1\n")
93
93
  assert foo = lexer6.pop(NexusParser::Tokens::RowVec)
94
94
  assert_equal(["2","1","0",["0","1"],"1","0","A","1"], foo.value)
95
95
 
96
- lexer6a = NexusParser::Lexer.new(" 21a(0 1)0b{3 4 5}(0)(1 a)\n")
96
+ lexer6a = NexusParser::Lexer.new(" 21a(0 1)0b{345}(0)(1 a)\n")
97
97
  assert foo = lexer6a.pop(NexusParser::Tokens::RowVec)
98
98
  assert_equal(["2", "1", "a", ["0", "1"], "0", "b", ["3", "4", "5"], "0", ["1", "a"]], foo.value)
99
99
 
100
- lexer6b = NexusParser::Lexer.new(" 201{0 1}{0 1}0100)\x0A") # *nix line ending
100
+ lexer6b = NexusParser::Lexer.new(" 201(01){0 1}0100\x0A") # *nix line ending
101
101
  assert foo = lexer6b.pop(NexusParser::Tokens::RowVec)
102
102
  assert_equal(["2", "0", "1", ["0", "1"], ["0", "1"], "0", "1", "0", "0"], foo.value)
103
103
 
104
- lexer6c = NexusParser::Lexer.new(" 201{0 1}{0 1}0100)\x0D\x0A") # * dos line ending
104
+ lexer6c = NexusParser::Lexer.new(" 201{0 1}{01}0100\x0D\x0A") # * dos line ending
105
105
  assert foo = lexer6c.pop(NexusParser::Tokens::RowVec)
106
106
  assert_equal(["2", "0", "1", ["0", "1"], ["0", "1"], "0", "1", "0", "0"], foo.value)
107
107
 
@@ -126,7 +126,41 @@ class Test_Lexer < Test::Unit::TestCase
126
126
  def test_row_vec
127
127
  lexer = NexusParser::Lexer.new("0?(0 1)10(A BD , C)1(0,1,2)1-\n")
128
128
  assert foo = lexer.pop(NexusParser::Tokens::RowVec)
129
- assert_equal(["0", "?", ["0", "1"], "1", "0", ["A", "BD", "C"], "1", ["0", "1", "2"], "1", "-"], foo.value)
129
+ assert_equal(["0", "?", ["0", "1"], "1", "0", ["A", "B", "D", "C"], "1", ["0", "1", "2"], "1", "-"], foo.value)
130
+ end
131
+
132
+ def test_ungrouped_spaces_in_row_vec
133
+ lexer = NexusParser::Lexer.new("- A 12(BC) ? \n")
134
+ assert foo = lexer.pop(NexusParser::Tokens::RowVec)
135
+ assert_equal(['-', 'A', '1', '2', ['B', 'C'], '?'], foo.value)
136
+ end
137
+
138
+ def test_mismatched_parens_row_vec
139
+ lexer = NexusParser::Lexer.new("01(12(13\n")
140
+ assert_raise_with_message(NexusParser::ParseError, /Mismatch/) {
141
+ lexer.pop(NexusParser::Tokens::RowVec)
142
+ }
143
+ end
144
+
145
+ def test_mismatched_groupers_row_vec
146
+ lexer = NexusParser::Lexer.new("01(12}13\n")
147
+ assert_raise_with_message(NexusParser::ParseError, /Mismatch/) {
148
+ lexer.pop(NexusParser::Tokens::RowVec)
149
+ }
150
+ end
151
+
152
+ def test_nested_parens_row_vec
153
+ lexer = NexusParser::Lexer.new("01(12(34))13\n")
154
+ assert_raise_with_message(NexusParser::ParseError, /Mismatch/) {
155
+ lexer.pop(NexusParser::Tokens::RowVec)
156
+ }
157
+ end
158
+
159
+ def test_unclosed_parens_row_vec
160
+ lexer = NexusParser::Lexer.new("01(123413\n")
161
+ assert_raise_with_message(NexusParser::ParseError, /Unclosed/) {
162
+ lexer.pop(NexusParser::Tokens::RowVec)
163
+ }
130
164
  end
131
165
 
132
166
  def test_punctuation
@@ -428,11 +462,6 @@ class Test_Lexer < Test::Unit::TestCase
428
462
  assert_equal 'SETS', foo.value.slice(0,4)
429
463
  assert_equal 'END;', foo.value.slice(-4,4)
430
464
  end
431
-
432
- def test_lexer_errors
433
- lexer = NexusParser::Lexer.new("*&")
434
- assert_raise(NexusParser::ParseError) {lexer.peek(NexusParser::Tokens::ID)}
435
- end
436
465
  end
437
466
 
438
467
 
@@ -574,6 +603,30 @@ class Test_Parser < Test::Unit::TestCase
574
603
  assert_equal ["-", "0", "1", "2", "A"], foo.characters[4].state_labels
575
604
  end
576
605
 
606
+ def test_matrix_with_short_row
607
+ input= "
608
+ DIMENSIONS NCHAR=2;
609
+ FORMAT DATATYPE = STANDARD GAP = - MISSING = ? SYMBOLS = \" 0 1 2 3 4 5 6 7 8 9 A\";
610
+ CHARSTATELABELS
611
+ 1 Tibia_II / norm modified, 2 TII_macrosetae / '= TI' stronger;
612
+ MATRIX
613
+ Dictyna 0?
614
+ Uloborus ??
615
+ Deinopis 0
616
+ ;
617
+ END;"
618
+
619
+ builder = NexusParser::Builder.new
620
+ @lexer = NexusParser::Lexer.new(input)
621
+
622
+ # stub the taxa, they would otherwise get added in dimensions or taxa block
623
+ (0..2).each{|i| builder.stub_taxon}
624
+
625
+ assert_raise_with_message(NexusParser::ParseError, /too short/) {
626
+ NexusParser::Parser.new(@lexer, builder).parse_characters_blk
627
+ }
628
+ end
629
+
577
630
  def test_characters_block_without_IDs_or_title
578
631
  input= "
579
632
  DIMENSIONS NCHAR=10;
@@ -623,6 +676,55 @@ class Test_Parser < Test::Unit::TestCase
623
676
  assert_equal 10, foo.characters.size
624
677
  end
625
678
 
679
+ def test_characters_charlabels_statelabels_block
680
+ input= "
681
+ DIMENSIONS NCHAR=4;
682
+ FORMAT DATATYPE = STANDARD GAP = - MISSING = ? SYMBOLS = \" 0 1 2 3 4 5 6 7 8 9 A\";
683
+ CHARLABELS
684
+ Tibia_II
685
+ TII_macrosetae
686
+ 'Femoral tuber'
687
+ _
688
+ ;
689
+ STATELABELS
690
+ 1 norm modified,
691
+ 3 3 3.5 4,
692
+ 4 pres
693
+ ;
694
+ MATRIX
695
+ Dictyna -?1(01)
696
+ Uloborus 0321
697
+ ;
698
+ ENDBLOCK;"
699
+
700
+ builder = NexusParser::Builder.new
701
+ lexer = NexusParser::Lexer.new(input)
702
+
703
+ (0..3).each{|i| builder.stub_taxon}
704
+
705
+ NexusParser::Parser.new(lexer,builder).parse_characters_blk
706
+ foo = builder.nexus_file
707
+
708
+ assert_equal 4, foo.characters.size
709
+ assert_equal "Femoral tuber", foo.characters[2].name
710
+ assert_equal "Undefined", foo.characters[3].name
711
+
712
+ assert_equal "norm", foo.characters[0].states["0"].name
713
+ assert_equal "modified", foo.characters[0].states["1"].name
714
+
715
+ assert_equal "", foo.characters[1].states["3"].name
716
+
717
+ assert_equal ["3", "3.5", "4"], foo.characters[2].states.keys.collect{|s| foo.characters[2].states[s].name}.sort
718
+
719
+ assert_equal "", foo.characters[1].states["3"].name
720
+
721
+ assert_equal ["-"], foo.codings[0][0].states
722
+ assert_equal ["?"], foo.codings[0][1].states
723
+ assert_equal ["0", "1"], foo.codings[0][3].states
724
+
725
+ assert_equal ["3"], foo.codings[1][1].states
726
+ end
727
+
626
728
  def test_codings
627
729
  foo = parse_nexus_file(@nf)
628
730
  assert_equal 100, foo.codings.flatten.size # two multistates count in single cells
@@ -673,6 +775,68 @@ class Test_Parser < Test::Unit::TestCase
673
775
  assert_equal '0 1 2 3 4 5 6 7 8 9 A', foo.vars[:symbols]
674
776
  end
675
777
 
778
+ # https://github.com/mjy/nexus_parser/issues/9
779
+ def test_three_both_numeric_and_label_state_names_in_a_row
780
+ input =" CHARSTATELABELS
781
+ 1 'Metatarsal trichobothria (CodAra.29)' / 3 9 27 asdf;
782
+ Matrix
783
+ fooo 01 more stuff here that should not be hit"
784
+
785
+ builder = NexusParser::Builder.new
786
+ lexer = NexusParser::Lexer.new(input)
787
+
788
+ builder.stub_chr()
789
+
790
+ NexusParser::Parser.new(lexer, builder).parse_chr_state_labels
791
+
792
+ foo = builder.nexus_file
793
+
794
+ assert_equal "3", foo.characters[0].states['0'].name
795
+ assert_equal "9", foo.characters[0].states['1'].name
796
+ assert_equal "27", foo.characters[0].states['2'].name
797
+ assert_equal "asdf", foo.characters[0].states['3'].name
798
+ end
799
+
800
+ def test_non_label_character_name_character_labels
801
+ input = 'CHARSTATELABELS
802
+ 1 (intentionally_blank) /,
803
+ 2 /,
804
+ 3 %_coverage /,
805
+ 4 #_of_widgets /,
806
+ 5 !endangered! /,
807
+ 6 @the_front /,
808
+ 7 =antennae,
809
+ 8 `a_=_2` /,
810
+ 9 -35_or-36 ,
811
+ 10 27_or_less /,
812
+ 11 fine_not_fine /,
813
+ 12 3,
814
+ ;'
815
+
816
+ builder = NexusParser::Builder.new
817
+ lexer = NexusParser::Lexer.new(input)
818
+
819
+ (0..11).each{builder.stub_chr()}
820
+
821
+ NexusParser::Parser.new(lexer,builder).parse_chr_state_labels
822
+
823
+ foo = builder.nexus_file
824
+
825
+ assert_equal 12, foo.characters.size
826
+ assert_equal "(intentionally_blank)", foo.characters[0].name
827
+ assert_equal "Undefined", foo.characters[1].name
828
+ assert_equal "%_coverage", foo.characters[2].name
829
+ assert_equal "#_of_widgets", foo.characters[3].name
830
+ assert_equal "!endangered!", foo.characters[4].name
831
+ assert_equal "@the_front", foo.characters[5].name
832
+ assert_equal "=antennae", foo.characters[6].name # =3
833
+ assert_equal "`a_=_2`", foo.characters[7].name
834
+ assert_equal "-35_or-36", foo.characters[8].name
835
+ assert_equal "27_or_less", foo.characters[9].name
836
+ assert_equal "fine_not_fine", foo.characters[10].name
837
+ assert_equal "3", foo.characters[11].name
838
+ end
839
+
676
840
  def test_parse_chr_state_labels
677
841
  input =" CHARSTATELABELS
678
842
  1 Tibia_II / norm modified, 2 TII_macrosetae / '= TI' stronger, 3 Femoral_tuber / abs pres 'm-setae', 5 Cymbium / dorsal mesal lateral, 6 Paracymbium / abs pres, 7 Globular_tegulum / abs pres, 8 / entire w_lobe, 9 Conductor_wraps_embolus, 10 Median_apophysis / pres abs ;
@@ -754,6 +918,169 @@ class Test_Parser < Test::Unit::TestCase
754
918
 
755
919
  end
756
920
 
921
+ def test_parse_chr_labels
922
+ input =" CHARLABELS
923
+ _
924
+ 'Maxillary teeth'
925
+ as_df
926
+ 'Highest number of maxillary teeth (or alveoli):';
927
+ STATELABELS
928
+ 1 more more more,"
929
+
930
+ builder = NexusParser::Builder.new
931
+ lexer = NexusParser::Lexer.new(input)
932
+
933
+ (0..3).each{builder.stub_chr()}
934
+
935
+ NexusParser::Parser.new(lexer,builder).parse_chr_labels
936
+
937
+ foo = builder.nexus_file
938
+ assert_equal 4, foo.characters.size
939
+ assert_equal 'Undefined', foo.characters[0].name
940
+ assert_equal 'Maxillary teeth', foo.characters[1].name
941
+ assert_equal 'as_df', foo.characters[2].name
942
+ assert_equal 'Highest number of maxillary teeth (or alveoli):', foo.characters[3].name
943
+ end
944
+
945
+ def test_parse_state_labels
946
+ input =" STATELABELS
947
+ 1 norm modified,
948
+ 3,
949
+ 4 pres
950
+ ;
951
+ CHARLABELS;
952
+ "
953
+
954
+ builder = NexusParser::Builder.new
955
+ lexer = NexusParser::Lexer.new(input)
956
+
957
+ (0..3).each{builder.stub_chr()}
958
+
959
+ NexusParser::Parser.new(lexer,builder).parse_state_labels
960
+
961
+ foo = builder.nexus_file
962
+ assert_equal 4, foo.characters.size
963
+
964
+ assert_equal "norm", foo.characters[0].states["0"].name
965
+ assert_equal "modified", foo.characters[0].states["1"].name
966
+
967
+ assert_empty foo.characters[1].states
968
+
969
+ assert_empty foo.characters[2].states
970
+
971
+ assert_equal "pres", foo.characters[3].states["0"].name
972
+ end
973
+
974
+ def test_non_label_character_state_character_labels
975
+ input = 'CHARSTATELABELS 1 Tibia_II /
976
+ .5
977
+ .1.2_form
978
+ idsimple
979
+ %_of_length_less_than_10
980
+ !poisonous!
981
+ #_is_3_or_4
982
+ (leave_as_is)
983
+ @12_o_clock
984
+ >2
985
+ ~equal
986
+ =9
987
+ ;'
988
+
989
+ builder = NexusParser::Builder.new
990
+ lexer = NexusParser::Lexer.new(input)
991
+
992
+ builder.stub_chr()
993
+
994
+ NexusParser::Parser.new(lexer,builder).parse_chr_state_labels
995
+
996
+ foo = builder.nexus_file
997
+
998
+ assert_equal ".5", foo.characters[0].states["0"].name
999
+ assert_equal ".1.2_form", foo.characters[0].states["1"].name
1000
+ assert_equal "idsimple", foo.characters[0].states["2"].name
1001
+ assert_equal "%_of_length_less_than_10", foo.characters[0].states["3"].name
1002
+ assert_equal "!poisonous!", foo.characters[0].states["4"].name
1003
+ assert_equal "#_is_3_or_4", foo.characters[0].states["5"].name
1004
+ assert_equal "(leave_as_is)", foo.characters[0].states["6"].name
1005
+ assert_equal "@12_o_clock", foo.characters[0].states["7"].name
1006
+ assert_equal ">2", foo.characters[0].states["8"].name
1007
+ assert_equal "~equal", foo.characters[0].states["9"].name
1008
+ assert_equal "=9", foo.characters[0].states["10"].name
1009
+ end
1010
+
1011
+ def test_arbitrary_quote_and_quotelike_character_state_labels
1012
+ # We could tighten up our handling of accidentally unclosed quotes, but
1013
+ # there's pretty much no way to recover in general, so we're not testing
1014
+ # them here.
1015
+ # Things like ""asdf" " failing is a known issue (maybe not solvable with
1016
+ # regular expressions?).
1017
+ input = 'CHARSTATELABELS 1 Tibia_II /
1018
+ "asd, \'f\'"
1019
+ ""a\'sdf "
1020
+ \' /as"df/\'
1021
+ \'asdf;\'
1022
+ ""as, df""
1023
+ ;'
1024
+
1025
+ builder = NexusParser::Builder.new
1026
+ lexer = NexusParser::Lexer.new(input)
1027
+
1028
+ builder.stub_chr()
1029
+
1030
+ NexusParser::Parser.new(lexer,builder).parse_chr_state_labels
1031
+
1032
+ foo = builder.nexus_file
1033
+
1034
+ assert_equal 'asd, \'f\'', foo.characters[0].states["0"].name
1035
+ assert_equal '"a\'sdf', foo.characters[0].states["1"].name
1036
+ assert_equal '/as"df/', foo.characters[0].states["2"].name
1037
+ assert_equal 'asdf;', foo.characters[0].states["3"].name
1038
+ assert_equal '"as, df"', foo.characters[0].states["4"].name
1039
+ end
1040
+
1041
+
1042
+ def test_number_label_chr_state_labels
1043
+ # Character state names that start with numbers
1044
+ input = 'CHARSTATELABELS 1 Tibia_II /
1045
+ 123abc
1046
+ -1.23abc
1047
+ -3e-3abc
1048
+ 25%_or_less_than
1049
+ ;'
1050
+
1051
+ builder = NexusParser::Builder.new
1052
+ lexer = NexusParser::Lexer.new(input)
1053
+
1054
+ (0..3).each{builder.stub_chr()}
1055
+
1056
+ NexusParser::Parser.new(lexer,builder).parse_chr_state_labels
1057
+
1058
+ foo = builder.nexus_file
1059
+
1060
+ assert_equal "123abc", foo.characters[0].states["0"].name
1061
+ assert_equal "-1.23abc", foo.characters[0].states["1"].name
1062
+ assert_equal "-3e-3abc", foo.characters[0].states["2"].name
1063
+ assert_equal "25%_or_less_than", foo.characters[0].states["3"].name
1064
+ end
1065
+
1066
+ def test_value_pair_label_chr_state_labels
1067
+ # Character state names that are ValuePairs
1068
+ input = 'CHARSTATELABELS 1 Tibia_II /
1069
+ 234=(a_b_c)
1070
+ ;'
1071
+
1072
+ builder = NexusParser::Builder.new
1073
+ lexer = NexusParser::Lexer.new(input)
1074
+
1075
+ builder.stub_chr()
1076
+
1077
+ NexusParser::Parser.new(lexer,builder).parse_chr_state_labels
1078
+
1079
+ foo = builder.nexus_file
1080
+
1081
+ assert_equal '234=(a_b_c)', foo.characters[0].states["0"].name
1082
+ end
1083
+
757
1084
  def DONT_test_parse_really_long_string_of_chr_state_labels
758
1085
  input =" CHARSTATELABELS
759
1086
  1 Epigynal_ventral_margin / 'entire (Fig. 15G)' 'with scape (Fig. 27D)', 2 Epigynal_external_structure / openings_on_a_broad_depression 'copulatory openings on plate, flush with abdomen, sometimes slit like', 3 Epigynal_depression / 'round or square, at most slightly wider than high ' 'elongate, at least twice as wide as high ', 4 Epigynal_plate_surface / 'smooth (Fig. 12E)' 'ridged (Fig. 21G)', 5 epignynal_septum / absent_ present_, 6 Copulatory_bursa_anterior_margin / 'entire, broadly transverse (Fig. 19B)' 'medially acute (Figs. 22G, 40B)', 7 'Copulatory duct: spermathecal junction' / posterior lateral_or_anterior, 8 Copulatory_duct_loops_relative_to_spermathecae / apart 'encircling (Fig. 93J)', 9 Copulatory_duct_terminal_sclerotization / as_rest_of_duct_ 'distinctly sclerotized, clearly more than rest of duct ', 10 Hard_sclerotized_CD_region / mostly_or_entirely_ectal_to_the_ectal_rim_of_the_spermathecae 'caudal to the spermathecae, mesal to ectal margin of spermathecae', 11 Male_palpal_tibial_rim / uniform_or_only_slightly_asymmetric 'strongly and asymmetrically protruding, scoop-shaped (Fig 36D)', 12 Male_palpal_tibia_prolateral_trichobothria / one none, 13 Cymbial_ridge_ectal_setae / unmodified 'strongly curved towards the palpal bulb (Kochiura, Figs. 51B-C, 52C)', 14 Cymbial_distal_promargin / entire 'with an apophysis (Argyrodes, Figs.) ', 15 Cymbial_mesal_margin / entire 'incised (Anelosimus, Figs. 17D, 20A) ' deeply_notched, 16 Cymbial_tip_sclerotization / like_rest_of_cymbium 'lightly sclerotized, appears white', 17 Cymbial_tip_setae / like_other_setae 'thick and strongly curved (Kochiura, Figs. 51B, 52C)', 18 Cymbial_sheath / absent present, 19 Lock_placement / 'distal (Figs. 67B, 92F-G, I, M)' 'central (Fig. 92H)', 20 Lock_mechanism / 'hook (Figs 31F, 60D, 91A, 92D-E, J-L)' 'hood (Figs 18A, 75B, 92F-I, M)' 'Theridula (Fig 81D)', 21 Cymbial_hook_orientation / 'facing downwards (Figs. 91A, 92D-E, J-K)' 'facing upwards (Fig. 60C-D, 92L)', 22 Cymbial_hook_location / 'inside cymbium (Fig. 92D-E, J-K)' 'ectal cymbial margin (Figs. 67B, 92L).', 23 Cymbial_hook_distal_portion / 'blunt (Figs. 31F, 92D-E)' 'tapering to a narrow tongue (Figs. 66B, 67D, 92L)', 24 Cymbial_hood_size / 'narrow (Fig. 92F-H)' 'broad (Fig. 92I)' 'Spintharus (Fig. 92M)', 25 Cymbial_hood_region / 'translucent, hood visible through cymbium (Anelosimus, Figs. 90A, 91C)' 'opaque, hood not visible', 26 Alveolus_shape / 'circular or oval (Fig. 92A-H)' 'with a mesal extension (Fig. 92A)', 27 Tegulum_ectal_margin / entire 'protruded (Fig. 20D)', 28 Tegular_groove / absent 'present (Fig. 28B)', 29 SDT_SB_I / separate touching, 30 'SDT post-SB II turn' / gradual '90 degrees (Anelosimus, Fig. 93B)', 31 SDT_SB_I_&_II_reservoir_segment_alignment / divergent parallel, 32 SDT_SB_I_&_II_orientation / in_plane_of_first_loop_from_fundus 'out of plane of first loop, against tegular wall', 33 SDT_RSB_I_&_II / absent present, 34 SDT_SB_III / absent present, 35 SDT_SB_IV / absent 'present (Fig. 93E)', 36 Conductor_shape / 'simple, round or oval, short' 'fan shaped, narrow base and broad tip (Selkirkiella, Kochiura)' Enoplognatha Argyrodes Achaearanea Theridion '''rupununi''' '''tanzania''' '''cup-shaped''', 37 Conductor / 'with a groove for embolus (Figs. 10A, 28D, 69B)' 'entire (Figs. 13D, 17F, 52C-D)', 38 Conductor_surface / 'smooth (Figs. 75B, 77B-C)' ' heavily ridged (Figs. 10B-C, 44D. 67C, 69D)', 39 Conductor_tip_sclerotization / like_base more_than_base, 40 Subconductor / absent present, 41 Subconductor_pit_upper_wall / 'entire, or slightly protruding' forms_a_regular_oval_lip, 42 Subconductor_at_C_base / narrows_abruptly_before_C_base narrows_gradually_along_its_entire_length broad_at_base, 43 'Embolus tail-SC relation' / 'hooked in, or oriented towards SC' surpasses_SC behind_E_base, 44 Tegulum_ectally_ / occupying_less_than_half_of_the_cymbial_cavity_ occupying_more_than_half_of_the_cymbial_cavity, 45 MA_and_sperm_duct / sperm_duct_loop_not_inside_MA 'sperm duct loop inside MA (Figs. 90F, 91B)', 46 'MA-tegular membrane connection' / broad narrow, 47 MA_form / unbranched 'two nearly equally sized branches (Fig. 22A-B) ', 48 MA_distal_tip / entire hooded, 49 MA_hood_form / 'narrow, pit-like (Figs. 31F, 34D)' 'scoop-shaped (Figs. 60D, 66B, 67D)', 50 TTA_form / entire 'grooved (Fig. 44C)', 51 TTA / bulky 'prong shaped (vittatus group)', 52 TTA_distal_tip / entire_or_gently_curved Argyrodes 'hooked (branched)', 53 TTA_hook_distal_branch / barely_exceeding_lower_branch_ 'extending beyond lower branch (jucundus group) ', 54 TTA_hook_distal_branch / thick_ 'thin, finger like (domingo, dubiosus)', 55 TTA_hook_proximal_branch / 'blunt, broad' 'flattened, bladelike' 'cylindrical, elongated', 56 TTA_surface_subterminally / smooth ridged, 57 TTA_tip_surface / smooth 'ridged (Figs. 7A-B, 17F, 31D, 34D, 54A, 56B, 86A)', 58 Embolus_and_TTA / loosely_associated_to_or_resting_in_TTA_shallow_groove 'parts of E entirely enclosed in TTA (Figs. 37A-B, 44C, 89C)', 59 Embolus_tip_surface / smooth denticulate, 60 Embolus_spiral_curviture / gentle whip_like corkscrew, 61 Embolus_tip / entire bifid, 62 Embolus_origin / retroventral_on_tegulum 'retrolateral (ectal), partially or completely hidden by cymbium (Figs 44C, 60A-C, 67B)', 63 Embolus_ridges / absent present, 64 Embolus_shape / short_to_moderately_elongate 'extremely long, >2 spirals (Figs. 54D, 73A-E)', 65 Embolus_spiral_width / 'thin, much of E spiral subequal to E tip ' 'thick, entire E spiral much broader than tip ', 66 Embolus_distal_rim / 'entire (normal)' deeply_grooved, 67 Embolic_terminus / abrupt 'with a distal apophysis (EA, Fig. 34E) ', 68 Embolus_tail / 'entire, smooth' 'distinct, lobed', 69 'Embolus-dh connection grooves' / absent present, 70 'Embolus-dh grooves' / 'deep, extend into the E base more than twice longer than the distance between them' 'short, extend into the E base about as long, or slightly longer than the distance between them', 71 E_spiral_distally / 'relatively thin or filiform, cylindrical' 'thick, not cylindrical' 'rupununi/lorenzo like', 72 Embolus_spiral / entire 'biparted (Eb)' pars_pendula, 73 Eb_orientation / towards_embolus_tip towards_tibia, 74 Embolic_division_b / separates_early_from_E E_and_Eb_tightly_associated_the_entire_spiral, 75 Embolic_division_b / broad 'narrow, relative to Eb spiral, snout-like', 76 'Eb distal portion, ectal marginl' / 'level, not raised ' with_a_distinct_ridge_, 77 Eb_form / flat 'globose, inflated', 78 Eb_form / 'distinct, clearly separate apophysis' 'short, confined to first section of spiral, barely separate', 79 Eb_tip_and_E_tip_association / separate Eb_and_E_tips_juxtaposed 'E tip rests on Eb ''cup''', 80 Eb_snout / 'short, snug with E spiral ' 'long, separate from E spiral ', 81 Distal_portion_of_Eb / entire with_a_cup_shaped_apophysis with_a_raised_ridge, 82 E_tail / lobe_not_reaching_ectal_margin_of_Eb_ lobe_touching_ectal_margin_of_Eb_, 83 Extra_tegular_sclerite / absent_ present_, 84 'Median eyes (male)' / flush_with_carapace 'on tubercle (Argyrodes)', 85 'AME size (male)' / subequal_or_slightly_larger_than_ALE clearly_smaller_than_ALE, 86 Cheliceral_posterior_margin / toothed smooth, 87 Cheliceral_posterior_tooth_number / three_or_more two one, 88 Cheliceral_furrow / smooth denticulate, 89 Carapace_hairiness / 'sparsely or patchily hirsute (Fig. 48D)' 'uniformly hirsute (Fig. 71D)', 90 Carapace_pars_stridens / irregular regular_parallel_ridges, 91 Interocular_area / more_or_less_flush_with_clypeus projecting_beyond_clypeus, 92 Clypeus / concave_or_flat with_a_prominent_projection, 93 'ocular and clypeal region setae distribution (male)' / sparse 'in a dense field, or fields', 94 'Labium-sternum connection' / 'visible seam (Fig. 27C)' fused, 95 Sternocoxal_tubercles / present absent, 96 Pedicel_location / 'anterior (Fig. 94A-D)' 'medial (Fig. 94J-K)', 97 Abdominal_folium_pattern / bilateral_spots_or_blotches distinct_central_band_, 98 Abdomen_pattern / Anelosimus_, 99 Dorsal_band / 'dark edged by white (Kochiura, Anelosimus, Fig. 94G, J)' 'light edged by dark (Fig. 94H)' 'Ameridion, light edged by white (Fig. 94I)', 100 Abdominal_dot_pigment / silver 'non-reflective, dull', 101 SPR_form / 'weakly keeled (Figs. 67F, 74F)' 'strongly keeled and elongate (Figs. 16B-C, 24D-E, 42F)', 102 SPR_pick_number / '1-4' '6-28' '>30', 103 SPR_insertion / flush_with_abdominal_surface 'on a ridge (Figs 32D, 72A-B)', 104 'SPR mesally-oriented picks' / absent present, 105 'SPR mesally-oriented picks relative to sagittal plane' / angled_dorsally perpendicular_or_angled_ventrally, 106 SPR / straight_or_slightly_irregular distinctly_curved 'argyrodine, dorsal picks aside others', 107 SPR_dorsal_pick_spacing / subequal_to_ventral_pick_spacing distinctly_compressed, 108 SPR_relative_to_pedicel / lateral dorsal, 109 SPR_setae / separate tight, 110 'Supra pedicillate ventrolateral (4 o''clock) proprioreceptor' / absent present, 111 Epiandrous_fusule_arrangement / in_one_pair_of_sockets in_a_row, 112 Epiandrous_fusule_pair_number / '=>9' '6-8' '4-5' 1, 113 Colulus / 'present (Figs. 45E, 61F)' 'absent (Figs. 16E, 78A)' 'invaginated (Figs. 9D, 63G)', 114 Colulus_size / 'large and fleshy (Figs. 55H, 61F)' 'small, less than half the length of its setae (Fig. 38B)', 115 Colular_setae / present absent, 116 'Colular setae number (female)' / three_or_more two_, 117 'Palpal claw dentition (female)' / 'dense, > half of surface covered by denticles (Figs. 2D, 9E, 11D, 12G, 45G, 47E, 58G, 80D)' 'sparse < half of surface with denticles', 118 'Palpal tibial trichobothria (female)' / four three two five, 119 Femur_I_relative_to_II / subequal 'robust, clearly larger than femur II', 120 'Leg IV relative length (male)' / '3rd longest (typical leg formula 1243)' '2nd longest (typical leg formula 1423)' 'longest (typical leg formula 4123)', 121 'Leg IV relative length (female)' / 3rd_longest 2nd_longest longest_, 122 'Femur vs. metatarsus length (female)' / metatarsus_longer metatarsus_shorter, 123 'Femur vs. metatarsus length (male)' / metatarsus_longer metatarsus_shorter, 124 'Metatarsus vs. tibia length (female)' / metatarsus_longer metatarsus_shorter, 125 'Metatarsus vs. tibia length (male)' / metatarsus_longer metatarsus_shorter, 126 Metatarsal_ventral_macrosetae / like_other_macrosetae thickened_ventrally, 127 Tarsus_IV_comb_serrations / 'simple, straight' curved_hooks, 128 Tarsal_organ_size / 'smaller than setal sockets (normal)' enlarged, 129 'Tarsus IV central claw vs. laterals (male)' / 'short, at most subequal' 'elongate, longer (Figs. 19E, 21C, 23D, 32H, 57F, 58F)', 130 'Tarsus IV central claw vs. laterals (female)' / equal_or_shorter stout_and_distinctly_longer minute, 131 Spinneret_insertion / abdominal_apex 'subapical, abdomen extending beyond spinnerets', 132 PLS_flagelliform_spigot_length / subequal_to__PLS_CY 'longer than PLS CY (Figs. 68E, 78B, 82D)', 133 'PLS, PMS CY spigot bases' / 'not modified, subequal or smaller than ampullates' 'huge and elongated, much larger than ampullates ', 134 CY_shaft_surface / smooth grooved, 135 PLS_AC_spigot_number / five_or_more four_or_less, 136 PLS_flagelliform_spigot / present absent, 137 PLS_posterior_AG_spigot_shape / 'normal, round' flattened, 138 PLS_theridiid_type_AG_position / more_or_less_parallel end_to_end, 139 'PMS minor ampullate (mAP) spigot shaft length' / 'short, subequal to CY shaft' clearly_longer_than_any_CY_shaft, 140 Web_form / 'linyphioid-like sheet web (Fig. 99C)' 'cobweb (Figs. 97G, 99A-B, 100A-F, 101A-E)' 'network mesh web - with foraging field below (rupununi/lorenzo)' 'dry line-web', 141 'Knock-down lines' / absent present, 142 Sticky_silk_in_web / present absent, 143 Egg_sac_surface / spherical_to_lenticular 'stalked (Fig. 88E, 98D).', 144 Egg_case_structure / suboval_or_roundish basal_knob rhomboid elongated Spiky, 145 Web_construction / solitary communal, 146 Mating_thread / present absent, 147 Adult_females_per_nest / one multiple, 148 cooperative_behavior / solitary subsocial permanent_sociality ;
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nexus_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - mjy
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2024-05-03 00:00:00.000000000 Z
12
+ date: 2024-05-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler