nexus_parser 1.2.0 → 1.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,8 @@
1
1
  module NexusParser::Tokens
2
2
 
3
+ ENDBLKSTR = '(end|endblock)'.freeze
4
+ QUOTEDLABEL = '(\'+[^\']+\'+)|(\"+[^\"]+\"+)'
5
+
3
6
  class Token
4
7
  # this allows access to the class attribute regexp, without using a class variable
5
8
  class << self; attr_reader :regexp; end
@@ -31,12 +34,12 @@ module NexusParser::Tokens
31
34
  end
32
35
 
33
36
  class EndBlk < Token
34
- @regexp = Regexp.new(/\A\s*([\s]*End[\s]*;[\s]*)/i)
37
+ @regexp = Regexp.new(/\A\s*([\s]*#{ENDBLKSTR}[\s]*;[\s]*)/i)
35
38
  end
36
39
 
37
40
  # label
38
41
  class AuthorsBlk < Token
39
- @regexp = Regexp.new(/\A\s*(Authors;.*?END;)\s*/im)
42
+ @regexp = Regexp.new(/\A\s*(Authors;.*?#{ENDBLKSTR};)\s*/im)
40
43
  end
41
44
 
42
45
  # label
@@ -66,14 +69,17 @@ module NexusParser::Tokens
66
69
  @regexp = Regexp.new(/\A\s*(format)\s*/i)
67
70
  end
68
71
 
72
+ # TODO: Handled, but ignored
73
+ class RespectCase < Token
74
+ @regexp = Regexp.new(/\A\s*(respectcase)\s*/i)
75
+ end
76
+
69
77
  # label
70
78
  class Taxlabels < Token
71
79
  @regexp = Regexp.new(/\A\s*(\s*taxlabels\s*)\s*/i)
72
80
  end
73
81
 
74
- # same as ID
75
- class Label < Token
76
- @regexp = Regexp.new('\A\s*((\'+[^\']+\'+)|(\"+[^\"]+\"+)|(\w[^,:(); \t\n]*|_)+)\s*') # matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" " # choking on 'Foo_stuff_things'
82
+ class LabelBase < Token
77
83
  def initialize(str)
78
84
  str.strip!
79
85
  str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
@@ -83,6 +89,20 @@ module NexusParser::Tokens
83
89
  end
84
90
  end
85
91
 
92
+ class Label < LabelBase
93
+ @regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|(\w[^,:(); \t\n]*)+)\s*/) # matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" "
94
+ def initialize(str)
95
+ super(str)
96
+ end
97
+ end
98
+
99
+ class CharacterLabel < LabelBase
100
+ @regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|[^ \t\n\/\'\",;]+)\s*/)
101
+ def initialize(str)
102
+ super(str)
103
+ end
104
+ end
105
+
86
106
  class ChrsBlk < Token
87
107
  @regexp = Regexp.new(/\A\s*(characters\s*;)\s*/i)
88
108
  end
@@ -111,10 +131,50 @@ module NexusParser::Tokens
111
131
  class RowVec < Token
112
132
  @regexp = Regexp.new(/\A\s*(.+)\s*\n/i)
113
133
  def initialize(str)
114
- # meh! Ruby is simpler to read than Perl?
115
- # handles both () and {} style multistates
116
- s = str.split(/\(|\)|\}|\{/).collect{|s| s=~ /[\,|\s]/ ? s.split(/[\,|\s]/) : s}.inject([]){|sum, x| x.class == Array ? sum << x.delete_if {|y| y == "" } : sum + x.strip.split(//)}
117
- @value = s
134
+ # Strip commas, spaces, and tabs everywhere, both outside and inside of groupings; ignoring them is fine.
135
+ str.gsub!(/[\, \t]/, '')
136
+
137
+ groupers = ['(', ')', '{', '}']
138
+ openers = ['(', '{']
139
+ closers = [')', '}']
140
+ closer_for = { '(' => ')', '{' => '}' }
141
+
142
+ a = []
143
+ group = nil
144
+ group_closer = nil
145
+ str.each_char { |c|
146
+ if groupers.include? c
147
+ if ((openers.include?(c) && !group.nil?) ||
148
+ (closers.include?(c) && (group.nil? || c != group_closer)))
149
+ raise(NexusParser::ParseError,
150
+ "Mismatched grouping in matrix row '#{str}'")
151
+ end
152
+
153
+ if openers.include? c
154
+ group = []
155
+ group_closer = closer_for[c]
156
+ else # c is a closer
157
+ if group.count == 1
158
+ a << group.first
159
+ elsif group.count > 1
160
+ a << group
161
+ end
162
+ group = nil
163
+ group_closer = nil
164
+ end
165
+ else
166
+ if group.nil?
167
+ a << c
168
+ else
169
+ group << c
170
+ end
171
+ end
172
+ }
173
+
174
+ raise(NexusParser::ParseError,
175
+ "Unclosed grouping in matrix row '#{str}'") if !group.nil?
176
+
177
+ @value = a
118
178
  end
119
179
  end
120
180
 
@@ -122,6 +182,14 @@ module NexusParser::Tokens
122
182
  @regexp = Regexp.new(/\A\s*(CHARSTATELABELS)\s*/i)
123
183
  end
124
184
 
185
+ class CharLabels < Token
186
+ @regexp = Regexp.new(/\A\s*(CHARLABELS)\s*/i)
187
+ end
188
+
189
+ class StateLabels < Token
190
+ @regexp = Regexp.new(/\A\s*(STATELABELS)\s*/i)
191
+ end
192
+
125
193
  class MesquiteIDs < Token
126
194
  @regexp = Regexp.new(/\A\s*(IDS[^;]*;)\s*/i)
127
195
  end
@@ -133,35 +201,35 @@ module NexusParser::Tokens
133
201
  # unparsed blocks
134
202
 
135
203
  class TreesBlk < Token
136
- @regexp = Regexp.new(/\A\s*(trees;.*?END;)\s*/im) # note the multi-line /m
204
+ @regexp = Regexp.new(/\A\s*(trees;.*?#{ENDBLKSTR};)\s*/im) # note the multi-line /m
137
205
  end
138
206
 
139
207
  class SetsBlk < Token
140
- @regexp = Regexp.new(/\A\s*(sets;.*?END;)\s*/im)
208
+ @regexp = Regexp.new(/\A\s*(sets;.*?#{ENDBLKSTR};)\s*/im)
141
209
  end
142
210
 
143
211
  class MqCharModelsBlk < Token
144
- @regexp = Regexp.new(/\A\s*(MESQUITECHARMODELS;.*?END;)\s*/im)
212
+ @regexp = Regexp.new(/\A\s*(MESQUITECHARMODELS;.*?#{ENDBLKSTR};)\s*/im)
145
213
  end
146
214
 
147
215
  class LabelsBlk < Token
148
- @regexp = Regexp.new(/\A\s*(LABELS;.*?END;)\s*/im)
216
+ @regexp = Regexp.new(/\A\s*(LABELS;.*?#{ENDBLKSTR};)\s*/im)
149
217
  end
150
218
 
151
219
  class AssumptionsBlk < Token
152
- @regexp = Regexp.new(/\A\s*(ASSUMPTIONS;.*?END;)\s*/im)
220
+ @regexp = Regexp.new(/\A\s*(ASSUMPTIONS;.*?#{ENDBLKSTR};)\s*/im)
153
221
  end
154
222
 
155
223
  class CodonsBlk < Token
156
- @regexp = Regexp.new(/\A\s*(CODONS;.*?END;)\s*/im)
224
+ @regexp = Regexp.new(/\A\s*(CODONS;.*?#{ENDBLKSTR};)\s*/im)
157
225
  end
158
226
 
159
227
  class MesquiteBlk < Token
160
- @regexp = Regexp.new(/\A\s*(Mesquite;.*?END;)\s*/im)
228
+ @regexp = Regexp.new(/\A\s*(Mesquite;.*?#{ENDBLKSTR};)\s*/im)
161
229
  end
162
230
 
163
231
  class BlkEnd < Token
164
- @regexp = Regexp.new(/\A[\s]*(END;)\s*/i)
232
+ @regexp = Regexp.new(/\A[\s]*(#{ENDBLKSTR};)\s*/i)
165
233
  end
166
234
 
167
235
  class LBracket < Token
@@ -188,16 +256,6 @@ module NexusParser::Tokens
188
256
  @regexp = Regexp.new('\A\s*(\/)\s*')
189
257
  end
190
258
 
191
- # labels
192
- class ID < Token
193
- @regexp = Regexp.new('\A\s*((\'[^\']+\')|(\w[^,:(); \t\n]*|_)+)\s*')
194
- def initialize(str)
195
- str.strip!
196
- str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
197
- @value = str
198
- end
199
- end
200
-
201
259
  class Colon < Token
202
260
  @regexp = Regexp.new('\A\s*(:)\s*')
203
261
  end
@@ -210,65 +268,10 @@ module NexusParser::Tokens
210
268
  @regexp = Regexp.new('\A\s*(\,)\s*')
211
269
  end
212
270
 
213
- class Number < Token
214
- @regexp = Regexp.new('\A\s*(-?\d+(\.\d+)?([eE][+-]?\d+)?)\s*')
215
- def initialize(str)
216
- # a little oddness here, in some case we don't want to include the .0
217
- # see issues with numbers as labels
218
- if str =~ /\./
219
- @value = str.to_f
220
- else
221
- @value = str.to_i
222
- end
223
-
224
- end
271
+ class PositiveInteger < Token
272
+ @regexp = Regexp.new('\A\s*(\d+)\s*')
225
273
  end
226
274
 
227
275
  # NexusParser::Tokens::NexusComment
228
276
 
229
- # this list also defines priority, i.e. if tokens have overlap (which they shouldn't!!) then the earlier indexed token will match first
230
- def self.nexus_file_token_list
231
- [ NexusParser::Tokens::NexusStart,
232
- NexusParser::Tokens::BeginBlk,
233
- NexusParser::Tokens::EndBlk,
234
- NexusParser::Tokens::AuthorsBlk,
235
- NexusParser::Tokens::SetsBlk,
236
- NexusParser::Tokens::MqCharModelsBlk,
237
- NexusParser::Tokens::AssumptionsBlk,
238
- NexusParser::Tokens::CodonsBlk,
239
- NexusParser::Tokens::MesquiteBlk,
240
- NexusParser::Tokens::TreesBlk,
241
- NexusParser::Tokens::LabelsBlk,
242
- NexusParser::Tokens::TaxaBlk,
243
- NexusParser::Tokens::NotesBlk,
244
- NexusParser::Tokens::Title,
245
- NexusParser::Tokens::Taxlabels,
246
- NexusParser::Tokens::Dimensions,
247
- NexusParser::Tokens::FileLbl,
248
- NexusParser::Tokens::Format,
249
- NexusParser::Tokens::Equals,
250
- NexusParser::Tokens::ValuePair, # this has bad overlap with Label and likely IDs (need to kill the latter, its a lesser Label)
251
- NexusParser::Tokens::CharStateLabels,
252
- NexusParser::Tokens::ChrsBlk,
253
- NexusParser::Tokens::Number,
254
- NexusParser::Tokens::Matrix,
255
- NexusParser::Tokens::SemiColon,
256
- NexusParser::Tokens::MesquiteIDs,
257
- NexusParser::Tokens::MesquiteBlockID,
258
- NexusParser::Tokens::BlkEnd,
259
- NexusParser::Tokens::Colon,
260
- NexusParser::Tokens::BckSlash,
261
- NexusParser::Tokens::Comma,
262
- NexusParser::Tokens::LParen,
263
- NexusParser::Tokens::RParen,
264
- NexusParser::Tokens::LBracket,
265
- NexusParser::Tokens::RBracket,
266
- NexusParser::Tokens::Label, # must be before RowVec
267
- NexusParser::Tokens::RowVec,
268
- NexusParser::Tokens::LinkLine,
269
- NexusParser::Tokens::ID # need to trash this
270
- ]
271
- end
272
-
273
277
  end
274
-
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module NexusParser
4
- VERSION = "1.2.0"
4
+ VERSION = "1.2.2"
5
5
  end
data/lib/nexus_parser.rb CHANGED
@@ -3,9 +3,6 @@
3
3
  # uses the PhyloTree parser/lexer engine by Krishna Dole which in turn was based on
4
4
  # Thomas Mailund's <mailund@birc.dk> 'newick-1.0.5' Python library
5
5
 
6
- # outstanding issues:
7
- ## need to resolve Tokens Labels, ValuePair, IDs
8
-
9
6
  module NexusParser
10
7
 
11
8
  require File.expand_path(File.join(File.dirname(__FILE__), 'nexus_parser', 'tokens'))
@@ -75,7 +72,7 @@ class NexusParser
75
72
  class Coding
76
73
  # unfortunately we need this for notes
77
74
  attr_accessor :notes
78
- attr_writer :state
75
+ attr_writer :state
79
76
 
80
77
  def initialize(options = {})
81
78
  @states = options[:states]
@@ -85,7 +82,7 @@ class NexusParser
85
82
  def states
86
83
  @states.class == Array ? @states : [@states]
87
84
  end
88
-
85
+
89
86
  end
90
87
 
91
88
  class Note
@@ -118,7 +115,7 @@ class NexusParser
118
115
  end
119
116
  end
120
117
 
121
- end
118
+ end # end NexusParser
122
119
 
123
120
 
124
121
  # constructs the NexusParser
@@ -141,6 +138,9 @@ class Builder
141
138
  def code_row(taxon_index, rowvector)
142
139
 
143
140
  @nf.characters.each_with_index do |c, i|
141
+ raise(ParseError,
142
+ "Row #{taxon_index} of the matrix is too short") if rowvector[i].nil?
143
+
144
144
  @nf.codings[taxon_index.to_i] = [] if !@nf.codings[taxon_index.to_i]
145
145
  @nf.codings[taxon_index.to_i][i] = NexusParser::Coding.new(:states => rowvector[i])
146
146
 
@@ -185,7 +185,7 @@ class Builder
185
185
 
186
186
  # need to create the characters
187
187
 
188
- raise(NexusParser::ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
188
+ raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
189
189
 
190
190
  (@nf.characters[@index].name = @opt[:name]) if @opt[:name]
191
191
 
@@ -193,18 +193,45 @@ class Builder
193
193
  @opt.delete(:name)
194
194
 
195
195
  # the rest have states
196
- @opt.keys.each do |k|
196
+ create_or_update_states_for_character(@index, @opt)
197
+ end
198
+
199
+ def update_chr_name(i, name)
200
+ raise(ParseError, "There are #{@nf.characters.count} characters but we're trying to update from row #{i + 1} of the CHARLABELS list - check your NCHAR and/or the length of your list.") if !@nf.characters[i]
201
+
202
+ # The CHARLABELS list is unindexed, so users are allowed to use '_' to
203
+ # indicate that a character name is unspecified.
204
+ @nf.characters[i].name = (name == '_' ? '' : name)
205
+ end
206
+
207
+ # legal hash keys are :index and integers that point to state labels
208
+ def update_chr_states(options = {})
209
+ return false if !options[:index]
210
+
211
+ @opt = options
212
+
213
+ @index = @opt[:index].to_i
214
+
215
+ raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the STATELABELS. Check the indices.") if !@nf.characters[@index]
216
+
217
+ @opt.delete(:index)
218
+
219
+ # the rest have states
220
+ create_or_update_states_for_character(@index, @opt)
221
+ end
197
222
 
198
- if (@nf.characters[@index].states != {}) && @nf.characters[@index].states[k] # state exists
223
+ def create_or_update_states_for_character(i, options)
224
+ options.keys.each do |k|
225
+
226
+ if (@nf.characters[i].states != {}) && @nf.characters[i].states[k] # state exists
199
227
 
200
228
  ## !! ONLY HANDLES NAME, UPDATE TO HANDLE notes etc. when we get them ##
201
- update_state(@index, :index => k, :name => @opt[k])
229
+ update_state(i, :index => k, :name => options[k])
202
230
 
203
231
  else # doesn't, create it
204
- @nf.characters[@index].add_state(:label => k.to_s, :name => @opt[k])
232
+ @nf.characters[i].add_state(:label => k.to_s, :name => options[k])
205
233
  end
206
234
  end
207
-
208
235
  end
209
236
 
210
237
  def update_state(chr_index, options = {})
@@ -256,7 +283,7 @@ class Builder
256
283
  @nf
257
284
  end
258
285
 
259
- end # end file
286
+ end # end Builder
260
287
 
261
288
  # NexusParser::ParseError
262
289
  class ParseError < StandardError
@@ -270,7 +297,7 @@ def parse_nexus_file(input)
270
297
  @input = input
271
298
  @input.gsub!(/\[[^\]]*\]/,'') # strip out all comments BEFORE we parse the file
272
299
  # quickly peek at the input, does this look like a Nexus file?
273
- if !(@input =~ /\#Nexus/i) || !(@input =~ /Begin/i) || !(@input =~ /Matrix/i) || !(@input =~ /end\;/i)
300
+ if !(@input =~ /\#Nexus/i) || !(@input =~ /Begin/i) || !(@input =~ /Matrix/i) || !(@input =~ /(end|endblock)\;/i)
274
301
  raise(NexusParser::ParseError, "File is missing at least some required headers, check formatting.", caller)
275
302
  end
276
303