nexus_parser 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,8 @@
1
1
  module NexusParser::Tokens
2
2
 
3
+ ENDBLKSTR = '(end|endblock)'.freeze
4
+ QUOTEDLABEL = '(\'+[^\']+\'+)|(\"+[^\"]+\"+)'
5
+
3
6
  class Token
4
7
  # this allows access to the class attribute regexp, without using a class variable
5
8
  class << self; attr_reader :regexp; end
@@ -31,12 +34,12 @@ module NexusParser::Tokens
31
34
  end
32
35
 
33
36
  class EndBlk < Token
34
- @regexp = Regexp.new(/\A\s*([\s]*End[\s]*;[\s]*)/i)
37
+ @regexp = Regexp.new(/\A\s*([\s]*#{ENDBLKSTR}[\s]*;[\s]*)/i)
35
38
  end
36
39
 
37
40
  # label
38
41
  class AuthorsBlk < Token
39
- @regexp = Regexp.new(/\A\s*(Authors;.*?END;)\s*/im)
42
+ @regexp = Regexp.new(/\A\s*(Authors;.*?#{ENDBLKSTR};)\s*/im)
40
43
  end
41
44
 
42
45
  # label
@@ -66,14 +69,17 @@ module NexusParser::Tokens
66
69
  @regexp = Regexp.new(/\A\s*(format)\s*/i)
67
70
  end
68
71
 
72
+ # TODO: RESPECTCASE is recognized as a token, but otherwise ignored
73
+ class RespectCase < Token
74
+ @regexp = Regexp.new(/\A\s*(respectcase)\s*/i)
75
+ end
76
+
69
77
  # label
70
78
  class Taxlabels < Token
71
79
  @regexp = Regexp.new(/\A\s*(\s*taxlabels\s*)\s*/i)
72
80
  end
73
81
 
74
- # same as ID
75
- class Label < Token
76
- @regexp = Regexp.new('\A\s*((\'+[^\']+\'+)|(\"+[^\"]+\"+)|(\w[^,:(); \t\n]*|_)+)\s*') # matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" " # choking on 'Foo_stuff_things'
82
+ class LabelBase < Token
77
83
  def initialize(str)
78
84
  str.strip!
79
85
  str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
@@ -83,6 +89,20 @@ module NexusParser::Tokens
83
89
  end
84
90
  end
85
91
 
92
+ class Label < LabelBase
93
+ @regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|(\w[^,:(); \t\n]*)+)\s*/) # matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" "
94
+ def initialize(str)
95
+ super(str)
96
+ end
97
+ end
98
+
99
+ class CharacterLabel < LabelBase
100
+ @regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|[^ \t\n\/\'\",;]+)\s*/)
101
+ def initialize(str)
102
+ super(str)
103
+ end
104
+ end
105
+
86
106
  class ChrsBlk < Token
87
107
  @regexp = Regexp.new(/\A\s*(characters\s*;)\s*/i)
88
108
  end
@@ -111,10 +131,50 @@ module NexusParser::Tokens
111
131
  class RowVec < Token
112
132
  @regexp = Regexp.new(/\A\s*(.+)\s*\n/i)
113
133
  def initialize(str)
114
- # meh! Ruby is simpler to read than Perl?
115
- # handles both () and {} style multistates
116
- s = str.split(/\(|\)|\}|\{/).collect{|s| s=~ /[\,|\s]/ ? s.split(/[\,|\s]/) : s}.inject([]){|sum, x| x.class == Array ? sum << x.delete_if {|y| y == "" } : sum + x.strip.split(//)}
117
- @value = s
134
+ # Commas, spaces and tabs are ignored both outside and inside of groupings.
135
+ str.gsub!(/[\, \t]/, '')
136
+
137
+ groupers = ['(', ')', '{', '}']
138
+ openers = ['(', '{']
139
+ closers = [')', '}']
140
+ closer_for = { '(' => ')', '{' => '}' }
141
+
142
+ a = []
143
+ group = nil
144
+ group_closer = nil
145
+ str.each_char { |c|
146
+ if groupers.include? c
147
+ if ((openers.include?(c) && !group.nil?) ||
148
+ (closers.include?(c) && (group.nil? || c != group_closer)))
149
+ raise(NexusParser::ParseError,
150
+ "Mismatched grouping in matrix row '#{str}'")
151
+ end
152
+
153
+ if openers.include? c
154
+ group = []
155
+ group_closer = closer_for[c]
156
+ else # c is a closer
157
+ if group.count == 1
158
+ a << group.first
159
+ elsif group.count > 1
160
+ a << group
161
+ end
162
+ group = nil
163
+ group_closer = nil
164
+ end
165
+ else
166
+ if group.nil?
167
+ a << c
168
+ else
169
+ group << c
170
+ end
171
+ end
172
+ }
173
+
174
+ raise(NexusParser::ParseError,
175
+ "Unclosed grouping in matrix row '#{str}'") if !group.nil?
176
+
177
+ @value = a
118
178
  end
119
179
  end
120
180
 
@@ -122,6 +182,14 @@ module NexusParser::Tokens
122
182
  @regexp = Regexp.new(/\A\s*(CHARSTATELABELS)\s*/i)
123
183
  end
124
184
 
185
+ class CharLabels < Token
186
+ @regexp = Regexp.new(/\A\s*(CHARLABELS)\s*/i)
187
+ end
188
+
189
+ class StateLabels < Token
190
+ @regexp = Regexp.new(/\A\s*(STATELABELS)\s*/i)
191
+ end
192
+
125
193
  class MesquiteIDs < Token
126
194
  @regexp = Regexp.new(/\A\s*(IDS[^;]*;)\s*/i)
127
195
  end
@@ -133,35 +201,35 @@ module NexusParser::Tokens
133
201
  # unparsed blocks
134
202
 
135
203
  class TreesBlk < Token
136
- @regexp = Regexp.new(/\A\s*(trees;.*?END;)\s*/im) # note the multi-line /m
204
+ @regexp = Regexp.new(/\A\s*(trees;.*?#{ENDBLKSTR};)\s*/im) # note the multi-line /m
137
205
  end
138
206
 
139
207
  class SetsBlk < Token
140
- @regexp = Regexp.new(/\A\s*(sets;.*?END;)\s*/im)
208
+ @regexp = Regexp.new(/\A\s*(sets;.*?#{ENDBLKSTR};)\s*/im)
141
209
  end
142
210
 
143
211
  class MqCharModelsBlk < Token
144
- @regexp = Regexp.new(/\A\s*(MESQUITECHARMODELS;.*?END;)\s*/im)
212
+ @regexp = Regexp.new(/\A\s*(MESQUITECHARMODELS;.*?#{ENDBLKSTR};)\s*/im)
145
213
  end
146
214
 
147
215
  class LabelsBlk < Token
148
- @regexp = Regexp.new(/\A\s*(LABELS;.*?END;)\s*/im)
216
+ @regexp = Regexp.new(/\A\s*(LABELS;.*?#{ENDBLKSTR};)\s*/im)
149
217
  end
150
218
 
151
219
  class AssumptionsBlk < Token
152
- @regexp = Regexp.new(/\A\s*(ASSUMPTIONS;.*?END;)\s*/im)
220
+ @regexp = Regexp.new(/\A\s*(ASSUMPTIONS;.*?#{ENDBLKSTR};)\s*/im)
153
221
  end
154
222
 
155
223
  class CodonsBlk < Token
156
- @regexp = Regexp.new(/\A\s*(CODONS;.*?END;)\s*/im)
224
+ @regexp = Regexp.new(/\A\s*(CODONS;.*?#{ENDBLKSTR};)\s*/im)
157
225
  end
158
226
 
159
227
  class MesquiteBlk < Token
160
- @regexp = Regexp.new(/\A\s*(Mesquite;.*?END;)\s*/im)
228
+ @regexp = Regexp.new(/\A\s*(Mesquite;.*?#{ENDBLKSTR};)\s*/im)
161
229
  end
162
230
 
163
231
  class BlkEnd < Token
164
- @regexp = Regexp.new(/\A[\s]*(END;)\s*/i)
232
+ @regexp = Regexp.new(/\A[\s]*(#{ENDBLKSTR};)\s*/i)
165
233
  end
166
234
 
167
235
  class LBracket < Token
@@ -188,16 +256,6 @@ module NexusParser::Tokens
188
256
  @regexp = Regexp.new('\A\s*(\/)\s*')
189
257
  end
190
258
 
191
- # labels
192
- class ID < Token
193
- @regexp = Regexp.new('\A\s*((\'[^\']+\')|(\w[^,:(); \t\n]*|_)+)\s*')
194
- def initialize(str)
195
- str.strip!
196
- str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
197
- @value = str
198
- end
199
- end
200
-
201
259
  class Colon < Token
202
260
  @regexp = Regexp.new('\A\s*(:)\s*')
203
261
  end
@@ -210,65 +268,10 @@ module NexusParser::Tokens
210
268
  @regexp = Regexp.new('\A\s*(\,)\s*')
211
269
  end
212
270
 
213
- class Number < Token
214
- @regexp = Regexp.new('\A\s*(-?\d+(\.\d+)?([eE][+-]?\d+)?)\s*')
215
- def initialize(str)
216
- # a little oddness here, in some case we don't want to include the .0
217
- # see issues with numbers as labels
218
- if str =~ /\./
219
- @value = str.to_f
220
- else
221
- @value = str.to_i
222
- end
223
-
224
- end
271
+ class PositiveInteger < Token
272
+ @regexp = Regexp.new('\A\s*(\d+)\s*')
225
273
  end
226
274
 
227
275
  # NexusParser::Tokens::NexusComment
228
276
 
229
- # this list also defines priority, i.e. if tokens have overlap (which they shouldn't!!) then the earlier indexed token will match first
230
- def self.nexus_file_token_list
231
- [ NexusParser::Tokens::NexusStart,
232
- NexusParser::Tokens::BeginBlk,
233
- NexusParser::Tokens::EndBlk,
234
- NexusParser::Tokens::AuthorsBlk,
235
- NexusParser::Tokens::SetsBlk,
236
- NexusParser::Tokens::MqCharModelsBlk,
237
- NexusParser::Tokens::AssumptionsBlk,
238
- NexusParser::Tokens::CodonsBlk,
239
- NexusParser::Tokens::MesquiteBlk,
240
- NexusParser::Tokens::TreesBlk,
241
- NexusParser::Tokens::LabelsBlk,
242
- NexusParser::Tokens::TaxaBlk,
243
- NexusParser::Tokens::NotesBlk,
244
- NexusParser::Tokens::Title,
245
- NexusParser::Tokens::Taxlabels,
246
- NexusParser::Tokens::Dimensions,
247
- NexusParser::Tokens::FileLbl,
248
- NexusParser::Tokens::Format,
249
- NexusParser::Tokens::Equals,
250
- NexusParser::Tokens::ValuePair, # this has bad overlap with Label and likely IDs (need to kill the latter, its a lesser Label)
251
- NexusParser::Tokens::CharStateLabels,
252
- NexusParser::Tokens::ChrsBlk,
253
- NexusParser::Tokens::Number,
254
- NexusParser::Tokens::Matrix,
255
- NexusParser::Tokens::SemiColon,
256
- NexusParser::Tokens::MesquiteIDs,
257
- NexusParser::Tokens::MesquiteBlockID,
258
- NexusParser::Tokens::BlkEnd,
259
- NexusParser::Tokens::Colon,
260
- NexusParser::Tokens::BckSlash,
261
- NexusParser::Tokens::Comma,
262
- NexusParser::Tokens::LParen,
263
- NexusParser::Tokens::RParen,
264
- NexusParser::Tokens::LBracket,
265
- NexusParser::Tokens::RBracket,
266
- NexusParser::Tokens::Label, # must be before RowVec
267
- NexusParser::Tokens::RowVec,
268
- NexusParser::Tokens::LinkLine,
269
- NexusParser::Tokens::ID # need to trash this
270
- ]
271
- end
272
-
273
277
  end
274
-
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module NexusParser
4
- VERSION = "1.2.0"
4
+ VERSION = "1.2.2"
5
5
  end
data/lib/nexus_parser.rb CHANGED
@@ -3,9 +3,6 @@
3
3
  # uses the PhyloTree parser/lexer engine by Krishna Dole which in turn was based on
4
4
  # Thomas Mailund's <mailund@birc.dk> 'newick-1.0.5' Python library
5
5
 
6
- # outstanding issues:
7
- ## need to resolve Tokens Labels, ValuePair, IDs
8
-
9
6
  module NexusParser
10
7
 
11
8
  require File.expand_path(File.join(File.dirname(__FILE__), 'nexus_parser', 'tokens'))
@@ -75,7 +72,7 @@ class NexusParser
75
72
  class Coding
76
73
  # unfortunately we need this for notes
77
74
  attr_accessor :notes
78
- attr_writer :state
75
+ attr_writer :state
79
76
 
80
77
  def initialize(options = {})
81
78
  @states = options[:states]
@@ -85,7 +82,7 @@ class NexusParser
85
82
  def states
86
83
  @states.class == Array ? @states : [@states]
87
84
  end
88
-
85
+
89
86
  end
90
87
 
91
88
  class Note
@@ -118,7 +115,7 @@ class NexusParser
118
115
  end
119
116
  end
120
117
 
121
- end
118
+ end # end NexusParser
122
119
 
123
120
 
124
121
  # constructs the NexusParser
@@ -141,6 +138,9 @@ class Builder
141
138
  def code_row(taxon_index, rowvector)
142
139
 
143
140
  @nf.characters.each_with_index do |c, i|
141
+ raise(ParseError,
142
+ "Row #{taxon_index} of the matrix is too short") if rowvector[i].nil?
143
+
144
144
  @nf.codings[taxon_index.to_i] = [] if !@nf.codings[taxon_index.to_i]
145
145
  @nf.codings[taxon_index.to_i][i] = NexusParser::Coding.new(:states => rowvector[i])
146
146
 
@@ -185,7 +185,7 @@ class Builder
185
185
 
186
186
  # need to create the characters
187
187
 
188
- raise(NexusParser::ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
188
+ raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
189
189
 
190
190
  (@nf.characters[@index].name = @opt[:name]) if @opt[:name]
191
191
 
@@ -193,18 +193,45 @@ class Builder
193
193
  @opt.delete(:name)
194
194
 
195
195
  # the rest have states
196
- @opt.keys.each do |k|
196
+ create_or_update_states_for_character(@index, @opt)
197
+ end
198
+
199
+ def update_chr_name(i, name)
200
+ raise(ParseError, "There are #{@nf.characters.count} characters but we're trying to update from row #{i + 1} of the CHARLABELS list - check your NCHAR and/or the length of your list.") if !@nf.characters[i]
201
+
202
+ # The CHARLABELS list is unindexed, so users are allowed to use '_' to
203
+ # indicate that a character name is unspecified.
204
+ @nf.characters[i].name = (name == '_' ? '' : name)
205
+ end
206
+
207
+ # legal hash keys are :index and integers that point to state labels
208
+ def update_chr_states(options = {})
209
+ return false if !options[:index]
210
+
211
+ @opt = options
212
+
213
+ @index = @opt[:index].to_i
214
+
215
+ raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the STATELABELS. Check the indices.") if !@nf.characters[@index]
216
+
217
+ @opt.delete(:index)
218
+
219
+ # the rest have states
220
+ create_or_update_states_for_character(@index, @opt)
221
+ end
197
222
 
198
- if (@nf.characters[@index].states != {}) && @nf.characters[@index].states[k] # state exists
223
+ def create_or_update_states_for_character(i, options)
224
+ options.keys.each do |k|
225
+
226
+ if (@nf.characters[i].states != {}) && @nf.characters[i].states[k] # state exists
199
227
 
200
228
  ## !! ONLY HANDLES NAME, UPDATE TO HANDLE notes etc. when we get them ##
201
- update_state(@index, :index => k, :name => @opt[k])
229
+ update_state(i, :index => k, :name => options[k])
202
230
 
203
231
  else # doesn't, create it
204
- @nf.characters[@index].add_state(:label => k.to_s, :name => @opt[k])
232
+ @nf.characters[i].add_state(:label => k.to_s, :name => options[k])
205
233
  end
206
234
  end
207
-
208
235
  end
209
236
 
210
237
  def update_state(chr_index, options = {})
@@ -256,7 +283,7 @@ class Builder
256
283
  @nf
257
284
  end
258
285
 
259
- end # end file
286
+ end # end Builder
260
287
 
261
288
  # NexusParser::ParseError
262
289
  class ParseError < StandardError
@@ -270,7 +297,7 @@ def parse_nexus_file(input)
270
297
  @input = input
271
298
  @input.gsub!(/\[[^\]]*\]/,'') # strip out all comments BEFORE we parse the file
272
299
  # quickly peek at the input, does this look like a Nexus file?
273
- if !(@input =~ /\#Nexus/i) || !(@input =~ /Begin/i) || !(@input =~ /Matrix/i) || !(@input =~ /end\;/i)
300
+ if !(@input =~ /\#Nexus/i) || !(@input =~ /Begin/i) || !(@input =~ /Matrix/i) || !(@input =~ /(end|endblock)\;/i)
274
301
  raise(NexusParser::ParseError, "File is missing at least some required headers, check formatting.", caller)
275
302
  end
276
303