nexus_parser 1.2.0 → 1.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/lib/nexus_parser/lexer.rb +0 -10
- data/lib/nexus_parser/parser.rb +146 -77
- data/lib/nexus_parser/tokens.rb +87 -84
- data/lib/nexus_parser/version.rb +1 -1
- data/lib/nexus_parser.rb +41 -14
- data/test/test_nexus_parser.rb +371 -26
- metadata +3 -3
data/lib/nexus_parser/tokens.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
module NexusParser::Tokens
|
2
2
|
|
3
|
+
ENDBLKSTR = '(end|endblock)'.freeze
|
4
|
+
QUOTEDLABEL = '(\'+[^\']+\'+)|(\"+[^\"]+\"+)'
|
5
|
+
|
3
6
|
class Token
|
4
7
|
# this allows access to the class attribute regexp, without using a class variable
|
5
8
|
class << self; attr_reader :regexp; end
|
@@ -31,12 +34,12 @@ module NexusParser::Tokens
|
|
31
34
|
end
|
32
35
|
|
33
36
|
class EndBlk < Token
|
34
|
-
@regexp = Regexp.new(/\A\s*([\s]
|
37
|
+
@regexp = Regexp.new(/\A\s*([\s]*#{ENDBLKSTR}[\s]*;[\s]*)/i)
|
35
38
|
end
|
36
39
|
|
37
40
|
# label
|
38
41
|
class AuthorsBlk < Token
|
39
|
-
@regexp = Regexp.new(/\A\s*(Authors
|
42
|
+
@regexp = Regexp.new(/\A\s*(Authors;.*?#{ENDBLKSTR};)\s*/im)
|
40
43
|
end
|
41
44
|
|
42
45
|
# label
|
@@ -66,14 +69,17 @@ module NexusParser::Tokens
|
|
66
69
|
@regexp = Regexp.new(/\A\s*(format)\s*/i)
|
67
70
|
end
|
68
71
|
|
72
|
+
# TODO: Handled, but ignored
|
73
|
+
class RespectCase < Token
|
74
|
+
@regexp = Regexp.new(/\A\s*(respectcase)\s*/i)
|
75
|
+
end
|
76
|
+
|
69
77
|
# label
|
70
78
|
class Taxlabels < Token
|
71
79
|
@regexp = Regexp.new(/\A\s*(\s*taxlabels\s*)\s*/i)
|
72
80
|
end
|
73
81
|
|
74
|
-
|
75
|
-
class Label < Token
|
76
|
-
@regexp = Regexp.new('\A\s*((\'+[^\']+\'+)|(\"+[^\"]+\"+)|(\w[^,:(); \t\n]*|_)+)\s*') # matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" " # choking on 'Foo_stuff_things'
|
82
|
+
class LabelBase < Token
|
77
83
|
def initialize(str)
|
78
84
|
str.strip!
|
79
85
|
str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
|
@@ -83,6 +89,20 @@ module NexusParser::Tokens
|
|
83
89
|
end
|
84
90
|
end
|
85
91
|
|
92
|
+
class Label < LabelBase
|
93
|
+
@regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|(\w[^,:(); \t\n]*)+)\s*/) # matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" "
|
94
|
+
def initialize(str)
|
95
|
+
super(str)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
class CharacterLabel < LabelBase
|
100
|
+
@regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|[^ \t\n\/\'\",;]+)\s*/)
|
101
|
+
def initialize(str)
|
102
|
+
super(str)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
86
106
|
class ChrsBlk < Token
|
87
107
|
@regexp = Regexp.new(/\A\s*(characters\s*;)\s*/i)
|
88
108
|
end
|
@@ -111,10 +131,50 @@ module NexusParser::Tokens
|
|
111
131
|
class RowVec < Token
|
112
132
|
@regexp = Regexp.new(/\A\s*(.+)\s*\n/i)
|
113
133
|
def initialize(str)
|
114
|
-
#
|
115
|
-
|
116
|
-
|
117
|
-
|
134
|
+
# We ignore commas outside (and inside) of groupings, it's fine.
|
135
|
+
str.gsub!(/[\, \t]/, '')
|
136
|
+
|
137
|
+
groupers = ['(', ')', '{', '}']
|
138
|
+
openers = ['(', '{']
|
139
|
+
closers = [')', '}']
|
140
|
+
closer_for = { '(' => ')', '{' => '}' }
|
141
|
+
|
142
|
+
a = []
|
143
|
+
group = nil
|
144
|
+
group_closer = nil
|
145
|
+
str.each_char { |c|
|
146
|
+
if groupers.include? c
|
147
|
+
if ((openers.include?(c) && !group.nil?) ||
|
148
|
+
(closers.include?(c) && (group.nil? || c != group_closer)))
|
149
|
+
raise(NexusParser::ParseError,
|
150
|
+
"Mismatched grouping in matrix row '#{str}'")
|
151
|
+
end
|
152
|
+
|
153
|
+
if openers.include? c
|
154
|
+
group = []
|
155
|
+
group_closer = closer_for[c]
|
156
|
+
else # c is a closer
|
157
|
+
if group.count == 1
|
158
|
+
a << group.first
|
159
|
+
elsif group.count > 1
|
160
|
+
a << group
|
161
|
+
end
|
162
|
+
group = nil
|
163
|
+
group_closer = nil
|
164
|
+
end
|
165
|
+
else
|
166
|
+
if group.nil?
|
167
|
+
a << c
|
168
|
+
else
|
169
|
+
group << c
|
170
|
+
end
|
171
|
+
end
|
172
|
+
}
|
173
|
+
|
174
|
+
raise(NexusParser::ParseError,
|
175
|
+
"Unclosed grouping in matrix row '#{str}'") if !group.nil?
|
176
|
+
|
177
|
+
@value = a
|
118
178
|
end
|
119
179
|
end
|
120
180
|
|
@@ -122,6 +182,14 @@ module NexusParser::Tokens
|
|
122
182
|
@regexp = Regexp.new(/\A\s*(CHARSTATELABELS)\s*/i)
|
123
183
|
end
|
124
184
|
|
185
|
+
class CharLabels < Token
|
186
|
+
@regexp = Regexp.new(/\A\s*(CHARLABELS)\s*/i)
|
187
|
+
end
|
188
|
+
|
189
|
+
class StateLabels < Token
|
190
|
+
@regexp = Regexp.new(/\A\s*(STATELABELS)\s*/i)
|
191
|
+
end
|
192
|
+
|
125
193
|
class MesquiteIDs < Token
|
126
194
|
@regexp = Regexp.new(/\A\s*(IDS[^;]*;)\s*/i)
|
127
195
|
end
|
@@ -133,35 +201,35 @@ module NexusParser::Tokens
|
|
133
201
|
# unparsed blocks
|
134
202
|
|
135
203
|
class TreesBlk < Token
|
136
|
-
@regexp = Regexp.new(/\A\s*(trees
|
204
|
+
@regexp = Regexp.new(/\A\s*(trees;.*?#{ENDBLKSTR};)\s*/im) # note the multi-line /m
|
137
205
|
end
|
138
206
|
|
139
207
|
class SetsBlk < Token
|
140
|
-
@regexp = Regexp.new(/\A\s*(sets
|
208
|
+
@regexp = Regexp.new(/\A\s*(sets;.*?#{ENDBLKSTR};)\s*/im)
|
141
209
|
end
|
142
210
|
|
143
211
|
class MqCharModelsBlk < Token
|
144
|
-
@regexp = Regexp.new(/\A\s*(MESQUITECHARMODELS
|
212
|
+
@regexp = Regexp.new(/\A\s*(MESQUITECHARMODELS;.*?#{ENDBLKSTR};)\s*/im)
|
145
213
|
end
|
146
214
|
|
147
215
|
class LabelsBlk < Token
|
148
|
-
@regexp = Regexp.new(/\A\s*(LABELS
|
216
|
+
@regexp = Regexp.new(/\A\s*(LABELS;.*?#{ENDBLKSTR};)\s*/im)
|
149
217
|
end
|
150
218
|
|
151
219
|
class AssumptionsBlk < Token
|
152
|
-
@regexp = Regexp.new(/\A\s*(ASSUMPTIONS
|
220
|
+
@regexp = Regexp.new(/\A\s*(ASSUMPTIONS;.*?#{ENDBLKSTR};)\s*/im)
|
153
221
|
end
|
154
222
|
|
155
223
|
class CodonsBlk < Token
|
156
|
-
@regexp = Regexp.new(/\A\s*(CODONS
|
224
|
+
@regexp = Regexp.new(/\A\s*(CODONS;.*?#{ENDBLKSTR};)\s*/im)
|
157
225
|
end
|
158
226
|
|
159
227
|
class MesquiteBlk < Token
|
160
|
-
@regexp = Regexp.new(/\A\s*(Mesquite
|
228
|
+
@regexp = Regexp.new(/\A\s*(Mesquite;.*?#{ENDBLKSTR};)\s*/im)
|
161
229
|
end
|
162
230
|
|
163
231
|
class BlkEnd < Token
|
164
|
-
@regexp = Regexp.new(/\A[\s]*(
|
232
|
+
@regexp = Regexp.new(/\A[\s]*(#{ENDBLKSTR};)\s*/i)
|
165
233
|
end
|
166
234
|
|
167
235
|
class LBracket < Token
|
@@ -188,16 +256,6 @@ module NexusParser::Tokens
|
|
188
256
|
@regexp = Regexp.new('\A\s*(\/)\s*')
|
189
257
|
end
|
190
258
|
|
191
|
-
# labels
|
192
|
-
class ID < Token
|
193
|
-
@regexp = Regexp.new('\A\s*((\'[^\']+\')|(\w[^,:(); \t\n]*|_)+)\s*')
|
194
|
-
def initialize(str)
|
195
|
-
str.strip!
|
196
|
-
str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
|
197
|
-
@value = str
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
259
|
class Colon < Token
|
202
260
|
@regexp = Regexp.new('\A\s*(:)\s*')
|
203
261
|
end
|
@@ -210,65 +268,10 @@ module NexusParser::Tokens
|
|
210
268
|
@regexp = Regexp.new('\A\s*(\,)\s*')
|
211
269
|
end
|
212
270
|
|
213
|
-
class
|
214
|
-
@regexp = Regexp.new('\A\s*(
|
215
|
-
def initialize(str)
|
216
|
-
# a little oddness here, in some case we don't want to include the .0
|
217
|
-
# see issues with numbers as labels
|
218
|
-
if str =~ /\./
|
219
|
-
@value = str.to_f
|
220
|
-
else
|
221
|
-
@value = str.to_i
|
222
|
-
end
|
223
|
-
|
224
|
-
end
|
271
|
+
class PositiveInteger < Token
|
272
|
+
@regexp = Regexp.new('\A\s*(\d+)\s*')
|
225
273
|
end
|
226
274
|
|
227
275
|
# NexusParser::Tokens::NexusComment
|
228
276
|
|
229
|
-
# this list also defines priority, i.e. if tokens have overlap (which they shouldn't!!) then the earlier indexed token will match first
|
230
|
-
def self.nexus_file_token_list
|
231
|
-
[ NexusParser::Tokens::NexusStart,
|
232
|
-
NexusParser::Tokens::BeginBlk,
|
233
|
-
NexusParser::Tokens::EndBlk,
|
234
|
-
NexusParser::Tokens::AuthorsBlk,
|
235
|
-
NexusParser::Tokens::SetsBlk,
|
236
|
-
NexusParser::Tokens::MqCharModelsBlk,
|
237
|
-
NexusParser::Tokens::AssumptionsBlk,
|
238
|
-
NexusParser::Tokens::CodonsBlk,
|
239
|
-
NexusParser::Tokens::MesquiteBlk,
|
240
|
-
NexusParser::Tokens::TreesBlk,
|
241
|
-
NexusParser::Tokens::LabelsBlk,
|
242
|
-
NexusParser::Tokens::TaxaBlk,
|
243
|
-
NexusParser::Tokens::NotesBlk,
|
244
|
-
NexusParser::Tokens::Title,
|
245
|
-
NexusParser::Tokens::Taxlabels,
|
246
|
-
NexusParser::Tokens::Dimensions,
|
247
|
-
NexusParser::Tokens::FileLbl,
|
248
|
-
NexusParser::Tokens::Format,
|
249
|
-
NexusParser::Tokens::Equals,
|
250
|
-
NexusParser::Tokens::ValuePair, # this has bad overlap with Label and likely IDs (need to kill the latter, its a lesser Label)
|
251
|
-
NexusParser::Tokens::CharStateLabels,
|
252
|
-
NexusParser::Tokens::ChrsBlk,
|
253
|
-
NexusParser::Tokens::Number,
|
254
|
-
NexusParser::Tokens::Matrix,
|
255
|
-
NexusParser::Tokens::SemiColon,
|
256
|
-
NexusParser::Tokens::MesquiteIDs,
|
257
|
-
NexusParser::Tokens::MesquiteBlockID,
|
258
|
-
NexusParser::Tokens::BlkEnd,
|
259
|
-
NexusParser::Tokens::Colon,
|
260
|
-
NexusParser::Tokens::BckSlash,
|
261
|
-
NexusParser::Tokens::Comma,
|
262
|
-
NexusParser::Tokens::LParen,
|
263
|
-
NexusParser::Tokens::RParen,
|
264
|
-
NexusParser::Tokens::LBracket,
|
265
|
-
NexusParser::Tokens::RBracket,
|
266
|
-
NexusParser::Tokens::Label, # must be before RowVec
|
267
|
-
NexusParser::Tokens::RowVec,
|
268
|
-
NexusParser::Tokens::LinkLine,
|
269
|
-
NexusParser::Tokens::ID # need to trash this
|
270
|
-
]
|
271
|
-
end
|
272
|
-
|
273
277
|
end
|
274
|
-
|
data/lib/nexus_parser/version.rb
CHANGED
data/lib/nexus_parser.rb
CHANGED
@@ -3,9 +3,6 @@
|
|
3
3
|
# uses the PhyloTree parser/lexer engine by Krishna Dole which in turn was based on
|
4
4
|
# Thomas Mailund's <mailund@birc.dk> 'newick-1.0.5' Python library
|
5
5
|
|
6
|
-
# outstanding issues:
|
7
|
-
## need to resolve Tokens Labels, ValuePair, IDs
|
8
|
-
|
9
6
|
module NexusParser
|
10
7
|
|
11
8
|
require File.expand_path(File.join(File.dirname(__FILE__), 'nexus_parser', 'tokens'))
|
@@ -75,7 +72,7 @@ class NexusParser
|
|
75
72
|
class Coding
|
76
73
|
# unfortunately we need this for notes
|
77
74
|
attr_accessor :notes
|
78
|
-
attr_writer :state
|
75
|
+
attr_writer :state
|
79
76
|
|
80
77
|
def initialize(options = {})
|
81
78
|
@states = options[:states]
|
@@ -85,7 +82,7 @@ class NexusParser
|
|
85
82
|
def states
|
86
83
|
@states.class == Array ? @states : [@states]
|
87
84
|
end
|
88
|
-
|
85
|
+
|
89
86
|
end
|
90
87
|
|
91
88
|
class Note
|
@@ -118,7 +115,7 @@ class NexusParser
|
|
118
115
|
end
|
119
116
|
end
|
120
117
|
|
121
|
-
end
|
118
|
+
end # end NexusParser
|
122
119
|
|
123
120
|
|
124
121
|
# constructs the NexusParser
|
@@ -141,6 +138,9 @@ class Builder
|
|
141
138
|
def code_row(taxon_index, rowvector)
|
142
139
|
|
143
140
|
@nf.characters.each_with_index do |c, i|
|
141
|
+
raise(ParseError,
|
142
|
+
"Row #{taxon_index} of the matrix is too short") if rowvector[i].nil?
|
143
|
+
|
144
144
|
@nf.codings[taxon_index.to_i] = [] if !@nf.codings[taxon_index.to_i]
|
145
145
|
@nf.codings[taxon_index.to_i][i] = NexusParser::Coding.new(:states => rowvector[i])
|
146
146
|
|
@@ -185,7 +185,7 @@ class Builder
|
|
185
185
|
|
186
186
|
# need to create the characters
|
187
187
|
|
188
|
-
raise(
|
188
|
+
raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
|
189
189
|
|
190
190
|
(@nf.characters[@index].name = @opt[:name]) if @opt[:name]
|
191
191
|
|
@@ -193,18 +193,45 @@ class Builder
|
|
193
193
|
@opt.delete(:name)
|
194
194
|
|
195
195
|
# the rest have states
|
196
|
-
@opt
|
196
|
+
create_or_update_states_for_character(@index, @opt)
|
197
|
+
end
|
198
|
+
|
199
|
+
def update_chr_name(i, name)
|
200
|
+
raise(ParseError, "There are #{@nf.characters.count} characters but we're trying to update from row #{i + 1} of the CHARLABELS list - check your NCHAR and/or the length of your list.") if !@nf.characters[i]
|
201
|
+
|
202
|
+
# The CHARLABELS list is unindexed, so users are allowed to use '_' to
|
203
|
+
# indicate that a character name is unspecified.
|
204
|
+
@nf.characters[i].name = (name == '_' ? '' : name)
|
205
|
+
end
|
206
|
+
|
207
|
+
# legal hash keys are :index and integers that point to state labels
|
208
|
+
def update_chr_states(options = {})
|
209
|
+
return false if !options[:index]
|
210
|
+
|
211
|
+
@opt = options
|
212
|
+
|
213
|
+
@index = @opt[:index].to_i
|
214
|
+
|
215
|
+
raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the STATELABELS. Check the indices.") if !@nf.characters[@index]
|
216
|
+
|
217
|
+
@opt.delete(:index)
|
218
|
+
|
219
|
+
# the rest have states
|
220
|
+
create_or_update_states_for_character(@index, @opt)
|
221
|
+
end
|
197
222
|
|
198
|
-
|
223
|
+
def create_or_update_states_for_character(i, options)
|
224
|
+
options.keys.each do |k|
|
225
|
+
|
226
|
+
if (@nf.characters[i].states != {}) && @nf.characters[i].states[k] # state exists
|
199
227
|
|
200
228
|
## !! ONLY HANDLES NAME, UPDATE TO HANDLE notes etc. when we get them ##
|
201
|
-
update_state(
|
229
|
+
update_state(i, :index => k, :name => options[k])
|
202
230
|
|
203
231
|
else # doesn't, create it
|
204
|
-
@nf.characters[
|
232
|
+
@nf.characters[i].add_state(:label => k.to_s, :name => options[k])
|
205
233
|
end
|
206
234
|
end
|
207
|
-
|
208
235
|
end
|
209
236
|
|
210
237
|
def update_state(chr_index, options = {})
|
@@ -256,7 +283,7 @@ class Builder
|
|
256
283
|
@nf
|
257
284
|
end
|
258
285
|
|
259
|
-
end # end
|
286
|
+
end # end Builder
|
260
287
|
|
261
288
|
# NexusParser::ParseError
|
262
289
|
class ParseError < StandardError
|
@@ -270,7 +297,7 @@ def parse_nexus_file(input)
|
|
270
297
|
@input = input
|
271
298
|
@input.gsub!(/\[[^\]]*\]/,'') # strip out all comments BEFORE we parse the file
|
272
299
|
# quickly peek at the input, does this look like a Nexus file?
|
273
|
-
if !(@input =~ /\#Nexus/i) || !(@input =~ /Begin/i) || !(@input =~ /Matrix/i) || !(@input =~ /end\;/i)
|
300
|
+
if !(@input =~ /\#Nexus/i) || !(@input =~ /Begin/i) || !(@input =~ /Matrix/i) || !(@input =~ /(end|endblock)\;/i)
|
274
301
|
raise(NexusParser::ParseError, "File is missing at least some required headers, check formatting.", caller)
|
275
302
|
end
|
276
303
|
|