nexus_parser 1.2.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/lib/nexus_parser/lexer.rb +0 -10
- data/lib/nexus_parser/parser.rb +146 -77
- data/lib/nexus_parser/tokens.rb +87 -84
- data/lib/nexus_parser/version.rb +1 -1
- data/lib/nexus_parser.rb +41 -14
- data/test/test_nexus_parser.rb +371 -26
- metadata +3 -3
data/lib/nexus_parser/tokens.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
module NexusParser::Tokens
|
2
2
|
|
3
|
+
# Regexp source fragment matching either spelling of the block terminator keyword ("end" or "endblock").
# It is interpolated into the token regexps below that need to recognize the end of a block.
ENDBLKSTR = '(end|endblock)'.freeze
|
4
|
+
# Regexp source fragment matching a label wrapped in one or more single quotes
# or in one or more double quotes. It is a bare alternation with no enclosing
# group, so it must be interpolated inside a group in the regexp that uses it.
# Frozen, like ENDBLKSTR, so the shared constant cannot be mutated in place.
QUOTEDLABEL = '(\'+[^\']+\'+)|(\"+[^\"]+\"+)'.freeze
|
5
|
+
|
3
6
|
class Token
|
4
7
|
# this allows access to the class attribute regexp, without using a class variable
|
5
8
|
class << self; attr_reader :regexp; end
|
@@ -31,12 +34,12 @@ module NexusParser::Tokens
|
|
31
34
|
end
|
32
35
|
|
33
36
|
class EndBlk < Token
|
34
|
-
@regexp = Regexp.new(/\A\s*([\s]
|
37
|
+
@regexp = Regexp.new(/\A\s*([\s]*#{ENDBLKSTR}[\s]*;[\s]*)/i)
|
35
38
|
end
|
36
39
|
|
37
40
|
# label
|
38
41
|
class AuthorsBlk < Token
|
39
|
-
@regexp = Regexp.new(/\A\s*(Authors
|
42
|
+
@regexp = Regexp.new(/\A\s*(Authors;.*?#{ENDBLKSTR};)\s*/im)
|
40
43
|
end
|
41
44
|
|
42
45
|
# label
|
@@ -66,14 +69,17 @@ module NexusParser::Tokens
|
|
66
69
|
@regexp = Regexp.new(/\A\s*(format)\s*/i)
|
67
70
|
end
|
68
71
|
|
72
|
+
# TODO: Handled, but ignored
|
73
|
+
class RespectCase < Token
|
74
|
+
@regexp = Regexp.new(/\A\s*(respectcase)\s*/i)
|
75
|
+
end
|
76
|
+
|
69
77
|
# label
|
70
78
|
class Taxlabels < Token
|
71
79
|
@regexp = Regexp.new(/\A\s*(\s*taxlabels\s*)\s*/i)
|
72
80
|
end
|
73
81
|
|
74
|
-
|
75
|
-
class Label < Token
|
76
|
-
@regexp = Regexp.new('\A\s*((\'+[^\']+\'+)|(\"+[^\"]+\"+)|(\w[^,:(); \t\n]*|_)+)\s*') # matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" " # choking on 'Foo_stuff_things'
|
82
|
+
class LabelBase < Token
|
77
83
|
def initialize(str)
|
78
84
|
str.strip!
|
79
85
|
str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
|
@@ -83,6 +89,20 @@ module NexusParser::Tokens
|
|
83
89
|
end
|
84
90
|
end
|
85
91
|
|
92
|
+
class Label < LabelBase
|
93
|
+
@regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|(\w[^,:(); \t\n]*)+)\s*/) # matches "foo and stuff", foo, 'stuff or foo', '''foo''', """bar""" BUT NOT ""foo" "
|
94
|
+
def initialize(str)
|
95
|
+
super(str)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
class CharacterLabel < LabelBase
|
100
|
+
@regexp = Regexp.new(/\A\s*(#{QUOTEDLABEL}|[^ \t\n\/\'\",;]+)\s*/)
|
101
|
+
def initialize(str)
|
102
|
+
super(str)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
86
106
|
class ChrsBlk < Token
|
87
107
|
@regexp = Regexp.new(/\A\s*(characters\s*;)\s*/i)
|
88
108
|
end
|
@@ -111,10 +131,50 @@ module NexusParser::Tokens
|
|
111
131
|
class RowVec < Token
|
112
132
|
@regexp = Regexp.new(/\A\s*(.+)\s*\n/i)
|
113
133
|
def initialize(str)
|
114
|
-
#
|
115
|
-
|
116
|
-
|
117
|
-
|
134
|
+
# Commas, spaces and tabs are stripped from the whole row, both outside and inside of groupings.
|
135
|
+
str.gsub!(/[\, \t]/, '')
|
136
|
+
|
137
|
+
groupers = ['(', ')', '{', '}']
|
138
|
+
openers = ['(', '{']
|
139
|
+
closers = [')', '}']
|
140
|
+
closer_for = { '(' => ')', '{' => '}' }
|
141
|
+
|
142
|
+
a = []
|
143
|
+
group = nil
|
144
|
+
group_closer = nil
|
145
|
+
str.each_char { |c|
|
146
|
+
if groupers.include? c
|
147
|
+
if ((openers.include?(c) && !group.nil?) ||
|
148
|
+
(closers.include?(c) && (group.nil? || c != group_closer)))
|
149
|
+
raise(NexusParser::ParseError,
|
150
|
+
"Mismatched grouping in matrix row '#{str}'")
|
151
|
+
end
|
152
|
+
|
153
|
+
if openers.include? c
|
154
|
+
group = []
|
155
|
+
group_closer = closer_for[c]
|
156
|
+
else # c is a closer
|
157
|
+
if group.count == 1
|
158
|
+
a << group.first
|
159
|
+
elsif group.count > 1
|
160
|
+
a << group
|
161
|
+
end
|
162
|
+
group = nil
|
163
|
+
group_closer = nil
|
164
|
+
end
|
165
|
+
else
|
166
|
+
if group.nil?
|
167
|
+
a << c
|
168
|
+
else
|
169
|
+
group << c
|
170
|
+
end
|
171
|
+
end
|
172
|
+
}
|
173
|
+
|
174
|
+
raise(NexusParser::ParseError,
|
175
|
+
"Unclosed grouping in matrix row '#{str}'") if !group.nil?
|
176
|
+
|
177
|
+
@value = a
|
118
178
|
end
|
119
179
|
end
|
120
180
|
|
@@ -122,6 +182,14 @@ module NexusParser::Tokens
|
|
122
182
|
@regexp = Regexp.new(/\A\s*(CHARSTATELABELS)\s*/i)
|
123
183
|
end
|
124
184
|
|
185
|
+
class CharLabels < Token
|
186
|
+
@regexp = Regexp.new(/\A\s*(CHARLABELS)\s*/i)
|
187
|
+
end
|
188
|
+
|
189
|
+
class StateLabels < Token
|
190
|
+
@regexp = Regexp.new(/\A\s*(STATELABELS)\s*/i)
|
191
|
+
end
|
192
|
+
|
125
193
|
class MesquiteIDs < Token
|
126
194
|
@regexp = Regexp.new(/\A\s*(IDS[^;]*;)\s*/i)
|
127
195
|
end
|
@@ -133,35 +201,35 @@ module NexusParser::Tokens
|
|
133
201
|
# unparsed blocks
|
134
202
|
|
135
203
|
class TreesBlk < Token
|
136
|
-
@regexp = Regexp.new(/\A\s*(trees
|
204
|
+
@regexp = Regexp.new(/\A\s*(trees;.*?#{ENDBLKSTR};)\s*/im) # note the multi-line /m
|
137
205
|
end
|
138
206
|
|
139
207
|
class SetsBlk < Token
|
140
|
-
@regexp = Regexp.new(/\A\s*(sets
|
208
|
+
@regexp = Regexp.new(/\A\s*(sets;.*?#{ENDBLKSTR};)\s*/im)
|
141
209
|
end
|
142
210
|
|
143
211
|
class MqCharModelsBlk < Token
|
144
|
-
@regexp = Regexp.new(/\A\s*(MESQUITECHARMODELS
|
212
|
+
@regexp = Regexp.new(/\A\s*(MESQUITECHARMODELS;.*?#{ENDBLKSTR};)\s*/im)
|
145
213
|
end
|
146
214
|
|
147
215
|
class LabelsBlk < Token
|
148
|
-
@regexp = Regexp.new(/\A\s*(LABELS
|
216
|
+
@regexp = Regexp.new(/\A\s*(LABELS;.*?#{ENDBLKSTR};)\s*/im)
|
149
217
|
end
|
150
218
|
|
151
219
|
class AssumptionsBlk < Token
|
152
|
-
@regexp = Regexp.new(/\A\s*(ASSUMPTIONS
|
220
|
+
@regexp = Regexp.new(/\A\s*(ASSUMPTIONS;.*?#{ENDBLKSTR};)\s*/im)
|
153
221
|
end
|
154
222
|
|
155
223
|
class CodonsBlk < Token
|
156
|
-
@regexp = Regexp.new(/\A\s*(CODONS
|
224
|
+
@regexp = Regexp.new(/\A\s*(CODONS;.*?#{ENDBLKSTR};)\s*/im)
|
157
225
|
end
|
158
226
|
|
159
227
|
class MesquiteBlk < Token
|
160
|
-
@regexp = Regexp.new(/\A\s*(Mesquite
|
228
|
+
@regexp = Regexp.new(/\A\s*(Mesquite;.*?#{ENDBLKSTR};)\s*/im)
|
161
229
|
end
|
162
230
|
|
163
231
|
class BlkEnd < Token
|
164
|
-
@regexp = Regexp.new(/\A[\s]*(
|
232
|
+
@regexp = Regexp.new(/\A[\s]*(#{ENDBLKSTR};)\s*/i)
|
165
233
|
end
|
166
234
|
|
167
235
|
class LBracket < Token
|
@@ -188,16 +256,6 @@ module NexusParser::Tokens
|
|
188
256
|
@regexp = Regexp.new('\A\s*(\/)\s*')
|
189
257
|
end
|
190
258
|
|
191
|
-
# labels
|
192
|
-
class ID < Token
|
193
|
-
@regexp = Regexp.new('\A\s*((\'[^\']+\')|(\w[^,:(); \t\n]*|_)+)\s*')
|
194
|
-
def initialize(str)
|
195
|
-
str.strip!
|
196
|
-
str = str[1..-2] if str[0..0] == "'" # get rid of quote marks
|
197
|
-
@value = str
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
259
|
class Colon < Token
|
202
260
|
@regexp = Regexp.new('\A\s*(:)\s*')
|
203
261
|
end
|
@@ -210,65 +268,10 @@ module NexusParser::Tokens
|
|
210
268
|
@regexp = Regexp.new('\A\s*(\,)\s*')
|
211
269
|
end
|
212
270
|
|
213
|
-
class
|
214
|
-
@regexp = Regexp.new('\A\s*(
|
215
|
-
def initialize(str)
|
216
|
-
# a little oddness here, in some case we don't want to include the .0
|
217
|
-
# see issues with numbers as labels
|
218
|
-
if str =~ /\./
|
219
|
-
@value = str.to_f
|
220
|
-
else
|
221
|
-
@value = str.to_i
|
222
|
-
end
|
223
|
-
|
224
|
-
end
|
271
|
+
class PositiveInteger < Token
|
272
|
+
@regexp = Regexp.new('\A\s*(\d+)\s*')
|
225
273
|
end
|
226
274
|
|
227
275
|
# NexusParser::Tokens::NexusComment
|
228
276
|
|
229
|
-
# this list also defines priority, i.e. if tokens have overlap (which they shouldn't!!) then the earlier indexed token will match first
|
230
|
-
def self.nexus_file_token_list
|
231
|
-
[ NexusParser::Tokens::NexusStart,
|
232
|
-
NexusParser::Tokens::BeginBlk,
|
233
|
-
NexusParser::Tokens::EndBlk,
|
234
|
-
NexusParser::Tokens::AuthorsBlk,
|
235
|
-
NexusParser::Tokens::SetsBlk,
|
236
|
-
NexusParser::Tokens::MqCharModelsBlk,
|
237
|
-
NexusParser::Tokens::AssumptionsBlk,
|
238
|
-
NexusParser::Tokens::CodonsBlk,
|
239
|
-
NexusParser::Tokens::MesquiteBlk,
|
240
|
-
NexusParser::Tokens::TreesBlk,
|
241
|
-
NexusParser::Tokens::LabelsBlk,
|
242
|
-
NexusParser::Tokens::TaxaBlk,
|
243
|
-
NexusParser::Tokens::NotesBlk,
|
244
|
-
NexusParser::Tokens::Title,
|
245
|
-
NexusParser::Tokens::Taxlabels,
|
246
|
-
NexusParser::Tokens::Dimensions,
|
247
|
-
NexusParser::Tokens::FileLbl,
|
248
|
-
NexusParser::Tokens::Format,
|
249
|
-
NexusParser::Tokens::Equals,
|
250
|
-
NexusParser::Tokens::ValuePair, # this has bad overlap with Label and likely IDs (need to kill the latter, its a lesser Label)
|
251
|
-
NexusParser::Tokens::CharStateLabels,
|
252
|
-
NexusParser::Tokens::ChrsBlk,
|
253
|
-
NexusParser::Tokens::Number,
|
254
|
-
NexusParser::Tokens::Matrix,
|
255
|
-
NexusParser::Tokens::SemiColon,
|
256
|
-
NexusParser::Tokens::MesquiteIDs,
|
257
|
-
NexusParser::Tokens::MesquiteBlockID,
|
258
|
-
NexusParser::Tokens::BlkEnd,
|
259
|
-
NexusParser::Tokens::Colon,
|
260
|
-
NexusParser::Tokens::BckSlash,
|
261
|
-
NexusParser::Tokens::Comma,
|
262
|
-
NexusParser::Tokens::LParen,
|
263
|
-
NexusParser::Tokens::RParen,
|
264
|
-
NexusParser::Tokens::LBracket,
|
265
|
-
NexusParser::Tokens::RBracket,
|
266
|
-
NexusParser::Tokens::Label, # must be before RowVec
|
267
|
-
NexusParser::Tokens::RowVec,
|
268
|
-
NexusParser::Tokens::LinkLine,
|
269
|
-
NexusParser::Tokens::ID # need to trash this
|
270
|
-
]
|
271
|
-
end
|
272
|
-
|
273
277
|
end
|
274
|
-
|
data/lib/nexus_parser/version.rb
CHANGED
data/lib/nexus_parser.rb
CHANGED
@@ -3,9 +3,6 @@
|
|
3
3
|
# uses the PhyloTree parser/lexer engine by Krishna Dole which in turn was based on
|
4
4
|
# Thomas Mailund's <mailund@birc.dk> 'newick-1.0.5' Python library
|
5
5
|
|
6
|
-
# outstanding issues:
|
7
|
-
## need to resolve Tokens Labels, ValuePair, IDs
|
8
|
-
|
9
6
|
module NexusParser
|
10
7
|
|
11
8
|
require File.expand_path(File.join(File.dirname(__FILE__), 'nexus_parser', 'tokens'))
|
@@ -75,7 +72,7 @@ class NexusParser
|
|
75
72
|
class Coding
|
76
73
|
# unfortunately we need this for notes
|
77
74
|
attr_accessor :notes
|
78
|
-
attr_writer :state
|
75
|
+
attr_writer :state
|
79
76
|
|
80
77
|
def initialize(options = {})
|
81
78
|
@states = options[:states]
|
@@ -85,7 +82,7 @@ class NexusParser
|
|
85
82
|
def states
|
86
83
|
@states.class == Array ? @states : [@states]
|
87
84
|
end
|
88
|
-
|
85
|
+
|
89
86
|
end
|
90
87
|
|
91
88
|
class Note
|
@@ -118,7 +115,7 @@ class NexusParser
|
|
118
115
|
end
|
119
116
|
end
|
120
117
|
|
121
|
-
end
|
118
|
+
end # end NexusParser
|
122
119
|
|
123
120
|
|
124
121
|
# constructs the NexusParser
|
@@ -141,6 +138,9 @@ class Builder
|
|
141
138
|
def code_row(taxon_index, rowvector)
|
142
139
|
|
143
140
|
@nf.characters.each_with_index do |c, i|
|
141
|
+
raise(ParseError,
|
142
|
+
"Row #{taxon_index} of the matrix is too short") if rowvector[i].nil?
|
143
|
+
|
144
144
|
@nf.codings[taxon_index.to_i] = [] if !@nf.codings[taxon_index.to_i]
|
145
145
|
@nf.codings[taxon_index.to_i][i] = NexusParser::Coding.new(:states => rowvector[i])
|
146
146
|
|
@@ -185,7 +185,7 @@ class Builder
|
|
185
185
|
|
186
186
|
# need to create the characters
|
187
187
|
|
188
|
-
raise(
|
188
|
+
raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the character state labels. Check the indices. It may be for this character \"#{@opt[:name]}\".") if !@nf.characters[@index]
|
189
189
|
|
190
190
|
(@nf.characters[@index].name = @opt[:name]) if @opt[:name]
|
191
191
|
|
@@ -193,18 +193,45 @@ class Builder
|
|
193
193
|
@opt.delete(:name)
|
194
194
|
|
195
195
|
# the rest have states
|
196
|
-
@opt
|
196
|
+
create_or_update_states_for_character(@index, @opt)
|
197
|
+
end
|
198
|
+
|
199
|
+
def update_chr_name(i, name)
|
200
|
+
raise(ParseError, "There are #{@nf.characters.count} characters but we're trying to update from row #{i + 1} of the CHARLABELS list - check your NCHAR and/or the length of your list.") if !@nf.characters[i]
|
201
|
+
|
202
|
+
# The CHARLABELS list is unindexed, so users are allowed to use '_' to
|
203
|
+
# indicate that a character name is unspecified.
|
204
|
+
@nf.characters[i].name = (name == '_' ? '' : name)
|
205
|
+
end
|
206
|
+
|
207
|
+
# legal hash keys are :index and integers that point to state labels
|
208
|
+
def update_chr_states(options = {})
|
209
|
+
return false if !options[:index]
|
210
|
+
|
211
|
+
@opt = options
|
212
|
+
|
213
|
+
@index = @opt[:index].to_i
|
214
|
+
|
215
|
+
raise(ParseError, "Can't update character of index #{@index}, it doesn't exist! This is a problem parsing the STATELABELS. Check the indices.") if !@nf.characters[@index]
|
216
|
+
|
217
|
+
@opt.delete(:index)
|
218
|
+
|
219
|
+
# the rest have states
|
220
|
+
create_or_update_states_for_character(@index, @opt)
|
221
|
+
end
|
197
222
|
|
198
|
-
|
223
|
+
def create_or_update_states_for_character(i, options)
|
224
|
+
options.keys.each do |k|
|
225
|
+
|
226
|
+
if (@nf.characters[i].states != {}) && @nf.characters[i].states[k] # state exists
|
199
227
|
|
200
228
|
## !! ONLY HANDLES NAME, UPDATE TO HANDLE notes etc. when we get them ##
|
201
|
-
update_state(
|
229
|
+
update_state(i, :index => k, :name => options[k])
|
202
230
|
|
203
231
|
else # doesn't, create it
|
204
|
-
@nf.characters[
|
232
|
+
@nf.characters[i].add_state(:label => k.to_s, :name => options[k])
|
205
233
|
end
|
206
234
|
end
|
207
|
-
|
208
235
|
end
|
209
236
|
|
210
237
|
def update_state(chr_index, options = {})
|
@@ -256,7 +283,7 @@ class Builder
|
|
256
283
|
@nf
|
257
284
|
end
|
258
285
|
|
259
|
-
end # end
|
286
|
+
end # end Builder
|
260
287
|
|
261
288
|
# NexusParser::ParseError
|
262
289
|
class ParseError < StandardError
|
@@ -270,7 +297,7 @@ def parse_nexus_file(input)
|
|
270
297
|
@input = input
|
271
298
|
@input.gsub!(/\[[^\]]*\]/,'') # strip out all comments BEFORE we parse the file
|
272
299
|
# quickly peek at the input, does this look like a Nexus file?
|
273
|
-
if !(@input =~ /\#Nexus/i) || !(@input =~ /Begin/i) || !(@input =~ /Matrix/i) || !(@input =~ /end\;/i)
|
300
|
+
if !(@input =~ /\#Nexus/i) || !(@input =~ /Begin/i) || !(@input =~ /Matrix/i) || !(@input =~ /(end|endblock)\;/i)
|
274
301
|
raise(NexusParser::ParseError, "File is missing at least some required headers, check formatting.", caller)
|
275
302
|
end
|
276
303
|
|