tokn 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,156 +1,161 @@
  require_relative 'tools'
  req('tokn_const code_set dfa_builder state reg_parse')
 
- # Parses a token definition script, and generates an NFA that
- # is capable of recognizing and distinguishing between the various
- # tokens.
- #
- # Each line in the script is one of
- #
- # # ...comment... (the # must appear as the first character in the line)
- #
- # <tokenname> ':' <regex>
- #
- #
- # A <tokenname> must be an 'identifier' (alphanumeric, with first character a letter (or '_')).
- # If the first character is '_', the token is treated as an 'anonymous' token; these can
- # appear in the curly brace portions of previous reg. expr. entries, but do not appear as tokens in the
- # generated NFA.
- #
- class TokenDefParser
- include Tokn
-
- attr_reader :dfa
-
- # Compile a token definition script into a DFA
- #
- def initialize(script, createPDF = false)
- @script = script
- parseScript
- if createPDF
- dfa.startState.generatePDF("tokenizer_dfa")
- end
- end
-
- private
-
- def parseScript
- db = false
-
- nextTokenId = 0
-
- # List of tokens entries, including anonymous ones
- @tokenListBig = []
+ module ToknInternal
+
+ # Parses a token definition script, and generates an NFA that
+ # is capable of recognizing and distinguishing between the various
+ # tokens.
+ #
+ # Each line in the script is one of
+ #
+ # # ...comment... (the # must appear as the first character in the line)
+ #
+ # <tokenname> ':' <regex>
+ #
+ #
+ # A <tokenname> must be an 'identifier' (alphanumeric, with first character a letter (or '_')).
+ # If the first character is '_', the token is treated as an 'anonymous' token; these can
+ # appear in the curly brace portions of previous reg. expr. entries, but do not appear as tokens in the
+ # generated NFA.
+ #
+ class TokenDefParser
 
- # List of tokens names, excluding anonymous ones
- tokenListSmall = []
+ attr_reader :dfa
 
- # Maps token name to token entry
- @tokenNameMap = {}
+ # Compile a token definition script into a DFA
+ #
+ def initialize(script, createPDF = false)
+ @script = script
+ parseScript
+ if createPDF
+ dfa.startState.generatePDF("tokenizer_dfa")
+ end
+ end
 
- @lines = @script.split("\n")
+ private
 
- @lines.each_with_index do |line, lineNumber|
-
- line.strip!
-
- # If line is empty, or starts with '#', it's a comment
- if line.length == 0 || line[0] == '#'
- next
- end
-
- if !(line =~ TOKENNAME_EXPR)
- raise ParseException, "Syntax error, line #"+lineNumber.to_s+": "+line
- end
-
- pos = line.index(":")
-
- tokenName = line[0,pos].strip()
-
- expr = line[pos+1..-1].strip()
-
- rex = RegParse.new(expr, @tokenNameMap)
-
- # Give it the next available token id, if it's not an anonymous token
- tkId = nil
- if tokenName[0] != '_'
- tkId = nextTokenId
- nextTokenId += 1
- end
-
- tkEntry = [tokenName, rex, @tokenListBig.size, tkId]
-
- !db || pr("token entry: %s\n",d(tkEntry))
-
- if @tokenNameMap.has_key?(tokenName)
- raise ParseException, "Duplicate token name: "+line
+ def parseScript
+ db = false
+
+ nextTokenId = 0
+
+ # List of tokens entries, including anonymous ones
+ @tokenListBig = []
+
+ # List of tokens names, excluding anonymous ones
+ tokenListSmall = []
+
+ # Maps token name to token entry
+ @tokenNameMap = {}
+
+ @lines = @script.split("\n")
+
+ @lines.each_with_index do |line, lineNumber|
+
+ line.strip!
+
+ # If line is empty, or starts with '#', it's a comment
+ if line.length == 0 || line[0] == '#'
+ next
+ end
+
+ if !(line =~ TOKENNAME_EXPR)
+ raise ParseException, "Syntax error, line #"+lineNumber.to_s+": "+line
+ end
+
+ pos = line.index(":")
+
+ tokenName = line[0,pos].strip()
+
+ expr = line[pos+1..-1].strip()
+
+ rex = RegParse.new(expr, @tokenNameMap)
+
+ # Give it the next available token id, if it's not an anonymous token
+ tkId = nil
+ if tokenName[0] != '_'
+ tkId = nextTokenId
+ nextTokenId += 1
+ end
+
+ tkEntry = [tokenName, rex, @tokenListBig.size, tkId]
+
+ !db || pr("token entry: %s\n",d(tkEntry))
+
+ if @tokenNameMap.has_key?(tokenName)
+ raise ParseException, "Duplicate token name: "+line
+ end
+
+
+ @tokenListBig.push(tkEntry)
+ @tokenNameMap[tkEntry[0]] = tkEntry
+
+ if tkId
+ tokenListSmall.push(tokenName)
+ end
+
+ !db || pr(" added token name [%s] to map\n",d(tkEntry[0]))
+
  end
 
+ combined = combineTokenNFAs()
+ !db || combined.generatePDF("combined")
 
- @tokenListBig.push(tkEntry)
- @tokenNameMap[tkEntry[0]] = tkEntry
+ dfa = DFABuilder.nfa_to_dfa(combined)
+ !db || dfa.generatePDF("combined_minimized")
 
- if tkId
- tokenListSmall.push(tokenName)
- end
-
- !db || pr(" added token name [%s] to map\n",d(tkEntry[0]))
+ @dfa = Tokn::DFA.new(tokenListSmall, dfa)
+ end
+
+ # Combine the individual NFAs constructed for the token definitions into
+ # one large NFA, each augmented with an edge labelled with the appropriate
+ # token identifier to let the tokenizer see which token led to the final state.
+ #
+ def combineTokenNFAs
+
 
+ baseId = 0
+ startState = nil
+
+ @tokenListBig.each do |tokenName, regParse, index, tokenId|
+
+ # Skip anonymous token definitions
+ if !tokenId
+ next
+ end
+
+ oldToNewMap, baseId = regParse.startState.duplicateNFA( baseId)
+
+ dupStart = oldToNewMap[regParse.startState]
+
+ # Transition from the expression's end state (not a final state)
+ # to a new final state, with the transitioning edge
+ # labelled with the token id (actually, a transformed token id to distinguish
+ # it from character codes)
+ dupEnd = oldToNewMap[regParse.endState]
+
+ dupfinalState = State.new(baseId)
+ baseId += 1
+ dupfinalState.finalState = true
+
+ # Why do I need to add 'ToknInternal.' here? Very confusing.
+ dupEnd.addEdge(CodeSet.new(ToknInternal.tokenIdToEdgeLabel(tokenId)), dupfinalState)
+
+ if !startState
+ startState = dupStart
+ else
+ # Add an e-transition from the start state to this expression's start
+ startState.addEdge(CodeSet.new(EPSILON),dupStart)
+ end
+ end
+ startState
  end
-
- combined = combineTokenNFAs()
- !db || combined.generatePDF("combined")
-
- dfa = DFABuilder.nfa_to_dfa(combined)
- !db || dfa.generatePDF("combined_minimized")
-
- @dfa = DFA.new(tokenListSmall, dfa)
- end
-
- # Combine the individual NFAs constructed for the token definitions into
- # one large NFA, each augmented with an edge labelled with the appropriate
- # token identifier to let the tokenizer see which token led to the final state.
- #
- def combineTokenNFAs
 
- baseId = 0
- startState = nil
+ # Regex for token names preceding regular expressions
+ #
+ TOKENNAME_EXPR = Regexp.new("[_A-Za-z][_A-Za-z0-9]*\s*:\s*")
 
- @tokenListBig.each do |tokenName, regParse, index, tokenId|
-
- # Skip anonymous token definitions
- if !tokenId
- next
- end
-
- oldToNewMap, baseId = regParse.startState.duplicateNFA( baseId)
-
- dupStart = oldToNewMap[regParse.startState]
-
- # Transition from the expression's end state (not a final state)
- # to a new final state, with the transitioning edge
- # labelled with the token id (actually, a transformed token id to distinguish
- # it from character codes)
- dupEnd = oldToNewMap[regParse.endState]
-
- dupfinalState = State.new(baseId)
- baseId += 1
- dupfinalState.finalState = true
-
- dupEnd.addEdge(CodeSet.new(tokenIdToEdgeLabel(tokenId)), dupfinalState)
-
- if !startState
- startState = dupStart
- else
- # Add an e-transition from the start state to this expression's start
- startState.addEdge(CodeSet.new(EPSILON),dupStart)
- end
- end
- startState
  end
 
- # Regex for token names preceding regular expressions
- #
- TOKENNAME_EXPR = Regexp.new("[_A-Za-z][_A-Za-z0-9]*\s*:\s*")
-
- end
+ end # module ToknInternal
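The main effect of this hunk is that TokenDefParser now lives inside the ToknInternal module and wraps the minimized automaton in a Tokn::DFA. A minimal sketch of driving the parser after this change, based only on the class comment and code above; the token names (ws, sep, kw) and their regular expressions are invented for illustration, and whether these particular expressions are accepted depends on RegParse's regex dialect, which this diff does not show:

    # Hypothetical token definition script; each line is '<tokenname> : <regex>'.
    script = "ws: [ ]+\nsep: ,\nkw: if\n"
    parser = ToknInternal::TokenDefParser.new(script)  # was a top-level class in 0.0.5
    dfa    = parser.dfa                                # now built as a Tokn::DFA (see @dfa = Tokn::DFA.new(...) above)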
@@ -1,211 +1,279 @@
  require_relative 'tools'
- req('tokn_const ')
+ req('tokn_const dfa')
 
- # Extracts tokens from a script, given a previously constructed DFA.
- #
- class Tokenizer
- include Tokn
-
- # Construct a tokenizer, given a DFA and some text to process
- #
- def initialize(dfa, text)
- @dfa = dfa
- @text = text
- @lineNumber = 0
- @column = 0
- @cursor = 0
- @tokenHistory = []
- @historyPointer = 0
- end
+ module Tokn
 
- # Determine next token (without reading it)
+ # Extracts tokens from a script, given a previously constructed DFA.
  #
- # Returns Token, or nil if end of input
- #
- def peek
- if !@text
- raise IllegalStateException, "No input text specified"
- end
-
- db = false
- !db || warn("debug printing is on")
- !db || pr("peek, cursor=%d\n",@cursor)
+ class Tokenizer
 
- if @historyPointer == @tokenHistory.size
- if @cursor < @text.length
-
- bestLength = 0
- bestId = UNKNOWN_TOKEN
-
- charOffset = 0
- state = @dfa.startState
- while @cursor + charOffset <= @text.length
- ch = nil
- if @cursor + charOffset < @text.length
- ch = @text[@cursor + charOffset].ord()
- !db || pr(" offset=%d, ch=%d (%s)\n",charOffset,ch,ch.chr)
- end
-
- nextState = nil
-
- # Examine edges leaving this state.
- # If one is labelled with a token id, we don't need to match the character with it;
- # store as best token found if length is longer than previous, or equal to previous
- # with higher id.
+ # Construct a tokenizer
+ #
+ # @param dfa the DFA to use
+ # @param text the text to extract tokens from
+ # @param skipName if not nil, tokens with this name will be skipped
+ #
+ def initialize(dfa, text, skipName = nil)
+ @dfa = dfa
+ @text = text
+ if !text
+ raise ArgumentError, "No text defined"
+ end
+ @skipTokenId = nil
+ if skipName
+ @skipTokenId = dfa.tokenId(skipName)
+ if !@skipTokenId
+ raise ArgumentError, "No token with name "+skipName+" found"
+ end
+ end
+ @lineNumber = 0
+ @column = 0
+ @cursor = 0
+ @tokenHistory = []
+ @historyPointer = 0
+ end
+
+ # Determine next token (without reading it)
+ #
+ # Returns Token, or nil if end of input
+ #
+ def peek
+ # if !@text
+ # raise IllegalStateException, "No input text specified"
+ # end
+
+ db = false
+ !db || warn("debug printing is on")
+ !db || pr("peek, cursor=%d\n",@cursor)
+
+ if @historyPointer == @tokenHistory.size
+ while true # repeat until we find a non-skipped token, or run out of text
+ break if @cursor >= @text.length
 
- # If an edge is labelled with the current character, advance to that state.
+ bestLength = 0
+ bestId = ToknInternal::UNKNOWN_TOKEN
 
- edges = state.edges
- edges.each do |lbl,dest|
- a = lbl.array
- !db || pr(" edge lbl=%s, dest=%s\n",d(lbl),d(dest))
- if a[0] < EPSILON
- newTokenId = edgeLabelToTokenId(a[0])
- !db || pr(" new token id=%d\n",newTokenId)
+ charOffset = 0
+ state = @dfa.startState
+ while @cursor + charOffset <= @text.length
+ ch = nil
+ if @cursor + charOffset < @text.length
+ ch = @text[@cursor + charOffset].ord()
+ !db || pr(" offset=%d, ch=%d (%s)\n",charOffset,ch,ch.chr)
+ end
+
+ nextState = nil
+
+ # Examine edges leaving this state.
+ # If one is labelled with a token id, we don't need to match the character with it;
+ # store as best token found if length is longer than previous, or equal to previous
+ # with higher id.
 
- if (bestLength < charOffset || newTokenId > bestId)
- bestLength, bestId = charOffset, newTokenId
- !db || pr(" making longest found so far\n")
+ # If an edge is labelled with the current character, advance to that state.
+
+ edges = state.edges
+ edges.each do |lbl,dest|
+ a = lbl.array
+ !db || pr(" edge lbl=%s, dest=%s\n",d(lbl),d(dest))
+ if a[0] < ToknInternal::EPSILON
+ newTokenId = ToknInternal::edgeLabelToTokenId(a[0])
+ !db || pr(" new token id=%d\n",newTokenId)
+
+ if (bestLength < charOffset || newTokenId > bestId)
+ bestLength, bestId = charOffset, newTokenId
+ !db || pr(" making longest found so far\n")
+ end
  end
- end
+
+ if ch && lbl.contains?(ch)
+ !db || pr(" setting next state to %s\n",d(dest))
+ nextState = dest
+ break
+ end
+ end
 
- if ch && lbl.contains?(ch)
- !db || pr(" setting next state to %s\n",d(dest))
- nextState = dest
+ if !nextState
  break
  end
- end
-
- if !nextState
- break
+ state = nextState
+ charOffset += 1
+ !db || pr(" advanced to next state\n")
  end
- state = nextState
- charOffset += 1
- !db || pr(" advanced to next state\n")
- end
+
+ if bestId == @skipTokenId
+ @cursor += bestLength
+ next
+ end
+
+ peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
+
+ @tokenHistory.push(peekToken)
+ break # We found a token, so stop
+ end
+ end
+
+ ret = nil
+ if @historyPointer < @tokenHistory.size
+ ret = @tokenHistory[@historyPointer]
+ end
 
- peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
- @tokenHistory.push(peekToken)
- end
+ ret
+ end
+
+
+ # Read next token
+ #
+ # @param tokenName if not nil, the (string) name of the token expected
+ #
+ # @raise TokenizerException if no more tokens,if unrecognized token, or
+ # if token has different than expected name
+ #
+ def read(tokenName = nil)
+ token = peek()
+ if !token
+ raise TokenizerException,"No more tokens"
+ end
+
+ if token.id == ToknInternal::UNKNOWN_TOKEN
+ raise TokenizerException, "Unknown token "+token.inspect
+ end
+
+ if tokenName && tokenName != nameOf(token)
+ raise TokenizerException, "Unexpected token "+token.inspect
+ end
+
+ @historyPointer += 1
+
+ # Advance cursor, line number, column
+
+ tl = token.text.length
+ @cursor += tl
+ tl.times do |i|
+ c = token.text[i]
+ @column += 1
+ if c == "\n"
+ @lineNumber += 1
+ @column = 0
+ end
+ end
+ token
  end
 
- ret = nil
- if @historyPointer < @tokenHistory.size
- ret = @tokenHistory[@historyPointer]
+ # Read next token if it has a particular name
+ #
+ # > tokenName : name to look for
+ # < token read, or nil
+ #
+ def readIf(tokenName)
+ ret = nil
+ token = peek()
+ if token && nameOf(token) == tokenName
+ ret = read()
+ end
+ ret
  end
 
- ret
- end
+ # Read a sequence of tokens
+ # @param seq string of space-delimited token names; if name is '_',
+ # allows any token name in that position
+ # @return array of tokens read
+ #
+ def readSequence(seq)
+ seqNames = seq.split(' ')
+ ret = []
+ seqNames.each do |name|
+ tk = name != '_' ? read(name) : read
+ ret.push(tk)
+ end
+ ret
+ end
 
-
- # Read next token
- #
- # > tokenName : if not nil, the (string) name of the token expected
- #
- # Raises TokenizerException if no more tokens,if unrecognized token, or
- # if token has different than expected name
- #
- def read(tokenName = nil)
- token = peek()
- if !token
- raise TokenizerException,"No more tokens"
+ # Read a sequence of tokens, if they have particular names
+ # @param seq string of space-delimited token names; if name is '_',
+ # allows any token name in that position
+ # @return array of tokens read, or nil if the tokens had different
+ # names (or an end of input was encountered)
+ #
+ def readSequenceIf(seq)
+ ret = []
+ seqNames = seq.split(' ')
+ seqNames.each do |name|
+ tk = peek
+ break if !tk
+ if name != '_' && nameOf(tk) != name
+ break
+ end
+ ret.push(read)
+ end
+
+ if ret.size != seqNames.size
+ unread(ret.size)
+ ret = nil
+ end
+ ret
  end
 
- if token.id == UNKNOWN_TOKEN
- raise TokenizerException, "Unknown token "+token.inspect
- end
 
- if tokenName && tokenName != nameOf(token)
- raise TokenizerException, "Unexpected token "+token.inspect
+ # Determine if another token exists
+ #
+ def hasNext
+ !peek().nil?
  end
 
- @historyPointer += 1
-
- # Advance cursor, line number, column
+ # Get the name of a token
+ # (i.e., the name of the token definition, not its text)
+ #
+ # > token read from this tokenizer
+ #
+ def nameOf(token)
+ @dfa.tokenName(token.id)
+ end
 
- tl = token.text.length
- @cursor += tl
- tl.times do |i|
- c = token.text[i]
- @column += 1
- if c == "\n"
- @lineNumber += 1
- @column = 0
+ # Unread one (or more) previously read tokens
+ #
+ # @raise TokenizerException if attempt to unread token that was never read
+ #
+ def unread(count = 1)
+ if @historyPointer < count
+ raise TokenizerException, "Cannot unread before start"
  end
+ @historyPointer -= count
  end
- token
- end
-
- # Read next token if it has a particular name
- #
- # > tokenName : name to look for
- # < token read, or nil
- #
- def readIf(tokenName)
- ret = nil
- token = peek()
- if token && nameOf(token) == tokenName
- ret = read()
- end
- ret
+
  end
 
- # Determine if another token exists
- #
- def hasNext
- !peek().nil?
- end
 
- # Get the name of a token
- # (i.e., the name of the token definition, not its text)
+ # Tokens read by Tokenizer
  #
- # > token read from this tokenizer
- #
- def nameOf(token)
- @dfa.tokenName(token.id)
- end
-
- # Unread one (or more) previously read tokens
- #
- def unread(count = 1)
- if @historyPointer < count
- raise TokenizerException, "Cannot unread before start"
+ class Token
+
+ attr_reader :text, :lineNumber, :column, :id
+
+ def initialize(id, text, lineNumber, column)
+ @id = id
+ @text = text
+ @lineNumber = lineNumber
+ @column = column
+ end
+
+ def unknown?
+ id == ToknInternal::UNKNOWN_TOKEN
+ end
+
+ # Construct description of token location within text
+ #
+ def inspect
+ s = "(line "+lineNumber.to_s+", col "+column.to_s+")"
+ if !unknown?
+ s = s.ljust(17) + " : " + text
+ end
+ s
  end
- @historyPointer -= count
- end
-
- end
-
-
-
-
- # Tokens read by Tokenizer
- #
- class Token
- include Tokn
-
- attr_reader :text, :lineNumber, :column, :id
-
- def initialize(id, text, lineNumber, column)
- @id = id
- @text = text
- @lineNumber = lineNumber
- @column = column
- end
-
- def unknown?
- id == UNKNOWN_TOKEN
  end
 
- def inspect
- s = "(line "+lineNumber.to_s+", col "+column.to_s+")"
- if !unknown?
- s = s.ljust(17) + " : " + text
- end
- s
+ # Exception class for Tokenizer methods
+ #
+ class TokenizerException < Exception
  end
- end
-
 
- class TokenizerException < Exception
- end
+ end # module Tokn
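Taken together, this hunk moves Tokenizer, Token, and TokenizerException into the Tokn module, qualifies internal constants with ToknInternal::, adds an optional skipName argument so a chosen token can be silently discarded by peek/read, and adds readSequence, readSequenceIf, and unread-by-count. A hedged usage sketch, reusing the hypothetical dfa and the invented ws, sep, and kw token names from the sketch after the first hunk; only methods visible in this diff are used:

    tok = Tokn::Tokenizer.new(dfa, "if , if", "ws")   # tokens named "ws" are skipped by peek/read
    while tok.hasNext
      t = tok.read                                    # raises Tokn::TokenizerException on an unknown token
      puts tok.nameOf(t) + " : " + t.text             # e.g. "kw : if"
    end

On a fresh tokenizer over the same text, readSequence("kw sep kw") would read those three tokens (a '_' entry accepts any name), while readSequenceIf returns nil and unreads whatever it consumed if the upcoming token names do not match.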