tokn 0.0.5 → 0.0.6

@@ -1,156 +1,161 @@
  require_relative 'tools'
  req('tokn_const code_set dfa_builder state reg_parse')

- # Parses a token definition script, and generates an NFA that
- # is capable of recognizing and distinguishing between the various
- # tokens.
- #
- # Each line in the script is one of
- #
- # # ...comment... (the # must appear as the first character in the line)
- #
- # <tokenname> ':' <regex>
- #
- #
- # A <tokenname> must be an 'identifier' (alphanumeric, with first character a letter (or '_')).
- # If the first character is '_', the token is treated as an 'anonymous' token; these can
- # appear in the curly brace portions of previous reg. expr. entries, but do not appear as tokens in the
- # generated NFA.
- #
- class TokenDefParser
- include Tokn
-
- attr_reader :dfa
-
- # Compile a token definition script into a DFA
- #
- def initialize(script, createPDF = false)
- @script = script
- parseScript
- if createPDF
- dfa.startState.generatePDF("tokenizer_dfa")
- end
- end
-
- private
-
- def parseScript
- db = false
-
- nextTokenId = 0
-
- # List of tokens entries, including anonymous ones
- @tokenListBig = []
+ module ToknInternal
+
+ # Parses a token definition script, and generates an NFA that
+ # is capable of recognizing and distinguishing between the various
+ # tokens.
+ #
+ # Each line in the script is one of
+ #
+ # # ...comment... (the # must appear as the first character in the line)
+ #
+ # <tokenname> ':' <regex>
+ #
+ #
+ # A <tokenname> must be an 'identifier' (alphanumeric, with first character a letter (or '_')).
+ # If the first character is '_', the token is treated as an 'anonymous' token; these can
+ # appear in the curly brace portions of previous reg. expr. entries, but do not appear as tokens in the
+ # generated NFA.
+ #
+ class TokenDefParser

- # List of tokens names, excluding anonymous ones
- tokenListSmall = []
+ attr_reader :dfa

- # Maps token name to token entry
- @tokenNameMap = {}
+ # Compile a token definition script into a DFA
+ #
+ def initialize(script, createPDF = false)
+ @script = script
+ parseScript
+ if createPDF
+ dfa.startState.generatePDF("tokenizer_dfa")
+ end
+ end

- @lines = @script.split("\n")
+ private

- @lines.each_with_index do |line, lineNumber|
-
- line.strip!
-
- # If line is empty, or starts with '#', it's a comment
- if line.length == 0 || line[0] == '#'
- next
- end
-
- if !(line =~ TOKENNAME_EXPR)
- raise ParseException, "Syntax error, line #"+lineNumber.to_s+": "+line
- end
-
- pos = line.index(":")
-
- tokenName = line[0,pos].strip()
-
- expr = line[pos+1..-1].strip()
-
- rex = RegParse.new(expr, @tokenNameMap)
-
- # Give it the next available token id, if it's not an anonymous token
- tkId = nil
- if tokenName[0] != '_'
- tkId = nextTokenId
- nextTokenId += 1
- end
-
- tkEntry = [tokenName, rex, @tokenListBig.size, tkId]
-
- !db || pr("token entry: %s\n",d(tkEntry))
-
- if @tokenNameMap.has_key?(tokenName)
- raise ParseException, "Duplicate token name: "+line
+ def parseScript
+ db = false
+
+ nextTokenId = 0
+
+ # List of tokens entries, including anonymous ones
+ @tokenListBig = []
+
+ # List of tokens names, excluding anonymous ones
+ tokenListSmall = []
+
+ # Maps token name to token entry
+ @tokenNameMap = {}
+
+ @lines = @script.split("\n")
+
+ @lines.each_with_index do |line, lineNumber|
+
+ line.strip!
+
+ # If line is empty, or starts with '#', it's a comment
+ if line.length == 0 || line[0] == '#'
+ next
+ end
+
+ if !(line =~ TOKENNAME_EXPR)
+ raise ParseException, "Syntax error, line #"+lineNumber.to_s+": "+line
+ end
+
+ pos = line.index(":")
+
+ tokenName = line[0,pos].strip()
+
+ expr = line[pos+1..-1].strip()
+
+ rex = RegParse.new(expr, @tokenNameMap)
+
+ # Give it the next available token id, if it's not an anonymous token
+ tkId = nil
+ if tokenName[0] != '_'
+ tkId = nextTokenId
+ nextTokenId += 1
+ end
+
+ tkEntry = [tokenName, rex, @tokenListBig.size, tkId]
+
+ !db || pr("token entry: %s\n",d(tkEntry))
+
+ if @tokenNameMap.has_key?(tokenName)
+ raise ParseException, "Duplicate token name: "+line
+ end
+
+
+ @tokenListBig.push(tkEntry)
+ @tokenNameMap[tkEntry[0]] = tkEntry
+
+ if tkId
+ tokenListSmall.push(tokenName)
+ end
+
+ !db || pr(" added token name [%s] to map\n",d(tkEntry[0]))
+
  end

+ combined = combineTokenNFAs()
+ !db || combined.generatePDF("combined")

- @tokenListBig.push(tkEntry)
- @tokenNameMap[tkEntry[0]] = tkEntry
+ dfa = DFABuilder.nfa_to_dfa(combined)
+ !db || dfa.generatePDF("combined_minimized")

- if tkId
- tokenListSmall.push(tokenName)
- end
-
- !db || pr(" added token name [%s] to map\n",d(tkEntry[0]))
+ @dfa = Tokn::DFA.new(tokenListSmall, dfa)
+ end
+
+ # Combine the individual NFAs constructed for the token definitions into
+ # one large NFA, each augmented with an edge labelled with the appropriate
+ # token identifier to let the tokenizer see which token led to the final state.
+ #
+ def combineTokenNFAs
+

+ baseId = 0
+ startState = nil
+
+ @tokenListBig.each do |tokenName, regParse, index, tokenId|
+
+ # Skip anonymous token definitions
+ if !tokenId
+ next
+ end
+
+ oldToNewMap, baseId = regParse.startState.duplicateNFA( baseId)
+
+ dupStart = oldToNewMap[regParse.startState]
+
+ # Transition from the expression's end state (not a final state)
+ # to a new final state, with the transitioning edge
+ # labelled with the token id (actually, a transformed token id to distinguish
+ # it from character codes)
+ dupEnd = oldToNewMap[regParse.endState]
+
+ dupfinalState = State.new(baseId)
+ baseId += 1
+ dupfinalState.finalState = true
+
+ # Why do I need to add 'ToknInternal.' here? Very confusing.
+ dupEnd.addEdge(CodeSet.new(ToknInternal.tokenIdToEdgeLabel(tokenId)), dupfinalState)
+
+ if !startState
+ startState = dupStart
+ else
+ # Add an e-transition from the start state to this expression's start
+ startState.addEdge(CodeSet.new(EPSILON),dupStart)
+ end
+ end
+ startState
  end
-
- combined = combineTokenNFAs()
- !db || combined.generatePDF("combined")
-
- dfa = DFABuilder.nfa_to_dfa(combined)
- !db || dfa.generatePDF("combined_minimized")
-
- @dfa = DFA.new(tokenListSmall, dfa)
- end
-
- # Combine the individual NFAs constructed for the token definitions into
- # one large NFA, each augmented with an edge labelled with the appropriate
- # token identifier to let the tokenizer see which token led to the final state.
- #
- def combineTokenNFAs

- baseId = 0
- startState = nil
+ # Regex for token names preceding regular expressions
+ #
+ TOKENNAME_EXPR = Regexp.new("[_A-Za-z][_A-Za-z0-9]*\s*:\s*")

- @tokenListBig.each do |tokenName, regParse, index, tokenId|
-
- # Skip anonymous token definitions
- if !tokenId
- next
- end
-
- oldToNewMap, baseId = regParse.startState.duplicateNFA( baseId)
-
- dupStart = oldToNewMap[regParse.startState]
-
- # Transition from the expression's end state (not a final state)
- # to a new final state, with the transitioning edge
- # labelled with the token id (actually, a transformed token id to distinguish
- # it from character codes)
- dupEnd = oldToNewMap[regParse.endState]
-
- dupfinalState = State.new(baseId)
- baseId += 1
- dupfinalState.finalState = true
-
- dupEnd.addEdge(CodeSet.new(tokenIdToEdgeLabel(tokenId)), dupfinalState)
-
- if !startState
- startState = dupStart
- else
- # Add an e-transition from the start state to this expression's start
- startState.addEdge(CodeSet.new(EPSILON),dupStart)
- end
- end
- startState
  end

- # Regex for token names preceding regular expressions
- #
- TOKENNAME_EXPR = Regexp.new("[_A-Za-z][_A-Za-z0-9]*\s*:\s*")
-
- end
+ end # module ToknInternal
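
The doc comments above describe the token definition script format. Below is an illustrative sketch, not taken from the gem's own documentation: the token names, the patterns, and the exact regular-expression dialect accepted by RegParse are assumptions. The structure follows the rules stated above: one "<tokenname> : <regex>" entry per line, comment lines starting with '#', and anonymous tokens (leading underscore) that later entries may reference inside curly braces.

# Hypothetical token definition script; names and patterns are for illustration only,
# and the bracket/escape syntax depends on what RegParse actually supports.
script = <<'EOS'
# anonymous helper token: usable as {_digit} in later entries, but emits no token itself
_digit: [0-9]
WS:     [ \t\n]+
INT:    {_digit}+
ID:     [_a-zA-Z][_a-zA-Z0-9]*
EOS

# Per TokenDefParser#initialize above, compile the script into a DFA
# (passing true as the second argument would also emit a "tokenizer_dfa" PDF).
parser = ToknInternal::TokenDefParser.new(script)
dfa = parser.dfa   # a Tokn::DFA covering the named (non-anonymous) tokens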
@@ -1,211 +1,279 @@
  require_relative 'tools'
- req('tokn_const ')
+ req('tokn_const dfa')

- # Extracts tokens from a script, given a previously constructed DFA.
- #
- class Tokenizer
- include Tokn
-
- # Construct a tokenizer, given a DFA and some text to process
- #
- def initialize(dfa, text)
- @dfa = dfa
- @text = text
- @lineNumber = 0
- @column = 0
- @cursor = 0
- @tokenHistory = []
- @historyPointer = 0
- end
+ module Tokn

- # Determine next token (without reading it)
+ # Extracts tokens from a script, given a previously constructed DFA.
  #
- # Returns Token, or nil if end of input
- #
- def peek
- if !@text
- raise IllegalStateException, "No input text specified"
- end
-
- db = false
- !db || warn("debug printing is on")
- !db || pr("peek, cursor=%d\n",@cursor)
+ class Tokenizer

- if @historyPointer == @tokenHistory.size
- if @cursor < @text.length
-
- bestLength = 0
- bestId = UNKNOWN_TOKEN
-
- charOffset = 0
- state = @dfa.startState
- while @cursor + charOffset <= @text.length
- ch = nil
- if @cursor + charOffset < @text.length
- ch = @text[@cursor + charOffset].ord()
- !db || pr(" offset=%d, ch=%d (%s)\n",charOffset,ch,ch.chr)
- end
-
- nextState = nil
-
- # Examine edges leaving this state.
- # If one is labelled with a token id, we don't need to match the character with it;
- # store as best token found if length is longer than previous, or equal to previous
- # with higher id.
+ # Construct a tokenizer
+ #
+ # @param dfa the DFA to use
+ # @param text the text to extract tokens from
+ # @param skipName if not nil, tokens with this name will be skipped
+ #
+ def initialize(dfa, text, skipName = nil)
+ @dfa = dfa
+ @text = text
+ if !text
+ raise ArgumentError, "No text defined"
+ end
+ @skipTokenId = nil
+ if skipName
+ @skipTokenId = dfa.tokenId(skipName)
+ if !@skipTokenId
+ raise ArgumentError, "No token with name "+skipName+" found"
+ end
+ end
+ @lineNumber = 0
+ @column = 0
+ @cursor = 0
+ @tokenHistory = []
+ @historyPointer = 0
+ end
+
+ # Determine next token (without reading it)
+ #
+ # Returns Token, or nil if end of input
+ #
+ def peek
+ # if !@text
+ # raise IllegalStateException, "No input text specified"
+ # end
+
+ db = false
+ !db || warn("debug printing is on")
+ !db || pr("peek, cursor=%d\n",@cursor)
+
+ if @historyPointer == @tokenHistory.size
+ while true # repeat until we find a non-skipped token, or run out of text
+ break if @cursor >= @text.length

- # If an edge is labelled with the current character, advance to that state.
+ bestLength = 0
+ bestId = ToknInternal::UNKNOWN_TOKEN

- edges = state.edges
- edges.each do |lbl,dest|
- a = lbl.array
- !db || pr(" edge lbl=%s, dest=%s\n",d(lbl),d(dest))
- if a[0] < EPSILON
- newTokenId = edgeLabelToTokenId(a[0])
- !db || pr(" new token id=%d\n",newTokenId)
+ charOffset = 0
+ state = @dfa.startState
+ while @cursor + charOffset <= @text.length
+ ch = nil
+ if @cursor + charOffset < @text.length
+ ch = @text[@cursor + charOffset].ord()
+ !db || pr(" offset=%d, ch=%d (%s)\n",charOffset,ch,ch.chr)
+ end
+
+ nextState = nil
+
+ # Examine edges leaving this state.
+ # If one is labelled with a token id, we don't need to match the character with it;
+ # store as best token found if length is longer than previous, or equal to previous
+ # with higher id.

- if (bestLength < charOffset || newTokenId > bestId)
- bestLength, bestId = charOffset, newTokenId
- !db || pr(" making longest found so far\n")
+ # If an edge is labelled with the current character, advance to that state.
+
+ edges = state.edges
+ edges.each do |lbl,dest|
+ a = lbl.array
+ !db || pr(" edge lbl=%s, dest=%s\n",d(lbl),d(dest))
+ if a[0] < ToknInternal::EPSILON
+ newTokenId = ToknInternal::edgeLabelToTokenId(a[0])
+ !db || pr(" new token id=%d\n",newTokenId)
+
+ if (bestLength < charOffset || newTokenId > bestId)
+ bestLength, bestId = charOffset, newTokenId
+ !db || pr(" making longest found so far\n")
+ end
  end
- end
+
+ if ch && lbl.contains?(ch)
+ !db || pr(" setting next state to %s\n",d(dest))
+ nextState = dest
+ break
+ end
+ end

- if ch && lbl.contains?(ch)
- !db || pr(" setting next state to %s\n",d(dest))
- nextState = dest
+ if !nextState
  break
  end
- end
-
- if !nextState
- break
+ state = nextState
+ charOffset += 1
+ !db || pr(" advanced to next state\n")
  end
- state = nextState
- charOffset += 1
- !db || pr(" advanced to next state\n")
- end
+
+ if bestId == @skipTokenId
+ @cursor += bestLength
+ next
+ end
+
+ peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
+
+ @tokenHistory.push(peekToken)
+ break # We found a token, so stop
+ end
+ end
+
+ ret = nil
+ if @historyPointer < @tokenHistory.size
+ ret = @tokenHistory[@historyPointer]
+ end

- peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
- @tokenHistory.push(peekToken)
- end
+ ret
+ end
+
+
+ # Read next token
+ #
+ # @param tokenName if not nil, the (string) name of the token expected
+ #
+ # @raise TokenizerException if no more tokens,if unrecognized token, or
+ # if token has different than expected name
+ #
+ def read(tokenName = nil)
+ token = peek()
+ if !token
+ raise TokenizerException,"No more tokens"
+ end
+
+ if token.id == ToknInternal::UNKNOWN_TOKEN
+ raise TokenizerException, "Unknown token "+token.inspect
+ end
+
+ if tokenName && tokenName != nameOf(token)
+ raise TokenizerException, "Unexpected token "+token.inspect
+ end
+
+ @historyPointer += 1
+
+ # Advance cursor, line number, column
+
+ tl = token.text.length
+ @cursor += tl
+ tl.times do |i|
+ c = token.text[i]
+ @column += 1
+ if c == "\n"
+ @lineNumber += 1
+ @column = 0
+ end
+ end
+ token
  end

- ret = nil
- if @historyPointer < @tokenHistory.size
- ret = @tokenHistory[@historyPointer]
+ # Read next token if it has a particular name
+ #
+ # > tokenName : name to look for
+ # < token read, or nil
+ #
+ def readIf(tokenName)
+ ret = nil
+ token = peek()
+ if token && nameOf(token) == tokenName
+ ret = read()
+ end
+ ret
  end

- ret
- end
+ # Read a sequence of tokens
+ # @param seq string of space-delimited token names; if name is '_',
+ # allows any token name in that position
+ # @return array of tokens read
+ #
+ def readSequence(seq)
+ seqNames = seq.split(' ')
+ ret = []
+ seqNames.each do |name|
+ tk = name != '_' ? read(name) : read
+ ret.push(tk)
+ end
+ ret
+ end

-
- # Read next token
- #
- # > tokenName : if not nil, the (string) name of the token expected
- #
- # Raises TokenizerException if no more tokens,if unrecognized token, or
- # if token has different than expected name
- #
- def read(tokenName = nil)
- token = peek()
- if !token
- raise TokenizerException,"No more tokens"
+ # Read a sequence of tokens, if they have particular names
+ # @param seq string of space-delimited token names; if name is '_',
+ # allows any token name in that position
+ # @return array of tokens read, or nil if the tokens had different
+ # names (or an end of input was encountered)
+ #
+ def readSequenceIf(seq)
+ ret = []
+ seqNames = seq.split(' ')
+ seqNames.each do |name|
+ tk = peek
+ break if !tk
+ if name != '_' && nameOf(tk) != name
+ break
+ end
+ ret.push(read)
+ end
+
+ if ret.size != seqNames.size
+ unread(ret.size)
+ ret = nil
+ end
+ ret
  end

- if token.id == UNKNOWN_TOKEN
- raise TokenizerException, "Unknown token "+token.inspect
- end

- if tokenName && tokenName != nameOf(token)
- raise TokenizerException, "Unexpected token "+token.inspect
+ # Determine if another token exists
+ #
+ def hasNext
+ !peek().nil?
  end

- @historyPointer += 1
-
- # Advance cursor, line number, column
+ # Get the name of a token
+ # (i.e., the name of the token definition, not its text)
+ #
+ # > token read from this tokenizer
+ #
+ def nameOf(token)
+ @dfa.tokenName(token.id)
+ end

- tl = token.text.length
- @cursor += tl
- tl.times do |i|
- c = token.text[i]
- @column += 1
- if c == "\n"
- @lineNumber += 1
- @column = 0
+ # Unread one (or more) previously read tokens
+ #
+ # @raise TokenizerException if attempt to unread token that was never read
+ #
+ def unread(count = 1)
+ if @historyPointer < count
+ raise TokenizerException, "Cannot unread before start"
  end
+ @historyPointer -= count
  end
- token
- end
-
- # Read next token if it has a particular name
- #
- # > tokenName : name to look for
- # < token read, or nil
- #
- def readIf(tokenName)
- ret = nil
- token = peek()
- if token && nameOf(token) == tokenName
- ret = read()
- end
- ret
+
  end

- # Determine if another token exists
- #
- def hasNext
- !peek().nil?
- end

- # Get the name of a token
- # (i.e., the name of the token definition, not its text)
+ # Tokens read by Tokenizer
  #
- # > token read from this tokenizer
- #
- def nameOf(token)
- @dfa.tokenName(token.id)
- end
-
- # Unread one (or more) previously read tokens
- #
- def unread(count = 1)
- if @historyPointer < count
- raise TokenizerException, "Cannot unread before start"
+ class Token
+
+ attr_reader :text, :lineNumber, :column, :id
+
+ def initialize(id, text, lineNumber, column)
+ @id = id
+ @text = text
+ @lineNumber = lineNumber
+ @column = column
+ end
+
+ def unknown?
+ id == ToknInternal::UNKNOWN_TOKEN
+ end
+
+ # Construct description of token location within text
+ #
+ def inspect
+ s = "(line "+lineNumber.to_s+", col "+column.to_s+")"
+ if !unknown?
+ s = s.ljust(17) + " : " + text
+ end
+ s
  end
- @historyPointer -= count
- end
-
- end
-
-
-
-
- # Tokens read by Tokenizer
- #
- class Token
- include Tokn
-
- attr_reader :text, :lineNumber, :column, :id
-
- def initialize(id, text, lineNumber, column)
- @id = id
- @text = text
- @lineNumber = lineNumber
- @column = column
- end
-
- def unknown?
- id == UNKNOWN_TOKEN
  end

- def inspect
- s = "(line "+lineNumber.to_s+", col "+column.to_s+")"
- if !unknown?
- s = s.ljust(17) + " : " + text
- end
- s
+ # Exception class for Tokenizer methods
+ #
+ class TokenizerException < Exception
  end
- end
-

- class TokenizerException < Exception
- end
+ end # module Tokn
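
For context, here is a hedged usage sketch of the Tokenizer API shown in this hunk. It assumes a Tokn::DFA named dfa built as in the earlier sketch (with WS, INT and ID tokens); the input string is invented. The third constructor argument uses the new skipName parameter so that WS tokens are skipped rather than returned.

# Assumption: dfa was built from a script defining WS, INT and ID tokens.
tok = Tokn::Tokenizer.new(dfa, "count 42", "WS")   # skip whitespace tokens

while tok.hasNext
  if (t = tok.readIf("INT"))          # consume the next token only if it is an INT
    puts "integer: " + t.text
  else
    t = tok.read                      # otherwise consume whatever comes next
    puts tok.nameOf(t) + " " + t.inspect
  end
end

# readSequence("ID INT"), readSequenceIf("ID _") and unread(n) support multi-token
# lookahead; read(name) raises TokenizerException on an unexpected or unknown token.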