tokn 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,233 +1,235 @@
1
1
  require_relative 'tools'
2
2
  req('tokn_const code_set')
3
3
 
4
-
5
- # A data structure that transforms a set of CodeSets to a
6
- # disjoint set of them, such that no two range sets overlap.
7
- #
8
- # This is to improve the efficiency of the NFA => DFA algorithm,
9
- # which involves gathering information about what states are
10
- # reachable on certain characters. We can't afford to treat each
11
- # character as a singleton, since the ranges can be quite large.
12
- # Hence, we want to treat ranges of characters as single entities;
13
- # this will only work if no two such ranges overlap.
14
- #
15
- # It works by starting with a tree whose node is labelled with
16
- # the maximal superset of character values. Then, for each edge
17
- # in the NFA, performs a DFS on this tree, splitting any node that
18
- # only partially intersects any one set that appears in the edge label.
19
- # The running time is O(n log k), where n is the size of the NFA, and
20
- # k is the height of the resulting tree.
21
- #
22
- # We encourage k to be small by sorting the NFA edges by their
23
- # label complexity.
24
- #
25
- class RangePartition
26
- include Tokn
4
+ module ToknInternal
27
5
 
28
- def initialize()
29
- # We will build a tree, where each node has a CodeSet
30
- # associated with it, and the child nodes (if present)
31
- # partition this CodeSet into smaller, nonempty sets.
32
-
33
- # A tree is represented by a node, where each node is a pair [x,y],
34
- # with x the node's CodeSet, and y a list of the node's children.
35
-
36
- @nextNodeId = 0
37
-
38
- # Make the root node hold the largest possible CodeSet.
39
- # We want to be able to include all the token ids as well.
40
-
41
- @rootNode = buildNode(CodeSet.new(CODEMIN,CODEMAX))
42
-
43
- @setsToAdd = Set.new
44
-
45
- # Add epsilon immediately, so it's always in its own subset
46
- addSet(CodeSet.new(EPSILON))
47
-
48
- @prepared = false
49
- end
50
-
51
- def addSet(s)
52
- if @prepared
53
- raise IllegalStateException
54
- end
55
- @setsToAdd.add(s)
56
- end
6
+ # A data structure that transforms a set of CodeSets to a
7
+ # disjoint set of them, such that no two range sets overlap.
8
+ #
9
+ # This is to improve the efficiency of the NFA => DFA algorithm,
10
+ # which involves gathering information about what states are
11
+ # reachable on certain characters. We can't afford to treat each
12
+ # character as a singleton, since the ranges can be quite large.
13
+ # Hence, we want to treat ranges of characters as single entities;
14
+ # this will only work if no two such ranges overlap.
15
+ #
16
+ # It works by starting with a tree whose node is labelled with
17
+ # the maximal superset of character values. Then, for each edge
18
+ # in the NFA, performs a DFS on this tree, splitting any node that
19
+ # only partially intersects any one set that appears in the edge label.
20
+ # The running time is O(n log k), where n is the size of the NFA, and
21
+ # k is the height of the resulting tree.
22
+ #
23
+ # We encourage k to be small by sorting the NFA edges by their
24
+ # label complexity.
25
+ #
26
+ class RangePartition
27
+ # include Tokn
28
+
29
+ def initialize()
30
+ # We will build a tree, where each node has a CodeSet
31
+ # associated with it, and the child nodes (if present)
32
+ # partition this CodeSet into smaller, nonempty sets.
33
+
34
+ # A tree is represented by a node, where each node is a pair [x,y],
35
+ # with x the node's CodeSet, and y a list of the node's children.
57
36
 
58
- def prepare()
59
- if @prepared
60
- raise IllegalStateException
37
+ @nextNodeId = 0
38
+
39
+ # Make the root node hold the largest possible CodeSet.
40
+ # We want to be able to include all the token ids as well.
41
+
42
+ @rootNode = buildNode(CodeSet.new(CODEMIN,CODEMAX))
43
+
44
+ @setsToAdd = Set.new
45
+
46
+ # Add epsilon immediately, so it's always in its own subset
47
+ addSet(CodeSet.new(EPSILON))
48
+
49
+ @prepared = false
61
50
  end
62
-
63
- # Construct partition from previously added sets
64
-
65
- list = @setsToAdd.to_a
66
-
67
- # Sort set by cardinality: probably get a more balanced tree
68
- # if larger sets are processed first
69
- list.sort!{ |x,y| y.cardinality <=> x.cardinality }
70
-
71
- list.each do |s|
72
- addSetAux(s)
73
- end
74
-
75
- @prepared = true
76
- end
77
-
78
51
 
79
- # Generate a .dot file, and from that, a PDF, for debug purposes
80
- #
81
- def generatePDF(name = "partition")
82
- if !@prepared
83
- raise IllegalStateException
84
- end
85
-
86
- g = ""
87
- g += "digraph "+name+" {\n\n"
88
-
89
- nodes = []
90
- buildNodeList(nodes)
91
- nodes.each do |node|
92
- g += " '" + d(node) + "' [shape=rect] [label='" + node.set.to_s_alt + "']\n"
93
- end
94
-
95
- g += "\n"
96
- nodes.each do |node|
97
- node.children.each do |ch|
98
- g += " '" + d(node) + "' -> '" + d(ch) + "'\n"
52
+ def addSet(s)
53
+ if @prepared
54
+ raise IllegalStateException
99
55
  end
56
+ @setsToAdd.add(s)
100
57
  end
101
-
102
- g += "\n}\n"
103
- g.gsub!( /'/, '"' )
104
58
 
105
- dotToPDF(g,name)
106
-
107
- end
108
-
109
-
110
- # Apply the partition to a CodeSet
111
- #
112
- # > s CodeSet
113
- # < array of subsets from the partition whose union equals s
114
- # (this array will be the single element s if no partitioning was necessary)
115
- #
116
- def apply(s)
117
- if !@prepared
118
- raise IllegalStateException
59
+ def prepare()
60
+ if @prepared
61
+ raise IllegalStateException
62
+ end
63
+
64
+ # Construct partition from previously added sets
65
+
66
+ list = @setsToAdd.to_a
67
+
68
+ # Sort set by cardinality: probably get a more balanced tree
69
+ # if larger sets are processed first
70
+ list.sort!{ |x,y| y.cardinality <=> x.cardinality }
71
+
72
+ list.each do |s|
73
+ addSetAux(s)
74
+ end
75
+
76
+ @prepared = true
119
77
  end
120
78
 
121
- list = []
122
- s2 = s.makeCopy
123
- applyAux(@rootNode, s2, list)
124
-
125
- # Sort the list of subsets by their first elements
126
- list.sort! { |x,y| x.array[0] <=> y.array[0] }
127
79
 
128
- list
129
- end
130
-
131
-
132
- private
133
-
134
- def applyAux(n, s, list)
135
- db = false
136
-
137
- !db||pr("applyAux to set[%s], node=[%s]\n",d(s),d(n.set))
138
-
139
- if n.children.empty?
140
- # # Verify that this set equals the input set
141
- # myAssert(s.eql? n.set)
142
- list.push(s)
143
- else
144
- n.children.each do |m|
145
- s1 = s.intersect(m.set)
146
- !db||pr(" child set=[%s], intersection=[%s]\n",d(m.set),d(s1))
147
-
148
- if s1.empty?
149
- next
150
- end
151
-
152
- applyAux(m, s1, list)
153
-
154
- !db||pr(" subtracting child set [%s] from s=[%s]\n",d(m.set),d(s))
155
- s = s.difference(m.set)
156
- !db||pr(" subtracted child set, now [%s]\n",d(s))
157
- if s.empty?
158
- break
80
+ # Generate a .dot file, and from that, a PDF, for debug purposes
81
+ #
82
+ def generatePDF(name = "partition")
83
+ if !@prepared
84
+ raise IllegalStateException
85
+ end
86
+
87
+ g = ""
88
+ g += "digraph "+name+" {\n\n"
89
+
90
+ nodes = []
91
+ buildNodeList(nodes)
92
+ nodes.each do |node|
93
+ g += " '" + d(node) + "' [shape=rect] [label='" + node.set.to_s_alt + "']\n"
94
+ end
95
+
96
+ g += "\n"
97
+ nodes.each do |node|
98
+ node.children.each do |ch|
99
+ g += " '" + d(node) + "' -> '" + d(ch) + "'\n"
159
100
  end
160
101
  end
161
- end
162
- end
102
+
103
+ g += "\n}\n"
104
+ g.gsub!( /'/, '"' )
105
+
106
+ dotToPDF(g,name)
163
107
 
164
- def buildNode(rangeSet)
165
- id = @nextNodeId
166
- @nextNodeId += 1
167
- n = RPNode.new(id, rangeSet, [])
168
- n
169
- end
170
-
171
- def buildNodeList(list, root = nil)
172
- if not root
173
- root = @rootNode
174
- end
175
- list.push(root)
176
- root.children.each do |x|
177
- buildNodeList(list, x)
178
108
  end
179
- end
180
109
 
181
- # Add a set to the tree, extending the tree as necessary to
182
- # maintain a (disjoint) partition
183
- #
184
- def addSetAux(s, n = @rootNode)
185
- #
186
- # The algorithm is this:
110
+
111
+ # Apply the partition to a CodeSet
187
112
  #
188
- # add (s, n) # add set s to node n; s must be subset of n.set
189
- # if n.set = s, return
190
- # if n is leaf:
191
- # x = n.set - s
192
- # add x,s as child sets of n
193
- # else
194
- # for each child m of n:
195
- # t = intersect of m.set and s
196
- # if t is nonempty, add(t, m)
113
+ # > s CodeSet
114
+ # < array of subsets from the partition whose union equals s
115
+ # (this array will be the single element s if no partitioning was necessary)
197
116
  #
198
- if n.set.eql? s
199
- return
200
- end
201
- if n.children.empty?
202
- x = n.set.difference(s)
203
- n.children.push buildNode(x)
204
- n.children.push buildNode(s)
205
- else
206
- n.children.each do |m|
207
- t = m.set.intersect(s)
208
- addSetAux(t,m) unless t.empty?
117
+ def apply(s)
118
+ if !@prepared
119
+ raise IllegalStateException
209
120
  end
121
+
122
+ list = []
123
+ s2 = s.makeCopy
124
+ applyAux(@rootNode, s2, list)
125
+
126
+ # Sort the list of subsets by their first elements
127
+ list.sort! { |x,y| x.array[0] <=> y.array[0] }
128
+
129
+ list
210
130
  end
211
- end
212
131
 
213
- end
214
-
215
- # A node within a RangePartition tree
216
- #
217
- class RPNode
218
132
 
219
- attr_accessor :id, :set, :children
133
+ private
220
134
 
221
- def initialize(id, set, children)
222
- @id = id
223
- @set = set
224
- @children = children
225
- end
135
+ def applyAux(n, s, list)
136
+ db = false
137
+
138
+ !db||pr("applyAux to set[%s], node=[%s]\n",d(s),d(n.set))
139
+
140
+ if n.children.empty?
141
+ # # Verify that this set equals the input set
142
+ # myAssert(s.eql? n.set)
143
+ list.push(s)
144
+ else
145
+ n.children.each do |m|
146
+ s1 = s.intersect(m.set)
147
+ !db||pr(" child set=[%s], intersection=[%s]\n",d(m.set),d(s1))
148
+
149
+ if s1.empty?
150
+ next
151
+ end
152
+
153
+ applyAux(m, s1, list)
154
+
155
+ !db||pr(" subtracting child set [%s] from s=[%s]\n",d(m.set),d(s))
156
+ s = s.difference(m.set)
157
+ !db||pr(" subtracted child set, now [%s]\n",d(s))
158
+ if s.empty?
159
+ break
160
+ end
161
+ end
162
+ end
163
+ end
164
+
165
+ def buildNode(rangeSet)
166
+ id = @nextNodeId
167
+ @nextNodeId += 1
168
+ n = RPNode.new(id, rangeSet, [])
169
+ n
170
+ end
226
171
 
227
- def inspect
228
- return 'N' + id.to_s
172
+ def buildNodeList(list, root = nil)
173
+ if not root
174
+ root = @rootNode
175
+ end
176
+ list.push(root)
177
+ root.children.each do |x|
178
+ buildNodeList(list, x)
179
+ end
180
+ end
181
+
182
+ # Add a set to the tree, extending the tree as necessary to
183
+ # maintain a (disjoint) partition
184
+ #
185
+ def addSetAux(s, n = @rootNode)
186
+ #
187
+ # The algorithm is this:
188
+ #
189
+ # add (s, n) # add set s to node n; s must be subset of n.set
190
+ # if n.set = s, return
191
+ # if n is leaf:
192
+ # x = n.set - s
193
+ # add x,s as child sets of n
194
+ # else
195
+ # for each child m of n:
196
+ # t = intersect of m.set and s
197
+ # if t is nonempty, add(t, m)
198
+ #
199
+ if n.set.eql? s
200
+ return
201
+ end
202
+ if n.children.empty?
203
+ x = n.set.difference(s)
204
+ n.children.push buildNode(x)
205
+ n.children.push buildNode(s)
206
+ else
207
+ n.children.each do |m|
208
+ t = m.set.intersect(s)
209
+ addSetAux(t,m) unless t.empty?
210
+ end
211
+ end
212
+ end
213
+
229
214
  end
230
215
 
231
- end
216
+ # A node within a RangePartition tree
217
+ #
218
+ class RPNode
219
+
220
+ attr_accessor :id, :set, :children
221
+
222
+ def initialize(id, set, children)
223
+ @id = id
224
+ @set = set
225
+ @children = children
226
+ end
227
+
228
+ def inspect
229
+ return 'N' + id.to_s
230
+ end
231
+
232
+ end
232
233
 
234
+ end # module ToknInternal
233
235
 
@@ -1,379 +1,384 @@
1
1
  require_relative 'tools'
2
2
  req('code_set state')
3
3
 
4
- class ParseException < Exception
5
- end
6
-
7
- # Parses a single regular expression from a string.
8
- # Produces an NFA with distinguished start and end states
9
- # (none of these states are marked as final states)
10
- #
11
- # Here is the grammar for regular expressions. Spaces are ignored,
12
- # and can be liberally sprinkled within the regular expressions to
13
- # aid readability. To represent a space, the \s escape sequence must be used.
14
- # See the file 'sampletokens.txt' for some examples.
15
- #
16
- # Expressions have one of these types:
17
- #
18
- # E : base class
19
- # J : a Join expression, formed by concatenating one or more together
20
- # Q : a Quantified expression; followed optionally by '*', '+', or '?'
21
- # P : a Parenthesized expression, which is optionally surrounded with (), {}, []
22
- #
23
- # E -> J '|' E
24
- # | J
25
- #
26
- # J -> Q J
27
- # | Q
28
- #
29
- # Q -> P '*'
30
- # | P '+'
31
- # | P '?'
32
- # | P
33
- #
34
- # P -> '(' E ')'
35
- # | '{' TOKENNAME '}'
36
- # | '[^' SETSEQ ']' A code not appearing in the set
37
- # | '[' SETSEQ ']'
38
- # | CHARCODE
39
- #
40
- # SETSEQ -> SET SETSEQ
41
- # | SET
42
- #
43
- # SET -> CHARCODE
44
- # | CHARCODE '-' CHARCODE
45
- #
46
- # CHARCODE ->
47
- # a | b | c ... any printable except {,},[, etc.
48
- # | \xhh hex value from 00...ff
49
- # | \uhhhh hex value from 0000...ffff (e.g., unicode)
50
- # | \f | \n | \r | \t formfeed, linefeed, return, tab
51
- # | \s a space (' ')
52
- # | \* where * is some other non-alphabetic
53
- # character that needs to be escaped
54
- #
55
- # The parser performs recursive descent parsing;
56
- # each method returns an NFA represented by
57
- # a pair of states: the start and end states.
58
- #
59
- class RegParse
60
-
61
- attr_reader :startState, :endState
4
+ module ToknInternal
62
5
 
63
- # Construct a parser and perform the parsing
64
- # @param script script to parse
65
- # @param tokenDefMap if not nil, a map of previously parsed regular expressions
66
- # (mapping names to ids) to be consulted if a curly brace expression appears
67
- # in the script
68
- #
69
- def initialize(script, tokenDefMap = nil)
70
- @script = script.strip
71
- @nextStateId = 0
72
- @tokenDefMap = tokenDefMap
73
- parseScript
74
- end
75
-
76
-
77
- def inspect
78
- s = "RegParse: #{@script}"
79
- s += " start:"+d(@startState)+" end:"+d(@endState)
80
- return s
81
- end
82
-
83
- private
84
-
85
- # Raise a ParseException, with a helpful message indicating
86
- # the parser's current location within the string
6
+ # Exception raised when there is a problem parsing a regular expression
87
7
  #
88
- def abort(msg)
89
- # Assume we've already read the problem character
90
- i = @cursor - 1
91
- s = ''
92
- if i > 4
93
- s += '...'
94
- end
95
- s += @script[i-3...i] || ""
96
- s += ' !!! '
97
- s += @script[i...i+3] || ""
98
- if i +3 < @script.size
99
- s += '...'
100
- end
101
- raise ParseException, msg + ": "+s
8
+ class ParseException < Exception
102
9
  end
103
10
 
104
- # Read next character as a hex digit
11
+ # Parses a single regular expression from a string.
12
+ # Produces an NFA with distinguished start and end states
13
+ # (none of these states are marked as final states)
105
14
  #
106
- def readHex
107
- v = read.upcase.ord
108
- if v >= 48 and v < 58
109
- return v - 48
110
- elsif v >= 65 and v < 71
111
- return v - 65 + 10
112
- else
113
- abort "Missing hex digit"
114
- end
115
- end
116
-
117
-
118
- NO_ESCAPE_CHARS = Regexp.new("[A-Za-z0-9]")
119
-
120
- # Parse character definition (CHARCODE) from input
15
+ # Here is the grammar for regular expressions. Spaces are ignored,
16
+ # and can be liberally sprinkled within the regular expressions to
17
+ # aid readability. To represent a space, the \s escape sequence must be used.
18
+ # See the file 'sampletokens.txt' for some examples.
19
+ #
20
+ # Expressions have one of these types:
21
+ #
22
+ # E : base class
23
+ # J : a Join expression, formed by concatenating one or more together
24
+ # Q : a Quantified expression; followed optionally by '*', '+', or '?'
25
+ # P : a Parenthesized expression, which is optionally surrounded with (), {}, []
26
+ #
27
+ # E -> J '|' E
28
+ # | J
29
+ #
30
+ # J -> Q J
31
+ # | Q
32
+ #
33
+ # Q -> P '*'
34
+ # | P '+'
35
+ # | P '?'
36
+ # | P
37
+ #
38
+ # P -> '(' E ')'
39
+ # | '{' TOKENNAME '}'
40
+ # | '[^' SETSEQ ']' A code not appearing in the set
41
+ # | '[' SETSEQ ']'
42
+ # | CHARCODE
43
+ #
44
+ # SETSEQ -> SET SETSEQ
45
+ # | SET
46
+ #
47
+ # SET -> CHARCODE
48
+ # | CHARCODE '-' CHARCODE
121
49
  #
122
- def parseChar
50
+ # CHARCODE ->
51
+ # a | b | c ... any printable except {,},[, etc.
52
+ # | \xhh hex value from 00...ff
53
+ # | \uhhhh hex value from 0000...ffff (e.g., unicode)
54
+ # | \f | \n | \r | \t formfeed, linefeed, return, tab
55
+ # | \s a space (' ')
56
+ # | \* where * is some other non-alphabetic
57
+ # character that needs to be escaped
58
+ #
59
+ # The parser performs recursive descent parsing;
60
+ # each method returns an NFA represented by
61
+ # a pair of states: the start and end states.
62
+ #
63
+ class RegParse
123
64
 
124
- c = read
65
+ attr_reader :startState, :endState
125
66
 
126
- val = c.ord
67
+ # Construct a parser and perform the parsing
68
+ # @param script script to parse
69
+ # @param tokenDefMap if not nil, a map of previously parsed regular expressions
70
+ # (mapping names to ids) to be consulted if a curly brace expression appears
71
+ # in the script
72
+ #
73
+ def initialize(script, tokenDefMap = nil)
74
+ @script = script.strip
75
+ @nextStateId = 0
76
+ @tokenDefMap = tokenDefMap
77
+ parseScript
78
+ end
79
+
80
+
81
+ def inspect
82
+ s = "RegParse: #{@script}"
83
+ s += " start:"+d(@startState)+" end:"+d(@endState)
84
+ return s
85
+ end
86
+
87
+ private
88
+
89
+ # Raise a ParseException, with a helpful message indicating
90
+ # the parser's current location within the string
91
+ #
92
+ def abort(msg)
93
+ # Assume we've already read the problem character
94
+ i = @cursor - 1
95
+ s = ''
96
+ if i > 4
97
+ s += '...'
98
+ end
99
+ s += @script[i-3...i] || ""
100
+ s += ' !!! '
101
+ s += @script[i...i+3] || ""
102
+ if i +3 < @script.size
103
+ s += '...'
104
+ end
105
+ raise ParseException, msg + ": "+s
106
+ end
127
107
 
128
- if "{}[]*?+|-^()".include?(c) or val <= 0x20
129
- abort "Unexpected or unescaped character"
108
+ # Read next character as a hex digit
109
+ #
110
+ def readHex
111
+ v = read.upcase.ord
112
+ if v >= 48 and v < 58
113
+ return v - 48
114
+ elsif v >= 65 and v < 71
115
+ return v - 65 + 10
116
+ else
117
+ abort "Missing hex digit"
118
+ end
130
119
  end
131
-
132
- if c == '\\'
120
+
121
+
122
+ NO_ESCAPE_CHARS = Regexp.new("[A-Za-z0-9]")
123
+
124
+ # Parse character definition (CHARCODE) from input
125
+ #
126
+ def parseChar
133
127
 
134
128
  c = read
135
129
 
136
- if "xX".include? c
137
- val = (readHex() << 4) | readHex()
138
- elsif "uU".include? c
139
- val = (readHex() << 12) | (readHex() << 8) | (readHex() << 4) | readHex()
140
- else
141
- if c == 'f'
142
- val = "\f".ord
143
- elsif c == 'r'
144
- val == "\r".ord
145
- elsif c == 'n'
146
- val = "\n".ord
147
- elsif c == 't'
148
- val = "\t".ord
149
- elsif c == 's'
150
- val = " ".ord
130
+ val = c.ord
131
+
132
+ if "{}[]*?+|-^()".include?(c) or val <= 0x20
133
+ abort "Unexpected or unescaped character"
134
+ end
135
+
136
+ if c == '\\'
137
+
138
+ c = read
139
+
140
+ if "xX".include? c
141
+ val = (readHex() << 4) | readHex()
142
+ elsif "uU".include? c
143
+ val = (readHex() << 12) | (readHex() << 8) | (readHex() << 4) | readHex()
151
144
  else
152
- if c =~ NO_ESCAPE_CHARS
153
- abort "Unsupported escape sequence ("+c+")"
154
- end
155
- val = c.ord
156
- end
145
+ if c == 'f'
146
+ val = "\f".ord
147
+ elsif c == 'r'
148
+ val == "\r".ord
149
+ elsif c == 'n'
150
+ val = "\n".ord
151
+ elsif c == 't'
152
+ val = "\t".ord
153
+ elsif c == 's'
154
+ val = " ".ord
155
+ else
156
+ if c =~ NO_ESCAPE_CHARS
157
+ abort "Unsupported escape sequence ("+c+")"
158
+ end
159
+ val = c.ord
160
+ end
161
+ end
157
162
  end
163
+
164
+ return val
158
165
  end
159
-
160
- return val
161
- end
162
-
163
166
 
164
- def parseCharNFA
165
- val = parseChar
166
-
167
- # Construct a pair of states with an edge between them
168
- # labelled with this character code
169
167
 
170
- sA = newState
171
- sB = newState
172
- cset = CodeSet.new
173
- cset.add(val)
174
- sA.addEdge(cset, sB)
175
- return [sA,sB]
176
- end
177
-
178
-
168
+ def parseCharNFA
169
+ val = parseChar
179
170
 
180
- def dbInfo
181
- j = @cursor
182
- k = j + 5
183
- if k >= @script.size
184
- return @script[j..k]+"<<<== end"
185
- else
186
- return @script[j..k]+"..."
171
+ # Construct a pair of states with an edge between them
172
+ # labelled with this character code
173
+
174
+ sA = newState
175
+ sB = newState
176
+ cset = CodeSet.new
177
+ cset.add(val)
178
+ sA.addEdge(cset, sB)
179
+ return [sA,sB]
187
180
  end
188
- end
189
-
190
- def parseScript
191
- # Set up the input scanner
192
- @cursor = 0
193
181
 
194
- exp = parseE
195
- @startState = exp[0]
196
- @endState = exp[1]
197
- end
198
-
199
- def newState
200
- s = State.new(@nextStateId)
201
- @nextStateId += 1
202
- return s
203
- end
204
-
205
- def parseSET
206
- u = parseChar
207
- v = u+1
208
- if readIf('-')
209
- v = parseChar() + 1
210
- if v <= u
211
- abort "Illegal range"
212
- end
213
- end
214
- return u,v
215
- end
216
-
217
- def parseSETSEQ
218
- db = false
219
-
220
- !db || pr("parseSETSEQ\n")
221
-
222
- read('[')
223
- negated = readIf('^')
224
- !db || pr(" negated=%s\n",negated)
225
182
 
226
- rs = CodeSet.new
183
+
184
+ def dbInfo
185
+ j = @cursor
186
+ k = j + 5
187
+ if k >= @script.size
188
+ return @script[j..k]+"<<<== end"
189
+ else
190
+ return @script[j..k]+"..."
191
+ end
192
+ end
227
193
 
228
- u,v = parseSET
229
- rs.add(u,v)
230
- !db || pr(" initial set=%s\n",d(rs))
231
-
232
- while not readIf(']')
233
- u,v = parseSET
234
- rs.add(u,v)
235
- !db || pr(" added another; %s\n",d(rs))
236
- end
237
- if negated
238
- rs.negate
239
- !db || pr(" negated=%s\n",d(rs))
194
+ def parseScript
195
+ # Set up the input scanner
196
+ @cursor = 0
197
+
198
+ exp = parseE
199
+ @startState = exp[0]
200
+ @endState = exp[1]
240
201
  end
241
-
242
- if rs.empty?
243
- abort "Empty character range"
202
+
203
+ def newState
204
+ s = State.new(@nextStateId)
205
+ @nextStateId += 1
206
+ return s
244
207
  end
245
208
 
246
- sA = newState
247
- sB = newState
248
- sA.addEdge(rs, sB)
249
- return [sA,sB]
250
- end
209
+ def parseSET
210
+ u = parseChar
211
+ v = u+1
212
+ if readIf('-')
213
+ v = parseChar() + 1
214
+ if v <= u
215
+ abort "Illegal range"
216
+ end
217
+ end
218
+ return u,v
219
+ end
251
220
 
252
- TOKENREF_EXPR = Regexp.new('^[_A-Za-z][_A-Za-z0-9]*$')
221
+ def parseSETSEQ
222
+ db = false
223
+
224
+ !db || pr("parseSETSEQ\n")
225
+
226
+ read('[')
227
+ negated = readIf('^')
228
+ !db || pr(" negated=%s\n",negated)
229
+
230
+ rs = CodeSet.new
231
+
232
+ u,v = parseSET
233
+ rs.add(u,v)
234
+ !db || pr(" initial set=%s\n",d(rs))
253
235
 
254
- def parseTokenDef
255
- read('{')
256
- name = ''
257
- while !readIf('}')
258
- name += read
259
- end
260
- # pr("name=[%s], TR=[%s], match=[%s]\n",d(name),d(TOKENREF_EXPR),d(name =~ TOKENREF_EXPR))
261
- if name !~ TOKENREF_EXPR
262
- abort "Problem with token name"
263
- end
264
- tokInfo = nil
265
- if @tokenDefMap
266
- tokInfo = @tokenDefMap[name]
267
- end
268
- if !tokInfo
269
- abort "Undefined token"
236
+ while not readIf(']')
237
+ u,v = parseSET
238
+ rs.add(u,v)
239
+ !db || pr(" added another; %s\n",d(rs))
240
+ end
241
+ if negated
242
+ rs.negate
243
+ !db || pr(" negated=%s\n",d(rs))
244
+ end
245
+
246
+ if rs.empty?
247
+ abort "Empty character range"
248
+ end
249
+
250
+ sA = newState
251
+ sB = newState
252
+ sA.addEdge(rs, sB)
253
+ return [sA,sB]
270
254
  end
271
- rg = tokInfo[1]
272
255
 
273
- oldToNewMap, @nextStateId = rg.startState.duplicateNFA(@nextStateId)
256
+ TOKENREF_EXPR = Regexp.new('^[_A-Za-z][_A-Za-z0-9]*$')
274
257
 
275
- newStart = oldToNewMap[rg.startState]
276
- newEnd = oldToNewMap[rg.endState]
258
+ def parseTokenDef
259
+ read('{')
260
+ name = ''
261
+ while !readIf('}')
262
+ name += read
263
+ end
264
+ # pr("name=[%s], TR=[%s], match=[%s]\n",d(name),d(TOKENREF_EXPR),d(name =~ TOKENREF_EXPR))
265
+ if name !~ TOKENREF_EXPR
266
+ abort "Problem with token name"
267
+ end
268
+ tokInfo = nil
269
+ if @tokenDefMap
270
+ tokInfo = @tokenDefMap[name]
271
+ end
272
+ if !tokInfo
273
+ abort "Undefined token"
274
+ end
275
+ rg = tokInfo[1]
276
+
277
+ oldToNewMap, @nextStateId = rg.startState.duplicateNFA(@nextStateId)
278
+
279
+ newStart = oldToNewMap[rg.startState]
280
+ newEnd = oldToNewMap[rg.endState]
281
+
282
+ [newStart, newEnd]
283
+
284
+
285
+ end
277
286
 
278
- [newStart, newEnd]
279
287
 
280
-
281
- end
282
-
283
-
284
- def parseP
285
- ch = peek
286
- if ch == '('
287
- read
288
- e1 = parseE
289
- read ')'
290
- elsif ch == '{'
291
- e1 = parseTokenDef
292
- elsif ch == '['
293
- e1 = parseSETSEQ
294
- else
295
- e1 = parseCharNFA
288
+ def parseP
289
+ ch = peek
290
+ if ch == '('
291
+ read
292
+ e1 = parseE
293
+ read ')'
294
+ elsif ch == '{'
295
+ e1 = parseTokenDef
296
+ elsif ch == '['
297
+ e1 = parseSETSEQ
298
+ else
299
+ e1 = parseCharNFA
300
+ end
301
+ return e1
302
+ end
303
+
304
+
305
+ def parseE
306
+ e1 = parseJ
307
+ if readIf('|')
308
+ e2 = parseE
309
+
310
+ u = newState
311
+ v = newState
312
+ u.addEps(e1[0])
313
+ u.addEps(e2[0])
314
+ e1[1].addEps(v)
315
+ e2[1].addEps(v)
316
+ e1 = [u,v]
317
+ end
318
+ return e1
296
319
  end
297
- return e1
298
- end
299
320
 
300
-
301
- def parseE
302
- e1 = parseJ
303
- if readIf('|')
304
- e2 = parseE
321
+ def parseJ
322
+ e1 = parseQ
323
+ p = peek
324
+ if p and not "|)".include? p
325
+ e2 = parseJ
326
+ e1[1].addEps(e2[0])
327
+ e1 = [e1[0],e2[1]]
328
+ end
305
329
 
306
- u = newState
307
- v = newState
308
- u.addEps(e1[0])
309
- u.addEps(e2[0])
310
- e1[1].addEps(v)
311
- e2[1].addEps(v)
312
- e1 = [u,v]
313
- end
314
- return e1
315
- end
316
-
317
- def parseJ
318
- e1 = parseQ
319
- p = peek
320
- if p and not "|)".include? p
321
- e2 = parseJ
322
- e1[1].addEps(e2[0])
323
- e1 = [e1[0],e2[1]]
330
+ return e1
324
331
  end
325
332
 
326
- return e1
327
- end
328
-
329
- def parseQ
330
- e1 = parseP
331
- p = peek
333
+ def parseQ
334
+ e1 = parseP
335
+ p = peek
336
+
337
+ if p == '*'
338
+ read
339
+ e1[0].addEps(e1[1])
340
+ e1[1].addEps(e1[0])
341
+ elsif p == '+'
342
+ read
343
+ e1[1].addEps(e1[0])
344
+ elsif p == '?'
345
+ read
346
+ e1[0].addEps(e1[1])
347
+ # e1[0].generatePDF("optional")
348
+ end
349
+ return e1
350
+ end
332
351
 
333
- if p == '*'
334
- read
335
- e1[0].addEps(e1[1])
336
- e1[1].addEps(e1[0])
337
- elsif p == '+'
338
- read
339
- e1[1].addEps(e1[0])
340
- elsif p == '?'
341
- read
342
- e1[0].addEps(e1[1])
343
- # e1[0].generatePDF("optional")
352
+
353
+ def peek(mustExist = false)
354
+ # skip over any non-linefeed whitespace
355
+ while @cursor < @script.size && " \t".index(@script[@cursor])
356
+ @cursor += 1
357
+ end
358
+ if mustExist or @cursor < @script.size
359
+ @script[@cursor]
360
+ else
361
+ nil
362
+ end
344
363
  end
345
- return e1
346
- end
347
-
348
364
 
349
- def peek(mustExist = false)
350
- # skip over any non-linefeed whitespace
351
- while @cursor < @script.size && " \t".index(@script[@cursor])
352
- @cursor += 1
365
+ def readIf(expChar)
366
+ r = (peek == expChar)
367
+ if r
368
+ read
369
+ end
370
+ return r
353
371
  end
354
- if mustExist or @cursor < @script.size
355
- @script[@cursor]
356
- else
357
- nil
372
+
373
+ def read(expChar = nil)
374
+ ch = peek
375
+ if ch and ((not expChar) or ch == expChar)
376
+ @cursor += 1
377
+ ch
378
+ else
379
+ abort 'Unexpected end of input'
380
+ end
358
381
  end
359
382
  end
360
383
 
361
- def readIf(expChar)
362
- r = (peek == expChar)
363
- if r
364
- read
365
- end
366
- return r
367
- end
368
-
369
- def read(expChar = nil)
370
- ch = peek
371
- if ch and ((not expChar) or ch == expChar)
372
- @cursor += 1
373
- ch
374
- else
375
- abort 'Unexpected end of input'
376
- end
377
- end
378
- end
379
-
384
+ end # module ToknInternal