tokn 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,233 +1,235 @@
1
1
  require_relative 'tools'
2
2
  req('tokn_const code_set')
3
3
 
4
-
5
- # A data structure that transforms a set of CodeSets to a
6
- # disjoint set of them, such that no two range sets overlap.
7
- #
8
- # This is improve the efficiency of the NFA => DFA algorithm,
9
- # which involves gathering information about what states are
10
- # reachable on certain characters. We can't afford to treat each
11
- # character as a singleton, since the ranges can be quite large.
12
- # Hence, we want to treat ranges of characters as single entities;
13
- # this will only work if no two such ranges overlap.
14
- #
15
- # It works by starting with a tree whose node is labelled with
16
- # the maximal superset of character values. Then, for each edge
17
- # in the NFA, performs a DFS on this tree, splitting any node that
18
- # only partially intersects any one set that appears in the edge label.
19
- # The running time is O(n log k), where n is the size of the NFA, and
20
- # k is the height of the resulting tree.
21
- #
22
- # We encourage k to be small by sorting the NFA edges by their
23
- # label complexity.
24
- #
25
- class RangePartition
26
- include Tokn
4
+ module ToknInternal
27
5
 
28
- def initialize()
29
- # We will build a tree, where each node has a CodeSet
30
- # associated with it, and the child nodes (if present)
31
- # partition this CodeSet into smaller, nonempty sets.
32
-
33
- # A tree is represented by a node, where each node is a pair [x,y],
34
- # with x the node's CodeSet, and y a list of the node's children.
35
-
36
- @nextNodeId = 0
37
-
38
- # Make the root node hold the largest possible CodeSet.
39
- # We want to be able to include all the token ids as well.
40
-
41
- @rootNode = buildNode(CodeSet.new(CODEMIN,CODEMAX))
42
-
43
- @setsToAdd = Set.new
44
-
45
- # Add epsilon immediately, so it's always in its own subset
46
- addSet(CodeSet.new(EPSILON))
47
-
48
- @prepared = false
49
- end
50
-
51
- def addSet(s)
52
- if @prepared
53
- raise IllegalStateException
54
- end
55
- @setsToAdd.add(s)
56
- end
6
+ # A data structure that transforms a set of CodeSets to a
7
+ # disjoint set of them, such that no two range sets overlap.
8
+ #
9
+ # This is to improve the efficiency of the NFA => DFA algorithm,
10
+ # which involves gathering information about what states are
11
+ # reachable on certain characters. We can't afford to treat each
12
+ # character as a singleton, since the ranges can be quite large.
13
+ # Hence, we want to treat ranges of characters as single entities;
14
+ # this will only work if no two such ranges overlap.
15
+ #
16
+ # It works by starting with a tree whose node is labelled with
17
+ # the maximal superset of character values. Then, for each edge
18
+ # in the NFA, performs a DFS on this tree, splitting any node that
19
+ # only partially intersects any one set that appears in the edge label.
20
+ # The running time is O(n log k), where n is the size of the NFA, and
21
+ # k is the height of the resulting tree.
22
+ #
23
+ # We encourage k to be small by sorting the NFA edges by their
24
+ # label complexity.
25
+ #
26
+ class RangePartition
27
+ # include Tokn
28
+
29
+ def initialize()
30
+ # We will build a tree, where each node has a CodeSet
31
+ # associated with it, and the child nodes (if present)
32
+ # partition this CodeSet into smaller, nonempty sets.
33
+
34
+ # A tree is represented by a node, where each node is a pair [x,y],
35
+ # with x the node's CodeSet, and y a list of the node's children.
57
36
 
58
- def prepare()
59
- if @prepared
60
- raise IllegalStateException
37
+ @nextNodeId = 0
38
+
39
+ # Make the root node hold the largest possible CodeSet.
40
+ # We want to be able to include all the token ids as well.
41
+
42
+ @rootNode = buildNode(CodeSet.new(CODEMIN,CODEMAX))
43
+
44
+ @setsToAdd = Set.new
45
+
46
+ # Add epsilon immediately, so it's always in its own subset
47
+ addSet(CodeSet.new(EPSILON))
48
+
49
+ @prepared = false
61
50
  end
62
-
63
- # Construct partition from previously added sets
64
-
65
- list = @setsToAdd.to_a
66
-
67
- # Sort set by cardinality: probably get a more balanced tree
68
- # if larger sets are processed first
69
- list.sort!{ |x,y| y.cardinality <=> x.cardinality }
70
-
71
- list.each do |s|
72
- addSetAux(s)
73
- end
74
-
75
- @prepared = true
76
- end
77
-
78
51
 
79
- # Generate a .dot file, and from that, a PDF, for debug purposes
80
- #
81
- def generatePDF(name = "partition")
82
- if !@prepared
83
- raise IllegalStateException
84
- end
85
-
86
- g = ""
87
- g += "digraph "+name+" {\n\n"
88
-
89
- nodes = []
90
- buildNodeList(nodes)
91
- nodes.each do |node|
92
- g += " '" + d(node) + "' [shape=rect] [label='" + node.set.to_s_alt + "']\n"
93
- end
94
-
95
- g += "\n"
96
- nodes.each do |node|
97
- node.children.each do |ch|
98
- g += " '" + d(node) + "' -> '" + d(ch) + "'\n"
52
+ def addSet(s)
53
+ if @prepared
54
+ raise IllegalStateException
99
55
  end
56
+ @setsToAdd.add(s)
100
57
  end
101
-
102
- g += "\n}\n"
103
- g.gsub!( /'/, '"' )
104
58
 
105
- dotToPDF(g,name)
106
-
107
- end
108
-
109
-
110
- # Apply the partition to a CodeSet
111
- #
112
- # > s CodeSet
113
- # < array of subsets from the partition whose union equals s
114
- # (this array will be the single element s if no partitioning was necessary)
115
- #
116
- def apply(s)
117
- if !@prepared
118
- raise IllegalStateException
59
+ def prepare()
60
+ if @prepared
61
+ raise IllegalStateException
62
+ end
63
+
64
+ # Construct partition from previously added sets
65
+
66
+ list = @setsToAdd.to_a
67
+
68
+ # Sort set by cardinality: probably get a more balanced tree
69
+ # if larger sets are processed first
70
+ list.sort!{ |x,y| y.cardinality <=> x.cardinality }
71
+
72
+ list.each do |s|
73
+ addSetAux(s)
74
+ end
75
+
76
+ @prepared = true
119
77
  end
120
78
 
121
- list = []
122
- s2 = s.makeCopy
123
- applyAux(@rootNode, s2, list)
124
-
125
- # Sort the list of subsets by their first elements
126
- list.sort! { |x,y| x.array[0] <=> y.array[0] }
127
79
 
128
- list
129
- end
130
-
131
-
132
- private
133
-
134
- def applyAux(n, s, list)
135
- db = false
136
-
137
- !db||pr("applyAux to set[%s], node=[%s]\n",d(s),d(n.set))
138
-
139
- if n.children.empty?
140
- # # Verify that this set equals the input set
141
- # myAssert(s.eql? n.set)
142
- list.push(s)
143
- else
144
- n.children.each do |m|
145
- s1 = s.intersect(m.set)
146
- !db||pr(" child set=[%s], intersection=[%s]\n",d(m.set),d(s1))
147
-
148
- if s1.empty?
149
- next
150
- end
151
-
152
- applyAux(m, s1, list)
153
-
154
- !db||pr(" subtracting child set [%s] from s=[%s]\n",d(m.set),d(s))
155
- s = s.difference(m.set)
156
- !db||pr(" subtracted child set, now [%s]\n",d(s))
157
- if s.empty?
158
- break
80
+ # Generate a .dot file, and from that, a PDF, for debug purposes
81
+ #
82
+ def generatePDF(name = "partition")
83
+ if !@prepared
84
+ raise IllegalStateException
85
+ end
86
+
87
+ g = ""
88
+ g += "digraph "+name+" {\n\n"
89
+
90
+ nodes = []
91
+ buildNodeList(nodes)
92
+ nodes.each do |node|
93
+ g += " '" + d(node) + "' [shape=rect] [label='" + node.set.to_s_alt + "']\n"
94
+ end
95
+
96
+ g += "\n"
97
+ nodes.each do |node|
98
+ node.children.each do |ch|
99
+ g += " '" + d(node) + "' -> '" + d(ch) + "'\n"
159
100
  end
160
101
  end
161
- end
162
- end
102
+
103
+ g += "\n}\n"
104
+ g.gsub!( /'/, '"' )
105
+
106
+ dotToPDF(g,name)
163
107
 
164
- def buildNode(rangeSet)
165
- id = @nextNodeId
166
- @nextNodeId += 1
167
- n = RPNode.new(id, rangeSet, [])
168
- n
169
- end
170
-
171
- def buildNodeList(list, root = nil)
172
- if not root
173
- root = @rootNode
174
- end
175
- list.push(root)
176
- root.children.each do |x|
177
- buildNodeList(list, x)
178
108
  end
179
- end
180
109
 
181
- # Add a set to the tree, extending the tree as necessary to
182
- # maintain a (disjoint) partition
183
- #
184
- def addSetAux(s, n = @rootNode)
185
- #
186
- # The algorithm is this:
110
+
111
+ # Apply the partition to a CodeSet
187
112
  #
188
- # add (s, n) # add set s to node n; s must be subset of n.set
189
- # if n.set = s, return
190
- # if n is leaf:
191
- # x = n.set - s
192
- # add x,y as child sets of n
193
- # else
194
- # for each child m of n:
195
- # t = intersect of m.set and s
196
- # if t is nonempty, add(t, m)
113
+ # > s CodeSet
114
+ # < array of subsets from the partition whose union equals s
115
+ # (this array will be the single element s if no partitioning was necessary)
197
116
  #
198
- if n.set.eql? s
199
- return
200
- end
201
- if n.children.empty?
202
- x = n.set.difference(s)
203
- n.children.push buildNode(x)
204
- n.children.push buildNode(s)
205
- else
206
- n.children.each do |m|
207
- t = m.set.intersect(s)
208
- addSetAux(t,m) unless t.empty?
117
+ def apply(s)
118
+ if !@prepared
119
+ raise IllegalStateException
209
120
  end
121
+
122
+ list = []
123
+ s2 = s.makeCopy
124
+ applyAux(@rootNode, s2, list)
125
+
126
+ # Sort the list of subsets by their first elements
127
+ list.sort! { |x,y| x.array[0] <=> y.array[0] }
128
+
129
+ list
210
130
  end
211
- end
212
131
 
213
- end
214
-
215
- # A node within a RangePartition tree
216
- #
217
- class RPNode
218
132
 
219
- attr_accessor :id, :set, :children
133
+ private
220
134
 
221
- def initialize(id, set, children)
222
- @id = id
223
- @set = set
224
- @children = children
225
- end
135
+ def applyAux(n, s, list)
136
+ db = false
137
+
138
+ !db||pr("applyAux to set[%s], node=[%s]\n",d(s),d(n.set))
139
+
140
+ if n.children.empty?
141
+ # # Verify that this set equals the input set
142
+ # myAssert(s.eql? n.set)
143
+ list.push(s)
144
+ else
145
+ n.children.each do |m|
146
+ s1 = s.intersect(m.set)
147
+ !db||pr(" child set=[%s], intersection=[%s]\n",d(m.set),d(s1))
148
+
149
+ if s1.empty?
150
+ next
151
+ end
152
+
153
+ applyAux(m, s1, list)
154
+
155
+ !db||pr(" subtracting child set [%s] from s=[%s]\n",d(m.set),d(s))
156
+ s = s.difference(m.set)
157
+ !db||pr(" subtracted child set, now [%s]\n",d(s))
158
+ if s.empty?
159
+ break
160
+ end
161
+ end
162
+ end
163
+ end
164
+
165
+ def buildNode(rangeSet)
166
+ id = @nextNodeId
167
+ @nextNodeId += 1
168
+ n = RPNode.new(id, rangeSet, [])
169
+ n
170
+ end
226
171
 
227
- def inspect
228
- return 'N' + id.to_s
172
+ def buildNodeList(list, root = nil)
173
+ if not root
174
+ root = @rootNode
175
+ end
176
+ list.push(root)
177
+ root.children.each do |x|
178
+ buildNodeList(list, x)
179
+ end
180
+ end
181
+
182
+ # Add a set to the tree, extending the tree as necessary to
183
+ # maintain a (disjoint) partition
184
+ #
185
+ def addSetAux(s, n = @rootNode)
186
+ #
187
+ # The algorithm is this:
188
+ #
189
+ # add (s, n) # add set s to node n; s must be subset of n.set
190
+ # if n.set = s, return
191
+ # if n is leaf:
192
+ # x = n.set - s
193
+ # add x,y as child sets of n
194
+ # else
195
+ # for each child m of n:
196
+ # t = intersect of m.set and s
197
+ # if t is nonempty, add(t, m)
198
+ #
199
+ if n.set.eql? s
200
+ return
201
+ end
202
+ if n.children.empty?
203
+ x = n.set.difference(s)
204
+ n.children.push buildNode(x)
205
+ n.children.push buildNode(s)
206
+ else
207
+ n.children.each do |m|
208
+ t = m.set.intersect(s)
209
+ addSetAux(t,m) unless t.empty?
210
+ end
211
+ end
212
+ end
213
+
229
214
  end
230
215
 
231
- end
216
+ # A node within a RangePartition tree
217
+ #
218
+ class RPNode
219
+
220
+ attr_accessor :id, :set, :children
221
+
222
+ def initialize(id, set, children)
223
+ @id = id
224
+ @set = set
225
+ @children = children
226
+ end
227
+
228
+ def inspect
229
+ return 'N' + id.to_s
230
+ end
231
+
232
+ end
232
233
 
234
+ end # module ToknInternal
233
235
 
@@ -1,379 +1,384 @@
1
1
  require_relative 'tools'
2
2
  req('code_set state')
3
3
 
4
- class ParseException < Exception
5
- end
6
-
7
- # Parses a single regular expression from a string.
8
- # Produces an NFA with distinguished start and end states
9
- # (none of these states are marked as final states)
10
- #
11
- # Here is the grammar for regular expressions. Spaces are ignored,
12
- # and can be liberally sprinkled within the regular expressions to
13
- # aid readability. To represent a space, the \s escape sequence must be used.
14
- # See the file 'sampletokens.txt' for some examples.
15
- #
16
- # Expressions have one of these types:
17
- #
18
- # E : base class
19
- # J : a Join expression, formed by concatenating one or more together
20
- # Q : a Quantified expression; followed optionally by '*', '+', or '?'
21
- # P : a Parenthesized expression, which is optionally surrounded with (), {}, []
22
- #
23
- # E -> J '|' E
24
- # | J
25
- #
26
- # J -> Q J
27
- # | Q
28
- #
29
- # Q -> P '*'
30
- # | P '+'
31
- # | P '?'
32
- # | P
33
- #
34
- # P -> '(' E ')'
35
- # | '{' TOKENNAME '}'
36
- # | '[^' SETSEQ ']' A code not appearing in the set
37
- # | '[' SETSEQ ']'
38
- # | CHARCODE
39
- #
40
- # SETSEQ -> SET SETSEQ
41
- # | SET
42
- #
43
- # SET -> CHARCODE
44
- # | CHARCODE '-' CHARCODE
45
- #
46
- # CHARCODE ->
47
- # a | b | c ... any printable except {,},[, etc.
48
- # | \xhh hex value from 00...ff
49
- # | \uhhhh hex value from 0000...ffff (e.g., unicode)
50
- # | \f | \n | \r | \t formfeed, linefeed, return, tab
51
- # | \s a space (' ')
52
- # | \* where * is some other non-alphabetic
53
- # character that needs to be escaped
54
- #
55
- # The parser performs recursive descent parsing;
56
- # each method returns an NFA represented by
57
- # a pair of states: the start and end states.
58
- #
59
- class RegParse
60
-
61
- attr_reader :startState, :endState
4
+ module ToknInternal
62
5
 
63
- # Construct a parser and perform the parsing
64
- # @param script script to parse
65
- # @param tokenDefMap if not nil, a map of previously parsed regular expressions
66
- # (mapping names to ids) to be consulted if a curly brace expression appears
67
- # in the script
68
- #
69
- def initialize(script, tokenDefMap = nil)
70
- @script = script.strip
71
- @nextStateId = 0
72
- @tokenDefMap = tokenDefMap
73
- parseScript
74
- end
75
-
76
-
77
- def inspect
78
- s = "RegParse: #{@script}"
79
- s += " start:"+d(@startState)+" end:"+d(@endState)
80
- return s
81
- end
82
-
83
- private
84
-
85
- # Raise a ParseException, with a helpful message indicating
86
- # the parser's current location within the string
6
+ # Exception raised when a regular expression cannot be parsed
87
7
  #
88
- def abort(msg)
89
- # Assume we've already read the problem character
90
- i = @cursor - 1
91
- s = ''
92
- if i > 4
93
- s += '...'
94
- end
95
- s += @script[i-3...i] || ""
96
- s += ' !!! '
97
- s += @script[i...i+3] || ""
98
- if i +3 < @script.size
99
- s += '...'
100
- end
101
- raise ParseException, msg + ": "+s
8
+ class ParseException < Exception
102
9
  end
103
10
 
104
- # Read next character as a hex digit
11
+ # Parses a single regular expression from a string.
12
+ # Produces an NFA with distinguished start and end states
13
+ # (none of these states are marked as final states)
105
14
  #
106
- def readHex
107
- v = read.upcase.ord
108
- if v >= 48 and v < 58
109
- return v - 48
110
- elsif v >= 65 and v < 71
111
- return v - 65 + 10
112
- else
113
- abort "Missing hex digit"
114
- end
115
- end
116
-
117
-
118
- NO_ESCAPE_CHARS = Regexp.new("[A-Za-z0-9]")
119
-
120
- # Parse character definition (CHARCODE) from input
15
+ # Here is the grammar for regular expressions. Spaces are ignored,
16
+ # and can be liberally sprinkled within the regular expressions to
17
+ # aid readability. To represent a space, the \s escape sequence must be used.
18
+ # See the file 'sampletokens.txt' for some examples.
19
+ #
20
+ # Expressions have one of these types:
21
+ #
22
+ # E : base class
23
+ # J : a Join expression, formed by concatenating one or more together
24
+ # Q : a Quantified expression; followed optionally by '*', '+', or '?'
25
+ # P : a Parenthesized expression, which is optionally surrounded with (), {}, []
26
+ #
27
+ # E -> J '|' E
28
+ # | J
29
+ #
30
+ # J -> Q J
31
+ # | Q
32
+ #
33
+ # Q -> P '*'
34
+ # | P '+'
35
+ # | P '?'
36
+ # | P
37
+ #
38
+ # P -> '(' E ')'
39
+ # | '{' TOKENNAME '}'
40
+ # | '[^' SETSEQ ']' A code not appearing in the set
41
+ # | '[' SETSEQ ']'
42
+ # | CHARCODE
43
+ #
44
+ # SETSEQ -> SET SETSEQ
45
+ # | SET
46
+ #
47
+ # SET -> CHARCODE
48
+ # | CHARCODE '-' CHARCODE
121
49
  #
122
- def parseChar
50
+ # CHARCODE ->
51
+ # a | b | c ... any printable except {,},[, etc.
52
+ # | \xhh hex value from 00...ff
53
+ # | \uhhhh hex value from 0000...ffff (e.g., unicode)
54
+ # | \f | \n | \r | \t formfeed, linefeed, return, tab
55
+ # | \s a space (' ')
56
+ # | \* where * is some other non-alphabetic
57
+ # character that needs to be escaped
58
+ #
59
+ # The parser performs recursive descent parsing;
60
+ # each method returns an NFA represented by
61
+ # a pair of states: the start and end states.
62
+ #
63
+ class RegParse
123
64
 
124
- c = read
65
+ attr_reader :startState, :endState
125
66
 
126
- val = c.ord
67
+ # Construct a parser and perform the parsing
68
+ # @param script script to parse
69
+ # @param tokenDefMap if not nil, a map of previously parsed regular expressions
70
+ # (mapping names to ids) to be consulted if a curly brace expression appears
71
+ # in the script
72
+ #
73
+ def initialize(script, tokenDefMap = nil)
74
+ @script = script.strip
75
+ @nextStateId = 0
76
+ @tokenDefMap = tokenDefMap
77
+ parseScript
78
+ end
79
+
80
+
81
+ def inspect
82
+ s = "RegParse: #{@script}"
83
+ s += " start:"+d(@startState)+" end:"+d(@endState)
84
+ return s
85
+ end
86
+
87
+ private
88
+
89
+ # Raise a ParseException, with a helpful message indicating
90
+ # the parser's current location within the string
91
+ #
92
+ def abort(msg)
93
+ # Assume we've already read the problem character
94
+ i = @cursor - 1
95
+ s = ''
96
+ if i > 4
97
+ s += '...'
98
+ end
99
+ s += @script[i-3...i] || ""
100
+ s += ' !!! '
101
+ s += @script[i...i+3] || ""
102
+ if i +3 < @script.size
103
+ s += '...'
104
+ end
105
+ raise ParseException, msg + ": "+s
106
+ end
127
107
 
128
- if "{}[]*?+|-^()".include?(c) or val <= 0x20
129
- abort "Unexpected or unescaped character"
108
+ # Read next character as a hex digit
109
+ #
110
+ def readHex
111
+ v = read.upcase.ord
112
+ if v >= 48 and v < 58
113
+ return v - 48
114
+ elsif v >= 65 and v < 71
115
+ return v - 65 + 10
116
+ else
117
+ abort "Missing hex digit"
118
+ end
130
119
  end
131
-
132
- if c == '\\'
120
+
121
+
122
+ NO_ESCAPE_CHARS = Regexp.new("[A-Za-z0-9]")
123
+
124
+ # Parse character definition (CHARCODE) from input
125
+ #
126
+ def parseChar
133
127
 
134
128
  c = read
135
129
 
136
- if "xX".include? c
137
- val = (readHex() << 4) | readHex()
138
- elsif "uU".include? c
139
- val = (readHex() << 12) | (readHex() << 8) | (readHex() << 4) | readHex()
140
- else
141
- if c == 'f'
142
- val = "\f".ord
143
- elsif c == 'r'
144
- val == "\r".ord
145
- elsif c == 'n'
146
- val = "\n".ord
147
- elsif c == 't'
148
- val = "\t".ord
149
- elsif c == 's'
150
- val = " ".ord
130
+ val = c.ord
131
+
132
+ if "{}[]*?+|-^()".include?(c) or val <= 0x20
133
+ abort "Unexpected or unescaped character"
134
+ end
135
+
136
+ if c == '\\'
137
+
138
+ c = read
139
+
140
+ if "xX".include? c
141
+ val = (readHex() << 4) | readHex()
142
+ elsif "uU".include? c
143
+ val = (readHex() << 12) | (readHex() << 8) | (readHex() << 4) | readHex()
151
144
  else
152
- if c =~ NO_ESCAPE_CHARS
153
- abort "Unsupported escape sequence ("+c+")"
154
- end
155
- val = c.ord
156
- end
145
+ if c == 'f'
146
+ val = "\f".ord
147
+ elsif c == 'r'
148
+ val == "\r".ord
149
+ elsif c == 'n'
150
+ val = "\n".ord
151
+ elsif c == 't'
152
+ val = "\t".ord
153
+ elsif c == 's'
154
+ val = " ".ord
155
+ else
156
+ if c =~ NO_ESCAPE_CHARS
157
+ abort "Unsupported escape sequence ("+c+")"
158
+ end
159
+ val = c.ord
160
+ end
161
+ end
157
162
  end
163
+
164
+ return val
158
165
  end
159
-
160
- return val
161
- end
162
-
163
166
 
164
- def parseCharNFA
165
- val = parseChar
166
-
167
- # Construct a pair of states with an edge between them
168
- # labelled with this character code
169
167
 
170
- sA = newState
171
- sB = newState
172
- cset = CodeSet.new
173
- cset.add(val)
174
- sA.addEdge(cset, sB)
175
- return [sA,sB]
176
- end
177
-
178
-
168
+ def parseCharNFA
169
+ val = parseChar
179
170
 
180
- def dbInfo
181
- j = @cursor
182
- k = j + 5
183
- if k >= @script.size
184
- return @script[j..k]+"<<<== end"
185
- else
186
- return @script[j..k]+"..."
171
+ # Construct a pair of states with an edge between them
172
+ # labelled with this character code
173
+
174
+ sA = newState
175
+ sB = newState
176
+ cset = CodeSet.new
177
+ cset.add(val)
178
+ sA.addEdge(cset, sB)
179
+ return [sA,sB]
187
180
  end
188
- end
189
-
190
- def parseScript
191
- # Set up the input scanner
192
- @cursor = 0
193
181
 
194
- exp = parseE
195
- @startState = exp[0]
196
- @endState = exp[1]
197
- end
198
-
199
- def newState
200
- s = State.new(@nextStateId)
201
- @nextStateId += 1
202
- return s
203
- end
204
-
205
- def parseSET
206
- u = parseChar
207
- v = u+1
208
- if readIf('-')
209
- v = parseChar() + 1
210
- if v <= u
211
- abort "Illegal range"
212
- end
213
- end
214
- return u,v
215
- end
216
-
217
- def parseSETSEQ
218
- db = false
219
-
220
- !db || pr("parseSETSEQ\n")
221
-
222
- read('[')
223
- negated = readIf('^')
224
- !db || pr(" negated=%s\n",negated)
225
182
 
226
- rs = CodeSet.new
183
+
184
+ def dbInfo
185
+ j = @cursor
186
+ k = j + 5
187
+ if k >= @script.size
188
+ return @script[j..k]+"<<<== end"
189
+ else
190
+ return @script[j..k]+"..."
191
+ end
192
+ end
227
193
 
228
- u,v = parseSET
229
- rs.add(u,v)
230
- !db || pr(" initial set=%s\n",d(rs))
231
-
232
- while not readIf(']')
233
- u,v = parseSET
234
- rs.add(u,v)
235
- !db || pr(" added another; %s\n",d(rs))
236
- end
237
- if negated
238
- rs.negate
239
- !db || pr(" negated=%s\n",d(rs))
194
+ def parseScript
195
+ # Set up the input scanner
196
+ @cursor = 0
197
+
198
+ exp = parseE
199
+ @startState = exp[0]
200
+ @endState = exp[1]
240
201
  end
241
-
242
- if rs.empty?
243
- abort "Empty character range"
202
+
203
+ def newState
204
+ s = State.new(@nextStateId)
205
+ @nextStateId += 1
206
+ return s
244
207
  end
245
208
 
246
- sA = newState
247
- sB = newState
248
- sA.addEdge(rs, sB)
249
- return [sA,sB]
250
- end
209
+ def parseSET
210
+ u = parseChar
211
+ v = u+1
212
+ if readIf('-')
213
+ v = parseChar() + 1
214
+ if v <= u
215
+ abort "Illegal range"
216
+ end
217
+ end
218
+ return u,v
219
+ end
251
220
 
252
- TOKENREF_EXPR = Regexp.new('^[_A-Za-z][_A-Za-z0-9]*$')
221
+ def parseSETSEQ
222
+ db = false
223
+
224
+ !db || pr("parseSETSEQ\n")
225
+
226
+ read('[')
227
+ negated = readIf('^')
228
+ !db || pr(" negated=%s\n",negated)
229
+
230
+ rs = CodeSet.new
231
+
232
+ u,v = parseSET
233
+ rs.add(u,v)
234
+ !db || pr(" initial set=%s\n",d(rs))
253
235
 
254
- def parseTokenDef
255
- read('{')
256
- name = ''
257
- while !readIf('}')
258
- name += read
259
- end
260
- # pr("name=[%s], TR=[%s], match=[%s]\n",d(name),d(TOKENREF_EXPR),d(name =~ TOKENREF_EXPR))
261
- if name !~ TOKENREF_EXPR
262
- abort "Problem with token name"
263
- end
264
- tokInfo = nil
265
- if @tokenDefMap
266
- tokInfo = @tokenDefMap[name]
267
- end
268
- if !tokInfo
269
- abort "Undefined token"
236
+ while not readIf(']')
237
+ u,v = parseSET
238
+ rs.add(u,v)
239
+ !db || pr(" added another; %s\n",d(rs))
240
+ end
241
+ if negated
242
+ rs.negate
243
+ !db || pr(" negated=%s\n",d(rs))
244
+ end
245
+
246
+ if rs.empty?
247
+ abort "Empty character range"
248
+ end
249
+
250
+ sA = newState
251
+ sB = newState
252
+ sA.addEdge(rs, sB)
253
+ return [sA,sB]
270
254
  end
271
- rg = tokInfo[1]
272
255
 
273
- oldToNewMap, @nextStateId = rg.startState.duplicateNFA(@nextStateId)
256
+ TOKENREF_EXPR = Regexp.new('^[_A-Za-z][_A-Za-z0-9]*$')
274
257
 
275
- newStart = oldToNewMap[rg.startState]
276
- newEnd = oldToNewMap[rg.endState]
258
+ def parseTokenDef
259
+ read('{')
260
+ name = ''
261
+ while !readIf('}')
262
+ name += read
263
+ end
264
+ # pr("name=[%s], TR=[%s], match=[%s]\n",d(name),d(TOKENREF_EXPR),d(name =~ TOKENREF_EXPR))
265
+ if name !~ TOKENREF_EXPR
266
+ abort "Problem with token name"
267
+ end
268
+ tokInfo = nil
269
+ if @tokenDefMap
270
+ tokInfo = @tokenDefMap[name]
271
+ end
272
+ if !tokInfo
273
+ abort "Undefined token"
274
+ end
275
+ rg = tokInfo[1]
276
+
277
+ oldToNewMap, @nextStateId = rg.startState.duplicateNFA(@nextStateId)
278
+
279
+ newStart = oldToNewMap[rg.startState]
280
+ newEnd = oldToNewMap[rg.endState]
281
+
282
+ [newStart, newEnd]
283
+
284
+
285
+ end
277
286
 
278
- [newStart, newEnd]
279
287
 
280
-
281
- end
282
-
283
-
284
- def parseP
285
- ch = peek
286
- if ch == '('
287
- read
288
- e1 = parseE
289
- read ')'
290
- elsif ch == '{'
291
- e1 = parseTokenDef
292
- elsif ch == '['
293
- e1 = parseSETSEQ
294
- else
295
- e1 = parseCharNFA
288
+ def parseP
289
+ ch = peek
290
+ if ch == '('
291
+ read
292
+ e1 = parseE
293
+ read ')'
294
+ elsif ch == '{'
295
+ e1 = parseTokenDef
296
+ elsif ch == '['
297
+ e1 = parseSETSEQ
298
+ else
299
+ e1 = parseCharNFA
300
+ end
301
+ return e1
302
+ end
303
+
304
+
305
+ def parseE
306
+ e1 = parseJ
307
+ if readIf('|')
308
+ e2 = parseE
309
+
310
+ u = newState
311
+ v = newState
312
+ u.addEps(e1[0])
313
+ u.addEps(e2[0])
314
+ e1[1].addEps(v)
315
+ e2[1].addEps(v)
316
+ e1 = [u,v]
317
+ end
318
+ return e1
296
319
  end
297
- return e1
298
- end
299
320
 
300
-
301
- def parseE
302
- e1 = parseJ
303
- if readIf('|')
304
- e2 = parseE
321
+ def parseJ
322
+ e1 = parseQ
323
+ p = peek
324
+ if p and not "|)".include? p
325
+ e2 = parseJ
326
+ e1[1].addEps(e2[0])
327
+ e1 = [e1[0],e2[1]]
328
+ end
305
329
 
306
- u = newState
307
- v = newState
308
- u.addEps(e1[0])
309
- u.addEps(e2[0])
310
- e1[1].addEps(v)
311
- e2[1].addEps(v)
312
- e1 = [u,v]
313
- end
314
- return e1
315
- end
316
-
317
- def parseJ
318
- e1 = parseQ
319
- p = peek
320
- if p and not "|)".include? p
321
- e2 = parseJ
322
- e1[1].addEps(e2[0])
323
- e1 = [e1[0],e2[1]]
330
+ return e1
324
331
  end
325
332
 
326
- return e1
327
- end
328
-
329
- def parseQ
330
- e1 = parseP
331
- p = peek
333
+ def parseQ
334
+ e1 = parseP
335
+ p = peek
336
+
337
+ if p == '*'
338
+ read
339
+ e1[0].addEps(e1[1])
340
+ e1[1].addEps(e1[0])
341
+ elsif p == '+'
342
+ read
343
+ e1[1].addEps(e1[0])
344
+ elsif p == '?'
345
+ read
346
+ e1[0].addEps(e1[1])
347
+ # e1[0].generatePDF("optional")
348
+ end
349
+ return e1
350
+ end
332
351
 
333
- if p == '*'
334
- read
335
- e1[0].addEps(e1[1])
336
- e1[1].addEps(e1[0])
337
- elsif p == '+'
338
- read
339
- e1[1].addEps(e1[0])
340
- elsif p == '?'
341
- read
342
- e1[0].addEps(e1[1])
343
- # e1[0].generatePDF("optional")
352
+
353
+ def peek(mustExist = false)
354
+ # skip over any non-linefeed whitespace
355
+ while @cursor < @script.size && " \t".index(@script[@cursor])
356
+ @cursor += 1
357
+ end
358
+ if mustExist or @cursor < @script.size
359
+ @script[@cursor]
360
+ else
361
+ nil
362
+ end
344
363
  end
345
- return e1
346
- end
347
-
348
364
 
349
- def peek(mustExist = false)
350
- # skip over any non-linefeed whitespace
351
- while @cursor < @script.size && " \t".index(@script[@cursor])
352
- @cursor += 1
365
+ def readIf(expChar)
366
+ r = (peek == expChar)
367
+ if r
368
+ read
369
+ end
370
+ return r
353
371
  end
354
- if mustExist or @cursor < @script.size
355
- @script[@cursor]
356
- else
357
- nil
372
+
373
+ def read(expChar = nil)
374
+ ch = peek
375
+ if ch and ((not expChar) or ch == expChar)
376
+ @cursor += 1
377
+ ch
378
+ else
379
+ abort 'Unexpected end of input'
380
+ end
358
381
  end
359
382
  end
360
383
 
361
- def readIf(expChar)
362
- r = (peek == expChar)
363
- if r
364
- read
365
- end
366
- return r
367
- end
368
-
369
- def read(expChar = nil)
370
- ch = peek
371
- if ch and ((not expChar) or ch == expChar)
372
- @cursor += 1
373
- ch
374
- else
375
- abort 'Unexpected end of input'
376
- end
377
- end
378
- end
379
-
384
+ end # module ToknInternal