tokn 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.txt +4 -5
- data/bin/tokncompile +1 -1
- data/bin/toknprocess +10 -4
- data/lib/tokn/code_set.rb +332 -337
- data/lib/tokn/dfa.rb +187 -162
- data/lib/tokn/dfa_builder.rb +218 -220
- data/lib/tokn/range_partition.rb +205 -203
- data/lib/tokn/reg_parse.rb +336 -331
- data/lib/tokn/state.rb +267 -270
- data/lib/tokn/token_defn_parser.rb +144 -139
- data/lib/tokn/tokenizer.rb +243 -175
- data/lib/tokn/tokn_const.rb +11 -6
- data/lib/tokn/tools.rb +42 -20
- data/test/Example1.rb +50 -0
- data/test/data/compileddfa.txt +1 -0
- data/test/data/sampletext.txt +6 -1
- data/test/test.rb +17 -12
- metadata +7 -6
- data/test/simple.rb +0 -33
data/lib/tokn/range_partition.rb
CHANGED
@@ -1,233 +1,235 @@
|
|
1
1
|
require_relative 'tools'
|
2
2
|
req('tokn_const code_set')
|
3
3
|
|
4
|
-
|
5
|
-
# A data structure that transforms a set of CodeSets to a
|
6
|
-
# disjoint set of them, such that no two range sets overlap.
|
7
|
-
#
|
8
|
-
# This is improve the efficiency of the NFA => DFA algorithm,
|
9
|
-
# which involves gathering information about what states are
|
10
|
-
# reachable on certain characters. We can't afford to treat each
|
11
|
-
# character as a singleton, since the ranges can be quite large.
|
12
|
-
# Hence, we want to treat ranges of characters as single entities;
|
13
|
-
# this will only work if no two such ranges overlap.
|
14
|
-
#
|
15
|
-
# It works by starting with a tree whose node is labelled with
|
16
|
-
# the maximal superset of character values. Then, for each edge
|
17
|
-
# in the NFA, performs a DFS on this tree, splitting any node that
|
18
|
-
# only partially intersects any one set that appears in the edge label.
|
19
|
-
# The running time is O(n log k), where n is the size of the NFA, and
|
20
|
-
# k is the height of the resulting tree.
|
21
|
-
#
|
22
|
-
# We encourage k to be small by sorting the NFA edges by their
|
23
|
-
# label complexity.
|
24
|
-
#
|
25
|
-
class RangePartition
|
26
|
-
include Tokn
|
4
|
+
module ToknInternal
|
27
5
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
6
|
+
# A data structure that transforms a set of CodeSets to a
|
7
|
+
# disjoint set of them, such that no two range sets overlap.
|
8
|
+
#
|
9
|
+
# This is to improve the efficiency of the NFA => DFA algorithm,
|
10
|
+
# which involves gathering information about what states are
|
11
|
+
# reachable on certain characters. We can't afford to treat each
|
12
|
+
# character as a singleton, since the ranges can be quite large.
|
13
|
+
# Hence, we want to treat ranges of characters as single entities;
|
14
|
+
# this will only work if no two such ranges overlap.
|
15
|
+
#
|
16
|
+
# It works by starting with a tree whose root node is labelled with
|
17
|
+
# the maximal superset of character values. Then, for each edge
|
18
|
+
# in the NFA, performs a DFS on this tree, splitting any node that
|
19
|
+
# only partially intersects any one set that appears in the edge label.
|
20
|
+
# The running time is O(n log k), where n is the size of the NFA, and
|
21
|
+
# k is the height of the resulting tree.
|
22
|
+
#
|
23
|
+
# We encourage k to be small by sorting the NFA edges by their
|
24
|
+
# label complexity.
|
25
|
+
#
|
26
|
+
class RangePartition
|
27
|
+
# include Tokn
|
28
|
+
|
29
|
+
def initialize()
|
30
|
+
# We will build a tree, where each node has a CodeSet
|
31
|
+
# associated with it, and the child nodes (if present)
|
32
|
+
# partition this CodeSet into smaller, nonempty sets.
|
33
|
+
|
34
|
+
# A tree is represented by a node, where each node is a pair [x,y],
|
35
|
+
# with x the node's CodeSet, and y a list of the node's children.
|
57
36
|
|
58
|
-
|
59
|
-
|
60
|
-
|
37
|
+
@nextNodeId = 0
|
38
|
+
|
39
|
+
# Make the root node hold the largest possible CodeSet.
|
40
|
+
# We want to be able to include all the token ids as well.
|
41
|
+
|
42
|
+
@rootNode = buildNode(CodeSet.new(CODEMIN,CODEMAX))
|
43
|
+
|
44
|
+
@setsToAdd = Set.new
|
45
|
+
|
46
|
+
# Add epsilon immediately, so it's always in its own subset
|
47
|
+
addSet(CodeSet.new(EPSILON))
|
48
|
+
|
49
|
+
@prepared = false
|
61
50
|
end
|
62
|
-
|
63
|
-
# Construct partition from previously added sets
|
64
|
-
|
65
|
-
list = @setsToAdd.to_a
|
66
|
-
|
67
|
-
# Sort set by cardinality: probably get a more balanced tree
|
68
|
-
# if larger sets are processed first
|
69
|
-
list.sort!{ |x,y| y.cardinality <=> x.cardinality }
|
70
|
-
|
71
|
-
list.each do |s|
|
72
|
-
addSetAux(s)
|
73
|
-
end
|
74
|
-
|
75
|
-
@prepared = true
|
76
|
-
end
|
77
|
-
|
78
51
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
if !@prepared
|
83
|
-
raise IllegalStateException
|
84
|
-
end
|
85
|
-
|
86
|
-
g = ""
|
87
|
-
g += "digraph "+name+" {\n\n"
|
88
|
-
|
89
|
-
nodes = []
|
90
|
-
buildNodeList(nodes)
|
91
|
-
nodes.each do |node|
|
92
|
-
g += " '" + d(node) + "' [shape=rect] [label='" + node.set.to_s_alt + "']\n"
|
93
|
-
end
|
94
|
-
|
95
|
-
g += "\n"
|
96
|
-
nodes.each do |node|
|
97
|
-
node.children.each do |ch|
|
98
|
-
g += " '" + d(node) + "' -> '" + d(ch) + "'\n"
|
52
|
+
def addSet(s)
|
53
|
+
if @prepared
|
54
|
+
raise IllegalStateException
|
99
55
|
end
|
56
|
+
@setsToAdd.add(s)
|
100
57
|
end
|
101
|
-
|
102
|
-
g += "\n}\n"
|
103
|
-
g.gsub!( /'/, '"' )
|
104
58
|
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
59
|
+
def prepare()
|
60
|
+
if @prepared
|
61
|
+
raise IllegalStateException
|
62
|
+
end
|
63
|
+
|
64
|
+
# Construct partition from previously added sets
|
65
|
+
|
66
|
+
list = @setsToAdd.to_a
|
67
|
+
|
68
|
+
# Sort set by cardinality: probably get a more balanced tree
|
69
|
+
# if larger sets are processed first
|
70
|
+
list.sort!{ |x,y| y.cardinality <=> x.cardinality }
|
71
|
+
|
72
|
+
list.each do |s|
|
73
|
+
addSetAux(s)
|
74
|
+
end
|
75
|
+
|
76
|
+
@prepared = true
|
119
77
|
end
|
120
78
|
|
121
|
-
list = []
|
122
|
-
s2 = s.makeCopy
|
123
|
-
applyAux(@rootNode, s2, list)
|
124
|
-
|
125
|
-
# Sort the list of subsets by their first elements
|
126
|
-
list.sort! { |x,y| x.array[0] <=> y.array[0] }
|
127
79
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
if s1.empty?
|
149
|
-
next
|
150
|
-
end
|
151
|
-
|
152
|
-
applyAux(m, s1, list)
|
153
|
-
|
154
|
-
!db||pr(" subtracting child set [%s] from s=[%s]\n",d(m.set),d(s))
|
155
|
-
s = s.difference(m.set)
|
156
|
-
!db||pr(" subtracted child set, now [%s]\n",d(s))
|
157
|
-
if s.empty?
|
158
|
-
break
|
80
|
+
# Generate a .dot file, and from that, a PDF, for debug purposes
|
81
|
+
#
|
82
|
+
def generatePDF(name = "partition")
|
83
|
+
if !@prepared
|
84
|
+
raise IllegalStateException
|
85
|
+
end
|
86
|
+
|
87
|
+
g = ""
|
88
|
+
g += "digraph "+name+" {\n\n"
|
89
|
+
|
90
|
+
nodes = []
|
91
|
+
buildNodeList(nodes)
|
92
|
+
nodes.each do |node|
|
93
|
+
g += " '" + d(node) + "' [shape=rect] [label='" + node.set.to_s_alt + "']\n"
|
94
|
+
end
|
95
|
+
|
96
|
+
g += "\n"
|
97
|
+
nodes.each do |node|
|
98
|
+
node.children.each do |ch|
|
99
|
+
g += " '" + d(node) + "' -> '" + d(ch) + "'\n"
|
159
100
|
end
|
160
101
|
end
|
161
|
-
|
162
|
-
|
102
|
+
|
103
|
+
g += "\n}\n"
|
104
|
+
g.gsub!( /'/, '"' )
|
105
|
+
|
106
|
+
dotToPDF(g,name)
|
163
107
|
|
164
|
-
def buildNode(rangeSet)
|
165
|
-
id = @nextNodeId
|
166
|
-
@nextNodeId += 1
|
167
|
-
n = RPNode.new(id, rangeSet, [])
|
168
|
-
n
|
169
|
-
end
|
170
|
-
|
171
|
-
def buildNodeList(list, root = nil)
|
172
|
-
if not root
|
173
|
-
root = @rootNode
|
174
|
-
end
|
175
|
-
list.push(root)
|
176
|
-
root.children.each do |x|
|
177
|
-
buildNodeList(list, x)
|
178
108
|
end
|
179
|
-
end
|
180
109
|
|
181
|
-
|
182
|
-
|
183
|
-
#
|
184
|
-
def addSetAux(s, n = @rootNode)
|
185
|
-
#
|
186
|
-
# The algorithm is this:
|
110
|
+
|
111
|
+
# Apply the partition to a CodeSet
|
187
112
|
#
|
188
|
-
#
|
189
|
-
#
|
190
|
-
# if
|
191
|
-
# x = n.set - s
|
192
|
-
# add x,y as child sets of n
|
193
|
-
# else
|
194
|
-
# for each child m of n:
|
195
|
-
# t = intersect of m.set and s
|
196
|
-
# if t is nonempty, add(t, m)
|
113
|
+
# > s CodeSet
|
114
|
+
# < array of subsets from the partition whose union equals s
|
115
|
+
# (this array will be the single element s if no partitioning was necessary)
|
197
116
|
#
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
if n.children.empty?
|
202
|
-
x = n.set.difference(s)
|
203
|
-
n.children.push buildNode(x)
|
204
|
-
n.children.push buildNode(s)
|
205
|
-
else
|
206
|
-
n.children.each do |m|
|
207
|
-
t = m.set.intersect(s)
|
208
|
-
addSetAux(t,m) unless t.empty?
|
117
|
+
def apply(s)
|
118
|
+
if !@prepared
|
119
|
+
raise IllegalStateException
|
209
120
|
end
|
121
|
+
|
122
|
+
list = []
|
123
|
+
s2 = s.makeCopy
|
124
|
+
applyAux(@rootNode, s2, list)
|
125
|
+
|
126
|
+
# Sort the list of subsets by their first elements
|
127
|
+
list.sort! { |x,y| x.array[0] <=> y.array[0] }
|
128
|
+
|
129
|
+
list
|
210
130
|
end
|
211
|
-
end
|
212
131
|
|
213
|
-
end
|
214
|
-
|
215
|
-
# A node within a RangePartition tree
|
216
|
-
#
|
217
|
-
class RPNode
|
218
132
|
|
219
|
-
|
133
|
+
private
|
220
134
|
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
135
|
+
def applyAux(n, s, list)
|
136
|
+
db = false
|
137
|
+
|
138
|
+
!db||pr("applyAux to set[%s], node=[%s]\n",d(s),d(n.set))
|
139
|
+
|
140
|
+
if n.children.empty?
|
141
|
+
# # Verify that this set equals the input set
|
142
|
+
# myAssert(s.eql? n.set)
|
143
|
+
list.push(s)
|
144
|
+
else
|
145
|
+
n.children.each do |m|
|
146
|
+
s1 = s.intersect(m.set)
|
147
|
+
!db||pr(" child set=[%s], intersection=[%s]\n",d(m.set),d(s1))
|
148
|
+
|
149
|
+
if s1.empty?
|
150
|
+
next
|
151
|
+
end
|
152
|
+
|
153
|
+
applyAux(m, s1, list)
|
154
|
+
|
155
|
+
!db||pr(" subtracting child set [%s] from s=[%s]\n",d(m.set),d(s))
|
156
|
+
s = s.difference(m.set)
|
157
|
+
!db||pr(" subtracted child set, now [%s]\n",d(s))
|
158
|
+
if s.empty?
|
159
|
+
break
|
160
|
+
end
|
161
|
+
end
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
def buildNode(rangeSet)
|
166
|
+
id = @nextNodeId
|
167
|
+
@nextNodeId += 1
|
168
|
+
n = RPNode.new(id, rangeSet, [])
|
169
|
+
n
|
170
|
+
end
|
226
171
|
|
227
|
-
|
228
|
-
|
172
|
+
def buildNodeList(list, root = nil)
|
173
|
+
if not root
|
174
|
+
root = @rootNode
|
175
|
+
end
|
176
|
+
list.push(root)
|
177
|
+
root.children.each do |x|
|
178
|
+
buildNodeList(list, x)
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
# Add a set to the tree, extending the tree as necessary to
|
183
|
+
# maintain a (disjoint) partition
|
184
|
+
#
|
185
|
+
def addSetAux(s, n = @rootNode)
|
186
|
+
#
|
187
|
+
# The algorithm is this:
|
188
|
+
#
|
189
|
+
# add (s, n) # add set s to node n; s must be subset of n.set
|
190
|
+
# if n.set = s, return
|
191
|
+
# if n is leaf:
|
192
|
+
# x = n.set - s
|
193
|
+
# add x,s as child sets of n
|
194
|
+
# else
|
195
|
+
# for each child m of n:
|
196
|
+
# t = intersect of m.set and s
|
197
|
+
# if t is nonempty, add(t, m)
|
198
|
+
#
|
199
|
+
if n.set.eql? s
|
200
|
+
return
|
201
|
+
end
|
202
|
+
if n.children.empty?
|
203
|
+
x = n.set.difference(s)
|
204
|
+
n.children.push buildNode(x)
|
205
|
+
n.children.push buildNode(s)
|
206
|
+
else
|
207
|
+
n.children.each do |m|
|
208
|
+
t = m.set.intersect(s)
|
209
|
+
addSetAux(t,m) unless t.empty?
|
210
|
+
end
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
229
214
|
end
|
230
215
|
|
231
|
-
|
216
|
+
# A node within a RangePartition tree
|
217
|
+
#
|
218
|
+
class RPNode
|
219
|
+
|
220
|
+
attr_accessor :id, :set, :children
|
221
|
+
|
222
|
+
def initialize(id, set, children)
|
223
|
+
@id = id
|
224
|
+
@set = set
|
225
|
+
@children = children
|
226
|
+
end
|
227
|
+
|
228
|
+
def inspect
|
229
|
+
return 'N' + id.to_s
|
230
|
+
end
|
231
|
+
|
232
|
+
end
|
232
233
|
|
234
|
+
end # module ToknInternal
|
233
235
|
|
data/lib/tokn/reg_parse.rb
CHANGED
@@ -1,379 +1,384 @@
|
|
1
1
|
require_relative 'tools'
|
2
2
|
req('code_set state')
|
3
3
|
|
4
|
-
|
5
|
-
end
|
6
|
-
|
7
|
-
# Parses a single regular expression from a string.
|
8
|
-
# Produces an NFA with distinguished start and end states
|
9
|
-
# (none of these states are marked as final states)
|
10
|
-
#
|
11
|
-
# Here is the grammar for regular expressions. Spaces are ignored,
|
12
|
-
# and can be liberally sprinkled within the regular expressions to
|
13
|
-
# aid readability. To represent a space, the \s escape sequence must be used.
|
14
|
-
# See the file 'sampletokens.txt' for some examples.
|
15
|
-
#
|
16
|
-
# Expressions have one of these types:
|
17
|
-
#
|
18
|
-
# E : base class
|
19
|
-
# J : a Join expression, formed by concatenating one or more together
|
20
|
-
# Q : a Quantified expression; followed optionally by '*', '+', or '?'
|
21
|
-
# P : a Parenthesized expression, which is optionally surrounded with (), {}, []
|
22
|
-
#
|
23
|
-
# E -> J '|' E
|
24
|
-
# | J
|
25
|
-
#
|
26
|
-
# J -> Q J
|
27
|
-
# | Q
|
28
|
-
#
|
29
|
-
# Q -> P '*'
|
30
|
-
# | P '+'
|
31
|
-
# | P '?'
|
32
|
-
# | P
|
33
|
-
#
|
34
|
-
# P -> '(' E ')'
|
35
|
-
# | '{' TOKENNAME '}'
|
36
|
-
# | '[^' SETSEQ ']' A code not appearing in the set
|
37
|
-
# | '[' SETSEQ ']'
|
38
|
-
# | CHARCODE
|
39
|
-
#
|
40
|
-
# SETSEQ -> SET SETSEQ
|
41
|
-
# | SET
|
42
|
-
#
|
43
|
-
# SET -> CHARCODE
|
44
|
-
# | CHARCODE '-' CHARCODE
|
45
|
-
#
|
46
|
-
# CHARCODE ->
|
47
|
-
# a | b | c ... any printable except {,},[, etc.
|
48
|
-
# | \xhh hex value from 00...ff
|
49
|
-
# | \uhhhh hex value from 0000...ffff (e.g., unicode)
|
50
|
-
# | \f | \n | \r | \t formfeed, linefeed, return, tab
|
51
|
-
# | \s a space (' ')
|
52
|
-
# | \* where * is some other non-alphabetic
|
53
|
-
# character that needs to be escaped
|
54
|
-
#
|
55
|
-
# The parser performs recursive descent parsing;
|
56
|
-
# each method returns an NFA represented by
|
57
|
-
# a pair of states: the start and end states.
|
58
|
-
#
|
59
|
-
class RegParse
|
60
|
-
|
61
|
-
attr_reader :startState, :endState
|
4
|
+
module ToknInternal
|
62
5
|
|
63
|
-
#
|
64
|
-
# @param script script to parse
|
65
|
-
# @param tokenDefMap if not nil, a map of previously parsed regular expressions
|
66
|
-
# (mapping names to ids) to be consulted if a curly brace expression appears
|
67
|
-
# in the script
|
68
|
-
#
|
69
|
-
def initialize(script, tokenDefMap = nil)
|
70
|
-
@script = script.strip
|
71
|
-
@nextStateId = 0
|
72
|
-
@tokenDefMap = tokenDefMap
|
73
|
-
parseScript
|
74
|
-
end
|
75
|
-
|
76
|
-
|
77
|
-
def inspect
|
78
|
-
s = "RegParse: #{@script}"
|
79
|
-
s += " start:"+d(@startState)+" end:"+d(@endState)
|
80
|
-
return s
|
81
|
-
end
|
82
|
-
|
83
|
-
private
|
84
|
-
|
85
|
-
# Raise a ParseException, with a helpful message indicating
|
86
|
-
# the parser's current location within the string
|
6
|
+
# Exception thrown if there is a problem parsing a regular expression
|
87
7
|
#
|
88
|
-
|
89
|
-
# Assume we've already read the problem character
|
90
|
-
i = @cursor - 1
|
91
|
-
s = ''
|
92
|
-
if i > 4
|
93
|
-
s += '...'
|
94
|
-
end
|
95
|
-
s += @script[i-3...i] || ""
|
96
|
-
s += ' !!! '
|
97
|
-
s += @script[i...i+3] || ""
|
98
|
-
if i +3 < @script.size
|
99
|
-
s += '...'
|
100
|
-
end
|
101
|
-
raise ParseException, msg + ": "+s
|
8
|
+
class ParseException < Exception
|
102
9
|
end
|
103
10
|
|
104
|
-
#
|
11
|
+
# Parses a single regular expression from a string.
|
12
|
+
# Produces an NFA with distinguished start and end states
|
13
|
+
# (none of these states are marked as final states)
|
105
14
|
#
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
#
|
15
|
+
# Here is the grammar for regular expressions. Spaces are ignored,
|
16
|
+
# and can be liberally sprinkled within the regular expressions to
|
17
|
+
# aid readability. To represent a space, the \s escape sequence must be used.
|
18
|
+
# See the file 'sampletokens.txt' for some examples.
|
19
|
+
#
|
20
|
+
# Expressions have one of these types:
|
21
|
+
#
|
22
|
+
# E : base class
|
23
|
+
# J : a Join expression, formed by concatenating one or more together
|
24
|
+
# Q : a Quantified expression; followed optionally by '*', '+', or '?'
|
25
|
+
# P : a Parenthesized expression, which is optionally surrounded with (), {}, []
|
26
|
+
#
|
27
|
+
# E -> J '|' E
|
28
|
+
# | J
|
29
|
+
#
|
30
|
+
# J -> Q J
|
31
|
+
# | Q
|
32
|
+
#
|
33
|
+
# Q -> P '*'
|
34
|
+
# | P '+'
|
35
|
+
# | P '?'
|
36
|
+
# | P
|
37
|
+
#
|
38
|
+
# P -> '(' E ')'
|
39
|
+
# | '{' TOKENNAME '}'
|
40
|
+
# | '[^' SETSEQ ']' A code not appearing in the set
|
41
|
+
# | '[' SETSEQ ']'
|
42
|
+
# | CHARCODE
|
43
|
+
#
|
44
|
+
# SETSEQ -> SET SETSEQ
|
45
|
+
# | SET
|
46
|
+
#
|
47
|
+
# SET -> CHARCODE
|
48
|
+
# | CHARCODE '-' CHARCODE
|
121
49
|
#
|
122
|
-
|
50
|
+
# CHARCODE ->
|
51
|
+
# a | b | c ... any printable except {,},[, etc.
|
52
|
+
# | \xhh hex value from 00...ff
|
53
|
+
# | \uhhhh hex value from 0000...ffff (e.g., unicode)
|
54
|
+
# | \f | \n | \r | \t formfeed, linefeed, return, tab
|
55
|
+
# | \s a space (' ')
|
56
|
+
# | \* where * is some other non-alphabetic
|
57
|
+
# character that needs to be escaped
|
58
|
+
#
|
59
|
+
# The parser performs recursive descent parsing;
|
60
|
+
# each method returns an NFA represented by
|
61
|
+
# a pair of states: the start and end states.
|
62
|
+
#
|
63
|
+
class RegParse
|
123
64
|
|
124
|
-
|
65
|
+
attr_reader :startState, :endState
|
125
66
|
|
126
|
-
|
67
|
+
# Construct a parser and perform the parsing
|
68
|
+
# @param script script to parse
|
69
|
+
# @param tokenDefMap if not nil, a map of previously parsed regular expressions
|
70
|
+
# (mapping names to ids) to be consulted if a curly brace expression appears
|
71
|
+
# in the script
|
72
|
+
#
|
73
|
+
def initialize(script, tokenDefMap = nil)
|
74
|
+
@script = script.strip
|
75
|
+
@nextStateId = 0
|
76
|
+
@tokenDefMap = tokenDefMap
|
77
|
+
parseScript
|
78
|
+
end
|
79
|
+
|
80
|
+
|
81
|
+
def inspect
|
82
|
+
s = "RegParse: #{@script}"
|
83
|
+
s += " start:"+d(@startState)+" end:"+d(@endState)
|
84
|
+
return s
|
85
|
+
end
|
86
|
+
|
87
|
+
private
|
88
|
+
|
89
|
+
# Raise a ParseException, with a helpful message indicating
|
90
|
+
# the parser's current location within the string
|
91
|
+
#
|
92
|
+
def abort(msg)
|
93
|
+
# Assume we've already read the problem character
|
94
|
+
i = @cursor - 1
|
95
|
+
s = ''
|
96
|
+
if i > 4
|
97
|
+
s += '...'
|
98
|
+
end
|
99
|
+
s += @script[i-3...i] || ""
|
100
|
+
s += ' !!! '
|
101
|
+
s += @script[i...i+3] || ""
|
102
|
+
if i +3 < @script.size
|
103
|
+
s += '...'
|
104
|
+
end
|
105
|
+
raise ParseException, msg + ": "+s
|
106
|
+
end
|
127
107
|
|
128
|
-
|
129
|
-
|
108
|
+
# Read next character as a hex digit
|
109
|
+
#
|
110
|
+
def readHex
|
111
|
+
v = read.upcase.ord
|
112
|
+
if v >= 48 and v < 58
|
113
|
+
return v - 48
|
114
|
+
elsif v >= 65 and v < 71
|
115
|
+
return v - 65 + 10
|
116
|
+
else
|
117
|
+
abort "Missing hex digit"
|
118
|
+
end
|
130
119
|
end
|
131
|
-
|
132
|
-
|
120
|
+
|
121
|
+
|
122
|
+
NO_ESCAPE_CHARS = Regexp.new("[A-Za-z0-9]")
|
123
|
+
|
124
|
+
# Parse character definition (CHARCODE) from input
|
125
|
+
#
|
126
|
+
def parseChar
|
133
127
|
|
134
128
|
c = read
|
135
129
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
val = " ".ord
|
130
|
+
val = c.ord
|
131
|
+
|
132
|
+
if "{}[]*?+|-^()".include?(c) or val <= 0x20
|
133
|
+
abort "Unexpected or unescaped character"
|
134
|
+
end
|
135
|
+
|
136
|
+
if c == '\\'
|
137
|
+
|
138
|
+
c = read
|
139
|
+
|
140
|
+
if "xX".include? c
|
141
|
+
val = (readHex() << 4) | readHex()
|
142
|
+
elsif "uU".include? c
|
143
|
+
val = (readHex() << 12) | (readHex() << 8) | (readHex() << 4) | readHex()
|
151
144
|
else
|
152
|
-
if c
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
145
|
+
if c == 'f'
|
146
|
+
val = "\f".ord
|
147
|
+
elsif c == 'r'
|
148
|
+
val = "\r".ord # assign (not compare) the carriage-return code for the \r escape
|
149
|
+
elsif c == 'n'
|
150
|
+
val = "\n".ord
|
151
|
+
elsif c == 't'
|
152
|
+
val = "\t".ord
|
153
|
+
elsif c == 's'
|
154
|
+
val = " ".ord
|
155
|
+
else
|
156
|
+
if c =~ NO_ESCAPE_CHARS
|
157
|
+
abort "Unsupported escape sequence ("+c+")"
|
158
|
+
end
|
159
|
+
val = c.ord
|
160
|
+
end
|
161
|
+
end
|
157
162
|
end
|
163
|
+
|
164
|
+
return val
|
158
165
|
end
|
159
|
-
|
160
|
-
return val
|
161
|
-
end
|
162
|
-
|
163
166
|
|
164
|
-
def parseCharNFA
|
165
|
-
val = parseChar
|
166
|
-
|
167
|
-
# Construct a pair of states with an edge between them
|
168
|
-
# labelled with this character code
|
169
167
|
|
170
|
-
|
171
|
-
|
172
|
-
cset = CodeSet.new
|
173
|
-
cset.add(val)
|
174
|
-
sA.addEdge(cset, sB)
|
175
|
-
return [sA,sB]
|
176
|
-
end
|
177
|
-
|
178
|
-
|
168
|
+
def parseCharNFA
|
169
|
+
val = parseChar
|
179
170
|
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
171
|
+
# Construct a pair of states with an edge between them
|
172
|
+
# labelled with this character code
|
173
|
+
|
174
|
+
sA = newState
|
175
|
+
sB = newState
|
176
|
+
cset = CodeSet.new
|
177
|
+
cset.add(val)
|
178
|
+
sA.addEdge(cset, sB)
|
179
|
+
return [sA,sB]
|
187
180
|
end
|
188
|
-
end
|
189
|
-
|
190
|
-
def parseScript
|
191
|
-
# Set up the input scanner
|
192
|
-
@cursor = 0
|
193
181
|
|
194
|
-
exp = parseE
|
195
|
-
@startState = exp[0]
|
196
|
-
@endState = exp[1]
|
197
|
-
end
|
198
|
-
|
199
|
-
def newState
|
200
|
-
s = State.new(@nextStateId)
|
201
|
-
@nextStateId += 1
|
202
|
-
return s
|
203
|
-
end
|
204
|
-
|
205
|
-
def parseSET
|
206
|
-
u = parseChar
|
207
|
-
v = u+1
|
208
|
-
if readIf('-')
|
209
|
-
v = parseChar() + 1
|
210
|
-
if v <= u
|
211
|
-
abort "Illegal range"
|
212
|
-
end
|
213
|
-
end
|
214
|
-
return u,v
|
215
|
-
end
|
216
|
-
|
217
|
-
def parseSETSEQ
|
218
|
-
db = false
|
219
|
-
|
220
|
-
!db || pr("parseSETSEQ\n")
|
221
|
-
|
222
|
-
read('[')
|
223
|
-
negated = readIf('^')
|
224
|
-
!db || pr(" negated=%s\n",negated)
|
225
182
|
|
226
|
-
|
183
|
+
|
184
|
+
def dbInfo
|
185
|
+
j = @cursor
|
186
|
+
k = j + 5
|
187
|
+
if k >= @script.size
|
188
|
+
return @script[j..k]+"<<<== end"
|
189
|
+
else
|
190
|
+
return @script[j..k]+"..."
|
191
|
+
end
|
192
|
+
end
|
227
193
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
!db || pr(" added another; %s\n",d(rs))
|
236
|
-
end
|
237
|
-
if negated
|
238
|
-
rs.negate
|
239
|
-
!db || pr(" negated=%s\n",d(rs))
|
194
|
+
def parseScript
|
195
|
+
# Set up the input scanner
|
196
|
+
@cursor = 0
|
197
|
+
|
198
|
+
exp = parseE
|
199
|
+
@startState = exp[0]
|
200
|
+
@endState = exp[1]
|
240
201
|
end
|
241
|
-
|
242
|
-
|
243
|
-
|
202
|
+
|
203
|
+
def newState
|
204
|
+
s = State.new(@nextStateId)
|
205
|
+
@nextStateId += 1
|
206
|
+
return s
|
244
207
|
end
|
245
208
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
209
|
+
def parseSET
|
210
|
+
u = parseChar
|
211
|
+
v = u+1
|
212
|
+
if readIf('-')
|
213
|
+
v = parseChar() + 1
|
214
|
+
if v <= u
|
215
|
+
abort "Illegal range"
|
216
|
+
end
|
217
|
+
end
|
218
|
+
return u,v
|
219
|
+
end
|
251
220
|
|
252
|
-
|
221
|
+
def parseSETSEQ
|
222
|
+
db = false
|
223
|
+
|
224
|
+
!db || pr("parseSETSEQ\n")
|
225
|
+
|
226
|
+
read('[')
|
227
|
+
negated = readIf('^')
|
228
|
+
!db || pr(" negated=%s\n",negated)
|
229
|
+
|
230
|
+
rs = CodeSet.new
|
231
|
+
|
232
|
+
u,v = parseSET
|
233
|
+
rs.add(u,v)
|
234
|
+
!db || pr(" initial set=%s\n",d(rs))
|
253
235
|
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
236
|
+
while not readIf(']')
|
237
|
+
u,v = parseSET
|
238
|
+
rs.add(u,v)
|
239
|
+
!db || pr(" added another; %s\n",d(rs))
|
240
|
+
end
|
241
|
+
if negated
|
242
|
+
rs.negate
|
243
|
+
!db || pr(" negated=%s\n",d(rs))
|
244
|
+
end
|
245
|
+
|
246
|
+
if rs.empty?
|
247
|
+
abort "Empty character range"
|
248
|
+
end
|
249
|
+
|
250
|
+
sA = newState
|
251
|
+
sB = newState
|
252
|
+
sA.addEdge(rs, sB)
|
253
|
+
return [sA,sB]
|
270
254
|
end
|
271
|
-
rg = tokInfo[1]
|
272
255
|
|
273
|
-
|
256
|
+
TOKENREF_EXPR = Regexp.new('^[_A-Za-z][_A-Za-z0-9]*$')
|
274
257
|
|
275
|
-
|
276
|
-
|
258
|
+
def parseTokenDef
|
259
|
+
read('{')
|
260
|
+
name = ''
|
261
|
+
while !readIf('}')
|
262
|
+
name += read
|
263
|
+
end
|
264
|
+
# pr("name=[%s], TR=[%s], match=[%s]\n",d(name),d(TOKENREF_EXPR),d(name =~ TOKENREF_EXPR))
|
265
|
+
if name !~ TOKENREF_EXPR
|
266
|
+
abort "Problem with token name"
|
267
|
+
end
|
268
|
+
tokInfo = nil
|
269
|
+
if @tokenDefMap
|
270
|
+
tokInfo = @tokenDefMap[name]
|
271
|
+
end
|
272
|
+
if !tokInfo
|
273
|
+
abort "Undefined token"
|
274
|
+
end
|
275
|
+
rg = tokInfo[1]
|
276
|
+
|
277
|
+
oldToNewMap, @nextStateId = rg.startState.duplicateNFA(@nextStateId)
|
278
|
+
|
279
|
+
newStart = oldToNewMap[rg.startState]
|
280
|
+
newEnd = oldToNewMap[rg.endState]
|
281
|
+
|
282
|
+
[newStart, newEnd]
|
283
|
+
|
284
|
+
|
285
|
+
end
|
277
286
|
|
278
|
-
[newStart, newEnd]
|
279
287
|
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
e1
|
294
|
-
|
295
|
-
|
288
|
+
def parseP
|
289
|
+
ch = peek
|
290
|
+
if ch == '('
|
291
|
+
read
|
292
|
+
e1 = parseE
|
293
|
+
read ')'
|
294
|
+
elsif ch == '{'
|
295
|
+
e1 = parseTokenDef
|
296
|
+
elsif ch == '['
|
297
|
+
e1 = parseSETSEQ
|
298
|
+
else
|
299
|
+
e1 = parseCharNFA
|
300
|
+
end
|
301
|
+
return e1
|
302
|
+
end
|
303
|
+
|
304
|
+
|
305
|
+
def parseE
|
306
|
+
e1 = parseJ
|
307
|
+
if readIf('|')
|
308
|
+
e2 = parseE
|
309
|
+
|
310
|
+
u = newState
|
311
|
+
v = newState
|
312
|
+
u.addEps(e1[0])
|
313
|
+
u.addEps(e2[0])
|
314
|
+
e1[1].addEps(v)
|
315
|
+
e2[1].addEps(v)
|
316
|
+
e1 = [u,v]
|
317
|
+
end
|
318
|
+
return e1
|
296
319
|
end
|
297
|
-
return e1
|
298
|
-
end
|
299
320
|
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
321
|
+
def parseJ
|
322
|
+
e1 = parseQ
|
323
|
+
p = peek
|
324
|
+
if p and not "|)".include? p
|
325
|
+
e2 = parseJ
|
326
|
+
e1[1].addEps(e2[0])
|
327
|
+
e1 = [e1[0],e2[1]]
|
328
|
+
end
|
305
329
|
|
306
|
-
|
307
|
-
v = newState
|
308
|
-
u.addEps(e1[0])
|
309
|
-
u.addEps(e2[0])
|
310
|
-
e1[1].addEps(v)
|
311
|
-
e2[1].addEps(v)
|
312
|
-
e1 = [u,v]
|
313
|
-
end
|
314
|
-
return e1
|
315
|
-
end
|
316
|
-
|
317
|
-
def parseJ
|
318
|
-
e1 = parseQ
|
319
|
-
p = peek
|
320
|
-
if p and not "|)".include? p
|
321
|
-
e2 = parseJ
|
322
|
-
e1[1].addEps(e2[0])
|
323
|
-
e1 = [e1[0],e2[1]]
|
330
|
+
return e1
|
324
331
|
end
|
325
332
|
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
333
|
+
def parseQ
|
334
|
+
e1 = parseP
|
335
|
+
p = peek
|
336
|
+
|
337
|
+
if p == '*'
|
338
|
+
read
|
339
|
+
e1[0].addEps(e1[1])
|
340
|
+
e1[1].addEps(e1[0])
|
341
|
+
elsif p == '+'
|
342
|
+
read
|
343
|
+
e1[1].addEps(e1[0])
|
344
|
+
elsif p == '?'
|
345
|
+
read
|
346
|
+
e1[0].addEps(e1[1])
|
347
|
+
# e1[0].generatePDF("optional")
|
348
|
+
end
|
349
|
+
return e1
|
350
|
+
end
|
332
351
|
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
352
|
+
|
353
|
+
def peek(mustExist = false)
|
354
|
+
# skip over any non-linefeed whitespace
|
355
|
+
while @cursor < @script.size && " \t".index(@script[@cursor])
|
356
|
+
@cursor += 1
|
357
|
+
end
|
358
|
+
if mustExist or @cursor < @script.size
|
359
|
+
@script[@cursor]
|
360
|
+
else
|
361
|
+
nil
|
362
|
+
end
|
344
363
|
end
|
345
|
-
return e1
|
346
|
-
end
|
347
|
-
|
348
364
|
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
365
|
+
def readIf(expChar)
|
366
|
+
r = (peek == expChar)
|
367
|
+
if r
|
368
|
+
read
|
369
|
+
end
|
370
|
+
return r
|
353
371
|
end
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
372
|
+
|
373
|
+
def read(expChar = nil)
|
374
|
+
ch = peek
|
375
|
+
if ch and ((not expChar) or ch == expChar)
|
376
|
+
@cursor += 1
|
377
|
+
ch
|
378
|
+
else
|
379
|
+
abort 'Unexpected end of input'
|
380
|
+
end
|
358
381
|
end
|
359
382
|
end
|
360
383
|
|
361
|
-
|
362
|
-
r = (peek == expChar)
|
363
|
-
if r
|
364
|
-
read
|
365
|
-
end
|
366
|
-
return r
|
367
|
-
end
|
368
|
-
|
369
|
-
def read(expChar = nil)
|
370
|
-
ch = peek
|
371
|
-
if ch and ((not expChar) or ch == expChar)
|
372
|
-
@cursor += 1
|
373
|
-
ch
|
374
|
-
else
|
375
|
-
abort 'Unexpected end of input'
|
376
|
-
end
|
377
|
-
end
|
378
|
-
end
|
379
|
-
|
384
|
+
end # module ToknInternal
|