tokn 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.txt +4 -5
- data/bin/tokncompile +1 -1
- data/bin/toknprocess +10 -4
- data/lib/tokn/code_set.rb +332 -337
- data/lib/tokn/dfa.rb +187 -162
- data/lib/tokn/dfa_builder.rb +218 -220
- data/lib/tokn/range_partition.rb +205 -203
- data/lib/tokn/reg_parse.rb +336 -331
- data/lib/tokn/state.rb +267 -270
- data/lib/tokn/token_defn_parser.rb +144 -139
- data/lib/tokn/tokenizer.rb +243 -175
- data/lib/tokn/tokn_const.rb +11 -6
- data/lib/tokn/tools.rb +42 -20
- data/test/Example1.rb +50 -0
- data/test/data/compileddfa.txt +1 -0
- data/test/data/sampletext.txt +6 -1
- data/test/test.rb +17 -12
- metadata +7 -6
- data/test/simple.rb +0 -33
data/lib/tokn/range_partition.rb
CHANGED
@@ -1,233 +1,235 @@
|
|
1
1
|
require_relative 'tools'
|
2
2
|
req('tokn_const code_set')
|
3
3
|
|
4
|
-
|
5
|
-
# A data structure that transforms a set of CodeSets to a
|
6
|
-
# disjoint set of them, such that no two range sets overlap.
|
7
|
-
#
|
8
|
-
# This is improve the efficiency of the NFA => DFA algorithm,
|
9
|
-
# which involves gathering information about what states are
|
10
|
-
# reachable on certain characters. We can't afford to treat each
|
11
|
-
# character as a singleton, since the ranges can be quite large.
|
12
|
-
# Hence, we want to treat ranges of characters as single entities;
|
13
|
-
# this will only work if no two such ranges overlap.
|
14
|
-
#
|
15
|
-
# It works by starting with a tree whose node is labelled with
|
16
|
-
# the maximal superset of character values. Then, for each edge
|
17
|
-
# in the NFA, performs a DFS on this tree, splitting any node that
|
18
|
-
# only partially intersects any one set that appears in the edge label.
|
19
|
-
# The running time is O(n log k), where n is the size of the NFA, and
|
20
|
-
# k is the height of the resulting tree.
|
21
|
-
#
|
22
|
-
# We encourage k to be small by sorting the NFA edges by their
|
23
|
-
# label complexity.
|
24
|
-
#
|
25
|
-
class RangePartition
|
26
|
-
include Tokn
|
4
|
+
module ToknInternal
|
27
5
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
6
|
+
# A data structure that transforms a set of CodeSets to a
|
7
|
+
# disjoint set of them, such that no two range sets overlap.
|
8
|
+
#
|
9
|
+
# This is to improve the efficiency of the NFA => DFA algorithm,
|
10
|
+
# which involves gathering information about what states are
|
11
|
+
# reachable on certain characters. We can't afford to treat each
|
12
|
+
# character as a singleton, since the ranges can be quite large.
|
13
|
+
# Hence, we want to treat ranges of characters as single entities;
|
14
|
+
# this will only work if no two such ranges overlap.
|
15
|
+
#
|
16
|
+
# It works by starting with a tree whose node is labelled with
|
17
|
+
# the maximal superset of character values. Then, for each edge
|
18
|
+
# in the NFA, performs a DFS on this tree, splitting any node that
|
19
|
+
# only partially intersects any one set that appears in the edge label.
|
20
|
+
# The running time is O(n log k), where n is the size of the NFA, and
|
21
|
+
# k is the height of the resulting tree.
|
22
|
+
#
|
23
|
+
# We encourage k to be small by sorting the NFA edges by their
|
24
|
+
# label complexity.
|
25
|
+
#
|
26
|
+
class RangePartition
|
27
|
+
# include Tokn
|
28
|
+
|
29
|
+
def initialize()
|
30
|
+
# We will build a tree, where each node has a CodeSet
|
31
|
+
# associated with it, and the child nodes (if present)
|
32
|
+
# partition this CodeSet into smaller, nonempty sets.
|
33
|
+
|
34
|
+
# A tree is represented by a node, where each node is a pair [x,y],
|
35
|
+
# with x the node's CodeSet, and y a list of the node's children.
|
57
36
|
|
58
|
-
|
59
|
-
|
60
|
-
|
37
|
+
@nextNodeId = 0
|
38
|
+
|
39
|
+
# Make the root node hold the largest possible CodeSet.
|
40
|
+
# We want to be able to include all the token ids as well.
|
41
|
+
|
42
|
+
@rootNode = buildNode(CodeSet.new(CODEMIN,CODEMAX))
|
43
|
+
|
44
|
+
@setsToAdd = Set.new
|
45
|
+
|
46
|
+
# Add epsilon immediately, so it's always in its own subset
|
47
|
+
addSet(CodeSet.new(EPSILON))
|
48
|
+
|
49
|
+
@prepared = false
|
61
50
|
end
|
62
|
-
|
63
|
-
# Construct partition from previously added sets
|
64
|
-
|
65
|
-
list = @setsToAdd.to_a
|
66
|
-
|
67
|
-
# Sort set by cardinality: probably get a more balanced tree
|
68
|
-
# if larger sets are processed first
|
69
|
-
list.sort!{ |x,y| y.cardinality <=> x.cardinality }
|
70
|
-
|
71
|
-
list.each do |s|
|
72
|
-
addSetAux(s)
|
73
|
-
end
|
74
|
-
|
75
|
-
@prepared = true
|
76
|
-
end
|
77
|
-
|
78
51
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
if !@prepared
|
83
|
-
raise IllegalStateException
|
84
|
-
end
|
85
|
-
|
86
|
-
g = ""
|
87
|
-
g += "digraph "+name+" {\n\n"
|
88
|
-
|
89
|
-
nodes = []
|
90
|
-
buildNodeList(nodes)
|
91
|
-
nodes.each do |node|
|
92
|
-
g += " '" + d(node) + "' [shape=rect] [label='" + node.set.to_s_alt + "']\n"
|
93
|
-
end
|
94
|
-
|
95
|
-
g += "\n"
|
96
|
-
nodes.each do |node|
|
97
|
-
node.children.each do |ch|
|
98
|
-
g += " '" + d(node) + "' -> '" + d(ch) + "'\n"
|
52
|
+
def addSet(s)
|
53
|
+
if @prepared
|
54
|
+
raise IllegalStateException
|
99
55
|
end
|
56
|
+
@setsToAdd.add(s)
|
100
57
|
end
|
101
|
-
|
102
|
-
g += "\n}\n"
|
103
|
-
g.gsub!( /'/, '"' )
|
104
58
|
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
59
|
+
def prepare()
|
60
|
+
if @prepared
|
61
|
+
raise IllegalStateException
|
62
|
+
end
|
63
|
+
|
64
|
+
# Construct partition from previously added sets
|
65
|
+
|
66
|
+
list = @setsToAdd.to_a
|
67
|
+
|
68
|
+
# Sort set by cardinality: probably get a more balanced tree
|
69
|
+
# if larger sets are processed first
|
70
|
+
list.sort!{ |x,y| y.cardinality <=> x.cardinality }
|
71
|
+
|
72
|
+
list.each do |s|
|
73
|
+
addSetAux(s)
|
74
|
+
end
|
75
|
+
|
76
|
+
@prepared = true
|
119
77
|
end
|
120
78
|
|
121
|
-
list = []
|
122
|
-
s2 = s.makeCopy
|
123
|
-
applyAux(@rootNode, s2, list)
|
124
|
-
|
125
|
-
# Sort the list of subsets by their first elements
|
126
|
-
list.sort! { |x,y| x.array[0] <=> y.array[0] }
|
127
79
|
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
if s1.empty?
|
149
|
-
next
|
150
|
-
end
|
151
|
-
|
152
|
-
applyAux(m, s1, list)
|
153
|
-
|
154
|
-
!db||pr(" subtracting child set [%s] from s=[%s]\n",d(m.set),d(s))
|
155
|
-
s = s.difference(m.set)
|
156
|
-
!db||pr(" subtracted child set, now [%s]\n",d(s))
|
157
|
-
if s.empty?
|
158
|
-
break
|
80
|
+
# Generate a .dot file, and from that, a PDF, for debug purposes
|
81
|
+
#
|
82
|
+
def generatePDF(name = "partition")
|
83
|
+
if !@prepared
|
84
|
+
raise IllegalStateException
|
85
|
+
end
|
86
|
+
|
87
|
+
g = ""
|
88
|
+
g += "digraph "+name+" {\n\n"
|
89
|
+
|
90
|
+
nodes = []
|
91
|
+
buildNodeList(nodes)
|
92
|
+
nodes.each do |node|
|
93
|
+
g += " '" + d(node) + "' [shape=rect] [label='" + node.set.to_s_alt + "']\n"
|
94
|
+
end
|
95
|
+
|
96
|
+
g += "\n"
|
97
|
+
nodes.each do |node|
|
98
|
+
node.children.each do |ch|
|
99
|
+
g += " '" + d(node) + "' -> '" + d(ch) + "'\n"
|
159
100
|
end
|
160
101
|
end
|
161
|
-
|
162
|
-
|
102
|
+
|
103
|
+
g += "\n}\n"
|
104
|
+
g.gsub!( /'/, '"' )
|
105
|
+
|
106
|
+
dotToPDF(g,name)
|
163
107
|
|
164
|
-
def buildNode(rangeSet)
|
165
|
-
id = @nextNodeId
|
166
|
-
@nextNodeId += 1
|
167
|
-
n = RPNode.new(id, rangeSet, [])
|
168
|
-
n
|
169
|
-
end
|
170
|
-
|
171
|
-
def buildNodeList(list, root = nil)
|
172
|
-
if not root
|
173
|
-
root = @rootNode
|
174
|
-
end
|
175
|
-
list.push(root)
|
176
|
-
root.children.each do |x|
|
177
|
-
buildNodeList(list, x)
|
178
108
|
end
|
179
|
-
end
|
180
109
|
|
181
|
-
|
182
|
-
|
183
|
-
#
|
184
|
-
def addSetAux(s, n = @rootNode)
|
185
|
-
#
|
186
|
-
# The algorithm is this:
|
110
|
+
|
111
|
+
# Apply the partition to a CodeSet
|
187
112
|
#
|
188
|
-
#
|
189
|
-
#
|
190
|
-
# if
|
191
|
-
# x = n.set - s
|
192
|
-
# add x,y as child sets of n
|
193
|
-
# else
|
194
|
-
# for each child m of n:
|
195
|
-
# t = intersect of m.set and s
|
196
|
-
# if t is nonempty, add(t, m)
|
113
|
+
# > s CodeSet
|
114
|
+
# < array of subsets from the partition whose union equals s
|
115
|
+
# (this array will be the single element s if no partitioning was necessary)
|
197
116
|
#
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
if n.children.empty?
|
202
|
-
x = n.set.difference(s)
|
203
|
-
n.children.push buildNode(x)
|
204
|
-
n.children.push buildNode(s)
|
205
|
-
else
|
206
|
-
n.children.each do |m|
|
207
|
-
t = m.set.intersect(s)
|
208
|
-
addSetAux(t,m) unless t.empty?
|
117
|
+
def apply(s)
|
118
|
+
if !@prepared
|
119
|
+
raise IllegalStateException
|
209
120
|
end
|
121
|
+
|
122
|
+
list = []
|
123
|
+
s2 = s.makeCopy
|
124
|
+
applyAux(@rootNode, s2, list)
|
125
|
+
|
126
|
+
# Sort the list of subsets by their first elements
|
127
|
+
list.sort! { |x,y| x.array[0] <=> y.array[0] }
|
128
|
+
|
129
|
+
list
|
210
130
|
end
|
211
|
-
end
|
212
131
|
|
213
|
-
end
|
214
|
-
|
215
|
-
# A node within a RangePartition tree
|
216
|
-
#
|
217
|
-
class RPNode
|
218
132
|
|
219
|
-
|
133
|
+
private
|
220
134
|
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
135
|
+
def applyAux(n, s, list)
|
136
|
+
db = false
|
137
|
+
|
138
|
+
!db||pr("applyAux to set[%s], node=[%s]\n",d(s),d(n.set))
|
139
|
+
|
140
|
+
if n.children.empty?
|
141
|
+
# # Verify that this set equals the input set
|
142
|
+
# myAssert(s.eql? n.set)
|
143
|
+
list.push(s)
|
144
|
+
else
|
145
|
+
n.children.each do |m|
|
146
|
+
s1 = s.intersect(m.set)
|
147
|
+
!db||pr(" child set=[%s], intersection=[%s]\n",d(m.set),d(s1))
|
148
|
+
|
149
|
+
if s1.empty?
|
150
|
+
next
|
151
|
+
end
|
152
|
+
|
153
|
+
applyAux(m, s1, list)
|
154
|
+
|
155
|
+
!db||pr(" subtracting child set [%s] from s=[%s]\n",d(m.set),d(s))
|
156
|
+
s = s.difference(m.set)
|
157
|
+
!db||pr(" subtracted child set, now [%s]\n",d(s))
|
158
|
+
if s.empty?
|
159
|
+
break
|
160
|
+
end
|
161
|
+
end
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
def buildNode(rangeSet)
|
166
|
+
id = @nextNodeId
|
167
|
+
@nextNodeId += 1
|
168
|
+
n = RPNode.new(id, rangeSet, [])
|
169
|
+
n
|
170
|
+
end
|
226
171
|
|
227
|
-
|
228
|
-
|
172
|
+
def buildNodeList(list, root = nil)
|
173
|
+
if not root
|
174
|
+
root = @rootNode
|
175
|
+
end
|
176
|
+
list.push(root)
|
177
|
+
root.children.each do |x|
|
178
|
+
buildNodeList(list, x)
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
# Add a set to the tree, extending the tree as necessary to
|
183
|
+
# maintain a (disjoint) partition
|
184
|
+
#
|
185
|
+
def addSetAux(s, n = @rootNode)
|
186
|
+
#
|
187
|
+
# The algorithm is this:
|
188
|
+
#
|
189
|
+
# add (s, n) # add set s to node n; s must be subset of n.set
|
190
|
+
# if n.set = s, return
|
191
|
+
# if n is leaf:
|
192
|
+
# x = n.set - s
|
193
|
+
# add x,s as child sets of n
|
194
|
+
# else
|
195
|
+
# for each child m of n:
|
196
|
+
# t = intersect of m.set and s
|
197
|
+
# if t is nonempty, add(t, m)
|
198
|
+
#
|
199
|
+
if n.set.eql? s
|
200
|
+
return
|
201
|
+
end
|
202
|
+
if n.children.empty?
|
203
|
+
x = n.set.difference(s)
|
204
|
+
n.children.push buildNode(x)
|
205
|
+
n.children.push buildNode(s)
|
206
|
+
else
|
207
|
+
n.children.each do |m|
|
208
|
+
t = m.set.intersect(s)
|
209
|
+
addSetAux(t,m) unless t.empty?
|
210
|
+
end
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
229
214
|
end
|
230
215
|
|
231
|
-
|
216
|
+
# A node within a RangePartition tree
|
217
|
+
#
|
218
|
+
class RPNode
|
219
|
+
|
220
|
+
attr_accessor :id, :set, :children
|
221
|
+
|
222
|
+
def initialize(id, set, children)
|
223
|
+
@id = id
|
224
|
+
@set = set
|
225
|
+
@children = children
|
226
|
+
end
|
227
|
+
|
228
|
+
def inspect
|
229
|
+
return 'N' + id.to_s
|
230
|
+
end
|
231
|
+
|
232
|
+
end
|
232
233
|
|
234
|
+
end # module ToknInternal
|
233
235
|
|
data/lib/tokn/reg_parse.rb
CHANGED
@@ -1,379 +1,384 @@
|
|
1
1
|
require_relative 'tools'
|
2
2
|
req('code_set state')
|
3
3
|
|
4
|
-
|
5
|
-
end
|
6
|
-
|
7
|
-
# Parses a single regular expression from a string.
|
8
|
-
# Produces an NFA with distinguished start and end states
|
9
|
-
# (none of these states are marked as final states)
|
10
|
-
#
|
11
|
-
# Here is the grammar for regular expressions. Spaces are ignored,
|
12
|
-
# and can be liberally sprinkled within the regular expressions to
|
13
|
-
# aid readability. To represent a space, the \s escape sequence must be used.
|
14
|
-
# See the file 'sampletokens.txt' for some examples.
|
15
|
-
#
|
16
|
-
# Expressions have one of these types:
|
17
|
-
#
|
18
|
-
# E : base class
|
19
|
-
# J : a Join expression, formed by concatenating one or more together
|
20
|
-
# Q : a Quantified expression; followed optionally by '*', '+', or '?'
|
21
|
-
# P : a Parenthesized expression, which is optionally surrounded with (), {}, []
|
22
|
-
#
|
23
|
-
# E -> J '|' E
|
24
|
-
# | J
|
25
|
-
#
|
26
|
-
# J -> Q J
|
27
|
-
# | Q
|
28
|
-
#
|
29
|
-
# Q -> P '*'
|
30
|
-
# | P '+'
|
31
|
-
# | P '?'
|
32
|
-
# | P
|
33
|
-
#
|
34
|
-
# P -> '(' E ')'
|
35
|
-
# | '{' TOKENNAME '}'
|
36
|
-
# | '[^' SETSEQ ']' A code not appearing in the set
|
37
|
-
# | '[' SETSEQ ']'
|
38
|
-
# | CHARCODE
|
39
|
-
#
|
40
|
-
# SETSEQ -> SET SETSEQ
|
41
|
-
# | SET
|
42
|
-
#
|
43
|
-
# SET -> CHARCODE
|
44
|
-
# | CHARCODE '-' CHARCODE
|
45
|
-
#
|
46
|
-
# CHARCODE ->
|
47
|
-
# a | b | c ... any printable except {,},[, etc.
|
48
|
-
# | \xhh hex value from 00...ff
|
49
|
-
# | \uhhhh hex value from 0000...ffff (e.g., unicode)
|
50
|
-
# | \f | \n | \r | \t formfeed, linefeed, return, tab
|
51
|
-
# | \s a space (' ')
|
52
|
-
# | \* where * is some other non-alphabetic
|
53
|
-
# character that needs to be escaped
|
54
|
-
#
|
55
|
-
# The parser performs recursive descent parsing;
|
56
|
-
# each method returns an NFA represented by
|
57
|
-
# a pair of states: the start and end states.
|
58
|
-
#
|
59
|
-
class RegParse
|
60
|
-
|
61
|
-
attr_reader :startState, :endState
|
4
|
+
module ToknInternal
|
62
5
|
|
63
|
-
#
|
64
|
-
# @param script script to parse
|
65
|
-
# @param tokenDefMap if not nil, a map of previously parsed regular expressions
|
66
|
-
# (mapping names to ids) to be consulted if a curly brace expression appears
|
67
|
-
# in the script
|
68
|
-
#
|
69
|
-
def initialize(script, tokenDefMap = nil)
|
70
|
-
@script = script.strip
|
71
|
-
@nextStateId = 0
|
72
|
-
@tokenDefMap = tokenDefMap
|
73
|
-
parseScript
|
74
|
-
end
|
75
|
-
|
76
|
-
|
77
|
-
def inspect
|
78
|
-
s = "RegParse: #{@script}"
|
79
|
-
s += " start:"+d(@startState)+" end:"+d(@endState)
|
80
|
-
return s
|
81
|
-
end
|
82
|
-
|
83
|
-
private
|
84
|
-
|
85
|
-
# Raise a ParseException, with a helpful message indicating
|
86
|
-
# the parser's current location within the string
|
6
|
+
# Exception raised when there is a problem parsing a regular expression
|
87
7
|
#
|
88
|
-
|
89
|
-
# Assume we've already read the problem character
|
90
|
-
i = @cursor - 1
|
91
|
-
s = ''
|
92
|
-
if i > 4
|
93
|
-
s += '...'
|
94
|
-
end
|
95
|
-
s += @script[i-3...i] || ""
|
96
|
-
s += ' !!! '
|
97
|
-
s += @script[i...i+3] || ""
|
98
|
-
if i +3 < @script.size
|
99
|
-
s += '...'
|
100
|
-
end
|
101
|
-
raise ParseException, msg + ": "+s
|
8
|
+
class ParseException < Exception
|
102
9
|
end
|
103
10
|
|
104
|
-
#
|
11
|
+
# Parses a single regular expression from a string.
|
12
|
+
# Produces an NFA with distinguished start and end states
|
13
|
+
# (none of these states are marked as final states)
|
105
14
|
#
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
#
|
15
|
+
# Here is the grammar for regular expressions. Spaces are ignored,
|
16
|
+
# and can be liberally sprinkled within the regular expressions to
|
17
|
+
# aid readability. To represent a space, the \s escape sequence must be used.
|
18
|
+
# See the file 'sampletokens.txt' for some examples.
|
19
|
+
#
|
20
|
+
# Expressions have one of these types:
|
21
|
+
#
|
22
|
+
# E : base class
|
23
|
+
# J : a Join expression, formed by concatenating one or more together
|
24
|
+
# Q : a Quantified expression; followed optionally by '*', '+', or '?'
|
25
|
+
# P : a Parenthesized expression, which is optionally surrounded with (), {}, []
|
26
|
+
#
|
27
|
+
# E -> J '|' E
|
28
|
+
# | J
|
29
|
+
#
|
30
|
+
# J -> Q J
|
31
|
+
# | Q
|
32
|
+
#
|
33
|
+
# Q -> P '*'
|
34
|
+
# | P '+'
|
35
|
+
# | P '?'
|
36
|
+
# | P
|
37
|
+
#
|
38
|
+
# P -> '(' E ')'
|
39
|
+
# | '{' TOKENNAME '}'
|
40
|
+
# | '[^' SETSEQ ']' A code not appearing in the set
|
41
|
+
# | '[' SETSEQ ']'
|
42
|
+
# | CHARCODE
|
43
|
+
#
|
44
|
+
# SETSEQ -> SET SETSEQ
|
45
|
+
# | SET
|
46
|
+
#
|
47
|
+
# SET -> CHARCODE
|
48
|
+
# | CHARCODE '-' CHARCODE
|
121
49
|
#
|
122
|
-
|
50
|
+
# CHARCODE ->
|
51
|
+
# a | b | c ... any printable except {,},[, etc.
|
52
|
+
# | \xhh hex value from 00...ff
|
53
|
+
# | \uhhhh hex value from 0000...ffff (e.g., unicode)
|
54
|
+
# | \f | \n | \r | \t formfeed, linefeed, return, tab
|
55
|
+
# | \s a space (' ')
|
56
|
+
# | \* where * is some other non-alphabetic
|
57
|
+
# character that needs to be escaped
|
58
|
+
#
|
59
|
+
# The parser performs recursive descent parsing;
|
60
|
+
# each method returns an NFA represented by
|
61
|
+
# a pair of states: the start and end states.
|
62
|
+
#
|
63
|
+
class RegParse
|
123
64
|
|
124
|
-
|
65
|
+
attr_reader :startState, :endState
|
125
66
|
|
126
|
-
|
67
|
+
# Construct a parser and perform the parsing
|
68
|
+
# @param script script to parse
|
69
|
+
# @param tokenDefMap if not nil, a map of previously parsed regular expressions
|
70
|
+
# (mapping names to ids) to be consulted if a curly brace expression appears
|
71
|
+
# in the script
|
72
|
+
#
|
73
|
+
def initialize(script, tokenDefMap = nil)
|
74
|
+
@script = script.strip
|
75
|
+
@nextStateId = 0
|
76
|
+
@tokenDefMap = tokenDefMap
|
77
|
+
parseScript
|
78
|
+
end
|
79
|
+
|
80
|
+
|
81
|
+
def inspect
|
82
|
+
s = "RegParse: #{@script}"
|
83
|
+
s += " start:"+d(@startState)+" end:"+d(@endState)
|
84
|
+
return s
|
85
|
+
end
|
86
|
+
|
87
|
+
private
|
88
|
+
|
89
|
+
# Raise a ParseException, with a helpful message indicating
|
90
|
+
# the parser's current location within the string
|
91
|
+
#
|
92
|
+
def abort(msg)
|
93
|
+
# Assume we've already read the problem character
|
94
|
+
i = @cursor - 1
|
95
|
+
s = ''
|
96
|
+
if i > 4
|
97
|
+
s += '...'
|
98
|
+
end
|
99
|
+
s += @script[i-3...i] || ""
|
100
|
+
s += ' !!! '
|
101
|
+
s += @script[i...i+3] || ""
|
102
|
+
if i +3 < @script.size
|
103
|
+
s += '...'
|
104
|
+
end
|
105
|
+
raise ParseException, msg + ": "+s
|
106
|
+
end
|
127
107
|
|
128
|
-
|
129
|
-
|
108
|
+
# Read next character as a hex digit
|
109
|
+
#
|
110
|
+
def readHex
|
111
|
+
v = read.upcase.ord
|
112
|
+
if v >= 48 and v < 58
|
113
|
+
return v - 48
|
114
|
+
elsif v >= 65 and v < 71
|
115
|
+
return v - 65 + 10
|
116
|
+
else
|
117
|
+
abort "Missing hex digit"
|
118
|
+
end
|
130
119
|
end
|
131
|
-
|
132
|
-
|
120
|
+
|
121
|
+
|
122
|
+
NO_ESCAPE_CHARS = Regexp.new("[A-Za-z0-9]")
|
123
|
+
|
124
|
+
# Parse character definition (CHARCODE) from input
|
125
|
+
#
|
126
|
+
def parseChar
|
133
127
|
|
134
128
|
c = read
|
135
129
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
val = " ".ord
|
130
|
+
val = c.ord
|
131
|
+
|
132
|
+
if "{}[]*?+|-^()".include?(c) or val <= 0x20
|
133
|
+
abort "Unexpected or unescaped character"
|
134
|
+
end
|
135
|
+
|
136
|
+
if c == '\\'
|
137
|
+
|
138
|
+
c = read
|
139
|
+
|
140
|
+
if "xX".include? c
|
141
|
+
val = (readHex() << 4) | readHex()
|
142
|
+
elsif "uU".include? c
|
143
|
+
val = (readHex() << 12) | (readHex() << 8) | (readHex() << 4) | readHex()
|
151
144
|
else
|
152
|
-
if c
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
145
|
+
if c == 'f'
|
146
|
+
val = "\f".ord
|
147
|
+
elsif c == 'r'
|
148
|
+
val = "\r".ord  # assign (was '==', a discarded comparison) so the \r escape yields carriage return
|
149
|
+
elsif c == 'n'
|
150
|
+
val = "\n".ord
|
151
|
+
elsif c == 't'
|
152
|
+
val = "\t".ord
|
153
|
+
elsif c == 's'
|
154
|
+
val = " ".ord
|
155
|
+
else
|
156
|
+
if c =~ NO_ESCAPE_CHARS
|
157
|
+
abort "Unsupported escape sequence ("+c+")"
|
158
|
+
end
|
159
|
+
val = c.ord
|
160
|
+
end
|
161
|
+
end
|
157
162
|
end
|
163
|
+
|
164
|
+
return val
|
158
165
|
end
|
159
|
-
|
160
|
-
return val
|
161
|
-
end
|
162
|
-
|
163
166
|
|
164
|
-
def parseCharNFA
|
165
|
-
val = parseChar
|
166
|
-
|
167
|
-
# Construct a pair of states with an edge between them
|
168
|
-
# labelled with this character code
|
169
167
|
|
170
|
-
|
171
|
-
|
172
|
-
cset = CodeSet.new
|
173
|
-
cset.add(val)
|
174
|
-
sA.addEdge(cset, sB)
|
175
|
-
return [sA,sB]
|
176
|
-
end
|
177
|
-
|
178
|
-
|
168
|
+
def parseCharNFA
|
169
|
+
val = parseChar
|
179
170
|
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
171
|
+
# Construct a pair of states with an edge between them
|
172
|
+
# labelled with this character code
|
173
|
+
|
174
|
+
sA = newState
|
175
|
+
sB = newState
|
176
|
+
cset = CodeSet.new
|
177
|
+
cset.add(val)
|
178
|
+
sA.addEdge(cset, sB)
|
179
|
+
return [sA,sB]
|
187
180
|
end
|
188
|
-
end
|
189
|
-
|
190
|
-
def parseScript
|
191
|
-
# Set up the input scanner
|
192
|
-
@cursor = 0
|
193
181
|
|
194
|
-
exp = parseE
|
195
|
-
@startState = exp[0]
|
196
|
-
@endState = exp[1]
|
197
|
-
end
|
198
|
-
|
199
|
-
def newState
|
200
|
-
s = State.new(@nextStateId)
|
201
|
-
@nextStateId += 1
|
202
|
-
return s
|
203
|
-
end
|
204
|
-
|
205
|
-
def parseSET
|
206
|
-
u = parseChar
|
207
|
-
v = u+1
|
208
|
-
if readIf('-')
|
209
|
-
v = parseChar() + 1
|
210
|
-
if v <= u
|
211
|
-
abort "Illegal range"
|
212
|
-
end
|
213
|
-
end
|
214
|
-
return u,v
|
215
|
-
end
|
216
|
-
|
217
|
-
def parseSETSEQ
|
218
|
-
db = false
|
219
|
-
|
220
|
-
!db || pr("parseSETSEQ\n")
|
221
|
-
|
222
|
-
read('[')
|
223
|
-
negated = readIf('^')
|
224
|
-
!db || pr(" negated=%s\n",negated)
|
225
182
|
|
226
|
-
|
183
|
+
|
184
|
+
def dbInfo
|
185
|
+
j = @cursor
|
186
|
+
k = j + 5
|
187
|
+
if k >= @script.size
|
188
|
+
return @script[j..k]+"<<<== end"
|
189
|
+
else
|
190
|
+
return @script[j..k]+"..."
|
191
|
+
end
|
192
|
+
end
|
227
193
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
!db || pr(" added another; %s\n",d(rs))
|
236
|
-
end
|
237
|
-
if negated
|
238
|
-
rs.negate
|
239
|
-
!db || pr(" negated=%s\n",d(rs))
|
194
|
+
def parseScript
|
195
|
+
# Set up the input scanner
|
196
|
+
@cursor = 0
|
197
|
+
|
198
|
+
exp = parseE
|
199
|
+
@startState = exp[0]
|
200
|
+
@endState = exp[1]
|
240
201
|
end
|
241
|
-
|
242
|
-
|
243
|
-
|
202
|
+
|
203
|
+
def newState
|
204
|
+
s = State.new(@nextStateId)
|
205
|
+
@nextStateId += 1
|
206
|
+
return s
|
244
207
|
end
|
245
208
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
209
|
+
def parseSET
|
210
|
+
u = parseChar
|
211
|
+
v = u+1
|
212
|
+
if readIf('-')
|
213
|
+
v = parseChar() + 1
|
214
|
+
if v <= u
|
215
|
+
abort "Illegal range"
|
216
|
+
end
|
217
|
+
end
|
218
|
+
return u,v
|
219
|
+
end
|
251
220
|
|
252
|
-
|
221
|
+
def parseSETSEQ
|
222
|
+
db = false
|
223
|
+
|
224
|
+
!db || pr("parseSETSEQ\n")
|
225
|
+
|
226
|
+
read('[')
|
227
|
+
negated = readIf('^')
|
228
|
+
!db || pr(" negated=%s\n",negated)
|
229
|
+
|
230
|
+
rs = CodeSet.new
|
231
|
+
|
232
|
+
u,v = parseSET
|
233
|
+
rs.add(u,v)
|
234
|
+
!db || pr(" initial set=%s\n",d(rs))
|
253
235
|
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
236
|
+
while not readIf(']')
|
237
|
+
u,v = parseSET
|
238
|
+
rs.add(u,v)
|
239
|
+
!db || pr(" added another; %s\n",d(rs))
|
240
|
+
end
|
241
|
+
if negated
|
242
|
+
rs.negate
|
243
|
+
!db || pr(" negated=%s\n",d(rs))
|
244
|
+
end
|
245
|
+
|
246
|
+
if rs.empty?
|
247
|
+
abort "Empty character range"
|
248
|
+
end
|
249
|
+
|
250
|
+
sA = newState
|
251
|
+
sB = newState
|
252
|
+
sA.addEdge(rs, sB)
|
253
|
+
return [sA,sB]
|
270
254
|
end
|
271
|
-
rg = tokInfo[1]
|
272
255
|
|
273
|
-
|
256
|
+
TOKENREF_EXPR = Regexp.new('^[_A-Za-z][_A-Za-z0-9]*$')
|
274
257
|
|
275
|
-
|
276
|
-
|
258
|
+
def parseTokenDef
|
259
|
+
read('{')
|
260
|
+
name = ''
|
261
|
+
while !readIf('}')
|
262
|
+
name += read
|
263
|
+
end
|
264
|
+
# pr("name=[%s], TR=[%s], match=[%s]\n",d(name),d(TOKENREF_EXPR),d(name =~ TOKENREF_EXPR))
|
265
|
+
if name !~ TOKENREF_EXPR
|
266
|
+
abort "Problem with token name"
|
267
|
+
end
|
268
|
+
tokInfo = nil
|
269
|
+
if @tokenDefMap
|
270
|
+
tokInfo = @tokenDefMap[name]
|
271
|
+
end
|
272
|
+
if !tokInfo
|
273
|
+
abort "Undefined token"
|
274
|
+
end
|
275
|
+
rg = tokInfo[1]
|
276
|
+
|
277
|
+
oldToNewMap, @nextStateId = rg.startState.duplicateNFA(@nextStateId)
|
278
|
+
|
279
|
+
newStart = oldToNewMap[rg.startState]
|
280
|
+
newEnd = oldToNewMap[rg.endState]
|
281
|
+
|
282
|
+
[newStart, newEnd]
|
283
|
+
|
284
|
+
|
285
|
+
end
|
277
286
|
|
278
|
-
[newStart, newEnd]
|
279
287
|
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
e1
|
294
|
-
|
295
|
-
|
288
|
+
def parseP
|
289
|
+
ch = peek
|
290
|
+
if ch == '('
|
291
|
+
read
|
292
|
+
e1 = parseE
|
293
|
+
read ')'
|
294
|
+
elsif ch == '{'
|
295
|
+
e1 = parseTokenDef
|
296
|
+
elsif ch == '['
|
297
|
+
e1 = parseSETSEQ
|
298
|
+
else
|
299
|
+
e1 = parseCharNFA
|
300
|
+
end
|
301
|
+
return e1
|
302
|
+
end
|
303
|
+
|
304
|
+
|
305
|
+
def parseE
|
306
|
+
e1 = parseJ
|
307
|
+
if readIf('|')
|
308
|
+
e2 = parseE
|
309
|
+
|
310
|
+
u = newState
|
311
|
+
v = newState
|
312
|
+
u.addEps(e1[0])
|
313
|
+
u.addEps(e2[0])
|
314
|
+
e1[1].addEps(v)
|
315
|
+
e2[1].addEps(v)
|
316
|
+
e1 = [u,v]
|
317
|
+
end
|
318
|
+
return e1
|
296
319
|
end
|
297
|
-
return e1
|
298
|
-
end
|
299
320
|
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
321
|
+
def parseJ
|
322
|
+
e1 = parseQ
|
323
|
+
p = peek
|
324
|
+
if p and not "|)".include? p
|
325
|
+
e2 = parseJ
|
326
|
+
e1[1].addEps(e2[0])
|
327
|
+
e1 = [e1[0],e2[1]]
|
328
|
+
end
|
305
329
|
|
306
|
-
|
307
|
-
v = newState
|
308
|
-
u.addEps(e1[0])
|
309
|
-
u.addEps(e2[0])
|
310
|
-
e1[1].addEps(v)
|
311
|
-
e2[1].addEps(v)
|
312
|
-
e1 = [u,v]
|
313
|
-
end
|
314
|
-
return e1
|
315
|
-
end
|
316
|
-
|
317
|
-
def parseJ
|
318
|
-
e1 = parseQ
|
319
|
-
p = peek
|
320
|
-
if p and not "|)".include? p
|
321
|
-
e2 = parseJ
|
322
|
-
e1[1].addEps(e2[0])
|
323
|
-
e1 = [e1[0],e2[1]]
|
330
|
+
return e1
|
324
331
|
end
|
325
332
|
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
333
|
+
def parseQ
|
334
|
+
e1 = parseP
|
335
|
+
p = peek
|
336
|
+
|
337
|
+
if p == '*'
|
338
|
+
read
|
339
|
+
e1[0].addEps(e1[1])
|
340
|
+
e1[1].addEps(e1[0])
|
341
|
+
elsif p == '+'
|
342
|
+
read
|
343
|
+
e1[1].addEps(e1[0])
|
344
|
+
elsif p == '?'
|
345
|
+
read
|
346
|
+
e1[0].addEps(e1[1])
|
347
|
+
# e1[0].generatePDF("optional")
|
348
|
+
end
|
349
|
+
return e1
|
350
|
+
end
|
332
351
|
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
352
|
+
|
353
|
+
def peek(mustExist = false)
|
354
|
+
# skip over any non-linefeed whitespace
|
355
|
+
while @cursor < @script.size && " \t".index(@script[@cursor])
|
356
|
+
@cursor += 1
|
357
|
+
end
|
358
|
+
if mustExist or @cursor < @script.size
|
359
|
+
@script[@cursor]
|
360
|
+
else
|
361
|
+
nil
|
362
|
+
end
|
344
363
|
end
|
345
|
-
return e1
|
346
|
-
end
|
347
|
-
|
348
364
|
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
365
|
+
def readIf(expChar)
|
366
|
+
r = (peek == expChar)
|
367
|
+
if r
|
368
|
+
read
|
369
|
+
end
|
370
|
+
return r
|
353
371
|
end
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
372
|
+
|
373
|
+
def read(expChar = nil)
|
374
|
+
ch = peek
|
375
|
+
if ch and ((not expChar) or ch == expChar)
|
376
|
+
@cursor += 1
|
377
|
+
ch
|
378
|
+
else
|
379
|
+
abort 'Unexpected end of input'
|
380
|
+
end
|
358
381
|
end
|
359
382
|
end
|
360
383
|
|
361
|
-
|
362
|
-
r = (peek == expChar)
|
363
|
-
if r
|
364
|
-
read
|
365
|
-
end
|
366
|
-
return r
|
367
|
-
end
|
368
|
-
|
369
|
-
def read(expChar = nil)
|
370
|
-
ch = peek
|
371
|
-
if ch and ((not expChar) or ch == expChar)
|
372
|
-
@cursor += 1
|
373
|
-
ch
|
374
|
-
else
|
375
|
-
abort 'Unexpected end of input'
|
376
|
-
end
|
377
|
-
end
|
378
|
-
end
|
379
|
-
|
384
|
+
end # module ToknInternal
|