tokn 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.txt +4 -5
- data/bin/tokncompile +1 -1
- data/bin/toknprocess +10 -4
- data/lib/tokn/code_set.rb +332 -337
- data/lib/tokn/dfa.rb +187 -162
- data/lib/tokn/dfa_builder.rb +218 -220
- data/lib/tokn/range_partition.rb +205 -203
- data/lib/tokn/reg_parse.rb +336 -331
- data/lib/tokn/state.rb +267 -270
- data/lib/tokn/token_defn_parser.rb +144 -139
- data/lib/tokn/tokenizer.rb +243 -175
- data/lib/tokn/tokn_const.rb +11 -6
- data/lib/tokn/tools.rb +42 -20
- data/test/Example1.rb +50 -0
- data/test/data/compileddfa.txt +1 -0
- data/test/data/sampletext.txt +6 -1
- data/test/test.rb +17 -12
- metadata +7 -6
- data/test/simple.rb +0 -33
data/lib/tokn/dfa.rb
CHANGED
@@ -2,195 +2,220 @@ require 'json'
|
|
2
2
|
require_relative 'tools'
|
3
3
|
req('code_set state')
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
# a
|
8
|
-
#
|
9
|
-
class DFA
|
10
|
-
|
11
|
-
include Tokn
|
12
|
-
|
13
|
-
# Compile a Tokenizer DFA from a token definition script.
|
14
|
-
# If persistPath is not null, it first checks if the file exists and
|
15
|
-
# if so, assumes it contains (in JSON form) a previously compiled
|
16
|
-
# DFA matching this script, and reads the DFA from it.
|
17
|
-
# Second, if no such file exists, it writes the DFA to it after compilation.
|
5
|
+
module Tokn
|
6
|
+
|
7
|
+
# A DFA for tokenizing; includes pointer to a start state, and
|
8
|
+
# a list of token names
|
18
9
|
#
|
19
|
-
|
10
|
+
class DFA
|
20
11
|
|
21
|
-
|
22
|
-
return extractDFA(readTextFile(persistPath))
|
23
|
-
end
|
24
|
-
|
25
|
-
req('token_defn_parser')
|
12
|
+
include ToknInternal
|
26
13
|
|
27
|
-
|
28
|
-
|
14
|
+
# Compile a Tokenizer DFA from a token definition script.
|
15
|
+
# If persistPath is not null, it first checks if the file exists and
|
16
|
+
# if so, assumes it contains (in JSON form) a previously compiled
|
17
|
+
# DFA matching this script, and reads the DFA from it.
|
18
|
+
# Second, if no such file exists, it writes the DFA to it after compilation.
|
19
|
+
#
|
20
|
+
def self.from_script(script, persistPath = nil)
|
21
|
+
|
22
|
+
if persistPath and File.exist?(persistPath)
|
23
|
+
return extractDFA(readTextFile(persistPath))
|
24
|
+
end
|
25
|
+
|
26
|
+
req('token_defn_parser')
|
27
|
+
|
28
|
+
td = TokenDefParser.new(script)
|
29
|
+
dfa = td.dfa
|
30
|
+
|
31
|
+
if persistPath
|
32
|
+
writeTextFile(persistPath, dfa.serialize())
|
33
|
+
end
|
29
34
|
|
30
|
-
|
31
|
-
writeTextFile(persistPath, dfa.serialize())
|
35
|
+
dfa
|
32
36
|
end
|
33
|
-
|
34
|
-
dfa
|
35
|
-
end
|
36
|
-
|
37
|
-
# Similar to dfa_from_script, but reads the script into memory from
|
38
|
-
# the file at scriptPath.
|
39
|
-
#
|
40
|
-
def self.dfa_from_script_file(scriptPath, persistPath = nil)
|
41
|
-
self.dfa_from_script(readTextFile(scriptPath), persistPath)
|
42
|
-
end
|
43
|
-
|
44
|
-
# Compile a Tokenizer DFA from a text file (that contains a
|
45
|
-
# JSON string)
|
46
|
-
#
|
47
|
-
def self.dfa_from_file(path)
|
48
|
-
dfa_from_json(readTextFile(path))
|
49
|
-
end
|
50
|
-
|
51
|
-
# Compile a Tokenizer DFA from a JSON string
|
52
|
-
#
|
53
|
-
def self.dfa_from_json(jsonStr)
|
54
|
-
db = false
|
55
37
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
stateInfo = h["states"]
|
62
|
-
|
63
|
-
!db|| pr("tokens=%s\n",d(tNames))
|
64
|
-
!db|| pr("stateInfo=\n%s\n",d(stateInfo))
|
65
|
-
|
66
|
-
st = []
|
67
|
-
stateInfo.each_with_index do |(key,val),i|
|
68
|
-
!db|| pr(" creating new state, id=%d\n",i)
|
69
|
-
st.push(State.new(i))
|
38
|
+
# Similar to from_script, but reads the script into memory from
|
39
|
+
# the file at scriptPath.
|
40
|
+
#
|
41
|
+
def self.from_script_file(scriptPath, persistPath = nil)
|
42
|
+
self.from_script(readTextFile(scriptPath), persistPath)
|
70
43
|
end
|
71
44
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
edgeList.each do |edge|
|
78
|
-
label,destState = edge
|
79
|
-
cr = CodeSet.new()
|
80
|
-
cr.setArray(label)
|
81
|
-
s.addEdge(cr, st[destState])
|
82
|
-
end
|
45
|
+
# Compile a Tokenizer DFA from a text file (that contains a
|
46
|
+
# JSON string)
|
47
|
+
#
|
48
|
+
def self.from_file(path)
|
49
|
+
from_json(readTextFile(path))
|
83
50
|
end
|
84
51
|
|
85
|
-
DFA
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
# Returns <UNKNOWN> if its id is UNKNOWN_TOKEN, or <EOF> if
|
100
|
-
# the tokenId is nil. Otherwise, assumes tokenId is 0..n, where
|
101
|
-
# n is the number of token names in the DFA.
|
102
|
-
#
|
103
|
-
def tokenName(tokenId)
|
104
|
-
if !tokenId
|
105
|
-
nm = "<EOF>"
|
106
|
-
elsif tokenId == UNKNOWN_TOKEN
|
107
|
-
nm = "<UNKNOWN>"
|
108
|
-
else
|
109
|
-
if tokenId < 0 || tokenId >= tokenNames.size
|
110
|
-
raise IndexError, "No such token id: "+tokenId.to_s
|
52
|
+
# Compile a Tokenizer DFA from a JSON string
|
53
|
+
#
|
54
|
+
def self.from_json(jsonStr)
|
55
|
+
db = false
|
56
|
+
|
57
|
+
!db|| pr("\n\nextractDFA %s...\n",jsonStr)
|
58
|
+
|
59
|
+
h = JSON.parse(jsonStr)
|
60
|
+
|
61
|
+
version = h["version"]
|
62
|
+
|
63
|
+
if !version || version.floor != VERSION.floor
|
64
|
+
raise ArgumentError,
|
65
|
+
"Bad or missing version number: "+version.to_s+", expected "+VERSION.to_s
|
111
66
|
end
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
h["tokens"] = tokenNames
|
141
|
-
|
142
|
-
stateSet,_,_ = startState.reachableStates
|
67
|
+
|
68
|
+
tNames = h["tokens"]
|
69
|
+
stateInfo = h["states"]
|
70
|
+
|
71
|
+
!db|| pr("tokens=%s\n",d(tNames))
|
72
|
+
!db|| pr("stateInfo=\n%s\n",d(stateInfo))
|
73
|
+
|
74
|
+
st = []
|
75
|
+
stateInfo.each_with_index do |(key,val),i|
|
76
|
+
!db|| pr(" creating new state, id=%d\n",i)
|
77
|
+
st.push(State.new(i))
|
78
|
+
end
|
79
|
+
|
80
|
+
st.each do |s|
|
81
|
+
!db|| pr("proc state %s\n",d(s))
|
82
|
+
|
83
|
+
finalState, edgeList = stateInfo[s.id]
|
84
|
+
s.finalState = finalState
|
85
|
+
edgeList.each do |edge|
|
86
|
+
label,destState = edge
|
87
|
+
cr = CodeSet.new()
|
88
|
+
cr.setArray(label)
|
89
|
+
s.addEdge(cr, st[destState])
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
DFA.new(tNames, st[0])
|
143
94
|
|
144
|
-
idToStateMap = {}
|
145
|
-
stateSet.each do |st|
|
146
|
-
idToStateMap[st.id] = st
|
147
95
|
end
|
148
96
|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
97
|
+
attr_reader :startState, :tokenNames
|
98
|
+
|
99
|
+
# Construct a DFA, given a list of token names and a starting state.
|
100
|
+
#
|
101
|
+
def initialize(tokenNameList, startState)
|
102
|
+
|
103
|
+
if (startState.id != 0)
|
104
|
+
raise ArgumentError, "Start state id must be zero"
|
105
|
+
end
|
106
|
+
|
107
|
+
@tokenNames = tokenNameList
|
108
|
+
@startState = startState
|
109
|
+
@tokenIdMap = {}
|
110
|
+
@tokenNames.each_with_index do |name, i|
|
111
|
+
@tokenIdMap[name] = i
|
155
112
|
end
|
156
|
-
nextId += 1
|
157
113
|
|
158
|
-
stateList.push(st)
|
159
114
|
end
|
160
115
|
|
161
|
-
|
162
|
-
|
116
|
+
# Determine the name of a token, given its id.
|
117
|
+
# Returns <UNKNOWN> if its id is UNKNOWN_TOKEN, or <EOF> if
|
118
|
+
# the tokenId is nil. Otherwise, assumes tokenId is 0 ... n-1, where
|
119
|
+
# n is the number of token names in the DFA.
|
120
|
+
#
|
121
|
+
def tokenName(tokenId)
|
122
|
+
if !tokenId
|
123
|
+
nm = "<EOF>"
|
124
|
+
elsif tokenId == UNKNOWN_TOKEN
|
125
|
+
nm = "<UNKNOWN>"
|
126
|
+
else
|
127
|
+
if tokenId < 0 || tokenId >= tokenNames.size
|
128
|
+
raise IndexError, "No such token id: "+tokenId.to_s
|
129
|
+
end
|
130
|
+
nm = tokenNames[tokenId]
|
131
|
+
end
|
132
|
+
nm
|
163
133
|
end
|
164
134
|
|
165
|
-
|
166
|
-
|
135
|
+
# Get id of token given its name
|
136
|
+
# @param tokenName name of token
|
137
|
+
# @return nil if there is no token with that name
|
138
|
+
#
|
139
|
+
def tokenId(tokenName)
|
140
|
+
@tokenIdMap[tokenName]
|
167
141
|
end
|
168
142
|
|
169
|
-
|
170
|
-
|
171
|
-
|
143
|
+
# Serialize this DFA to a JSON string.
|
144
|
+
# The DFA in JSON form has this structure:
|
145
|
+
#
|
146
|
+
# {
|
147
|
+
# "version" => version number (float)
|
148
|
+
# "tokens" => array of token names (strings)
|
149
|
+
# "states" => array of states, ordered by id (0,1,..)
|
150
|
+
# }
|
151
|
+
#
|
152
|
+
# Each state has this format:
|
153
|
+
# [ finalState (boolean),
|
154
|
+
# [edge0, edge1, ...]
|
155
|
+
# ]
|
156
|
+
#
|
157
|
+
# Edge:
|
158
|
+
# [label, destination id (integer)]
|
159
|
+
#
|
160
|
+
# Labels are arrays of integers, exactly the structure of
|
161
|
+
# a CodeSet array.
|
162
|
+
#
|
163
|
+
def serialize
|
164
|
+
|
165
|
+
h = {"version"=>VERSION, "tokens"=>tokenNames}
|
166
|
+
|
167
|
+
|
168
|
+
stateSet,_,_ = startState.reachableStates
|
169
|
+
|
170
|
+
idToStateMap = {}
|
171
|
+
stateSet.each{ |st| idToStateMap[st.id] = st }
|
172
|
+
|
173
|
+
stateList = []
|
174
|
+
|
175
|
+
nextId = 0
|
176
|
+
idToStateMap.each_pair do |id, st|
|
177
|
+
if nextId != id
|
178
|
+
raise ArgumentError, "unexpected state ids"
|
179
|
+
end
|
180
|
+
nextId += 1
|
181
|
+
|
182
|
+
stateList.push(st)
|
183
|
+
end
|
184
|
+
|
185
|
+
if stateList.size == 0
|
186
|
+
raise ArgumentError, "bad states"
|
187
|
+
end
|
188
|
+
|
189
|
+
if stateList[0] != startState
|
190
|
+
raise ArgumentError, "bad start state"
|
191
|
+
end
|
192
|
+
|
193
|
+
stateInfo = []
|
194
|
+
stateList.each do |st|
|
195
|
+
stateInfo.push(stateToList(st))
|
196
|
+
end
|
197
|
+
h["states"] = stateInfo
|
198
|
+
|
199
|
+
JSON.generate(h)
|
172
200
|
end
|
173
|
-
h["states"] = stateInfo
|
174
|
-
|
175
|
-
JSON.generate(h)
|
176
|
-
end
|
177
|
-
|
178
|
-
private
|
179
201
|
|
180
|
-
|
181
|
-
|
202
|
+
private
|
203
|
+
|
204
|
+
VERSION = 1.0
|
182
205
|
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
206
|
+
def stateToList(state)
|
207
|
+
list = [state.finalState?]
|
208
|
+
ed = []
|
209
|
+
state.edges.each do |lbl, dest|
|
210
|
+
edInfo = [lbl.array, dest.id]
|
211
|
+
ed.push(edInfo)
|
212
|
+
end
|
213
|
+
list.push(ed)
|
214
|
+
|
215
|
+
list
|
188
216
|
end
|
189
|
-
list.push(ed)
|
190
217
|
|
191
|
-
list
|
192
218
|
end
|
193
219
|
|
194
|
-
end
|
195
|
-
|
220
|
+
end # module Tokn
|
196
221
|
|
data/lib/tokn/dfa_builder.rb
CHANGED
@@ -1,261 +1,259 @@
|
|
1
1
|
require_relative 'tools'
|
2
|
-
req('tokn_const code_set state range_partition reg_parse')
|
2
|
+
# req('tokn_const code_set state range_partition reg_parse')
|
3
|
+
req('range_partition reg_parse')
|
3
4
|
|
4
|
-
|
5
|
-
# minimal DFAs.
|
6
|
-
#
|
7
|
-
# Performs the subset construction algorithm described in
|
8
|
-
# (among other places) http://en.wikipedia.org/wiki/Powerset_construction
|
9
|
-
#
|
10
|
-
# Also implements an innovative algorithm to partition a set of
|
11
|
-
# edge labels into a set that has the property that no two elements
|
12
|
-
# have overlapping regions. This allows us to perform the subset construction
|
13
|
-
# (and closure operations) efficiently while supporting large possible character
|
14
|
-
# sets (e.g., unicode, which ranges from 0..0x10ffff). See RangePartition.rb
|
15
|
-
# for more details.
|
16
|
-
#
|
17
|
-
class DFABuilder
|
18
|
-
|
19
|
-
include Tokn
|
5
|
+
module ToknInternal
|
20
6
|
|
21
|
-
|
22
|
-
#
|
23
|
-
#
|
24
|
-
# @param startState the start state of the NFA
|
25
|
-
# @param db if true, generates PDF files for debug purposes, showing various
|
26
|
-
# steps of the procedure
|
7
|
+
# Converts NFAs (nondeterministic, finite state automata) to
|
8
|
+
# minimal DFAs.
|
27
9
|
#
|
28
|
-
|
29
|
-
|
30
|
-
!db || startState.generatePDF("original_nfa")
|
31
|
-
|
32
|
-
# Reverse this NFA, convert to DFA, then
|
33
|
-
# reverse it, and convert it again. Apparently this
|
34
|
-
# produces a minimal DFA.
|
35
|
-
|
36
|
-
rev = startState.reverseNFA()
|
37
|
-
!db || rev.generatePDF("reversed_nfa")
|
38
|
-
|
39
|
-
bld = DFABuilder.new(rev)
|
40
|
-
dfa = bld.build(true, false) # partition, but don't normalize
|
41
|
-
|
42
|
-
!db || dfa.generatePDF("reversed_dfa")
|
43
|
-
|
44
|
-
rev2 = dfa.reverseNFA()
|
45
|
-
bld = DFABuilder.new(rev2)
|
46
|
-
|
47
|
-
# Don't regenerate the partition; it is still valid
|
48
|
-
# for this second build process
|
49
|
-
#
|
50
|
-
dfa = bld.build(false, true) # don't partition, but do normalize
|
51
|
-
|
52
|
-
# If there are edges that contain more than one token identifier,
|
53
|
-
# remove all but the first (i.e. the one with the highest token id)
|
54
|
-
|
55
|
-
stSet, _, _ = dfa.reachableStates
|
56
|
-
stSet.each do |s|
|
57
|
-
s.edges.each do |lbl, dest|
|
58
|
-
a = lbl.array
|
59
|
-
if !a.size
|
60
|
-
next
|
61
|
-
end
|
62
|
-
|
63
|
-
primeId = a[0]
|
64
|
-
|
65
|
-
if primeId >= EPSILON-1
|
66
|
-
next
|
67
|
-
end
|
68
|
-
|
69
|
-
lbl.difference!(CodeSet.new(primeId+1, EPSILON))
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
!db || dfa.generatePDF("minimal_dfa")
|
74
|
-
|
75
|
-
dfa
|
76
|
-
end
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
# Constructs a builder object
|
10
|
+
# Performs the subset construction algorithm described in
|
11
|
+
# (among other places) http://en.wikipedia.org/wiki/Powerset_construction
|
81
12
|
#
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
nfas, _, _ = @nfaStart.reachableStates
|
89
|
-
nfas.each {|s| @nfaStateMap[s.id] = s}
|
90
|
-
|
91
|
-
# Initialize an array of nfa state lists, indexed by dfa state id
|
92
|
-
@nfaStateLists = []
|
93
|
-
|
94
|
-
# Map of existing DFA states; key is array of NFA state ids
|
95
|
-
@dfaStateMap = {}
|
96
|
-
end
|
97
|
-
|
98
|
-
# Perform the build algorithm
|
99
|
-
#
|
100
|
-
# @param partition if true, partitions the edge labels into disjoint code sets
|
101
|
-
# @param normalize if true, normalizes the states afterward
|
13
|
+
# Also implements an innovative algorithm to partition a set of
|
14
|
+
# edge labels into a set that has the property that no two elements
|
15
|
+
# have overlapping regions. This allows us to perform the subset construction
|
16
|
+
# (and closure operations) efficiently while supporting large possible character
|
17
|
+
# sets (e.g., unicode, which ranges from 0..0x10ffff). See RangePartition.rb
|
18
|
+
# for more details.
|
102
19
|
#
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
20
|
+
class DFABuilder
|
21
|
+
|
22
|
+
# Convert an NFA to a DFA.
|
23
|
+
#
|
24
|
+
# @param startState the start state of the NFA
|
25
|
+
# @param db if true, generates PDF files for debug purposes, showing various
|
26
|
+
# steps of the procedure
|
27
|
+
#
|
28
|
+
def self.nfa_to_dfa(startState, db = false)
|
29
|
+
|
30
|
+
!db || startState.generatePDF("original_nfa")
|
31
|
+
|
32
|
+
# Reverse this NFA, convert to DFA, then
|
33
|
+
# reverse it, and convert it again. Apparently this
|
34
|
+
# produces a minimal DFA.
|
35
|
+
|
36
|
+
rev = startState.reverseNFA()
|
37
|
+
!db || rev.generatePDF("reversed_nfa")
|
38
|
+
|
39
|
+
bld = DFABuilder.new(rev)
|
40
|
+
dfa = bld.build(true, false) # partition, but don't normalize
|
41
|
+
|
42
|
+
!db || dfa.generatePDF("reversed_dfa")
|
120
43
|
|
121
|
-
|
44
|
+
rev2 = dfa.reverseNFA()
|
45
|
+
bld = DFABuilder.new(rev2)
|
122
46
|
|
123
|
-
#
|
124
|
-
|
47
|
+
# Don't regenerate the partition; it is still valid
|
48
|
+
# for this second build process
|
49
|
+
#
|
50
|
+
dfa = bld.build(false, true) # don't partition, but do normalize
|
125
51
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
52
|
+
# If there are edges that contain more than one token identifier,
|
53
|
+
# remove all but the first (i.e. the one with the highest token id)
|
54
|
+
|
55
|
+
stSet, _, _ = dfa.reachableStates
|
56
|
+
stSet.each do |s|
|
57
|
+
s.edges.each do |lbl, dest|
|
58
|
+
a = lbl.array
|
59
|
+
if !a.size
|
130
60
|
next
|
131
61
|
end
|
132
62
|
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
nfaStates.add(dest)
|
63
|
+
primeId = a[0]
|
64
|
+
|
65
|
+
next if primeId >= EPSILON-1
|
66
|
+
|
67
|
+
lbl.difference!(CodeSet.new(primeId+1, EPSILON))
|
139
68
|
end
|
140
69
|
end
|
141
70
|
|
142
|
-
|
143
|
-
# May be better to test if already in set before calc closure; or simply has closure
|
144
|
-
epsClosure(nfaStates)
|
145
|
-
dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
|
146
|
-
if isNew
|
147
|
-
unmarked.push(dfaDestState)
|
148
|
-
end
|
149
|
-
dfaState.addEdge(charRange, dfaDestState)
|
150
|
-
end
|
71
|
+
!db || dfa.generatePDF("minimal_dfa")
|
151
72
|
|
73
|
+
dfa
|
152
74
|
end
|
153
75
|
|
154
|
-
if normalize
|
155
|
-
!db || @dfaStart.generatePDF("prior_normalize")
|
156
|
-
|
157
|
-
!db || pr("Normalizing states for:\n\n%s\n",State.dumpNFA(@dfaStart))
|
158
|
-
State.normalizeStates(@dfaStart)
|
159
|
-
!db || pr("After normalizing:\n\n%s\n",State.dumpNFA(@dfaStart))
|
160
|
-
!db || @dfaStart.generatePDF("post_normalize")
|
161
|
-
end
|
162
76
|
|
163
|
-
@dfaStart
|
164
|
-
end
|
165
|
-
|
166
|
-
private
|
167
|
-
|
168
|
-
# Adds a DFA state for a set of NFA states, if one doesn't already exist
|
169
|
-
# for the set
|
170
|
-
# @param nfaStateList a sorted array of NFA state ids
|
171
|
-
# @return a pair [DFA State,
|
172
|
-
# created flag (boolean): true if this did not already exist]
|
173
|
-
#
|
174
|
-
def createDFAState(nfaStateList)
|
175
|
-
|
176
|
-
lst = nfaStateList
|
177
77
|
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
78
|
+
# Constructs a builder object
|
79
|
+
#
|
80
|
+
def initialize(nfaStartState)
|
81
|
+
@nextId = 0
|
82
|
+
@nfaStart = nfaStartState
|
182
83
|
|
183
|
-
#
|
184
|
-
|
84
|
+
# Build a map of nfa state ids => nfa states
|
85
|
+
@nfaStateMap = {}
|
86
|
+
nfas, _, _ = @nfaStart.reachableStates
|
87
|
+
nfas.each {|s| @nfaStateMap[s.id] = s}
|
185
88
|
|
186
|
-
|
187
|
-
|
188
|
-
# (useful for debugging)
|
189
|
-
newState.label = lst.map {|x| x.to_s}.join(' ')
|
190
|
-
end
|
89
|
+
# Initialize an array of nfa state lists, indexed by dfa state id
|
90
|
+
@nfaStateLists = []
|
191
91
|
|
192
|
-
|
193
|
-
@
|
194
|
-
@nfaStateLists.push(lst)
|
195
|
-
|
92
|
+
# Map of existing DFA states; key is array of NFA state ids
|
93
|
+
@dfaStateMap = {}
|
196
94
|
end
|
197
|
-
return [newState,isNewState]
|
198
|
-
end
|
199
95
|
|
200
|
-
|
201
|
-
|
202
|
-
|
96
|
+
# Perform the build algorithm
|
97
|
+
#
|
98
|
+
# @param partition if true, partitions the edge labels into disjoint code sets
|
99
|
+
# @param normalize if true, normalizes the states afterward
|
100
|
+
#
|
101
|
+
def build(partition = true, normalize = true)
|
102
|
+
db = false
|
103
|
+
|
104
|
+
!partition || partitionEdges(@nfaStart)
|
105
|
+
|
106
|
+
iset = Set.new
|
107
|
+
iset.add(@nfaStart)
|
108
|
+
epsClosure(iset)
|
109
|
+
|
110
|
+
@dfaStart,_ = createDFAState(stateSetToIdArray(iset))
|
111
|
+
|
112
|
+
markedStates = Set.new
|
113
|
+
|
114
|
+
unmarked = [@dfaStart]
|
203
115
|
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
116
|
+
until unmarked.empty?
|
117
|
+
dfaState = unmarked.pop
|
118
|
+
|
119
|
+
nfaIds = @nfaStateLists[dfaState.id]
|
120
|
+
|
121
|
+
# map of CodeSet => set of NFA states
|
122
|
+
moveMap = {}
|
123
|
+
|
124
|
+
nfaIds.each do |nfaId|
|
125
|
+
nfaState = @nfaStateMap[nfaId]
|
126
|
+
nfaState.edges.each do |lbl,dest|
|
127
|
+
if lbl.array[0] == EPSILON
|
128
|
+
next
|
129
|
+
end
|
130
|
+
|
131
|
+
nfaStates = moveMap[lbl]
|
132
|
+
if !nfaStates
|
133
|
+
nfaStates = Set.new
|
134
|
+
moveMap[lbl] = nfaStates
|
135
|
+
end
|
136
|
+
nfaStates.add(dest)
|
215
137
|
end
|
216
138
|
end
|
139
|
+
|
140
|
+
moveMap.each_pair do |charRange,nfaStates|
|
141
|
+
# May be better to test if already in set before calc closure; or simply has closure
|
142
|
+
epsClosure(nfaStates)
|
143
|
+
dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
|
144
|
+
if isNew
|
145
|
+
unmarked.push(dfaDestState)
|
146
|
+
end
|
147
|
+
dfaState.addEdge(charRange, dfaDestState)
|
148
|
+
end
|
149
|
+
|
217
150
|
end
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
par = RangePartition.new
|
151
|
+
|
152
|
+
if normalize
|
153
|
+
!db || @dfaStart.generatePDF("prior_normalize")
|
154
|
+
|
155
|
+
!db || pr("Normalizing states for:\n\n%s\n",State.dumpNFA(@dfaStart))
|
156
|
+
State.normalizeStates(@dfaStart)
|
157
|
+
!db || pr("After normalizing:\n\n%s\n",State.dumpNFA(@dfaStart))
|
158
|
+
!db || @dfaStart.generatePDF("post_normalize")
|
159
|
+
end
|
160
|
+
|
161
|
+
@dfaStart
|
162
|
+
end
|
231
163
|
|
232
|
-
|
164
|
+
private
|
233
165
|
|
234
|
-
|
235
|
-
|
166
|
+
# Adds a DFA state for a set of NFA states, if one doesn't already exist
|
167
|
+
# for the set
|
168
|
+
# @param nfaStateList a sorted array of NFA state ids
|
169
|
+
# @return a pair [DFA State,
|
170
|
+
# created flag (boolean): true if this did not already exist]
|
171
|
+
#
|
172
|
+
def createDFAState(nfaStateList)
|
173
|
+
|
174
|
+
lst = nfaStateList
|
175
|
+
|
176
|
+
newState = @nfaStateMap[lst]
|
177
|
+
isNewState = !newState
|
178
|
+
if isNewState
|
179
|
+
newState = State.new(@nextId)
|
180
|
+
|
181
|
+
# Determine if any of the NFA states were final states
|
182
|
+
newState.finalState = nfaStateList.any?{|id| @nfaStateMap[id].finalState?}
|
183
|
+
|
184
|
+
if false
|
185
|
+
# Set label of DFA state to show which NFA states produced it
|
186
|
+
# (useful for debugging)
|
187
|
+
newState.label = lst.map {|x| x.to_s}.join(' ')
|
188
|
+
end
|
189
|
+
|
190
|
+
@nextId += 1
|
191
|
+
@nfaStateMap[lst] = newState
|
192
|
+
@nfaStateLists.push(lst)
|
193
|
+
|
194
|
+
end
|
195
|
+
return [newState,isNewState]
|
236
196
|
end
|
237
197
|
|
238
|
-
|
198
|
+
def stateSetToIdArray(s)
|
199
|
+
s.to_a.map {|x| x.id}.sort
|
200
|
+
end
|
239
201
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
202
|
+
# Calculate the epsilon closure of a set of NFA states
|
203
|
+
# @return a set of states
|
204
|
+
#
|
205
|
+
def epsClosure(stateSet)
|
206
|
+
stk = stateSet.to_a
|
207
|
+
while !stk.empty?
|
208
|
+
s = stk.pop
|
209
|
+
s.edges.each do |lbl,dest|
|
210
|
+
if lbl.contains? EPSILON
|
211
|
+
if stateSet.add?(dest)
|
212
|
+
stk.push(dest)
|
213
|
+
end
|
214
|
+
end
|
215
|
+
end
|
246
216
|
end
|
247
|
-
|
217
|
+
stateSet
|
218
|
+
end
|
219
|
+
|
220
|
+
# Modify edges so each is labelled with a disjoint subset
|
221
|
+
# of characters. See the notes at the start of this class,
|
222
|
+
# as well as RangePartition.rb.
|
223
|
+
#
|
224
|
+
def partitionEdges(startState)
|
225
|
+
|
226
|
+
db = false
|
227
|
+
|
228
|
+
par = RangePartition.new
|
229
|
+
|
230
|
+
stateSet, _, _ = startState.reachableStates
|
231
|
+
|
232
|
+
stateSet.each do |s|
|
233
|
+
s.edges.each {|lbl,dest| par.addSet(lbl) }
|
234
|
+
end
|
235
|
+
|
236
|
+
par.prepare
|
248
237
|
|
249
|
-
|
250
|
-
|
251
|
-
s.
|
238
|
+
stateSet.each do |s|
|
239
|
+
newEdges = []
|
240
|
+
s.edges.each do |lbl, dest|
|
241
|
+
!db||pr(" old edge: %s => %s\n",d(lbl),d(dest.name))
|
242
|
+
newLbls = par.apply(lbl)
|
243
|
+
newLbls.each {|x| newEdges.push([x, dest]) }
|
244
|
+
end
|
245
|
+
s.clearEdges()
|
246
|
+
|
247
|
+
newEdges.each do |lbl,dest|
|
248
|
+
!db||pr(" new edge: %s => %s\n",d(lbl),d(dest.name))
|
249
|
+
s.addEdge(lbl,dest)
|
250
|
+
end
|
251
|
+
!db||pr("\n")
|
252
252
|
end
|
253
|
-
|
253
|
+
|
254
254
|
end
|
255
|
+
|
255
256
|
|
256
257
|
end
|
257
258
|
|
258
|
-
|
259
|
-
end
|
260
|
-
|
261
|
-
|
259
|
+
end # module ToknInternal
|