tokn 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.txt +4 -5
- data/bin/tokncompile +1 -1
- data/bin/toknprocess +10 -4
- data/lib/tokn/code_set.rb +332 -337
- data/lib/tokn/dfa.rb +187 -162
- data/lib/tokn/dfa_builder.rb +218 -220
- data/lib/tokn/range_partition.rb +205 -203
- data/lib/tokn/reg_parse.rb +336 -331
- data/lib/tokn/state.rb +267 -270
- data/lib/tokn/token_defn_parser.rb +144 -139
- data/lib/tokn/tokenizer.rb +243 -175
- data/lib/tokn/tokn_const.rb +11 -6
- data/lib/tokn/tools.rb +42 -20
- data/test/Example1.rb +50 -0
- data/test/data/compileddfa.txt +1 -0
- data/test/data/sampletext.txt +6 -1
- data/test/test.rb +17 -12
- metadata +7 -6
- data/test/simple.rb +0 -33
data/lib/tokn/dfa.rb
CHANGED
@@ -2,195 +2,220 @@ require 'json'
|
|
2
2
|
require_relative 'tools'
|
3
3
|
req('code_set state')
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
# a
|
8
|
-
#
|
9
|
-
class DFA
|
10
|
-
|
11
|
-
include Tokn
|
12
|
-
|
13
|
-
# Compile a Tokenizer DFA from a token definition script.
|
14
|
-
# If persistPath is not null, it first checks if the file exists and
|
15
|
-
# if so, assumes it contains (in JSON form) a previously compiled
|
16
|
-
# DFA matching this script, and reads the DFA from it.
|
17
|
-
# Second, if no such file exists, it writes the DFA to it after compilation.
|
5
|
+
module Tokn
|
6
|
+
|
7
|
+
# A DFA for tokenizing; includes pointer to a start state, and
|
8
|
+
# a list of token names
|
18
9
|
#
|
19
|
-
|
10
|
+
class DFA
|
20
11
|
|
21
|
-
|
22
|
-
return extractDFA(readTextFile(persistPath))
|
23
|
-
end
|
24
|
-
|
25
|
-
req('token_defn_parser')
|
12
|
+
include ToknInternal
|
26
13
|
|
27
|
-
|
28
|
-
|
14
|
+
# Compile a Tokenizer DFA from a token definition script.
|
15
|
+
# If persistPath is not null, it first checks if the file exists and
|
16
|
+
# if so, assumes it contains (in JSON form) a previously compiled
|
17
|
+
# DFA matching this script, and reads the DFA from it.
|
18
|
+
# Second, if no such file exists, it writes the DFA to it after compilation.
|
19
|
+
#
|
20
|
+
def self.from_script(script, persistPath = nil)
|
21
|
+
|
22
|
+
if persistPath and File.exist?(persistPath)
|
23
|
+
return extractDFA(readTextFile(persistPath))
|
24
|
+
end
|
25
|
+
|
26
|
+
req('token_defn_parser')
|
27
|
+
|
28
|
+
td = TokenDefParser.new(script)
|
29
|
+
dfa = td.dfa
|
30
|
+
|
31
|
+
if persistPath
|
32
|
+
writeTextFile(persistPath, dfa.serialize())
|
33
|
+
end
|
29
34
|
|
30
|
-
|
31
|
-
writeTextFile(persistPath, dfa.serialize())
|
35
|
+
dfa
|
32
36
|
end
|
33
|
-
|
34
|
-
dfa
|
35
|
-
end
|
36
|
-
|
37
|
-
# Similar to dfa_from_script, but reads the script into memory from
|
38
|
-
# the file at scriptPath.
|
39
|
-
#
|
40
|
-
def self.dfa_from_script_file(scriptPath, persistPath = nil)
|
41
|
-
self.dfa_from_script(readTextFile(scriptPath), persistPath)
|
42
|
-
end
|
43
|
-
|
44
|
-
# Compile a Tokenizer DFA from a text file (that contains a
|
45
|
-
# JSON string)
|
46
|
-
#
|
47
|
-
def self.dfa_from_file(path)
|
48
|
-
dfa_from_json(readTextFile(path))
|
49
|
-
end
|
50
|
-
|
51
|
-
# Compile a Tokenizer DFA from a JSON string
|
52
|
-
#
|
53
|
-
def self.dfa_from_json(jsonStr)
|
54
|
-
db = false
|
55
37
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
stateInfo = h["states"]
|
62
|
-
|
63
|
-
!db|| pr("tokens=%s\n",d(tNames))
|
64
|
-
!db|| pr("stateInfo=\n%s\n",d(stateInfo))
|
65
|
-
|
66
|
-
st = []
|
67
|
-
stateInfo.each_with_index do |(key,val),i|
|
68
|
-
!db|| pr(" creating new state, id=%d\n",i)
|
69
|
-
st.push(State.new(i))
|
38
|
+
# Similar to from_script, but reads the script into memory from
|
39
|
+
# the file at scriptPath.
|
40
|
+
#
|
41
|
+
def self.from_script_file(scriptPath, persistPath = nil)
|
42
|
+
self.from_script(readTextFile(scriptPath), persistPath)
|
70
43
|
end
|
71
44
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
edgeList.each do |edge|
|
78
|
-
label,destState = edge
|
79
|
-
cr = CodeSet.new()
|
80
|
-
cr.setArray(label)
|
81
|
-
s.addEdge(cr, st[destState])
|
82
|
-
end
|
45
|
+
# Compile a Tokenizer DFA from a text file (that contains a
|
46
|
+
# JSON string)
|
47
|
+
#
|
48
|
+
def self.from_file(path)
|
49
|
+
from_json(readTextFile(path))
|
83
50
|
end
|
84
51
|
|
85
|
-
DFA
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
# Returns <UNKNOWN> if its id is UNKNOWN_TOKEN, or <EOF> if
|
100
|
-
# the tokenId is nil. Otherwise, assumes tokenId is 0..n, where
|
101
|
-
# n is the number of token names in the DFA.
|
102
|
-
#
|
103
|
-
def tokenName(tokenId)
|
104
|
-
if !tokenId
|
105
|
-
nm = "<EOF>"
|
106
|
-
elsif tokenId == UNKNOWN_TOKEN
|
107
|
-
nm = "<UNKNOWN>"
|
108
|
-
else
|
109
|
-
if tokenId < 0 || tokenId >= tokenNames.size
|
110
|
-
raise IndexError, "No such token id: "+tokenId.to_s
|
52
|
+
# Compile a Tokenizer DFA from a JSON string
|
53
|
+
#
|
54
|
+
def self.from_json(jsonStr)
|
55
|
+
db = false
|
56
|
+
|
57
|
+
!db|| pr("\n\nextractDFA %s...\n",jsonStr)
|
58
|
+
|
59
|
+
h = JSON.parse(jsonStr)
|
60
|
+
|
61
|
+
version = h["version"]
|
62
|
+
|
63
|
+
if !version || version.floor != VERSION.floor
|
64
|
+
raise ArgumentError,
|
65
|
+
"Bad or missing version number: "+version.to_s+", expected "+VERSION.to_s
|
111
66
|
end
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
h["tokens"] = tokenNames
|
141
|
-
|
142
|
-
stateSet,_,_ = startState.reachableStates
|
67
|
+
|
68
|
+
tNames = h["tokens"]
|
69
|
+
stateInfo = h["states"]
|
70
|
+
|
71
|
+
!db|| pr("tokens=%s\n",d(tNames))
|
72
|
+
!db|| pr("stateInfo=\n%s\n",d(stateInfo))
|
73
|
+
|
74
|
+
st = []
|
75
|
+
stateInfo.each_with_index do |(key,val),i|
|
76
|
+
!db|| pr(" creating new state, id=%d\n",i)
|
77
|
+
st.push(State.new(i))
|
78
|
+
end
|
79
|
+
|
80
|
+
st.each do |s|
|
81
|
+
!db|| pr("proc state %s\n",d(s))
|
82
|
+
|
83
|
+
finalState, edgeList = stateInfo[s.id]
|
84
|
+
s.finalState = finalState
|
85
|
+
edgeList.each do |edge|
|
86
|
+
label,destState = edge
|
87
|
+
cr = CodeSet.new()
|
88
|
+
cr.setArray(label)
|
89
|
+
s.addEdge(cr, st[destState])
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
DFA.new(tNames, st[0])
|
143
94
|
|
144
|
-
idToStateMap = {}
|
145
|
-
stateSet.each do |st|
|
146
|
-
idToStateMap[st.id] = st
|
147
95
|
end
|
148
96
|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
97
|
+
attr_reader :startState, :tokenNames
|
98
|
+
|
99
|
+
# Construct a DFA, given a list of token names and a starting state.
|
100
|
+
#
|
101
|
+
def initialize(tokenNameList, startState)
|
102
|
+
|
103
|
+
if (startState.id != 0)
|
104
|
+
raise ArgumentError, "Start state id must be zero"
|
105
|
+
end
|
106
|
+
|
107
|
+
@tokenNames = tokenNameList
|
108
|
+
@startState = startState
|
109
|
+
@tokenIdMap = {}
|
110
|
+
@tokenNames.each_with_index do |name, i|
|
111
|
+
@tokenIdMap[name] = i
|
155
112
|
end
|
156
|
-
nextId += 1
|
157
113
|
|
158
|
-
stateList.push(st)
|
159
114
|
end
|
160
115
|
|
161
|
-
|
162
|
-
|
116
|
+
# Determine the name of a token, given its id.
|
117
|
+
# Returns <UNKNOWN> if its id is UNKNOWN_TOKEN, or <EOF> if
|
118
|
+
# the tokenId is nil. Otherwise, assumes tokenId is 0 ... n-1, where
|
119
|
+
# n is the number of token names in the DFA.
|
120
|
+
#
|
121
|
+
def tokenName(tokenId)
|
122
|
+
if !tokenId
|
123
|
+
nm = "<EOF>"
|
124
|
+
elsif tokenId == UNKNOWN_TOKEN
|
125
|
+
nm = "<UNKNOWN>"
|
126
|
+
else
|
127
|
+
if tokenId < 0 || tokenId >= tokenNames.size
|
128
|
+
raise IndexError, "No such token id: "+tokenId.to_s
|
129
|
+
end
|
130
|
+
nm = tokenNames[tokenId]
|
131
|
+
end
|
132
|
+
nm
|
163
133
|
end
|
164
134
|
|
165
|
-
|
166
|
-
|
135
|
+
# Get id of token given its name
|
136
|
+
# @param tokenName name of token
|
137
|
+
# @return nil if there is no token with that name
|
138
|
+
#
|
139
|
+
def tokenId(tokenName)
|
140
|
+
@tokenIdMap[tokenName]
|
167
141
|
end
|
168
142
|
|
169
|
-
|
170
|
-
|
171
|
-
|
143
|
+
# Serialize this DFA to a JSON string.
|
144
|
+
# The DFA in JSON form has this structure:
|
145
|
+
#
|
146
|
+
# {
|
147
|
+
# "version" => version number (float)
|
148
|
+
# "tokens" => array of token names (strings)
|
149
|
+
# "states" => array of states, ordered by id (0,1,..)
|
150
|
+
# }
|
151
|
+
#
|
152
|
+
# Each state has this format:
|
153
|
+
# [ finalState (boolean),
|
154
|
+
# [edge0, edge1, ...]
|
155
|
+
# ]
|
156
|
+
#
|
157
|
+
# Edge:
|
158
|
+
# [label, destination id (integer)]
|
159
|
+
#
|
160
|
+
# Labels are arrays of integers, exactly the structure of
|
161
|
+
# a CodeSet array.
|
162
|
+
#
|
163
|
+
def serialize
|
164
|
+
|
165
|
+
h = {"version"=>VERSION, "tokens"=>tokenNames}
|
166
|
+
|
167
|
+
|
168
|
+
stateSet,_,_ = startState.reachableStates
|
169
|
+
|
170
|
+
idToStateMap = {}
|
171
|
+
stateSet.each{ |st| idToStateMap[st.id] = st }
|
172
|
+
|
173
|
+
stateList = []
|
174
|
+
|
175
|
+
nextId = 0
|
176
|
+
idToStateMap.each_pair do |id, st|
|
177
|
+
if nextId != id
|
178
|
+
raise ArgumentError, "unexpected state ids"
|
179
|
+
end
|
180
|
+
nextId += 1
|
181
|
+
|
182
|
+
stateList.push(st)
|
183
|
+
end
|
184
|
+
|
185
|
+
if stateList.size == 0
|
186
|
+
raise ArgumentError, "bad states"
|
187
|
+
end
|
188
|
+
|
189
|
+
if stateList[0] != startState
|
190
|
+
raise ArgumentError, "bad start state"
|
191
|
+
end
|
192
|
+
|
193
|
+
stateInfo = []
|
194
|
+
stateList.each do |st|
|
195
|
+
stateInfo.push(stateToList(st))
|
196
|
+
end
|
197
|
+
h["states"] = stateInfo
|
198
|
+
|
199
|
+
JSON.generate(h)
|
172
200
|
end
|
173
|
-
h["states"] = stateInfo
|
174
|
-
|
175
|
-
JSON.generate(h)
|
176
|
-
end
|
177
|
-
|
178
|
-
private
|
179
201
|
|
180
|
-
|
181
|
-
|
202
|
+
private
|
203
|
+
|
204
|
+
VERSION = 1.0
|
182
205
|
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
206
|
+
def stateToList(state)
|
207
|
+
list = [state.finalState?]
|
208
|
+
ed = []
|
209
|
+
state.edges.each do |lbl, dest|
|
210
|
+
edInfo = [lbl.array, dest.id]
|
211
|
+
ed.push(edInfo)
|
212
|
+
end
|
213
|
+
list.push(ed)
|
214
|
+
|
215
|
+
list
|
188
216
|
end
|
189
|
-
list.push(ed)
|
190
217
|
|
191
|
-
list
|
192
218
|
end
|
193
219
|
|
194
|
-
end
|
195
|
-
|
220
|
+
end # module Tokn
|
196
221
|
|
data/lib/tokn/dfa_builder.rb
CHANGED
@@ -1,261 +1,259 @@
|
|
1
1
|
require_relative 'tools'
|
2
|
-
req('tokn_const code_set state range_partition reg_parse')
|
2
|
+
# req('tokn_const code_set state range_partition reg_parse')
|
3
|
+
req('range_partition reg_parse')
|
3
4
|
|
4
|
-
|
5
|
-
# minimal DFAs.
|
6
|
-
#
|
7
|
-
# Performs the subset construction algorithm described in
|
8
|
-
# (among other placess) http://en.wikipedia.org/wiki/Powerset_construction
|
9
|
-
#
|
10
|
-
# Also implements an innovative algorithm to partition a set of
|
11
|
-
# edge labels into a set that has the property that no two elements
|
12
|
-
# have overlapping regions. This allows us to perform the subset construction
|
13
|
-
# (and closure operations) efficiently while supporting large possible character
|
14
|
-
# sets (e.g., unicode, which ranges from 0..0x10ffff. See RangePartition.rb
|
15
|
-
# for more details.
|
16
|
-
#
|
17
|
-
class DFABuilder
|
18
|
-
|
19
|
-
include Tokn
|
5
|
+
module ToknInternal
|
20
6
|
|
21
|
-
|
22
|
-
#
|
23
|
-
#
|
24
|
-
# @param startState the start state of the NFA
|
25
|
-
# @param db if true, generates PDF files for debug purposes, showing various
|
26
|
-
# steps of the procedure
|
7
|
+
# Converts NFAs (nondeterministic, finite state automata) to
|
8
|
+
# minimal DFAs.
|
27
9
|
#
|
28
|
-
|
29
|
-
|
30
|
-
!db || startState.generatePDF("original_nfa")
|
31
|
-
|
32
|
-
# Reverse this NFA, convert to DFA, then
|
33
|
-
# reverse it, and convert it again. Apparently this
|
34
|
-
# produces a minimal DFA.
|
35
|
-
|
36
|
-
rev = startState.reverseNFA()
|
37
|
-
!db || rev.generatePDF("reversed_nfa")
|
38
|
-
|
39
|
-
bld = DFABuilder.new(rev)
|
40
|
-
dfa = bld.build(true, false) # partition, but don't normalize
|
41
|
-
|
42
|
-
!db || dfa.generatePDF("reversed_dfa")
|
43
|
-
|
44
|
-
rev2 = dfa.reverseNFA()
|
45
|
-
bld = DFABuilder.new(rev2)
|
46
|
-
|
47
|
-
# Don't regenerate the partition; it is still valid
|
48
|
-
# for this second build process
|
49
|
-
#
|
50
|
-
dfa = bld.build(false, true) # don't partition, but do normalize
|
51
|
-
|
52
|
-
# If there are edges that contain more than one token identifier,
|
53
|
-
# remove all but the first (i.e. the one with the highest token id)
|
54
|
-
|
55
|
-
stSet, _, _ = dfa.reachableStates
|
56
|
-
stSet.each do |s|
|
57
|
-
s.edges.each do |lbl, dest|
|
58
|
-
a = lbl.array
|
59
|
-
if !a.size
|
60
|
-
next
|
61
|
-
end
|
62
|
-
|
63
|
-
primeId = a[0]
|
64
|
-
|
65
|
-
if primeId >= EPSILON-1
|
66
|
-
next
|
67
|
-
end
|
68
|
-
|
69
|
-
lbl.difference!(CodeSet.new(primeId+1, EPSILON))
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
!db || dfa.generatePDF("minimal_dfa")
|
74
|
-
|
75
|
-
dfa
|
76
|
-
end
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
# Constructs a builder object
|
10
|
+
# Performs the subset construction algorithm described in
|
11
|
+
# (among other places) http://en.wikipedia.org/wiki/Powerset_construction
|
81
12
|
#
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
nfas, _, _ = @nfaStart.reachableStates
|
89
|
-
nfas.each {|s| @nfaStateMap[s.id] = s}
|
90
|
-
|
91
|
-
# Initialize an array of nfa state lists, indexed by dfa state id
|
92
|
-
@nfaStateLists = []
|
93
|
-
|
94
|
-
# Map of existing DFA states; key is array of NFA state ids
|
95
|
-
@dfaStateMap = {}
|
96
|
-
end
|
97
|
-
|
98
|
-
# Perform the build algorithm
|
99
|
-
#
|
100
|
-
# @param partition if true, partitions the edge labels into disjoint code sets
|
101
|
-
# @param normalize if true, normalizes the states afterward
|
13
|
+
# Also implements an innovative algorithm to partition a set of
|
14
|
+
# edge labels into a set that has the property that no two elements
|
15
|
+
# have overlapping regions. This allows us to perform the subset construction
|
16
|
+
# (and closure operations) efficiently while supporting large possible character
|
17
|
+
# sets (e.g., unicode, which ranges from 0..0x10ffff). See RangePartition.rb
|
18
|
+
# for more details.
|
102
19
|
#
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
20
|
+
class DFABuilder
|
21
|
+
|
22
|
+
# Convert an NFA to a DFA.
|
23
|
+
#
|
24
|
+
# @param startState the start state of the NFA
|
25
|
+
# @param db if true, generates PDF files for debug purposes, showing various
|
26
|
+
# steps of the procedure
|
27
|
+
#
|
28
|
+
def self.nfa_to_dfa(startState, db = false)
|
29
|
+
|
30
|
+
!db || startState.generatePDF("original_nfa")
|
31
|
+
|
32
|
+
# Reverse this NFA, convert to DFA, then
|
33
|
+
# reverse it, and convert it again. Apparently this
|
34
|
+
# produces a minimal DFA.
|
35
|
+
|
36
|
+
rev = startState.reverseNFA()
|
37
|
+
!db || rev.generatePDF("reversed_nfa")
|
38
|
+
|
39
|
+
bld = DFABuilder.new(rev)
|
40
|
+
dfa = bld.build(true, false) # partition, but don't normalize
|
41
|
+
|
42
|
+
!db || dfa.generatePDF("reversed_dfa")
|
120
43
|
|
121
|
-
|
44
|
+
rev2 = dfa.reverseNFA()
|
45
|
+
bld = DFABuilder.new(rev2)
|
122
46
|
|
123
|
-
#
|
124
|
-
|
47
|
+
# Don't regenerate the partition; it is still valid
|
48
|
+
# for this second build process
|
49
|
+
#
|
50
|
+
dfa = bld.build(false, true) # don't partition, but do normalize
|
125
51
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
52
|
+
# If there are edges that contain more than one token identifier,
|
53
|
+
# remove all but the first (i.e. the one with the highest token id)
|
54
|
+
|
55
|
+
stSet, _, _ = dfa.reachableStates
|
56
|
+
stSet.each do |s|
|
57
|
+
s.edges.each do |lbl, dest|
|
58
|
+
a = lbl.array
|
59
|
+
if !a.size
|
130
60
|
next
|
131
61
|
end
|
132
62
|
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
nfaStates.add(dest)
|
63
|
+
primeId = a[0]
|
64
|
+
|
65
|
+
next if primeId >= EPSILON-1
|
66
|
+
|
67
|
+
lbl.difference!(CodeSet.new(primeId+1, EPSILON))
|
139
68
|
end
|
140
69
|
end
|
141
70
|
|
142
|
-
|
143
|
-
# May be better to test if already in set before calc closure; or simply has closure
|
144
|
-
epsClosure(nfaStates)
|
145
|
-
dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
|
146
|
-
if isNew
|
147
|
-
unmarked.push(dfaDestState)
|
148
|
-
end
|
149
|
-
dfaState.addEdge(charRange, dfaDestState)
|
150
|
-
end
|
71
|
+
!db || dfa.generatePDF("minimal_dfa")
|
151
72
|
|
73
|
+
dfa
|
152
74
|
end
|
153
75
|
|
154
|
-
if normalize
|
155
|
-
!db || @dfaStart.generatePDF("prior_normalize")
|
156
|
-
|
157
|
-
!db || pr("Normalizing states for:\n\n%s\n",State.dumpNFA(@dfaStart))
|
158
|
-
State.normalizeStates(@dfaStart)
|
159
|
-
!db || pr("After normalizing:\n\n%s\n",State.dumpNFA(@dfaStart))
|
160
|
-
!db || @dfaStart.generatePDF("post_normalize")
|
161
|
-
end
|
162
76
|
|
163
|
-
@dfaStart
|
164
|
-
end
|
165
|
-
|
166
|
-
private
|
167
|
-
|
168
|
-
# Adds a DFA state for a set of NFA states, if one doesn't already exist
|
169
|
-
# for the set
|
170
|
-
# @param nfaStateList a sorted array of NFA state ids
|
171
|
-
# @return a pair [DFA State,
|
172
|
-
# created flag (boolean): true if this did not already exist]
|
173
|
-
#
|
174
|
-
def createDFAState(nfaStateList)
|
175
|
-
|
176
|
-
lst = nfaStateList
|
177
77
|
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
78
|
+
# Constructs a builder object
|
79
|
+
#
|
80
|
+
def initialize(nfaStartState)
|
81
|
+
@nextId = 0
|
82
|
+
@nfaStart = nfaStartState
|
182
83
|
|
183
|
-
#
|
184
|
-
|
84
|
+
# Build a map of nfa state ids => nfa states
|
85
|
+
@nfaStateMap = {}
|
86
|
+
nfas, _, _ = @nfaStart.reachableStates
|
87
|
+
nfas.each {|s| @nfaStateMap[s.id] = s}
|
185
88
|
|
186
|
-
|
187
|
-
|
188
|
-
# (useful for debugging)
|
189
|
-
newState.label = lst.map {|x| x.to_s}.join(' ')
|
190
|
-
end
|
89
|
+
# Initialize an array of nfa state lists, indexed by dfa state id
|
90
|
+
@nfaStateLists = []
|
191
91
|
|
192
|
-
|
193
|
-
@
|
194
|
-
@nfaStateLists.push(lst)
|
195
|
-
|
92
|
+
# Map of existing DFA states; key is array of NFA state ids
|
93
|
+
@dfaStateMap = {}
|
196
94
|
end
|
197
|
-
return [newState,isNewState]
|
198
|
-
end
|
199
95
|
|
200
|
-
|
201
|
-
|
202
|
-
|
96
|
+
# Perform the build algorithm
|
97
|
+
#
|
98
|
+
# @param partition if true, partitions the edge labels into disjoint code sets
|
99
|
+
# @param normalize if true, normalizes the states afterward
|
100
|
+
#
|
101
|
+
def build(partition = true, normalize = true)
|
102
|
+
db = false
|
103
|
+
|
104
|
+
!partition || partitionEdges(@nfaStart)
|
105
|
+
|
106
|
+
iset = Set.new
|
107
|
+
iset.add(@nfaStart)
|
108
|
+
epsClosure(iset)
|
109
|
+
|
110
|
+
@dfaStart,_ = createDFAState(stateSetToIdArray(iset))
|
111
|
+
|
112
|
+
markedStates = Set.new
|
113
|
+
|
114
|
+
unmarked = [@dfaStart]
|
203
115
|
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
116
|
+
until unmarked.empty?
|
117
|
+
dfaState = unmarked.pop
|
118
|
+
|
119
|
+
nfaIds = @nfaStateLists[dfaState.id]
|
120
|
+
|
121
|
+
# map of CodeSet => set of NFA states
|
122
|
+
moveMap = {}
|
123
|
+
|
124
|
+
nfaIds.each do |nfaId|
|
125
|
+
nfaState = @nfaStateMap[nfaId]
|
126
|
+
nfaState.edges.each do |lbl,dest|
|
127
|
+
if lbl.array[0] == EPSILON
|
128
|
+
next
|
129
|
+
end
|
130
|
+
|
131
|
+
nfaStates = moveMap[lbl]
|
132
|
+
if !nfaStates
|
133
|
+
nfaStates = Set.new
|
134
|
+
moveMap[lbl] = nfaStates
|
135
|
+
end
|
136
|
+
nfaStates.add(dest)
|
215
137
|
end
|
216
138
|
end
|
139
|
+
|
140
|
+
moveMap.each_pair do |charRange,nfaStates|
|
141
|
+
# May be better to test if already in set before calc closure; or simply has closure
|
142
|
+
epsClosure(nfaStates)
|
143
|
+
dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
|
144
|
+
if isNew
|
145
|
+
unmarked.push(dfaDestState)
|
146
|
+
end
|
147
|
+
dfaState.addEdge(charRange, dfaDestState)
|
148
|
+
end
|
149
|
+
|
217
150
|
end
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
par = RangePartition.new
|
151
|
+
|
152
|
+
if normalize
|
153
|
+
!db || @dfaStart.generatePDF("prior_normalize")
|
154
|
+
|
155
|
+
!db || pr("Normalizing states for:\n\n%s\n",State.dumpNFA(@dfaStart))
|
156
|
+
State.normalizeStates(@dfaStart)
|
157
|
+
!db || pr("After normalizing:\n\n%s\n",State.dumpNFA(@dfaStart))
|
158
|
+
!db || @dfaStart.generatePDF("post_normalize")
|
159
|
+
end
|
160
|
+
|
161
|
+
@dfaStart
|
162
|
+
end
|
231
163
|
|
232
|
-
|
164
|
+
private
|
233
165
|
|
234
|
-
|
235
|
-
|
166
|
+
# Adds a DFA state for a set of NFA states, if one doesn't already exist
|
167
|
+
# for the set
|
168
|
+
# @param nfaStateList a sorted array of NFA state ids
|
169
|
+
# @return a pair [DFA State,
|
170
|
+
# created flag (boolean): true if this did not already exist]
|
171
|
+
#
|
172
|
+
def createDFAState(nfaStateList)
|
173
|
+
|
174
|
+
lst = nfaStateList
|
175
|
+
|
176
|
+
newState = @nfaStateMap[lst]
|
177
|
+
isNewState = !newState
|
178
|
+
if isNewState
|
179
|
+
newState = State.new(@nextId)
|
180
|
+
|
181
|
+
# Determine if any of the NFA states were final states
|
182
|
+
newState.finalState = nfaStateList.any?{|id| @nfaStateMap[id].finalState?}
|
183
|
+
|
184
|
+
if false
|
185
|
+
# Set label of DFA state to show which NFA states produced it
|
186
|
+
# (useful for debugging)
|
187
|
+
newState.label = lst.map {|x| x.to_s}.join(' ')
|
188
|
+
end
|
189
|
+
|
190
|
+
@nextId += 1
|
191
|
+
@nfaStateMap[lst] = newState
|
192
|
+
@nfaStateLists.push(lst)
|
193
|
+
|
194
|
+
end
|
195
|
+
return [newState,isNewState]
|
236
196
|
end
|
237
197
|
|
238
|
-
|
198
|
+
def stateSetToIdArray(s)
|
199
|
+
s.to_a.map {|x| x.id}.sort
|
200
|
+
end
|
239
201
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
202
|
+
# Calculate the epsilon closure of a set of NFA states
|
203
|
+
# @return a set of states
|
204
|
+
#
|
205
|
+
def epsClosure(stateSet)
|
206
|
+
stk = stateSet.to_a
|
207
|
+
while !stk.empty?
|
208
|
+
s = stk.pop
|
209
|
+
s.edges.each do |lbl,dest|
|
210
|
+
if lbl.contains? EPSILON
|
211
|
+
if stateSet.add?(dest)
|
212
|
+
stk.push(dest)
|
213
|
+
end
|
214
|
+
end
|
215
|
+
end
|
246
216
|
end
|
247
|
-
|
217
|
+
stateSet
|
218
|
+
end
|
219
|
+
|
220
|
+
# Modify edges so each is labelled with a disjoint subset
|
221
|
+
# of characters. See the notes at the start of this class,
|
222
|
+
# as well as RangePartition.rb.
|
223
|
+
#
|
224
|
+
def partitionEdges(startState)
|
225
|
+
|
226
|
+
db = false
|
227
|
+
|
228
|
+
par = RangePartition.new
|
229
|
+
|
230
|
+
stateSet, _, _ = startState.reachableStates
|
231
|
+
|
232
|
+
stateSet.each do |s|
|
233
|
+
s.edges.each {|lbl,dest| par.addSet(lbl) }
|
234
|
+
end
|
235
|
+
|
236
|
+
par.prepare
|
248
237
|
|
249
|
-
|
250
|
-
|
251
|
-
s.
|
238
|
+
stateSet.each do |s|
|
239
|
+
newEdges = []
|
240
|
+
s.edges.each do |lbl, dest|
|
241
|
+
!db||pr(" old edge: %s => %s\n",d(lbl),d(dest.name))
|
242
|
+
newLbls = par.apply(lbl)
|
243
|
+
newLbls.each {|x| newEdges.push([x, dest]) }
|
244
|
+
end
|
245
|
+
s.clearEdges()
|
246
|
+
|
247
|
+
newEdges.each do |lbl,dest|
|
248
|
+
!db||pr(" new edge: %s => %s\n",d(lbl),d(dest.name))
|
249
|
+
s.addEdge(lbl,dest)
|
250
|
+
end
|
251
|
+
!db||pr("\n")
|
252
252
|
end
|
253
|
-
|
253
|
+
|
254
254
|
end
|
255
|
+
|
255
256
|
|
256
257
|
end
|
257
258
|
|
258
|
-
|
259
|
-
end
|
260
|
-
|
261
|
-
|
259
|
+
end # module ToknInternal
|