tokn 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
data/lib/tokn/dfa.rb CHANGED
@@ -2,195 +2,220 @@ require 'json'
2
2
  require_relative 'tools'
3
3
  req('code_set state')
4
4
 
5
-
6
- # A DFA for tokenizing; includes pointer to a start state, and
7
- # a list of token names
8
- #
9
- class DFA
10
-
11
- include Tokn
12
-
13
- # Compile a Tokenizer DFA from a token definition script.
14
- # If persistPath is not null, it first checks if the file exists and
15
- # if so, assumes it contains (in JSON form) a previously compiled
16
- # DFA matching this script, and reads the DFA from it.
17
- # Second, if no such file exists, it writes the DFA to it after compilation.
5
+ module Tokn
6
+
7
+ # A DFA for tokenizing; includes pointer to a start state, and
8
+ # a list of token names
18
9
  #
19
- def self.dfa_from_script(script, persistPath = nil)
10
+ class DFA
20
11
 
21
- if persistPath and File.exist?(persistPath)
22
- return extractDFA(readTextFile(persistPath))
23
- end
24
-
25
- req('token_defn_parser')
12
+ include ToknInternal
26
13
 
27
- td = TokenDefParser.new(script)
28
- dfa = td.dfa
14
+ # Compile a Tokenizer DFA from a token definition script.
15
+ # If persistPath is not null, it first checks if the file exists and
16
+ # if so, assumes it contains (in JSON form) a previously compiled
17
+ # DFA matching this script, and reads the DFA from it.
18
+ # Second, if no such file exists, it writes the DFA to it after compilation.
19
+ #
20
+ def self.from_script(script, persistPath = nil)
21
+
22
+ if persistPath and File.exist?(persistPath)
23
+ return extractDFA(readTextFile(persistPath))
24
+ end
25
+
26
+ req('token_defn_parser')
27
+
28
+ td = TokenDefParser.new(script)
29
+ dfa = td.dfa
30
+
31
+ if persistPath
32
+ writeTextFile(persistPath, dfa.serialize())
33
+ end
29
34
 
30
- if persistPath
31
- writeTextFile(persistPath, dfa.serialize())
35
+ dfa
32
36
  end
33
-
34
- dfa
35
- end
36
-
37
- # Similar to dfa_from_script, but reads the script into memory from
38
- # the file at scriptPath.
39
- #
40
- def self.dfa_from_script_file(scriptPath, persistPath = nil)
41
- self.dfa_from_script(readTextFile(scriptPath), persistPath)
42
- end
43
-
44
- # Compile a Tokenizer DFA from a text file (that contains a
45
- # JSON string)
46
- #
47
- def self.dfa_from_file(path)
48
- dfa_from_json(readTextFile(path))
49
- end
50
-
51
- # Compile a Tokenizer DFA from a JSON string
52
- #
53
- def self.dfa_from_json(jsonStr)
54
- db = false
55
37
 
56
- !db|| pr("\n\nextractDFA %s...\n",jsonStr)
57
-
58
- h = JSON.parse(jsonStr)
59
-
60
- tNames = h["tokens"]
61
- stateInfo = h["states"]
62
-
63
- !db|| pr("tokens=%s\n",d(tNames))
64
- !db|| pr("stateInfo=\n%s\n",d(stateInfo))
65
-
66
- st = []
67
- stateInfo.each_with_index do |(key,val),i|
68
- !db|| pr(" creating new state, id=%d\n",i)
69
- st.push(State.new(i))
38
+ # Similar to from_script, but reads the script into memory from
39
+ # the file at scriptPath.
40
+ #
41
+ def self.from_script_file(scriptPath, persistPath = nil)
42
+ self.from_script(readTextFile(scriptPath), persistPath)
70
43
  end
71
44
 
72
- st.each do |s|
73
- !db|| pr("proc state %s\n",d(s))
74
-
75
- finalState, edgeList = stateInfo[s.id]
76
- s.finalState = finalState
77
- edgeList.each do |edge|
78
- label,destState = edge
79
- cr = CodeSet.new()
80
- cr.setArray(label)
81
- s.addEdge(cr, st[destState])
82
- end
45
+ # Compile a Tokenizer DFA from a text file (that contains a
46
+ # JSON string)
47
+ #
48
+ def self.from_file(path)
49
+ from_json(readTextFile(path))
83
50
  end
84
51
 
85
- DFA.new(tNames, st[0])
86
-
87
- end
88
-
89
- attr_reader :startState, :tokenNames
90
-
91
- # Construct a DFA, given a list of token names and a starting state.
92
- #
93
- def initialize(tokenNameList, startState)
94
- @tokenNames = tokenNameList
95
- @startState = startState
96
- end
97
-
98
- # Determine the name of a token, given its id.
99
- # Returns <UNKNOWN> if its id is UNKNOWN_TOKEN, or <EOF> if
100
- # the tokenId is nil. Otherwise, assumes tokenId is 0..n, where
101
- # n is the number of token names in the DFA.
102
- #
103
- def tokenName(tokenId)
104
- if !tokenId
105
- nm = "<EOF>"
106
- elsif tokenId == UNKNOWN_TOKEN
107
- nm = "<UNKNOWN>"
108
- else
109
- if tokenId < 0 || tokenId >= tokenNames.size
110
- raise IndexError, "No such token id: "+tokenId.to_s
52
+ # Compile a Tokenizer DFA from a JSON string
53
+ #
54
+ def self.from_json(jsonStr)
55
+ db = false
56
+
57
+ !db|| pr("\n\nextractDFA %s...\n",jsonStr)
58
+
59
+ h = JSON.parse(jsonStr)
60
+
61
+ version = h["version"]
62
+
63
+ if !version || version.floor != VERSION.floor
64
+ raise ArgumentError,
65
+ "Bad or missing version number: "+version.to_s+", expected "+VERSION.to_s
111
66
  end
112
- nm = tokenNames[tokenId]
113
- end
114
- nm
115
- end
116
-
117
- # Serialize this DFA to a JSON string.
118
- # The DFA in JSON form has this structure:
119
- #
120
- # {
121
- # "tokens" => array of token names (strings)
122
- # "states" => array of states, ordered by id (0,1,..)
123
- # }
124
- #
125
- # Each state has this format:
126
- # [ finalState (boolean),
127
- # [edge0, edge1, ...]
128
- # ]
129
- #
130
- # Edge:
131
- # [label, destination id (integer)]
132
- #
133
- # Labels are arrays of integers, exactly the structure of
134
- # a CodeSet array.
135
- #
136
- def serialize
137
-
138
- h = {}
139
-
140
- h["tokens"] = tokenNames
141
-
142
- stateSet,_,_ = startState.reachableStates
67
+
68
+ tNames = h["tokens"]
69
+ stateInfo = h["states"]
70
+
71
+ !db|| pr("tokens=%s\n",d(tNames))
72
+ !db|| pr("stateInfo=\n%s\n",d(stateInfo))
73
+
74
+ st = []
75
+ stateInfo.each_with_index do |(key,val),i|
76
+ !db|| pr(" creating new state, id=%d\n",i)
77
+ st.push(State.new(i))
78
+ end
79
+
80
+ st.each do |s|
81
+ !db|| pr("proc state %s\n",d(s))
82
+
83
+ finalState, edgeList = stateInfo[s.id]
84
+ s.finalState = finalState
85
+ edgeList.each do |edge|
86
+ label,destState = edge
87
+ cr = CodeSet.new()
88
+ cr.setArray(label)
89
+ s.addEdge(cr, st[destState])
90
+ end
91
+ end
92
+
93
+ DFA.new(tNames, st[0])
143
94
 
144
- idToStateMap = {}
145
- stateSet.each do |st|
146
- idToStateMap[st.id] = st
147
95
  end
148
96
 
149
- stateList = []
150
-
151
- nextId = 0
152
- idToStateMap.each_pair do |id, st|
153
- if nextId != id
154
- raise ArgumentError, "unexpected state ids"
97
+ attr_reader :startState, :tokenNames
98
+
99
+ # Construct a DFA, given a list of token names and a starting state.
100
+ #
101
+ def initialize(tokenNameList, startState)
102
+
103
+ if (startState.id != 0)
104
+ raise ArgumentError, "Start state id must be zero"
105
+ end
106
+
107
+ @tokenNames = tokenNameList
108
+ @startState = startState
109
+ @tokenIdMap = {}
110
+ @tokenNames.each_with_index do |name, i|
111
+ @tokenIdMap[name] = i
155
112
  end
156
- nextId += 1
157
113
 
158
- stateList.push(st)
159
114
  end
160
115
 
161
- if stateList.size == 0
162
- raise ArgumentError, "bad states"
116
+ # Determine the name of a token, given its id.
117
+ # Returns <UNKNOWN> if its id is UNKNOWN_TOKEN, or <EOF> if
118
+ # the tokenId is nil. Otherwise, assumes tokenId is 0 ... n-1, where
119
+ # n is the number of token names in the DFA.
120
+ #
121
+ def tokenName(tokenId)
122
+ if !tokenId
123
+ nm = "<EOF>"
124
+ elsif tokenId == UNKNOWN_TOKEN
125
+ nm = "<UNKNOWN>"
126
+ else
127
+ if tokenId < 0 || tokenId >= tokenNames.size
128
+ raise IndexError, "No such token id: "+tokenId.to_s
129
+ end
130
+ nm = tokenNames[tokenId]
131
+ end
132
+ nm
163
133
  end
164
134
 
165
- if stateList[0] != startState
166
- raise ArgumentError, "bad start state"
135
+ # Get id of token given its name
136
+ # @param tokenName name of token
137
+ # @return nil if there is no token with that name
138
+ #
139
+ def tokenId(tokenName)
140
+ @tokenIdMap[tokenName]
167
141
  end
168
142
 
169
- stateInfo = []
170
- stateList.each do |st|
171
- stateInfo.push(stateToList(st))
143
+ # Serialize this DFA to a JSON string.
144
+ # The DFA in JSON form has this structure:
145
+ #
146
+ # {
147
+ # "version" => version number (float)
148
+ # "tokens" => array of token names (strings)
149
+ # "states" => array of states, ordered by id (0,1,..)
150
+ # }
151
+ #
152
+ # Each state has this format:
153
+ # [ finalState (boolean),
154
+ # [edge0, edge1, ...]
155
+ # ]
156
+ #
157
+ # Edge:
158
+ # [label, destination id (integer)]
159
+ #
160
+ # Labels are arrays of integers, exactly the structure of
161
+ # a CodeSet array.
162
+ #
163
+ def serialize
164
+
165
+ h = {"version"=>VERSION, "tokens"=>tokenNames}
166
+
167
+
168
+ stateSet,_,_ = startState.reachableStates
169
+
170
+ idToStateMap = {}
171
+ stateSet.each{ |st| idToStateMap[st.id] = st }
172
+
173
+ stateList = []
174
+
175
+ nextId = 0
176
+ idToStateMap.each_pair do |id, st|
177
+ if nextId != id
178
+ raise ArgumentError, "unexpected state ids"
179
+ end
180
+ nextId += 1
181
+
182
+ stateList.push(st)
183
+ end
184
+
185
+ if stateList.size == 0
186
+ raise ArgumentError, "bad states"
187
+ end
188
+
189
+ if stateList[0] != startState
190
+ raise ArgumentError, "bad start state"
191
+ end
192
+
193
+ stateInfo = []
194
+ stateList.each do |st|
195
+ stateInfo.push(stateToList(st))
196
+ end
197
+ h["states"] = stateInfo
198
+
199
+ JSON.generate(h)
172
200
  end
173
- h["states"] = stateInfo
174
-
175
- JSON.generate(h)
176
- end
177
-
178
- private
179
201
 
180
- def stateToList(state)
181
- list = []
202
+ private
203
+
204
+ VERSION = 1.0
182
205
 
183
- list.push(state.finalState?)
184
- ed = []
185
- state.edges.each do |lbl, dest|
186
- edInfo = [lbl.array, dest.id]
187
- ed.push(edInfo)
206
+ def stateToList(state)
207
+ list = [state.finalState?]
208
+ ed = []
209
+ state.edges.each do |lbl, dest|
210
+ edInfo = [lbl.array, dest.id]
211
+ ed.push(edInfo)
212
+ end
213
+ list.push(ed)
214
+
215
+ list
188
216
  end
189
- list.push(ed)
190
217
 
191
- list
192
218
  end
193
219
 
194
- end
195
-
220
+ end # module Tokn
196
221
 
@@ -1,261 +1,259 @@
1
1
  require_relative 'tools'
2
- req('tokn_const code_set state range_partition reg_parse')
2
+ # req('tokn_const code_set state range_partition reg_parse')
3
+ req('range_partition reg_parse')
3
4
 
4
- # Converts NFAs (nondeterministic, finite state automata) to
5
- # minimal DFAs.
6
- #
7
- # Performs the subset construction algorithm described in
8
- # (among other places) http://en.wikipedia.org/wiki/Powerset_construction
9
- #
10
- # Also implements an innovative algorithm to partition a set of
11
- # edge labels into a set that has the property that no two elements
12
- # have overlapping regions. This allows us to perform the subset construction
13
- # (and closure operations) efficiently while supporting large possible character
14
- # sets (e.g., unicode, which ranges from 0..0x10ffff). See RangePartition.rb
15
- # for more details.
16
- #
17
- class DFABuilder
18
-
19
- include Tokn
5
+ module ToknInternal
20
6
 
21
-
22
- # Convert an NFA to a DFA.
23
- #
24
- # @param startState the start state of the NFA
25
- # @param db if true, generates PDF files for debug purposes, showing various
26
- # steps of the procedure
7
+ # Converts NFAs (nondeterministic, finite state automata) to
8
+ # minimal DFAs.
27
9
  #
28
- def self.nfa_to_dfa(startState, db = false)
29
-
30
- !db || startState.generatePDF("original_nfa")
31
-
32
- # Reverse this NFA, convert to DFA, then
33
- # reverse it, and convert it again. Apparently this
34
- # produces a minimal DFA.
35
-
36
- rev = startState.reverseNFA()
37
- !db || rev.generatePDF("reversed_nfa")
38
-
39
- bld = DFABuilder.new(rev)
40
- dfa = bld.build(true, false) # partition, but don't normalize
41
-
42
- !db || dfa.generatePDF("reversed_dfa")
43
-
44
- rev2 = dfa.reverseNFA()
45
- bld = DFABuilder.new(rev2)
46
-
47
- # Don't regenerate the partition; it is still valid
48
- # for this second build process
49
- #
50
- dfa = bld.build(false, true) # don't partition, but do normalize
51
-
52
- # If there are edges that contain more than one token identifier,
53
- # remove all but the first (i.e. the one with the highest token id)
54
-
55
- stSet, _, _ = dfa.reachableStates
56
- stSet.each do |s|
57
- s.edges.each do |lbl, dest|
58
- a = lbl.array
59
- if !a.size
60
- next
61
- end
62
-
63
- primeId = a[0]
64
-
65
- if primeId >= EPSILON-1
66
- next
67
- end
68
-
69
- lbl.difference!(CodeSet.new(primeId+1, EPSILON))
70
- end
71
- end
72
-
73
- !db || dfa.generatePDF("minimal_dfa")
74
-
75
- dfa
76
- end
77
-
78
-
79
-
80
- # Constructs a builder object
10
+ # Performs the subset construction algorithm described in
11
+ # (among other places) http://en.wikipedia.org/wiki/Powerset_construction
81
12
  #
82
- def initialize(nfaStartState)
83
- @nextId = 0
84
- @nfaStart = nfaStartState
85
-
86
- # Build a map of nfa state ids => nfa states
87
- @nfaStateMap = {}
88
- nfas, _, _ = @nfaStart.reachableStates
89
- nfas.each {|s| @nfaStateMap[s.id] = s}
90
-
91
- # Initialize an array of nfa state lists, indexed by dfa state id
92
- @nfaStateLists = []
93
-
94
- # Map of existing DFA states; key is array of NFA state ids
95
- @dfaStateMap = {}
96
- end
97
-
98
- # Perform the build algorithm
99
- #
100
- # @param partition if true, partitions the edge labels into disjoint code sets
101
- # @param normalize if true, normalizes the states afterward
13
+ # Also implements an innovative algorithm to partition a set of
14
+ # edge labels into a set that has the property that no two elements
15
+ # have overlapping regions. This allows us to perform the subset construction
16
+ # (and closure operations) efficiently while supporting large possible character
17
+ # sets (e.g., unicode, which ranges from 0..0x10ffff). See RangePartition.rb
18
+ # for more details.
102
19
  #
103
- def build(partition = true, normalize = true)
104
- db = false
105
-
106
- !partition || partitionEdges(@nfaStart)
107
-
108
- iset = Set.new
109
- iset.add(@nfaStart)
110
- epsClosure(iset)
111
-
112
- @dfaStart,_ = createDFAState(stateSetToIdArray(iset))
113
-
114
- markedStates = Set.new
115
-
116
- unmarked = [@dfaStart]
117
-
118
- until unmarked.empty?
119
- dfaState = unmarked.pop
20
+ class DFABuilder
21
+
22
+ # Convert an NFA to a DFA.
23
+ #
24
+ # @param startState the start state of the NFA
25
+ # @param db if true, generates PDF files for debug purposes, showing various
26
+ # steps of the procedure
27
+ #
28
+ def self.nfa_to_dfa(startState, db = false)
29
+
30
+ !db || startState.generatePDF("original_nfa")
31
+
32
+ # Reverse this NFA, convert to DFA, then
33
+ # reverse it, and convert it again. Apparently this
34
+ # produces a minimal DFA.
35
+
36
+ rev = startState.reverseNFA()
37
+ !db || rev.generatePDF("reversed_nfa")
38
+
39
+ bld = DFABuilder.new(rev)
40
+ dfa = bld.build(true, false) # partition, but don't normalize
41
+
42
+ !db || dfa.generatePDF("reversed_dfa")
120
43
 
121
- nfaIds = @nfaStateLists[dfaState.id]
44
+ rev2 = dfa.reverseNFA()
45
+ bld = DFABuilder.new(rev2)
122
46
 
123
- # map of CodeSet => set of NFA states
124
- moveMap = {}
47
+ # Don't regenerate the partition; it is still valid
48
+ # for this second build process
49
+ #
50
+ dfa = bld.build(false, true) # don't partition, but do normalize
125
51
 
126
- nfaIds.each do |nfaId|
127
- nfaState = @nfaStateMap[nfaId]
128
- nfaState.edges.each do |lbl,dest|
129
- if lbl.array[0] == EPSILON
52
+ # If there are edges that contain more than one token identifier,
53
+ # remove all but the first (i.e. the one with the highest token id)
54
+
55
+ stSet, _, _ = dfa.reachableStates
56
+ stSet.each do |s|
57
+ s.edges.each do |lbl, dest|
58
+ a = lbl.array
59
+ if !a.size
130
60
  next
131
61
  end
132
62
 
133
- nfaStates = moveMap[lbl]
134
- if !nfaStates
135
- nfaStates = Set.new
136
- moveMap[lbl] = nfaStates
137
- end
138
- nfaStates.add(dest)
63
+ primeId = a[0]
64
+
65
+ next if primeId >= EPSILON-1
66
+
67
+ lbl.difference!(CodeSet.new(primeId+1, EPSILON))
139
68
  end
140
69
  end
141
70
 
142
- moveMap.each_pair do |charRange,nfaStates|
143
- # May be better to test if already in set before calc closure; or simply has closure
144
- epsClosure(nfaStates)
145
- dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
146
- if isNew
147
- unmarked.push(dfaDestState)
148
- end
149
- dfaState.addEdge(charRange, dfaDestState)
150
- end
71
+ !db || dfa.generatePDF("minimal_dfa")
151
72
 
73
+ dfa
152
74
  end
153
75
 
154
- if normalize
155
- !db || @dfaStart.generatePDF("prior_normalize")
156
-
157
- !db || pr("Normalizing states for:\n\n%s\n",State.dumpNFA(@dfaStart))
158
- State.normalizeStates(@dfaStart)
159
- !db || pr("After normalizing:\n\n%s\n",State.dumpNFA(@dfaStart))
160
- !db || @dfaStart.generatePDF("post_normalize")
161
- end
162
76
 
163
- @dfaStart
164
- end
165
-
166
- private
167
-
168
- # Adds a DFA state for a set of NFA states, if one doesn't already exist
169
- # for the set
170
- # @param nfaStateList a sorted array of NFA state ids
171
- # @return a pair [DFA State,
172
- # created flag (boolean): true if this did not already exist]
173
- #
174
- def createDFAState(nfaStateList)
175
-
176
- lst = nfaStateList
177
77
 
178
- newState = @nfaStateMap[lst]
179
- isNewState = !newState
180
- if isNewState
181
- newState = State.new(@nextId)
78
+ # Constructs a builder object
79
+ #
80
+ def initialize(nfaStartState)
81
+ @nextId = 0
82
+ @nfaStart = nfaStartState
182
83
 
183
- # Determine if any of the NFA states were final states
184
- newState.finalState = nfaStateList.any?{|id| @nfaStateMap[id].finalState?}
84
+ # Build a map of nfa state ids => nfa states
85
+ @nfaStateMap = {}
86
+ nfas, _, _ = @nfaStart.reachableStates
87
+ nfas.each {|s| @nfaStateMap[s.id] = s}
185
88
 
186
- if false
187
- # Set label of DFA state to show which NFA states produced it
188
- # (useful for debugging)
189
- newState.label = lst.map {|x| x.to_s}.join(' ')
190
- end
89
+ # Initialize an array of nfa state lists, indexed by dfa state id
90
+ @nfaStateLists = []
191
91
 
192
- @nextId += 1
193
- @nfaStateMap[lst] = newState
194
- @nfaStateLists.push(lst)
195
-
92
+ # Map of existing DFA states; key is array of NFA state ids
93
+ @dfaStateMap = {}
196
94
  end
197
- return [newState,isNewState]
198
- end
199
95
 
200
- def stateSetToIdArray(s)
201
- s.to_a.map {|x| x.id}.sort
202
- end
96
+ # Perform the build algorithm
97
+ #
98
+ # @param partition if true, partitions the edge labels into disjoint code sets
99
+ # @param normalize if true, normalizes the states afterward
100
+ #
101
+ def build(partition = true, normalize = true)
102
+ db = false
103
+
104
+ !partition || partitionEdges(@nfaStart)
105
+
106
+ iset = Set.new
107
+ iset.add(@nfaStart)
108
+ epsClosure(iset)
109
+
110
+ @dfaStart,_ = createDFAState(stateSetToIdArray(iset))
111
+
112
+ markedStates = Set.new
113
+
114
+ unmarked = [@dfaStart]
203
115
 
204
- # Calculate the epsilon closure of a set of NFA states
205
- # @return a set of states
206
- #
207
- def epsClosure(stateSet)
208
- stk = stateSet.to_a
209
- while !stk.empty?
210
- s = stk.pop
211
- s.edges.each do |lbl,dest|
212
- if lbl.contains? EPSILON
213
- if stateSet.add?(dest)
214
- stk.push(dest)
116
+ until unmarked.empty?
117
+ dfaState = unmarked.pop
118
+
119
+ nfaIds = @nfaStateLists[dfaState.id]
120
+
121
+ # map of CodeSet => set of NFA states
122
+ moveMap = {}
123
+
124
+ nfaIds.each do |nfaId|
125
+ nfaState = @nfaStateMap[nfaId]
126
+ nfaState.edges.each do |lbl,dest|
127
+ if lbl.array[0] == EPSILON
128
+ next
129
+ end
130
+
131
+ nfaStates = moveMap[lbl]
132
+ if !nfaStates
133
+ nfaStates = Set.new
134
+ moveMap[lbl] = nfaStates
135
+ end
136
+ nfaStates.add(dest)
215
137
  end
216
138
  end
139
+
140
+ moveMap.each_pair do |charRange,nfaStates|
141
+ # May be better to test if already in set before calc closure; or simply has closure
142
+ epsClosure(nfaStates)
143
+ dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
144
+ if isNew
145
+ unmarked.push(dfaDestState)
146
+ end
147
+ dfaState.addEdge(charRange, dfaDestState)
148
+ end
149
+
217
150
  end
218
- end
219
- stateSet
220
- end
221
-
222
- # Modify edges so each is labelled with a disjoint subset
223
- # of characters. See the notes at the start of this class,
224
- # as well as RangePartition.rb.
225
- #
226
- def partitionEdges(startState)
227
-
228
- db = false
229
-
230
- par = RangePartition.new
151
+
152
+ if normalize
153
+ !db || @dfaStart.generatePDF("prior_normalize")
154
+
155
+ !db || pr("Normalizing states for:\n\n%s\n",State.dumpNFA(@dfaStart))
156
+ State.normalizeStates(@dfaStart)
157
+ !db || pr("After normalizing:\n\n%s\n",State.dumpNFA(@dfaStart))
158
+ !db || @dfaStart.generatePDF("post_normalize")
159
+ end
160
+
161
+ @dfaStart
162
+ end
231
163
 
232
- stateSet, _, _ = startState.reachableStates
164
+ private
233
165
 
234
- stateSet.each do |s|
235
- s.edges.each {|lbl,dest| par.addSet(lbl) }
166
+ # Adds a DFA state for a set of NFA states, if one doesn't already exist
167
+ # for the set
168
+ # @param nfaStateList a sorted array of NFA state ids
169
+ # @return a pair [DFA State,
170
+ # created flag (boolean): true if this did not already exist]
171
+ #
172
+ def createDFAState(nfaStateList)
173
+
174
+ lst = nfaStateList
175
+
176
+ newState = @nfaStateMap[lst]
177
+ isNewState = !newState
178
+ if isNewState
179
+ newState = State.new(@nextId)
180
+
181
+ # Determine if any of the NFA states were final states
182
+ newState.finalState = nfaStateList.any?{|id| @nfaStateMap[id].finalState?}
183
+
184
+ if false
185
+ # Set label of DFA state to show which NFA states produced it
186
+ # (useful for debugging)
187
+ newState.label = lst.map {|x| x.to_s}.join(' ')
188
+ end
189
+
190
+ @nextId += 1
191
+ @nfaStateMap[lst] = newState
192
+ @nfaStateLists.push(lst)
193
+
194
+ end
195
+ return [newState,isNewState]
236
196
  end
237
197
 
238
- par.prepare
198
+ def stateSetToIdArray(s)
199
+ s.to_a.map {|x| x.id}.sort
200
+ end
239
201
 
240
- stateSet.each do |s|
241
- newEdges = []
242
- s.edges.each do |lbl, dest|
243
- !db||pr(" old edge: %s => %s\n",d(lbl),d(dest.name))
244
- newLbls = par.apply(lbl)
245
- newLbls.each {|x| newEdges.push([x, dest]) }
202
+ # Calculate the epsilon closure of a set of NFA states
203
+ # @return a set of states
204
+ #
205
+ def epsClosure(stateSet)
206
+ stk = stateSet.to_a
207
+ while !stk.empty?
208
+ s = stk.pop
209
+ s.edges.each do |lbl,dest|
210
+ if lbl.contains? EPSILON
211
+ if stateSet.add?(dest)
212
+ stk.push(dest)
213
+ end
214
+ end
215
+ end
246
216
  end
247
- s.clearEdges()
217
+ stateSet
218
+ end
219
+
220
+ # Modify edges so each is labelled with a disjoint subset
221
+ # of characters. See the notes at the start of this class,
222
+ # as well as RangePartition.rb.
223
+ #
224
+ def partitionEdges(startState)
225
+
226
+ db = false
227
+
228
+ par = RangePartition.new
229
+
230
+ stateSet, _, _ = startState.reachableStates
231
+
232
+ stateSet.each do |s|
233
+ s.edges.each {|lbl,dest| par.addSet(lbl) }
234
+ end
235
+
236
+ par.prepare
248
237
 
249
- newEdges.each do |lbl,dest|
250
- !db||pr(" new edge: %s => %s\n",d(lbl),d(dest.name))
251
- s.addEdge(lbl,dest)
238
+ stateSet.each do |s|
239
+ newEdges = []
240
+ s.edges.each do |lbl, dest|
241
+ !db||pr(" old edge: %s => %s\n",d(lbl),d(dest.name))
242
+ newLbls = par.apply(lbl)
243
+ newLbls.each {|x| newEdges.push([x, dest]) }
244
+ end
245
+ s.clearEdges()
246
+
247
+ newEdges.each do |lbl,dest|
248
+ !db||pr(" new edge: %s => %s\n",d(lbl),d(dest.name))
249
+ s.addEdge(lbl,dest)
250
+ end
251
+ !db||pr("\n")
252
252
  end
253
- !db||pr("\n")
253
+
254
254
  end
255
+
255
256
 
256
257
  end
257
258
 
258
-
259
- end
260
-
261
-
259
+ end # module ToknInternal