tokn 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/tokn/dfa.rb CHANGED
@@ -2,195 +2,220 @@ require 'json'
2
2
  require_relative 'tools'
3
3
  req('code_set state')
4
4
 
5
-
6
- # A DFA for tokenizing; includes pointer to a start state, and
7
- # a list of token names
8
- #
9
- class DFA
10
-
11
- include Tokn
12
-
13
- # Compile a Tokenizer DFA from a token definition script.
14
- # If persistPath is not null, it first checks if the file exists and
15
- # if so, assumes it contains (in JSON form) a previously compiled
16
- # DFA matching this script, and reads the DFA from it.
17
- # Second, if no such file exists, it writes the DFA to it after compilation.
5
+ module Tokn
6
+
7
+ # A DFA for tokenizing; includes pointer to a start state, and
8
+ # a list of token names
18
9
  #
19
- def self.dfa_from_script(script, persistPath = nil)
10
+ class DFA
20
11
 
21
- if persistPath and File.exist?(persistPath)
22
- return extractDFA(readTextFile(persistPath))
23
- end
24
-
25
- req('token_defn_parser')
12
+ include ToknInternal
26
13
 
27
- td = TokenDefParser.new(script)
28
- dfa = td.dfa
14
+ # Compile a Tokenizer DFA from a token definition script.
15
+ # If persistPath is not null, it first checks if the file exists and
16
+ # if so, assumes it contains (in JSON form) a previously compiled
17
+ # DFA matching this script, and reads the DFA from it.
18
+ # Second, if no such file exists, it writes the DFA to it after compilation.
19
+ #
20
+ def self.from_script(script, persistPath = nil)
21
+
22
+ if persistPath and File.exist?(persistPath)
23
+ return extractDFA(readTextFile(persistPath))
24
+ end
25
+
26
+ req('token_defn_parser')
27
+
28
+ td = TokenDefParser.new(script)
29
+ dfa = td.dfa
30
+
31
+ if persistPath
32
+ writeTextFile(persistPath, dfa.serialize())
33
+ end
29
34
 
30
- if persistPath
31
- writeTextFile(persistPath, dfa.serialize())
35
+ dfa
32
36
  end
33
-
34
- dfa
35
- end
36
-
37
- # Similar to dfa_from_script, but reads the script into memory from
38
- # the file at scriptPath.
39
- #
40
- def self.dfa_from_script_file(scriptPath, persistPath = nil)
41
- self.dfa_from_script(readTextFile(scriptPath), persistPath)
42
- end
43
-
44
- # Compile a Tokenizer DFA from a text file (that contains a
45
- # JSON string)
46
- #
47
- def self.dfa_from_file(path)
48
- dfa_from_json(readTextFile(path))
49
- end
50
-
51
- # Compile a Tokenizer DFA from a JSON string
52
- #
53
- def self.dfa_from_json(jsonStr)
54
- db = false
55
37
 
56
- !db|| pr("\n\nextractDFA %s...\n",jsonStr)
57
-
58
- h = JSON.parse(jsonStr)
59
-
60
- tNames = h["tokens"]
61
- stateInfo = h["states"]
62
-
63
- !db|| pr("tokens=%s\n",d(tNames))
64
- !db|| pr("stateInfo=\n%s\n",d(stateInfo))
65
-
66
- st = []
67
- stateInfo.each_with_index do |(key,val),i|
68
- !db|| pr(" creating new state, id=%d\n",i)
69
- st.push(State.new(i))
38
+ # Similar to from_script, but reads the script into memory from
39
+ # the file at scriptPath.
40
+ #
41
+ def self.from_script_file(scriptPath, persistPath = nil)
42
+ self.from_script(readTextFile(scriptPath), persistPath)
70
43
  end
71
44
 
72
- st.each do |s|
73
- !db|| pr("proc state %s\n",d(s))
74
-
75
- finalState, edgeList = stateInfo[s.id]
76
- s.finalState = finalState
77
- edgeList.each do |edge|
78
- label,destState = edge
79
- cr = CodeSet.new()
80
- cr.setArray(label)
81
- s.addEdge(cr, st[destState])
82
- end
45
+ # Compile a Tokenizer DFA from a text file (that contains a
46
+ # JSON string)
47
+ #
48
+ def self.from_file(path)
49
+ from_json(readTextFile(path))
83
50
  end
84
51
 
85
- DFA.new(tNames, st[0])
86
-
87
- end
88
-
89
- attr_reader :startState, :tokenNames
90
-
91
- # Construct a DFA, given a list of token names and a starting state.
92
- #
93
- def initialize(tokenNameList, startState)
94
- @tokenNames = tokenNameList
95
- @startState = startState
96
- end
97
-
98
- # Determine the name of a token, given its id.
99
- # Returns <UNKNOWN> if its id is UNKNOWN_TOKEN, or <EOF> if
100
- # the tokenId is nil. Otherwise, assumes tokenId is 0..n, where
101
- # n is the number of token names in the DFA.
102
- #
103
- def tokenName(tokenId)
104
- if !tokenId
105
- nm = "<EOF>"
106
- elsif tokenId == UNKNOWN_TOKEN
107
- nm = "<UNKNOWN>"
108
- else
109
- if tokenId < 0 || tokenId >= tokenNames.size
110
- raise IndexError, "No such token id: "+tokenId.to_s
52
+ # Compile a Tokenizer DFA from a JSON string
53
+ #
54
+ def self.from_json(jsonStr)
55
+ db = false
56
+
57
+ !db|| pr("\n\nextractDFA %s...\n",jsonStr)
58
+
59
+ h = JSON.parse(jsonStr)
60
+
61
+ version = h["version"]
62
+
63
+ if !version || version.floor != VERSION.floor
64
+ raise ArgumentError,
65
+ "Bad or missing version number: "+version.to_s+", expected "+VERSION.to_s
111
66
  end
112
- nm = tokenNames[tokenId]
113
- end
114
- nm
115
- end
116
-
117
- # Serialize this DFA to a JSON string.
118
- # The DFA in JSON form has this structure:
119
- #
120
- # {
121
- # "tokens" => array of token names (strings)
122
- # "states" => array of states, ordered by id (0,1,..)
123
- # }
124
- #
125
- # Each state has this format:
126
- # [ finalState (boolean),
127
- # [edge0, edge1, ...]
128
- # ]
129
- #
130
- # Edge:
131
- # [label, destination id (integer)]
132
- #
133
- # Labels are arrays of integers, exactly the structure of
134
- # a CodeSet array.
135
- #
136
- def serialize
137
-
138
- h = {}
139
-
140
- h["tokens"] = tokenNames
141
-
142
- stateSet,_,_ = startState.reachableStates
67
+
68
+ tNames = h["tokens"]
69
+ stateInfo = h["states"]
70
+
71
+ !db|| pr("tokens=%s\n",d(tNames))
72
+ !db|| pr("stateInfo=\n%s\n",d(stateInfo))
73
+
74
+ st = []
75
+ stateInfo.each_with_index do |(key,val),i|
76
+ !db|| pr(" creating new state, id=%d\n",i)
77
+ st.push(State.new(i))
78
+ end
79
+
80
+ st.each do |s|
81
+ !db|| pr("proc state %s\n",d(s))
82
+
83
+ finalState, edgeList = stateInfo[s.id]
84
+ s.finalState = finalState
85
+ edgeList.each do |edge|
86
+ label,destState = edge
87
+ cr = CodeSet.new()
88
+ cr.setArray(label)
89
+ s.addEdge(cr, st[destState])
90
+ end
91
+ end
92
+
93
+ DFA.new(tNames, st[0])
143
94
 
144
- idToStateMap = {}
145
- stateSet.each do |st|
146
- idToStateMap[st.id] = st
147
95
  end
148
96
 
149
- stateList = []
150
-
151
- nextId = 0
152
- idToStateMap.each_pair do |id, st|
153
- if nextId != id
154
- raise ArgumentError, "unexpected state ids"
97
+ attr_reader :startState, :tokenNames
98
+
99
+ # Construct a DFA, given a list of token names and a starting state.
100
+ #
101
+ def initialize(tokenNameList, startState)
102
+
103
+ if (startState.id != 0)
104
+ raise ArgumentError, "Start state id must be zero"
105
+ end
106
+
107
+ @tokenNames = tokenNameList
108
+ @startState = startState
109
+ @tokenIdMap = {}
110
+ @tokenNames.each_with_index do |name, i|
111
+ @tokenIdMap[name] = i
155
112
  end
156
- nextId += 1
157
113
 
158
- stateList.push(st)
159
114
  end
160
115
 
161
- if stateList.size == 0
162
- raise ArgumentError, "bad states"
116
+ # Determine the name of a token, given its id.
117
+ # Returns <UNKNOWN> if its id is UNKNOWN_TOKEN, or <EOF> if
118
+ # the tokenId is nil. Otherwise, assumes tokenId is 0 ... n-1, where
119
+ # n is the number of token names in the DFA.
120
+ #
121
+ def tokenName(tokenId)
122
+ if !tokenId
123
+ nm = "<EOF>"
124
+ elsif tokenId == UNKNOWN_TOKEN
125
+ nm = "<UNKNOWN>"
126
+ else
127
+ if tokenId < 0 || tokenId >= tokenNames.size
128
+ raise IndexError, "No such token id: "+tokenId.to_s
129
+ end
130
+ nm = tokenNames[tokenId]
131
+ end
132
+ nm
163
133
  end
164
134
 
165
- if stateList[0] != startState
166
- raise ArgumentError, "bad start state"
135
+ # Get id of token given its name
136
+ # @param tokenName name of token
137
+ # @return nil if there is no token with that name
138
+ #
139
+ def tokenId(tokenName)
140
+ @tokenIdMap[tokenName]
167
141
  end
168
142
 
169
- stateInfo = []
170
- stateList.each do |st|
171
- stateInfo.push(stateToList(st))
143
+ # Serialize this DFA to a JSON string.
144
+ # The DFA in JSON form has this structure:
145
+ #
146
+ # {
147
+ # "version" => version number (float)
148
+ # "tokens" => array of token names (strings)
149
+ # "states" => array of states, ordered by id (0,1,..)
150
+ # }
151
+ #
152
+ # Each state has this format:
153
+ # [ finalState (boolean),
154
+ # [edge0, edge1, ...]
155
+ # ]
156
+ #
157
+ # Edge:
158
+ # [label, destination id (integer)]
159
+ #
160
+ # Labels are arrays of integers, exactly the structure of
161
+ # a CodeSet array.
162
+ #
163
+ def serialize
164
+
165
+ h = {"version"=>VERSION, "tokens"=>tokenNames}
166
+
167
+
168
+ stateSet,_,_ = startState.reachableStates
169
+
170
+ idToStateMap = {}
171
+ stateSet.each{ |st| idToStateMap[st.id] = st }
172
+
173
+ stateList = []
174
+
175
+ nextId = 0
176
+ idToStateMap.each_pair do |id, st|
177
+ if nextId != id
178
+ raise ArgumentError, "unexpected state ids"
179
+ end
180
+ nextId += 1
181
+
182
+ stateList.push(st)
183
+ end
184
+
185
+ if stateList.size == 0
186
+ raise ArgumentError, "bad states"
187
+ end
188
+
189
+ if stateList[0] != startState
190
+ raise ArgumentError, "bad start state"
191
+ end
192
+
193
+ stateInfo = []
194
+ stateList.each do |st|
195
+ stateInfo.push(stateToList(st))
196
+ end
197
+ h["states"] = stateInfo
198
+
199
+ JSON.generate(h)
172
200
  end
173
- h["states"] = stateInfo
174
-
175
- JSON.generate(h)
176
- end
177
-
178
- private
179
201
 
180
- def stateToList(state)
181
- list = []
202
+ private
203
+
204
+ VERSION = 1.0
182
205
 
183
- list.push(state.finalState?)
184
- ed = []
185
- state.edges.each do |lbl, dest|
186
- edInfo = [lbl.array, dest.id]
187
- ed.push(edInfo)
206
+ def stateToList(state)
207
+ list = [state.finalState?]
208
+ ed = []
209
+ state.edges.each do |lbl, dest|
210
+ edInfo = [lbl.array, dest.id]
211
+ ed.push(edInfo)
212
+ end
213
+ list.push(ed)
214
+
215
+ list
188
216
  end
189
- list.push(ed)
190
217
 
191
- list
192
218
  end
193
219
 
194
- end
195
-
220
+ end # module Tokn
196
221
 
@@ -1,261 +1,259 @@
1
1
  require_relative 'tools'
2
- req('tokn_const code_set state range_partition reg_parse')
2
+ # req('tokn_const code_set state range_partition reg_parse')
3
+ req('range_partition reg_parse')
3
4
 
4
- # Converts NFAs (nondeterministic, finite state automata) to
5
- # minimal DFAs.
6
- #
7
- # Performs the subset construction algorithm described in
8
- # (among other places) http://en.wikipedia.org/wiki/Powerset_construction
9
- #
10
- # Also implements an innovative algorithm to partition a set of
11
- # edge labels into a set that has the property that no two elements
12
- # have overlapping regions. This allows us to perform the subset construction
13
- # (and closure operations) efficiently while supporting large possible character
14
- # sets (e.g., unicode, which ranges from 0..0x10ffff). See RangePartition.rb
15
- # for more details.
16
- #
17
- class DFABuilder
18
-
19
- include Tokn
5
+ module ToknInternal
20
6
 
21
-
22
- # Convert an NFA to a DFA.
23
- #
24
- # @param startState the start state of the NFA
25
- # @param db if true, generates PDF files for debug purposes, showing various
26
- # steps of the procedure
7
+ # Converts NFAs (nondeterministic, finite state automata) to
8
+ # minimal DFAs.
27
9
  #
28
- def self.nfa_to_dfa(startState, db = false)
29
-
30
- !db || startState.generatePDF("original_nfa")
31
-
32
- # Reverse this NFA, convert to DFA, then
33
- # reverse it, and convert it again. Apparently this
34
- # produces a minimal DFA.
35
-
36
- rev = startState.reverseNFA()
37
- !db || rev.generatePDF("reversed_nfa")
38
-
39
- bld = DFABuilder.new(rev)
40
- dfa = bld.build(true, false) # partition, but don't normalize
41
-
42
- !db || dfa.generatePDF("reversed_dfa")
43
-
44
- rev2 = dfa.reverseNFA()
45
- bld = DFABuilder.new(rev2)
46
-
47
- # Don't regenerate the partition; it is still valid
48
- # for this second build process
49
- #
50
- dfa = bld.build(false, true) # don't partition, but do normalize
51
-
52
- # If there are edges that contain more than one token identifier,
53
- # remove all but the first (i.e. the one with the highest token id)
54
-
55
- stSet, _, _ = dfa.reachableStates
56
- stSet.each do |s|
57
- s.edges.each do |lbl, dest|
58
- a = lbl.array
59
- if !a.size
60
- next
61
- end
62
-
63
- primeId = a[0]
64
-
65
- if primeId >= EPSILON-1
66
- next
67
- end
68
-
69
- lbl.difference!(CodeSet.new(primeId+1, EPSILON))
70
- end
71
- end
72
-
73
- !db || dfa.generatePDF("minimal_dfa")
74
-
75
- dfa
76
- end
77
-
78
-
79
-
80
- # Constructs a builder object
10
+ # Performs the subset construction algorithm described in
11
+ # (among other places) http://en.wikipedia.org/wiki/Powerset_construction
81
12
  #
82
- def initialize(nfaStartState)
83
- @nextId = 0
84
- @nfaStart = nfaStartState
85
-
86
- # Build a map of nfa state ids => nfa states
87
- @nfaStateMap = {}
88
- nfas, _, _ = @nfaStart.reachableStates
89
- nfas.each {|s| @nfaStateMap[s.id] = s}
90
-
91
- # Initialize an array of nfa state lists, indexed by dfa state id
92
- @nfaStateLists = []
93
-
94
- # Map of existing DFA states; key is array of NFA state ids
95
- @dfaStateMap = {}
96
- end
97
-
98
- # Perform the build algorithm
99
- #
100
- # @param partition if true, partitions the edge labels into disjoint code sets
101
- # @param normalize if true, normalizes the states afterward
13
+ # Also implements an innovative algorithm to partition a set of
14
+ # edge labels into a set that has the property that no two elements
15
+ # have overlapping regions. This allows us to perform the subset construction
16
+ # (and closure operations) efficiently while supporting large possible character
17
+ # sets (e.g., unicode, which ranges from 0..0x10ffff). See RangePartition.rb
18
+ # for more details.
102
19
  #
103
- def build(partition = true, normalize = true)
104
- db = false
105
-
106
- !partition || partitionEdges(@nfaStart)
107
-
108
- iset = Set.new
109
- iset.add(@nfaStart)
110
- epsClosure(iset)
111
-
112
- @dfaStart,_ = createDFAState(stateSetToIdArray(iset))
113
-
114
- markedStates = Set.new
115
-
116
- unmarked = [@dfaStart]
117
-
118
- until unmarked.empty?
119
- dfaState = unmarked.pop
20
+ class DFABuilder
21
+
22
+ # Convert an NFA to a DFA.
23
+ #
24
+ # @param startState the start state of the NFA
25
+ # @param db if true, generates PDF files for debug purposes, showing various
26
+ # steps of the procedure
27
+ #
28
+ def self.nfa_to_dfa(startState, db = false)
29
+
30
+ !db || startState.generatePDF("original_nfa")
31
+
32
+ # Reverse this NFA, convert to DFA, then
33
+ # reverse it, and convert it again. Apparently this
34
+ # produces a minimal DFA.
35
+
36
+ rev = startState.reverseNFA()
37
+ !db || rev.generatePDF("reversed_nfa")
38
+
39
+ bld = DFABuilder.new(rev)
40
+ dfa = bld.build(true, false) # partition, but don't normalize
41
+
42
+ !db || dfa.generatePDF("reversed_dfa")
120
43
 
121
- nfaIds = @nfaStateLists[dfaState.id]
44
+ rev2 = dfa.reverseNFA()
45
+ bld = DFABuilder.new(rev2)
122
46
 
123
- # map of CodeSet => set of NFA states
124
- moveMap = {}
47
+ # Don't regenerate the partition; it is still valid
48
+ # for this second build process
49
+ #
50
+ dfa = bld.build(false, true) # don't partition, but do normalize
125
51
 
126
- nfaIds.each do |nfaId|
127
- nfaState = @nfaStateMap[nfaId]
128
- nfaState.edges.each do |lbl,dest|
129
- if lbl.array[0] == EPSILON
52
+ # If there are edges that contain more than one token identifier,
53
+ # remove all but the first (i.e. the one with the highest token id)
54
+
55
+ stSet, _, _ = dfa.reachableStates
56
+ stSet.each do |s|
57
+ s.edges.each do |lbl, dest|
58
+ a = lbl.array
59
+ if !a.size
130
60
  next
131
61
  end
132
62
 
133
- nfaStates = moveMap[lbl]
134
- if !nfaStates
135
- nfaStates = Set.new
136
- moveMap[lbl] = nfaStates
137
- end
138
- nfaStates.add(dest)
63
+ primeId = a[0]
64
+
65
+ next if primeId >= EPSILON-1
66
+
67
+ lbl.difference!(CodeSet.new(primeId+1, EPSILON))
139
68
  end
140
69
  end
141
70
 
142
- moveMap.each_pair do |charRange,nfaStates|
143
- # May be better to test if already in set before calc closure; or simply has closure
144
- epsClosure(nfaStates)
145
- dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
146
- if isNew
147
- unmarked.push(dfaDestState)
148
- end
149
- dfaState.addEdge(charRange, dfaDestState)
150
- end
71
+ !db || dfa.generatePDF("minimal_dfa")
151
72
 
73
+ dfa
152
74
  end
153
75
 
154
- if normalize
155
- !db || @dfaStart.generatePDF("prior_normalize")
156
-
157
- !db || pr("Normalizing states for:\n\n%s\n",State.dumpNFA(@dfaStart))
158
- State.normalizeStates(@dfaStart)
159
- !db || pr("After normalizing:\n\n%s\n",State.dumpNFA(@dfaStart))
160
- !db || @dfaStart.generatePDF("post_normalize")
161
- end
162
76
 
163
- @dfaStart
164
- end
165
-
166
- private
167
-
168
- # Adds a DFA state for a set of NFA states, if one doesn't already exist
169
- # for the set
170
- # @param nfaStateList a sorted array of NFA state ids
171
- # @return a pair [DFA State,
172
- # created flag (boolean): true if this did not already exist]
173
- #
174
- def createDFAState(nfaStateList)
175
-
176
- lst = nfaStateList
177
77
 
178
- newState = @nfaStateMap[lst]
179
- isNewState = !newState
180
- if isNewState
181
- newState = State.new(@nextId)
78
+ # Constructs a builder object
79
+ #
80
+ def initialize(nfaStartState)
81
+ @nextId = 0
82
+ @nfaStart = nfaStartState
182
83
 
183
- # Determine if any of the NFA states were final states
184
- newState.finalState = nfaStateList.any?{|id| @nfaStateMap[id].finalState?}
84
+ # Build a map of nfa state ids => nfa states
85
+ @nfaStateMap = {}
86
+ nfas, _, _ = @nfaStart.reachableStates
87
+ nfas.each {|s| @nfaStateMap[s.id] = s}
185
88
 
186
- if false
187
- # Set label of DFA state to show which NFA states produced it
188
- # (useful for debugging)
189
- newState.label = lst.map {|x| x.to_s}.join(' ')
190
- end
89
+ # Initialize an array of nfa state lists, indexed by dfa state id
90
+ @nfaStateLists = []
191
91
 
192
- @nextId += 1
193
- @nfaStateMap[lst] = newState
194
- @nfaStateLists.push(lst)
195
-
92
+ # Map of existing DFA states; key is array of NFA state ids
93
+ @dfaStateMap = {}
196
94
  end
197
- return [newState,isNewState]
198
- end
199
95
 
200
- def stateSetToIdArray(s)
201
- s.to_a.map {|x| x.id}.sort
202
- end
96
+ # Perform the build algorithm
97
+ #
98
+ # @param partition if true, partitions the edge labels into disjoint code sets
99
+ # @param normalize if true, normalizes the states afterward
100
+ #
101
+ def build(partition = true, normalize = true)
102
+ db = false
103
+
104
+ !partition || partitionEdges(@nfaStart)
105
+
106
+ iset = Set.new
107
+ iset.add(@nfaStart)
108
+ epsClosure(iset)
109
+
110
+ @dfaStart,_ = createDFAState(stateSetToIdArray(iset))
111
+
112
+ markedStates = Set.new
113
+
114
+ unmarked = [@dfaStart]
203
115
 
204
- # Calculate the epsilon closure of a set of NFA states
205
- # @return a set of states
206
- #
207
- def epsClosure(stateSet)
208
- stk = stateSet.to_a
209
- while !stk.empty?
210
- s = stk.pop
211
- s.edges.each do |lbl,dest|
212
- if lbl.contains? EPSILON
213
- if stateSet.add?(dest)
214
- stk.push(dest)
116
+ until unmarked.empty?
117
+ dfaState = unmarked.pop
118
+
119
+ nfaIds = @nfaStateLists[dfaState.id]
120
+
121
+ # map of CodeSet => set of NFA states
122
+ moveMap = {}
123
+
124
+ nfaIds.each do |nfaId|
125
+ nfaState = @nfaStateMap[nfaId]
126
+ nfaState.edges.each do |lbl,dest|
127
+ if lbl.array[0] == EPSILON
128
+ next
129
+ end
130
+
131
+ nfaStates = moveMap[lbl]
132
+ if !nfaStates
133
+ nfaStates = Set.new
134
+ moveMap[lbl] = nfaStates
135
+ end
136
+ nfaStates.add(dest)
215
137
  end
216
138
  end
139
+
140
+ moveMap.each_pair do |charRange,nfaStates|
141
+ # May be better to test if already in set before calc closure; or simply has closure
142
+ epsClosure(nfaStates)
143
+ dfaDestState, isNew = createDFAState(stateSetToIdArray(nfaStates))
144
+ if isNew
145
+ unmarked.push(dfaDestState)
146
+ end
147
+ dfaState.addEdge(charRange, dfaDestState)
148
+ end
149
+
217
150
  end
218
- end
219
- stateSet
220
- end
221
-
222
- # Modify edges so each is labelled with a disjoint subset
223
- # of characters. See the notes at the start of this class,
224
- # as well as RangePartition.rb.
225
- #
226
- def partitionEdges(startState)
227
-
228
- db = false
229
-
230
- par = RangePartition.new
151
+
152
+ if normalize
153
+ !db || @dfaStart.generatePDF("prior_normalize")
154
+
155
+ !db || pr("Normalizing states for:\n\n%s\n",State.dumpNFA(@dfaStart))
156
+ State.normalizeStates(@dfaStart)
157
+ !db || pr("After normalizing:\n\n%s\n",State.dumpNFA(@dfaStart))
158
+ !db || @dfaStart.generatePDF("post_normalize")
159
+ end
160
+
161
+ @dfaStart
162
+ end
231
163
 
232
- stateSet, _, _ = startState.reachableStates
164
+ private
233
165
 
234
- stateSet.each do |s|
235
- s.edges.each {|lbl,dest| par.addSet(lbl) }
166
+ # Adds a DFA state for a set of NFA states, if one doesn't already exist
167
+ # for the set
168
+ # @param nfaStateList a sorted array of NFA state ids
169
+ # @return a pair [DFA State,
170
+ # created flag (boolean): true if this did not already exist]
171
+ #
172
+ def createDFAState(nfaStateList)
173
+
174
+ lst = nfaStateList
175
+
176
+ newState = @nfaStateMap[lst]
177
+ isNewState = !newState
178
+ if isNewState
179
+ newState = State.new(@nextId)
180
+
181
+ # Determine if any of the NFA states were final states
182
+ newState.finalState = nfaStateList.any?{|id| @nfaStateMap[id].finalState?}
183
+
184
+ if false
185
+ # Set label of DFA state to show which NFA states produced it
186
+ # (useful for debugging)
187
+ newState.label = lst.map {|x| x.to_s}.join(' ')
188
+ end
189
+
190
+ @nextId += 1
191
+ @nfaStateMap[lst] = newState
192
+ @nfaStateLists.push(lst)
193
+
194
+ end
195
+ return [newState,isNewState]
236
196
  end
237
197
 
238
- par.prepare
198
+ def stateSetToIdArray(s)
199
+ s.to_a.map {|x| x.id}.sort
200
+ end
239
201
 
240
- stateSet.each do |s|
241
- newEdges = []
242
- s.edges.each do |lbl, dest|
243
- !db||pr(" old edge: %s => %s\n",d(lbl),d(dest.name))
244
- newLbls = par.apply(lbl)
245
- newLbls.each {|x| newEdges.push([x, dest]) }
202
+ # Calculate the epsilon closure of a set of NFA states
203
+ # @return a set of states
204
+ #
205
+ def epsClosure(stateSet)
206
+ stk = stateSet.to_a
207
+ while !stk.empty?
208
+ s = stk.pop
209
+ s.edges.each do |lbl,dest|
210
+ if lbl.contains? EPSILON
211
+ if stateSet.add?(dest)
212
+ stk.push(dest)
213
+ end
214
+ end
215
+ end
246
216
  end
247
- s.clearEdges()
217
+ stateSet
218
+ end
219
+
220
+ # Modify edges so each is labelled with a disjoint subset
221
+ # of characters. See the notes at the start of this class,
222
+ # as well as RangePartition.rb.
223
+ #
224
+ def partitionEdges(startState)
225
+
226
+ db = false
227
+
228
+ par = RangePartition.new
229
+
230
+ stateSet, _, _ = startState.reachableStates
231
+
232
+ stateSet.each do |s|
233
+ s.edges.each {|lbl,dest| par.addSet(lbl) }
234
+ end
235
+
236
+ par.prepare
248
237
 
249
- newEdges.each do |lbl,dest|
250
- !db||pr(" new edge: %s => %s\n",d(lbl),d(dest.name))
251
- s.addEdge(lbl,dest)
238
+ stateSet.each do |s|
239
+ newEdges = []
240
+ s.edges.each do |lbl, dest|
241
+ !db||pr(" old edge: %s => %s\n",d(lbl),d(dest.name))
242
+ newLbls = par.apply(lbl)
243
+ newLbls.each {|x| newEdges.push([x, dest]) }
244
+ end
245
+ s.clearEdges()
246
+
247
+ newEdges.each do |lbl,dest|
248
+ !db||pr(" new edge: %s => %s\n",d(lbl),d(dest.name))
249
+ s.addEdge(lbl,dest)
250
+ end
251
+ !db||pr("\n")
252
252
  end
253
- !db||pr("\n")
253
+
254
254
  end
255
+
255
256
 
256
257
  end
257
258
 
258
-
259
- end
260
-
261
-
259
+ end # module ToknInternal