tokn 0.0.4

@@ -0,0 +1,156 @@
+ require_relative 'tools'
+ req('tokn_const code_set dfa_builder state reg_parse')
+
+ # Parses a token definition script, and generates an NFA capable of
+ # recognizing and distinguishing between the various tokens
+ # (the NFA is subsequently converted to a DFA).
+ #
+ # Each line in the script is one of:
+ #
+ #   # ...comment... (the # must appear as the first character in the line)
+ #
+ #   <tokenname> ':' <regex>
+ #
+ # A <tokenname> must be an 'identifier': alphanumeric, with the first
+ # character a letter or '_'. If the first character is '_', the token is
+ # treated as an 'anonymous' token; these can be referenced, within curly
+ # braces, by the regular expressions of subsequent entries, but do not
+ # appear as tokens in the generated NFA.
+ #
+ class TokenDefParser
+   include Tokn
+
+   attr_reader :dfa
+
+   # Compile a token definition script into a DFA
+   #
+   def initialize(script, createPDF = false)
+     @script = script
+     parseScript
+     if createPDF
+       dfa.startState.generatePDF("tokenizer_dfa")
+     end
+   end
+
+   private
+
+   def parseScript
+     db = false
+
+     nextTokenId = 0
+
+     # List of token entries, including anonymous ones
+     @tokenListBig = []
+
+     # List of token names, excluding anonymous ones
+     tokenListSmall = []
+
+     # Maps token name to token entry
+     @tokenNameMap = {}
+
+     @lines = @script.split("\n")
+
+     @lines.each_with_index do |line, lineNumber|
+
+       line.strip!
+
+       # If the line is empty, or starts with '#', it's a comment
+       if line.length == 0 || line[0] == '#'
+         next
+       end
+
+       if !(line =~ TOKENNAME_EXPR)
+         raise ParseException, "Syntax error, line #"+lineNumber.to_s+": "+line
+       end
+
+       pos = line.index(":")
+
+       tokenName = line[0,pos].strip()
+
+       expr = line[pos+1..-1].strip()
+
+       rex = RegParse.new(expr, @tokenNameMap)
+
+       # Give it the next available token id, if it's not an anonymous token
+       tkId = nil
+       if tokenName[0] != '_'
+         tkId = nextTokenId
+         nextTokenId += 1
+       end
+
+       tkEntry = [tokenName, rex, @tokenListBig.size, tkId]
+
+       !db || pr("token entry: %s\n",d(tkEntry))
+
+       if @tokenNameMap.has_key?(tokenName)
+         raise ParseException, "Duplicate token name: "+line
+       end
+
+       @tokenListBig.push(tkEntry)
+       @tokenNameMap[tkEntry[0]] = tkEntry
+
+       if tkId
+         tokenListSmall.push(tokenName)
+       end
+
+       !db || pr(" added token name [%s] to map\n",d(tkEntry[0]))
+
+     end
+
+     combined = combineTokenNFAs()
+     !db || combined.generatePDF("combined")
+
+     dfa = DFABuilder.nfa_to_dfa(combined)
+     !db || dfa.generatePDF("combined_minimized")
+
+     @dfa = DFA.new(tokenListSmall, dfa)
+   end
+
+   # Combine the individual NFAs constructed for the token definitions into
+   # one large NFA, each augmented with an edge labelled with the appropriate
+   # token identifier so the tokenizer can see which token led to the final state.
+   #
+   def combineTokenNFAs
+
+     baseId = 0
+     startState = nil
+
+     @tokenListBig.each do |tokenName, regParse, index, tokenId|
+
+       # Skip anonymous token definitions
+       if !tokenId
+         next
+       end
+
+       oldToNewMap, baseId = regParse.startState.duplicateNFA(baseId)
+
+       dupStart = oldToNewMap[regParse.startState]
+
+       # Transition from the expression's end state (not a final state)
+       # to a new final state, with the transitioning edge labelled with
+       # the token id (actually, a transformed token id, to distinguish it
+       # from character codes)
+       dupEnd = oldToNewMap[regParse.endState]
+
+       dupfinalState = State.new(baseId)
+       baseId += 1
+       dupfinalState.finalState = true
+
+       dupEnd.addEdge(CodeSet.new(tokenIdToEdgeLabel(tokenId)), dupfinalState)
+
+       if !startState
+         startState = dupStart
+       else
+         # Add an e-transition from the start state to this expression's start
+         startState.addEdge(CodeSet.new(EPSILON), dupStart)
+       end
+     end
+     startState
+   end
+
+   # Regex for token names preceding regular expressions
+   # (single-quoted, so the backslash escapes reach the regex engine intact)
+   #
+   TOKENNAME_EXPR = Regexp.new('[_A-Za-z][_A-Za-z0-9]*\s*:\s*')
+
+ end
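
A minimal usage sketch (hypothetical driver code, not part of the gem's files; it assumes the classes above are already loaded): compile a two-token script, then hand the resulting DFA to the Tokenizer defined in the next file.

    script = <<~'EOS'
      WS: [\s\t\n]+
      ID: [_a-zA-Z][_a-zA-Z0-9]*
    EOS

    parser = TokenDefParser.new(script)            # parses the script and builds the DFA
    tok = Tokenizer.new(parser.dfa, "alpha beta")
    while tok.hasNext
      puts tok.read.inspect                        # e.g. "(line 1, col 1)   : alpha"
    end
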
@@ -0,0 +1,211 @@
+ require_relative 'tools'
+ req('tokn_const')
+
+ # Extracts tokens from a script, given a previously constructed DFA.
+ #
+ class Tokenizer
+   include Tokn
+
+   # Construct a tokenizer, given a DFA and some text to process
+   #
+   def initialize(dfa, text)
+     @dfa = dfa
+     @text = text
+     @lineNumber = 0
+     @column = 0
+     @cursor = 0
+     @tokenHistory = []
+     @historyPointer = 0
+   end
+
+   # Determine the next token (without reading it)
+   #
+   # Returns a Token, or nil if at end of input
+   #
+   def peek
+     if !@text
+       raise IllegalStateException, "No input text specified"
+     end
+
+     db = false
+     !db || warn("debug printing is on")
+     !db || pr("peek, cursor=%d\n",@cursor)
+
+     if @historyPointer == @tokenHistory.size
+       if @cursor < @text.length
+
+         bestLength = 0
+         bestId = UNKNOWN_TOKEN
+
+         charOffset = 0
+         state = @dfa.startState
+
+         # Note the '<=': we iterate one position past the last character,
+         # so that token-id edges leaving the last state reached are still
+         # examined.
+         while @cursor + charOffset <= @text.length
+           ch = nil
+           if @cursor + charOffset < @text.length
+             ch = @text[@cursor + charOffset].ord()
+             !db || pr(" offset=%d, ch=%d (%s)\n",charOffset,ch,ch.chr)
+           end
+
+           nextState = nil
+
+           # Examine the edges leaving this state.
+           # If one is labelled with a token id, we don't need to match the
+           # character against it; record it as the best token found so far
+           # if its length exceeds the previous best, or equals it with a
+           # higher token id.
+           #
+           # If an edge is labelled with the current character, advance to
+           # that state.
+
+           edges = state.edges
+           edges.each do |lbl,dest|
+             a = lbl.array
+             !db || pr(" edge lbl=%s, dest=%s\n",d(lbl),d(dest))
+             if a[0] < EPSILON
+               newTokenId = edgeLabelToTokenId(a[0])
+               !db || pr(" new token id=%d\n",newTokenId)
+
+               if (bestLength < charOffset || newTokenId > bestId)
+                 bestLength, bestId = charOffset, newTokenId
+                 !db || pr(" longest token found so far\n")
+               end
+             end
+
+             if ch && lbl.contains?(ch)
+               !db || pr(" setting next state to %s\n",d(dest))
+               nextState = dest
+               break
+             end
+           end
+
+           if !nextState
+             break
+           end
+           state = nextState
+           charOffset += 1
+           !db || pr(" advanced to next state\n")
+         end
+
+         peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
+         @tokenHistory.push(peekToken)
+       end
+     end
+
+     ret = nil
+     if @historyPointer < @tokenHistory.size
+       ret = @tokenHistory[@historyPointer]
+     end
+
+     ret
+   end
+
+   # Read the next token
+   #
+   # > tokenName : if not nil, the (string) name of the token expected
+   #
+   # Raises TokenizerException if there are no more tokens, if the token
+   # is unrecognized, or if the token's name differs from the expected one
+   #
+   def read(tokenName = nil)
+     token = peek()
+     if !token
+       raise TokenizerException, "No more tokens"
+     end
+
+     if token.id == UNKNOWN_TOKEN
+       raise TokenizerException, "Unknown token "+token.inspect
+     end
+
+     if tokenName && tokenName != nameOf(token)
+       raise TokenizerException, "Unexpected token "+token.inspect
+     end
+
+     @historyPointer += 1
+
+     # Advance cursor, line number, column
+
+     tl = token.text.length
+     @cursor += tl
+     tl.times do |i|
+       c = token.text[i]
+       @column += 1
+       if c == "\n"
+         @lineNumber += 1
+         @column = 0
+       end
+     end
+     token
+   end
+
+   # Read the next token if it has a particular name
+   #
+   # > tokenName : name to look for
+   # < token read, or nil
+   #
+   def readIf(tokenName)
+     ret = nil
+     token = peek()
+     if token && nameOf(token) == tokenName
+       ret = read()
+     end
+     ret
+   end
+
+   # Determine if another token exists
+   #
+   def hasNext
+     !peek().nil?
+   end
+
+   # Get the name of a token
+   # (i.e., the name of the token definition, not its text)
+   #
+   # > token read from this tokenizer
+   #
+   def nameOf(token)
+     @dfa.tokenName(token.id)
+   end
+
+   # Unread one (or more) previously read tokens
+   #
+   def unread(count = 1)
+     if @historyPointer < count
+       raise TokenizerException, "Cannot unread before start"
+     end
+     @historyPointer -= count
+   end
+
+ end
+
+
+ # Tokens read by Tokenizer
+ #
+ class Token
+   include Tokn
+
+   attr_reader :text, :lineNumber, :column, :id
+
+   def initialize(id, text, lineNumber, column)
+     @id = id
+     @text = text
+     @lineNumber = lineNumber
+     @column = column
+   end
+
+   def unknown?
+     id == UNKNOWN_TOKEN
+   end
+
+   def inspect
+     s = "(line "+lineNumber.to_s+", col "+column.to_s+")"
+     if !unknown?
+       s = s.ljust(17) + " : " + text
+     end
+     s
+   end
+ end
+
+
+ class TokenizerException < StandardError
+ end
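
The token history gives the tokenizer arbitrary lookahead and pushback. A short sketch of that part of the API (hypothetical usage; `dfa` is assumed to have been compiled from the sample token definitions shown later in this diff):

    tok = Tokenizer.new(dfa, "if gravity == 12")

    tok.peek            # look at the next token without consuming it
    tok.read("IF")      # consume it, asserting its token name
    tok.readIf("WS")    # consume only if the name matches; otherwise returns nil
    tok.unread(2)       # push both tokens back onto the history
    tok.read("IF")      # reads the same "if" token again
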
@@ -0,0 +1,29 @@
+ # Module containing tokn-related constants and functions
+ #
+ module Tokn
+
+   # Token id used when text didn't match any token in the DFA
+   UNKNOWN_TOKEN = -1
+
+   # Code for epsilon transitions
+   EPSILON = -1
+
+   # One plus the maximum character code represented (0x110000 = U+10FFFF + 1)
+   CODEMAX = 0x110000
+
+   # Minimum code possible (e.g., indicating a token id)
+   CODEMIN = -10000
+
+   # Convert a token id (>= 0) to an edge label value (< 0)
+   #
+   def tokenIdToEdgeLabel(tokenId)
+     EPSILON-1-tokenId
+   end
+
+   # Convert an edge label value (< 0) to a token id (>= 0)
+   #
+   def edgeLabelToTokenId(edgeLabel)
+     EPSILON-1-edgeLabel
+   end
+
+ end
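
Token ids are encoded on NFA edges as codes below EPSILON, so they can never collide with ordinary character codes (which are >= 0). The mapping t -> EPSILON - 1 - t is its own inverse; a quick check:

    include Tokn

    tokenIdToEdgeLabel(0)    # => -2  (just below EPSILON)
    tokenIdToEdgeLabel(5)    # => -7
    edgeLabelToTokenId(-7)   # => 5   (applying the same formula undoes it)
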
data/lib/tokn/tools.rb ADDED
@@ -0,0 +1,186 @@
+ require 'set'
+
+ # Various utility and debug convenience functions.
+ #
+
+ # Perform 'require_relative' on a set of files
+ #
+ # > fileListStr : space-delimited file/path items, without .rb extensions
+ # > subdir      : optional path to the files, relative to tools.rb
+ #
+ def req(fileListStr, subdir = nil)
+   fileListStr.split(' ').each do |x|
+     if subdir
+       x = File.join(subdir,x)
+     end
+     x += '.rb'
+     require_relative(x)
+   end
+ end
+
+ # Shorthand for printf(...)
+ #
+ def pr(*args)
+   printf(*args)
+ end
+
+ # Convert an object to a human-readable string;
+ # should be considered a debug-only feature
+ #
+ def d(arg)
+   if arg.nil?
+     "<nil>"
+   else
+     arg.inspect
+   end
+ end
+
+ # Assert that a value is true. Should be considered a
+ # very temporary, debug-only option; it is slow, and it
+ # generates a warning that it is being called.
+ #
+ def myAssert(cond, *msg)
+   oneTimeAlert("warning",0,"Checking assertion")
+   if not cond
+     if msg.size == 0
+       str = "assertion error"
+     else
+       str = sprintf(*msg)
+     end
+     raise Exception, str
+   end
+ end
+
+ # Set the test directory. If d is nil, sets it to (home directory)/__test__
+ #
+ def setTestDir(d = nil)
+   if !d
+     d = File.join(Dir.home,"__test__")
+   end
+   $testDir = d
+ end
+
+ # Get a path within the test directory;
+ # create the test directory if it doesn't exist.
+ #
+ # > relPath : if nil, returns the test directory; else
+ #             returns the test directory joined to this path
+ #
+ def withinTestDir(relPath = nil)
+   if !$testDir
+     raise IllegalStateException, "No test directory has been defined"
+   end
+   if !File.directory?($testDir)
+     Dir::mkdir($testDir)
+   end
+   if relPath
+     File.join($testDir,relPath)
+   else
+     $testDir
+   end
+ end
+
+ # Convert a .dot file (string) to a PDF file "__mygraph__nnn.pdf"
+ # in the test directory.
+ #
+ # It does this by making a system call to the 'dot' utility.
+ #
+ def dotToPDF(dotFile, name = "")
+   gr = dotFile
+   dotPath = withinTestDir(".__mygraph__.dot")
+   writeTextFile(dotPath,gr)
+   destName = withinTestDir("__mygraph__"+name+".pdf")
+   system("dot -Tpdf "+dotPath+" -o "+destName)
+ end
+
+ # Get a nice, concise description of the file and line
+ # of some caller within the stack.
+ #
+ # > nSkip : the number of items deep in the call stack to look
+ #
+ def getCallerLocation(nSkip = 2)
+
+   filename = nil
+   linenumber = nil
+
+   if nSkip >= 0 && nSkip < caller.size
+     fi = caller[nSkip]
+
+     # 'path : line number : other'
+     i = fi.index(':')
+     j = nil
+     if i
+       j = fi.index(':',i+1)
+     end
+     if j
+       pth = fi[0,i].split('/')
+       if pth.size > 0
+         filename = pth[-1]
+       end
+       linenumber = fi[i+1,j-i-1].to_i
+     end
+   end
+   if filename && linenumber
+     loc = filename + " ("+linenumber.to_s+")"
+   else
+     loc = "(UNKNOWN LOCATION)"
+   end
+   loc
+ end
+
+ # Set of alert strings that have already been reported
+ # (to avoid printing anything on subsequent invocations)
+ #
+ $AlertStrings = Set.new
+
+ # Print a message, if it hasn't yet been printed,
+ # including the caller's location
+ #
+ # > typeString : e.g., "warning", "unimplemented"
+ # > nSkip      : the number of levels deep that the caller is in the stack
+ # > args       : if present, calls sprintf(...) with these to append to the message
+ #
+ def oneTimeAlert(typeString, nSkip, *args)
+   loc = getCallerLocation(nSkip + 2)
+   s = "*** "+typeString+" "+loc
+   if args.size > 0
+     s2 = sprintf(args[0], *args[1..-1])
+     msg = s + ": " + s2
+   else
+     msg = s
+   end
+
+   # Set#add? returns nil if the item was already present
+   if $AlertStrings.add?(msg)
+     puts msg
+   end
+ end
+
+ # Print a 'warning' alert, one time only
+ #
+ def warn(*args)
+   oneTimeAlert("warning", 0, *args)
+ end
+
+ # Print an 'unimplemented' alert, one time only
+ #
+ def unimp(*args)
+   oneTimeAlert("unimplemented", 0, *args)
+ end
+
+ # Write a string to a text file
+ #
+ def writeTextFile(path, contents)
+   File.open(path, "wb") {|f| f.write(contents) }
+ end
+
+ # Read a file's contents, and return them as a string
+ #
+ def readTextFile(path)
+   contents = nil
+   File.open(path,"rb") {|f| contents = f.read }
+   contents
+ end
+
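
Note that the alert helpers dedupe on the fully formatted message, which includes the caller's location, so a warning emitted repeatedly from one line prints only once. A tiny illustration (hypothetical):

    3.times { warn("graphviz 'dot' utility not found") }
    # Prints "*** warning <file> (<line>): graphviz 'dot' utility not found"
    # only once: the later calls build the identical string, and Set#add?
    # returns nil for an element that is already present.
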
data/lib/tokn.rb ADDED
@@ -0,0 +1 @@
+ require 'tokn/tokenizer'
@@ -0,0 +1,11 @@
+ // Example source file that can be tokenized
+
+ speed = 42 // speed of object
+
+ gravity = -9.80
+
+ title = 'This is a string with \' an escaped delimiter'
+
+ if gravity == 12 {
+ do something
+ }
@@ -0,0 +1,32 @@
+ # Sample token definitions
+
+ # Whitespace includes a comment, which starts with '//' and
+ # extends to the end of the line:
+ #
+ WS: ( [\f\r\s\t\n]+ ) | ( // [^\n]* \n? )
+
+ # An anonymous token, for convenience; a non-empty sequence of digits
+ #
+ _DIG: [0-9]+
+
+ # Double has lower priority than int; we want ints to
+ # be interpreted as ints, not as doubles
+ #
+ DBL: \-?(({_DIG}(\.{_DIG})?)|\.{_DIG})
+
+ INT: \-?{_DIG}
+
+ LBL: '([^'\n]|\\')*'
+
+ ID: [_a-zA-Z][_a-zA-Z0-9]*
+
+ ASSIGN: =
+
+ EQUIV: ==
+
+ IF: if
+ DO: do
+
+ BROP: \{
+
+ BRCL: \}
+
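
Applied to the first assignment in sampletext.txt above, these definitions yield the following token stream (longest match wins; on a tie between equal-length matches, the tokenizer prefers the later, higher-id definition, which is why 42 becomes an INT rather than a DBL):

    speed                  ID
    " "                    WS      (whitespace-run alternative)
    =                      ASSIGN  (EQUIV would require a second '=')
    " "                    WS
    42                     INT     (DBL also matches both characters; INT's higher id wins)
    " "                    WS
    // speed of object\n   WS      (comment alternative)
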
data/test/simple.rb ADDED
@@ -0,0 +1,33 @@
+ require 'test/unit'
+ require_relative '../lib/tokn/tools.rb'
+ req('tokenizer dfa')
+
+
+ # Various unit tests for state machines, character range sets, etc.
+ #
+ class Simple
+
+   def dataPath(f)
+     File.dirname(__FILE__)+"/data/"+f
+   end
+
+   setTestDir()
+
+   def initialize
+     @sampleText = readTextFile(self.dataPath("sampletext.txt"))
+     # @sampleTokens = readTextFile(self.dataPath("sampletokens.txt"))
+   end
+
+   def makeTok
+     dfa = DFA.dfa_from_script_file(self.dataPath("sampletokens.txt"))
+     Tokenizer.new(dfa, @sampleText)
+   end
+
+   def go
+     makeTok
+   end
+ end
+
+
+ s = Simple.new
+ s.go