tokn 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,156 @@
+ require_relative 'tools'
+ req('tokn_const code_set dfa_builder state reg_parse')
+
+ # Parses a token definition script, and generates an NFA that
+ # is capable of recognizing and distinguishing between the various
+ # tokens.
+ #
+ # Each line in the script is one of
+ #
+ #   # ...comment... (the # must appear as the first character in the line)
+ #
+ #   <tokenname> ':' <regex>
+ #
+ #
+ # A <tokenname> must be an 'identifier' (alphanumeric, with the first character a letter or '_').
+ # If the first character is '_', the token is treated as an 'anonymous' token; anonymous tokens can
+ # appear in the curly brace portions of subsequent regular expression entries, but do not appear as
+ # tokens in the generated NFA.
+ #
+ class TokenDefParser
+   include Tokn
+
+   attr_reader :dfa
+
+   # Compile a token definition script into a DFA
+   #
+   def initialize(script, createPDF = false)
+     @script = script
+     parseScript
+     if createPDF
+       dfa.startState.generatePDF("tokenizer_dfa")
+     end
+   end
+
+   private
+
+   def parseScript
+     db = false
+
+     nextTokenId = 0
+
+     # List of token entries, including anonymous ones
+     @tokenListBig = []
+
+     # List of token names, excluding anonymous ones
+     tokenListSmall = []
+
+     # Maps token name to token entry
+     @tokenNameMap = {}
+
+     @lines = @script.split("\n")
+
+     @lines.each_with_index do |line, lineNumber|
+
+       line.strip!
+
+       # If the line is empty, or starts with '#', it's a comment
+       if line.length == 0 || line[0] == '#'
+         next
+       end
+
+       if !(line =~ TOKENNAME_EXPR)
+         raise ParseException, "Syntax error, line #"+lineNumber.to_s+": "+line
+       end
+
+       pos = line.index(":")
+
+       tokenName = line[0,pos].strip()
+
+       expr = line[pos+1..-1].strip()
+
+       rex = RegParse.new(expr, @tokenNameMap)
+
+       # Give it the next available token id, if it's not an anonymous token
+       tkId = nil
+       if tokenName[0] != '_'
+         tkId = nextTokenId
+         nextTokenId += 1
+       end
+
+       tkEntry = [tokenName, rex, @tokenListBig.size, tkId]
+
+       !db || pr("token entry: %s\n", d(tkEntry))
+
+       if @tokenNameMap.has_key?(tokenName)
+         raise ParseException, "Duplicate token name: "+line
+       end
+
+
+       @tokenListBig.push(tkEntry)
+       @tokenNameMap[tkEntry[0]] = tkEntry
+
+       if tkId
+         tokenListSmall.push(tokenName)
+       end
+
+       !db || pr(" added token name [%s] to map\n", d(tkEntry[0]))
+
+     end
+
+     combined = combineTokenNFAs()
+     !db || combined.generatePDF("combined")
+
+     dfa = DFABuilder.nfa_to_dfa(combined)
+     !db || dfa.generatePDF("combined_minimized")
+
+     @dfa = DFA.new(tokenListSmall, dfa)
+   end
+
+   # Combine the individual NFAs constructed for the token definitions into
+   # one large NFA, each augmented with an edge labelled with the appropriate
+   # token identifier so the tokenizer can see which token led to the final state.
+   #
+   def combineTokenNFAs
+
+     baseId = 0
+     startState = nil
+
+     @tokenListBig.each do |tokenName, regParse, index, tokenId|
+
+       # Skip anonymous token definitions
+       if !tokenId
+         next
+       end
+
+       oldToNewMap, baseId = regParse.startState.duplicateNFA(baseId)
+
+       dupStart = oldToNewMap[regParse.startState]
+
+       # Transition from the expression's end state (not a final state)
+       # to a new final state, with the transitioning edge
+       # labelled with the token id (actually, a transformed token id, to distinguish
+       # it from character codes)
+       dupEnd = oldToNewMap[regParse.endState]
+
+       dupFinalState = State.new(baseId)
+       baseId += 1
+       dupFinalState.finalState = true
+
+       dupEnd.addEdge(CodeSet.new(tokenIdToEdgeLabel(tokenId)), dupFinalState)
+
+       if !startState
+         startState = dupStart
+       else
+         # Add an epsilon transition from the start state to this expression's start
+         startState.addEdge(CodeSet.new(EPSILON), dupStart)
+       end
+     end
+     startState
+   end
+
+   # Regex for token names preceding regular expressions
+   #
+   TOKENNAME_EXPR = Regexp.new("[_A-Za-z][_A-Za-z0-9]*\\s*:\\s*")
+
+ end
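
A minimal usage sketch for the parser above (hypothetical driver code, not part of this package; it relies only on the constructor and the dfa accessor shown in this hunk):

    # Build a DFA from an inline token definition script; the script syntax
    # is the one documented in the TokenDefParser class comment.
    script = <<-'EOS'
      WS: [\s\t\n]+
      ID: [_a-zA-Z][_a-zA-Z0-9]*
    EOS

    parser = TokenDefParser.new(script)   # pass true as the second argument to also emit a PDF
    dfa = parser.dfa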
@@ -0,0 +1,211 @@
+ require_relative 'tools'
+ req('tokn_const')
+
+ # Extracts tokens from a script, given a previously constructed DFA.
+ #
+ class Tokenizer
+   include Tokn
+
+   # Construct a tokenizer, given a DFA and some text to process
+   #
+   def initialize(dfa, text)
+     @dfa = dfa
+     @text = text
+     @lineNumber = 0
+     @column = 0
+     @cursor = 0
+     @tokenHistory = []
+     @historyPointer = 0
+   end
+
+   # Determine the next token (without reading it)
+   #
+   # Returns a Token, or nil if end of input
+   #
+   def peek
+     if !@text
+       raise IllegalStateException, "No input text specified"
+     end
+
+     db = false
+     !db || warn("debug printing is on")
+     !db || pr("peek, cursor=%d\n", @cursor)
+
+     if @historyPointer == @tokenHistory.size
+       if @cursor < @text.length
+
+         bestLength = 0
+         bestId = UNKNOWN_TOKEN
+
+         charOffset = 0
+         state = @dfa.startState
+         while @cursor + charOffset <= @text.length
+           ch = nil
+           if @cursor + charOffset < @text.length
+             ch = @text[@cursor + charOffset].ord()
+             !db || pr(" offset=%d, ch=%d (%s)\n", charOffset, ch, ch.chr)
+           end
+
+           nextState = nil
+
+           # Examine the edges leaving this state.
+           # If one is labelled with a token id, we don't need to match the character against it;
+           # store it as the best token found so far if its length exceeds the previous best,
+           # or equals it with a higher token id.
+
+           # If an edge is labelled with the current character, advance to that state.
+
+           edges = state.edges
+           edges.each do |lbl, dest|
+             a = lbl.array
+             !db || pr(" edge lbl=%s, dest=%s\n", d(lbl), d(dest))
+             if a[0] < EPSILON
+               newTokenId = edgeLabelToTokenId(a[0])
+               !db || pr(" new token id=%d\n", newTokenId)
+
+               if (bestLength < charOffset || newTokenId > bestId)
+                 bestLength, bestId = charOffset, newTokenId
+                 !db || pr(" making longest found so far\n")
+               end
+             end
+
+             if ch && lbl.contains?(ch)
+               !db || pr(" setting next state to %s\n", d(dest))
+               nextState = dest
+               break
+             end
+           end
+
+           if !nextState
+             break
+           end
+           state = nextState
+           charOffset += 1
+           !db || pr(" advanced to next state\n")
+         end
+
+         peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
+         @tokenHistory.push(peekToken)
+       end
+     end
+
+     ret = nil
+     if @historyPointer < @tokenHistory.size
+       ret = @tokenHistory[@historyPointer]
+     end
+
+     ret
+   end
+
+
+   # Read the next token
+   #
+   # > tokenName : if not nil, the (string) name of the expected token
+   #
+   # Raises TokenizerException if there are no more tokens, if the token is
+   # unrecognized, or if its name differs from the expected one
+   #
+   def read(tokenName = nil)
+     token = peek()
+     if !token
+       raise TokenizerException, "No more tokens"
+     end
+
+     if token.id == UNKNOWN_TOKEN
+       raise TokenizerException, "Unknown token "+token.inspect
+     end
+
+     if tokenName && tokenName != nameOf(token)
+       raise TokenizerException, "Unexpected token "+token.inspect
+     end
+
+     @historyPointer += 1
+
+     # Advance cursor, line number, column
+
+     tl = token.text.length
+     @cursor += tl
+     tl.times do |i|
+       c = token.text[i]
+       @column += 1
+       if c == "\n"
+         @lineNumber += 1
+         @column = 0
+       end
+     end
+     token
+   end
+
+   # Read the next token if it has a particular name
+   #
+   # > tokenName : name to look for
+   # < token read, or nil
+   #
+   def readIf(tokenName)
+     ret = nil
+     token = peek()
+     if token && nameOf(token) == tokenName
+       ret = read()
+     end
+     ret
+   end
+
+   # Determine if another token exists
+   #
+   def hasNext
+     !peek().nil?
+   end
+
+   # Get the name of a token
+   # (i.e., the name of the token definition, not its text)
+   #
+   # > token read from this tokenizer
+   #
+   def nameOf(token)
+     @dfa.tokenName(token.id)
+   end
+
+   # Unread one (or more) previously read tokens
+   #
+   def unread(count = 1)
+     if @historyPointer < count
+       raise TokenizerException, "Cannot unread before start"
+     end
+     @historyPointer -= count
+   end
+
+ end
+
+
+
+
+ # Tokens read by Tokenizer
+ #
+ class Token
+   include Tokn
+
+   attr_reader :text, :lineNumber, :column, :id
+
+   def initialize(id, text, lineNumber, column)
+     @id = id
+     @text = text
+     @lineNumber = lineNumber
+     @column = column
+   end
+
+   def unknown?
+     id == UNKNOWN_TOKEN
+   end
+
+   def inspect
+     s = "(line "+lineNumber.to_s+", col "+column.to_s+")"
+     if !unknown?
+       s = s.ljust(17) + " : " + text
+     end
+     s
+   end
+ end
+
+
+ class TokenizerException < Exception
+ end
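
A sketch of the read loop these classes support (assuming a dfa built as in the previous hunk; hasNext, read, and nameOf are the methods defined above):

    tok = Tokenizer.new(dfa, "speed = 42\n")
    while tok.hasNext
      t = tok.read   # raises TokenizerException on an unrecognized token
      puts tok.nameOf(t) + " -> " + t.text.inspect
    end

Note that peek never advances @historyPointer, and unread simply moves it back over tokens already stored in @tokenHistory, so lookahead and backtracking cost nothing beyond the retained token list.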
@@ -0,0 +1,29 @@
+ # Module containing tokn-related constants and functions
+ #
+ module Tokn
+
+   # Token id if text didn't match any tokens in the DFA
+   UNKNOWN_TOKEN = -1
+
+   # Code for epsilon transitions
+   EPSILON = -1
+
+   # One plus the maximum code represented
+   CODEMAX = 0x110000
+
+   # Minimum code possible (e.g., indicating a token id)
+   CODEMIN = -10000
+
+   # Convert a token id (>= 0) to an edge label value (< 0)
+   #
+   def tokenIdToEdgeLabel(tokenId)
+     EPSILON - 1 - tokenId
+   end
+
+   # Convert an edge label value (< 0) to a token id (>= 0)
+   #
+   def edgeLabelToTokenId(edgeLabel)
+     EPSILON - 1 - edgeLabel
+   end
+
+ end
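
Since EPSILON is -1, both conversions reduce to -2 - x, so the mapping is its own inverse; a quick arithmetic check (in a context that includes Tokn):

    tokenIdToEdgeLabel(0)    # => -2
    tokenIdToEdgeLabel(5)    # => -7
    edgeLabelToTokenId(-7)   # => 5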
data/lib/tokn/tools.rb ADDED
@@ -0,0 +1,186 @@
+ require 'set'
+
+ # Various utility and debug convenience functions.
+ #
+
+ # Perform 'require_relative' on a set of files
+ #
+ # > fileListStr : space-delimited file/path items, without .rb extensions
+ # > subdir      : optional path to files relative to tools.rb
+ #
+ def req(fileListStr, subdir = nil)
+   fileListStr.split(' ').each do |x|
+     if subdir
+       x = File.join(subdir, x)
+     end
+     x += '.rb'
+     require_relative(x)
+   end
+ end
+
+ # Shorthand for printf(...)
+ #
+ def pr(*args)
+   printf(*args)
+ end
+
+
+ # Convert an object to a human-readable string;
+ # should be considered a debug-only feature
+ #
+ def d(arg)
+   if arg.nil?
+     "<nil>"
+   else
+     arg.inspect
+   end
+ end
+
+ # Assert that a value is true. Should be considered a
+ # very temporary, debug-only feature; it is slow and
+ # generates a warning that it is being called.
+ #
+ def myAssert(cond, *msg)
+   oneTimeAlert("warning", 0, "Checking assertion")
+   if not cond
+     if msg.size == 0
+       str = "assertion error"
+     else
+       str = sprintf(*msg)
+     end
+     raise Exception, str
+   end
+ end
+
+
+ # Set the test directory. If d is nil, it is set to home directory + "__test__"
+ #
+ def setTestDir(d = nil)
+   if !d
+     d = File.join(Dir.home, "__test__")
+   end
+   $testDir = d
+ end
+
+ # Get a path within the test directory;
+ # create the test directory if it doesn't exist.
+ #
+ # > relPath : if nil, returns the test directory; else
+ #             returns the test directory joined to this one
+ #
+ def withinTestDir(relPath = nil)
+   if !$testDir
+     raise IllegalStateException, "No test directory has been defined"
+   end
+   if !File.directory?($testDir)
+     Dir.mkdir($testDir)
+   end
+   if relPath
+     File.join($testDir, relPath)
+   else
+     $testDir
+   end
+ end
+
+ # Convert a .dot file (string) to a PDF file "__mygraph__nnn.pdf"
+ # in the test directory.
+ #
+ # It does this by making a system call to the 'dot' utility.
+ #
+ def dotToPDF(dotFile, name = "")
+   gr = dotFile
+   dotPath = withinTestDir(".__mygraph__.dot")
+   writeTextFile(dotPath, gr)
+   destName = withinTestDir("__mygraph__"+name+".pdf")
+   system("dot -Tpdf "+dotPath+" -o "+destName)
+ end
+
+
+ # Get a nice, concise description of the file and line
+ # of some caller within the stack.
+ #
+ # > nSkip : the number of items deep in the call stack to look
+ #
+ def getCallerLocation(nSkip = 2)
+
+   filename = nil
+   linenumber = nil
+
+   if nSkip >= 0 && nSkip < caller.size
+     fi = caller[nSkip]
+
+     # 'path : line number : other'
+     i = fi.index(':')
+     j = nil
+     if i
+       j = fi.index(':', i+1)
+     end
+     if j
+       pth = fi[0,i].split('/')
+       if pth.size > 0
+         filename = pth[-1]
+       end
+       linenumber = fi[i+1, j-i-1].to_i
+     end
+   end
+   if filename && linenumber
+     loc = filename + " ("+linenumber.to_s+")"
+   else
+     loc = "(UNKNOWN LOCATION)"
+   end
+   loc
+ end
+
+ # Set of alert strings that have already been reported
+ # (to avoid printing anything on subsequent invocations)
+ #
+ $AlertStrings = Set.new
+
+ # Print a message, if it hasn't yet been printed;
+ # the message includes the caller's location
+ #
+ # > typeString : e.g., "warning", "unimplemented"
+ # > nSkip      : the number of levels deep that the caller is in the stack
+ # > args       : if present, calls sprintf(...) with these to append to the message
+ #
+ def oneTimeAlert(typeString, nSkip, *args)
+   loc = getCallerLocation(nSkip + 2)
+   s = "*** "+typeString+" " + loc
+   if args.size > 0
+     s2 = sprintf(args[0], *args[1..-1])
+     msg = s + ": " + s2
+   else
+     msg = s
+   end
+
+   if $AlertStrings.add?(msg)
+     puts msg
+   end
+ end
+
+ # Print a 'warning' alert, one time only
+ #
+ def warn(*args)
+   oneTimeAlert("warning", 0, *args)
+ end
+
+ # Print an 'unimplemented' alert, one time only
+ #
+ def unimp(*args)
+   oneTimeAlert("unimplemented", 0, *args)
+ end
+
+ # Write a string to a text file
+ #
+ def writeTextFile(path, contents)
+   File.open(path, "wb") {|f| f.write(contents) }
+ end
+
+ # Read a file's contents, return as a string
+ #
+ def readTextFile(path)
+   contents = nil
+   File.open(path, "rb") {|f| contents = f.read }
+   contents
+ end
+
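
The alert helpers above key each message on its text plus the caller's location, so repeated calls from the same call site print only once. A small sketch (the message text is illustrative):

    3.times { warn("dot utility may be missing") }   # prints a single '*** warning ...' line
    unimp("graph pruning")                           # prints once, tagged 'unimplemented'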
data/lib/tokn.rb ADDED
@@ -0,0 +1 @@
+ require 'tokn/tokenizer'
@@ -0,0 +1,11 @@
+ // Example source file that can be tokenized
+
+ speed = 42 // speed of object
+
+ gravity = -9.80
+
+ title = 'This is a string with \' an escaped delimiter'
+
+ if gravity == 12 {
+ do something
+ }
@@ -0,0 +1,32 @@
+ # Sample token definitions
+
+ # Whitespace includes a comment, which starts with '//' and
+ # extends to the end of the line:
+ #
+ WS: ( [\f\r\s\t\n]+ ) | ( // [^\n]* \n? )
+
+ # An anonymous token, for convenience; a non-empty sequence of digits
+ #
+ _DIG: [0-9]+
+
+ # Double has lower priority than int; we want ints to
+ # be interpreted as ints, not as doubles
+ DBL: \-?(({_DIG}(.{_DIG})?)|.{_DIG})
+
+ INT: \-?{_DIG}
+
+ LBL: '([^'\n]|\\')*'
+
+ ID: [_a-zA-Z][_a-zA-Z0-9]*
+
+ ASSIGN: =
+
+ EQUIV: ==
+
+ IF: if
+ DO: do
+
+ BROP: \{
+
+ BRCL: \}
+
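
A worked example of the priority rules above: the input 42 matches both DBL (via the {_DIG} branch with the fractional part absent) and INT at the same length, and since the tokenizer breaks equal-length ties in favor of the higher token id, the later INT definition wins. For 42.5, only DBL matches all four characters, so the longest-match rule selects DBL.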
data/test/simple.rb ADDED
@@ -0,0 +1,33 @@
+ require 'test/unit'
+ require_relative '../lib/tokn/tools.rb'
+ req('tokenizer dfa')
+
+
+ class Simple
+
+   def dataPath(f)
+     File.dirname(__FILE__)+"/data/"+f
+   end
+
+   setTestDir()
+
+   # Various unit tests for state machines, character range sets, etc.
+
+   def initialize
+     @sampleText = readTextFile(self.dataPath("sampletext.txt"))
+     # @sampleTokens = readTextFile(self.dataPath("sampletokens.txt"))
+   end
+
+   def makeTok
+     dfa = DFA.dfa_from_script_file(self.dataPath("sampletokens.txt"))
+     Tokenizer.new(dfa, @sampleText)
+   end
+
+   def go
+     makeTok
+   end
+ end
+
+
+ s = Simple.new
+ s.go