tokn 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.txt +194 -0
- data/bin/tokncompile +16 -0
- data/bin/toknprocess +26 -0
- data/figures/sample_dfa.pdf +0 -0
- data/lib/tokn/code_set.rb +392 -0
- data/lib/tokn/dfa.rb +196 -0
- data/lib/tokn/dfa_builder.rb +261 -0
- data/lib/tokn/range_partition.rb +233 -0
- data/lib/tokn/reg_parse.rb +379 -0
- data/lib/tokn/state.rb +320 -0
- data/lib/tokn/token_defn_parser.rb +156 -0
- data/lib/tokn/tokenizer.rb +211 -0
- data/lib/tokn/tokn_const.rb +29 -0
- data/lib/tokn/tools.rb +186 -0
- data/lib/tokn.rb +1 -0
- data/test/data/sampletext.txt +11 -0
- data/test/data/sampletokens.txt +32 -0
- data/test/simple.rb +33 -0
- data/test/test.rb +519 -0
- data/test/testcmds +4 -0
- metadata +69 -0
@@ -0,0 +1,156 @@
|
|
1
|
+
require_relative 'tools'
|
2
|
+
req('tokn_const code_set dfa_builder state reg_parse')
|
3
|
+
|
4
|
+
# Parses a token definition script, and generates an NFA that
|
5
|
+
# is capable of recognizing and distinguishing between the various
|
6
|
+
# tokens.
|
7
|
+
#
|
8
|
+
# Each line in the script is one of
|
9
|
+
#
|
10
|
+
# # ...comment... (the # must appear as the first character in the line)
|
11
|
+
#
|
12
|
+
# <tokenname> ':' <regex>
|
13
|
+
#
|
14
|
+
#
|
15
|
+
# A <tokenname> must be an 'identifier' (alphanumeric, with first character a letter (or '_')).
|
16
|
+
# If the first character is '_', the token is treated as an 'anonymous' token; these can
|
17
|
+
# appear in the curly brace portions of previous reg. expr. entries, but do not appear as tokens in the
|
18
|
+
# generated NFA.
|
19
|
+
#
|
20
|
+
# Parses a token definition script, and generates an NFA that
# is capable of recognizing and distinguishing between the various
# tokens.
#
# Each line in the script is one of
#
#   # ...comment... (the # must appear as the first character in the line)
#
#   <tokenname> ':' <regex>
#
# A <tokenname> must be an 'identifier' (alphanumeric, with first character a letter (or '_')).
# If the first character is '_', the token is treated as an 'anonymous' token; these can
# appear in the curly brace portions of previous reg. expr. entries, but do not appear as tokens in the
# generated NFA.
#
class TokenDefParser
  include Tokn

  attr_reader :dfa

  # Compile a token definition script into a DFA
  #
  # script    : token definition script text (see class comment for format)
  # createPDF : if true, also renders the resulting DFA as "tokenizer_dfa"
  #
  def initialize(script, createPDF = false)
    @script = script
    parseScript
    if createPDF
      dfa.startState.generatePDF("tokenizer_dfa")
    end
  end

  private

  # Parse @script line by line, building an NFA per token definition,
  # then combine them and convert to the DFA stored in @dfa.
  #
  # Raises ParseException on malformed lines or duplicate token names.
  #
  def parseScript
    nextTokenId = 0

    # List of token entries, including anonymous ones
    @tokenListBig = []

    # List of token names, excluding anonymous ones
    tokenListSmall = []

    # Maps token name to token entry
    @tokenNameMap = {}

    @lines = @script.split("\n")

    @lines.each_with_index do |line, lineNumber|
      line.strip!

      # If line is empty, or starts with '#', it's a comment
      next if line.length == 0 || line[0] == '#'

      if !(line =~ TOKENNAME_EXPR)
        # Report 1-based line numbers, consistent with Token reporting
        raise ParseException, "Syntax error, line #" + (1 + lineNumber).to_s + ": " + line
      end

      pos = line.index(":")
      tokenName = line[0, pos].strip()
      expr = line[pos + 1..-1].strip()

      # Reject duplicates before parsing the expression
      if @tokenNameMap.has_key?(tokenName)
        raise ParseException, "Duplicate token name: " + line
      end

      rex = RegParse.new(expr, @tokenNameMap)

      # Give it the next available token id, if it's not an anonymous token
      tkId = nil
      if tokenName[0] != '_'
        tkId = nextTokenId
        nextTokenId += 1
      end

      tkEntry = [tokenName, rex, @tokenListBig.size, tkId]

      @tokenListBig.push(tkEntry)
      @tokenNameMap[tokenName] = tkEntry

      tokenListSmall.push(tokenName) if tkId
    end

    combined = combineTokenNFAs()

    dfa = DFABuilder.nfa_to_dfa(combined)

    @dfa = DFA.new(tokenListSmall, dfa)
  end

  # Combine the individual NFAs constructed for the token definitions into
  # one large NFA, each augmented with an edge labelled with the appropriate
  # token identifier to let the tokenizer see which token led to the final state.
  #
  def combineTokenNFAs
    baseId = 0
    startState = nil

    @tokenListBig.each do |tokenName, regParse, _index, tokenId|
      # Skip anonymous token definitions
      next if !tokenId

      oldToNewMap, baseId = regParse.startState.duplicateNFA(baseId)

      dupStart = oldToNewMap[regParse.startState]

      # Transition from the expression's end state (not a final state)
      # to a new final state, with the transitioning edge
      # labelled with the token id (actually, a transformed token id to distinguish
      # it from character codes)
      dupEnd = oldToNewMap[regParse.endState]

      dupfinalState = State.new(baseId)
      baseId += 1
      dupfinalState.finalState = true

      dupEnd.addEdge(CodeSet.new(tokenIdToEdgeLabel(tokenId)), dupfinalState)

      if !startState
        startState = dupStart
      else
        # Add an e-transition from the start state to this expression's start
        startState.addEdge(CodeSet.new(EPSILON), dupStart)
      end
    end
    startState
  end

  # Regex for token names preceding regular expressions.
  #
  # Bug fix: the original built this from a double-quoted STRING, in which
  # "\s" denotes a literal space (not the whitespace character class), and the
  # pattern was unanchored, so lines like "9abc: x" passed validation.
  # A regex literal anchored at \A preserves the intended meaning.
  #
  TOKENNAME_EXPR = /\A[_A-Za-z][_A-Za-z0-9]*\s*:/

end
|
@@ -0,0 +1,211 @@
|
|
1
|
+
require_relative 'tools'
|
2
|
+
req('tokn_const ')
|
3
|
+
|
4
|
+
# Extracts tokens from a script, given a previously constructed DFA.
|
5
|
+
#
|
6
|
+
# Extracts tokens from a script, given a previously constructed DFA.
#
class Tokenizer
  include Tokn

  # Construct a tokenizer, given a DFA and some text to process
  #
  def initialize(dfa, text)
    @dfa = dfa
    @text = text
    @lineNumber = 0          # 0-based; tokens report 1-based
    @column = 0              # 0-based; tokens report 1-based
    @cursor = 0              # scan position: just past the last token ever scanned
    @tokenHistory = []       # read-ahead / unread cache of Tokens
    @historyPointer = 0      # index of the next token read() will return
    @consumedCount = 0       # high-water mark of @historyPointer (see read())
  end

  # Determine next token (without reading it)
  #
  # Returns Token, or nil if end of input
  #
  def peek
    if !@text
      # NOTE(review): IllegalStateException is not a Ruby builtin; assumed to
      # be defined elsewhere in this library — confirm
      raise IllegalStateException, "No input text specified"
    end

    # Only scan new text when the history cache is exhausted
    if @historyPointer == @tokenHistory.size
      if @cursor < @text.length

        bestLength = 0
        bestId = UNKNOWN_TOKEN

        charOffset = 0
        state = @dfa.startState
        # <= so the loop runs once more at end-of-text: token-id edges from
        # the final state can still be examined even with no character left
        while @cursor + charOffset <= @text.length
          ch = nil
          if @cursor + charOffset < @text.length
            ch = @text[@cursor + charOffset].ord()
          end

          nextState = nil

          # Examine edges leaving this state.
          # If one is labelled with a token id (code < EPSILON), we don't need to
          # match the character with it; store as best token found if length is
          # longer than previous, or equal to previous with higher id.
          # If an edge is labelled with the current character, advance to that state.
          edges = state.edges
          edges.each do |lbl, dest|
            a = lbl.array
            if a[0] < EPSILON
              newTokenId = edgeLabelToTokenId(a[0])
              if (bestLength < charOffset || newTokenId > bestId)
                bestLength, bestId = charOffset, newTokenId
              end
            end

            if ch && lbl.contains?(ch)
              nextState = dest
              break
            end
          end

          break if !nextState

          state = nextState
          charOffset += 1
        end

        peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
        @tokenHistory.push(peekToken)
      end
    end

    ret = nil
    if @historyPointer < @tokenHistory.size
      ret = @tokenHistory[@historyPointer]
    end
    ret
  end

  # Read next token
  #
  # > tokenName : if not nil, the (string) name of the token expected
  #
  # Raises TokenizerException if no more tokens, if unrecognized token, or
  # if token has different than expected name
  #
  def read(tokenName = nil)
    token = peek()
    if !token
      raise TokenizerException,"No more tokens"
    end

    if token.id == UNKNOWN_TOKEN
      raise TokenizerException, "Unknown token "+token.inspect
    end

    if tokenName && tokenName != nameOf(token)
      raise TokenizerException, "Unexpected token "+token.inspect
    end

    @historyPointer += 1

    # Advance cursor, line number, column — but only the FIRST time this token
    # is consumed. (Bug fix: previously a read() following unread() advanced
    # @cursor/@lineNumber/@column a second time for the same text, which
    # desynchronized the scan position used by peek.)
    if @historyPointer > @consumedCount
      @consumedCount = @historyPointer

      tl = token.text.length
      @cursor += tl
      tl.times do |i|
        c = token.text[i]
        @column += 1
        if c == "\n"
          @lineNumber += 1
          @column = 0
        end
      end
    end
    token
  end

  # Read next token if it has a particular name
  #
  # > tokenName : name to look for
  # < token read, or nil
  #
  def readIf(tokenName)
    ret = nil
    token = peek()
    if token && nameOf(token) == tokenName
      ret = read()
    end
    ret
  end

  # Determine if another token exists
  #
  def hasNext
    !peek().nil?
  end

  # Get the name of a token
  # (i.e., the name of the token definition, not its text)
  #
  # > token read from this tokenizer
  #
  def nameOf(token)
    @dfa.tokenName(token.id)
  end

  # Unread one (or more) previously read tokens
  #
  # Raises TokenizerException when attempting to unread past the start.
  #
  def unread(count = 1)
    if @historyPointer < count
      raise TokenizerException, "Cannot unread before start"
    end
    @historyPointer -= count
  end

end
|
178
|
+
|
179
|
+
|
180
|
+
|
181
|
+
|
182
|
+
# Tokens read by Tokenizer
|
183
|
+
#
|
184
|
+
# Tokens read by Tokenizer.
#
# Immutable value object: the matched text plus its 1-based source
# position and the id of the token definition that matched it.
#
class Token
  include Tokn

  attr_reader :text, :lineNumber, :column, :id

  def initialize(id, text, lineNumber, column)
    @id = id
    @text = text
    @lineNumber = lineNumber
    @column = column
  end

  # True when the text did not match any token definition.
  #
  def unknown?
    id == UNKNOWN_TOKEN
  end

  # Human-readable summary: "(line N, col M)" plus, for recognized
  # tokens, the matched text.
  #
  def inspect
    desc = "(line " + lineNumber.to_s + ", col " + column.to_s + ")"
    unknown? ? desc : desc.ljust(17) + " : " + text
  end
end
|
208
|
+
|
209
|
+
|
210
|
+
# Raised by Tokenizer for end-of-input, unknown tokens, unexpected token
# names, and invalid unread requests.
#
# Fix: subclass StandardError, not Exception — application errors should be
# catchable by a bare `rescue`; rescuing Exception also traps signals and exits.
#
class TokenizerException < StandardError
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Module containing tokn-related constants and functions
#
# Mixed into the classes that need the token-id / edge-label encoding
# (Tokenizer, Token, TokenDefParser).
#
module Tokn

  # Token id if text didn't match any tokens in the DFA
  # NOTE(review): shares the value -1 with EPSILON below; UNKNOWN_TOKEN is
  # used as a Token id while EPSILON is used as an edge-label code, so the
  # two are never compared against each other — confirm before changing either.
  UNKNOWN_TOKEN = -1

  # Code for epsilon transitions
  EPSILON = -1

  # One plus the maximum code represented (Unicode code point range)
  CODEMAX = 0x110000

  # Minimum code possible (e.g., indicating a token id)
  CODEMIN = -10000

  # Convert a token id (>=0) to an edge label value ( < 0)
  # Maps 0 -> -2, 1 -> -3, ... so every label is strictly less than EPSILON.
  #
  def tokenIdToEdgeLabel(tokenId)
    EPSILON-1-tokenId
  end

  # Convert an edge label value ( < 0) to a token id (>=0)
  # Inverse of tokenIdToEdgeLabel (the mapping is its own inverse).
  #
  def edgeLabelToTokenId(edgeLabel)
    EPSILON-1-edgeLabel
  end

end
|
data/lib/tokn/tools.rb
ADDED
@@ -0,0 +1,186 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
# Various utility and debug convenience functions.
|
4
|
+
#
|
5
|
+
|
6
|
+
# Perform 'require_relative' on a set of files
|
7
|
+
#
|
8
|
+
# fileListStr : space-delimited file/path items, without .rb extensions
|
9
|
+
# subdir : optional path to files relative to tools.rb
|
10
|
+
#
|
11
|
+
# Perform 'require_relative' on a set of files
#
# fileListStr : space-delimited file/path items, without .rb extensions
# subdir      : optional path to files relative to this file
#
def req(fileListStr, subdir = nil)
  fileListStr.split(' ').each do |item|
    item = File.join(subdir, item) if subdir
    require_relative(item + '.rb')
  end
end
|
20
|
+
|
21
|
+
# Shorthand for printf(...)
|
22
|
+
#
|
23
|
+
# Shorthand for Kernel#printf; forwards all arguments unchanged.
#
def pr(*args)
  printf(*args)
end
|
26
|
+
|
27
|
+
|
28
|
+
# Convert an object to a human-readable string;
|
29
|
+
# should be considered a debug-only feature
|
30
|
+
#
|
31
|
+
# Convert an object to a human-readable string;
# should be considered a debug-only feature.
#
# Returns "<nil>" for nil, otherwise the object's #inspect form.
#
def d(arg)
  arg.nil? ? "<nil>" : arg.inspect
end
|
38
|
+
|
39
|
+
# Assert that a value is true. Should be considered a
|
40
|
+
# very temporary, debug-only option; it is slow and
|
41
|
+
# generates a warning that it is being called.
|
42
|
+
#
|
43
|
+
# Assert that a value is true. Should be considered a very temporary,
# debug-only option; it is slow and emits a one-time warning that it
# is being called.
#
# cond : value that must be truthy
# msg  : optional sprintf-style message arguments
#
def myAssert(cond, *msg)
  oneTimeAlert("warning", 0, "Checking assertion")
  return if cond

  detail = msg.size == 0 ? "assertion error" : sprintf(*msg)
  raise Exception, detail
end
|
54
|
+
|
55
|
+
|
56
|
+
# Set test directory. If nil, sets to home directory + "__test__"
|
57
|
+
#
|
58
|
+
# Set the global test directory ($testDir).
# If d is nil, defaults to home directory + "__test__".
#
def setTestDir(d = nil)
  $testDir = d || File.join(Dir.home, "__test__")
end
|
64
|
+
|
65
|
+
# Get a path within the test directory;
|
66
|
+
# create test directory if it doesn't exist.
|
67
|
+
#
|
68
|
+
# relPath : if nil, returns the test directory; else
|
69
|
+
# returns the test directory joined to this one
|
70
|
+
#
|
71
|
+
# Get a path within the test directory; create the test directory if it
# doesn't exist.
#
# relPath : if nil, returns the test directory; else returns the test
#           directory joined to this path
#
# NOTE(review): raises IllegalStateException, which is not a Ruby builtin;
# assumed to be defined elsewhere — confirm.
#
def withinTestDir(relPath = nil)
  raise IllegalStateException, "No test directory has been defined" if !$testDir

  Dir.mkdir($testDir) unless File.directory?($testDir)

  relPath ? File.join($testDir, relPath) : $testDir
end
|
84
|
+
|
85
|
+
# Convert a .dot file (string) to a PDF file "__mygraph__nnn.pdf"
|
86
|
+
# in the test directory.
|
87
|
+
#
|
88
|
+
# It does this by making a system call to the 'dot' utility.
|
89
|
+
#
|
90
|
+
# Convert a .dot file (string) to a PDF file "__mygraph__<name>.pdf"
# in the test directory, via a system call to the 'dot' utility.
#
# dotFile : graph description in dot format (a string, not a path)
# name    : suffix for the output file name
#
# Fix: use the multi-argument form of Kernel#system so the paths are passed
# directly to 'dot' instead of being interpolated into a shell command line
# (paths containing spaces or shell metacharacters previously broke or were
# interpreted by the shell).
#
def dotToPDF(dotFile, name = "")
  dotPath = withinTestDir(".__mygraph__.dot")
  writeTextFile(dotPath, dotFile)
  destName = withinTestDir("__mygraph__" + name + ".pdf")
  system("dot", "-Tpdf", dotPath, "-o", destName)
end
|
97
|
+
|
98
|
+
|
99
|
+
# Get a nice, concise description of the file and line
|
100
|
+
# of some caller within the stack.
|
101
|
+
#
|
102
|
+
# nSkip : the number of items deep in the call stack to look
|
103
|
+
#
|
104
|
+
# Get a nice, concise description of the file and line of some caller
# within the stack, e.g. "foo.rb (42)".
#
# nSkip : the number of items deep in the call stack to look
#
# Returns "(UNKNOWN LOCATION)" when the frame is out of range or unparsable.
#
# Fix: the original tested `if pth.size`, which is always truthy in Ruby
# (0 and [] are truthy); test emptiness explicitly instead.
#
def getCallerLocation(nSkip = 2)

  filename = nil
  linenumber = nil

  if nSkip >= 0 && nSkip < caller.size
    fi = caller[nSkip]

    # Frame format: ' path : line number : other '
    i = fi.index(':')
    j = nil
    if i
      j = fi.index(':', i + 1)
    end
    if j
      pth = fi[0, i].split('/')
      if pth.size > 0
        filename = pth[-1]
      end
      linenumber = fi[i + 1, j - i - 1].to_i
    end
  end

  if filename && linenumber
    filename + " (" + linenumber.to_s + ")"
  else
    "(UNKNOWN LOCATION)"
  end
end
|
133
|
+
|
134
|
+
# Set of alert strings that have already been reported
# (to avoid printing anything on subsequent invocations).
# Relies on `require 'set'` at the top of this file.
#
$AlertStrings = Set.new
|
138
|
+
|
139
|
+
# Print a message if it hasn't yet been printed,
|
140
|
+
# which includes the caller's location
|
141
|
+
#
|
142
|
+
# > typeString : e.g., "warning", "unimplemented"
|
143
|
+
# > nSkip : the number of levels deep that the caller is in the stack
|
144
|
+
# > args : if present, calls sprintf(...) with these to append to the message
|
145
|
+
#
|
146
|
+
# Print a message if it hasn't yet been printed,
# which includes the caller's location.
#
# > typeString : e.g., "warning", "unimplemented"
# > nSkip      : the number of levels deep that the caller is in the stack
# > args       : if present, calls sprintf(...) with these to append to the message
#
# Fix: the original guarded the sprintf branch with `if args && args.size`,
# which is always truthy in Ruby (a splat arg is always an Array and Integer 0
# is truthy), so a call with no varargs reached sprintf(nil) and raised
# TypeError. Test emptiness explicitly.
#
def oneTimeAlert(typeString, nSkip, *args)
  loc = getCallerLocation(nSkip + 2)
  s = "*** "+typeString+" " + loc
  if !args.empty?
    s2 = sprintf(args[0], *args[1..-1])
    msg = s + ": " + s2
  else
    msg = s
  end

  # Set#add? returns nil when the message was already present
  if $AlertStrings.add?(msg)
    puts msg
  end
end
|
160
|
+
|
161
|
+
# Print a 'warning' alert, one time only
|
162
|
+
#
|
163
|
+
# Print a 'warning' alert, one time only.
#
# NOTE(review): this intentionally shadows Kernel#warn for the whole
# program once this file is loaded — confirm that is desired.
#
def warn(*args)
  oneTimeAlert("warning", 0, *args)
end
|
166
|
+
|
167
|
+
# Print an 'unimplemented' alert, one time only
|
168
|
+
#
|
169
|
+
# Print an 'unimplemented' alert, one time only.
#
def unimp(*args)
  oneTimeAlert("unimplemented", 0, *args)
end
|
172
|
+
|
173
|
+
# Write a string to a text file
|
174
|
+
#
|
175
|
+
# Write a string to a file, in binary mode (no newline translation).
#
def writeTextFile(path, contents)
  File.binwrite(path, contents)
end
|
178
|
+
|
179
|
+
# Read a file's contents, return as a string
|
180
|
+
#
|
181
|
+
# Read a file's entire contents in binary mode; return as a string.
#
def readTextFile(path)
  File.binread(path)
end
|
186
|
+
|
data/lib/tokn.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'tokn/tokenizer'
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# Sample token definitions
|
2
|
+
|
3
|
+
# Whitespace includes a comment, which starts with '//' and
|
4
|
+
# extends to the end of the line:
|
5
|
+
#
|
6
|
+
WS: ( [\f\r\s\t\n]+ ) | ( // [^\n]* \n? )
|
7
|
+
|
8
|
+
# An anonymous token, for convenience; a non-empty sequence of digits
|
9
|
+
#
|
10
|
+
_DIG: [0-9]+
|
11
|
+
|
12
|
+
# Double has lower priority than int; we want ints to
|
13
|
+
# be interpreted as ints, not as doubles
|
14
|
+
DBL: \-?(({_DIG}(.{_DIG})?)|.{_DIG})
|
15
|
+
|
16
|
+
INT: \-?{_DIG}
|
17
|
+
|
18
|
+
LBL: '([^'\n]|\\')*'
|
19
|
+
|
20
|
+
ID: [_a-zA-Z][_a-zA-Z0-9]*
|
21
|
+
|
22
|
+
ASSIGN: =
|
23
|
+
|
24
|
+
EQUIV: ==
|
25
|
+
|
26
|
+
IF: if
|
27
|
+
DO: do
|
28
|
+
|
29
|
+
BROP: \{
|
30
|
+
|
31
|
+
BRCL: \}
|
32
|
+
|
data/test/simple.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require_relative '../lib/tokn/tools.rb'
|
3
|
+
req('tokenizer dfa')
|
4
|
+
|
5
|
+
|
6
|
+
# Smoke test: build a DFA from the sample token definitions and
# construct a tokenizer over the sample text.
#
class Simple

  # Resolve a file name inside this script's data/ directory.
  def dataPath(f)
    File.dirname(__FILE__) + "/data/" + f
  end

  # Runs at class-definition time, same as the original script.
  setTestDir()

  def initialize
    @sampleText = readTextFile(dataPath("sampletext.txt"))
  end

  # Build a Tokenizer from the sample token definition script.
  def makeTok
    dfa = DFA.dfa_from_script_file(dataPath("sampletokens.txt"))
    Tokenizer.new(dfa, @sampleText)
  end

  def go
    makeTok
  end
end

Simple.new.go
|