tokn 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.txt +194 -0
- data/bin/tokncompile +16 -0
- data/bin/toknprocess +26 -0
- data/figures/sample_dfa.pdf +0 -0
- data/lib/tokn/code_set.rb +392 -0
- data/lib/tokn/dfa.rb +196 -0
- data/lib/tokn/dfa_builder.rb +261 -0
- data/lib/tokn/range_partition.rb +233 -0
- data/lib/tokn/reg_parse.rb +379 -0
- data/lib/tokn/state.rb +320 -0
- data/lib/tokn/token_defn_parser.rb +156 -0
- data/lib/tokn/tokenizer.rb +211 -0
- data/lib/tokn/tokn_const.rb +29 -0
- data/lib/tokn/tools.rb +186 -0
- data/lib/tokn.rb +1 -0
- data/test/data/sampletext.txt +11 -0
- data/test/data/sampletokens.txt +32 -0
- data/test/simple.rb +33 -0
- data/test/test.rb +519 -0
- data/test/testcmds +4 -0
- metadata +69 -0
data/lib/tokn/token_defn_parser.rb
ADDED
@@ -0,0 +1,156 @@
require_relative 'tools'
req('tokn_const code_set dfa_builder state reg_parse')

# Parses a token definition script, and generates an NFA that
# is capable of recognizing and distinguishing between the various
# tokens.
#
# Each line in the script is one of
#
#   # ...comment... (the # must appear as the first character in the line)
#
#   <tokenname> ':' <regex>
#
#
# A <tokenname> must be an 'identifier' (alphanumeric, with first character a letter (or '_')).
# If the first character is '_', the token is treated as an 'anonymous' token; these can
# appear in the curly brace portions of previous reg. expr. entries, but do not appear as tokens in the
# generated NFA.
#
class TokenDefParser
  include Tokn

  attr_reader :dfa

  # Compile a token definition script into a DFA
  #
  def initialize(script, createPDF = false)
    @script = script
    parseScript
    if createPDF
      dfa.startState.generatePDF("tokenizer_dfa")
    end
  end

  private

  def parseScript
    db = false

    nextTokenId = 0

    # List of tokens entries, including anonymous ones
    @tokenListBig = []

    # List of tokens names, excluding anonymous ones
    tokenListSmall = []

    # Maps token name to token entry
    @tokenNameMap = {}

    @lines = @script.split("\n")

    @lines.each_with_index do |line, lineNumber|

      line.strip!

      # If line is empty, or starts with '#', it's a comment
      if line.length == 0 || line[0] == '#'
        next
      end

      if !(line =~ TOKENNAME_EXPR)
        raise ParseException, "Syntax error, line #"+lineNumber.to_s+": "+line
      end

      pos = line.index(":")

      tokenName = line[0,pos].strip()

      expr = line[pos+1..-1].strip()

      rex = RegParse.new(expr, @tokenNameMap)

      # Give it the next available token id, if it's not an anonymous token
      tkId = nil
      if tokenName[0] != '_'
        tkId = nextTokenId
        nextTokenId += 1
      end

      tkEntry = [tokenName, rex, @tokenListBig.size, tkId]

      !db || pr("token entry: %s\n",d(tkEntry))

      if @tokenNameMap.has_key?(tokenName)
        raise ParseException, "Duplicate token name: "+line
      end


      @tokenListBig.push(tkEntry)
      @tokenNameMap[tkEntry[0]] = tkEntry

      if tkId
        tokenListSmall.push(tokenName)
      end

      !db || pr(" added token name [%s] to map\n",d(tkEntry[0]))

    end

    combined = combineTokenNFAs()
    !db || combined.generatePDF("combined")

    dfa = DFABuilder.nfa_to_dfa(combined)
    !db || dfa.generatePDF("combined_minimized")

    @dfa = DFA.new(tokenListSmall, dfa)
  end

  # Combine the individual NFAs constructed for the token definitions into
  # one large NFA, each augmented with an edge labelled with the appropriate
  # token identifier to let the tokenizer see which token led to the final state.
  #
  def combineTokenNFAs

    baseId = 0
    startState = nil

    @tokenListBig.each do |tokenName, regParse, index, tokenId|

      # Skip anonymous token definitions
      if !tokenId
        next
      end

      oldToNewMap, baseId = regParse.startState.duplicateNFA( baseId)

      dupStart = oldToNewMap[regParse.startState]

      # Transition from the expression's end state (not a final state)
      # to a new final state, with the transitioning edge
      # labelled with the token id (actually, a transformed token id to distinguish
      # it from character codes)
      dupEnd = oldToNewMap[regParse.endState]

      dupfinalState = State.new(baseId)
      baseId += 1
      dupfinalState.finalState = true

      dupEnd.addEdge(CodeSet.new(tokenIdToEdgeLabel(tokenId)), dupfinalState)

      if !startState
        startState = dupStart
      else
        # Add an e-transition from the start state to this expression's start
        startState.addEdge(CodeSet.new(EPSILON),dupStart)
      end
    end
    startState
  end

  # Regex for token names preceding regular expressions
  #
  TOKENNAME_EXPR = Regexp.new("[_A-Za-z][_A-Za-z0-9]*\s*:\s*")

end
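Aside (not part of the packaged files): a minimal sketch of driving TokenDefParser directly, assuming the tokn sources above have been loaded; the two sample definitions reuse patterns from data/test/data/sampletokens.txt further below.

script = <<'EOS'
# whitespace and identifiers only
WS: [\s\t\n]+
ID: [_a-zA-Z][_a-zA-Z0-9]*
EOS

parser = TokenDefParser.new(script)  # pass true as the second argument to also render the DFA to a PDF
dfa = parser.dfa                     # compiled DFA, ready to hand to Tokenizer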
data/lib/tokn/tokenizer.rb
ADDED
@@ -0,0 +1,211 @@
require_relative 'tools'
req('tokn_const ')

# Extracts tokens from a script, given a previously constructed DFA.
#
class Tokenizer
  include Tokn

  # Construct a tokenizer, given a DFA and some text to process
  #
  def initialize(dfa, text)
    @dfa = dfa
    @text = text
    @lineNumber = 0
    @column = 0
    @cursor = 0
    @tokenHistory = []
    @historyPointer = 0
  end

  # Determine next token (without reading it)
  #
  # Returns Token, or nil if end of input
  #
  def peek
    if !@text
      raise IllegalStateException, "No input text specified"
    end

    db = false
    !db || warn("debug printing is on")
    !db || pr("peek, cursor=%d\n",@cursor)

    if @historyPointer == @tokenHistory.size
      if @cursor < @text.length

        bestLength = 0
        bestId = UNKNOWN_TOKEN

        charOffset = 0
        state = @dfa.startState
        while @cursor + charOffset <= @text.length
          ch = nil
          if @cursor + charOffset < @text.length
            ch = @text[@cursor + charOffset].ord()
            !db || pr(" offset=%d, ch=%d (%s)\n",charOffset,ch,ch.chr)
          end

          nextState = nil

          # Examine edges leaving this state.
          # If one is labelled with a token id, we don't need to match the character with it;
          # store as best token found if length is longer than previous, or equal to previous
          # with higher id.

          # If an edge is labelled with the current character, advance to that state.

          edges = state.edges
          edges.each do |lbl,dest|
            a = lbl.array
            !db || pr(" edge lbl=%s, dest=%s\n",d(lbl),d(dest))
            if a[0] < EPSILON
              newTokenId = edgeLabelToTokenId(a[0])
              !db || pr(" new token id=%d\n",newTokenId)

              if (bestLength < charOffset || newTokenId > bestId)
                bestLength, bestId = charOffset, newTokenId
                !db || pr(" making longest found so far\n")
              end
            end

            if ch && lbl.contains?(ch)
              !db || pr(" setting next state to %s\n",d(dest))
              nextState = dest
              break
            end
          end

          if !nextState
            break
          end
          state = nextState
          charOffset += 1
          !db || pr(" advanced to next state\n")
        end

        peekToken = Token.new(bestId, @text[@cursor, bestLength], 1 + @lineNumber, 1 + @column)
        @tokenHistory.push(peekToken)
      end
    end

    ret = nil
    if @historyPointer < @tokenHistory.size
      ret = @tokenHistory[@historyPointer]
    end

    ret
  end


  # Read next token
  #
  # > tokenName : if not nil, the (string) name of the token expected
  #
  # Raises TokenizerException if no more tokens, if unrecognized token, or
  # if token has different than expected name
  #
  def read(tokenName = nil)
    token = peek()
    if !token
      raise TokenizerException,"No more tokens"
    end

    if token.id == UNKNOWN_TOKEN
      raise TokenizerException, "Unknown token "+token.inspect
    end

    if tokenName && tokenName != nameOf(token)
      raise TokenizerException, "Unexpected token "+token.inspect
    end

    @historyPointer += 1

    # Advance cursor, line number, column

    tl = token.text.length
    @cursor += tl
    tl.times do |i|
      c = token.text[i]
      @column += 1
      if c == "\n"
        @lineNumber += 1
        @column = 0
      end
    end
    token
  end

  # Read next token if it has a particular name
  #
  # > tokenName : name to look for
  # < token read, or nil
  #
  def readIf(tokenName)
    ret = nil
    token = peek()
    if token && nameOf(token) == tokenName
      ret = read()
    end
    ret
  end

  # Determine if another token exists
  #
  def hasNext
    !peek().nil?
  end

  # Get the name of a token
  # (i.e., the name of the token definition, not its text)
  #
  # > token read from this tokenizer
  #
  def nameOf(token)
    @dfa.tokenName(token.id)
  end

  # Unread one (or more) previously read tokens
  #
  def unread(count = 1)
    if @historyPointer < count
      raise TokenizerException, "Cannot unread before start"
    end
    @historyPointer -= count
  end

end




# Tokens read by Tokenizer
#
class Token
  include Tokn

  attr_reader :text, :lineNumber, :column, :id

  def initialize(id, text, lineNumber, column)
    @id = id
    @text = text
    @lineNumber = lineNumber
    @column = column
  end

  def unknown?
    id == UNKNOWN_TOKEN
  end

  def inspect
    s = "(line "+lineNumber.to_s+", col "+column.to_s+")"
    if !unknown?
      s = s.ljust(17) + " : " + text
    end
    s
  end
end


class TokenizerException < Exception
end
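Aside (not part of the packaged files): a sketch of how the Tokenizer API above might be exercised, assuming dfa was built elsewhere, e.g. by TokenDefParser or by DFA.dfa_from_script_file as used in data/test/simple.rb below.

tok = Tokenizer.new(dfa, "x = 42")
while tok.hasNext
  t = tok.read                          # raises TokenizerException on unrecognized input
  puts tok.nameOf(t) + ": " + t.text    # e.g. "ID: x" under the sample definitions
end
tok.unread(2)                           # step back over previously read tokens if needed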
data/lib/tokn/tokn_const.rb
ADDED
@@ -0,0 +1,29 @@
# Module containing tokn-related constants and functions
#
module Tokn

  # Token id if text didn't match any tokens in the DFA
  UNKNOWN_TOKEN = -1

  # Code for epsilon transitions
  EPSILON = -1

  # One plus the maximum code represented
  CODEMAX = 0x110000

  # Minimum code possible (e.g., indicating a token id)
  CODEMIN = -10000

  # Convert a token id (>=0) to an edge label value ( < 0)
  #
  def tokenIdToEdgeLabel(tokenId)
    EPSILON-1-tokenId
  end

  # Convert an edge label value ( < 0) to a token id (>=0)
  #
  def edgeLabelToTokenId(edgeLabel)
    EPSILON-1-edgeLabel
  end

end
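Worked example of the mapping above: with EPSILON = -1, token ids are encoded as edge labels strictly below EPSILON, and the transform is its own inverse; this is why Tokenizer#peek treats any edge code below EPSILON as a token marker rather than a character code.

tokenIdToEdgeLabel(0)   # => -2  (EPSILON - 1 - 0)
tokenIdToEdgeLabel(3)   # => -5
edgeLabelToTokenId(-5)  # => 3   (recovers the original id)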
data/lib/tokn/tools.rb
ADDED
@@ -0,0 +1,186 @@
require 'set'

# Various utility and debug convenience functions.
#

# Perform 'require_relative' on a set of files
#
# fileListStr : space-delimited file/path items, without .rb extensions
# subdir : optional path to files relative to tools.rb
#
def req(fileListStr,subdir = nil)
  fileListStr.split(' ').each do |x|
    if subdir
      x = File.join(subdir,x)
    end
    x += '.rb'
    require_relative(x)
  end
end

# Shorthand for printf(...)
#
def pr(*args)
  printf(*args)
end


# Convert an object to a human-readable string;
# should be considered a debug-only feature
#
def d(arg)
  if arg.nil?
    "<nil>"
  else
    arg.inspect
  end
end

# Assert that a value is true. Should be considered a
# very temporary, debug-only option; it is slow and
# generates a warning that it is being called.
#
def myAssert(cond, *msg)
  oneTimeAlert("warning",0,"Checking assertion")
  if not cond
    if msg.size == 0
      str = "assertion error"
    else
      str = sprintf(*msg)
    end
    raise Exception, str
  end
end


# Set test directory. If nil, sets to home directory + "__test__"
#
def setTestDir(d = nil)
  if !d
    d = File.join(Dir.home,"__test__")
  end
  $testDir = d
end

# Get a path within the test directory;
# create test directory if it doesn't exist.
#
# relPath : if nil, returns the test directory; else
#   returns the test directory joined to this one
#
def withinTestDir(relPath = nil)
  if !$testDir
    raise IllegalStateException, "No test directory has been defined"
  end
  if !File.directory?($testDir)
    Dir::mkdir($testDir)
  end
  if relPath
    File.join($testDir,relPath)
  else
    $testDir
  end
end

# Convert a .dot file (string) to a PDF file "__mygraph__nnn.pdf"
# in the test directory.
#
# It does this by making a system call to the 'dot' utility.
#
def dotToPDF(dotFile, name = "")
  gr = dotFile
  dotPath = withinTestDir(".__mygraph__.dot")
  writeTextFile(dotPath,gr)
  destName = withinTestDir( "__mygraph__"+name+".pdf")
  system("dot -Tpdf "+dotPath+" -o "+destName)
end


# Get a nice, concise description of the file and line
# of some caller within the stack.
#
# nSkip : the number of items deep in the call stack to look
#
def getCallerLocation(nSkip = 2)

  filename = nil
  linenumber = nil

  if nSkip >= 0 && nSkip < caller.size
    fi = caller[nSkip]

    # ' path : line number : other '
    i = fi.index(':')
    j = nil
    if i
      j = fi.index(':',i+1)
    end
    if j
      pth = fi[0,i].split('/')
      if pth.size
        filename = pth[-1]
      end
      linenumber = fi[i+1,j-i-1].to_i
    end
  end
  if filename && linenumber
    loc = filename + " ("+linenumber.to_s+")"
  else
    loc = "(UNKNOWN LOCATION)"
  end
  loc
end

# Set of alert strings that have already been reported
# (to avoid printing anything on subsequent invocations)
#
$AlertStrings = Set.new

# Print a message if it hasn't yet been printed,
# which includes the caller's location
#
# > typeString : e.g., "warning", "unimplemented"
# > nSkip : the number of levels deep that the caller is in the stack
# > args : if present, calls sprintf(...) with these to append to the message
#
def oneTimeAlert(typeString, nSkip, *args)
  loc = getCallerLocation(nSkip + 2)
  s = "*** "+typeString+" " + loc
  if args && args.size
    s2 = sprintf(args[0], *args[1..-1])
    msg = s + ": " + s2
  else
    msg = s
  end

  if $AlertStrings.add?(msg)
    puts msg
  end
end

# Print a 'warning' alert, one time only
#
def warn(*args)
  oneTimeAlert("warning",0, *args)
end

# Print an 'unimplemented' alert, one time only
#
def unimp(*args)
  oneTimeAlert("unimplemented", 0, *args)
end

# Write a string to a text file
#
def writeTextFile(path, contents)
  File.open(path, "wb") {|f| f.write(contents) }
end

# Read a file's contents, return as a string
#
def readTextFile(path)
  contents = nil
  File.open(path,"rb") {|f| contents = f.read }
  contents
end
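Aside (hypothetical usage; the file name is invented for illustration): the helpers above compose like this.

setTestDir()                          # defaults to ~/__test__
path = withinTestDir("scratch.txt")   # creates the test directory on first use
writeTextFile(path, "hello")
pr("%s\n", readTextFile(path))        # prints: hello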
data/lib/tokn.rb
ADDED
@@ -0,0 +1 @@
require 'tokn/tokenizer'
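So, once the gem is installed, loading it is presumably just:

require 'tokn'   # per the one-line file above, this requires 'tokn/tokenizer'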
data/test/data/sampletokens.txt
ADDED
@@ -0,0 +1,32 @@
# Sample token definitions

# Whitespace includes a comment, which starts with '//' and
# extends to the end of the line:
#
WS: ( [\f\r\s\t\n]+ ) | ( // [^\n]* \n? )

# An anonymous token, for convenience; a non-empty sequence of digits
#
_DIG: [0-9]+

# Double has lower priority than int; we want ints to
# be interpreted as ints, not as doubles
DBL: \-?(({_DIG}(.{_DIG})?)|.{_DIG})

INT: \-?{_DIG}

LBL: '([^'\n]|\\')*'

ID: [_a-zA-Z][_a-zA-Z0-9]*

ASSIGN: =

EQUIV: ==

IF: if
DO: do

BROP: \{

BRCL: \}
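Note how the anonymous _DIG token is reused: as described in token_defn_parser.rb above, a {NAME} reference in a later definition is replaced by that token's expression, so DBL presumably expands to

DBL: \-?(([0-9]+(.[0-9]+)?)|.[0-9]+)

and, since Tokenizer#peek appears to break equal-length ties toward the higher (later-declared) token id, a plain digit string such as 42 comes out as INT rather than DBL, matching the comment above.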
data/test/simple.rb
ADDED
@@ -0,0 +1,33 @@
require 'test/unit'
require_relative '../lib/tokn/tools.rb'
req('tokenizer dfa')


class Simple

  def dataPath(f)
    File.dirname(__FILE__)+"/data/"+f
  end

  setTestDir()

  # Various unit tests for state machines, character range sets, etc.

  def initialize
    @sampleText = readTextFile(self.dataPath("sampletext.txt"))
    # @sampleTokens = readTextFile(self.dataPath("sampletokens.txt"))
  end

  def makeTok
    dfa = DFA.dfa_from_script_file(self.dataPath("sampletokens.txt"))
    Tokenizer.new(dfa, @sampleText)
  end

  def go
    makeTok
  end
end


s = Simple.new
s.go