tokn 0.0.5 → 0.0.6
- checksums.yaml +4 -4
- data/README.txt +4 -5
- data/bin/tokncompile +1 -1
- data/bin/toknprocess +10 -4
- data/lib/tokn/code_set.rb +332 -337
- data/lib/tokn/dfa.rb +187 -162
- data/lib/tokn/dfa_builder.rb +218 -220
- data/lib/tokn/range_partition.rb +205 -203
- data/lib/tokn/reg_parse.rb +336 -331
- data/lib/tokn/state.rb +267 -270
- data/lib/tokn/token_defn_parser.rb +144 -139
- data/lib/tokn/tokenizer.rb +243 -175
- data/lib/tokn/tokn_const.rb +11 -6
- data/lib/tokn/tools.rb +42 -20
- data/test/Example1.rb +50 -0
- data/test/data/compileddfa.txt +1 -0
- data/test/data/sampletext.txt +6 -1
- data/test/test.rb +17 -12
- metadata +7 -6
- data/test/simple.rb +0 -33
data/lib/tokn/tokn_const.rb
CHANGED
@@ -1,7 +1,13 @@
-#
+# Namespace to encompass the portions of the Tokn gem
+# accessible to end users
 #
 module Tokn
-
+end
+
+# Namespace to encompass the portions of the Tokn gem
+# used only internally
+#
+module ToknInternal
   # Token id if text didn't match any tokens in the DFA
   UNKNOWN_TOKEN = -1
 
@@ -16,14 +22,13 @@ module Tokn
 
   # Convert a token id (>=0) to an edge label value ( < 0)
   #
-  def tokenIdToEdgeLabel(tokenId)
+  def self.tokenIdToEdgeLabel(tokenId)
     EPSILON-1-tokenId
   end
 
   # Convert an edge label value ( < 0) to a token id (>=0)
   #
-  def edgeLabelToTokenId(edgeLabel)
+  def self.edgeLabelToTokenId(edgeLabel)
     EPSILON-1-edgeLabel
   end
-
-end
+end
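With the added `self.` prefix, the two conversion helpers become module functions, callable as `ToknInternal.tokenIdToEdgeLabel(id)` without mixing the module in. A minimal sketch of the round trip they implement; the `EPSILON = -1` value here is an illustrative assumption, not necessarily tokn's actual constant (only the round-trip property matters):

# Illustrative sketch only: EPSILON's real value comes from tokn's
# source; any fixed negative sentinel behaves the same way.
EPSILON = -1

def tokenIdToEdgeLabel(tokenId)
  EPSILON - 1 - tokenId     # 0, 1, 2, ... become -2, -3, -4, ...
end

def edgeLabelToTokenId(edgeLabel)
  EPSILON - 1 - edgeLabel   # the same formula inverts the mapping
end

# Round trip: encoding then decoding returns the original id.
(0..4).each do |id|
  raise unless edgeLabelToTokenId(tokenIdToEdgeLabel(id)) == id
end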
data/lib/tokn/tools.rb
CHANGED
@@ -29,11 +29,7 @@ end
 # should be considered a debug-only feature
 #
 def d(arg)
-  if arg.nil?
-    "<nil>"
-  else
-    arg.inspect
-  end
+  arg.nil? ? "<nil>" : arg.inspect
 end
 
 # Assert that a value is true. Should be considered a
@@ -43,11 +39,7 @@ end
 def myAssert(cond, *msg)
   oneTimeAlert("warning",0,"Checking assertion")
   if not cond
-    if msg.size == 0
-      str = "assertion error"
-    else
-      str = sprintf(*msg)
-    end
+    str = (msg.size == 0) ? "assertion error" : sprintf(*msg)
     raise Exception, str
   end
 end
@@ -56,10 +48,7 @@ end
 # Set test directory. If nil, sets to home directory + "__test__"
 #
 def setTestDir(d = nil)
-  if d.nil?
-    d = File.join(Dir.home,"__test__")
-  end
-  $testDir = d
+  $testDir = d || File.join(Dir.home,"__test__")
 end
 
 # Get a path within the test directory;
@@ -75,11 +64,7 @@ def withinTestDir(relPath = nil)
   if !File.directory?($testDir)
     Dir::mkdir($testDir)
   end
-  if relPath
-    File.join($testDir,relPath)
-  else
-    $testDir
-  end
+  relPath ? File.join($testDir,relPath) : $testDir
 end
 
 # Convert a .dot file (string) to a PDF file "__mygraph__nnn.pdf"
@@ -95,6 +80,28 @@ def dotToPDF(dotFile, name = "")
   system("dot -Tpdf "+dotPath+" -o "+destName)
 end
 
+# Extensions to the Enumerable module
+#
+module Enumerable
+  # Calculate a value for each item, and return the item with the
+  # highest value, its index, and the value.
+  # @yieldparam function to calculate value of an object, given that object as a parameter
+  # @return the triple [object, index, value] reflecting the maximum value, or
+  #   nil if there were no items
+  def max_with_index
+
+    best = nil
+
+    each_with_index do |obj,ind|
+      sc = yield(obj)
+      if !best || best[2] < sc
+        best = [obj,ind,sc]
+      end
+    end
+    best
+  end
+end
+
 
 # Get a nice, concise description of the file and line
 # of some caller within the stack.
@@ -109,7 +116,6 @@ def getCallerLocation(nSkip = 2)
   if nSkip >= 0 && nSkip < caller.size
     fi = caller[nSkip]
 
-    # ' path : line number : other '
     i = fi.index(':')
     j = nil
     if i
@@ -184,3 +190,19 @@ def readTextFile(path)
   contents
 end
 
+# Method that takes a code block as an argument to
+# achieve the same functionality as Java/C++'s
+#   do {
+#     ...
+#     ... possibly with 'break' to jump to the end ...
+#   } while (false);
+#
+def block
+  yield
+end
+
+# Exception class for objects in illegal states
+#
+class IllegalStateException < Exception
+end
+
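Two of the additions benefit from a quick usage sketch. `max_with_index` works on any Enumerable and returns nil for an empty collection; `block` exists so that `break` can bail out early, mimicking `do { ... } while (false)`. The values below are made up for illustration:

words = ["fa", "sol", "la", "ti"]

# Score each word by its length; the strict '<' keeps the first maximum.
obj, ind, val = words.max_with_index { |w| w.length }
# => obj == "sol", ind == 1, val == 3

[].max_with_index { |w| w.length }   # => nil

# 'break' inside the block jumps past the end of the 'block' call.
block do
  break if words.empty?
  puts "first word: #{words.first}"
end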
data/test/Example1.rb
ADDED
@@ -0,0 +1,50 @@
+require_relative '../lib/tokn/tokenizer'
+
+class Example1
+
+  include Tokn
+
+  def dataPath(f)
+    File.dirname(__FILE__)+"/data/"+f
+  end
+
+  setTestDir()
+
+  def initialize
+    @sampleText = readTextFile(dataPath("sampletext.txt"))
+  end
+
+  def makeTok
+    @dfa = DFA.from_script_file(dataPath("sampletokens.txt"))
+    Tokenizer.new(@dfa, @sampleText, "WS")
+  end
+
+  def go
+    puts "Tokenizing the 'sampletext.txt' file, filtering out whitespace (WS) tokens...\n\n"
+
+    t = makeTok
+
+    while t.hasNext do
+
+      tk = t.peek
+
+      if t.nameOf(tk) == 'BROP'
+        lst = t.readSequenceIf('BROP DO ID BRCL')
+        if lst
+          pr(" ...read BROP DO ID sequence...\n")
+          lst.each{ |x| pr("   %s\n",d(x))}
+          next
+        else
+          pr(" ...couldn't find sequence...\n")
+        end
+      end
+
+      tk = t.read
+      pr("%s\n",d(tk))
+
+    end
+  end
+
+end
+
+Example1.new.go
data/test/data/compileddfa.txt
ADDED
@@ -0,0 +1 @@
+{"version":1.0,"tokens":["WS","DBL","INT","LBL","ID","ASSIGN","EQUIV","IF","DO","BROP","BRCL"],"states":[[false,[[[125,126],1],[[123,124],2],[[100,101],3],[[105,106],4],[[61,62],5],[[65,91,95,96,97,100,101,105,106,123],6],[[39,40],7],[[48,58],8],[[45,46],9],[[46,47],10],[[9,11,12,13,32,33,92,93],11],[[47,48],12]]],[false,[[[-12,-11],14]]],[false,[[[-11,-10],14]]],[false,[[[48,58,65,91,95,96,97,111,112,123],6],[[-6,-5],14],[[111,112],22]]],[false,[[[48,58,65,91,95,96,97,102,103,123],6],[[-6,-5],14],[[102,103],21]]],[false,[[[-7,-6],14],[[61,62],20]]],[false,[[[48,58,65,91,95,96,97,123],6],[[-6,-5],14]]],[false,[[[0,10,11,39,40,92,93,1114112],7],[[39,40],17],[[92,93],18]]],[false,[[[48,58],8],[[46,47],10],[[-4,-3],14]]],[false,[[[48,58],8],[[46,47],10]]],[false,[[[48,58],16]]],[false,[[[9,11,12,13,32,33,92,93],11],[[-2,-1],14]]],[false,[[[47,48],13]]],[false,[[[0,10,11,1114112],13],[[-2,-1],14],[[10,11],15]]],[true,[]],[false,[[[-2,-1],14]]],[false,[[[-3,-2],14],[[48,58],16]]],[false,[[[-5,-4],14]]],[false,[[[0,10,11,39,40,92,93,1114112],7],[[92,93],18],[[39,40],19]]],[false,[[[0,10,11,39,40,92,93,1114112],7],[[-5,-4],14],[[39,40],17],[[92,93],18]]],[false,[[[-8,-7],14]]],[false,[[[48,58,65,91,95,96,97,123],6],[[-9,-8],14]]],[false,[[[48,58,65,91,95,96,97,123],6],[[-10,-9],14]]]]}
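The compiled DFA is a single line of JSON. Read alongside the tokn_const.rb change above, the layout appears to be: `tokens` lists token names in id order, and each entry of `states` is a `[final?, edges]` pair whose edges are `[codeset, target-state]` pairs, the codeset alternating inclusive-start/exclusive-end codepoints, with negative values serving as token-id edge labels. A minimal sketch that walks the file under that assumed layout (path relative to the gem's test directory):

require 'json'

# Sketch only: assumes state = [final?, edges], edge = [codeset, target],
# codeset = alternating [lo, hi) codepoint bounds.
dfa = JSON.parse(File.read("test/data/compileddfa.txt"))
puts "tokens, in id order: #{dfa['tokens'].join(' ')}"

dfa["states"].each_with_index do |(final, edges), i|
  edges.each do |codeset, target|
    # Pair up the alternating lower/upper codepoint bounds.
    ranges = codeset.each_slice(2).map { |lo, hi| "[#{lo},#{hi})" }
    puts "state #{i}#{final ? ' (final)' : ''} --#{ranges.join(',')}--> #{target}"
  end
end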
data/test/data/sampletext.txt
CHANGED
@@ -1,11 +1,16 @@
 // Example source file that can be tokenized
 
 speed = 42 // speed of object
-
 gravity = -9.80
 
+{ color = green }
+
 title = 'This is a string with \' an escaped delimiter'
 
 if gravity == 12 {
 do something
 }
+
+do something_else
+
+// End of 'sampletext.txt'
data/test/test.rb
CHANGED
@@ -3,16 +3,22 @@ require_relative '../lib/tokn/tools.rb'
 req('range_partition dfa dfa_builder tokenizer')
 
 
+
+# Get access to Tokn namespace
+
 def dataPath(f)
   File.dirname(__FILE__)+"/data/"+f
 end
 
+
 setTestDir()
 
 # Various unit tests for state machines, character range sets, etc.
 #
 class TestComponent < Test::Unit::TestCase
 
+  include Tokn, ToknInternal
+
   SKIPMOST = false # skip most of the tests?
 
   def add(lower, upper = nil)
@@ -163,7 +169,7 @@ class TestComponent < Test::Unit::TestCase
   end
 
   def prep
-    @cs =
+    @cs = CodeSet.new
   end
 
   def test_illegalRange
@@ -277,12 +283,12 @@ class TestComponent < Test::Unit::TestCase
 
 
   def newpar
-    @par =
+    @par = RangePartition.new
   end
 
   def addset(lower, upper = nil)
     upper ||= lower + 1
-    r =
+    r = CodeSet.new(lower,upper)
     @par.addSet(r)
   end
 
@@ -330,17 +336,17 @@ class TestComponent < Test::Unit::TestCase
   REGEX_SCRIPT = "(\\-?[0-9]+)|[_a-zA-Z][_a-zA-Z0-9]*|333q"
 
   TOKEN_SCRIPT2 = <<'END'
-
-
-
-
+sep: \s
+tku: a(a|b)*
+tkv: b(aa|b*)
+tkw: bbb
 END
 
 
   def test_buildDFA
     return if SKIPMOST
 
-    x =
+    x = RegParse.new(REGEX_SCRIPT)
     s = x.startState
     x.endState.finalState = true
 
@@ -389,7 +395,7 @@ END
   @@sampleTokens = readTextFile(dataPath("sampletokens.txt"))
 
   def makeTok
-    dfa = DFA.
+    dfa = DFA.from_script(@@sampleTokens)
     Tokenizer.new(dfa, @@sampleText)
   end
 
@@ -443,7 +449,7 @@ END
     end
     assert(!File.exist?(destPath))
 
-    dfa = DFA.
+    dfa = DFA.from_script(tokScript, destPath)
     assert(File.exist?(destPath))
 
     tok = Tokenizer.new(dfa, testText)
@@ -453,7 +459,7 @@ END
 
   def prep2
     testText = @@sampleText
-    dfa = DFA.
+    dfa = DFA.from_file(withinTestDir("sampletokens_dfa.txt"))
     tok = Tokenizer.new(dfa, testText)
   end
 
@@ -516,4 +522,3 @@ END
   end
 end
 
-
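The filled-in TOKEN_SCRIPT2 also documents the token-script format the renamed constructors consume: one `name: regex` definition per line. A small end-to-end sketch combining the entry points the tests exercise (`DFA.from_script`, `Tokenizer.new`, `hasNext`/`read`/`nameOf`); the require path and the `num`/`id` token names are assumptions for illustration, and the tests themselves load via require_relative:

require 'tokn/tokenizer'   # assumed load path

include Tokn

# Compile the same kind of token script the test uses, then tokenize.
dfa = DFA.from_script(<<'END')
sep: \s
num: [0-9]+
id: [a-z]+
END

tok = Tokenizer.new(dfa, "abc 42")
while tok.hasNext
  t = tok.read
  puts tok.nameOf(t)
end
# Expected token names, in order: id sep num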
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: tokn
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - Jeff Sember
@@ -10,9 +10,9 @@ bindir: bin
 cert_chain: []
 date: 2013-03-07 00:00:00.000000000 Z
 dependencies: []
-description:
-  tokn compiles
-
+description: Given a script containing token descriptions (each a regular expression),
+  tokn compiles an automaton which it can then use to efficiently convert a text file
+  to a sequence of those tokens.
 email: jpsember@gmail.com
 executables:
 - tokncompile
@@ -34,9 +34,10 @@ files:
 - bin/tokncompile
 - bin/toknprocess
 - README.txt
+- test/Example1.rb
+- test/data/compileddfa.txt
 - test/data/sampletext.txt
 - test/data/sampletokens.txt
-- test/simple.rb
 - test/test.rb
 - test/testcmds
 - figures/sample_dfa.pdf
@@ -64,6 +65,6 @@ signing_key:
 specification_version: 4
 summary: Extracts tokens from source files
 test_files:
-- test/simple.rb
+- test/Example1.rb
 - test/test.rb
 has_rdoc:
data/test/simple.rb
DELETED
@@ -1,33 +0,0 @@
-require 'test/unit'
-require_relative '../lib/tokn/tools.rb'
-req('tokenizer dfa')
-
-
-class Simple
-
-  def dataPath(f)
-    File.dirname(__FILE__)+"/data/"+f
-  end
-
-  setTestDir()
-
-  # Various unit tests for state machines, character range sets, etc.
-
-  def initialize
-    @sampleText = readTextFile(self.dataPath("sampletext.txt"))
-    # @sampleTokens = readTextFile(self.dataPath("sampletokens.txt"))
-  end
-
-  def makeTok
-    dfa = DFA.dfa_from_script_file(self.dataPath("sampletokens.txt"))
-    Tokenizer.new(dfa, @sampleText)
-  end
-
-  def go
-    makeTok
-  end
-end
-
-
-s = Simple.new
-s.go