tokn 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.txt +4 -5
- data/bin/tokncompile +1 -1
- data/bin/toknprocess +10 -4
- data/lib/tokn/code_set.rb +332 -337
- data/lib/tokn/dfa.rb +187 -162
- data/lib/tokn/dfa_builder.rb +218 -220
- data/lib/tokn/range_partition.rb +205 -203
- data/lib/tokn/reg_parse.rb +336 -331
- data/lib/tokn/state.rb +267 -270
- data/lib/tokn/token_defn_parser.rb +144 -139
- data/lib/tokn/tokenizer.rb +243 -175
- data/lib/tokn/tokn_const.rb +11 -6
- data/lib/tokn/tools.rb +42 -20
- data/test/Example1.rb +50 -0
- data/test/data/compileddfa.txt +1 -0
- data/test/data/sampletext.txt +6 -1
- data/test/test.rb +17 -12
- metadata +7 -6
- data/test/simple.rb +0 -33
data/lib/tokn/tokn_const.rb
CHANGED
@@ -1,7 +1,13 @@
-#
+# Namespace to encompass the portions of the Tokn gem
+# accessible to end users
 #
 module Tokn
-
+end
+
+# Namespace to encompass the portions of the Tokn gem
+# used only internally
+#
+module ToknInternal
   # Token id if text didn't match any tokens in the DFA
   UNKNOWN_TOKEN = -1
 
@@ -16,14 +22,13 @@ module Tokn
 
   # Convert a token id (>=0) to an edge label value ( < 0)
   #
-  def tokenIdToEdgeLabel(tokenId)
+  def self.tokenIdToEdgeLabel(tokenId)
     EPSILON-1-tokenId
   end
 
   # Convert an edge label value ( < 0) to a token id (>=0)
   #
-  def edgeLabelToTokenId(edgeLabel)
+  def self.edgeLabelToTokenId(edgeLabel)
     EPSILON-1-edgeLabel
   end
-
-end
+end
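Both helpers apply the same formula, so the token-id/edge-label mapping is its own inverse. A minimal sketch of the round trip; EPSILON is defined elsewhere in tokn_const.rb, and its value of -1 below is an assumption, not something shown in this diff:

    EPSILON = -1  # assumed; see tokn_const.rb for the real definition

    def tokenIdToEdgeLabel(tokenId)
      EPSILON - 1 - tokenId
    end

    def edgeLabelToTokenId(edgeLabel)
      EPSILON - 1 - edgeLabel
    end

    p tokenIdToEdgeLabel(10)                      # => -12
    p edgeLabelToTokenId(tokenIdToEdgeLabel(10))  # => 10

Under that assumption, token id 10 (BRCL, the last name in the compiled DFA's token list below) appears as edge label -12, consistent with the [[-12,-11],14] edges in compileddfa.txt.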
data/lib/tokn/tools.rb
CHANGED
@@ -29,11 +29,7 @@ end
 # should be considered a debug-only feature
 #
 def d(arg)
-  if arg.nil?
-    "<nil>"
-  else
-    arg.inspect
-  end
+  arg.nil? ? "<nil>" : arg.inspect
 end
 
 # Assert that a value is true. Should be considered a
@@ -43,11 +39,7 @@ end
 def myAssert(cond, *msg)
   oneTimeAlert("warning",0,"Checking assertion")
   if not cond
-    if msg.size == 0
-      str = "assertion error"
-    else
-      str = sprintf(*msg)
-    end
+    str = (msg.size == 0) ? "assertion error" : sprintf(*msg)
     raise Exception, str
   end
 end
@@ -56,10 +48,7 @@ end
 # Set test directory. If nil, sets to home directory + "__test__"
 #
 def setTestDir(d = nil)
-  if !d
-    d = File.join(Dir.home,"__test__")
-  end
-  $testDir = d
+  $testDir = d || File.join(Dir.home,"__test__")
 end
 
 # Get a path within the test directory;
@@ -75,11 +64,7 @@ def withinTestDir(relPath = nil)
   if !File.directory?($testDir)
     Dir::mkdir($testDir)
   end
-  if relPath
-    File.join($testDir,relPath)
-  else
-    $testDir
-  end
+  relPath ? File.join($testDir,relPath) : $testDir
 end
 
 # Convert a .dot file (string) to a PDF file "__mygraph__nnn.pdf"
@@ -95,6 +80,28 @@ def dotToPDF(dotFile, name = "")
   system("dot -Tpdf "+dotPath+" -o "+destName)
 end
 
+# Extensions to the Enumerable module
+#
+module Enumerable
+  # Calculate a value for each item, and return the item with the
+  # highest value, its index, and the value.
+  # @yieldparam function to calculate value of an object, given that object as a parameter
+  # @return the triple [object, index, value] reflecting the maximum value, or
+  #    nil if there were no items
+  def max_with_index
+
+    best = nil
+
+    each_with_index do |obj,ind|
+      sc = yield(obj)
+      if !best || best[2] < sc
+        best = [obj,ind,sc]
+      end
+    end
+    best
+  end
+end
+
 
 # Get a nice, concise description of the file and line
 # of some caller within the stack.
@@ -109,7 +116,6 @@ def getCallerLocation(nSkip = 2)
   if nSkip >= 0 && nSkip < caller.size
     fi = caller[nSkip]
 
-    # ' path : line number : other '
     i = fi.index(':')
     j = nil
     if i
@@ -184,3 +190,19 @@ def readTextFile(path)
   contents
 end
 
+# Method that takes a code block as an argument to
+# achieve the same functionality as Java/C++'s
+#    do {
+#      ...
+#      ... possibly with 'break' to jump to the end ...
+#    } while (false);
+#
+def block
+  yield
+end
+
+# Exception class for objects in illegal states
+#
+class IllegalStateException < Exception
+end
+
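The new max_with_index extension and the do-while(false)-style block helper are general purpose; a short usage sketch (the arrays and values below are invented for illustration):

    # Returns the [object, index, value] triple with the highest value
    ["ab", "abcd", "a"].max_with_index { |s| s.length }   # => ["abcd", 1, 4]
    [].max_with_index { |s| s.length }                    # => nil

    # 'break' terminates the call to 'block', emulating a C-style
    # do { ... } while (false) with an early exit
    block do
      input = nil            # hypothetical value
      break if input.nil?    # jumps past the end of the block
      puts "only reached when input is present"
    end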
data/test/Example1.rb
ADDED
@@ -0,0 +1,50 @@
+require_relative '../lib/tokn/tokenizer'
+
+class Example1
+
+  include Tokn
+
+  def dataPath(f)
+    File.dirname(__FILE__)+"/data/"+f
+  end
+
+  setTestDir()
+
+  def initialize
+    @sampleText = readTextFile(dataPath("sampletext.txt"))
+  end
+
+  def makeTok
+    @dfa = DFA.from_script_file(dataPath("sampletokens.txt"))
+    Tokenizer.new(@dfa, @sampleText, "WS")
+  end
+
+  def go
+    puts "Tokenizing the 'sampletext.txt' file, filtering out whitespace (WS) tokens...\n\n"
+
+    t = makeTok
+
+    while t.hasNext do
+
+      tk = t.peek
+
+      if t.nameOf(tk) == 'BROP'
+        lst = t.readSequenceIf('BROP DO ID BRCL')
+        if lst
+          pr("  ...read BROP DO ID sequence...\n")
+          lst.each{ |x| pr("  %s\n",d(x))}
+          next
+        else
+          pr("  ...couldn't find sequence...\n")
+        end
+      end
+
+      tk = t.read
+      pr("%s\n",d(tk))
+
+    end
+  end
+
+end
+
+Example1.new.go
data/test/data/compileddfa.txt
ADDED
@@ -0,0 +1 @@
+{"version":1.0,"tokens":["WS","DBL","INT","LBL","ID","ASSIGN","EQUIV","IF","DO","BROP","BRCL"],"states":[[false,[[[125,126],1],[[123,124],2],[[100,101],3],[[105,106],4],[[61,62],5],[[65,91,95,96,97,100,101,105,106,123],6],[[39,40],7],[[48,58],8],[[45,46],9],[[46,47],10],[[9,11,12,13,32,33,92,93],11],[[47,48],12]]],[false,[[[-12,-11],14]]],[false,[[[-11,-10],14]]],[false,[[[48,58,65,91,95,96,97,111,112,123],6],[[-6,-5],14],[[111,112],22]]],[false,[[[48,58,65,91,95,96,97,102,103,123],6],[[-6,-5],14],[[102,103],21]]],[false,[[[-7,-6],14],[[61,62],20]]],[false,[[[48,58,65,91,95,96,97,123],6],[[-6,-5],14]]],[false,[[[0,10,11,39,40,92,93,1114112],7],[[39,40],17],[[92,93],18]]],[false,[[[48,58],8],[[46,47],10],[[-4,-3],14]]],[false,[[[48,58],8],[[46,47],10]]],[false,[[[48,58],16]]],[false,[[[9,11,12,13,32,33,92,93],11],[[-2,-1],14]]],[false,[[[47,48],13]]],[false,[[[0,10,11,1114112],13],[[-2,-1],14],[[10,11],15]]],[true,[]],[false,[[[-2,-1],14]]],[false,[[[-3,-2],14],[[48,58],16]]],[false,[[[-5,-4],14]]],[false,[[[0,10,11,39,40,92,93,1114112],7],[[92,93],18],[[39,40],19]]],[false,[[[0,10,11,39,40,92,93,1114112],7],[[-5,-4],14],[[39,40],17],[[92,93],18]]],[false,[[[-8,-7],14]]],[false,[[[48,58,65,91,95,96,97,123],6],[[-9,-8],14]]],[false,[[[48,58,65,91,95,96,97,123],6],[[-10,-9],14]]]]}
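The persisted DFA is a single JSON object. Reading the line above, "tokens" holds the 11 token names in definition order, and "states" holds 23 entries, each apparently a [final_state?, edge_list] pair whose edges are [flattened codepoint range pairs, target state], with negative values encoding token-id edge labels (see tokn_const.rb above). A stdlib-only inspection sketch; the field interpretation is inferred from this diff rather than from tokn's documentation:

    require 'json'

    dfa = JSON.parse(File.read('test/data/compileddfa.txt'))
    dfa['version']          # => 1.0
    dfa['tokens'].first(3)  # => ["WS", "DBL", "INT"]
    dfa['states'].size      # => 23

    final, edges = dfa['states'][14]
    final                   # => true: the lone accepting state
    edges                   # => []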
data/test/data/sampletext.txt
CHANGED
@@ -1,11 +1,16 @@
 // Example source file that can be tokenized
 
 speed = 42 // speed of object
-
 gravity = -9.80
 
+{ color = green }
+
 title = 'This is a string with \' an escaped delimiter'
 
 if gravity == 12 {
 do something
 }
+
+do something_else
+
+// End of 'sampletext.txt'
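These additions appear designed to exercise both branches of Example1.rb: tokenized, the line { color = green } opens with BROP but continues ID ASSIGN ID BRCL, so readSequenceIf('BROP DO ID BRCL') should fail there, while the "if gravity == 12 { do something }" block does contain the BROP DO ID BRCL sequence and should succeed (token names inferred from the sampletokens.txt script referenced throughout this diff).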
data/test/test.rb
CHANGED
@@ -3,16 +3,22 @@ require_relative '../lib/tokn/tools.rb'
 req('range_partition dfa dfa_builder tokenizer')
 
 
+
+# Get access to Tokn namespace
+
 def dataPath(f)
   File.dirname(__FILE__)+"/data/"+f
 end
 
+
 setTestDir()
 
 # Various unit tests for state machines, character range sets, etc.
 #
 class TestComponent < Test::Unit::TestCase
 
+  include Tokn, ToknInternal
+
   SKIPMOST = false # skip most of the tests?
 
   def add(lower, upper = nil)
@@ -163,7 +169,7 @@ class TestComponent < Test::Unit::TestCase
   end
 
   def prep
-    @cs =
+    @cs = CodeSet.new
   end
 
   def test_illegalRange
@@ -277,12 +283,12 @@ class TestComponent < Test::Unit::TestCase
 
 
   def newpar
-    @par =
+    @par = RangePartition.new
   end
 
   def addset(lower, upper = nil)
     upper ||= lower + 1
-    r =
+    r = CodeSet.new(lower,upper)
     @par.addSet(r)
   end
 
@@ -330,17 +336,17 @@ class TestComponent < Test::Unit::TestCase
   REGEX_SCRIPT = "(\\-?[0-9]+)|[_a-zA-Z][_a-zA-Z0-9]*|333q"
 
   TOKEN_SCRIPT2 = <<'END'
-
-
-
-
+sep: \s
+tku: a(a|b)*
+tkv: b(aa|b*)
+tkw: bbb
 END
 
 
  def test_buildDFA
    return if SKIPMOST
 
-   x =
+   x = RegParse.new(REGEX_SCRIPT)
    s = x.startState
    x.endState.finalState = true
 
@@ -389,7 +395,7 @@ END
   @@sampleTokens = readTextFile(dataPath("sampletokens.txt"))
 
   def makeTok
-    dfa = DFA.
+    dfa = DFA.from_script(@@sampleTokens)
     Tokenizer.new(dfa, @@sampleText)
   end
 
@@ -443,7 +449,7 @@ END
     end
     assert(!File.exist?(destPath))
 
-    dfa = DFA.
+    dfa = DFA.from_script(tokScript, destPath)
     assert(File.exist?(destPath))
 
     tok = Tokenizer.new(dfa, testText)
@@ -453,7 +459,7 @@ END
 
   def prep2
     testText = @@sampleText
-    dfa = DFA.
+    dfa = DFA.from_file(withinTestDir("sampletokens_dfa.txt"))
     tok = Tokenizer.new(dfa, testText)
   end
 
@@ -516,4 +522,3 @@ END
   end
 end
 
-
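The one-line changes above update the DFA factory calls (the old sides are truncated in this rendering); together with Example1.rb they spell out the 0.0.6 construction API. A sketch of the entry points as exercised in this diff (token_script, dest_path, and compiled_path are placeholder names):

    dfa = DFA.from_script(token_script)             # compile from a script string
    dfa = DFA.from_script(token_script, dest_path)  # ...and persist it as a side effect
    dfa = DFA.from_script_file("tokens.txt")        # compile from a script file
    dfa = DFA.from_file(compiled_path)              # reload a persisted DFA

    tok = Tokenizer.new(dfa, text)        # tokenize some text
    tok = Tokenizer.new(dfa, text, "WS")  # per Example1.rb: skip the named token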
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: tokn
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - Jeff Sember
@@ -10,9 +10,9 @@ bindir: bin
 cert_chain: []
 date: 2013-03-07 00:00:00.000000000 Z
 dependencies: []
-description:
-  tokn compiles
-
+description: Given a script containing token descriptions (each a regular expression),
+  tokn compiles an automaton which it can then use to efficiently convert a text file
+  to a sequence of those tokens.
 email: jpsember@gmail.com
 executables:
 - tokncompile
@@ -34,9 +34,10 @@ files:
 - bin/tokncompile
 - bin/toknprocess
 - README.txt
+- test/Example1.rb
+- test/data/compileddfa.txt
 - test/data/sampletext.txt
 - test/data/sampletokens.txt
-- test/simple.rb
 - test/test.rb
 - test/testcmds
 - figures/sample_dfa.pdf
@@ -64,6 +65,6 @@ signing_key:
 specification_version: 4
 summary: Extracts tokens from source files
 test_files:
-- test/
+- test/Example1.rb
 - test/test.rb
 has_rdoc:
data/test/simple.rb
DELETED
@@ -1,33 +0,0 @@
-require 'test/unit'
-require_relative '../lib/tokn/tools.rb'
-req('tokenizer dfa')
-
-
-class Simple
-
-  def dataPath(f)
-    File.dirname(__FILE__)+"/data/"+f
-  end
-
-  setTestDir()
-
-  # Various unit tests for state machines, character range sets, etc.
-
-  def initialize
-    @sampleText = readTextFile(self.dataPath("sampletext.txt"))
-    # @sampleTokens = readTextFile(self.dataPath("sampletokens.txt"))
-  end
-
-  def makeTok
-    dfa = DFA.dfa_from_script_file(self.dataPath("sampletokens.txt"))
-    Tokenizer.new(dfa, @sampleText)
-  end
-
-  def go
-    makeTok
-  end
-end
-
-
-s = Simple.new
-s.go