tokn 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.txt +194 -0
- data/bin/tokncompile +16 -0
- data/bin/toknprocess +26 -0
- data/figures/sample_dfa.pdf +0 -0
- data/lib/tokn/code_set.rb +392 -0
- data/lib/tokn/dfa.rb +196 -0
- data/lib/tokn/dfa_builder.rb +261 -0
- data/lib/tokn/range_partition.rb +233 -0
- data/lib/tokn/reg_parse.rb +379 -0
- data/lib/tokn/state.rb +320 -0
- data/lib/tokn/token_defn_parser.rb +156 -0
- data/lib/tokn/tokenizer.rb +211 -0
- data/lib/tokn/tokn_const.rb +29 -0
- data/lib/tokn/tools.rb +186 -0
- data/lib/tokn.rb +1 -0
- data/test/data/sampletext.txt +11 -0
- data/test/data/sampletokens.txt +32 -0
- data/test/simple.rb +33 -0
- data/test/test.rb +519 -0
- data/test/testcmds +4 -0
- metadata +69 -0
data/test/test.rb
ADDED
@@ -0,0 +1,519 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require_relative '../lib/tokn/tools.rb'
|
3
|
+
req('range_partition dfa dfa_builder tokenizer')
|
4
|
+
|
5
|
+
|
6
|
+
# Builds the path of a fixture file inside the "data" directory that sits
# next to this test script.
#
# @param f [String] file name (or relative path) inside the data directory
# @return [String] path to the fixture file
def dataPath(f)
  # File.join inserts the separators, so no slashes are spliced in by hand.
  File.join(File.dirname(__FILE__), 'data', f)
end
|
9
|
+
|
10
|
+
# NOTE(review): setTestDir is defined outside this file (the only file
# required by path here is tokn/tools.rb); presumably it configures the
# scratch directory that withinTestDir resolves against -- confirm there.
setTestDir
|
11
|
+
|
12
|
+
# Various unit tests for state machines, character range sets, etc.
|
13
|
+
#
|
14
|
+
class TestComponent < Test::Unit::TestCase
|
15
|
+
|
16
|
+
# When true, every test that begins with `return if SKIPMOST` exits
# immediately (the partition, DFA-building, token-definition and first two
# tokenizer tests).
SKIPMOST = false
|
17
|
+
|
18
|
+
# Forwards to @cs.add for the code set under test.
#
# The expectations in test_add show the resulting boundaries: add(72, 81)
# yields "72 81" and add(50) yields "50 51".
#
# @param lower [Integer] first code of the range
# @param upper [Integer, nil] passed through unchanged (nil when omitted)
def add(lower, upper = nil)
  @cs.add lower, upper
end
|
21
|
+
|
22
|
+
# Forwards to @cs.remove for the code set under test.
#
# @param lower [Integer] first code of the range to remove
# @param upper [Integer, nil] passed through unchanged (nil when omitted)
def remove(lower, upper = nil)
  @cs.remove lower, upper
end
|
25
|
+
|
26
|
+
# Parks the current set in @ct (the operand used by isect/diff) and then
# calls prep so that @cs refers to a brand-new set.
def swap
  @ct = @cs
  prep
end
|
30
|
+
|
31
|
+
# Calls @cs.intersect! with the parked set @ct as its argument.
def isect
  @cs.intersect! @ct
end
|
34
|
+
|
35
|
+
# Calls @cs.difference! with the parked set @ct as its argument.
def diff
  @cs.difference! @ct
end
|
38
|
+
|
39
|
+
# Asserts that an integer array matches a whitespace-separated expectation.
#
# @param s [String] expected integers separated by whitespace, e.g. "50 51";
#   an empty string means an empty array
# @param arr [Array<Integer>, nil] actual values; defaults to @cs.array
# @raise [ArgumentError] if a token of +s+ is not a base-10 integer
def equ(s, arr = nil)
  arr ||= @cs.array
  # Integer(tok, 10) is strict: a typo such as "5O" raises instead of being
  # silently read as 5 (String#to_i would accept it), and base 10 keeps a
  # leading zero from being read as octal.
  expected = s.split.map { |tok| Integer(tok, 10) }
  assert_equal(expected, arr)
end
|
44
|
+
|
45
|
+
# Adds ranges to one set step by step and checks the boundary array after
# every step: disjoint ranges, ranges already covered, ranges that extend or
# touch existing ones, and ranges that swallow several existing ones.
def test_add
  prep

  # Each entry: arguments for add, then the expected boundary list.
  steps = [
    [[72, 81], '72 81'],
    [[50],     '50 51 72 81'],
    [[75, 77], '50 51 72 81'],                   # already covered
    [[72, 78], '50 51 72 81'],                   # already covered
    [[70, 78], '50 51 70 81'],                   # extends downwards
    [[60],     '50 51 60 61 70 81'],
    [[40],     '40 41 50 51 60 61 70 81'],
    [[41],     '40 42 50 51 60 61 70 81'],       # touches 40..41
    [[81],     '40 42 50 51 60 61 70 82'],       # touches 70..81
    [[83],     '40 42 50 51 60 61 70 82 83 84'],
    [[49, 84], '40 42 49 84'],                   # swallows four ranges
    [[39, 86], '39 86']                          # swallows everything
  ]

  steps.each do |args, expected|
    add(*args)
    equ(expected)
  end
end
|
84
|
+
|
85
|
+
# Checks intersect! for contained, partially overlapping, empty, identical
# and merely adjacent operands. swap parks the current set in @ct; isect
# then intersects the new @cs with it.
def test_intersect
  # A range fully inside the other operand survives unchanged.
  prep
  add(39, 86)
  swap
  add(50, 70)
  isect
  equ('50 70')

  # The previous result becomes the operand; only the overlaps remain.
  swap
  add(20, 25)
  add(35, 51)
  add(62, 68)
  add(72, 80)
  isect
  equ('50 51 62 68')

  # The operand is a set that never had anything added to it.
  prep
  swap
  add(50, 70)
  isect
  equ('')

  # Identical operands.
  add(50, 70)
  swap
  add(50, 70)
  isect
  equ('50 70')

  # 20..25 and 25..30 share no codes.
  prep
  add(20, 25)
  swap
  add(25, 30)
  isect
  equ('')
end
|
122
|
+
|
123
|
+
# Checks difference! against a fixed operand: @ct holds the ranges 20..30
# and 40..50 for the whole test, and several different sets have it removed.
def test_difference
  prep
  add(20, 30)
  add(40, 50)
  swap

  # swap already left a fresh @cs, so the first case needs no prep.
  add(20, 80)
  diff
  equ('30 40 50 80')

  # Remaining cases: ranges to add to a fresh set, then the expected result.
  cases = [
    [[[19, 32]],           '19 20 30 32'],
    [[[30, 40]],           '30 40'],
    [[[20, 30], [40, 50]], ''],
    [[[19, 30], [40, 50]], '19 20'],
    [[[20, 30], [40, 51]], '50 51']
  ]

  cases.each do |ranges, expected|
    prep
    ranges.each { |lower, upper| add(lower, upper) }
    diff
    equ(expected)
  end
end
|
164
|
+
|
165
|
+
# Replaces @cs with a new CodeSet and returns it.
def prep
  @cs = CodeSet.new
end
|
168
|
+
|
169
|
+
# add must raise RangeError when the upper bound is below the lower bound
# (60, 50) or equal to it (60, 60).
def test_illegalRange
  prep

  [[60, 50], [60, 60]].each do |lower, upper|
    assert_raise(RangeError) { add(lower, upper) }
  end
end
|
175
|
+
|
176
|
+
# Forwards to @cs.negate with explicit bounds.
#
# @param lower [Integer] first bound handed to negate
# @param upper [Integer] second bound handed to negate
def neg(lower, upper)
  @cs.negate(lower, upper)
end
|
179
|
+
|
180
|
+
# Checks negate over explicit windows and with no arguments, comparing the
# boundary array after each call.
def test_negate
  # Successive negations applied to the same four-range set.
  prep
  add(10, 15)
  add(20, 25)
  add(30)
  add(40, 45)
  equ('10 15 20 25 30 31 40 45')
  [
    [[22, 37], '10 15 20 22 25 30 31 37 40 45'],
    [[25, 27], '10 15 20 22 27 30 31 37 40 45'],
    [[15, 20], '10 22 27 30 31 37 40 45']
  ].each do |window, expected|
    neg(*window)
    equ(expected)
  end

  # Called without arguments the result runs from 0 up to 1114112
  # (0x110000 -- presumably one past the largest Unicode code point).
  prep
  add(10, 22)
  @cs.negate
  equ('0 10 22 1114112')

  # Negating exactly the stored range leaves nothing.
  prep
  add(10, 20)
  neg(10, 20)
  equ('')

  # Same two-range set, negated over windows in different positions.
  [
    [[5, 10],  '5 20 30 40'],
    [[25, 30], '10 20 25 40'],
    [[40, 50], '10 20 30 50'],
    [[41, 50], '10 20 30 40 41 50'],
    [[15, 35], '10 15 20 30 35 40']
  ].each do |window, expected|
    prep
    add(10, 20)
    add(30, 40)
    neg(*window)
    equ(expected)
  end
end
|
235
|
+
|
236
|
+
# Checks remove for windows that cover a whole range, fall in a gap, cut
# into ranges, and for single codes at the start, past the end and in the
# middle of a range.
def test_remove
  prep
  add(10, 20)
  add(30, 40)
  remove(29, 41)
  equ('10 20')

  add(30, 40)
  equ('10 20 30 40')

  # Each entry: arguments for remove, then the expected boundary list.
  steps = [
    [[20, 30], '10 20 30 40'],   # the gap between ranges: nothing changes
    [[15, 35], '10 15 35 40'],
    [[10, 15], '35 40'],
    [[35],     '36 40'],
    [[40],     '36 40'],         # 40 is not in the set: nothing changes
    [[38],     '36 38 39 40'],
    [[37, 39], '36 37 39 40']
  ]

  steps.each do |args, expected|
    remove(*args)
    equ(expected)
  end
end
|
265
|
+
|
266
|
+
|
267
|
+
# Renders every element of +st+ with d(...) and strings the results
# together, inserting a single space whenever text has already been
# accumulated.
#
# @param st [#each] elements to render
# @return [String] the combined text ('' when +st+ yields nothing)
def dset(st)
  text = ''
  st.each do |item|
    text += ' ' unless text.empty?
    text += d(item)
  end
  text
end
|
277
|
+
|
278
|
+
|
279
|
+
# Replaces @par with a new RangePartition and returns it.
def newpar
  @par = RangePartition.new
end
|
282
|
+
|
283
|
+
# Wraps one range in a CodeSet and registers it with the partition @par.
#
# @param lower [Integer] first bound given to CodeSet.new
# @param upper [Integer, nil] second bound; lower + 1 is used when omitted
def addset(lower, upper = nil)
  range_set = CodeSet.new(lower, upper || lower + 1)
  @par.addSet(range_set)
end
|
288
|
+
|
289
|
+
# Runs @par.apply on the current set and concatenates the boundary arrays
# of the returned pieces, in order, into @parResult.
#
# @return [Array] the flattened boundaries (also stored in @parResult)
def apply
  boundaries = []
  @par.apply(@cs).each { |piece| boundaries.concat(piece.array) }
  @parResult = boundaries
end
|
297
|
+
|
298
|
+
# Builds a partition from four (partly overlapping) ranges, renders it via
# generatePDF, and checks how three sets are split when applied to it.
def test_partition
  return if SKIPMOST

  newpar
  [[20, 30], [25, 33], [37], [40, 50]].each { |bounds| addset(*bounds) }
  @par.prepare

  @par.generatePDF

  # Each entry: range added to a fresh set, then the expected pieces.
  # 25..33 straddles the 20..30 range, so it comes back as two pieces.
  [
    [[25, 33], '25 30 30 33'],
    [[37],     '37 38'],
    [[40, 50], '40 50']
  ].each do |bounds, expected|
    prep
    add(*bounds)
    apply
    equ(expected, @parResult)
  end
end
|
328
|
+
|
329
|
+
|
330
|
+
# Regular expression handed to RegParse by the NFA/DFA tests: an optionally
# negated run of digits, an identifier, or the literal text 333q.
REGEX_SCRIPT = '(\-?[0-9]+)|[_a-zA-Z][_a-zA-Z0-9]*|333q'
|
331
|
+
|
332
|
+
# Token-definition script given to TokenDefParser: one "name: regex" entry
# per line, each line newline-terminated.
TOKEN_SCRIPT2 = [
  'sep: \s',
  'tku: a(a|b)*',
  'tkv: b(aa|b*)',
  'tkw: bbb'
].map { |line| line + "\n" }.join
|
338
|
+
|
339
|
+
|
340
|
+
# Parses REGEX_SCRIPT into an NFA, then renders the NFA, its reversal and
# the DFA built from it through generatePDF. No assertions are made: the
# test only checks that every step completes.
def test_buildDFA
  return if SKIPMOST

  parser = RegParse.new(REGEX_SCRIPT)
  start = parser.startState
  parser.endState.finalState = true

  start.generatePDF("nfa")

  start.reverseNFA.generatePDF("reversed")

  DFABuilder.nfa_to_dfa(start).generatePDF("buildDFA")
end
|
355
|
+
|
356
|
+
# Converts the REGEX_SCRIPT NFA to a DFA, duplicates the DFA with
# duplicateNFA(42) and renders each stage through generatePDF. No
# assertions are made: the test only checks that every step completes.
def test_cvtNFAToDFA
  return if SKIPMOST

  parser = RegParse.new(REGEX_SCRIPT)
  start = parser.startState
  parser.endState.finalState = true

  start.generatePDF("nfa")

  dfa = DFABuilder.nfa_to_dfa(start)
  dfa.generatePDF("dfa")

  # duplicateNFA returns two values; only the first (a map indexed by the
  # original state) is needed here.
  old_to_new, _max_id = dfa.duplicateNFA(42)
  old_to_new[dfa].generatePDF("dfa_duplicated")
end
|
372
|
+
|
373
|
+
|
374
|
+
# Feeds TOKEN_SCRIPT2 to TokenDefParser and renders the start state of the
# resulting DFA through generatePDF. No assertions are made.
def test_TokenDefParser
  return if SKIPMOST

  parser = TokenDefParser.new(TOKEN_SCRIPT2)
  parser.dfa.startState.generatePDF("TokenDFA")
end
|
385
|
+
|
386
|
+
|
387
|
+
|
388
|
+
# Fixture contents, read once while the class body is evaluated:
# the text to tokenize and the token-definition script.
@@sampleText = readTextFile(dataPath('sampletext.txt'))
@@sampleTokens = readTextFile(dataPath('sampletokens.txt'))
|
390
|
+
|
391
|
+
# Builds a DFA from the sample token script (in memory, no output path) and
# returns a Tokenizer over the sample text.
def makeTok
  Tokenizer.new(DFA.dfa_from_script(@@sampleTokens), @@sampleText)
end
|
395
|
+
|
396
|
+
|
397
|
+
# Reads every token, pushes them all back with unread, then reads them
# again while naming the expected token type for each one. No explicit
# assertions are made; the test relies on read raising on a mismatch.
def test_Tokenizer
  return if SKIPMOST

  tok = makeTok

  tokens = []
  tokens.push(tok.read) while tok.hasNext

  tok.unread(tokens.size)

  tokens.each { |token| tok.read(tok.nameOf(token)) }
end
|
415
|
+
|
416
|
+
|
417
|
+
|
418
|
+
|
419
|
+
# After four plain reads, asking for a "signedint" token must raise
# TokenizerException.
def test_TokenizerMissingExpected
  return if SKIPMOST

  assert_raise(TokenizerException) do
    tok = makeTok

    4.times { tok.read }
    tok.read("signedint")
  end
end
|
434
|
+
|
435
|
+
# Compiles the sample token script with an output path and checks that the
# DFA file appears on disk, then builds a Tokenizer from the returned DFA.
def test_CompileDFAToDisk
  destPath = withinTestDir("sampletokens_dfa.txt")

  # Start from a clean slate so the existence check below is meaningful.
  File.delete(destPath) if File.exist?(destPath)
  assert(!File.exist?(destPath))

  dfa = DFA.dfa_from_script(@@sampleTokens, destPath)
  assert(File.exist?(destPath))

  Tokenizer.new(dfa, @@sampleText)
end
|
452
|
+
|
453
|
+
|
454
|
+
# Returns a Tokenizer over the sample text, driven by the DFA stored in
# "sampletokens_dfa.txt" inside the test directory.
#
# That file is otherwise written only by test_CompileDFAToDisk, so loading
# it unconditionally made every caller depend on that test having run
# first. When the file is missing it is now compiled on demand with the
# same call that test uses.
#
# @return [Tokenizer]
def prep2
  dfa_path = withinTestDir("sampletokens_dfa.txt")
  dfa = if File.exist?(dfa_path)
          DFA.dfa_from_file(dfa_path)
        else
          DFA.dfa_from_script(@@sampleTokens, dfa_path)
        end
  Tokenizer.new(dfa, @@sampleText)
end
|
459
|
+
|
460
|
+
# Prints every token as it is read. The first time a "DO" token is seen,
# four tokens are pushed back once and reading resumes from there.
def test_readAndUnread
  tok = prep2
  pushed_back = false

  while tok.hasNext
    token = tok.read
    pr("Read %-8s %s\n", tok.nameOf(token), d(token))

    # Push back only once, and only when a DO token has just been read.
    next if pushed_back
    next unless tok.nameOf(token) == "DO"

    pr(" ...pushing back four tokens...\n")
    tok.unread(4)
    pushed_back = true
    pr(" ...and resuming...\n")
  end
end
|
475
|
+
|
476
|
+
# Demanding a "BRCL" token directly after a "DO" token is expected to raise
# TokenizerException.
def test_UnrecognizedToken
  assert_raise(TokenizerException) do
    tok = prep2
    while tok.hasNext
      # The read("BRCL") is the call that should raise.
      tok.read("BRCL") if tok.nameOf(tok.read) == "DO"
    end
  end
end
|
487
|
+
|
488
|
+
# Drains the tokenizer, then reads once more; that extra read must raise
# TokenizerException.
def test_ReadPastEnd
  assert_raise(TokenizerException) do
    tok = prep2
    tok.read while tok.hasNext
    tok.read
  end
end
|
497
|
+
|
498
|
+
# Pushes back more tokens than have been read and expects
# TokenizerException.
def test_UnreadBeforeStart
  assert_raise(TokenizerException) do
    tok = prep2
    reads = 0
    while tok.hasNext
      tok.read
      reads += 1
      next unless reads == 15

      # After 15 reads: back 5 and 7 leaves 3 consumed, one read makes 4,
      # back 4 leaves 0, so the final unread(3) asks for more tokens than
      # have been read.
      tok.unread(5)
      tok.unread(7)
      tok.read
      tok.unread(4)
      tok.unread(3)
    end
    tok.read
  end
end
|
517
|
+
end
|
518
|
+
|
519
|
+
|
data/test/testcmds
ADDED
metadata
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tokn
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jeff Sember
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-03-07 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: 'Given a script containing token descriptions (each a regular expression),
|
14
|
+
tokn compiles a DFA which it then uses to efficiently extract a sequence of tokens
|
15
|
+
from source files. '
|
16
|
+
email: jpsember@gmail.com
|
17
|
+
executables:
|
18
|
+
- tokncompile
|
19
|
+
- toknprocess
|
20
|
+
extensions: []
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- lib/tokn.rb
|
24
|
+
- lib/tokn/code_set.rb
|
25
|
+
- lib/tokn/dfa.rb
|
26
|
+
- lib/tokn/dfa_builder.rb
|
27
|
+
- lib/tokn/range_partition.rb
|
28
|
+
- lib/tokn/reg_parse.rb
|
29
|
+
- lib/tokn/state.rb
|
30
|
+
- lib/tokn/token_defn_parser.rb
|
31
|
+
- lib/tokn/tokenizer.rb
|
32
|
+
- lib/tokn/tokn_const.rb
|
33
|
+
- lib/tokn/tools.rb
|
34
|
+
- bin/tokncompile
|
35
|
+
- bin/toknprocess
|
36
|
+
- README.txt
|
37
|
+
- test/data/sampletext.txt
|
38
|
+
- test/data/sampletokens.txt
|
39
|
+
- test/simple.rb
|
40
|
+
- test/test.rb
|
41
|
+
- test/testcmds
|
42
|
+
- figures/sample_dfa.pdf
|
43
|
+
homepage:
|
44
|
+
licenses: []
|
45
|
+
metadata: {}
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - '>='
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0'
|
60
|
+
requirements: []
|
61
|
+
rubyforge_project:
|
62
|
+
rubygems_version: 2.0.2
|
63
|
+
signing_key:
|
64
|
+
specification_version: 4
|
65
|
+
summary: Extracts tokens from source files
|
66
|
+
test_files:
|
67
|
+
- test/simple.rb
|
68
|
+
- test/test.rb
|
69
|
+
has_rdoc:
|