tokn 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.txt +194 -0
- data/bin/tokncompile +16 -0
- data/bin/toknprocess +26 -0
- data/figures/sample_dfa.pdf +0 -0
- data/lib/tokn/code_set.rb +392 -0
- data/lib/tokn/dfa.rb +196 -0
- data/lib/tokn/dfa_builder.rb +261 -0
- data/lib/tokn/range_partition.rb +233 -0
- data/lib/tokn/reg_parse.rb +379 -0
- data/lib/tokn/state.rb +320 -0
- data/lib/tokn/token_defn_parser.rb +156 -0
- data/lib/tokn/tokenizer.rb +211 -0
- data/lib/tokn/tokn_const.rb +29 -0
- data/lib/tokn/tools.rb +186 -0
- data/lib/tokn.rb +1 -0
- data/test/data/sampletext.txt +11 -0
- data/test/data/sampletokens.txt +32 -0
- data/test/simple.rb +33 -0
- data/test/test.rb +519 -0
- data/test/testcmds +4 -0
- metadata +69 -0
data/test/test.rb
ADDED
@@ -0,0 +1,519 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require_relative '../lib/tokn/tools.rb'
|
3
|
+
req('range_partition dfa dfa_builder tokenizer')
|
4
|
+
|
5
|
+
|
6
|
+
# Build the path of a fixture file kept in the "data" directory that sits
# next to this test script.
#
# @param f [String] file name relative to the data directory
# @return [String] "<directory of this file>/data/<f>"
def dataPath(f)
  data_dir = File.dirname(__FILE__) + "/data/"
  data_dir + f
end
|
9
|
+
|
10
|
+
# setTestDir is not defined in this file.
# NOTE(review): presumably it comes from tools.rb and prepares the directory
# that withinTestDir resolves paths against -- confirm.
setTestDir()
|
11
|
+
|
12
|
+
# Various unit tests for state machines, character range sets, etc.
|
13
|
+
#
|
14
|
+
class TestComponent < Test::Unit::TestCase
|
15
|
+
|
16
|
+
SKIPMOST = false # skip most of the tests?
|
17
|
+
|
18
|
+
# Add a code, or a range of codes, to the set under test (@cs).
#
# Both arguments are handed to @cs.add unchanged; +upper+ is nil when the
# caller supplies a single value.
def add(lower, upper = nil)
  target = @cs
  target.add(lower, upper)
end
|
21
|
+
|
22
|
+
# Remove a code, or a range of codes, from the set under test (@cs).
#
# Both arguments are handed to @cs.remove unchanged; +upper+ is nil when the
# caller supplies a single value.
def remove(lower, upper = nil)
  target = @cs
  target.remove(lower, upper)
end
|
25
|
+
|
26
|
+
# Keep the current set as @ct (the operand used by isect and diff) and
# start over with a new @cs by calling prep.
#
# Returns whatever prep returns.
def swap
  @ct = @cs
  fresh_set = prep
  fresh_set
end
|
30
|
+
|
31
|
+
# Intersect the set under test (@cs) in place with the saved set (@ct).
def isect
  other = @ct
  @cs.intersect!(other)
end
|
34
|
+
|
35
|
+
# Subtract the saved set (@ct) from the set under test (@cs), in place.
def diff
  other = @ct
  @cs.difference!(other)
end
|
38
|
+
|
39
|
+
# Assert that an array of integers matches a whitespace-separated list.
#
# @param s   [String] expected values, e.g. '50 51 72 81' ('' means empty)
# @param arr [Array, nil] actual values; when omitted, @cs.array is checked
def equ(s, arr = nil)
  actual = arr || @cs.array
  expected = s.split.map(&:to_i)
  assert_equal(expected, actual)
end
|
44
|
+
|
45
|
+
# Adds a series of single codes and ranges to one CodeSet and checks the
# resulting array after every step.
def test_add
  prep

  # Each entry: the arguments passed to add, then the expected contents.
  # A single value n shows up as the pair "n n+1" in the expected array.
  steps = [
    [[72, 81], '72 81'],
    [[50],     '50 51 72 81'],
    [[75, 77], '50 51 72 81'],
    [[72, 78], '50 51 72 81'],
    [[70, 78], '50 51 70 81'],
    [[60],     '50 51 60 61 70 81'],
    [[40],     '40 41 50 51 60 61 70 81'],
    [[41],     '40 42 50 51 60 61 70 81'],
    [[81],     '40 42 50 51 60 61 70 82'],
    [[83],     '40 42 50 51 60 61 70 82 83 84'],
    [[49, 84], '40 42 49 84'],
    [[39, 86], '39 86']
  ]

  outcome = nil
  steps.each do |range, expected|
    add(*range)
    outcome = equ(expected)
  end
  outcome
end
|
84
|
+
|
85
|
+
# Exercises in-place intersection of @cs with @ct.
# swap stores the current set in @ct and leaves @cs empty for the next operand.
def test_intersect
  # One range fully inside another.
  prep
  add(39, 86)
  swap
  add(50, 70)
  isect
  equ('50 70')

  # Several ranges against the previous result (50..70).
  swap
  add(20, 25)
  add(35, 51)
  add(62, 68)
  add(72, 80)
  isect
  equ('50 51 62 68')

  # Intersecting with an empty set gives an empty set.
  prep
  swap
  add(50, 70)
  isect
  equ('')

  # Two identical sets.
  add(50, 70)
  swap
  add(50, 70)
  isect
  equ('50 70')

  # Ranges that only touch (20..25 and 25..30) have nothing in common.
  prep
  add(20, 25)
  swap
  add(25, 30)
  isect
  equ('')
end
|
122
|
+
|
123
|
+
# Exercises in-place difference: each case subtracts the same saved set
# (20..30 and 40..50, held in @ct after swap) from a newly built @cs.
def test_difference
  prep
  add(20, 30)
  add(40, 50)
  swap

  # Each entry: the ranges added to @cs, then the expected remainder.
  cases = [
    [[[20, 80]],           '30 40 50 80'],
    [[[19, 32]],           '19 20 30 32'],
    [[[30, 40]],           '30 40'],
    [[[20, 30], [40, 50]], ''],
    [[[19, 30], [40, 50]], '19 20'],
    [[[20, 30], [40, 51]], '50 51']
  ]

  outcome = nil
  cases.each_with_index do |(ranges, expected), index|
    # swap has already left @cs empty for the first case
    prep unless index.zero?
    ranges.each { |lower, upper| add(lower, upper) }
    diff
    outcome = equ(expected)
  end
  outcome
end
|
164
|
+
|
165
|
+
# Replace the set under test (@cs) with a newly constructed CodeSet and
# return it.
def prep
  fresh_set = CodeSet.new
  @cs = fresh_set
end
|
168
|
+
|
169
|
+
# add must raise RangeError for a reversed range (60,50) and for a range
# whose bounds are equal (60,60).
def test_illegalRange
  prep

  assert_raise(RangeError) do
    add(60, 50)
  end
  assert_raise(RangeError) do
    add(60, 60)
  end
end
|
175
|
+
|
176
|
+
# Negate the set under test (@cs) within the given bounds; both arguments
# are passed to @cs.negate unchanged.
def neg(lower, upper)
  target = @cs
  target.negate(lower, upper)
end
|
179
|
+
|
180
|
+
# Exercises CodeSet negation, both bounded (via neg) and with no arguments.
def test_negate
  prep
  add(10, 15)
  add(20, 25)
  add(30)
  add(40, 45)
  equ('10 15 20 25 30 31 40 45')

  # Successive bounded negations applied to the same set.
  [
    [[22, 37], '10 15 20 22 25 30 31 37 40 45'],
    [[25, 27], '10 15 20 22 27 30 31 37 40 45'],
    [[15, 20], '10 22 27 30 31 37 40 45']
  ].each do |(lower, upper), expected|
    neg(lower, upper)
    equ(expected)
  end

  # Negation with no arguments.
  # NOTE(review): 1114112 is 0x110000; presumably the exclusive upper limit
  # of the code range CodeSet works over -- confirm in code_set.rb.
  prep
  add(10, 22)
  @cs.negate
  equ('0 10 22 1114112')

  # Negating exactly the range that is present empties the set.
  prep
  add(10, 20)
  neg(10, 20)
  equ('')

  # The same two ranges (10..20, 30..40) negated over different windows.
  outcome = nil
  [
    [[5, 10],  '5 20 30 40'],
    [[25, 30], '10 20 25 40'],
    [[40, 50], '10 20 30 50'],
    [[41, 50], '10 20 30 40 41 50'],
    [[15, 35], '10 15 20 30 35 40']
  ].each do |(lower, upper), expected|
    prep
    add(10, 20)
    add(30, 40)
    neg(lower, upper)
    outcome = equ(expected)
  end
  outcome
end
|
235
|
+
|
236
|
+
# Removes single codes and ranges from one CodeSet (re-adding a range once
# along the way) and checks the resulting array after every step.
def test_remove
  prep
  add(10, 20)
  add(30, 40)

  # Each entry: helper to call, its arguments, expected contents afterwards.
  steps = [
    [:remove, [29, 41], '10 20'],
    [:add,    [30, 40], '10 20 30 40'],
    [:remove, [20, 30], '10 20 30 40'],
    [:remove, [15, 35], '10 15 35 40'],
    [:remove, [10, 15], '35 40'],
    [:remove, [35],     '36 40'],
    [:remove, [40],     '36 40'],
    [:remove, [38],     '36 38 39 40'],
    [:remove, [37, 39], '36 37 39 40']
  ]

  outcome = nil
  steps.each do |operation, args, expected|
    send(operation, *args)
    outcome = equ(expected)
  end
  outcome
end
|
265
|
+
|
266
|
+
|
267
|
+
# Describe every element of a collection with d (defined outside this file)
# and join the descriptions with single spaces.
#
# Uses map + join instead of growing a string with += and a hand-rolled
# "add a space unless first" check.
#
# @param st [Enumerable] elements to describe
# @return [String] space-separated descriptions; '' for an empty collection
def dset(st)
  st.map { |x| d(x) }.join(' ')
end
|
277
|
+
|
278
|
+
|
279
|
+
# Replace the partition under test (@par) with a newly constructed
# RangePartition and return it.
def newpar
  fresh_partition = RangePartition.new
  @par = fresh_partition
end
|
282
|
+
|
283
|
+
# Register a CodeSet covering lower..upper with the partition under test.
#
# When +upper+ is omitted the set is built with upper = lower + 1.
def addset(lower, upper = nil)
  range_set = CodeSet.new(lower, upper || lower + 1)
  @par.addSet(range_set)
end
|
288
|
+
|
289
|
+
# Run the partition (@par) over the set under test (@cs) and flatten the
# arrays of the returned pieces into one list, stored in @parResult.
#
# @return [Array] the flattened list (also kept in @parResult)
def apply
  pieces = @par.apply(@cs)
  @parResult = pieces.flat_map { |piece| piece.array }
end
|
297
|
+
|
298
|
+
# Builds a RangePartition from four sets, then checks how three of those
# sets are split when the partition is applied to them.
def test_partition
  return if SKIPMOST

  newpar
  [[20, 30], [25, 33], [37], [40, 50]].each { |bounds| addset(*bounds) }
  @par.prepare

  @par.generatePDF()

  # Each entry: the range loaded into @cs, then the expected flattened pieces.
  queries = [
    [[25, 33], '25 30 30 33'],
    [[37],     '37 38'],
    [[40, 50], '40 50']
  ]

  outcome = nil
  queries.each do |bounds, expected|
    prep
    add(*bounds)
    apply
    outcome = equ(expected, @parResult)
  end
  outcome
end
|
328
|
+
|
329
|
+
|
330
|
+
REGEX_SCRIPT = "(\\-?[0-9]+)|[_a-zA-Z][_a-zA-Z0-9]*|333q"
|
331
|
+
|
332
|
+
# Token-definition script handed to TokenDefParser: one "name: regex" entry
# per line. The single-quoted heredoc keeps the backslash in \s literal.
TOKEN_SCRIPT2 = <<'END'
sep: \s
tku: a(a|b)*
tkv: b(aa|b*)
tkw: bbb
END
|
338
|
+
|
339
|
+
|
340
|
+
# Parses REGEX_SCRIPT into an NFA, then renders the NFA, its reversal and
# the DFA built from it via generatePDF. No assertions are made; the test
# passes as long as nothing raises.
def test_buildDFA
  return if SKIPMOST

  parser = RegParse.new(REGEX_SCRIPT)
  start = parser.startState
  parser.endState.finalState = true

  start.generatePDF("nfa")
  start.reverseNFA().generatePDF("reversed")
  DFABuilder.nfa_to_dfa(start).generatePDF("buildDFA")
end
|
355
|
+
|
356
|
+
# Converts the NFA for REGEX_SCRIPT to a DFA, duplicates the DFA with
# duplicateNFA(42), and renders each stage via generatePDF. No assertions
# are made; the test passes as long as nothing raises.
def test_cvtNFAToDFA
  return if SKIPMOST

  x = RegParse.new(REGEX_SCRIPT)
  s = x.startState
  x.endState.finalState = true

  s.generatePDF("nfa")

  dfa = DFABuilder.nfa_to_dfa(s)
  dfa.generatePDF("dfa")

  # duplicateNFA returns two values; only the old-state => new-state map is
  # needed here, so the second value is bound to an underscore-prefixed name
  # to mark it as intentionally unused.
  oldToNewMap, _maxId = dfa.duplicateNFA(42)
  dfa2 = oldToNewMap[dfa]
  dfa2.generatePDF("dfa_duplicated")
end
|
372
|
+
|
373
|
+
|
374
|
+
# Feeds TOKEN_SCRIPT2 to TokenDefParser and renders the start state of the
# resulting DFA via generatePDF. No assertions are made; the test passes as
# long as nothing raises.
def test_TokenDefParser
  return if SKIPMOST

  parser = TokenDefParser.new(TOKEN_SCRIPT2)
  parser.dfa.startState.generatePDF("TokenDFA")
end
|
385
|
+
|
386
|
+
|
387
|
+
|
388
|
+
# Result of readTextFile for data/sampletext.txt, evaluated once when the
# class body runs; passed to Tokenizer.new as the text to tokenize.
@@sampleText = readTextFile(dataPath("sampletext.txt"))
|
389
|
+
# Result of readTextFile for data/sampletokens.txt, evaluated once when the
# class body runs; passed to DFA.dfa_from_script as the token script.
@@sampleTokens = readTextFile(dataPath("sampletokens.txt"))
|
390
|
+
|
391
|
+
# Build a Tokenizer over the sample text, using a DFA compiled from the
# sample token script (nothing is written to disk here).
def makeTok
  Tokenizer.new(DFA.dfa_from_script(@@sampleTokens), @@sampleText)
end
|
395
|
+
|
396
|
+
|
397
|
+
# Reads every token from the sample text, pushes them all back, then reads
# them again, this time naming the expected token type for each read.
# No explicit assertions are made; the test passes as long as nothing raises.
def test_Tokenizer
  return if SKIPMOST

  tok = makeTok

  # First pass: collect every token.
  tokList = []
  tokList.push(tok.read) while tok.hasNext

  # Rewind to the start of the input.
  tok.unread(tokList.size)

  # Second pass: read with the expected token name. The value returned by
  # read is not needed, so it is no longer stored in an unused local.
  tokList.each do |t1|
    tok.read(tok.nameOf(t1))
  end
end
|
415
|
+
|
416
|
+
|
417
|
+
|
418
|
+
|
419
|
+
# After four plain reads, asking for a "signedint" token must raise
# TokenizerException.
def test_TokenizerMissingExpected
  return if SKIPMOST

  assert_raise(TokenizerException) do
    tok = makeTok

    4.times { tok.read }
    tok.read("signedint")
  end
end
|
434
|
+
|
435
|
+
# Compiles the sample token script to a DFA, asking dfa_from_script to
# write it to sampletokens_dfa.txt in the test directory, and checks that
# the file appears. A Tokenizer is then constructed from the returned DFA.
def test_CompileDFAToDisk
  tokScript = @@sampleTokens
  testText = @@sampleText

  destPath = withinTestDir("sampletokens_dfa.txt")

  # Start from a clean slate so the existence check below is meaningful.
  File.delete(destPath) if File.exist?(destPath)
  assert(!File.exist?(destPath))

  dfa = DFA.dfa_from_script(tokScript, destPath)
  assert(File.exist?(destPath))

  # Only construction is exercised; the tokenizer is no longer stored in an
  # unused local.
  Tokenizer.new(dfa, testText)
end
|
452
|
+
|
453
|
+
|
454
|
+
# Build a Tokenizer over the sample text from the DFA stored on disk in
# sampletokens_dfa.txt (inside the test directory).
#
# That file is otherwise only written by test_CompileDFAToDisk, so tests
# using this helper used to depend on that test having run first. If the
# file is missing it is now compiled from the sample token script, using
# the same dfa_from_script(script, path) call that test relies on.
#
# @return the new Tokenizer
def prep2
  dfa_path = withinTestDir("sampletokens_dfa.txt")
  DFA.dfa_from_script(@@sampleTokens, dfa_path) unless File.exist?(dfa_path)

  dfa = DFA.dfa_from_file(dfa_path)
  Tokenizer.new(dfa, @@sampleText)
end
|
459
|
+
|
460
|
+
# Reads through the sample text, printing each token. The first time a "DO"
# token is seen, four tokens are pushed back and reading resumes from there.
def test_readAndUnread
  tok = prep2
  pushed_back = false

  while tok.hasNext
    t = tok.read
    pr("Read %-8s %s\n", tok.nameOf(t), d(t))

    # Push back only once, and only when a DO token has just been read.
    next if pushed_back || tok.nameOf(t) != "DO"

    pr(" ...pushing back four tokens...\n")
    tok.unread(4)
    pushed_back = true
    pr(" ...and resuming...\n")
  end
end
|
475
|
+
|
476
|
+
# Demanding a "BRCL" token right after a "DO" token must raise
# TokenizerException.
def test_UnrecognizedToken
  assert_raise(TokenizerException) do
    tok = prep2
    while tok.hasNext
      token = tok.read
      next unless tok.nameOf(token) == "DO"

      tok.read("BRCL") # <== this should raise problem
    end
  end
end
|
487
|
+
|
488
|
+
# Reading once more after the input is exhausted must raise
# TokenizerException.
def test_ReadPastEnd
  assert_raise TokenizerException do
    tok = prep2

    # Drain the input; the tokens themselves are not needed, so they are no
    # longer stored in an unused local.
    tok.read while tok.hasNext

    tok.read
  end
end
|
497
|
+
|
498
|
+
# Pushing back more tokens than have been read must raise
# TokenizerException. After the 15th read the test unreads 5 and 7, reads
# one token, then unreads 4 and 3: 19 tokens pushed back against 16 read.
def test_UnreadBeforeStart
  assert_raise TokenizerException do
    tok = prep2
    k = 0
    while tok.hasNext
      # The token itself is not needed, so it is no longer stored in an
      # unused local.
      tok.read
      k += 1
      next unless k == 15

      tok.unread(5)
      tok.unread(7)
      tok.read()
      tok.unread(4)
      tok.unread(3)
    end
    tok.read
  end
end
|
517
|
+
end
|
518
|
+
|
519
|
+
|
data/test/testcmds
ADDED
metadata
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tokn
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jeff Sember
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-03-07 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: 'Given a script containing token descriptions (each a regular expression),
|
14
|
+
tokn compiles a DFA which it then uses to efficiently extract a sequence of tokens
|
15
|
+
from source files. '
|
16
|
+
email: jpsember@gmail.com
|
17
|
+
executables:
|
18
|
+
- tokncompile
|
19
|
+
- toknprocess
|
20
|
+
extensions: []
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- lib/tokn.rb
|
24
|
+
- lib/tokn/code_set.rb
|
25
|
+
- lib/tokn/dfa.rb
|
26
|
+
- lib/tokn/dfa_builder.rb
|
27
|
+
- lib/tokn/range_partition.rb
|
28
|
+
- lib/tokn/reg_parse.rb
|
29
|
+
- lib/tokn/state.rb
|
30
|
+
- lib/tokn/token_defn_parser.rb
|
31
|
+
- lib/tokn/tokenizer.rb
|
32
|
+
- lib/tokn/tokn_const.rb
|
33
|
+
- lib/tokn/tools.rb
|
34
|
+
- bin/tokncompile
|
35
|
+
- bin/toknprocess
|
36
|
+
- README.txt
|
37
|
+
- test/data/sampletext.txt
|
38
|
+
- test/data/sampletokens.txt
|
39
|
+
- test/simple.rb
|
40
|
+
- test/test.rb
|
41
|
+
- test/testcmds
|
42
|
+
- figures/sample_dfa.pdf
|
43
|
+
homepage:
|
44
|
+
licenses: []
|
45
|
+
metadata: {}
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - '>='
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0'
|
60
|
+
requirements: []
|
61
|
+
rubyforge_project:
|
62
|
+
rubygems_version: 2.0.2
|
63
|
+
signing_key:
|
64
|
+
specification_version: 4
|
65
|
+
summary: Extracts tokens from source files
|
66
|
+
test_files:
|
67
|
+
- test/simple.rb
|
68
|
+
- test/test.rb
|
69
|
+
has_rdoc:
|