StringGenerator 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- strgen/__init__.py +797 -0
- strgen/countries.py +212 -0
- strgen/tests.py +497 -0
- stringgenerator-0.5.0.dist-info/METADATA +189 -0
- stringgenerator-0.5.0.dist-info/RECORD +8 -0
- stringgenerator-0.5.0.dist-info/WHEEL +5 -0
- stringgenerator-0.5.0.dist-info/licenses/LICENSE +29 -0
- stringgenerator-0.5.0.dist-info/top_level.txt +1 -0
strgen/__init__.py
ADDED
|
@@ -0,0 +1,797 @@
|
|
|
1
|
+
# Copyright (c) 2013-2020, Paul Wolf
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
|
|
4
|
+
# Redistribution and use in source and binary forms, with or without
|
|
5
|
+
# modification, are permitted provided that the following conditions are
|
|
6
|
+
# met:
|
|
7
|
+
|
|
8
|
+
# 1. Redistributions of source code must retain the above copyright
|
|
9
|
+
# notice, this list of conditions and the following disclaimer.
|
|
10
|
+
|
|
11
|
+
# 2. Redistributions in binary form must reproduce the above copyright
|
|
12
|
+
# notice, this list of conditions and the following disclaimer in the
|
|
13
|
+
# documentation and/or other materials provided with the distribution.
|
|
14
|
+
|
|
15
|
+
# 3. Neither the name of Yewleaf Ltd. nor the names of its contributors
|
|
16
|
+
# may be used to endorse or promote products derived from this software
|
|
17
|
+
# without specific prior written permission.
|
|
18
|
+
|
|
19
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
20
|
+
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
21
|
+
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
22
|
+
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
23
|
+
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
24
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
25
|
+
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
26
|
+
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
27
|
+
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
28
|
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
29
|
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
30
|
+
|
|
31
|
+
# Original author: paul.wolf@yewleaf.com
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
import os
|
|
35
|
+
import random
|
|
36
|
+
import string
|
|
37
|
+
import types
|
|
38
|
+
import typing
|
|
39
|
+
import math
|
|
40
|
+
import itertools
|
|
41
|
+
from abc import ABC, abstractmethod
|
|
42
|
+
from collections import Counter, namedtuple
|
|
43
|
+
from math import factorial
|
|
44
|
+
|
|
45
|
+
# Package metadata. ``__version__`` is printed by ``StringGenerator.dump``.
__version__ = "0.5.0"
__author__ = "Paul Wolf"
__license__ = "BSD"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# A lexer token produced by ``StringGenerator._tokenize``.
#
# ``type``    one of the structural kinds (LBRACKET, PIPE, ...), "CHAR" or "EOF".
# ``value``   the source character the token stands for (``None`` for EOF).
# ``escaped`` defaults to False; only meaningful for CHAR tokens, where it
#             records that the character came from a backslash escape. This
#             lets the parser tell a literal "[" from a class opener without
#             ever re-examining backslashes.
Token = namedtuple("Token", ["type", "value", "escaped"], defaults=[False])
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def permutation_count(s):
    """Count the distinct orderings of the items in ``s``.

    This is the multinomial coefficient ``len(s)! / (n1! * n2! * ...)`` where
    each ``n`` is the number of occurrences of one distinct item, so repeated
    items are not treated as distinguishable.

    Written by hand because math.perm does not exist before Python 3.8.
    https://codereview.stackexchange.com/questions/132704/counting-permutations-without-repetitions-for-a-number-or-a-string
    """
    remaining = factorial(len(s))
    # Dividing group by group stays exact: every intermediate quotient is
    # itself a whole number (a partial multinomial coefficient).
    for occurrences in Counter(s).values():
        remaining //= factorial(occurrences)
    return remaining
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def randomizer_factory(seed) -> random.Random:
    """Return an object that provides randint, choice, choices and shuffle.

    Args:
        seed: value used to seed a deterministic ``random.Random``. ``None``
            means "no seed". Any other value -- including falsy ones such as
            ``0`` -- is honoured, so seeded output is reproducible.

    Returns:
        ``random.Random(seed)`` when a seed is given; otherwise a
        ``random.SystemRandom``, falling back to an unseeded ``random.Random``
        if the system generator cannot be created.
    """
    # Compare against None explicitly: a plain truthiness test would discard
    # seed=0 and silently hand back a non-deterministic generator.
    if seed is not None:
        return random.Random(seed)
    try:
        return random.SystemRandom()
    except Exception:
        # Deliberate best-effort fallback kept from the original behaviour.
        return random.Random()
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class BufferedSecureRandom(random.Random):
    """Cryptographically secure RNG that buffers ``os.urandom`` in bulk.

    ``random.SystemRandom`` reads from the OS entropy pool on every draw, which
    means one syscall per random value. Generating large batches (e.g.
    ``render_set(1_000_000)``) then spends most of its time in the kernel.

    This class draws the same kind of entropy -- raw ``os.urandom`` bytes, each
    consumed once and never expanded by a userspace PRNG -- but reads it in big
    chunks, amortizing the syscall across many values.

    Pass it via the ``randomizer`` argument; it is reachable without an extra
    import as ``StringGenerator.BufferedSecureRandom``::

        SG(r"[\\w\\p]{32}", randomizer=SG.BufferedSecureRandom()).render_set(50000)

    Being entropy-based, it ignores seeding, and its state cannot be saved or
    restored.
    """

    def __init__(self, bufsize=1 << 20):
        """Create the generator.

        Args:
            bufsize: number of bytes requested from ``os.urandom`` per refill
                (a larger single request is made when one draw needs more).
        """
        # Buffer state is set up before delegating to the base initializer so
        # it already exists if that initializer calls back into this object.
        self._buf = b""
        self._i = 0
        self._bufsize = bufsize
        super().__init__()

    def _take(self, n):
        """Return n fresh random bytes, refilling the buffer when needed."""
        if self._i + n > len(self._buf):
            # Not enough unread bytes: drop the remainder and fetch a new chunk
            # that is at least large enough for this request.
            self._buf = os.urandom(max(n, self._bufsize))
            self._i = 0
        chunk = self._buf[self._i : self._i + n]
        self._i += n
        return chunk

    def random(self):
        """Return a 53-bit float in [0.0, 1.0), as SystemRandom.random does."""
        # 7 bytes = 56 bits; discard the low 3 to keep 53 bits of precision.
        return (int.from_bytes(self._take(7), "big") >> 3) * (2.0**-53)

    def getrandbits(self, k):
        """Return a non-negative int with ``k`` random bits.

        ``k == 0`` returns 0, matching ``random.Random.getrandbits`` on
        Python 3.9+, so inherited helpers that may request zero bits (for
        example ``randbytes(0)``) work.

        Raises:
            ValueError: if ``k`` is negative.
        """
        if k < 0:
            raise ValueError("number of bits must be non-negative")
        if k == 0:
            return 0
        nbytes = (k + 7) // 8
        # Shift away the surplus low bits of the final partial byte.
        return int.from_bytes(self._take(nbytes), "big") >> (nbytes * 8 - k)

    def choices(self, population, weights=None, *, cum_weights=None, k=1):
        """Return ``k`` picks from ``population`` (with replacement).

        Unweighted draws map bytes straight to indices via rejection
        sampling, skipping the per-pick float construction in random(). This
        is the hot path for character sets. Weighted draws, an empty
        population, or alphabets larger than one byte fall back to the
        standard implementation (which still uses our random()).
        """
        n = len(population)
        if weights is not None or cum_weights is not None or n == 0 or n > 256:
            return super().choices(population, weights, cum_weights=cum_weights, k=k)
        # Largest multiple of n that is <= 256. Bytes at or above it are
        # rejected so that ``byte % n`` is uniform over the population.
        limit = 256 - (256 % n)
        out = []
        append = out.append
        take = self._take
        while len(out) < k:
            # Request exactly as many bytes as picks are still missing;
            # rejected bytes are made up for on the next pass.
            for byte in take(k - len(out)):
                if byte < limit:
                    append(population[byte % n])
                    if len(out) == k:
                        break
        return out

    def seed(self, *args, **kwargs):
        """No-op: entropy-based, so there is no seed state to set."""

    def _notimplemented(self, *args, **kwargs):
        """Shared body for the state accessors, which are unsupported."""
        raise NotImplementedError("BufferedSecureRandom is entropy-based; state cannot be saved or restored")

    getstate = setstate = _notimplemented
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class StringGenerator:
    """Generate a randomized string of characters using a template.

    The purpose of this class is to generate a string of characters
    according to a template. The template language is superficially
    similar to regular expressions but fundamentally different in
    purpose.

    Usage:

        StringGenerator(<template>).render()

    or:

        StringGenerator(<template>).render_list(10, unique=True)

    The latter produces a list of 10 strings that are unique within
    the list.

    Example:

        `StringGenerator(r"[\\d]{10}").render_list(10, unique=True)`

    This generates 10 unique strings containing digits. Each will be
    10 characters in length.

    """

    # Per-instance; assigned in __init__. Declared here only as a type hint so
    # it is never a shared class attribute (which would let one instance clobber
    # another's RNG and break seeded determinism / thread-safety).
    randomizer: typing.Optional[random.Random]

    # Exposed here so callers can opt into the fast secure RNG without a second
    # import: SG(pattern, randomizer=SG.BufferedSecureRandom()).
    BufferedSecureRandom = BufferedSecureRandom

    class SyntaxError(Exception):
        """Raised by the parser when a template is malformed."""

    class UniquenessError(Exception):
        """Raised when a template can't generate the required count of unique strings."""

    # The template metacharacters (the same characters that key _meta_token).
    meta_chars = "[]{}()|&$"
    # One indentation step used by the nodes' dump() methods.
    mytab = " " * 4

    # Character-class shortcuts: inside [...], a backslash followed by one of
    # these keys expands to the mapped characters (see _parse_character_class).
    # Note that "u" and "U" both map to uppercase and "c" maps to lowercase.
    string_code = {
        "d": string.digits,
        "w": "_" + string.ascii_letters + string.digits,
        "W": string.whitespace + string.punctuation,
        "s": string.whitespace,
        "p": string.punctuation,
        "l": string.ascii_letters,
        "u": string.ascii_uppercase,
        "U": string.ascii_uppercase,
        "c": string.ascii_lowercase,
        "o": string.octdigits,
        "h": string.hexdigits,
        "r": string.printable,
    }
    # Human-readable description of each shortcut in string_code.
    # NOTE(review): the "w" entry evaluates to "_ascii_letters + digits";
    # presumably "_ + ascii_letters + digits" was intended -- confirm.
    string_code_help = {
        "d": "digits",
        "w": "_" + "ascii_letters + digits",
        "W": "whitespace + punctuation",
        "s": "whitespace",
        "p": "punctuation",
        "l": "ascii_letters",
        "u": "ascii_uppercase",
        "U": "ascii_uppercase",
        "c": "ascii_lowercase",
        "o": "octdigits",
        "h": "hexdigits",
        "r": "printable",
    }
|
|
233
|
+
|
|
234
|
+
class StringNode(ABC):
    """Abstract interface shared by the leaf nodes of a parsed template."""

    @abstractmethod
    def render(self, randomizer, **kwargs):
        """Produce this node's text using ``randomizer``."""

    @abstractmethod
    def count(self, randomizer, **kwargs):
        """Report how many outcomes this node can produce."""

    @abstractmethod
    def dump(self):
        """Print a description of this node."""
|
|
248
|
+
|
|
249
|
+
class Sequence:
    """An ordered run of template nodes rendered one after another."""

    def __init__(self, seq):
        """Store ``seq``, the list of child nodes."""
        self.seq = seq  # list of StringNodes

    def render(self, randomizer, **kwargs):
        """Concatenate the rendering of every child, in order."""
        pieces = []
        for node in self.seq:
            pieces.append(node.render(randomizer, **kwargs))
        return "".join(pieces)

    def count(self, randomizer, **kwargs):
        """Multiply the children's counts together: P x P x P...

        An empty sequence therefore has a count of 1.
        """
        child_counts = [node.count(randomizer, **kwargs) for node in self.seq]
        product = 1
        for child_count in child_counts:
            product *= child_count
        return product

    def dump(self, level=-1):
        """Print this node's class name, then each child one level deeper."""
        indent = StringGenerator.mytab * level
        print(indent + f"{self.__class__.__name__}")
        child_level = level + 1
        for node in self.seq:
            node.dump(child_level)
|
|
274
|
+
|
|
275
|
+
class SequenceOR(Sequence):
    """Alternation: each render picks exactly one of the operands."""

    def render(self, randomizer, **kwargs):
        """Render one operand, selected by an index from ``randomizer.randint``."""
        last_index = len(self.seq) - 1
        chosen = self.seq[randomizer.randint(0, last_index)]
        return chosen.render(randomizer, **kwargs)

    def count(self, randomizer, **kwargs):
        """Add up the operands' counts."""
        total = 0
        for operand in self.seq:
            total += operand.count(randomizer, **kwargs)
        return total

    def dump(self, level=-1):
        """Print this node's repr, then each operand one level deeper."""
        indent = StringGenerator.mytab * level
        print(indent + repr(self))
        for operand in self.seq:
            operand.dump(level + 1)

    def __repr__(self):
        return f"{self.__class__.__name__}"

    def __str__(self):
        return "OR"
|
|
296
|
+
|
|
297
|
+
class SequenceAND(Sequence):
    """Render every operand, then shuffle all resulting characters together."""

    def _rendered_chars(self, randomizer, **kwargs):
        """Return the characters of all operands' renderings as one list."""
        rendered = [operand.render(randomizer, **kwargs) for operand in self.seq]
        return list("".join(rendered))

    def render(self, randomizer, **kwargs):
        """Return a permutation without replacement of all characters in seq."""
        pool = self._rendered_chars(randomizer, **kwargs)
        randomizer.shuffle(pool)
        return "".join(pool)

    def count(self, randomizer, **kwargs):
        """Count the permutations of one sample rendering.

        This does not work for complex expressions: the figure comes from a
        single rendering of the operands rather than from the template itself.
        """
        return permutation_count(self._rendered_chars(randomizer, **kwargs))

    def dump(self, level=-1):
        """Print this node's repr, then each operand one level deeper."""
        indent = StringGenerator.mytab * level
        print(indent + repr(self))
        for operand in self.seq:
            operand.dump(level + 1)

    def __str__(self):
        return "AND"

    def __repr__(self):
        return f"{self.__class__.__name__}"
|
|
323
|
+
|
|
324
|
+
class Literal(StringNode):
    """A fixed piece of text that renders as itself."""

    def __init__(self, chars):
        """Store ``chars``, the literal string to emit."""
        self.literal = chars  # a literal string

    def render(self, randomizer, **kwargs):
        """Return the stored text; the randomizer is not consulted."""
        text = self.literal
        return text

    def count(self, randomizer, **kwargs):
        """A literal has exactly one possible outcome."""
        return 1

    def dump(self, level=0):
        """Print this node's repr indented by ``level`` steps."""
        indent = StringGenerator.mytab * level
        print(indent + repr(self))

    def __str__(self):
        return self.literal

    def __repr__(self):
        class_name = self.__class__.__name__
        return f"{class_name}: {self.literal}"
|
|
344
|
+
|
|
345
|
+
class CharacterSet(StringNode):
    """Draw characters, with replacement, from a fixed set of characters."""

    def __init__(self, chars, start, cnt):
        """Store the alphabet and the length specification.

        Args:
            chars: the characters to draw from.
            start: lower bound of the length range, or a negative value for a
                fixed length.
            cnt: the fixed length, or the upper bound of the length range.

        Both bounds are passed through ``int()``; a conversion error
        propagates to the caller.
        """
        self.chars = chars
        self.start = int(start)
        self.cnt = int(cnt)

    def render(self, randomizer, **kwargs):
        """Return a string of randomly chosen characters from the set."""
        if self.start <= -1:
            length = self.cnt
        else:
            length = randomizer.randint(self.start, self.cnt)

        # A single choices() call draws every character at once instead of
        # making one randomizer call per character.
        picked = randomizer.choices(self.chars, k=length)
        return "".join(picked)

    def count(self, randomizer, **kwargs):
        """Permutation with replacement.

        ``c ** cnt`` for a fixed length, otherwise the sum of ``c ** r`` over
        every length ``r`` in the range, where ``c`` is ``len(self.chars)``.
        """
        alphabet_size = len(self.chars)
        if self.start < 0:
            # fixed length
            return alphabet_size**self.cnt
        # range
        lengths = range(self.start, self.cnt + 1)
        return sum([alphabet_size**r for r in lengths])

    def dump(self, level=0):
        """Print this node's repr indented by ``level`` steps."""
        indent = StringGenerator.mytab * level
        print(indent + repr(self))

    def __str__(self):
        return f"start={self.start}, cnt={self.cnt}, chars={self.chars}"

    def __repr__(self):
        return f"{self.__class__.__name__}: start={self.start}, cnt={self.cnt}, chars={self.chars}"
|
|
384
|
+
|
|
385
|
+
class Source(StringNode):
    """Render a string from a caller-supplied value, collection or callable."""

    def __init__(self, source):
        """Store ``source``, the keyword name looked up at render time."""
        self.source = source

    def render(self, randomizer, **kwargs):
        """Render the value passed under the keyword named ``self.source``.

        The looked-up value is handled according to its type:

        * list / tuple: one element chosen with ``randomizer.choice``
        * set: one element chosen from the members ordered by their ``str()``
        * callable: the result of calling it with no arguments
        * generator: its next item
        * anything else: the value itself

        The result is always converted with ``str()``. A missing keyword
        renders as the empty string.
        """
        src = kwargs.get(self.source, "")
        if isinstance(src, set):
            # random choice needs an indexable sequence, which a set is not.
            # Ordering by str() also avoids relying on set iteration order
            # when picking with a seeded randomizer.
            return str(randomizer.choice(sorted(src, key=str)))
        if isinstance(src, (list, tuple)):
            return str(randomizer.choice(src))
        if callable(src):
            return str(src())
        if isinstance(src, types.GeneratorType):
            return str(next(src))
        return str(src)

    def count(self, randomizer, **kwargs):
        """Since a source name can be a callable, we can't say what the count
        is.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Cannot get count for source nodes")

    def dump(self, level=0):
        """Print ``$<name>`` indented by ``level`` steps."""
        print((StringGenerator.mytab * level) + "$%s" % self.source)

    def __repr__(self):
        return f"{self.__class__.__name__}: {self.source}"

    def __str__(self):
        # Must not call str(self): that re-enters __str__ and recurses forever.
        return str(self.source)
|
|
424
|
+
|
|
425
|
+
def __init__(self, pattern, uaf=10, randomizer=None, seed=None):
|
|
426
|
+
self.pattern = pattern
|
|
427
|
+
self.pos = 0
|
|
428
|
+
self.unique_attempts_factor = uaf
|
|
429
|
+
self.tokens = self._tokenize()
|
|
430
|
+
self.seq = self._parse()
|
|
431
|
+
if randomizer:
|
|
432
|
+
if not (
|
|
433
|
+
hasattr(randomizer, "randint")
|
|
434
|
+
and hasattr(randomizer, "choice")
|
|
435
|
+
and hasattr(randomizer, "choices")
|
|
436
|
+
and hasattr(randomizer, "shuffle")
|
|
437
|
+
):
|
|
438
|
+
Exception(
|
|
439
|
+
"The randomizer class instance must provide at least these methods: "
|
|
440
|
+
"randint, choice, choices, shuffle"
|
|
441
|
+
)
|
|
442
|
+
self.randomizer = randomizer
|
|
443
|
+
else:
|
|
444
|
+
self.randomizer = randomizer_factory(seed)
|
|
445
|
+
|
|
446
|
+
def getCharacterRange(self, f, t):
|
|
447
|
+
chars = ""
|
|
448
|
+
# support z-a as a range
|
|
449
|
+
if not ord(f) < ord(t):
|
|
450
|
+
f, t = t, f
|
|
451
|
+
if (ord(t) - ord(f)) > 10000: # protect against large sets ?
|
|
452
|
+
raise Exception("character range too large: %s - %s: %s" % (f, t, ord(t) - ord(f)))
|
|
453
|
+
for c in range(ord(f), ord(t) + 1):
|
|
454
|
+
chars += chr(c)
|
|
455
|
+
return chars
|
|
456
|
+
|
|
457
|
+
# ----- Tokenizer ------------------------------------------------------
|
|
458
|
+
|
|
459
|
+
# Structural metacharacters each map to their own token type. Everything
|
|
460
|
+
# else becomes a CHAR token. Crucially, a backslash escape is resolved
|
|
461
|
+
# exactly once here into a CHAR token (escaped=True), so the parser never
|
|
462
|
+
# sees a backslash and never has to guess whether a metacharacter was
|
|
463
|
+
# escaped via lookbehind.
|
|
464
|
+
# Lookup table for _tokenize: an unescaped occurrence of a key becomes a token
# whose type is the mapped name; every other character becomes a CHAR token.
_meta_token = {
    "[": "LBRACKET",
    "]": "RBRACKET",
    "{": "LBRACE",
    "}": "RBRACE",
    "(": "LPAREN",
    ")": "RPAREN",
    "|": "PIPE",
    "&": "AMP",
    "$": "DOLLAR",
}
|
|
475
|
+
|
|
476
|
+
def _tokenize(self):
    """Turn self.pattern into a flat list of tokens ending in EOF.

    A backslash consumes the following character and emits it as an escaped
    CHAR token. Metacharacters listed in ``_meta_token`` get their own token
    type; everything else is an ordinary CHAR token.
    """
    tokens = []
    stream = iter(self.pattern)
    for ch in stream:
        if ch == "\\":
            escaped_char = next(stream, None)
            # A trailing backslash escapes nothing; drop it.
            if escaped_char is not None:
                tokens.append(Token("CHAR", escaped_char, True))
            continue
        kind = self._meta_token.get(ch)
        tokens.append(Token(kind if kind else "CHAR", ch))
    tokens.append(Token("EOF", None))
    return tokens
|
|
500
|
+
|
|
501
|
+
# ----- Parser ---------------------------------------------------------
|
|
502
|
+
|
|
503
|
+
def _peek(self):
|
|
504
|
+
return self.tokens[self.pos]
|
|
505
|
+
|
|
506
|
+
def _advance(self):
|
|
507
|
+
tok = self.tokens[self.pos]
|
|
508
|
+
self.pos += 1
|
|
509
|
+
return tok
|
|
510
|
+
|
|
511
|
+
def _parse(self):
|
|
512
|
+
self.pos = 0
|
|
513
|
+
return self._parse_sequence(level=0)
|
|
514
|
+
|
|
515
|
+
def _parse_literal(self):
|
|
516
|
+
"""Consume a run of CHAR tokens into a single Literal node."""
|
|
517
|
+
chars = []
|
|
518
|
+
while self._peek().type == "CHAR":
|
|
519
|
+
chars.append(self._advance().value)
|
|
520
|
+
return StringGenerator.Literal("".join(chars))
|
|
521
|
+
|
|
522
|
+
def _parse_source(self):
|
|
523
|
+
"""Parse a ${identifier} source; '$' and '{' are the current tokens."""
|
|
524
|
+
self._advance() # $
|
|
525
|
+
self._advance() # {
|
|
526
|
+
chars = []
|
|
527
|
+
while True:
|
|
528
|
+
tok = self._advance()
|
|
529
|
+
if tok.type == "EOF":
|
|
530
|
+
raise StringGenerator.SyntaxError("unexpected end of input getting source")
|
|
531
|
+
if tok.type == "RBRACE":
|
|
532
|
+
break
|
|
533
|
+
chars.append(tok.value if tok.value is not None else "")
|
|
534
|
+
identifier = "".join(chars)
|
|
535
|
+
if not identifier or not identifier.isidentifier():
|
|
536
|
+
raise StringGenerator.SyntaxError("not a valid identifier: %s" % identifier)
|
|
537
|
+
return StringGenerator.Source(identifier)
|
|
538
|
+
|
|
539
|
+
def _parse_quantifier(self):
|
|
540
|
+
"""Parse a {m}, {m:n} or {m-n} quantifier; '{' is the current token."""
|
|
541
|
+
self._advance() # {
|
|
542
|
+
start = -1
|
|
543
|
+
digits = "0"
|
|
544
|
+
prev_was_separator = False
|
|
545
|
+
while True:
|
|
546
|
+
tok = self._advance()
|
|
547
|
+
if tok.type == "EOF":
|
|
548
|
+
raise StringGenerator.SyntaxError("unexpected end of input getting quantifier")
|
|
549
|
+
if tok.type == "RBRACE":
|
|
550
|
+
if prev_was_separator:
|
|
551
|
+
# the user likely expected python slice notation, where the
|
|
552
|
+
# upper bound may be left open; we require a closed range
|
|
553
|
+
raise StringGenerator.SyntaxError("quantifier range must be closed")
|
|
554
|
+
break
|
|
555
|
+
if tok.type == "CHAR" and tok.value in ":-":
|
|
556
|
+
start = int(digits)
|
|
557
|
+
digits = "0"
|
|
558
|
+
prev_was_separator = True
|
|
559
|
+
continue
|
|
560
|
+
if tok.type == "CHAR" and tok.value.isnumeric():
|
|
561
|
+
digits += tok.value
|
|
562
|
+
prev_was_separator = False
|
|
563
|
+
continue
|
|
564
|
+
raise StringGenerator.SyntaxError("non-digit in count")
|
|
565
|
+
return [start, int(digits)]
|
|
566
|
+
|
|
567
|
+
def _parse_character_class(self):
|
|
568
|
+
"""Parse a [...] class with individual members, ranges and shortcuts.
|
|
569
|
+
|
|
570
|
+
The current token is the opening '['.
|
|
571
|
+
"""
|
|
572
|
+
self._advance() # [
|
|
573
|
+
chars = []
|
|
574
|
+
closed = False
|
|
575
|
+
while True:
|
|
576
|
+
tok = self._peek()
|
|
577
|
+
if tok.type == "EOF":
|
|
578
|
+
# Unterminated class. The original parser tolerated this, so we
|
|
579
|
+
# keep that behavior rather than introduce a new error here.
|
|
580
|
+
break
|
|
581
|
+
if tok.type == "RBRACKET":
|
|
582
|
+
self._advance()
|
|
583
|
+
closed = True
|
|
584
|
+
break
|
|
585
|
+
if tok.type != "CHAR":
|
|
586
|
+
raise StringGenerator.SyntaxError("Un-escaped character in class definition: %s" % tok.value)
|
|
587
|
+
|
|
588
|
+
nxt = self.tokens[self.pos + 1]
|
|
589
|
+
if not tok.escaped and nxt.type == "CHAR" and not nxt.escaped and nxt.value == "-":
|
|
590
|
+
# a range: <near> '-' <far>
|
|
591
|
+
near = self._advance().value
|
|
592
|
+
self._advance() # hyphen
|
|
593
|
+
far = self._advance()
|
|
594
|
+
if far.type != "CHAR":
|
|
595
|
+
raise StringGenerator.SyntaxError("unexpected end of class range")
|
|
596
|
+
chars.append(self.getCharacterRange(near, far.value))
|
|
597
|
+
continue
|
|
598
|
+
|
|
599
|
+
if tok.escaped and tok.value in self.string_code:
|
|
600
|
+
chars.append(self.string_code[tok.value])
|
|
601
|
+
else:
|
|
602
|
+
chars.append(tok.value)
|
|
603
|
+
self._advance()
|
|
604
|
+
|
|
605
|
+
text = "".join(chars)
|
|
606
|
+
if not text:
|
|
607
|
+
raise StringGenerator.SyntaxError("empty character class")
|
|
608
|
+
|
|
609
|
+
if closed and self._peek().type == "LBRACE":
|
|
610
|
+
start, cnt = self._parse_quantifier()
|
|
611
|
+
elif closed:
|
|
612
|
+
start, cnt = -1, 1
|
|
613
|
+
else:
|
|
614
|
+
# unterminated class: original left start=0 (renders 0 or 1 char)
|
|
615
|
+
start, cnt = 0, 1
|
|
616
|
+
return StringGenerator.CharacterSet(text, start, cnt)
|
|
617
|
+
|
|
618
|
+
def _parse_sequence(self, level=0):
    """Parse a sequence of nodes, honoring the '|' and '&' operators.

    The operator handling mirrors the original parser: operands are
    gathered onto a stack and committed into a SequenceOR/SequenceAND
    whenever the operator changes or a new operand group begins.

    Args:
        level: nesting depth; 0 for the top level, incremented for every
            parenthesised group.

    Returns:
        A ``StringGenerator.Sequence`` wrapping the parsed nodes.

    Raises:
        StringGenerator.SyntaxError: on an extra or missing closing
            parenthesis, or on an un-escaped '{', '}' or stray '$'.
    """
    operand_stack = []  # operands collected for the operator currently in effect
    op = None  # "|" or "&" while an operator group is open, otherwise None
    seq = []  # finished nodes of this sequence, in order

    def commit_operands():
        """Wrap the collected operands in an OR/AND node and reset the state."""
        nonlocal operand_stack, op, seq
        if op and operand_stack:
            klass = StringGenerator.SequenceOR if op == "|" else StringGenerator.SequenceAND
            seq.append(klass(operand_stack[:]))
            operand_stack = []
            op = None

    # Track whether the previously consumed token was a binary operator so
    # we can tell if a new '[', '(' or '${' opens a fresh operand group.
    prev_exists = False
    prev_is_operator = False
    sequence_closed = False

    while True:
        tok = self._peek()
        t = tok.type

        if t == "EOF":
            break
        elif t == "CHAR":
            seq.append(self._parse_literal())
            prev_exists, prev_is_operator = True, False
        elif t == "DOLLAR" and self.tokens[self.pos + 1].type == "LBRACE":
            # '$' is never the last token (EOF follows), so pos + 1 is valid.
            if prev_exists and not prev_is_operator:
                commit_operands()
            seq.append(self._parse_source())
            prev_exists, prev_is_operator = True, False
        elif t == "LBRACKET":
            if prev_exists and not prev_is_operator:
                commit_operands()
            seq.append(self._parse_character_class())
            prev_exists, prev_is_operator = True, False
        elif t == "LPAREN":
            if prev_exists and not prev_is_operator:
                commit_operands()
            self._advance()  # (
            # The recursive call consumes everything up to and including ')'.
            seq.append(self._parse_sequence(level + 1))
            prev_exists, prev_is_operator = True, False
        elif t == "RPAREN":
            if level == 0:
                raise StringGenerator.SyntaxError("Extra closing parenthesis")
            self._advance()  # )
            sequence_closed = True
            break
        elif t in ("PIPE", "AMP"):
            if op and not op == tok.value:
                # operator switched; flush the pending operand group
                commit_operands()
            op = tok.value
            self._advance()
            prev_exists, prev_is_operator = True, True
        else:
            # LBRACE, RBRACE, or a '$' not introducing a source.
            raise StringGenerator.SyntaxError("Un-escaped special character: %s" % tok.value)

        # While an operator is in effect, the most recent node is an operand:
        # move it from the sequence onto the operand stack.
        if op and len(seq):
            operand_stack.append(seq.pop())

    # Flush whatever operand group is still open at the end of the sequence.
    commit_operands()

    if level > 0 and not sequence_closed:
        # finishing a nested sequence without a closing parenthesis
        raise StringGenerator.SyntaxError("Missing closing parenthesis")

    return StringGenerator.Sequence(seq)
|
|
695
|
+
|
|
696
|
+
def render(self, **kwargs) -> str:
|
|
697
|
+
"""Produce a randomized string that fits the template/pattern.
|
|
698
|
+
|
|
699
|
+
Args:
|
|
700
|
+
None
|
|
701
|
+
|
|
702
|
+
Returns:
|
|
703
|
+
The generated string.
|
|
704
|
+
|
|
705
|
+
"""
|
|
706
|
+
return self.seq.render(self.randomizer, **kwargs)
|
|
707
|
+
|
|
708
|
+
def count(self, **kwargs) -> int:
|
|
709
|
+
return self.seq.count(self.randomizer, **kwargs)
|
|
710
|
+
|
|
711
|
+
def dump(self, cnt=None, **kwargs):
    """Print the parse tree and then call render for an example.

    Args:
        cnt: when truthy, return ``render_list(cnt, **kwargs)`` instead of a
            single rendered string.
        **kwargs: forwarded to ``count`` and to the render call.

    Returns:
        One rendered string, or a list of them when ``cnt`` is given.
    """
    import sys

    if not self.seq:
        self.seq = self._parse()
    print("StringGenerator version: %s" % (__version__))
    print("Python version: %s" % sys.version)
    print(f"Random method provider class: {self.randomizer.__class__.__name__}")
    self.seq.dump()
    try:
        # Forward kwargs: an '&' group counts by rendering its operands,
        # which needs the source values.
        outcome_count = self.count(**kwargs)
    except NotImplementedError:
        # Source nodes cannot report a count; say so instead of letting the
        # whole dump fail for any template that contains ${...}.
        outcome_count = "unknown (template contains a source)"
    print(f"Potential outcome count: {outcome_count}")
    print("Example result:")
    if cnt:
        return self.render_list(cnt, **kwargs)
    return self.render(**kwargs)
|
|
726
|
+
|
|
727
|
+
def render_list(self, cnt, unique=False, progress_callback=None, **kwargs) -> typing.List:
|
|
728
|
+
"""Return a list of generated strings.
|
|
729
|
+
|
|
730
|
+
Args:
|
|
731
|
+
cnt (int): length of list
|
|
732
|
+
unique (bool): whether to make entries unique
|
|
733
|
+
progress_callback: callable
|
|
734
|
+
|
|
735
|
+
Returns:
|
|
736
|
+
list.
|
|
737
|
+
|
|
738
|
+
We keep track of total attempts because a template may
|
|
739
|
+
specify something impossible to attain, like [1-9]{} with cnt==1000
|
|
740
|
+
|
|
741
|
+
"""
|
|
742
|
+
|
|
743
|
+
rendered_list = []
|
|
744
|
+
i = 0
|
|
745
|
+
total_attempts = 0
|
|
746
|
+
while True:
|
|
747
|
+
if i >= cnt:
|
|
748
|
+
break
|
|
749
|
+
if total_attempts > cnt * self.unique_attempts_factor:
|
|
750
|
+
raise StringGenerator.UniquenessError("couldn't satisfy uniqueness")
|
|
751
|
+
s = self.render(**kwargs)
|
|
752
|
+
if unique:
|
|
753
|
+
if s not in rendered_list:
|
|
754
|
+
rendered_list.append(s)
|
|
755
|
+
i += 1
|
|
756
|
+
else:
|
|
757
|
+
rendered_list.append(s)
|
|
758
|
+
i += 1
|
|
759
|
+
total_attempts += 1
|
|
760
|
+
|
|
761
|
+
# Optionally trigger the progress indicator to inform others about our progress
|
|
762
|
+
if progress_callback and callable(progress_callback):
|
|
763
|
+
progress_callback(i, cnt)
|
|
764
|
+
|
|
765
|
+
return rendered_list
|
|
766
|
+
|
|
767
|
+
def render_set(self, cnt, **kwargs) -> typing.Set:
|
|
768
|
+
"""Return a set of generated strings that will as a result be unique.
|
|
769
|
+
|
|
770
|
+
Args:
|
|
771
|
+
cnt (int): length of list
|
|
772
|
+
|
|
773
|
+
Returns:
|
|
774
|
+
set
|
|
775
|
+
|
|
776
|
+
This is like `render_list(n, unique=True)` but will not take a callback and returns a set.
|
|
777
|
+
It will be much faster than `render_list()`.
|
|
778
|
+
|
|
779
|
+
Caution: this will not check if the solution set is
|
|
780
|
+
feasible. It will be stuck in a loop if you use a large sample space . The following
|
|
781
|
+
will never complete:
|
|
782
|
+
|
|
783
|
+
SG("[123]{2}").render_set(100)
|
|
784
|
+
|
|
785
|
+
"""
|
|
786
|
+
|
|
787
|
+
results: typing.Set = set()
|
|
788
|
+
while len(results) < cnt:
|
|
789
|
+
results |= {self.render(**kwargs) for _ in range(cnt - len(results))}
|
|
790
|
+
|
|
791
|
+
return results
|
|
792
|
+
|
|
793
|
+
def __str__(self):
|
|
794
|
+
return self.render()
|
|
795
|
+
|
|
796
|
+
def __repr__(self):
|
|
797
|
+
return f"{self.__class__.__name__}, {self.pattern}, {self.randomizer.__class__.__name__}"
|