StringGenerator 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
strgen/__init__.py ADDED
@@ -0,0 +1,797 @@
1
+ # Copyright (c) 2013-2020, Paul Wolf
2
+ # All rights reserved.
3
+
4
+ # Redistribution and use in source and binary forms, with or without
5
+ # modification, are permitted provided that the following conditions are
6
+ # met:
7
+
8
+ # 1. Redistributions of source code must retain the above copyright
9
+ # notice, this list of conditions and the following disclaimer.
10
+
11
+ # 2. Redistributions in binary form must reproduce the above copyright
12
+ # notice, this list of conditions and the following disclaimer in the
13
+ # documentation and/or other materials provided with the distribution.
14
+
15
+ # 3. Neither the name of Yewleaf Ltd. nor the names of its contributors
16
+ # may be used to endorse or promote products derived from this software
17
+ # without specific prior written permission.
18
+
19
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22
+ # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23
+ # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27
+ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
+
31
+ # Original author: paul.wolf@yewleaf.com
32
+
33
+
34
+ import os
35
+ import random
36
+ import string
37
+ import types
38
+ import typing
39
+ import math
40
+ import itertools
41
+ from abc import ABC, abstractmethod
42
+ from collections import Counter, namedtuple
43
+ from math import factorial
44
+
45
__version__ = "0.5.0"
__author__ = "Paul Wolf"
__license__ = "BSD"


# Lexer token record.
#
# ``type``    -- a structural kind (LBRACKET, PIPE, ...) or "CHAR" / "EOF".
# ``value``   -- the source character the token was built from.
# ``escaped`` -- only meaningful for CHAR tokens: True when the character was
#                written after a backslash. It lets the parser distinguish a
#                literal "[" from a class opener without looking at
#                backslashes again. Defaults to False.
Token = namedtuple(
    "Token",
    ("type", "value", "escaped"),
    defaults=(False,),
)
55
+
56
+
57
def permutation_count(s):
    """Count the distinct orderings of the items in ``s``.

    The result is the multinomial coefficient
    ``len(s)! / (n1! * n2! * ...)``, where each ``n`` is how many times one
    distinct item occurs in ``s``.

    ``math.perm`` is not used because it does not exist before Python 3.8.
    https://codereview.stackexchange.com/questions/132704/counting-permutations-without-repetitions-for-a-number-or-a-string
    """
    arrangements = factorial(len(s))
    for multiplicity in Counter(s).values():
        # Every intermediate quotient is itself an integer (a multinomial
        # coefficient times a factorial), so floor division loses nothing.
        arrangements //= factorial(multiplicity)
    return arrangements
66
+
67
+
68
def randomizer_factory(seed) -> random.Random:
    """Return an object that provides randint, choice, choices and shuffle.

    Args:
        seed: any value accepted by ``random.Random``, or ``None`` to request
            an unseeded, OS-entropy-backed generator.

    Returns:
        ``random.Random(seed)`` when a seed is given, so output is
        reproducible; otherwise ``random.SystemRandom()``, falling back to an
        unseeded ``random.Random()`` if that cannot be created.
    """
    # Compare against None rather than testing truthiness: 0 (and other falsy
    # values) are perfectly valid seeds and must still give a deterministic
    # generator.
    if seed is not None:
        return random.Random(seed)
    try:
        return random.SystemRandom()
    except Exception:
        # Best effort: keep working even where OS entropy is unavailable.
        return random.Random()
80
+
81
+
82
class BufferedSecureRandom(random.Random):
    """Cryptographically secure RNG that buffers ``os.urandom`` in bulk.

    ``random.SystemRandom`` reads from the OS entropy pool on every draw, which
    means one syscall per random value. Generating large batches (e.g.
    ``render_set(1_000_000)``) then spends most of its time in the kernel.

    This class draws the exact same entropy -- raw ``os.urandom`` bytes, each
    consumed once and never expanded by a userspace PRNG -- but reads it in big
    chunks, amortizing the syscall across many values. It is therefore as
    secure as ``SystemRandom`` (suitable for tokens, passwords, keys) while
    being substantially faster for bulk generation.

    Pass it via the ``randomizer`` argument; it is reachable without an extra
    import as ``StringGenerator.BufferedSecureRandom``::

        SG(r"[\\w\\p]{32}", randomizer=SG.BufferedSecureRandom()).render_set(50000)

    Being entropy-based, it ignores seeding.
    """

    def __init__(self, bufsize=1 << 20):
        """Create the generator.

        Args:
            bufsize: minimum number of bytes requested from ``os.urandom``
                each time the internal buffer is refilled.
        """
        self._buf = b""  # current chunk of entropy
        self._i = 0  # offset of the first unused byte in ``_buf``
        self._bufsize = bufsize
        super().__init__()

    def _take(self, n):
        """Return n fresh random bytes, refilling the buffer when needed."""
        if self._i + n > len(self._buf):
            # Not enough unused bytes left: fetch a new chunk (at least n
            # bytes). Whatever remained of the old chunk is discarded.
            self._buf = os.urandom(max(n, self._bufsize))
            self._i = 0
        chunk = self._buf[self._i : self._i + n]
        self._i += n
        return chunk

    def random(self):
        """Return a 53-bit float in [0.0, 1.0), as SystemRandom.random does."""
        # 7 bytes are 56 bits; dropping the low 3 leaves the 53 bits used.
        return (int.from_bytes(self._take(7), "big") >> 3) * (2.0**-53)

    def getrandbits(self, k):
        """Return a non-negative int built from ``k`` random bits.

        ``k == 0`` yields 0, matching ``random.Random.getrandbits`` (Python
        3.9+); the inherited ``randbytes(0)`` relies on that.

        Raises:
            ValueError: if ``k`` is negative.
        """
        if k < 0:
            raise ValueError("number of bits must be non-negative")
        nbytes = (k + 7) // 8
        # Whole bytes are drawn; shift away the surplus low-order bits.
        return int.from_bytes(self._take(nbytes), "big") >> (nbytes * 8 - k)

    def choices(self, population, weights=None, *, cum_weights=None, k=1):
        """Return a list of ``k`` elements drawn with replacement.

        Unweighted draws map bytes straight to indices via rejection
        sampling, skipping the per-pick float construction in random(). This is
        the hot path for character sets. Weighted draws, an empty population,
        or alphabets larger than one byte fall back to the standard
        implementation (which still uses our random()).
        """
        n = len(population)
        if weights is not None or cum_weights is not None or n == 0 or n > 256:
            return super().choices(population, weights, cum_weights=cum_weights, k=k)
        # Largest multiple of n that is <= 256. Bytes at or above it are
        # rejected so that ``byte % n`` stays uniformly distributed.
        limit = 256 - (256 % n)
        out = []
        take = self._take
        while len(out) < k:
            # Exactly as many bytes as are still missing are requested, so a
            # pass can never overfill ``out``; rejected bytes are made up for
            # on the next pass.
            out.extend(population[byte % n] for byte in take(k - len(out)) if byte < limit)
        return out

    def seed(self, *args, **kwargs):
        """No-op: entropy-based, so there is no seed state to set."""

    def _notimplemented(self, *args, **kwargs):
        """Shared body for getstate/setstate, which cannot be supported."""
        raise NotImplementedError("BufferedSecureRandom is entropy-based; state cannot be saved or restored")

    getstate = setstate = _notimplemented
157
+
158
+
159
class StringGenerator:
    """Generate a randomized string of characters using a template.

    The purpose of this class is to generate a string of characters
    according to a template. The template language is superficially
    similar to regular expressions but fundamentally different in
    purpose.

    Usage:

        StringGenerator(<template>).render()

    or:

        StringGenerator(<template>).render_list(10, unique=True)

    The latter produces a list of 10 strings that are unique within
    the list.

    Example:

        `StringGenerator(r"[\\d]{10}").render_list(10, unique=True)`

    This generates 10 unique strings containing digits. Each will be
    10 characters in length.

    """

    # Per-instance; assigned in __init__. Declared here only as a type hint so
    # it is never a shared class attribute (which would let one instance clobber
    # another's RNG and break seeded determinism / thread-safety).
    randomizer: typing.Optional[random.Random]

    # Exposed here so callers can opt into the fast secure RNG without a second
    # import: SG(pattern, randomizer=SG.BufferedSecureRandom()).
    BufferedSecureRandom = BufferedSecureRandom

    class SyntaxError(Exception):
        """Raised when the template cannot be parsed."""

    class UniquenessError(Exception):
        """Raised when the template can't generate the required list count."""

    meta_chars = "[]{}()|&$"
    mytab = " " * 4  # one indentation step used by the dump() methods

    # Escaped shortcut letter inside a character class -> characters it adds.
    string_code = {
        "d": string.digits,
        "w": "_" + string.ascii_letters + string.digits,
        "W": string.whitespace + string.punctuation,
        "s": string.whitespace,
        "p": string.punctuation,
        "l": string.ascii_letters,
        "u": string.ascii_uppercase,
        "U": string.ascii_uppercase,
        "c": string.ascii_lowercase,
        "o": string.octdigits,
        "h": string.hexdigits,
        "r": string.printable,
    }
    # Human-readable description of each shortcut in ``string_code``.
    string_code_help = {
        "d": "digits",
        "w": "_" + "ascii_letters + digits",
        "W": "whitespace + punctuation",
        "s": "whitespace",
        "p": "punctuation",
        "l": "ascii_letters",
        "u": "ascii_uppercase",
        "U": "ascii_uppercase",
        "c": "ascii_lowercase",
        "o": "octdigits",
        "h": "hexdigits",
        "r": "printable",
    }

    class StringNode(ABC):
        """The abstract class for all nodes of the parse tree."""

        @abstractmethod
        def render(self, randomizer, **kwargs):
            """Return one string produced by this node."""

        @abstractmethod
        def count(self, randomizer, **kwargs):
            """Return how many different strings this node can produce."""

        @abstractmethod
        def dump(self, level=0):
            """Print a description of this node indented by ``level`` steps."""

    class Sequence(StringNode):
        """Render a sequence of nodes from the template."""

        def __init__(self, seq):
            """seq is a list of nodes, rendered in order."""
            self.seq = seq  # list of StringNodes

        def render(self, randomizer, **kwargs):
            """Return the concatenation of every child's rendering."""
            return "".join([x.render(randomizer, **kwargs) for x in self.seq])

        def count(self, randomizer, **kwargs):
            """Return the product of the children's counts.

            P x P x P...
            The cumulative product.
            """
            total = 1
            for node in self.seq:
                total *= node.count(randomizer, **kwargs)
            return total

        def dump(self, level=-1):
            """Print this node's class name, then each child one level deeper."""
            print((StringGenerator.mytab * level) + f"{self.__class__.__name__}")
            for s in self.seq:
                s.dump(level + 1)

    class SequenceOR(Sequence):
        """Randomly choose from operands."""

        def render(self, randomizer, **kwargs):
            """Return the rendering of one randomly chosen operand."""
            return self.seq[randomizer.randint(0, len(self.seq) - 1)].render(randomizer, **kwargs)

        def count(self, randomizer, **kwargs):
            """Return the sum of the operands' counts."""
            return sum([x.count(randomizer, **kwargs) for x in self.seq])

        def dump(self, level=-1):
            """Print this node, then each operand one level deeper."""
            print((StringGenerator.mytab * level) + repr(self))
            for s in self.seq:
                s.dump(level + 1)

        def __repr__(self):
            return f"{self.__class__.__name__}"

        def __str__(self):
            return "OR"

    class SequenceAND(Sequence):
        """Render a permutation without replacement
        of characters from operands.
        """

        def render(self, randomizer, **kwargs):
            """Return a permutation without replacement of all characters in seq."""
            char_list = list("".join([x.render(randomizer, **kwargs) for x in self.seq]))
            randomizer.shuffle(char_list)
            return "".join(char_list)

        def count(self, randomizer, **kwargs):
            """Return the permutation count of one sample rendering.

            The operands are rendered once and the distinct orderings of the
            resulting characters are counted, so this does not work for
            complex expressions whose operands can themselves vary.
            """
            char_list = list("".join([x.render(randomizer, **kwargs) for x in self.seq]))
            return permutation_count(char_list)

        def dump(self, level=-1):
            """Print this node, then each operand one level deeper."""
            print((StringGenerator.mytab * level) + repr(self))
            for s in self.seq:
                s.dump(level + 1)

        def __str__(self):
            return "AND"

        def __repr__(self):
            return f"{self.__class__.__name__}"

    class Literal(StringNode):
        """Render a literal string."""

        def __init__(self, chars):
            self.literal = chars  # a literal string

        def render(self, randomizer, **kwargs):
            """Return the literal unchanged; the randomizer is not used."""
            return self.literal

        def count(self, randomizer, **kwargs):
            """A literal has exactly one possible rendering."""
            return 1

        def dump(self, level=0):
            print((StringGenerator.mytab * level) + repr(self))

        def __str__(self):
            return self.literal

        def __repr__(self):
            return f"{self.__class__.__name__}: {self.literal}"

    class CharacterSet(StringNode):
        """Render a random combination from a set of characters."""

        def __init__(self, chars, start, cnt):
            """Store the alphabet and the length specification.

            Args:
                chars: string of candidate characters.
                start: lower length bound, or a negative value for a fixed
                    length of ``cnt``.
                cnt: the fixed length, or the upper length bound.

            Raises:
                ValueError / TypeError: if start or cnt cannot be converted
                    by ``int()``.
            """
            self.chars = chars
            self.start = int(start)
            self.cnt = int(cnt)

        def render(self, randomizer, **kwargs):
            """Return a random string over ``chars`` of the configured length."""
            if self.start > -1:
                # ranged quantifier: pick the length first
                cnt = randomizer.randint(self.start, self.cnt)
            else:
                cnt = self.cnt

            # choices() draws all cnt characters in a single call rather than
            # making one randint() call per character.
            return "".join(randomizer.choices(self.chars, k=cnt))

        def count(self, randomizer, **kwargs):
            """Permutation with replacement.

            ``c ** r`` for a fixed length, or the cumulative sum of
            ``c ** r`` over every length in the range.
            """
            if self.start < 0:
                # fixed length
                return len(self.chars) ** self.cnt
            # range
            return sum([len(self.chars) ** r for r in range(self.start, self.cnt + 1)])

        def dump(self, level=0):
            print(StringGenerator.mytab * level + repr(self))

        def __str__(self):
            return f"start={self.start}, cnt={self.cnt}, chars={self.chars}"

        def __repr__(self):
            return f"{self.__class__.__name__}: start={self.start}, cnt={self.cnt}, chars={self.chars}"

    class Source(StringNode):
        """Render a string from a generator, list, function."""

        def __init__(self, source):
            self.source = source  # keyword-argument name looked up at render time

        def render(self, randomizer, **kwargs):
            """Return a string taken from the keyword argument named ``source``.

            A list, tuple or set yields one randomly chosen element, a
            callable is called, a generator is advanced once, and anything
            else is converted with ``str``. A missing keyword renders as "".
            """
            src = kwargs.get(self.source, "")
            if isinstance(src, set):
                # random.choice needs a subscriptable sequence; a set is not
                # one. Elements are taken in the set's own iteration order.
                src = tuple(src)
            if isinstance(src, (list, tuple)):
                return str(randomizer.choice(src))
            if callable(src):
                return str(src())
            if isinstance(src, types.GeneratorType):
                return str(next(src))
            return str(src)

        def count(self, randomizer, **kwargs):
            """Since a source name can be a callable, we can't say what the count
            is.

            """
            raise NotImplementedError("Cannot get count for source nodes")

        def dump(self, level=0):
            print((StringGenerator.mytab * level) + "$%s" % self.source)

        def __repr__(self):
            return f"{self.__class__.__name__}: {self.source}"

        def __str__(self):
            # Delegate to __repr__; calling str(self) here would recurse
            # without end.
            return repr(self)

    def __init__(self, pattern, uaf=10, randomizer=None, seed=None):
        """Tokenize and parse ``pattern`` and select the random source.

        Args:
            pattern: the template string.
            uaf: "unique attempts factor"; render_list() gives up once it has
                made more than ``cnt * uaf`` attempts.
            randomizer: optional object providing randint, choice, choices
                and shuffle. When given, ``seed`` is not used.
            seed: optional seed passed to randomizer_factory() when no
                randomizer is supplied.

        Raises:
            StringGenerator.SyntaxError: if the pattern is malformed.
            TypeError: if ``randomizer`` lacks one of the required methods.
        """
        self.pattern = pattern
        self.pos = 0
        self.unique_attempts_factor = uaf
        self.tokens = self._tokenize()
        self.seq = self._parse()
        if randomizer:
            required = ("randint", "choice", "choices", "shuffle")
            if not all(hasattr(randomizer, name) for name in required):
                # Fail here, at construction, rather than with an obscure
                # AttributeError in the middle of a later render().
                raise TypeError(
                    "The randomizer class instance must provide at least these methods: "
                    "randint, choice, choices, shuffle"
                )
            self.randomizer = randomizer
        else:
            self.randomizer = randomizer_factory(seed)

    def getCharacterRange(self, f, t):
        """Return every character from ``f`` to ``t`` inclusive as a string.

        The bounds may be given in either order (``z-a`` equals ``a-z``).

        Raises:
            Exception: if the range spans more than 10000 code points.
        """
        # support z-a as a range
        if not ord(f) < ord(t):
            f, t = t, f
        if (ord(t) - ord(f)) > 10000:  # protect against large sets ?
            raise Exception("character range too large: %s - %s: %s" % (f, t, ord(t) - ord(f)))
        return "".join(map(chr, range(ord(f), ord(t) + 1)))

    # ----- Tokenizer ------------------------------------------------------

    # Structural metacharacters each map to their own token type. Everything
    # else becomes a CHAR token. Crucially, a backslash escape is resolved
    # exactly once here into a CHAR token (escaped=True), so the parser never
    # sees a backslash and never has to guess whether a metacharacter was
    # escaped via lookbehind.
    _meta_token = {
        "[": "LBRACKET",
        "]": "RBRACKET",
        "{": "LBRACE",
        "}": "RBRACE",
        "(": "LPAREN",
        ")": "RPAREN",
        "|": "PIPE",
        "&": "AMP",
        "$": "DOLLAR",
    }

    def _tokenize(self):
        """Turn self.pattern into a flat list of tokens ending in EOF."""
        tokens = []
        pattern = self.pattern
        i = 0
        n = len(pattern)
        while i < n:
            ch = pattern[i]
            if ch == "\\":
                if i + 1 < n:
                    # The escaped character is always a plain CHAR, even when
                    # it is a metacharacter.
                    tokens.append(Token("CHAR", pattern[i + 1], True))
                    i += 2
                else:
                    # A trailing backslash escapes nothing; drop it.
                    i += 1
                continue
            kind = self._meta_token.get(ch)
            if kind:
                tokens.append(Token(kind, ch))
            else:
                tokens.append(Token("CHAR", ch))
            i += 1
        tokens.append(Token("EOF", None))
        return tokens

    # ----- Parser ---------------------------------------------------------

    def _peek(self):
        """Return the current token without consuming it."""
        return self.tokens[self.pos]

    def _advance(self):
        """Consume and return the current token."""
        tok = self.tokens[self.pos]
        self.pos += 1
        return tok

    def _parse(self):
        """Parse self.tokens from the beginning and return the root Sequence."""
        self.pos = 0
        return self._parse_sequence(level=0)

    def _parse_literal(self):
        """Consume a run of CHAR tokens into a single Literal node."""
        chars = []
        while self._peek().type == "CHAR":
            chars.append(self._advance().value)
        return StringGenerator.Literal("".join(chars))

    def _parse_source(self):
        """Parse a ${identifier} source; '$' and '{' are the current tokens.

        Raises:
            StringGenerator.SyntaxError: if the closing '}' is missing or the
                enclosed text is not a valid Python identifier.
        """
        self._advance()  # $
        self._advance()  # {
        chars = []
        while True:
            tok = self._advance()
            if tok.type == "EOF":
                raise StringGenerator.SyntaxError("unexpected end of input getting source")
            if tok.type == "RBRACE":
                break
            chars.append(tok.value if tok.value is not None else "")
        identifier = "".join(chars)
        if not identifier or not identifier.isidentifier():
            raise StringGenerator.SyntaxError("not a valid identifier: %s" % identifier)
        return StringGenerator.Source(identifier)

    def _parse_quantifier(self):
        """Parse a {m}, {m:n} or {m-n} quantifier; '{' is the current token.

        Returns:
            ``[start, cnt]`` where start is -1 for a fixed-length ``{m}``.

        Raises:
            StringGenerator.SyntaxError: on a missing '}', an open-ended
                range, or any character that is not a decimal digit or a
                ':' / '-' separator.
        """
        self._advance()  # {
        start = -1
        digits = "0"  # leading "0" lets an empty number convert to 0
        prev_was_separator = False
        while True:
            tok = self._advance()
            if tok.type == "EOF":
                raise StringGenerator.SyntaxError("unexpected end of input getting quantifier")
            if tok.type == "RBRACE":
                if prev_was_separator:
                    # the user likely expected python slice notation, where the
                    # upper bound may be left open; we require a closed range
                    raise StringGenerator.SyntaxError("quantifier range must be closed")
                break
            if tok.type == "CHAR" and tok.value in ":-":
                start = int(digits)
                digits = "0"
                prev_was_separator = True
                continue
            # isdecimal(), not isnumeric(): the latter also admits characters
            # such as fractions or superscripts that int() cannot convert,
            # which would surface as a ValueError instead of a SyntaxError.
            if tok.type == "CHAR" and tok.value.isdecimal():
                digits += tok.value
                prev_was_separator = False
                continue
            raise StringGenerator.SyntaxError("non-digit in count")
        return [start, int(digits)]

    def _parse_character_class(self):
        """Parse a [...] class with individual members, ranges and shortcuts.

        The current token is the opening '['. An optional quantifier directly
        after the closing ']' is consumed as well.

        Raises:
            StringGenerator.SyntaxError: on an unescaped metacharacter inside
                the class, an incomplete range, or an empty class.
        """
        self._advance()  # [
        chars = []
        closed = False
        while True:
            tok = self._peek()
            if tok.type == "EOF":
                # Unterminated class. The original parser tolerated this, so we
                # keep that behavior rather than introduce a new error here.
                break
            if tok.type == "RBRACKET":
                self._advance()
                closed = True
                break
            if tok.type != "CHAR":
                raise StringGenerator.SyntaxError("Un-escaped character in class definition: %s" % tok.value)

            # tok is not EOF, so at least the EOF token follows it.
            nxt = self.tokens[self.pos + 1]
            if not tok.escaped and nxt.type == "CHAR" and not nxt.escaped and nxt.value == "-":
                # a range: <near> '-' <far>
                near = self._advance().value
                self._advance()  # hyphen
                far = self._advance()
                if far.type != "CHAR":
                    raise StringGenerator.SyntaxError("unexpected end of class range")
                chars.append(self.getCharacterRange(near, far.value))
                continue

            if tok.escaped and tok.value in self.string_code:
                # escaped shortcut such as \d or \w
                chars.append(self.string_code[tok.value])
            else:
                chars.append(tok.value)
            self._advance()

        text = "".join(chars)
        if not text:
            raise StringGenerator.SyntaxError("empty character class")

        if closed and self._peek().type == "LBRACE":
            start, cnt = self._parse_quantifier()
        elif closed:
            start, cnt = -1, 1
        else:
            # unterminated class: original left start=0 (renders 0 or 1 char)
            start, cnt = 0, 1
        return StringGenerator.CharacterSet(text, start, cnt)

    def _parse_sequence(self, level=0):
        """Parse a sequence of nodes, honoring the '|' and '&' operators.

        The operator handling mirrors the original parser: operands are
        gathered onto a stack and committed into a SequenceOR/SequenceAND
        whenever the operator changes or a new operand group begins.

        Args:
            level: nesting depth; 0 for the top level, incremented for each
                parenthesized group.

        Raises:
            StringGenerator.SyntaxError: on unbalanced parentheses or an
                unescaped '{', '}' or stray '$'.
        """
        operand_stack = []
        op = None
        seq = []

        def commit_operands():
            """Fold the pending operands into one OR/AND node appended to seq."""
            nonlocal operand_stack, op, seq
            if op and operand_stack:
                klass = StringGenerator.SequenceOR if op == "|" else StringGenerator.SequenceAND
                seq.append(klass(operand_stack[:]))
                operand_stack = []
                op = None

        # Track whether the previously consumed token was a binary operator so
        # we can tell if a new '[', '(' or '${' opens a fresh operand group.
        prev_exists = False
        prev_is_operator = False
        sequence_closed = False

        while True:
            tok = self._peek()
            t = tok.type

            if t == "EOF":
                break
            elif t == "CHAR":
                seq.append(self._parse_literal())
                prev_exists, prev_is_operator = True, False
            elif t == "DOLLAR" and self.tokens[self.pos + 1].type == "LBRACE":
                if prev_exists and not prev_is_operator:
                    commit_operands()
                seq.append(self._parse_source())
                prev_exists, prev_is_operator = True, False
            elif t == "LBRACKET":
                if prev_exists and not prev_is_operator:
                    commit_operands()
                seq.append(self._parse_character_class())
                prev_exists, prev_is_operator = True, False
            elif t == "LPAREN":
                if prev_exists and not prev_is_operator:
                    commit_operands()
                self._advance()  # (
                seq.append(self._parse_sequence(level + 1))
                prev_exists, prev_is_operator = True, False
            elif t == "RPAREN":
                if level == 0:
                    raise StringGenerator.SyntaxError("Extra closing parenthesis")
                self._advance()  # )
                sequence_closed = True
                break
            elif t in ("PIPE", "AMP"):
                if op and not op == tok.value:
                    # operator switched; flush the pending operand group
                    commit_operands()
                op = tok.value
                self._advance()
                prev_exists, prev_is_operator = True, True
            else:
                # LBRACE, RBRACE, or a '$' not introducing a source.
                raise StringGenerator.SyntaxError("Un-escaped special character: %s" % tok.value)

            # While an operator is active, the most recent node is one of its
            # operands: move it from the output onto the operand stack.
            if op and len(seq):
                operand_stack.append(seq.pop())

        commit_operands()

        if level > 0 and not sequence_closed:
            # finishing a nested sequence without a closing parenthesis
            raise StringGenerator.SyntaxError("Missing closing parenthesis")

        return StringGenerator.Sequence(seq)

    def render(self, **kwargs) -> str:
        """Produce a randomized string that fits the template/pattern.

        Args:
            **kwargs: values for the ``${name}`` sources used in the template.

        Returns:
            The generated string.

        """
        return self.seq.render(self.randomizer, **kwargs)

    def count(self, **kwargs) -> int:
        """Return the number of different strings the template can produce.

        Raises:
            NotImplementedError: if the template contains a ``${name}`` source.
        """
        return self.seq.count(self.randomizer, **kwargs)

    def dump(self, cnt=None, **kwargs):
        """Print the parse tree and then call render for an example.

        Args:
            cnt: when truthy, return ``render_list(cnt, **kwargs)`` instead of
                a single rendered string.
        """
        import sys

        if not self.seq:
            self.seq = self._parse()
        print("StringGenerator version: %s" % (__version__))
        print("Python version: %s" % sys.version)
        print(f"Random method provider class: {self.randomizer.__class__.__name__}")
        self.seq.dump()
        print(f"Potential outcome count: {self.count()}")
        print("Example result:")
        if cnt:
            return self.render_list(cnt, **kwargs)
        return self.render(**kwargs)

    def render_list(self, cnt, unique=False, progress_callback=None, **kwargs) -> typing.List:
        """Return a list of generated strings.

        Args:
            cnt (int): length of list
            unique (bool): whether to make entries unique
            progress_callback: callable invoked after every attempt as
                ``progress_callback(accepted_so_far, cnt)``

        Returns:
            list.

        Raises:
            StringGenerator.UniquenessError: if more than
                ``cnt * unique_attempts_factor`` attempts were made without
                filling the list.

        We keep track of total attempts because a template may
        specify something impossible to attain, like [1-9]{} with cnt==1000

        """
        rendered_list = []
        seen = set()  # O(1) duplicate check; the list alone would be O(n) each
        notify = progress_callback if (progress_callback and callable(progress_callback)) else None
        max_attempts = cnt * self.unique_attempts_factor
        i = 0
        total_attempts = 0
        while i < cnt:
            if total_attempts > max_attempts:
                raise StringGenerator.UniquenessError("couldn't satisfy uniqueness")
            s = self.render(**kwargs)
            if not unique:
                rendered_list.append(s)
                i += 1
            elif s not in seen:
                seen.add(s)
                rendered_list.append(s)
                i += 1
            total_attempts += 1

            # Optionally trigger the progress indicator to inform others about our progress
            if notify:
                notify(i, cnt)

        return rendered_list

    def render_set(self, cnt, **kwargs) -> typing.Set:
        """Return a set of generated strings that will as a result be unique.

        Args:
            cnt (int): number of strings wanted

        Returns:
            set

        This is like `render_list(n, unique=True)` but will not take a callback and returns a set.
        It will be much faster than `render_list()`.

        Caution: this will not check if the solution set is
        feasible. It loops forever when ``cnt`` exceeds the number of strings
        the template can produce. The following will never complete:

            SG("[123]{2}").render_set(100)

        """
        results: typing.Set = set()
        while len(results) < cnt:
            # Render exactly as many strings as are still missing; duplicates
            # collapse in the set and are made up for on the next pass.
            results |= {self.render(**kwargs) for _ in range(cnt - len(results))}

        return results

    def __str__(self):
        """Return a freshly rendered string (a new random value each call)."""
        return self.render()

    def __repr__(self):
        return f"{self.__class__.__name__}, {self.pattern}, {self.randomizer.__class__.__name__}"