StringGenerator 0.5.0__tar.gz → 0.5.1__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: StringGenerator
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Generate randomized strings of characters using a template
5
5
  Home-page: https://github.com/paul-wolf/strgen
6
6
  Author: Paul Wolf
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: StringGenerator
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Generate randomized strings of characters using a template
5
5
  Home-page: https://github.com/paul-wolf/strgen
6
6
  Author: Paul Wolf
@@ -42,7 +42,7 @@ from abc import ABC, abstractmethod
42
42
  from collections import Counter, namedtuple
43
43
  from math import factorial
44
44
 
45
- __version__ = "0.5.0"
45
+ __version__ = "0.5.1"
46
46
  __author__ = "Paul Wolf"
47
47
  __license__ = "BSD"
48
48
 
@@ -306,9 +306,29 @@ class StringGenerator:
306
306
  return "".join(char_list)
307
307
 
308
308
  def count(self, randomizer, **kwargs):
309
- """This does not work for complex expressions."""
310
- char_list = list("".join([x.render(randomizer, **kwargs) for x in self.seq]))
311
- return permutation_count(char_list)
309
+ """Number of distinct outcomes of a permutation ('&') of the operands.
310
+
311
+ '&' shuffles together the characters produced by all operands. When
312
+ every operand is fixed -- it has exactly one possible value, i.e.
313
+ ``count() == 1`` -- the multiset of characters is known, and the
314
+ answer is just the number of distinct permutations of that multiset.
315
+
316
+ When an operand can vary (e.g. a character set), the set of
317
+ characters being shuffled changes with each random draw, so there is
318
+ no single well-defined count. Rather than return a misleading,
319
+ draw-dependent number (the previous behaviour) we raise
320
+ NotImplementedError. See ``StringGenerator.count`` for the full set
321
+ of assumptions behind counting.
322
+ """
323
+ operand_counts = [node.count(randomizer, **kwargs) for node in self.seq]
324
+ if all(c == 1 for c in operand_counts):
325
+ # every operand is fixed, so the multiset of characters is known
326
+ chars = "".join(node.render(randomizer, **kwargs) for node in self.seq)
327
+ return permutation_count(chars)
328
+ raise NotImplementedError(
329
+ "count() is undefined for '&' over operands that are not fixed; "
330
+ "the result would depend on the random draw"
331
+ )
312
332
 
313
333
  def dump(self, level=-1):
314
334
  print((StringGenerator.mytab * level) + repr(self))
@@ -706,6 +726,30 @@ class StringGenerator:
706
726
  return self.seq.render(self.randomizer, **kwargs)
707
727
 
708
728
  def count(self, **kwargs) -> int:
729
+ r"""Return the size of the generation sample space for the template.
730
+
731
+ This is the number of distinct strings the template can produce, but
732
+ only under the following assumptions. Where they do not hold, the value
733
+ is the size of the *generation* space (the number of ways the template
734
+ can be filled in), which may exceed the number of distinct strings:
735
+
736
+ * **Character classes contain no duplicate characters.** ``len(chars)``
737
+ is used as the alphabet size, so a class with repeats (e.g.
738
+ ``[a\d\d]``) counts each repeat as a separate option and overcounts.
739
+ The generator also weights repeated characters more heavily when
740
+ rendering, so this number reflects that weighting.
741
+ * **Alternation (``|``) branches are disjoint.** The count sums the
742
+ branch sizes, which equals the number of distinct results only if no
743
+ two branches can produce the same string; overlapping branches
744
+ overcount.
745
+ * **Permutation (``&``) is applied only to fixed operands.** For ``&``
746
+ over operands that can vary, the count depends on the random draw, so
747
+ ``count()`` raises NotImplementedError instead of guessing.
748
+
749
+ ``count()`` also raises NotImplementedError if the template contains a
750
+ ``${...}`` source, since a source may be an arbitrary callable or list
751
+ whose size is unknown.
752
+ """
709
753
  return self.seq.count(self.randomizer, **kwargs)
710
754
 
711
755
  def dump(self, cnt=None, **kwargs):
@@ -411,6 +411,22 @@ class TestSG(unittest.TestCase):
411
411
  SG(r"[\u\d]{2}|[abc]{3}", uaf=100).render_list(1323, unique=True)
412
412
  )
413
413
 
414
+ def test_count_and_operator(self):
415
+ """count() over '&' is deterministic and only defined for fixed operands."""
416
+ # Fixed (literal) operands: distinct permutations of "1abc" = 4! = 24.
417
+ sg = SG(r"1&abc")
418
+ assert sg.count() == 24
419
+ assert sg.count() == len(sg.render_set(24))
420
+
421
+ # Deterministic across repeated calls. Previously this rendered once and
422
+ # counted that single sample, so the value varied with the random draw.
423
+ assert len({SG(r"1&abc").count() for _ in range(20)}) == 1
424
+
425
+ # Operands that can vary have no single well-defined count: raise rather
426
+ # than return a draw-dependent number.
427
+ with self.assertRaises(NotImplementedError):
428
+ SG(r"[\d]{2}&[\d]{1}").count()
429
+
414
430
  def test_probabilistic_or(self):
415
431
  d = SG("0|1|2|3|4|5|6|7|8|9").render_list(10000)
416
432
  d = [int(d) for d in d]
File without changes