dataforge-py 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. dataforge/__init__.py +20 -0
  2. dataforge/backend.py +147 -0
  3. dataforge/cli.py +166 -0
  4. dataforge/core.py +1169 -0
  5. dataforge/locales/__init__.py +1 -0
  6. dataforge/locales/ar_SA/__init__.py +1 -0
  7. dataforge/locales/ar_SA/address.py +128 -0
  8. dataforge/locales/ar_SA/company.py +183 -0
  9. dataforge/locales/ar_SA/internet.py +25 -0
  10. dataforge/locales/ar_SA/person.py +217 -0
  11. dataforge/locales/ar_SA/phone.py +15 -0
  12. dataforge/locales/de_DE/__init__.py +1 -0
  13. dataforge/locales/de_DE/address.py +148 -0
  14. dataforge/locales/de_DE/company.py +125 -0
  15. dataforge/locales/de_DE/internet.py +32 -0
  16. dataforge/locales/de_DE/person.py +212 -0
  17. dataforge/locales/de_DE/phone.py +17 -0
  18. dataforge/locales/en_AU/__init__.py +1 -0
  19. dataforge/locales/en_AU/address.py +231 -0
  20. dataforge/locales/en_AU/company.py +193 -0
  21. dataforge/locales/en_AU/internet.py +34 -0
  22. dataforge/locales/en_AU/person.py +370 -0
  23. dataforge/locales/en_AU/phone.py +16 -0
  24. dataforge/locales/en_CA/__init__.py +1 -0
  25. dataforge/locales/en_CA/address.py +276 -0
  26. dataforge/locales/en_CA/company.py +193 -0
  27. dataforge/locales/en_CA/internet.py +34 -0
  28. dataforge/locales/en_CA/person.py +377 -0
  29. dataforge/locales/en_CA/phone.py +15 -0
  30. dataforge/locales/en_GB/__init__.py +1 -0
  31. dataforge/locales/en_GB/address.py +312 -0
  32. dataforge/locales/en_GB/company.py +196 -0
  33. dataforge/locales/en_GB/internet.py +34 -0
  34. dataforge/locales/en_GB/person.py +372 -0
  35. dataforge/locales/en_GB/phone.py +15 -0
  36. dataforge/locales/en_US/__init__.py +1 -0
  37. dataforge/locales/en_US/address.py +268 -0
  38. dataforge/locales/en_US/company.py +191 -0
  39. dataforge/locales/en_US/internet.py +34 -0
  40. dataforge/locales/en_US/person.py +370 -0
  41. dataforge/locales/en_US/phone.py +15 -0
  42. dataforge/locales/es_ES/__init__.py +1 -0
  43. dataforge/locales/es_ES/address.py +151 -0
  44. dataforge/locales/es_ES/company.py +125 -0
  45. dataforge/locales/es_ES/internet.py +30 -0
  46. dataforge/locales/es_ES/person.py +207 -0
  47. dataforge/locales/es_ES/phone.py +15 -0
  48. dataforge/locales/fr_FR/__init__.py +1 -0
  49. dataforge/locales/fr_FR/address.py +145 -0
  50. dataforge/locales/fr_FR/company.py +125 -0
  51. dataforge/locales/fr_FR/internet.py +30 -0
  52. dataforge/locales/fr_FR/person.py +212 -0
  53. dataforge/locales/fr_FR/phone.py +15 -0
  54. dataforge/locales/hi_IN/__init__.py +1 -0
  55. dataforge/locales/hi_IN/address.py +177 -0
  56. dataforge/locales/hi_IN/company.py +191 -0
  57. dataforge/locales/hi_IN/internet.py +26 -0
  58. dataforge/locales/hi_IN/person.py +218 -0
  59. dataforge/locales/hi_IN/phone.py +21 -0
  60. dataforge/locales/it_IT/__init__.py +1 -0
  61. dataforge/locales/it_IT/address.py +218 -0
  62. dataforge/locales/it_IT/company.py +151 -0
  63. dataforge/locales/it_IT/internet.py +31 -0
  64. dataforge/locales/it_IT/person.py +187 -0
  65. dataforge/locales/it_IT/phone.py +15 -0
  66. dataforge/locales/ja_JP/__init__.py +1 -0
  67. dataforge/locales/ja_JP/address.py +174 -0
  68. dataforge/locales/ja_JP/company.py +121 -0
  69. dataforge/locales/ja_JP/internet.py +30 -0
  70. dataforge/locales/ja_JP/person.py +207 -0
  71. dataforge/locales/ja_JP/phone.py +18 -0
  72. dataforge/locales/ko_KR/__init__.py +1 -0
  73. dataforge/locales/ko_KR/address.py +121 -0
  74. dataforge/locales/ko_KR/company.py +151 -0
  75. dataforge/locales/ko_KR/internet.py +30 -0
  76. dataforge/locales/ko_KR/person.py +157 -0
  77. dataforge/locales/ko_KR/phone.py +26 -0
  78. dataforge/locales/nl_NL/__init__.py +1 -0
  79. dataforge/locales/nl_NL/address.py +152 -0
  80. dataforge/locales/nl_NL/company.py +182 -0
  81. dataforge/locales/nl_NL/internet.py +41 -0
  82. dataforge/locales/nl_NL/person.py +218 -0
  83. dataforge/locales/nl_NL/phone.py +19 -0
  84. dataforge/locales/pl_PL/__init__.py +1 -0
  85. dataforge/locales/pl_PL/address.py +140 -0
  86. dataforge/locales/pl_PL/company.py +183 -0
  87. dataforge/locales/pl_PL/internet.py +36 -0
  88. dataforge/locales/pl_PL/person.py +217 -0
  89. dataforge/locales/pl_PL/phone.py +15 -0
  90. dataforge/locales/pt_BR/__init__.py +1 -0
  91. dataforge/locales/pt_BR/address.py +127 -0
  92. dataforge/locales/pt_BR/company.py +151 -0
  93. dataforge/locales/pt_BR/internet.py +31 -0
  94. dataforge/locales/pt_BR/person.py +187 -0
  95. dataforge/locales/pt_BR/phone.py +15 -0
  96. dataforge/locales/ru_RU/__init__.py +1 -0
  97. dataforge/locales/ru_RU/address.py +156 -0
  98. dataforge/locales/ru_RU/company.py +168 -0
  99. dataforge/locales/ru_RU/internet.py +26 -0
  100. dataforge/locales/ru_RU/person.py +218 -0
  101. dataforge/locales/ru_RU/phone.py +16 -0
  102. dataforge/locales/zh_CN/__init__.py +1 -0
  103. dataforge/locales/zh_CN/address.py +141 -0
  104. dataforge/locales/zh_CN/company.py +151 -0
  105. dataforge/locales/zh_CN/internet.py +30 -0
  106. dataforge/locales/zh_CN/person.py +157 -0
  107. dataforge/locales/zh_CN/phone.py +25 -0
  108. dataforge/providers/__init__.py +1 -0
  109. dataforge/providers/address.py +460 -0
  110. dataforge/providers/ai_chat.py +170 -0
  111. dataforge/providers/ai_prompt.py +447 -0
  112. dataforge/providers/automotive.py +416 -0
  113. dataforge/providers/barcode.py +149 -0
  114. dataforge/providers/base.py +34 -0
  115. dataforge/providers/color.py +247 -0
  116. dataforge/providers/company.py +144 -0
  117. dataforge/providers/crypto.py +105 -0
  118. dataforge/providers/datetime.py +397 -0
  119. dataforge/providers/ecommerce.py +316 -0
  120. dataforge/providers/education.py +234 -0
  121. dataforge/providers/file.py +271 -0
  122. dataforge/providers/finance.py +545 -0
  123. dataforge/providers/geo.py +332 -0
  124. dataforge/providers/government.py +114 -0
  125. dataforge/providers/internet.py +351 -0
  126. dataforge/providers/llm.py +726 -0
  127. dataforge/providers/lorem.py +241 -0
  128. dataforge/providers/medical.py +364 -0
  129. dataforge/providers/misc.py +196 -0
  130. dataforge/providers/network.py +283 -0
  131. dataforge/providers/payment.py +300 -0
  132. dataforge/providers/person.py +195 -0
  133. dataforge/providers/phone.py +87 -0
  134. dataforge/providers/profile.py +265 -0
  135. dataforge/providers/science.py +365 -0
  136. dataforge/providers/text.py +365 -0
  137. dataforge/py.typed +0 -0
  138. dataforge/pytest_plugin.py +80 -0
  139. dataforge/registry.py +164 -0
  140. dataforge/schema.py +772 -0
  141. dataforge/unique.py +171 -0
  142. dataforge_py-0.2.0.dist-info/METADATA +964 -0
  143. dataforge_py-0.2.0.dist-info/RECORD +145 -0
  144. dataforge_py-0.2.0.dist-info/WHEEL +4 -0
  145. dataforge_py-0.2.0.dist-info/entry_points.txt +35 -0
dataforge/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ """dataforge — High-performance fake data generator for testing.
2
+
3
+ Usage::
4
+
5
+ from dataforge import DataForge
6
+
7
+ forge = DataForge(locale="en_US", seed=42)
8
+ forge.person.first_name() # "James"
9
+ forge.address.full_address() # "4821 Oak Ave, Chicago, IL 60614"
10
+ forge.person.full_name(count=1000) # list of 1000 full names
11
+
12
+ # Unique values
13
+ forge.unique.person.first_name() # guaranteed unique per call
14
+ """
15
+
16
+ from dataforge.core import DataForge
17
+ from dataforge.schema import Schema
18
+
19
# Version string of the package, exported as part of the public API.
__version__ = "0.2.0"

# Names exported by ``from dataforge import *``.
__all__ = [
    "DataForge",
    "Schema",
    "__version__",
]
dataforge/backend.py ADDED
@@ -0,0 +1,147 @@
1
+ """RandomEngine — the speed engine behind dataforge.
2
+
3
+ Provides a unified interface for random selection using stdlib
4
+ ``random`` — optimised for both scalar picks and batch generation.
5
+ """
6
+
7
+ import random as _random
8
+ from typing import TypeVar
9
+
10
+ _T = TypeVar("_T")
11
+
12
+ # Pre-computed power-of-10 table for random_digits_str — eliminates
13
+ # per-call ``10**n`` computation for n=1..18.
14
+ _POW10: tuple[int, ...] = tuple(10**i for i in range(19)) # _POW10[0]=1 .. _POW10[18]
15
+
16
+
17
+ class RandomEngine:
18
+ """Core randomness engine.
19
+
20
+ Parameters
21
+ ----------
22
+ seed : int | None
23
+ Optional seed for reproducibility.
24
+ """
25
+
26
+ __slots__ = ("_rng",)
27
+
28
+ def __init__(self, seed: int | None = None) -> None:
29
+ self._rng: _random.Random = _random.Random(seed)
30
+
31
+ # ------------------------------------------------------------------
32
+ # Public API
33
+ # ------------------------------------------------------------------
34
+
35
+ def choice(self, data: tuple[_T, ...]) -> _T:
36
+ """Return a single random element from *data*.
37
+
38
+ Uses stdlib ``random.Random.choice`` which is the fastest path
39
+ for picking one item.
40
+ """
41
+ return self._rng.choice(data)
42
+
43
+ def choices(self, data: tuple[_T, ...], count: int) -> list[_T]:
44
+ """Return *count* random elements from *data*."""
45
+ return self._rng.choices(data, k=count)
46
+
47
+ def random_int(self, min_val: int = 0, max_val: int = 9999) -> int:
48
+ """Return a random integer between *min_val* and *max_val* inclusive."""
49
+ return self._rng.randint(min_val, max_val)
50
+
51
+ def numerify(self, pattern: str) -> str:
52
+ """Replace every ``#`` in *pattern* with a random digit.
53
+
54
+ Example: ``"#####"`` → ``"38201"``
55
+
56
+ Optimized: if the pattern is all ``#`` characters, generates
57
+ all digits in a single call via :meth:`random_digits_str`.
58
+ For mixed patterns, pre-counts ``#`` and generates all digits
59
+ in one bulk call, then substitutes via iterator.
60
+ """
61
+ # Fast path: pattern is entirely # characters (very common).
62
+ # Use length check instead of iterating all characters with all().
63
+ hash_count = pattern.count("#")
64
+ if hash_count == len(pattern):
65
+ return self.random_digits_str(hash_count)
66
+ if hash_count == 0:
67
+ return pattern
68
+ # Slow path optimized: generate all digits in one call, then
69
+ # substitute via iterator — avoids N random_digit() calls.
70
+ digits = self.random_digits_str(hash_count)
71
+ it = iter(digits)
72
+ return "".join(next(it) if ch == "#" else ch for ch in pattern)
73
+
74
+ def getrandbits(self, k: int) -> int:
75
+ """Return a random integer with *k* random bits.
76
+
77
+ This is the fastest way to generate a large block of randomness
78
+ in a single call — used by providers that need to build strings
79
+ from many random hex/decimal digits (IPv6, MAC, barcodes, etc.).
80
+ """
81
+ return self._rng.getrandbits(k)
82
+
83
+ def random_digits_str(self, n: int) -> str:
84
+ """Return a string of *n* random decimal digits.
85
+
86
+ Uses a pre-computed ``_POW10`` lookup table to avoid per-call
87
+ ``10**n`` computation. For small n (≤ 18), a single
88
+ ``randint`` call is the fastest path.
89
+ """
90
+ _pow10 = _POW10
91
+ if n <= 18:
92
+ val = self._rng.randint(0, _pow10[n] - 1)
93
+ return str(val).zfill(n)
94
+ # For larger n, concatenate chunks of 18 digits
95
+ parts: list[str] = []
96
+ remaining = n
97
+ _max18 = _pow10[18] - 1
98
+ _randint = self._rng.randint
99
+ while remaining > 0:
100
+ chunk = min(remaining, 18)
101
+ val = _randint(0, _pow10[chunk] - 1)
102
+ parts.append(str(val).zfill(chunk))
103
+ remaining -= chunk
104
+ return "".join(parts)
105
+
106
+ def seed(self, value: int) -> None:
107
+ """Re-seed the engine for reproducibility."""
108
+ self._rng.seed(value)
109
+
110
+ def weighted_choices(
111
+ self,
112
+ data: tuple[_T, ...],
113
+ weights: tuple[float, ...] | list[float],
114
+ count: int,
115
+ ) -> list[_T]:
116
+ """Return *count* random elements from *data* with *weights*.
117
+
118
+ Each element in *data* is selected with probability proportional
119
+ to its corresponding weight.
120
+
121
+ Parameters
122
+ ----------
123
+ data : tuple
124
+ The items to choose from.
125
+ weights : tuple[float, ...] or list[float]
126
+ Non-negative weights (need not sum to 1).
127
+ count : int
128
+ Number of items to pick.
129
+
130
+ Returns
131
+ -------
132
+ list
133
+ """
134
+ # Accept both tuple and list directly — stdlib choices() handles
135
+ # both; avoid redundant list() conversion.
136
+ return self._rng.choices(data, weights=weights, k=count)
137
+
138
+ def weighted_choice(
139
+ self,
140
+ data: tuple[_T, ...],
141
+ weights: tuple[float, ...] | list[float],
142
+ ) -> _T:
143
+ """Return a single random element from *data* with *weights*.
144
+
145
+ Scalar version of :meth:`weighted_choices`.
146
+ """
147
+ return self._rng.choices(data, weights=weights, k=1)[0]
dataforge/cli.py ADDED
@@ -0,0 +1,166 @@
1
+ """dataforge CLI — generate fake data from the command line.
2
+
3
+ Usage::
4
+
5
+ dataforge --count 100 --format csv name email phone
6
+ dataforge --count 10 --format json first_name last_name city
7
+ dataforge --locale de_DE --count 5 full_name address
8
+ dataforge --list-fields
9
+
10
+ Supported output formats: text, csv, json, jsonl
11
+ """
12
+
13
+ import argparse
14
+ import csv
15
+ import io
16
+ import json
17
+ import sys
18
+
19
+ from dataforge import DataForge
20
+ from dataforge.registry import get_field_map
21
+
22
+
23
+ def _build_parser() -> argparse.ArgumentParser:
24
+ parser = argparse.ArgumentParser(
25
+ prog="dataforge",
26
+ description="Generate fake data for testing from the command line.",
27
+ )
28
+ parser.add_argument(
29
+ "fields",
30
+ nargs="*",
31
+ help="Fields to generate (e.g. first_name email city). "
32
+ "Use --list-fields to see all available fields.",
33
+ )
34
+ parser.add_argument(
35
+ "-n",
36
+ "--count",
37
+ type=int,
38
+ default=10,
39
+ help="Number of rows to generate (default: 10).",
40
+ )
41
+ parser.add_argument(
42
+ "-f",
43
+ "--format",
44
+ choices=("text", "csv", "json", "jsonl"),
45
+ default="text",
46
+ help="Output format (default: text).",
47
+ )
48
+ parser.add_argument(
49
+ "-l",
50
+ "--locale",
51
+ default="en_US",
52
+ help="Locale for data generation (default: en_US).",
53
+ )
54
+ parser.add_argument(
55
+ "-s",
56
+ "--seed",
57
+ type=int,
58
+ default=None,
59
+ help="Random seed for reproducible output.",
60
+ )
61
+ parser.add_argument(
62
+ "--list-fields",
63
+ action="store_true",
64
+ help="List all available field names and exit.",
65
+ )
66
+ parser.add_argument(
67
+ "-o",
68
+ "--output",
69
+ default=None,
70
+ metavar="FILE",
71
+ help="Write output to FILE instead of stdout.",
72
+ )
73
+ parser.add_argument(
74
+ "--no-header",
75
+ action="store_true",
76
+ help="Omit header row in text and csv output formats.",
77
+ )
78
+ return parser
79
+
80
+
81
def _write_text(rows, headers: list[str], out, *, no_header: bool) -> None:
    """Print *rows* to *out* as left-aligned, space-separated columns."""
    # Stringify every cell up front so a non-string value cannot break
    # len() / ljust() below.
    table = [[str(row[h]) for h in headers] for row in rows]
    widths = [len(h) for h in headers]
    for cells in table:
        for i, cell in enumerate(cells):
            if len(cell) > widths[i]:
                widths[i] = len(cell)

    if not no_header:
        print(" ".join(h.ljust(w) for h, w in zip(headers, widths)), file=out)
        print(" ".join("-" * w for w in widths), file=out)
    for cells in table:
        print(" ".join(c.ljust(w) for c, w in zip(cells, widths)), file=out)


def _write_rows(rows, headers: list[str], fmt: str, out, *, no_header: bool) -> None:
    """Serialise *rows* to the text stream *out* in format *fmt*.

    *fmt* is one of ``"text"``, ``"csv"``, ``"json"`` or ``"jsonl"``;
    *no_header* only affects the ``text`` and ``csv`` formats.
    """
    if fmt == "text":
        _write_text(rows, headers, out, no_header=no_header)
    elif fmt == "csv":
        writer = csv.DictWriter(out, fieldnames=headers)
        if not no_header:
            writer.writeheader()
        writer.writerows(rows)
    elif fmt == "json":
        print(json.dumps(rows, indent=2, ensure_ascii=False), file=out)
    elif fmt == "jsonl":
        for row in rows:
            print(json.dumps(row, ensure_ascii=False), file=out)


def main(argv: list[str] | None = None) -> int:
    """Entry point for the dataforge CLI.

    Parameters
    ----------
    argv : list[str] | None
        Command-line arguments; ``None`` makes argparse read
        ``sys.argv[1:]``.

    Returns
    -------
    int
        ``0`` on success, ``1`` if an unknown field name was requested.
    """
    args = _build_parser().parse_args(argv)
    field_map = get_field_map()

    if args.list_fields:
        # One line per field, sorted by name, with its provider.method.
        for name in sorted(field_map):
            provider, method = field_map[name]
            print(f" {name:24s} ({provider}.{method})")
        return 0

    # Fall back to a default column set when no fields were given.
    headers = args.fields or ["first_name", "last_name", "email"]

    # Validate fields before generating anything.
    for field in headers:
        if field not in field_map:
            print(
                f"Error: unknown field '{field}'. Use --list-fields to see options.",
                file=sys.stderr,
            )
            return 1

    forge = DataForge(locale=args.locale, seed=args.seed)
    # Generate every row in a single batched call.
    rows = forge.to_dict(headers, count=args.count)

    if args.output:
        # newline="" keeps the csv module's own line terminators intact;
        # the ``with`` block closes the file even if rendering fails.
        with open(args.output, "w", encoding="utf-8", newline="") as out_file:
            _write_rows(
                rows, headers, args.format, out_file, no_header=args.no_header
            )
    else:
        _write_rows(
            rows, headers, args.format, sys.stdout, no_header=args.no_header
        )

    return 0
163
+
164
+
165
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())