loadforge 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datagen/__init__.py ADDED
@@ -0,0 +1,2 @@
1
# Package metadata: only the version string is public API.
__all__ = ["__version__"]
__version__ = "0.1.1"
datagen/cli.py ADDED
@@ -0,0 +1,258 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import random
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from faker import Faker
9
+ from sqlalchemy import MetaData, Table, create_engine
10
+
11
+ from datagen.config_loader import load_config
12
+ from datagen.generator.base import generate_all
13
+ from datagen.generator.dist import parse_dist_arg
14
+ from datagen.parser.ddl import parse_ddl_file
15
+ from datagen.parser.graph import generation_order
16
+ from datagen.parser.introspect import load_schema_from_db
17
+ from datagen.report import build_report
18
+ from datagen.validation import validate_check_constraints, validate_generated_data
19
+ from datagen.writer.csv_writer import write_csv
20
+ from datagen.writer.json_writer import write_json
21
+ from datagen.writer.postgres import render_insert_sql
22
+ from datagen.writer.parquet_writer import write_parquet
23
+
24
+
25
def _insert_via_sqlalchemy(db_url: str, data: dict[str, list[dict]]) -> None:
    """Bulk-insert generated rows into the database at *db_url*.

    Each target table is reflected from the live connection, and all
    inserts run inside a single transaction (``engine.begin``), so a
    failure rolls everything back.
    """
    engine = create_engine(db_url)
    metadata = MetaData()
    with engine.begin() as connection:
        # Skip tables with no generated rows; executing an empty insert
        # would be pointless (and errors on some drivers).
        populated = ((name, rows) for name, rows in data.items() if rows)
        for name, rows in populated:
            target = Table(name, metadata, autoload_with=connection)
            connection.execute(target.insert(), rows)
34
+
35
+
36
def _resolve_tables_from_args(args: argparse.Namespace):
    """Return table metadata from either DB introspection or a DDL file.

    Raises SystemExit when the flag combination is invalid (introspection
    without a DB URL, or neither source specified).
    """
    if args.schema_from_db:
        if not args.db_url:
            raise SystemExit("--schema-from-db requires --db-url")
        engine = create_engine(args.db_url)
        names = None
        if args.tables:
            names = [name.strip() for name in args.tables.split(",")]
        return load_schema_from_db(engine, names)

    if args.ddl:
        return parse_ddl_file(args.ddl)

    raise SystemExit("Provide either --ddl <schema.sql> or --schema-from-db")
48
+
49
+
50
+ def _parse_table_rows_map(raw_entries: list[str] | None) -> dict[str, int]:
51
+ out: dict[str, int] = {}
52
+ for entry in raw_entries or []:
53
+ for token in [t.strip() for t in str(entry).split(",") if t.strip()]:
54
+ if "=" not in token:
55
+ raise SystemExit(f"Invalid --table-rows token '{token}'. Use table=count")
56
+ table, count_raw = token.split("=", 1)
57
+ table = table.strip()
58
+ try:
59
+ count = int(count_raw.strip())
60
+ except ValueError as e:
61
+ raise SystemExit(f"Invalid row count in --table-rows token '{token}'") from e
62
+ if count < 0:
63
+ raise SystemExit(f"Row count must be >=0 in --table-rows token '{token}'")
64
+ out[table] = count
65
+ return out
66
+
67
+
68
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the data generator.

    Note: flags that also exist in the config file deliberately default to
    ``None`` (even ``store_true`` ones) so config merging can tell "unset"
    apart from an explicit CLI value.
    """
    parser = argparse.ArgumentParser(description="Generate synthetic data from DDL or live DB schema")

    parser.add_argument("--config", help="Path to config file (.json/.toml/.yaml/.yml)")

    # Schema input source.
    parser.add_argument("--ddl", help="Path to schema.sql")
    parser.add_argument("--schema-from-db", action="store_true", default=None, help="Introspect tables from --db-url instead of using --ddl")
    parser.add_argument("--tables", help="Comma-separated table names to include when using --schema-from-db")

    # Generation and output controls.
    parser.add_argument("--rows", type=int, help="Default rows per table")
    parser.add_argument(
        "--table-rows",
        action="append",
        default=None,
        help="Per-table row count override, e.g. users=100,orders=200 (repeatable)",
    )
    parser.add_argument("--out", choices=["postgres", "mysql", "sqlite", "bigquery", "json", "csv", "parquet"], default=None)
    parser.add_argument("--db-url", help="DB URL for schema introspection and/or direct insert")
    parser.add_argument("--insert", action="store_true", default=None, help="Insert generated rows into --db-url")
    parser.add_argument(
        "--dist",
        action="append",
        default=None,
        help="Distribution override, e.g. age:normal,mean=35,std=7 or users.age:normal,mean=35,std=7",
    )
    parser.add_argument("--seed", type=int, default=None, help="Random seed for reproducible generation")
    parser.add_argument("--output-path", default=None, help="Optional output file/path (json/sql file or csv dir)")
    parser.add_argument("--report-path", default=None, help="Optional JSON report path for generated data profile")
    parser.add_argument("--strict-checks", action="store_true", default=None, help="Validate generated rows against supported CHECK constraints")
    parser.add_argument("--engine", choices=["python", "polars"], default=None, help="Generation/render engine")
    parser.add_argument("--bq-insert-all", action="store_true", default=None, help="When --out bigquery, render INSERT ALL syntax")
    parser.add_argument("--parquet-compression", choices=["snappy", "zstd", "lz4", "gzip", "none"], default=None, help="Parquet compression codec (default: snappy)")
    return parser
103
+
104
+
105
def _merge_config(args: argparse.Namespace) -> argparse.Namespace:
    """Merge CLI args with config-file values and built-in defaults.

    Precedence (highest first): explicit CLI flags, config file, built-in
    defaults.  argparse defaults are all ``None``, so an unset flag is
    distinguishable from an explicitly passed value.

    Raises SystemExit when no row count is available from any source.
    """
    cfg = load_config(args.config)

    # 1) Config file fills in anything the CLI left unset.
    for key in (
        "ddl",
        "schema_from_db",
        "tables",
        "rows",
        "out",
        "db_url",
        "insert",
        "seed",
        "output_path",
        "report_path",
        "strict_checks",
        "engine",
        "bq_insert_all",
        "parquet_compression",
    ):
        if getattr(args, key) is None and key in cfg:
            setattr(args, key, cfg[key])

    # 2) Built-in defaults fill in whatever is still unset.
    # NOTE: this pass must run AFTER the config pass; the original order
    # (defaults first) silently discarded config values for these keys.
    defaults: dict[str, Any] = {
        "out": "postgres",
        "dist": [],
        "schema_from_db": False,
        "insert": False,
        "engine": "python",
        "table_rows": [],
        "bq_insert_all": False,
        "parquet_compression": "snappy",
        "strict_checks": False,
    }
    for key, value in defaults.items():
        if getattr(args, key) is None:
            setattr(args, key, value)

    # dist: CLI entries win outright; otherwise take the config list
    # (ignored unless it really is a list).
    cfg_dist = cfg.get("dist", []) if isinstance(cfg, dict) else []
    if not isinstance(cfg_dist, list):
        cfg_dist = []
    args.dist = args.dist or [str(x) for x in cfg_dist]

    # table-rows: CLI entries win; config may supply a map or a list.
    if not args.table_rows:
        cfg_tr = cfg.get("table_rows", {}) if isinstance(cfg, dict) else {}
        if isinstance(cfg_tr, dict):
            args.table_rows = [f"{k}={v}" for k, v in cfg_tr.items()]
        elif isinstance(cfg_tr, list):
            args.table_rows = [str(x) for x in cfg_tr]
        else:
            args.table_rows = []

    if args.rows is None and not args.table_rows:
        raise SystemExit("--rows is required unless --table-rows is set (or configured)")

    # With per-table counts only, the global row count is irrelevant.
    if args.rows is None:
        args.rows = 0

    return args
178
+
179
+
180
def main() -> None:
    """CLI entry point: parse args, generate data, then emit or insert it.

    Pipeline: parse/merge args -> seed RNGs -> resolve schema -> generate
    rows -> (optional) insert into DB -> validate -> (optional) report ->
    write output in the selected format.
    """
    args = build_parser().parse_args()
    args = _merge_config(args)

    # Seed both RNG sources so generation is reproducible end to end.
    if args.seed is not None:
        random.seed(args.seed)
        Faker.seed(args.seed)

    tables = _resolve_tables_from_args(args)
    # Generation order from the FK graph, so parents exist before children.
    order = generation_order(tables)
    dist_map = dict(parse_dist_arg(d) for d in (args.dist or []))
    table_rows_map = _parse_table_rows_map(args.table_rows)
    data = generate_all(
        tables,
        order,
        int(args.rows),
        dist_map,
        table_rows=table_rows_map,
        engine=args.engine,
    )

    # Direct DB insert happens before validation/reporting/output.
    if args.insert:
        if not args.db_url:
            raise SystemExit("--insert requires --db-url")
        _insert_via_sqlalchemy(args.db_url, data)

    # Structural validation always runs; CHECK-constraint validation is
    # opt-in and folded into the same validation payload.
    validation = validate_generated_data(tables, data)
    if args.strict_checks:
        check_validation = validate_check_constraints(tables, data)
        validation["counts"]["check_violations"] = check_validation["check_violations"]
        validation["sample_issues"].extend(check_validation["check_details"])
        validation["counts"]["sample_issue_count"] = len(validation["sample_issues"])
        if check_validation["check_violations"] > 0:
            validation["pass"] = False
            validation["counts"]["total_failures"] += check_validation["check_violations"]

    if args.report_path:
        report = build_report(data, validation=validation)
        write_json(report, args.report_path, engine=args.engine)

    # File-based output formats return early; everything else falls
    # through to SQL rendering below.
    if args.out == "json":
        payload = write_json(data, args.output_path, engine=args.engine)
        if not args.output_path:
            print(payload)
        return

    if args.out == "csv":
        out_dir = args.output_path or "./output_csv"
        files = write_csv(data, out_dir, engine=args.engine)
        print("\n".join(files))
        return

    if args.out == "parquet":
        out_dir = args.output_path or "./output_parquet"
        files = write_parquet(data, out_dir, compression=args.parquet_compression)
        print("\n".join(files))
        return

    # SQL output: render INSERT statements per table in dependency order.
    dialect = args.out if args.out in {"postgres", "mysql", "sqlite", "bigquery"} else "postgres"
    chunks = []
    for table_name in order:
        sql = render_insert_sql(
            table_name,
            data.get(table_name, []),
            dialect=dialect,
            engine=args.engine,
            bq_insert_all=bool(args.bq_insert_all),
        )
        if sql:
            chunks.append(sql)
    output = "\n".join(chunks)
    if args.output_path:
        Path(args.output_path).write_text(output, encoding="utf-8")
    else:
        print(output)
255
+
256
+
257
if __name__ == "__main__":  # script entry point
    main()
datagen/config.py ADDED
@@ -0,0 +1,54 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+
6
+
7
@dataclass
class DistSpec:
    """A value-distribution specification for a column."""

    kind: str  # distribution name, e.g. "normal"
    params: dict[str, Any] = field(default_factory=dict)  # parameters such as mean/std
11
+
12
+
13
@dataclass
class ForeignKey:
    """A single-column foreign-key relationship."""

    column: str  # local column holding the reference
    ref_table: str  # referenced table name
    ref_column: str  # referenced column name
18
+
19
+
20
@dataclass
class ColumnMeta:
    """Metadata for one table column."""

    name: str
    type_name: str  # SQL type name as parsed/introspected
    nullable: bool = True
    primary_key: bool = False
    unique: bool = False
    max_length: int | None = None  # length limit for string-like types, if any
28
+
29
+
30
@dataclass
class UniqueConstraintMeta:
    """A (possibly multi-column) UNIQUE constraint."""

    columns: list[str]
    name: str | None = None  # constraint name, when known
34
+
35
+
36
@dataclass
class CheckConstraintMeta:
    """A CHECK constraint, stored as its raw SQL expression."""

    expression: str
    name: str | None = None  # constraint name, when known
40
+
41
+
42
@dataclass
class TableMeta:
    """Schema metadata for one table: its columns and constraints."""

    name: str
    columns: list[ColumnMeta]
    foreign_keys: list[ForeignKey] = field(default_factory=list)
    unique_constraints: list[UniqueConstraintMeta] = field(default_factory=list)
    check_constraints: list[CheckConstraintMeta] = field(default_factory=list)

    def column(self, name: str) -> ColumnMeta:
        """Return the column named *name*; raise KeyError if absent."""
        found = next((col for col in self.columns if col.name == name), None)
        if found is None:
            raise KeyError(f"Column not found: {name}")
        return found
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+
8
+ def _load_toml(path: Path) -> dict[str, Any]:
9
+ try:
10
+ import tomllib # py3.11+
11
+
12
+ return tomllib.loads(path.read_text(encoding="utf-8"))
13
+ except ModuleNotFoundError:
14
+ try:
15
+ import tomli
16
+ except Exception as e: # pragma: no cover
17
+ raise SystemExit("TOML config requires tomli on Python 3.10. Install with: pip install tomli") from e
18
+ return tomli.loads(path.read_text(encoding="utf-8"))
19
+
20
+
21
+ def _load_yaml(path: Path) -> dict[str, Any]:
22
+ try:
23
+ import yaml # type: ignore
24
+ except Exception as e: # pragma: no cover
25
+ raise SystemExit("YAML config requires PyYAML. Install with: pip install pyyaml") from e
26
+
27
+ data = yaml.safe_load(path.read_text(encoding="utf-8"))
28
+ if not isinstance(data, dict):
29
+ raise SystemExit("Config file root must be an object/map")
30
+ return data
31
+
32
+
33
+ def load_config(path_str: str | None) -> dict[str, Any]:
34
+ if not path_str:
35
+ return {}
36
+
37
+ path = Path(path_str)
38
+ if not path.exists():
39
+ raise SystemExit(f"Config file not found: {path}")
40
+
41
+ ext = path.suffix.lower()
42
+ if ext in {".json"}:
43
+ data = json.loads(path.read_text(encoding="utf-8"))
44
+ elif ext in {".toml"}:
45
+ data = _load_toml(path)
46
+ elif ext in {".yaml", ".yml"}:
47
+ data = _load_yaml(path)
48
+ else:
49
+ raise SystemExit("Unsupported config extension. Use .json/.toml/.yaml/.yml")
50
+
51
+ if not isinstance(data, dict):
52
+ raise SystemExit("Config file root must be an object/map")
53
+ return data
@@ -0,0 +1 @@
1
+