loadforge 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datagen/__init__.py +2 -0
- datagen/cli.py +258 -0
- datagen/config.py +54 -0
- datagen/config_loader.py +53 -0
- datagen/generator/__init__.py +1 -0
- datagen/generator/base.py +471 -0
- datagen/generator/dist.py +142 -0
- datagen/parser/__init__.py +1 -0
- datagen/parser/ddl.py +204 -0
- datagen/parser/graph.py +43 -0
- datagen/parser/introspect.py +84 -0
- datagen/report.py +45 -0
- datagen/validation.py +254 -0
- datagen/writer/__init__.py +1 -0
- datagen/writer/csv_writer.py +39 -0
- datagen/writer/json_writer.py +25 -0
- datagen/writer/parquet_writer.py +32 -0
- datagen/writer/postgres.py +113 -0
- loadforge-0.1.1.dist-info/METADATA +453 -0
- loadforge-0.1.1.dist-info/RECORD +24 -0
- loadforge-0.1.1.dist-info/WHEEL +5 -0
- loadforge-0.1.1.dist-info/entry_points.txt +2 -0
- loadforge-0.1.1.dist-info/licenses/LICENSE +201 -0
- loadforge-0.1.1.dist-info/top_level.txt +1 -0
datagen/__init__.py
ADDED
datagen/cli.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import random
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from faker import Faker
|
|
9
|
+
from sqlalchemy import MetaData, Table, create_engine
|
|
10
|
+
|
|
11
|
+
from datagen.config_loader import load_config
|
|
12
|
+
from datagen.generator.base import generate_all
|
|
13
|
+
from datagen.generator.dist import parse_dist_arg
|
|
14
|
+
from datagen.parser.ddl import parse_ddl_file
|
|
15
|
+
from datagen.parser.graph import generation_order
|
|
16
|
+
from datagen.parser.introspect import load_schema_from_db
|
|
17
|
+
from datagen.report import build_report
|
|
18
|
+
from datagen.validation import validate_check_constraints, validate_generated_data
|
|
19
|
+
from datagen.writer.csv_writer import write_csv
|
|
20
|
+
from datagen.writer.json_writer import write_json
|
|
21
|
+
from datagen.writer.postgres import render_insert_sql
|
|
22
|
+
from datagen.writer.parquet_writer import write_parquet
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _insert_via_sqlalchemy(db_url: str, data: dict[str, list[dict]]) -> None:
    """Bulk-insert every non-empty table's rows into the database at *db_url*.

    Table definitions are reflected from the live database, and all inserts
    run inside a single transaction.
    """
    metadata = MetaData()
    with create_engine(db_url).begin() as connection:
        for name, rows in data.items():
            if rows:
                reflected = Table(name, metadata, autoload_with=connection)
                connection.execute(reflected.insert(), rows)
+
|
|
36
|
+
def _resolve_tables_from_args(args: argparse.Namespace):
|
|
37
|
+
if args.schema_from_db:
|
|
38
|
+
if not args.db_url:
|
|
39
|
+
raise SystemExit("--schema-from-db requires --db-url")
|
|
40
|
+
engine = create_engine(args.db_url)
|
|
41
|
+
table_names = [x.strip() for x in args.tables.split(",")] if args.tables else None
|
|
42
|
+
return load_schema_from_db(engine, table_names)
|
|
43
|
+
|
|
44
|
+
if not args.ddl:
|
|
45
|
+
raise SystemExit("Provide either --ddl <schema.sql> or --schema-from-db")
|
|
46
|
+
|
|
47
|
+
return parse_ddl_file(args.ddl)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _parse_table_rows_map(raw_entries: list[str] | None) -> dict[str, int]:
|
|
51
|
+
out: dict[str, int] = {}
|
|
52
|
+
for entry in raw_entries or []:
|
|
53
|
+
for token in [t.strip() for t in str(entry).split(",") if t.strip()]:
|
|
54
|
+
if "=" not in token:
|
|
55
|
+
raise SystemExit(f"Invalid --table-rows token '{token}'. Use table=count")
|
|
56
|
+
table, count_raw = token.split("=", 1)
|
|
57
|
+
table = table.strip()
|
|
58
|
+
try:
|
|
59
|
+
count = int(count_raw.strip())
|
|
60
|
+
except ValueError as e:
|
|
61
|
+
raise SystemExit(f"Invalid row count in --table-rows token '{token}'") from e
|
|
62
|
+
if count < 0:
|
|
63
|
+
raise SystemExit(f"Row count must be >=0 in --table-rows token '{token}'")
|
|
64
|
+
out[table] = count
|
|
65
|
+
return out
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the data-generation tool.

    Every option defaults to ``None`` (including store_true flags) so that
    downstream config merging can tell "unset" apart from an explicit value.
    """
    parser = argparse.ArgumentParser(description="Generate synthetic data from DDL or live DB schema")

    parser.add_argument("--config", help="Path to config file (.json/.toml/.yaml/.yml)")

    # Input source
    parser.add_argument("--ddl", help="Path to schema.sql")
    parser.add_argument(
        "--schema-from-db",
        action="store_true",
        default=None,
        help="Introspect tables from --db-url instead of using --ddl",
    )
    parser.add_argument("--tables", help="Comma-separated table names to include when using --schema-from-db")

    # Generation/output
    parser.add_argument("--rows", type=int, help="Default rows per table")
    parser.add_argument(
        "--table-rows",
        action="append",
        default=None,
        help="Per-table row count override, e.g. users=100,orders=200 (repeatable)",
    )
    parser.add_argument(
        "--out",
        choices=["postgres", "mysql", "sqlite", "bigquery", "json", "csv", "parquet"],
        default=None,
    )
    parser.add_argument("--db-url", help="DB URL for schema introspection and/or direct insert")
    parser.add_argument("--insert", action="store_true", default=None, help="Insert generated rows into --db-url")
    parser.add_argument(
        "--dist",
        action="append",
        default=None,
        help="Distribution override, e.g. age:normal,mean=35,std=7 or users.age:normal,mean=35,std=7",
    )
    parser.add_argument("--seed", type=int, default=None, help="Random seed for reproducible generation")
    parser.add_argument("--output-path", default=None, help="Optional output file/path (json/sql file or csv dir)")
    parser.add_argument("--report-path", default=None, help="Optional JSON report path for generated data profile")
    parser.add_argument(
        "--strict-checks",
        action="store_true",
        default=None,
        help="Validate generated rows against supported CHECK constraints",
    )
    parser.add_argument("--engine", choices=["python", "polars"], default=None, help="Generation/render engine")
    parser.add_argument(
        "--bq-insert-all",
        action="store_true",
        default=None,
        help="When --out bigquery, render INSERT ALL syntax",
    )
    parser.add_argument(
        "--parquet-compression",
        choices=["snappy", "zstd", "lz4", "gzip", "none"],
        default=None,
        help="Parquet compression codec (default: snappy)",
    )
    return parser
|
105
|
+
def _merge_config(args: argparse.Namespace) -> argparse.Namespace:
    """Merge CLI arguments with config-file values and built-in defaults.

    Precedence is CLI > config file > defaults. ``build_parser`` leaves every
    unset option as ``None`` (all store_true flags use ``default=None``), so a
    ``None`` attribute means "not given on the command line".

    Raises:
        SystemExit: when neither --rows nor --table-rows is provided/configured.
    """
    cfg = load_config(args.config)

    # Config file is the base layer: it fills anything the CLI left unset.
    for key in [
        "ddl",
        "schema_from_db",
        "tables",
        "rows",
        "out",
        "db_url",
        "insert",
        "seed",
        "output_path",
        "report_path",
        "strict_checks",
        "engine",
        "bq_insert_all",
        "parquet_compression",
    ]:
        if getattr(args, key) is None and key in cfg:
            setattr(args, key, cfg[key])

    # Built-in defaults apply last, only to keys still unset.
    # BUGFIX: these used to run BEFORE the config merge, which filled the
    # attributes and made the config file's values for these keys unreachable.
    defaults: dict[str, Any] = {
        "out": "postgres",
        "dist": [],
        "schema_from_db": False,
        "insert": False,
        "engine": "python",
        "table_rows": [],
        "bq_insert_all": False,
        "parquet_compression": "snappy",
        "strict_checks": False,
    }
    for k, v in defaults.items():
        if getattr(args, k) is None:
            setattr(args, k, v)

    # dist handling (CLI wins; config list is stringified as a fallback).
    cfg_dist = cfg.get("dist", []) if isinstance(cfg, dict) else []
    if not isinstance(cfg_dist, list):
        cfg_dist = []
    cli_dist = args.dist or []
    if cli_dist:
        args.dist = cli_dist
    else:
        args.dist = [str(x) for x in cfg_dist]

    # table-rows: CLI value, else config map/list normalized to "name=count" strings.
    if args.table_rows:
        table_rows_raw = args.table_rows
    else:
        cfg_tr = cfg.get("table_rows", {}) if isinstance(cfg, dict) else {}
        if isinstance(cfg_tr, dict):
            table_rows_raw = [f"{k}={v}" for k, v in cfg_tr.items()]
        elif isinstance(cfg_tr, list):
            table_rows_raw = [str(x) for x in cfg_tr]
        else:
            table_rows_raw = []
    args.table_rows = table_rows_raw

    if args.rows is None and not args.table_rows:
        raise SystemExit("--rows is required unless --table-rows is set (or configured)")

    # Per-table counts alone are enough; treat the global row default as zero.
    if args.rows is None:
        args.rows = 0

    return args
|
|
180
|
+
def main() -> None:
    """CLI entry point: parse args, generate synthetic data, then emit/insert it."""
    args = build_parser().parse_args()
    args = _merge_config(args)

    # Seed both RNG sources so runs are reproducible end-to-end.
    if args.seed is not None:
        random.seed(args.seed)
        Faker.seed(args.seed)

    # Resolve the schema, order tables (FK parents first), and generate rows.
    tables = _resolve_tables_from_args(args)
    order = generation_order(tables)
    dist_map = dict(parse_dist_arg(d) for d in (args.dist or []))
    table_rows_map = _parse_table_rows_map(args.table_rows)
    data = generate_all(
        tables,
        order,
        int(args.rows),
        dist_map,
        table_rows=table_rows_map,
        engine=args.engine,
    )

    # Optional direct insert into a live database.
    if args.insert:
        if not args.db_url:
            raise SystemExit("--insert requires --db-url")
        _insert_via_sqlalchemy(args.db_url, data)

    # Structural validation always runs; CHECK-constraint validation only on demand.
    validation = validate_generated_data(tables, data)
    if args.strict_checks:
        check_validation = validate_check_constraints(tables, data)
        # Fold the CHECK results into the main validation summary.
        validation["counts"]["check_violations"] = check_validation["check_violations"]
        validation["sample_issues"].extend(check_validation["check_details"])
        validation["counts"]["sample_issue_count"] = len(validation["sample_issues"])
        if check_validation["check_violations"] > 0:
            validation["pass"] = False
            validation["counts"]["total_failures"] += check_validation["check_violations"]

    if args.report_path:
        report = build_report(data, validation=validation)
        write_json(report, args.report_path, engine=args.engine)

    # Non-SQL sinks return early; SQL rendering is the fallthrough path below.
    if args.out == "json":
        payload = write_json(data, args.output_path, engine=args.engine)
        if not args.output_path:
            print(payload)
        return

    if args.out == "csv":
        out_dir = args.output_path or "./output_csv"
        files = write_csv(data, out_dir, engine=args.engine)
        print("\n".join(files))
        return

    if args.out == "parquet":
        out_dir = args.output_path or "./output_parquet"
        files = write_parquet(data, out_dir, compression=args.parquet_compression)
        print("\n".join(files))
        return

    # SQL output: render INSERT statements per table in dependency order.
    dialect = args.out if args.out in {"postgres", "mysql", "sqlite", "bigquery"} else "postgres"
    chunks = []
    for table_name in order:
        sql = render_insert_sql(
            table_name,
            data.get(table_name, []),
            dialect=dialect,
            engine=args.engine,
            bq_insert_all=bool(args.bq_insert_all),
        )
        if sql:
            chunks.append(sql)
    output = "\n".join(chunks)
    if args.output_path:
        Path(args.output_path).write_text(output, encoding="utf-8")
    else:
        print(output)
|
257
|
+
# Allow direct execution as a script in addition to the console entry point.
if __name__ == "__main__":
    main()
|
datagen/config.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class DistSpec:
    """A value-distribution specification for a column."""

    # Distribution kind identifier, e.g. "normal".
    kind: str
    # Distribution parameters, e.g. {"mean": 35, "std": 7}.
    params: dict[str, Any] = field(default_factory=dict)
|
+
|
|
13
|
+
@dataclass
class ForeignKey:
    """A single-column foreign-key reference from this table to another."""

    # Referencing column in the owning table.
    column: str
    # Referenced table name.
    ref_table: str
    # Referenced column name.
    ref_column: str
|
+
|
|
20
|
+
@dataclass
class ColumnMeta:
    """Metadata for one table column."""

    # Column name.
    name: str
    # SQL type name as written in the schema.
    type_name: str
    # Whether NULL values are permitted.
    nullable: bool = True
    # Whether the column is part of the primary key.
    primary_key: bool = False
    # Whether the column carries a single-column UNIQUE constraint.
    unique: bool = False
    # Maximum length for sized types (e.g. varchar(n)); None when unbounded.
    max_length: int | None = None
|
+
|
|
30
|
+
@dataclass
class UniqueConstraintMeta:
    """A (possibly multi-column) UNIQUE constraint on a table."""

    # Columns covered by the constraint, in declaration order.
    columns: list[str]
    # Optional constraint name.
    name: str | None = None
|
+
|
|
36
|
+
@dataclass
class CheckConstraintMeta:
    """A CHECK constraint captured as its raw SQL expression."""

    # Raw SQL expression text of the CHECK clause.
    expression: str
    # Optional constraint name.
    name: str | None = None
|
+
|
|
42
|
+
@dataclass
class TableMeta:
    """Full metadata for one table: its columns plus declared constraints."""

    name: str
    columns: list[ColumnMeta]
    foreign_keys: list[ForeignKey] = field(default_factory=list)
    unique_constraints: list[UniqueConstraintMeta] = field(default_factory=list)
    check_constraints: list[CheckConstraintMeta] = field(default_factory=list)

    def column(self, name: str) -> ColumnMeta:
        """Return the column named *name*; raise KeyError when absent."""
        found = next((col for col in self.columns if col.name == name), None)
        if found is None:
            raise KeyError(f"Column not found: {name}")
        return found
|
datagen/config_loader.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _load_toml(path: Path) -> dict[str, Any]:
|
|
9
|
+
try:
|
|
10
|
+
import tomllib # py3.11+
|
|
11
|
+
|
|
12
|
+
return tomllib.loads(path.read_text(encoding="utf-8"))
|
|
13
|
+
except ModuleNotFoundError:
|
|
14
|
+
try:
|
|
15
|
+
import tomli
|
|
16
|
+
except Exception as e: # pragma: no cover
|
|
17
|
+
raise SystemExit("TOML config requires tomli on Python 3.10. Install with: pip install tomli") from e
|
|
18
|
+
return tomli.loads(path.read_text(encoding="utf-8"))
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _load_yaml(path: Path) -> dict[str, Any]:
    """Parse a YAML config file; requires the optional PyYAML dependency."""
    try:
        import yaml  # type: ignore
    except Exception as e:  # pragma: no cover
        raise SystemExit("YAML config requires PyYAML. Install with: pip install pyyaml") from e

    loaded = yaml.safe_load(path.read_text(encoding="utf-8"))
    # YAML allows scalar/list documents; only a mapping is a valid config root.
    if not isinstance(loaded, dict):
        raise SystemExit("Config file root must be an object/map")
    return loaded
|
+
|
|
33
|
+
def load_config(path_str: str | None) -> dict[str, Any]:
    """Load a config mapping from a .json/.toml/.yaml/.yml file.

    A falsy *path_str* yields an empty config. Exits with a usage error on a
    missing file, an unsupported extension, or a non-mapping root.
    """
    if not path_str:
        return {}

    cfg_path = Path(path_str)
    if not cfg_path.exists():
        raise SystemExit(f"Config file not found: {cfg_path}")

    suffix = cfg_path.suffix.lower()
    if suffix == ".json":
        data = json.loads(cfg_path.read_text(encoding="utf-8"))
    elif suffix == ".toml":
        data = _load_toml(cfg_path)
    elif suffix in (".yaml", ".yml"):
        data = _load_yaml(cfg_path)
    else:
        raise SystemExit("Unsupported config extension. Use .json/.toml/.yaml/.yml")

    if not isinstance(data, dict):
        raise SystemExit("Config file root must be an object/map")
    return data
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|