ddl2data 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddl2data/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ __all__ = ["__version__"]
2
+ __version__ = "0.3.0"
ddl2data/cli.py ADDED
@@ -0,0 +1,330 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import random
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from faker import Faker
9
+ from sqlalchemy import MetaData, Table, create_engine
10
+
11
+ from ddl2data.config import DistSpec
12
+ from ddl2data.config_loader import load_config
13
+ from ddl2data.generator.base import generate_all
14
+ from ddl2data.generator.dist import parse_dist_arg
15
+ from ddl2data.parser.dynamodb import load_schema_from_dynamodb, parse_dynamodb_extra_attrs
16
+ from ddl2data.parser.ddl import parse_ddl_file
17
+ from ddl2data.parser.graph import generation_order
18
+ from ddl2data.parser.introspect import load_schema_from_db
19
+ from ddl2data.report import build_report
20
+ from ddl2data.validation import validate_check_constraints, validate_generated_data
21
+ from ddl2data.writer.csv_writer import write_csv
22
+ from ddl2data.writer.dynamodb_json_writer import write_dynamodb_json
23
+ from ddl2data.writer.json_writer import write_json
24
+ from ddl2data.writer.postgres import render_insert_sql
25
+ from ddl2data.writer.parquet_writer import write_parquet
26
+
27
+
28
def _insert_via_sqlalchemy(db_url: str, data: dict[str, list[dict[str, Any]]]) -> None:
    """Bulk-insert generated rows into a live database.

    Reflects each table's definition from the target database and executes a
    multi-row INSERT per table inside a single transaction (``engine.begin``),
    so a failure on any table rolls back everything.
    """
    engine = create_engine(db_url)
    metadata = MetaData()
    with engine.begin() as conn:
        for name, rows in data.items():
            # Skip tables with no generated rows; an executemany with an
            # empty parameter list would be pointless (or an error).
            if rows:
                target = Table(name, metadata, autoload_with=conn)
                conn.execute(target.insert(), rows)
37
+
38
+
39
+ def _resolve_tables_from_args(args: argparse.Namespace):
40
+ if args.schema_from_db and args.schema_from_dynamodb:
41
+ raise SystemExit("Choose only one schema source: --schema-from-db or --schema-from-dynamodb")
42
+
43
+ if args.schema_from_dynamodb:
44
+ if not args.dynamodb_table:
45
+ raise SystemExit("--schema-from-dynamodb requires --dynamodb-table")
46
+ try:
47
+ extra_attrs = parse_dynamodb_extra_attrs(args.dynamodb_extra_attr)
48
+ except ValueError as e:
49
+ raise SystemExit(str(e)) from e
50
+ return load_schema_from_dynamodb(
51
+ args.dynamodb_table,
52
+ region_name=args.dynamodb_region,
53
+ extra_attrs=extra_attrs,
54
+ )
55
+
56
+ if args.schema_from_db:
57
+ if not args.db_url:
58
+ raise SystemExit("--schema-from-db requires --db-url")
59
+ engine = create_engine(args.db_url)
60
+ table_names = [x.strip() for x in args.tables.split(",")] if args.tables else None
61
+ return load_schema_from_db(engine, table_names)
62
+
63
+ if not args.ddl:
64
+ raise SystemExit("Provide either --ddl <schema.sql>, --schema-from-db, or --schema-from-dynamodb")
65
+
66
+ return parse_ddl_file(args.ddl)
67
+
68
+
69
+ def _parse_table_rows_map(raw_entries: list[str] | None) -> dict[str, int]:
70
+ out: dict[str, int] = {}
71
+ for entry in raw_entries or []:
72
+ for token in [t.strip() for t in str(entry).split(",") if t.strip()]:
73
+ if "=" not in token:
74
+ raise SystemExit(f"Invalid --table-rows token '{token}'. Use table=count")
75
+ table, count_raw = token.split("=", 1)
76
+ table = table.strip()
77
+ if not table:
78
+ raise SystemExit(f"Invalid --table-rows token '{token}'. Table name cannot be empty")
79
+ try:
80
+ count = int(count_raw.strip())
81
+ except ValueError as e:
82
+ raise SystemExit(f"Invalid row count in --table-rows token '{token}'") from e
83
+ if count < 0:
84
+ raise SystemExit(f"Row count must be >=0 in --table-rows token '{token}'")
85
+ out[table] = count
86
+ return out
87
+
88
+
89
+ def _parse_dist_map(raw_entries: list[str] | None) -> dict[str, DistSpec]:
90
+ out: dict[str, DistSpec] = {}
91
+ for entry in raw_entries or []:
92
+ try:
93
+ key, spec = parse_dist_arg(entry)
94
+ except ValueError as e:
95
+ raise SystemExit(str(e)) from e
96
+ out[key] = spec
97
+ return out
98
+
99
+
100
def build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser.

    Note: optional flags deliberately use ``default=None`` (even store_true
    ones) so that downstream config merging can distinguish "not given on the
    CLI" from an explicit value.
    """
    parser = argparse.ArgumentParser(description="Generate synthetic data from DDL or live DB schema")

    parser.add_argument("--config", help="Path to config file (.json/.toml/.yaml/.yml)")

    # Schema input source
    parser.add_argument("--ddl", help="Path to schema.sql")
    parser.add_argument(
        "--schema-from-db",
        action="store_true",
        default=None,
        help="Introspect tables from --db-url instead of using --ddl",
    )
    parser.add_argument("--tables", help="Comma-separated table names to include when using --schema-from-db")
    parser.add_argument(
        "--schema-from-dynamodb",
        action="store_true",
        default=None,
        help="Load key/index schema from a live DynamoDB table",
    )
    parser.add_argument("--dynamodb-table", help="DynamoDB table name for --schema-from-dynamodb")
    parser.add_argument("--dynamodb-region", help="AWS region for --schema-from-dynamodb")
    parser.add_argument(
        "--dynamodb-extra-attr",
        action="append",
        default=None,
        help="Additional DynamoDB non-key attribute, e.g. email:string (repeatable)",
    )

    # Generation and output
    parser.add_argument("--rows", type=int, help="Default rows per table")
    parser.add_argument(
        "--table-rows",
        action="append",
        default=None,
        help="Per-table row count override, e.g. users=100,orders=200 (repeatable)",
    )
    parser.add_argument(
        "--out",
        choices=["postgres", "mysql", "sqlite", "bigquery", "json", "csv", "parquet", "dynamodb-json"],
        default=None,
    )
    parser.add_argument("--db-url", help="DB URL for schema introspection and/or direct insert")
    parser.add_argument("--insert", action="store_true", default=None, help="Insert generated rows into --db-url")
    parser.add_argument(
        "--dist",
        action="append",
        default=None,
        help="Distribution override, e.g. age:normal,mean=35,std=7 or users.age:normal,mean=35,std=7",
    )
    parser.add_argument("--seed", type=int, default=None, help="Random seed for reproducible generation")
    parser.add_argument("--output-path", default=None, help="Optional output file/path (json/sql file or csv dir)")
    parser.add_argument("--report-path", default=None, help="Optional JSON report path for generated data profile")
    parser.add_argument(
        "--strict-checks",
        action="store_true",
        default=None,
        help="Validate generated rows against supported CHECK constraints",
    )
    parser.add_argument("--engine", choices=["python", "polars"], default=None, help="Generation/render engine")
    parser.add_argument(
        "--bq-insert-all",
        action="store_true",
        default=None,
        help="When --out bigquery, render INSERT ALL syntax",
    )
    parser.add_argument(
        "--parquet-compression",
        choices=["snappy", "zstd", "lz4", "gzip", "none"],
        default=None,
        help="Parquet compression codec (default: snappy)",
    )
    return parser
148
+
149
+
150
def _merge_config(args: argparse.Namespace) -> argparse.Namespace:
    """Overlay config-file values onto parsed CLI args, then apply defaults.

    Precedence per option: explicit CLI value > config-file value > hardcoded
    default. All CLI flags are declared with ``default=None`` so "not given"
    is detectable here. Mutates and returns *args*. Exits via SystemExit when
    no row count is available or a negative count is given.
    """
    cfg = load_config(args.config)

    # config as base: fill in every scalar option the CLI left unset
    for key in [
        "ddl",
        "schema_from_db",
        "schema_from_dynamodb",
        "tables",
        "dynamodb_table",
        "dynamodb_region",
        "rows",
        "out",
        "db_url",
        "insert",
        "seed",
        "output_path",
        "report_path",
        "strict_checks",
        "engine",
        "bq_insert_all",
        "parquet_compression",
    ]:
        if getattr(args, key) is None:
            if key in cfg:
                setattr(args, key, cfg[key])

    # Hardcoded fallbacks for anything neither CLI nor config supplied.
    defaults: dict[str, Any] = {
        "out": "postgres",
        "dist": [],
        "schema_from_db": False,
        "schema_from_dynamodb": False,
        "insert": False,
        "engine": "python",
        "table_rows": [],
        "dynamodb_extra_attr": [],
        "bq_insert_all": False,
        "parquet_compression": "snappy",
        "strict_checks": False,
    }

    for k, v in defaults.items():
        if getattr(args, k) is None:
            setattr(args, k, v)

    # dist handling (CLI wins): repeatable list option, so the whole CLI list
    # replaces the config list rather than merging with it.
    cfg_dist = cfg.get("dist", []) if isinstance(cfg, dict) else []
    if not isinstance(cfg_dist, list):
        cfg_dist = []
    cli_dist = args.dist or []
    if cli_dist:
        args.dist = cli_dist
    else:
        args.dist = [str(x) for x in cfg_dist]

    # dynamodb_extra_attr: config may be a map {name: type} or a list of
    # "name:type" strings; normalize either to the CLI's string form.
    cfg_dynamodb_extra = cfg.get("dynamodb_extra_attr", []) if isinstance(cfg, dict) else []
    if isinstance(cfg_dynamodb_extra, dict):
        cfg_dynamodb_extra = [f"{k}:{v}" for k, v in cfg_dynamodb_extra.items()]
    elif not isinstance(cfg_dynamodb_extra, list):
        cfg_dynamodb_extra = []
    cli_dynamodb_extra = args.dynamodb_extra_attr or []
    if cli_dynamodb_extra:
        args.dynamodb_extra_attr = cli_dynamodb_extra
    else:
        args.dynamodb_extra_attr = [str(x) for x in cfg_dynamodb_extra]

    # table-rows: CLI value or config map/list, normalized to "table=count"
    # strings for _parse_table_rows_map.
    if args.table_rows:
        table_rows_raw = args.table_rows
    else:
        cfg_tr = cfg.get("table_rows", {}) if isinstance(cfg, dict) else {}
        if isinstance(cfg_tr, dict):
            table_rows_raw = [f"{k}={v}" for k, v in cfg_tr.items()]
        elif isinstance(cfg_tr, list):
            table_rows_raw = [str(x) for x in cfg_tr]
        else:
            table_rows_raw = []
    args.table_rows = table_rows_raw

    if args.rows is None and not args.table_rows:
        raise SystemExit("--rows is required unless --table-rows is set (or configured)")

    # rows may legitimately stay unset when per-table counts cover everything;
    # 0 means "no default rows" for tables without an override.
    if args.rows is None:
        args.rows = 0
    elif args.rows < 0:
        raise SystemExit("--rows must be >= 0")

    # NOTE(review): unreachable — the defaults loop above already set
    # args.out to "postgres" when it was None. Kept for safety.
    if args.out is None:
        args.out = "postgres"

    return args
241
+
242
+
243
def main() -> None:
    """CLI entry point: parse args, generate synthetic rows, then emit them.

    Output targets: direct DB insert (--insert), JSON, CSV, Parquet,
    DynamoDB-JSON, or rendered INSERT SQL for one of the supported dialects
    (the default path). A JSON profile report is written when --report-path
    is given.
    """
    args = build_parser().parse_args()
    args = _merge_config(args)

    # Seed both random sources up front so generation is reproducible.
    if args.seed is not None:
        random.seed(args.seed)
        Faker.seed(args.seed)

    tables = _resolve_tables_from_args(args)
    # Generate parents before children so FK values can reference existing rows.
    order = generation_order(tables)
    dist_map = _parse_dist_map(args.dist)
    table_rows_map = _parse_table_rows_map(args.table_rows)
    data = generate_all(
        tables,
        order,
        int(args.rows),
        dist_map,
        table_rows=table_rows_map,
        engine=args.engine,
    )

    if args.insert:
        if not args.db_url:
            raise SystemExit("--insert requires --db-url")
        _insert_via_sqlalchemy(args.db_url, data)

    # Always validate; CHECK-constraint validation is opt-in via --strict-checks
    # and is folded into the same validation summary dict.
    validation = validate_generated_data(tables, data)
    if args.strict_checks:
        check_validation = validate_check_constraints(tables, data)
        validation["counts"]["check_violations"] = check_validation["check_violations"]
        validation["sample_issues"].extend(check_validation["check_details"])
        validation["counts"]["sample_issue_count"] = len(validation["sample_issues"])
        if check_validation["check_violations"] > 0:
            validation["pass"] = False
            validation["counts"]["total_failures"] += check_validation["check_violations"]

    if args.report_path:
        report = build_report(data, validation=validation)
        write_json(report, args.report_path, engine=args.engine)

    # File-style outputs print to stdout when no --output-path is given.
    if args.out == "json":
        payload = write_json(data, args.output_path, engine=args.engine)
        if not args.output_path:
            print(payload)
        return

    if args.out == "csv":
        out_dir = args.output_path or "./output_csv"
        files = write_csv(data, out_dir, engine=args.engine)
        print("\n".join(files))
        return

    if args.out == "parquet":
        out_dir = args.output_path or "./output_parquet"
        files = write_parquet(data, out_dir, compression=args.parquet_compression)
        print("\n".join(files))
        return

    if args.out == "dynamodb-json":
        payload = write_dynamodb_json(tables, data, args.output_path)
        # A list return is treated as lines to echo; otherwise the payload is
        # printed only when it was not written to a file.
        if isinstance(payload, list):
            print("\n".join(payload))
        else:
            if not args.output_path:
                print(payload)
        return

    # Default path: render INSERT statements for a SQL dialect.
    dialect = args.out if args.out in {"postgres", "mysql", "sqlite", "bigquery"} else "postgres"
    chunks = []
    for table_name in order:
        sql = render_insert_sql(
            table_name,
            data.get(table_name, []),
            dialect=dialect,
            engine=args.engine,
            bq_insert_all=bool(args.bq_insert_all),
        )
        if sql:
            chunks.append(sql)
    output = "\n".join(chunks)
    if args.output_path:
        Path(args.output_path).write_text(output, encoding="utf-8")
    else:
        print(output)
327
+
328
+
329
# Script entry point: allow running this module directly.
if __name__ == "__main__":
    main()
ddl2data/config.py ADDED
@@ -0,0 +1,55 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+
6
+
7
@dataclass
class DistSpec:
    """A value-distribution override for a generated column (see --dist in the CLI)."""

    kind: str  # distribution identifier, e.g. "normal" in "age:normal,mean=35,std=7"
    params: dict[str, Any] = field(default_factory=dict)  # distribution parameters, e.g. {"mean": 35, "std": 7}
11
+
12
+
13
@dataclass
class ForeignKey:
    """A single-column foreign-key reference from one table to another."""

    column: str  # referencing column on the table that owns this FK
    ref_table: str  # referenced (parent) table name
    ref_column: str  # referenced column name
18
+
19
+
20
@dataclass
class ColumnMeta:
    """Schema metadata for a single table column."""

    name: str
    type_name: str  # type name as parsed/introspected from the schema source
    nullable: bool = True
    primary_key: bool = False
    unique: bool = False
    max_length: int | None = None  # length limit, when the type declares one
    extra: dict[str, Any] = field(default_factory=dict)  # source-specific extras; contents depend on the parser backend (not shown here)
29
+
30
+
31
@dataclass
class UniqueConstraintMeta:
    """A (possibly composite) UNIQUE constraint over one or more columns."""

    columns: list[str]  # column names covered by the constraint
    name: str | None = None  # constraint name, when the source schema provides one
35
+
36
+
37
@dataclass
class CheckConstraintMeta:
    """A CHECK constraint captured as its raw SQL expression text."""

    expression: str  # the constraint expression as written in the schema
    name: str | None = None  # constraint name, when the source schema provides one
41
+
42
+
43
@dataclass
class TableMeta:
    """Schema metadata for one table: its columns plus FK/unique/check constraints."""

    name: str
    columns: list[ColumnMeta]
    foreign_keys: list[ForeignKey] = field(default_factory=list)
    unique_constraints: list[UniqueConstraintMeta] = field(default_factory=list)
    check_constraints: list[CheckConstraintMeta] = field(default_factory=list)

    def column(self, name: str) -> ColumnMeta:
        """Return the first column whose name matches *name*.

        Raises KeyError when no such column exists.
        """
        found = next((col for col in self.columns if col.name == name), None)
        if found is None:
            raise KeyError(f"Column not found: {name}")
        return found
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+
8
+ def _load_toml(path: Path) -> dict[str, Any]:
9
+ try:
10
+ import tomllib # py3.11+
11
+
12
+ return tomllib.loads(path.read_text(encoding="utf-8"))
13
+ except ModuleNotFoundError:
14
+ try:
15
+ import tomli
16
+ except Exception as e: # pragma: no cover
17
+ raise SystemExit("TOML config requires tomli on Python 3.10. Install with: pip install tomli") from e
18
+ return tomli.loads(path.read_text(encoding="utf-8"))
19
+
20
+
21
def _load_yaml(path: Path) -> dict[str, Any]:
    """Load a YAML config file into a dict.

    Exits with an install hint when PyYAML is not available, and rejects
    documents whose root is not a mapping.
    """
    try:
        import yaml  # type: ignore
    except Exception as e:  # pragma: no cover
        raise SystemExit("YAML config requires PyYAML. Install with: pip install pyyaml") from e

    parsed = yaml.safe_load(path.read_text(encoding="utf-8"))
    if isinstance(parsed, dict):
        return parsed
    raise SystemExit("Config file root must be an object/map")
31
+
32
+
33
def load_config(path_str: str | None) -> dict[str, Any]:
    """Read a config file (.json/.toml/.yaml/.yml) and return its contents.

    An empty or ``None`` path yields an empty config. Exits via SystemExit
    on a missing file, an unsupported extension, or a document whose root
    is not a mapping.
    """
    if not path_str:
        return {}

    path = Path(path_str)
    if not path.exists():
        raise SystemExit(f"Config file not found: {path}")

    suffix = path.suffix.lower()
    if suffix == ".json":
        parsed = json.loads(path.read_text(encoding="utf-8"))
    elif suffix == ".toml":
        parsed = _load_toml(path)
    elif suffix in (".yaml", ".yml"):
        parsed = _load_yaml(path)
    else:
        raise SystemExit("Unsupported config extension. Use .json/.toml/.yaml/.yml")

    if not isinstance(parsed, dict):
        raise SystemExit("Config file root must be an object/map")
    return parsed
@@ -0,0 +1 @@
1
+