sqlseed 0.1.9__tar.gz → 0.1.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {sqlseed-0.1.9 → sqlseed-0.1.10}/PKG-INFO +1 -1
  2. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/server.py +2 -0
  3. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/__init__.py +5 -0
  4. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/cli/main.py +3 -0
  5. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/config/models.py +3 -2
  6. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/column_dag.py +12 -0
  7. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/mapper.py +18 -2
  8. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/orchestrator.py +277 -4
  9. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/database/sqlite_utils_adapter.py +19 -5
  10. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/stream.py +1 -1
  11. {sqlseed-0.1.9 → sqlseed-0.1.10}/.gitignore +0 -0
  12. {sqlseed-0.1.9 → sqlseed-0.1.10}/CHANGELOG.md +0 -0
  13. {sqlseed-0.1.9 → sqlseed-0.1.10}/LICENSE +0 -0
  14. {sqlseed-0.1.9 → sqlseed-0.1.10}/README.md +0 -0
  15. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/README.md +0 -0
  16. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/pyproject.toml +0 -0
  17. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/__init__.py +0 -0
  18. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/__main__.py +0 -0
  19. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/config.py +0 -0
  20. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/README.md +0 -0
  21. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/pyproject.toml +0 -0
  22. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/__init__.py +0 -0
  23. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/_client.py +0 -0
  24. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/_json_utils.py +0 -0
  25. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/analyzer.py +0 -0
  26. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/config.py +0 -0
  27. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/errors.py +0 -0
  28. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/examples.py +0 -0
  29. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/nl_config.py +0 -0
  30. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/provider.py +0 -0
  31. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/refiner.py +0 -0
  32. {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/suggest.py +0 -0
  33. {sqlseed-0.1.9 → sqlseed-0.1.10}/pyproject.toml +0 -0
  34. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/__init__.py +0 -0
  35. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/logger.py +0 -0
  36. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/metrics.py +0 -0
  37. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/progress.py +0 -0
  38. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/schema_helpers.py +0 -0
  39. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/sql_safe.py +0 -0
  40. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_version.py +0 -0
  41. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/cli/__init__.py +0 -0
  42. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/config/__init__.py +0 -0
  43. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/config/loader.py +0 -0
  44. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/config/snapshot.py +0 -0
  45. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/__init__.py +0 -0
  46. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/constraints.py +0 -0
  47. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/expression.py +0 -0
  48. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/relation.py +0 -0
  49. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/result.py +0 -0
  50. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/schema.py +0 -0
  51. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/transform.py +0 -0
  52. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/database/__init__.py +0 -0
  53. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/database/_protocol.py +0 -0
  54. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/database/optimizer.py +0 -0
  55. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/database/raw_sqlite_adapter.py +0 -0
  56. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/__init__.py +0 -0
  57. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/_protocol.py +0 -0
  58. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/base_provider.py +0 -0
  59. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/faker_provider.py +0 -0
  60. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/mimesis_provider.py +0 -0
  61. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/registry.py +0 -0
  62. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/plugins/__init__.py +0 -0
  63. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/plugins/hookspecs.py +0 -0
  64. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/plugins/manager.py +0 -0
  65. {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sqlseed
3
- Version: 0.1.9
3
+ Version: 0.1.10
4
4
  Summary: Declarative SQLite test data generation toolkit
5
5
  Project-URL: Homepage, https://github.com/sunbos/sqlseed
6
6
  Project-URL: Documentation, https://github.com/sunbos/sqlseed#readme
@@ -109,6 +109,7 @@ def sqlseed_execute_fill(
109
109
  table_name: str,
110
110
  count: int = 1000,
111
111
  yaml_config: str | None = None,
112
+ enrich: bool = False,
112
113
  ) -> dict[str, Any]:
113
114
  """Execute data generation for a table. Optionally provide YAML config string for column rules."""
114
115
  from sqlseed.core.orchestrator import DataOrchestrator
@@ -138,6 +139,7 @@ def sqlseed_execute_fill(
138
139
  column_configs=column_configs,
139
140
  clear_before=clear_before,
140
141
  seed=seed,
142
+ enrich=enrich,
141
143
  )
142
144
 
143
145
  return {
@@ -42,6 +42,7 @@ def fill(
42
42
  batch_size: int = 5000,
43
43
  clear_before: bool = False,
44
44
  optimize_pragma: bool = True,
45
+ enrich: bool = False,
45
46
  ) -> GenerationResult:
46
47
  with DataOrchestrator(
47
48
  db_path=db_path,
@@ -56,6 +57,7 @@ def fill(
56
57
  seed=seed,
57
58
  batch_size=batch_size,
58
59
  clear_before=clear_before,
60
+ enrich=enrich,
59
61
  )
60
62
 
61
63
 
@@ -92,6 +94,7 @@ def fill_from_config(config_path: str) -> list[GenerationResult]:
92
94
  clear_before=table_config.clear_before,
93
95
  column_configs=table_config.columns,
94
96
  transform=table_config.transform,
97
+ enrich=table_config.enrich,
95
98
  )
96
99
  results.append(result)
97
100
  return results
@@ -106,6 +109,7 @@ def preview(
106
109
  provider: str = "mimesis",
107
110
  locale: str = "en_US",
108
111
  seed: int | None = None,
112
+ enrich: bool = False,
109
113
  ) -> list[dict[str, Any]]:
110
114
  with DataOrchestrator(
111
115
  db_path=db_path,
@@ -118,4 +122,5 @@ def preview(
118
122
  count=count,
119
123
  columns=columns,
120
124
  seed=seed,
125
+ enrich=enrich,
121
126
  )
@@ -24,6 +24,7 @@ def cli() -> None:
24
24
  @click.option("--config", "-c", "config_path", default=None, help="YAML/JSON config file path")
25
25
  @click.option("--transform", "transform_path", default=None, help="Python transform script path")
26
26
  @click.option("--snapshot", is_flag=True, help="Save generation snapshot for replay")
27
+ @click.option("--enrich", is_flag=True, help="Enrich data using existing table distribution")
27
28
  def fill(
28
29
  db_path: str | None,
29
30
  table: str | None,
@@ -36,6 +37,7 @@ def fill(
36
37
  config_path: str | None,
37
38
  transform_path: str | None,
38
39
  snapshot: bool,
40
+ enrich: bool,
39
41
  ) -> None:
40
42
  """Fill a table with generated test data.
41
43
 
@@ -73,6 +75,7 @@ def fill(
73
75
  seed=seed,
74
76
  batch_size=batch_size,
75
77
  clear_before=clear,
78
+ enrich=enrich,
76
79
  )
77
80
  click.echo(str(result))
78
81
 
@@ -72,9 +72,10 @@ class TableConfig(BaseModel):
72
72
  count: int = Field(default=1000, gt=0)
73
73
  batch_size: int = Field(default=5000, gt=0)
74
74
  columns: list[ColumnConfig] = Field(default_factory=list)
75
- clear_before: bool = False # 默认不清空,保护原始数据
75
+ clear_before: bool = False
76
76
  seed: int | None = None
77
- transform: str | None = None # [NEW] Python 变换脚本路径
77
+ transform: str | None = None
78
+ enrich: bool = False
78
79
 
79
80
 
80
81
  class ColumnAssociation(BaseModel):
@@ -40,9 +40,12 @@ class ColumnDAG:
40
40
  self,
41
41
  specs: dict[str, GeneratorSpec],
42
42
  column_configs: list[Any] | None = None,
43
+ unique_columns: set[str] | None = None,
44
+ composite_unique_indexes: list[list[str]] | None = None,
43
45
  ) -> list[ColumnNode]:
44
46
  nodes: dict[str, ColumnNode] = {}
45
47
  config_map: dict[str, Any] = {}
48
+ unique_columns = unique_columns or set()
46
49
 
47
50
  if column_configs:
48
51
  for cc in column_configs:
@@ -69,6 +72,15 @@ class ColumnDAG:
69
72
  is_derived = True
70
73
  final_spec = GeneratorSpec(generator_name="__derive__")
71
74
 
75
+ if col_name in unique_columns:
76
+ if constraints is None:
77
+ constraints = ColumnConstraints(unique=True)
78
+ elif not constraints.unique:
79
+ constraints = ColumnConstraints(
80
+ unique=True,
81
+ max_retries=constraints.max_retries,
82
+ )
83
+
72
84
  nodes[col_name] = ColumnNode(
73
85
  name=col_name,
74
86
  generator_spec=final_spec,
@@ -181,7 +181,14 @@ class ColumnMapper:
181
181
  def register_pattern_rule(self, pattern: str, generator: str, params: dict[str, Any] | None = None) -> None:
182
182
  self._custom_pattern_rules.append((pattern, generator, params or {}))
183
183
 
184
- def map_column(self, column_info: ColumnInfo, user_config: Any = None) -> GeneratorSpec:
184
+ def map_column(
185
+ self,
186
+ column_info: ColumnInfo,
187
+ user_config: Any = None,
188
+ *,
189
+ enrich: bool = False,
190
+ force_type_infer: bool = False,
191
+ ) -> GeneratorSpec:
185
192
  column_name = column_info.name.lower()
186
193
  column_type = column_info.type.upper() if column_info.type else "TEXT"
187
194
 
@@ -219,6 +226,13 @@ class ColumnMapper:
219
226
  return GeneratorSpec(generator_name=gen, params=params)
220
227
 
221
228
  if column_info.default is not None or column_info.nullable:
229
+ if force_type_infer:
230
+ return self._type_faithful_fallback(column_type)
231
+ if enrich:
232
+ return GeneratorSpec(
233
+ generator_name="__enrich__",
234
+ params={"_default": column_info.default, "_nullable": column_info.nullable},
235
+ )
222
236
  return GeneratorSpec(generator_name="skip")
223
237
 
224
238
  return self._type_faithful_fallback(column_type)
@@ -248,10 +262,12 @@ class ColumnMapper:
248
262
  self,
249
263
  columns: list[ColumnInfo],
250
264
  user_configs: dict[str, Any] | None = None,
265
+ *,
266
+ enrich: bool = False,
251
267
  ) -> dict[str, GeneratorSpec]:
252
268
  user_configs = user_configs or {}
253
269
  result: dict[str, GeneratorSpec] = {}
254
270
  for col in columns:
255
271
  col_config = user_configs.get(col.name)
256
- result[col.name] = self.map_column(col, col_config)
272
+ result[col.name] = self.map_column(col, col_config, enrich=enrich)
257
273
  return result
@@ -1,6 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import contextlib
4
+ import math
5
+ import re
4
6
  import time
5
7
  from typing import TYPE_CHECKING, Any, ClassVar
6
8
 
@@ -91,6 +93,7 @@ class DataOrchestrator:
91
93
  clear_before: bool = False,
92
94
  column_configs: list[Any] | None = None,
93
95
  transform: str | None = None,
96
+ enrich: bool = False,
94
97
  ) -> GenerationResult:
95
98
  self._ensure_connected()
96
99
  start_time = time.monotonic()
@@ -106,14 +109,17 @@ class DataOrchestrator:
106
109
 
107
110
  column_infos = self._schema.get_column_info(table_name)
108
111
  user_configs = self._resolve_user_configs(columns, column_configs)
109
- generator_specs = self._mapper.map_columns(column_infos, user_configs)
112
+ generator_specs = self._mapper.map_columns(column_infos, user_configs, enrich=enrich)
113
+ unique_columns = self._detect_unique_columns(table_name)
114
+ generator_specs = self._apply_enrich(table_name, generator_specs, column_infos, unique_columns)
115
+ generator_specs = self._adjust_specs_for_unique(generator_specs, unique_columns, count, column_infos)
110
116
  generator_specs = self._resolve_foreign_keys(table_name, generator_specs)
111
117
  generator_specs = self._apply_ai_suggestions(table_name, column_infos, generator_specs)
112
118
  generator_specs = self._apply_template_pool(table_name, column_infos, generator_specs, count)
113
119
 
114
120
  dag = ColumnDAG()
115
121
  col_configs_list = list(user_configs.values()) if user_configs else None
116
- dag_nodes = dag.build(generator_specs, col_configs_list)
122
+ dag_nodes = dag.build(generator_specs, col_configs_list, unique_columns=unique_columns)
117
123
 
118
124
  expr_engine = ExpressionEngine()
119
125
  constraint_solver = ConstraintSolver()
@@ -207,17 +213,21 @@ class DataOrchestrator:
207
213
  seed: int | None = None,
208
214
  transform: str | None = None,
209
215
  column_configs: list[Any] | None = None,
216
+ enrich: bool = False,
210
217
  ) -> list[dict[str, Any]]:
211
218
  self._ensure_connected()
212
219
 
213
220
  column_infos = self._schema.get_column_info(table_name)
214
221
  user_configs = self._resolve_user_configs(columns, column_configs)
215
- generator_specs = self._mapper.map_columns(column_infos, user_configs)
222
+ generator_specs = self._mapper.map_columns(column_infos, user_configs, enrich=enrich)
223
+ unique_columns = self._detect_unique_columns(table_name)
224
+ generator_specs = self._apply_enrich(table_name, generator_specs, column_infos, unique_columns)
225
+ generator_specs = self._adjust_specs_for_unique(generator_specs, unique_columns, count, column_infos)
216
226
  generator_specs = self._resolve_foreign_keys(table_name, generator_specs)
217
227
 
218
228
  dag = ColumnDAG()
219
229
  col_configs_list = list(user_configs.values()) if user_configs else None
220
- dag_nodes = dag.build(generator_specs, col_configs_list)
230
+ dag_nodes = dag.build(generator_specs, col_configs_list, unique_columns=unique_columns)
221
231
 
222
232
  expr_engine = ExpressionEngine()
223
233
  constraint_solver = ConstraintSolver()
@@ -294,6 +304,267 @@ class DataOrchestrator:
294
304
  lines.append(f" {table}: {count} rows")
295
305
  return "\n".join(lines)
296
306
 
307
+ _ENUM_NAME_PATTERNS: ClassVar[list[str]] = [
308
+ r"^[bB]y[A-Za-z]",
309
+ r".*_type$",
310
+ r".*_status$",
311
+ r"^is_.*",
312
+ r"^has_.*",
313
+ r"^can_.*",
314
+ r".*_level$",
315
+ r".*_category$",
316
+ r".*_class$",
317
+ r".*_flag$",
318
+ r".*_kind$",
319
+ r".*_grade$",
320
+ r".*_rank$",
321
+ r".*_tier$",
322
+ r".*_mode$",
323
+ r".*_stage$",
324
+ r".*_phase$",
325
+ r".*_state$",
326
+ r".*_group$",
327
+ ]
328
+
329
+ _SMALL_INT_TYPES: ClassVar[tuple[str, ...]] = ("INT8", "INT16", "TINYINT", "SMALLINT")
330
+
331
+ def _is_enumeration_column(
332
+ self,
333
+ col_name: str,
334
+ col_info: Any,
335
+ distinct_count: int,
336
+ total_rows: int,
337
+ is_unique: bool,
338
+ ) -> bool:
339
+ if is_unique:
340
+ return False
341
+
342
+ if total_rows == 0 or distinct_count == 0:
343
+ return False
344
+
345
+ cardinality_ratio = distinct_count / total_rows
346
+
347
+ name_matches_enum = any(re.match(p, col_name) for p in self._ENUM_NAME_PATTERNS)
348
+
349
+ col_type_upper = col_info.type.upper() if col_info and hasattr(col_info, "type") else ""
350
+ is_small_int = any(t in col_type_upper for t in self._SMALL_INT_TYPES)
351
+
352
+ return (
353
+ (name_matches_enum and cardinality_ratio < 0.1)
354
+ or (is_small_int and cardinality_ratio < 0.1)
355
+ or (distinct_count <= 10 and cardinality_ratio < 0.05)
356
+ or (
357
+ distinct_count <= 30
358
+ and cardinality_ratio < 0.01
359
+ and "CHAR" not in col_type_upper
360
+ and "TEXT" not in col_type_upper
361
+ )
362
+ )
363
+
364
+ def _apply_enrich(
365
+ self,
366
+ table_name: str,
367
+ specs: dict[str, GeneratorSpec],
368
+ column_infos: list[Any],
369
+ unique_columns: set[str] | None = None,
370
+ ) -> dict[str, GeneratorSpec]:
371
+ has_enrich = any(s.generator_name == "__enrich__" for s in specs.values())
372
+ if not has_enrich:
373
+ return specs
374
+
375
+ unique_columns = unique_columns or set()
376
+ row_count = self._db.get_row_count(table_name)
377
+ if row_count == 0:
378
+ for col_name, spec in specs.items():
379
+ if spec.generator_name == "__enrich__":
380
+ specs[col_name] = GeneratorSpec(generator_name="skip")
381
+ return specs
382
+
383
+ for col_name, spec in list(specs.items()):
384
+ if spec.generator_name != "__enrich__":
385
+ continue
386
+ is_unique = col_name in unique_columns
387
+ specs[col_name] = self._build_enriched_spec(table_name, col_name, spec, column_infos, is_unique)
388
+
389
+ return specs
390
+
391
+ def _build_enriched_spec(
392
+ self,
393
+ table_name: str,
394
+ col_name: str,
395
+ spec: GeneratorSpec,
396
+ column_infos: list[Any],
397
+ is_unique: bool = False,
398
+ ) -> GeneratorSpec:
399
+ col_info = next((c for c in column_infos if c.name == col_name), None)
400
+
401
+ try:
402
+ values = self._db.get_column_values(table_name, col_name, limit=10000)
403
+ except Exception:
404
+ return GeneratorSpec(generator_name="skip")
405
+
406
+ if not values:
407
+ return GeneratorSpec(generator_name="skip")
408
+
409
+ null_count = sum(1 for v in values if v is None)
410
+ non_null_values = [v for v in values if v is not None]
411
+ null_ratio = round(null_count / len(values), 3) if values else 0.0
412
+
413
+ if not non_null_values:
414
+ return GeneratorSpec(generator_name="skip")
415
+
416
+ if col_info and not col_info.nullable:
417
+ null_ratio = 0.0
418
+
419
+ if is_unique:
420
+ null_ratio = 0.0
421
+
422
+ distinct_values = list(set(non_null_values))
423
+ distinct_count = len(distinct_values)
424
+ row_count = self._db.get_row_count(table_name)
425
+
426
+ if self._is_enumeration_column(col_name, col_info, distinct_count, row_count, is_unique):
427
+ choices = distinct_values
428
+ if col_info and "INT" in col_info.type.upper():
429
+ choices = [int(v) if isinstance(v, (int, float, str)) else v for v in choices]
430
+ return GeneratorSpec(
431
+ generator_name="choice",
432
+ params={"choices": choices},
433
+ null_ratio=null_ratio,
434
+ )
435
+
436
+ if col_info:
437
+ fallback_spec = self._mapper.map_column(col_info, force_type_infer=True)
438
+ if fallback_spec.generator_name != "skip":
439
+ return GeneratorSpec(
440
+ generator_name=fallback_spec.generator_name,
441
+ params=fallback_spec.params,
442
+ null_ratio=null_ratio,
443
+ provider=fallback_spec.provider,
444
+ )
445
+
446
+ return GeneratorSpec(generator_name="skip")
447
+
448
+ def _detect_unique_columns(self, table_name: str) -> set[str]:
449
+ unique_cols: set[str] = set()
450
+ try:
451
+ indexes = self._schema.get_index_info(table_name)
452
+ for idx in indexes:
453
+ if idx.unique and len(idx.columns) == 1:
454
+ unique_cols.add(idx.columns[0])
455
+ except Exception:
456
+ logger.debug("Failed to detect unique constraints from indexes", table_name=table_name)
457
+
458
+ try:
459
+ pks = self._db.get_primary_keys(table_name)
460
+ column_infos = self._schema.get_column_info(table_name)
461
+ autoincrement_pks = {c.name for c in column_infos if c.is_primary_key and c.is_autoincrement}
462
+ for pk in pks:
463
+ if pk not in autoincrement_pks:
464
+ unique_cols.add(pk)
465
+ except Exception:
466
+ logger.debug("Failed to detect PK unique constraints", table_name=table_name)
467
+
468
+ return unique_cols
469
+
470
+ def _adjust_specs_for_unique(
471
+ self,
472
+ specs: dict[str, GeneratorSpec],
473
+ unique_columns: set[str],
474
+ count: int,
475
+ column_infos: list[Any] | None = None,
476
+ ) -> dict[str, GeneratorSpec]:
477
+ for col_name in unique_columns:
478
+ if col_name not in specs:
479
+ continue
480
+ spec = specs[col_name]
481
+ if spec.generator_name == "skip":
482
+ continue
483
+
484
+ if spec.generator_name == "string":
485
+ params = dict(spec.params)
486
+ charset_size = 62
487
+ if params.get("charset") == "digits":
488
+ charset_size = 10
489
+ elif params.get("charset") == "alpha":
490
+ charset_size = 52
491
+
492
+ max_length = params.get("max_length", 50)
493
+ min_needed = max(1, math.ceil(math.log(max(count * count * 50, 1)) / math.log(charset_size)))
494
+ current_min = params.get("min_length", 1)
495
+ params["min_length"] = max(current_min, min_needed)
496
+
497
+ if params["min_length"] > max_length:
498
+ if params.get("charset") is None:
499
+ params["charset"] = "alphanumeric"
500
+ charset_size = 62
501
+ min_needed = max(1, math.ceil(math.log(max(count * count * 50, 1)) / math.log(charset_size)))
502
+ params["min_length"] = max(current_min, min_needed)
503
+ if params["min_length"] > max_length:
504
+ logger.warning(
505
+ "Cannot guarantee uniqueness for VARCHAR(%d) with count=%d",
506
+ max_length,
507
+ count,
508
+ column=col_name,
509
+ )
510
+ params["max_length"] = max(params["min_length"], max_length)
511
+ elif params["max_length"] < params["min_length"]:
512
+ params["max_length"] = params["min_length"]
513
+
514
+ specs[col_name] = GeneratorSpec(
515
+ generator_name=spec.generator_name,
516
+ params=params,
517
+ null_ratio=spec.null_ratio,
518
+ provider=spec.provider,
519
+ )
520
+
521
+ elif spec.generator_name == "integer":
522
+ params = dict(spec.params)
523
+ min_val = params.get("min_value", 0)
524
+ max_val = params.get("max_value", 999999)
525
+ if max_val - min_val < count * 10:
526
+ col_info = next((c for c in (column_infos or []) if c.name == col_name), None)
527
+ if col_info:
528
+ col_type_upper = col_info.type.upper()
529
+ if "INT8" in col_type_upper and count > 255:
530
+ logger.warning(
531
+ "INT8 column with UNIQUE constraint cannot guarantee uniqueness for count > 255",
532
+ column=col_name,
533
+ count=count,
534
+ )
535
+ elif "INT16" in col_type_upper and count > 65535:
536
+ logger.warning(
537
+ "INT16 column with UNIQUE constraint cannot guarantee uniqueness for count > 65535",
538
+ column=col_name,
539
+ count=count,
540
+ )
541
+ params["max_value"] = min_val + count * 10
542
+ specs[col_name] = GeneratorSpec(
543
+ generator_name=spec.generator_name,
544
+ params=params,
545
+ null_ratio=spec.null_ratio,
546
+ provider=spec.provider,
547
+ )
548
+
549
+ elif spec.generator_name == "choice":
550
+ choices = spec.params.get("choices", [])
551
+ if len(choices) < count:
552
+ col_info = None
553
+ if column_infos:
554
+ col_info = next((c for c in column_infos if c.name == col_name), None)
555
+ if col_info:
556
+ fallback = self._mapper.map_column(col_info, force_type_infer=True)
557
+ if fallback.generator_name not in ("skip", "choice"):
558
+ specs[col_name] = GeneratorSpec(
559
+ generator_name=fallback.generator_name,
560
+ params=fallback.params,
561
+ null_ratio=spec.null_ratio,
562
+ provider=fallback.provider,
563
+ )
564
+ specs = self._adjust_specs_for_unique(specs, {col_name}, count, column_infos)
565
+
566
+ return specs
567
+
297
568
  def _resolve_user_configs(
298
569
  self,
299
570
  columns: dict[str, Any] | None,
@@ -551,6 +822,7 @@ class DataOrchestrator:
551
822
  clear_before: bool = False,
552
823
  column_configs: list[Any] | None = None,
553
824
  transform: str | None = None,
825
+ enrich: bool = False,
554
826
  ) -> GenerationResult:
555
827
  return self.fill_table(
556
828
  table_name=table_name,
@@ -561,6 +833,7 @@ class DataOrchestrator:
561
833
  clear_before=clear_before,
562
834
  column_configs=column_configs,
563
835
  transform=transform,
836
+ enrich=enrich,
564
837
  )
565
838
 
566
839
  def close(self) -> None:
@@ -134,17 +134,31 @@ class SQLiteUtilsAdapter:
134
134
  ) -> int:
135
135
  inserted = 0
136
136
  batch: list[dict[str, Any]] = []
137
- for row in data:
137
+ for item in data:
138
+ row = item
139
+ if not row:
140
+ row = {}
138
141
  batch.append(row)
139
142
  if len(batch) >= batch_size:
140
- self._db[table_name].insert_all(batch)
141
- inserted += len(batch)
143
+ inserted += self._insert_batch(table_name, batch)
142
144
  batch = []
143
145
  if batch:
144
- self._db[table_name].insert_all(batch)
145
- inserted += len(batch)
146
+ inserted += self._insert_batch(table_name, batch)
146
147
  return inserted
147
148
 
149
+ def _insert_batch(self, table_name: str, batch: list[dict[str, Any]]) -> int:
150
+ if not batch:
151
+ return 0
152
+ if batch[0]:
153
+ self._db[table_name].insert_all(batch)
154
+ return len(batch)
155
+ safe_table = quote_identifier(table_name)
156
+ conn = self._db.conn
157
+ for _ in batch:
158
+ conn.execute(f"INSERT INTO {safe_table} DEFAULT VALUES")
159
+ conn.commit()
160
+ return len(batch)
161
+
148
162
  def clear_table(self, table_name: str) -> None:
149
163
  safe_table = quote_identifier(table_name)
150
164
  self._db.execute(f"DELETE FROM {safe_table}")
@@ -117,7 +117,7 @@ class DataStream:
117
117
  total_retries += 1
118
118
  continue
119
119
 
120
- if generated_values:
120
+ if generated_values or not any(not n.is_skip for n in self._nodes):
121
121
  if self._transform_fn:
122
122
  ctx = {"row_number": total_retries}
123
123
  row = self._transform_fn(row, ctx)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes