sqlseed 0.1.9__tar.gz → 0.1.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sqlseed-0.1.9 → sqlseed-0.1.10}/PKG-INFO +1 -1
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/server.py +2 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/__init__.py +5 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/cli/main.py +3 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/config/models.py +3 -2
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/column_dag.py +12 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/mapper.py +18 -2
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/orchestrator.py +277 -4
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/database/sqlite_utils_adapter.py +19 -5
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/stream.py +1 -1
- {sqlseed-0.1.9 → sqlseed-0.1.10}/.gitignore +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/CHANGELOG.md +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/LICENSE +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/README.md +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/README.md +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/pyproject.toml +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/__init__.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/__main__.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/config.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/README.md +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/pyproject.toml +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/__init__.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/_client.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/_json_utils.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/analyzer.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/config.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/errors.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/examples.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/nl_config.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/provider.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/refiner.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/sqlseed-ai/src/sqlseed_ai/suggest.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/pyproject.toml +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/__init__.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/logger.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/metrics.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/progress.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/schema_helpers.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_utils/sql_safe.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/_version.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/cli/__init__.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/config/__init__.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/config/loader.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/config/snapshot.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/__init__.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/constraints.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/expression.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/relation.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/result.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/schema.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/transform.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/database/__init__.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/database/_protocol.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/database/optimizer.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/database/raw_sqlite_adapter.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/__init__.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/_protocol.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/base_provider.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/faker_provider.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/mimesis_provider.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/registry.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/plugins/__init__.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/plugins/hookspecs.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/plugins/manager.py +0 -0
- {sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/py.typed +0 -0
{sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/server.py
RENAMED
|
@@ -109,6 +109,7 @@ def sqlseed_execute_fill(
|
|
|
109
109
|
table_name: str,
|
|
110
110
|
count: int = 1000,
|
|
111
111
|
yaml_config: str | None = None,
|
|
112
|
+
enrich: bool = False,
|
|
112
113
|
) -> dict[str, Any]:
|
|
113
114
|
"""Execute data generation for a table. Optionally provide YAML config string for column rules."""
|
|
114
115
|
from sqlseed.core.orchestrator import DataOrchestrator
|
|
@@ -138,6 +139,7 @@ def sqlseed_execute_fill(
|
|
|
138
139
|
column_configs=column_configs,
|
|
139
140
|
clear_before=clear_before,
|
|
140
141
|
seed=seed,
|
|
142
|
+
enrich=enrich,
|
|
141
143
|
)
|
|
142
144
|
|
|
143
145
|
return {
|
|
@@ -42,6 +42,7 @@ def fill(
|
|
|
42
42
|
batch_size: int = 5000,
|
|
43
43
|
clear_before: bool = False,
|
|
44
44
|
optimize_pragma: bool = True,
|
|
45
|
+
enrich: bool = False,
|
|
45
46
|
) -> GenerationResult:
|
|
46
47
|
with DataOrchestrator(
|
|
47
48
|
db_path=db_path,
|
|
@@ -56,6 +57,7 @@ def fill(
|
|
|
56
57
|
seed=seed,
|
|
57
58
|
batch_size=batch_size,
|
|
58
59
|
clear_before=clear_before,
|
|
60
|
+
enrich=enrich,
|
|
59
61
|
)
|
|
60
62
|
|
|
61
63
|
|
|
@@ -92,6 +94,7 @@ def fill_from_config(config_path: str) -> list[GenerationResult]:
|
|
|
92
94
|
clear_before=table_config.clear_before,
|
|
93
95
|
column_configs=table_config.columns,
|
|
94
96
|
transform=table_config.transform,
|
|
97
|
+
enrich=table_config.enrich,
|
|
95
98
|
)
|
|
96
99
|
results.append(result)
|
|
97
100
|
return results
|
|
@@ -106,6 +109,7 @@ def preview(
|
|
|
106
109
|
provider: str = "mimesis",
|
|
107
110
|
locale: str = "en_US",
|
|
108
111
|
seed: int | None = None,
|
|
112
|
+
enrich: bool = False,
|
|
109
113
|
) -> list[dict[str, Any]]:
|
|
110
114
|
with DataOrchestrator(
|
|
111
115
|
db_path=db_path,
|
|
@@ -118,4 +122,5 @@ def preview(
|
|
|
118
122
|
count=count,
|
|
119
123
|
columns=columns,
|
|
120
124
|
seed=seed,
|
|
125
|
+
enrich=enrich,
|
|
121
126
|
)
|
|
@@ -24,6 +24,7 @@ def cli() -> None:
|
|
|
24
24
|
@click.option("--config", "-c", "config_path", default=None, help="YAML/JSON config file path")
|
|
25
25
|
@click.option("--transform", "transform_path", default=None, help="Python transform script path")
|
|
26
26
|
@click.option("--snapshot", is_flag=True, help="Save generation snapshot for replay")
|
|
27
|
+
@click.option("--enrich", is_flag=True, help="Enrich data using existing table distribution")
|
|
27
28
|
def fill(
|
|
28
29
|
db_path: str | None,
|
|
29
30
|
table: str | None,
|
|
@@ -36,6 +37,7 @@ def fill(
|
|
|
36
37
|
config_path: str | None,
|
|
37
38
|
transform_path: str | None,
|
|
38
39
|
snapshot: bool,
|
|
40
|
+
enrich: bool,
|
|
39
41
|
) -> None:
|
|
40
42
|
"""Fill a table with generated test data.
|
|
41
43
|
|
|
@@ -73,6 +75,7 @@ def fill(
|
|
|
73
75
|
seed=seed,
|
|
74
76
|
batch_size=batch_size,
|
|
75
77
|
clear_before=clear,
|
|
78
|
+
enrich=enrich,
|
|
76
79
|
)
|
|
77
80
|
click.echo(str(result))
|
|
78
81
|
|
|
@@ -72,9 +72,10 @@ class TableConfig(BaseModel):
|
|
|
72
72
|
count: int = Field(default=1000, gt=0)
|
|
73
73
|
batch_size: int = Field(default=5000, gt=0)
|
|
74
74
|
columns: list[ColumnConfig] = Field(default_factory=list)
|
|
75
|
-
clear_before: bool = False
|
|
75
|
+
clear_before: bool = False
|
|
76
76
|
seed: int | None = None
|
|
77
|
-
transform: str | None = None
|
|
77
|
+
transform: str | None = None
|
|
78
|
+
enrich: bool = False
|
|
78
79
|
|
|
79
80
|
|
|
80
81
|
class ColumnAssociation(BaseModel):
|
|
@@ -40,9 +40,12 @@ class ColumnDAG:
|
|
|
40
40
|
self,
|
|
41
41
|
specs: dict[str, GeneratorSpec],
|
|
42
42
|
column_configs: list[Any] | None = None,
|
|
43
|
+
unique_columns: set[str] | None = None,
|
|
44
|
+
composite_unique_indexes: list[list[str]] | None = None,
|
|
43
45
|
) -> list[ColumnNode]:
|
|
44
46
|
nodes: dict[str, ColumnNode] = {}
|
|
45
47
|
config_map: dict[str, Any] = {}
|
|
48
|
+
unique_columns = unique_columns or set()
|
|
46
49
|
|
|
47
50
|
if column_configs:
|
|
48
51
|
for cc in column_configs:
|
|
@@ -69,6 +72,15 @@ class ColumnDAG:
|
|
|
69
72
|
is_derived = True
|
|
70
73
|
final_spec = GeneratorSpec(generator_name="__derive__")
|
|
71
74
|
|
|
75
|
+
if col_name in unique_columns:
|
|
76
|
+
if constraints is None:
|
|
77
|
+
constraints = ColumnConstraints(unique=True)
|
|
78
|
+
elif not constraints.unique:
|
|
79
|
+
constraints = ColumnConstraints(
|
|
80
|
+
unique=True,
|
|
81
|
+
max_retries=constraints.max_retries,
|
|
82
|
+
)
|
|
83
|
+
|
|
72
84
|
nodes[col_name] = ColumnNode(
|
|
73
85
|
name=col_name,
|
|
74
86
|
generator_spec=final_spec,
|
|
@@ -181,7 +181,14 @@ class ColumnMapper:
|
|
|
181
181
|
def register_pattern_rule(self, pattern: str, generator: str, params: dict[str, Any] | None = None) -> None:
|
|
182
182
|
self._custom_pattern_rules.append((pattern, generator, params or {}))
|
|
183
183
|
|
|
184
|
-
def map_column(
|
|
184
|
+
def map_column(
|
|
185
|
+
self,
|
|
186
|
+
column_info: ColumnInfo,
|
|
187
|
+
user_config: Any = None,
|
|
188
|
+
*,
|
|
189
|
+
enrich: bool = False,
|
|
190
|
+
force_type_infer: bool = False,
|
|
191
|
+
) -> GeneratorSpec:
|
|
185
192
|
column_name = column_info.name.lower()
|
|
186
193
|
column_type = column_info.type.upper() if column_info.type else "TEXT"
|
|
187
194
|
|
|
@@ -219,6 +226,13 @@ class ColumnMapper:
|
|
|
219
226
|
return GeneratorSpec(generator_name=gen, params=params)
|
|
220
227
|
|
|
221
228
|
if column_info.default is not None or column_info.nullable:
|
|
229
|
+
if force_type_infer:
|
|
230
|
+
return self._type_faithful_fallback(column_type)
|
|
231
|
+
if enrich:
|
|
232
|
+
return GeneratorSpec(
|
|
233
|
+
generator_name="__enrich__",
|
|
234
|
+
params={"_default": column_info.default, "_nullable": column_info.nullable},
|
|
235
|
+
)
|
|
222
236
|
return GeneratorSpec(generator_name="skip")
|
|
223
237
|
|
|
224
238
|
return self._type_faithful_fallback(column_type)
|
|
@@ -248,10 +262,12 @@ class ColumnMapper:
|
|
|
248
262
|
self,
|
|
249
263
|
columns: list[ColumnInfo],
|
|
250
264
|
user_configs: dict[str, Any] | None = None,
|
|
265
|
+
*,
|
|
266
|
+
enrich: bool = False,
|
|
251
267
|
) -> dict[str, GeneratorSpec]:
|
|
252
268
|
user_configs = user_configs or {}
|
|
253
269
|
result: dict[str, GeneratorSpec] = {}
|
|
254
270
|
for col in columns:
|
|
255
271
|
col_config = user_configs.get(col.name)
|
|
256
|
-
result[col.name] = self.map_column(col, col_config)
|
|
272
|
+
result[col.name] = self.map_column(col, col_config, enrich=enrich)
|
|
257
273
|
return result
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import contextlib
|
|
4
|
+
import math
|
|
5
|
+
import re
|
|
4
6
|
import time
|
|
5
7
|
from typing import TYPE_CHECKING, Any, ClassVar
|
|
6
8
|
|
|
@@ -91,6 +93,7 @@ class DataOrchestrator:
|
|
|
91
93
|
clear_before: bool = False,
|
|
92
94
|
column_configs: list[Any] | None = None,
|
|
93
95
|
transform: str | None = None,
|
|
96
|
+
enrich: bool = False,
|
|
94
97
|
) -> GenerationResult:
|
|
95
98
|
self._ensure_connected()
|
|
96
99
|
start_time = time.monotonic()
|
|
@@ -106,14 +109,17 @@ class DataOrchestrator:
|
|
|
106
109
|
|
|
107
110
|
column_infos = self._schema.get_column_info(table_name)
|
|
108
111
|
user_configs = self._resolve_user_configs(columns, column_configs)
|
|
109
|
-
generator_specs = self._mapper.map_columns(column_infos, user_configs)
|
|
112
|
+
generator_specs = self._mapper.map_columns(column_infos, user_configs, enrich=enrich)
|
|
113
|
+
unique_columns = self._detect_unique_columns(table_name)
|
|
114
|
+
generator_specs = self._apply_enrich(table_name, generator_specs, column_infos, unique_columns)
|
|
115
|
+
generator_specs = self._adjust_specs_for_unique(generator_specs, unique_columns, count, column_infos)
|
|
110
116
|
generator_specs = self._resolve_foreign_keys(table_name, generator_specs)
|
|
111
117
|
generator_specs = self._apply_ai_suggestions(table_name, column_infos, generator_specs)
|
|
112
118
|
generator_specs = self._apply_template_pool(table_name, column_infos, generator_specs, count)
|
|
113
119
|
|
|
114
120
|
dag = ColumnDAG()
|
|
115
121
|
col_configs_list = list(user_configs.values()) if user_configs else None
|
|
116
|
-
dag_nodes = dag.build(generator_specs, col_configs_list)
|
|
122
|
+
dag_nodes = dag.build(generator_specs, col_configs_list, unique_columns=unique_columns)
|
|
117
123
|
|
|
118
124
|
expr_engine = ExpressionEngine()
|
|
119
125
|
constraint_solver = ConstraintSolver()
|
|
@@ -207,17 +213,21 @@ class DataOrchestrator:
|
|
|
207
213
|
seed: int | None = None,
|
|
208
214
|
transform: str | None = None,
|
|
209
215
|
column_configs: list[Any] | None = None,
|
|
216
|
+
enrich: bool = False,
|
|
210
217
|
) -> list[dict[str, Any]]:
|
|
211
218
|
self._ensure_connected()
|
|
212
219
|
|
|
213
220
|
column_infos = self._schema.get_column_info(table_name)
|
|
214
221
|
user_configs = self._resolve_user_configs(columns, column_configs)
|
|
215
|
-
generator_specs = self._mapper.map_columns(column_infos, user_configs)
|
|
222
|
+
generator_specs = self._mapper.map_columns(column_infos, user_configs, enrich=enrich)
|
|
223
|
+
unique_columns = self._detect_unique_columns(table_name)
|
|
224
|
+
generator_specs = self._apply_enrich(table_name, generator_specs, column_infos, unique_columns)
|
|
225
|
+
generator_specs = self._adjust_specs_for_unique(generator_specs, unique_columns, count, column_infos)
|
|
216
226
|
generator_specs = self._resolve_foreign_keys(table_name, generator_specs)
|
|
217
227
|
|
|
218
228
|
dag = ColumnDAG()
|
|
219
229
|
col_configs_list = list(user_configs.values()) if user_configs else None
|
|
220
|
-
dag_nodes = dag.build(generator_specs, col_configs_list)
|
|
230
|
+
dag_nodes = dag.build(generator_specs, col_configs_list, unique_columns=unique_columns)
|
|
221
231
|
|
|
222
232
|
expr_engine = ExpressionEngine()
|
|
223
233
|
constraint_solver = ConstraintSolver()
|
|
@@ -294,6 +304,267 @@ class DataOrchestrator:
|
|
|
294
304
|
lines.append(f" {table}: {count} rows")
|
|
295
305
|
return "\n".join(lines)
|
|
296
306
|
|
|
307
|
+
_ENUM_NAME_PATTERNS: ClassVar[list[str]] = [
|
|
308
|
+
r"^[bB]y[A-Za-z]",
|
|
309
|
+
r".*_type$",
|
|
310
|
+
r".*_status$",
|
|
311
|
+
r"^is_.*",
|
|
312
|
+
r"^has_.*",
|
|
313
|
+
r"^can_.*",
|
|
314
|
+
r".*_level$",
|
|
315
|
+
r".*_category$",
|
|
316
|
+
r".*_class$",
|
|
317
|
+
r".*_flag$",
|
|
318
|
+
r".*_kind$",
|
|
319
|
+
r".*_grade$",
|
|
320
|
+
r".*_rank$",
|
|
321
|
+
r".*_tier$",
|
|
322
|
+
r".*_mode$",
|
|
323
|
+
r".*_stage$",
|
|
324
|
+
r".*_phase$",
|
|
325
|
+
r".*_state$",
|
|
326
|
+
r".*_group$",
|
|
327
|
+
]
|
|
328
|
+
|
|
329
|
+
_SMALL_INT_TYPES: ClassVar[tuple[str, ...]] = ("INT8", "INT16", "TINYINT", "SMALLINT")
|
|
330
|
+
|
|
331
|
+
def _is_enumeration_column(
|
|
332
|
+
self,
|
|
333
|
+
col_name: str,
|
|
334
|
+
col_info: Any,
|
|
335
|
+
distinct_count: int,
|
|
336
|
+
total_rows: int,
|
|
337
|
+
is_unique: bool,
|
|
338
|
+
) -> bool:
|
|
339
|
+
if is_unique:
|
|
340
|
+
return False
|
|
341
|
+
|
|
342
|
+
if total_rows == 0 or distinct_count == 0:
|
|
343
|
+
return False
|
|
344
|
+
|
|
345
|
+
cardinality_ratio = distinct_count / total_rows
|
|
346
|
+
|
|
347
|
+
name_matches_enum = any(re.match(p, col_name) for p in self._ENUM_NAME_PATTERNS)
|
|
348
|
+
|
|
349
|
+
col_type_upper = col_info.type.upper() if col_info and hasattr(col_info, "type") else ""
|
|
350
|
+
is_small_int = any(t in col_type_upper for t in self._SMALL_INT_TYPES)
|
|
351
|
+
|
|
352
|
+
return (
|
|
353
|
+
(name_matches_enum and cardinality_ratio < 0.1)
|
|
354
|
+
or (is_small_int and cardinality_ratio < 0.1)
|
|
355
|
+
or (distinct_count <= 10 and cardinality_ratio < 0.05)
|
|
356
|
+
or (
|
|
357
|
+
distinct_count <= 30
|
|
358
|
+
and cardinality_ratio < 0.01
|
|
359
|
+
and "CHAR" not in col_type_upper
|
|
360
|
+
and "TEXT" not in col_type_upper
|
|
361
|
+
)
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
def _apply_enrich(
|
|
365
|
+
self,
|
|
366
|
+
table_name: str,
|
|
367
|
+
specs: dict[str, GeneratorSpec],
|
|
368
|
+
column_infos: list[Any],
|
|
369
|
+
unique_columns: set[str] | None = None,
|
|
370
|
+
) -> dict[str, GeneratorSpec]:
|
|
371
|
+
has_enrich = any(s.generator_name == "__enrich__" for s in specs.values())
|
|
372
|
+
if not has_enrich:
|
|
373
|
+
return specs
|
|
374
|
+
|
|
375
|
+
unique_columns = unique_columns or set()
|
|
376
|
+
row_count = self._db.get_row_count(table_name)
|
|
377
|
+
if row_count == 0:
|
|
378
|
+
for col_name, spec in specs.items():
|
|
379
|
+
if spec.generator_name == "__enrich__":
|
|
380
|
+
specs[col_name] = GeneratorSpec(generator_name="skip")
|
|
381
|
+
return specs
|
|
382
|
+
|
|
383
|
+
for col_name, spec in list(specs.items()):
|
|
384
|
+
if spec.generator_name != "__enrich__":
|
|
385
|
+
continue
|
|
386
|
+
is_unique = col_name in unique_columns
|
|
387
|
+
specs[col_name] = self._build_enriched_spec(table_name, col_name, spec, column_infos, is_unique)
|
|
388
|
+
|
|
389
|
+
return specs
|
|
390
|
+
|
|
391
|
+
def _build_enriched_spec(
|
|
392
|
+
self,
|
|
393
|
+
table_name: str,
|
|
394
|
+
col_name: str,
|
|
395
|
+
spec: GeneratorSpec,
|
|
396
|
+
column_infos: list[Any],
|
|
397
|
+
is_unique: bool = False,
|
|
398
|
+
) -> GeneratorSpec:
|
|
399
|
+
col_info = next((c for c in column_infos if c.name == col_name), None)
|
|
400
|
+
|
|
401
|
+
try:
|
|
402
|
+
values = self._db.get_column_values(table_name, col_name, limit=10000)
|
|
403
|
+
except Exception:
|
|
404
|
+
return GeneratorSpec(generator_name="skip")
|
|
405
|
+
|
|
406
|
+
if not values:
|
|
407
|
+
return GeneratorSpec(generator_name="skip")
|
|
408
|
+
|
|
409
|
+
null_count = sum(1 for v in values if v is None)
|
|
410
|
+
non_null_values = [v for v in values if v is not None]
|
|
411
|
+
null_ratio = round(null_count / len(values), 3) if values else 0.0
|
|
412
|
+
|
|
413
|
+
if not non_null_values:
|
|
414
|
+
return GeneratorSpec(generator_name="skip")
|
|
415
|
+
|
|
416
|
+
if col_info and not col_info.nullable:
|
|
417
|
+
null_ratio = 0.0
|
|
418
|
+
|
|
419
|
+
if is_unique:
|
|
420
|
+
null_ratio = 0.0
|
|
421
|
+
|
|
422
|
+
distinct_values = list(set(non_null_values))
|
|
423
|
+
distinct_count = len(distinct_values)
|
|
424
|
+
row_count = self._db.get_row_count(table_name)
|
|
425
|
+
|
|
426
|
+
if self._is_enumeration_column(col_name, col_info, distinct_count, row_count, is_unique):
|
|
427
|
+
choices = distinct_values
|
|
428
|
+
if col_info and "INT" in col_info.type.upper():
|
|
429
|
+
choices = [int(v) if isinstance(v, (int, float, str)) else v for v in choices]
|
|
430
|
+
return GeneratorSpec(
|
|
431
|
+
generator_name="choice",
|
|
432
|
+
params={"choices": choices},
|
|
433
|
+
null_ratio=null_ratio,
|
|
434
|
+
)
|
|
435
|
+
|
|
436
|
+
if col_info:
|
|
437
|
+
fallback_spec = self._mapper.map_column(col_info, force_type_infer=True)
|
|
438
|
+
if fallback_spec.generator_name != "skip":
|
|
439
|
+
return GeneratorSpec(
|
|
440
|
+
generator_name=fallback_spec.generator_name,
|
|
441
|
+
params=fallback_spec.params,
|
|
442
|
+
null_ratio=null_ratio,
|
|
443
|
+
provider=fallback_spec.provider,
|
|
444
|
+
)
|
|
445
|
+
|
|
446
|
+
return GeneratorSpec(generator_name="skip")
|
|
447
|
+
|
|
448
|
+
def _detect_unique_columns(self, table_name: str) -> set[str]:
|
|
449
|
+
unique_cols: set[str] = set()
|
|
450
|
+
try:
|
|
451
|
+
indexes = self._schema.get_index_info(table_name)
|
|
452
|
+
for idx in indexes:
|
|
453
|
+
if idx.unique and len(idx.columns) == 1:
|
|
454
|
+
unique_cols.add(idx.columns[0])
|
|
455
|
+
except Exception:
|
|
456
|
+
logger.debug("Failed to detect unique constraints from indexes", table_name=table_name)
|
|
457
|
+
|
|
458
|
+
try:
|
|
459
|
+
pks = self._db.get_primary_keys(table_name)
|
|
460
|
+
column_infos = self._schema.get_column_info(table_name)
|
|
461
|
+
autoincrement_pks = {c.name for c in column_infos if c.is_primary_key and c.is_autoincrement}
|
|
462
|
+
for pk in pks:
|
|
463
|
+
if pk not in autoincrement_pks:
|
|
464
|
+
unique_cols.add(pk)
|
|
465
|
+
except Exception:
|
|
466
|
+
logger.debug("Failed to detect PK unique constraints", table_name=table_name)
|
|
467
|
+
|
|
468
|
+
return unique_cols
|
|
469
|
+
|
|
470
|
+
def _adjust_specs_for_unique(
|
|
471
|
+
self,
|
|
472
|
+
specs: dict[str, GeneratorSpec],
|
|
473
|
+
unique_columns: set[str],
|
|
474
|
+
count: int,
|
|
475
|
+
column_infos: list[Any] | None = None,
|
|
476
|
+
) -> dict[str, GeneratorSpec]:
|
|
477
|
+
for col_name in unique_columns:
|
|
478
|
+
if col_name not in specs:
|
|
479
|
+
continue
|
|
480
|
+
spec = specs[col_name]
|
|
481
|
+
if spec.generator_name == "skip":
|
|
482
|
+
continue
|
|
483
|
+
|
|
484
|
+
if spec.generator_name == "string":
|
|
485
|
+
params = dict(spec.params)
|
|
486
|
+
charset_size = 62
|
|
487
|
+
if params.get("charset") == "digits":
|
|
488
|
+
charset_size = 10
|
|
489
|
+
elif params.get("charset") == "alpha":
|
|
490
|
+
charset_size = 52
|
|
491
|
+
|
|
492
|
+
max_length = params.get("max_length", 50)
|
|
493
|
+
min_needed = max(1, math.ceil(math.log(max(count * count * 50, 1)) / math.log(charset_size)))
|
|
494
|
+
current_min = params.get("min_length", 1)
|
|
495
|
+
params["min_length"] = max(current_min, min_needed)
|
|
496
|
+
|
|
497
|
+
if params["min_length"] > max_length:
|
|
498
|
+
if params.get("charset") is None:
|
|
499
|
+
params["charset"] = "alphanumeric"
|
|
500
|
+
charset_size = 62
|
|
501
|
+
min_needed = max(1, math.ceil(math.log(max(count * count * 50, 1)) / math.log(charset_size)))
|
|
502
|
+
params["min_length"] = max(current_min, min_needed)
|
|
503
|
+
if params["min_length"] > max_length:
|
|
504
|
+
logger.warning(
|
|
505
|
+
"Cannot guarantee uniqueness for VARCHAR(%d) with count=%d",
|
|
506
|
+
max_length,
|
|
507
|
+
count,
|
|
508
|
+
column=col_name,
|
|
509
|
+
)
|
|
510
|
+
params["max_length"] = max(params["min_length"], max_length)
|
|
511
|
+
elif params["max_length"] < params["min_length"]:
|
|
512
|
+
params["max_length"] = params["min_length"]
|
|
513
|
+
|
|
514
|
+
specs[col_name] = GeneratorSpec(
|
|
515
|
+
generator_name=spec.generator_name,
|
|
516
|
+
params=params,
|
|
517
|
+
null_ratio=spec.null_ratio,
|
|
518
|
+
provider=spec.provider,
|
|
519
|
+
)
|
|
520
|
+
|
|
521
|
+
elif spec.generator_name == "integer":
|
|
522
|
+
params = dict(spec.params)
|
|
523
|
+
min_val = params.get("min_value", 0)
|
|
524
|
+
max_val = params.get("max_value", 999999)
|
|
525
|
+
if max_val - min_val < count * 10:
|
|
526
|
+
col_info = next((c for c in (column_infos or []) if c.name == col_name), None)
|
|
527
|
+
if col_info:
|
|
528
|
+
col_type_upper = col_info.type.upper()
|
|
529
|
+
if "INT8" in col_type_upper and count > 255:
|
|
530
|
+
logger.warning(
|
|
531
|
+
"INT8 column with UNIQUE constraint cannot guarantee uniqueness for count > 255",
|
|
532
|
+
column=col_name,
|
|
533
|
+
count=count,
|
|
534
|
+
)
|
|
535
|
+
elif "INT16" in col_type_upper and count > 65535:
|
|
536
|
+
logger.warning(
|
|
537
|
+
"INT16 column with UNIQUE constraint cannot guarantee uniqueness for count > 65535",
|
|
538
|
+
column=col_name,
|
|
539
|
+
count=count,
|
|
540
|
+
)
|
|
541
|
+
params["max_value"] = min_val + count * 10
|
|
542
|
+
specs[col_name] = GeneratorSpec(
|
|
543
|
+
generator_name=spec.generator_name,
|
|
544
|
+
params=params,
|
|
545
|
+
null_ratio=spec.null_ratio,
|
|
546
|
+
provider=spec.provider,
|
|
547
|
+
)
|
|
548
|
+
|
|
549
|
+
elif spec.generator_name == "choice":
|
|
550
|
+
choices = spec.params.get("choices", [])
|
|
551
|
+
if len(choices) < count:
|
|
552
|
+
col_info = None
|
|
553
|
+
if column_infos:
|
|
554
|
+
col_info = next((c for c in column_infos if c.name == col_name), None)
|
|
555
|
+
if col_info:
|
|
556
|
+
fallback = self._mapper.map_column(col_info, force_type_infer=True)
|
|
557
|
+
if fallback.generator_name not in ("skip", "choice"):
|
|
558
|
+
specs[col_name] = GeneratorSpec(
|
|
559
|
+
generator_name=fallback.generator_name,
|
|
560
|
+
params=fallback.params,
|
|
561
|
+
null_ratio=spec.null_ratio,
|
|
562
|
+
provider=fallback.provider,
|
|
563
|
+
)
|
|
564
|
+
specs = self._adjust_specs_for_unique(specs, {col_name}, count, column_infos)
|
|
565
|
+
|
|
566
|
+
return specs
|
|
567
|
+
|
|
297
568
|
def _resolve_user_configs(
|
|
298
569
|
self,
|
|
299
570
|
columns: dict[str, Any] | None,
|
|
@@ -551,6 +822,7 @@ class DataOrchestrator:
|
|
|
551
822
|
clear_before: bool = False,
|
|
552
823
|
column_configs: list[Any] | None = None,
|
|
553
824
|
transform: str | None = None,
|
|
825
|
+
enrich: bool = False,
|
|
554
826
|
) -> GenerationResult:
|
|
555
827
|
return self.fill_table(
|
|
556
828
|
table_name=table_name,
|
|
@@ -561,6 +833,7 @@ class DataOrchestrator:
|
|
|
561
833
|
clear_before=clear_before,
|
|
562
834
|
column_configs=column_configs,
|
|
563
835
|
transform=transform,
|
|
836
|
+
enrich=enrich,
|
|
564
837
|
)
|
|
565
838
|
|
|
566
839
|
def close(self) -> None:
|
|
@@ -134,17 +134,31 @@ class SQLiteUtilsAdapter:
|
|
|
134
134
|
) -> int:
|
|
135
135
|
inserted = 0
|
|
136
136
|
batch: list[dict[str, Any]] = []
|
|
137
|
-
for
|
|
137
|
+
for item in data:
|
|
138
|
+
row = item
|
|
139
|
+
if not row:
|
|
140
|
+
row = {}
|
|
138
141
|
batch.append(row)
|
|
139
142
|
if len(batch) >= batch_size:
|
|
140
|
-
self.
|
|
141
|
-
inserted += len(batch)
|
|
143
|
+
inserted += self._insert_batch(table_name, batch)
|
|
142
144
|
batch = []
|
|
143
145
|
if batch:
|
|
144
|
-
self.
|
|
145
|
-
inserted += len(batch)
|
|
146
|
+
inserted += self._insert_batch(table_name, batch)
|
|
146
147
|
return inserted
|
|
147
148
|
|
|
149
|
+
def _insert_batch(self, table_name: str, batch: list[dict[str, Any]]) -> int:
|
|
150
|
+
if not batch:
|
|
151
|
+
return 0
|
|
152
|
+
if batch[0]:
|
|
153
|
+
self._db[table_name].insert_all(batch)
|
|
154
|
+
return len(batch)
|
|
155
|
+
safe_table = quote_identifier(table_name)
|
|
156
|
+
conn = self._db.conn
|
|
157
|
+
for _ in batch:
|
|
158
|
+
conn.execute(f"INSERT INTO {safe_table} DEFAULT VALUES")
|
|
159
|
+
conn.commit()
|
|
160
|
+
return len(batch)
|
|
161
|
+
|
|
148
162
|
def clear_table(self, table_name: str) -> None:
|
|
149
163
|
safe_table = quote_identifier(table_name)
|
|
150
164
|
self._db.execute(f"DELETE FROM {safe_table}")
|
|
@@ -117,7 +117,7 @@ class DataStream:
|
|
|
117
117
|
total_retries += 1
|
|
118
118
|
continue
|
|
119
119
|
|
|
120
|
-
if generated_values:
|
|
120
|
+
if generated_values or not any(not n.is_skip for n in self._nodes):
|
|
121
121
|
if self._transform_fn:
|
|
122
122
|
ctx = {"row_number": total_retries}
|
|
123
123
|
row = self._transform_fn(row, ctx)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/__init__.py
RENAMED
|
File without changes
|
{sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/__main__.py
RENAMED
|
File without changes
|
{sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/config.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|