PyPI - sqlseed - Versions diffs - 0.1.9__tar.gz → 0.1.10__tar.gz - Mend

sqlseed-0.1.9.tar.gz → sqlseed-0.1.10.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65)

{sqlseed-0.1.9 → sqlseed-0.1.10}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sqlseed
-Version: 0.1.9
+Version: 0.1.10
 Summary: Declarative SQLite test data generation toolkit
 Project-URL: Homepage, https://github.com/sunbos/sqlseed
 Project-URL: Documentation, https://github.com/sunbos/sqlseed#readme

{sqlseed-0.1.9 → sqlseed-0.1.10}/plugins/mcp-server-sqlseed/src/mcp_server_sqlseed/server.py RENAMED Viewed

@@ -109,6 +109,7 @@ def sqlseed_execute_fill(
     table_name: str,
     count: int = 1000,
     yaml_config: str | None = None,
+    enrich: bool = False,
 ) -> dict[str, Any]:
     """Execute data generation for a table. Optionally provide YAML config string for column rules."""
     from sqlseed.core.orchestrator import DataOrchestrator
@@ -138,6 +139,7 @@ def sqlseed_execute_fill(
             column_configs=column_configs,
             clear_before=clear_before,
             seed=seed,
+            enrich=enrich,
         )
         return {

{sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/__init__.py RENAMED Viewed

@@ -42,6 +42,7 @@ def fill(
     batch_size: int = 5000,
     clear_before: bool = False,
     optimize_pragma: bool = True,
+    enrich: bool = False,
 ) -> GenerationResult:
     with DataOrchestrator(
         db_path=db_path,
@@ -56,6 +57,7 @@ def fill(
             seed=seed,
             batch_size=batch_size,
             clear_before=clear_before,
+            enrich=enrich,
         )
@@ -92,6 +94,7 @@ def fill_from_config(config_path: str) -> list[GenerationResult]:
                 clear_before=table_config.clear_before,
                 column_configs=table_config.columns,
                 transform=table_config.transform,
+                enrich=table_config.enrich,
             )
             results.append(result)
     return results
@@ -106,6 +109,7 @@ def preview(
     provider: str = "mimesis",
     locale: str = "en_US",
     seed: int | None = None,
+    enrich: bool = False,
 ) -> list[dict[str, Any]]:
     with DataOrchestrator(
         db_path=db_path,
@@ -118,4 +122,5 @@ def preview(
             count=count,
             columns=columns,
             seed=seed,
+            enrich=enrich,
         )

{sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/cli/main.py RENAMED Viewed

@@ -24,6 +24,7 @@ def cli() -> None:
 @click.option("--config", "-c", "config_path", default=None, help="YAML/JSON config file path")
 @click.option("--transform", "transform_path", default=None, help="Python transform script path")
 @click.option("--snapshot", is_flag=True, help="Save generation snapshot for replay")
+@click.option("--enrich", is_flag=True, help="Enrich data using existing table distribution")
 def fill(
     db_path: str | None,
     table: str | None,
@@ -36,6 +37,7 @@ def fill(
     config_path: str | None,
     transform_path: str | None,
     snapshot: bool,
+    enrich: bool,
 ) -> None:
     """Fill a table with generated test data.
@@ -73,6 +75,7 @@ def fill(
         seed=seed,
         batch_size=batch_size,
         clear_before=clear,
+        enrich=enrich,
     )
     click.echo(str(result))

{sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/config/models.py RENAMED Viewed

@@ -72,9 +72,10 @@ class TableConfig(BaseModel):
     count: int = Field(default=1000, gt=0)
     batch_size: int = Field(default=5000, gt=0)
     columns: list[ColumnConfig] = Field(default_factory=list)
-    clear_before: bool = False  # 默认不清空，保护原始数据
+    clear_before: bool = False
     seed: int | None = None
-    transform: str | None = None  # [NEW] Python 变换脚本路径
+    transform: str | None = None
+    enrich: bool = False
 class ColumnAssociation(BaseModel):

{sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/column_dag.py RENAMED Viewed

@@ -40,9 +40,12 @@ class ColumnDAG:
         self,
         specs: dict[str, GeneratorSpec],
         column_configs: list[Any] | None = None,
+        unique_columns: set[str] | None = None,
+        composite_unique_indexes: list[list[str]] | None = None,
     ) -> list[ColumnNode]:
         nodes: dict[str, ColumnNode] = {}
         config_map: dict[str, Any] = {}
+        unique_columns = unique_columns or set()
         if column_configs:
             for cc in column_configs:
@@ -69,6 +72,15 @@ class ColumnDAG:
                     is_derived = True
                     final_spec = GeneratorSpec(generator_name="__derive__")
+            if col_name in unique_columns:
+                if constraints is None:
+                    constraints = ColumnConstraints(unique=True)
+                elif not constraints.unique:
+                    constraints = ColumnConstraints(
+                        unique=True,
+                        max_retries=constraints.max_retries,
+                    )
             nodes[col_name] = ColumnNode(
                 name=col_name,
                 generator_spec=final_spec,

{sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/mapper.py RENAMED Viewed

@@ -181,7 +181,14 @@ class ColumnMapper:
     def register_pattern_rule(self, pattern: str, generator: str, params: dict[str, Any] | None = None) -> None:
         self._custom_pattern_rules.append((pattern, generator, params or {}))
-    def map_column(self, column_info: ColumnInfo, user_config: Any = None) -> GeneratorSpec:
+    def map_column(
+        self,
+        column_info: ColumnInfo,
+        user_config: Any = None,
+        *,
+        enrich: bool = False,
+        force_type_infer: bool = False,
+    ) -> GeneratorSpec:
         column_name = column_info.name.lower()
         column_type = column_info.type.upper() if column_info.type else "TEXT"
@@ -219,6 +226,13 @@ class ColumnMapper:
                 return GeneratorSpec(generator_name=gen, params=params)
         if column_info.default is not None or column_info.nullable:
+            if force_type_infer:
+                return self._type_faithful_fallback(column_type)
+            if enrich:
+                return GeneratorSpec(
+                    generator_name="__enrich__",
+                    params={"_default": column_info.default, "_nullable": column_info.nullable},
+                )
             return GeneratorSpec(generator_name="skip")
         return self._type_faithful_fallback(column_type)
@@ -248,10 +262,12 @@ class ColumnMapper:
         self,
         columns: list[ColumnInfo],
         user_configs: dict[str, Any] | None = None,
+        *,
+        enrich: bool = False,
     ) -> dict[str, GeneratorSpec]:
         user_configs = user_configs or {}
         result: dict[str, GeneratorSpec] = {}
         for col in columns:
             col_config = user_configs.get(col.name)
-            result[col.name] = self.map_column(col, col_config)
+            result[col.name] = self.map_column(col, col_config, enrich=enrich)
         return result

{sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/core/orchestrator.py RENAMED Viewed

@@ -1,6 +1,8 @@
 from __future__ import annotations
 import contextlib
+import math
+import re
 import time
 from typing import TYPE_CHECKING, Any, ClassVar
@@ -91,6 +93,7 @@ class DataOrchestrator:
         clear_before: bool = False,
         column_configs: list[Any] | None = None,
         transform: str | None = None,
+        enrich: bool = False,
     ) -> GenerationResult:
         self._ensure_connected()
         start_time = time.monotonic()
@@ -106,14 +109,17 @@ class DataOrchestrator:
             column_infos = self._schema.get_column_info(table_name)
             user_configs = self._resolve_user_configs(columns, column_configs)
-            generator_specs = self._mapper.map_columns(column_infos, user_configs)
+            generator_specs = self._mapper.map_columns(column_infos, user_configs, enrich=enrich)
+            unique_columns = self._detect_unique_columns(table_name)
+            generator_specs = self._apply_enrich(table_name, generator_specs, column_infos, unique_columns)
+            generator_specs = self._adjust_specs_for_unique(generator_specs, unique_columns, count, column_infos)
             generator_specs = self._resolve_foreign_keys(table_name, generator_specs)
             generator_specs = self._apply_ai_suggestions(table_name, column_infos, generator_specs)
             generator_specs = self._apply_template_pool(table_name, column_infos, generator_specs, count)
             dag = ColumnDAG()
             col_configs_list = list(user_configs.values()) if user_configs else None
-            dag_nodes = dag.build(generator_specs, col_configs_list)
+            dag_nodes = dag.build(generator_specs, col_configs_list, unique_columns=unique_columns)
             expr_engine = ExpressionEngine()
             constraint_solver = ConstraintSolver()
@@ -207,17 +213,21 @@ class DataOrchestrator:
         seed: int | None = None,
         transform: str | None = None,
         column_configs: list[Any] | None = None,
+        enrich: bool = False,
     ) -> list[dict[str, Any]]:
         self._ensure_connected()
         column_infos = self._schema.get_column_info(table_name)
         user_configs = self._resolve_user_configs(columns, column_configs)
-        generator_specs = self._mapper.map_columns(column_infos, user_configs)
+        generator_specs = self._mapper.map_columns(column_infos, user_configs, enrich=enrich)
+        unique_columns = self._detect_unique_columns(table_name)
+        generator_specs = self._apply_enrich(table_name, generator_specs, column_infos, unique_columns)
+        generator_specs = self._adjust_specs_for_unique(generator_specs, unique_columns, count, column_infos)
         generator_specs = self._resolve_foreign_keys(table_name, generator_specs)
         dag = ColumnDAG()
         col_configs_list = list(user_configs.values()) if user_configs else None
-        dag_nodes = dag.build(generator_specs, col_configs_list)
+        dag_nodes = dag.build(generator_specs, col_configs_list, unique_columns=unique_columns)
         expr_engine = ExpressionEngine()
         constraint_solver = ConstraintSolver()
@@ -294,6 +304,267 @@ class DataOrchestrator:
             lines.append(f"  {table}: {count} rows")
         return "\n".join(lines)
+    _ENUM_NAME_PATTERNS: ClassVar[list[str]] = [
+        r"^[bB]y[A-Za-z]",
+        r".*_type$",
+        r".*_status$",
+        r"^is_.*",
+        r"^has_.*",
+        r"^can_.*",
+        r".*_level$",
+        r".*_category$",
+        r".*_class$",
+        r".*_flag$",
+        r".*_kind$",
+        r".*_grade$",
+        r".*_rank$",
+        r".*_tier$",
+        r".*_mode$",
+        r".*_stage$",
+        r".*_phase$",
+        r".*_state$",
+        r".*_group$",
+    ]
+    _SMALL_INT_TYPES: ClassVar[tuple[str, ...]] = ("INT8", "INT16", "TINYINT", "SMALLINT")
+    def _is_enumeration_column(
+        self,
+        col_name: str,
+        col_info: Any,
+        distinct_count: int,
+        total_rows: int,
+        is_unique: bool,
+    ) -> bool:
+        if is_unique:
+            return False
+        if total_rows == 0 or distinct_count == 0:
+            return False
+        cardinality_ratio = distinct_count / total_rows
+        name_matches_enum = any(re.match(p, col_name) for p in self._ENUM_NAME_PATTERNS)
+        col_type_upper = col_info.type.upper() if col_info and hasattr(col_info, "type") else ""
+        is_small_int = any(t in col_type_upper for t in self._SMALL_INT_TYPES)
+        return (
+            (name_matches_enum and cardinality_ratio < 0.1)
+            or (is_small_int and cardinality_ratio < 0.1)
+            or (distinct_count <= 10 and cardinality_ratio < 0.05)
+            or (
+                distinct_count <= 30
+                and cardinality_ratio < 0.01
+                and "CHAR" not in col_type_upper
+                and "TEXT" not in col_type_upper
+            )
+        )
+    def _apply_enrich(
+        self,
+        table_name: str,
+        specs: dict[str, GeneratorSpec],
+        column_infos: list[Any],
+        unique_columns: set[str] | None = None,
+    ) -> dict[str, GeneratorSpec]:
+        has_enrich = any(s.generator_name == "__enrich__" for s in specs.values())
+        if not has_enrich:
+            return specs
+        unique_columns = unique_columns or set()
+        row_count = self._db.get_row_count(table_name)
+        if row_count == 0:
+            for col_name, spec in specs.items():
+                if spec.generator_name == "__enrich__":
+                    specs[col_name] = GeneratorSpec(generator_name="skip")
+            return specs
+        for col_name, spec in list(specs.items()):
+            if spec.generator_name != "__enrich__":
+                continue
+            is_unique = col_name in unique_columns
+            specs[col_name] = self._build_enriched_spec(table_name, col_name, spec, column_infos, is_unique)
+        return specs
+    def _build_enriched_spec(
+        self,
+        table_name: str,
+        col_name: str,
+        spec: GeneratorSpec,
+        column_infos: list[Any],
+        is_unique: bool = False,
+    ) -> GeneratorSpec:
+        col_info = next((c for c in column_infos if c.name == col_name), None)
+        try:
+            values = self._db.get_column_values(table_name, col_name, limit=10000)
+        except Exception:
+            return GeneratorSpec(generator_name="skip")
+        if not values:
+            return GeneratorSpec(generator_name="skip")
+        null_count = sum(1 for v in values if v is None)
+        non_null_values = [v for v in values if v is not None]
+        null_ratio = round(null_count / len(values), 3) if values else 0.0
+        if not non_null_values:
+            return GeneratorSpec(generator_name="skip")
+        if col_info and not col_info.nullable:
+            null_ratio = 0.0
+        if is_unique:
+            null_ratio = 0.0
+        distinct_values = list(set(non_null_values))
+        distinct_count = len(distinct_values)
+        row_count = self._db.get_row_count(table_name)
+        if self._is_enumeration_column(col_name, col_info, distinct_count, row_count, is_unique):
+            choices = distinct_values
+            if col_info and "INT" in col_info.type.upper():
+                choices = [int(v) if isinstance(v, (int, float, str)) else v for v in choices]
+            return GeneratorSpec(
+                generator_name="choice",
+                params={"choices": choices},
+                null_ratio=null_ratio,
+            )
+        if col_info:
+            fallback_spec = self._mapper.map_column(col_info, force_type_infer=True)
+            if fallback_spec.generator_name != "skip":
+                return GeneratorSpec(
+                    generator_name=fallback_spec.generator_name,
+                    params=fallback_spec.params,
+                    null_ratio=null_ratio,
+                    provider=fallback_spec.provider,
+                )
+        return GeneratorSpec(generator_name="skip")
+    def _detect_unique_columns(self, table_name: str) -> set[str]:
+        unique_cols: set[str] = set()
+        try:
+            indexes = self._schema.get_index_info(table_name)
+            for idx in indexes:
+                if idx.unique and len(idx.columns) == 1:
+                    unique_cols.add(idx.columns[0])
+        except Exception:
+            logger.debug("Failed to detect unique constraints from indexes", table_name=table_name)
+        try:
+            pks = self._db.get_primary_keys(table_name)
+            column_infos = self._schema.get_column_info(table_name)
+            autoincrement_pks = {c.name for c in column_infos if c.is_primary_key and c.is_autoincrement}
+            for pk in pks:
+                if pk not in autoincrement_pks:
+                    unique_cols.add(pk)
+        except Exception:
+            logger.debug("Failed to detect PK unique constraints", table_name=table_name)
+        return unique_cols
+    def _adjust_specs_for_unique(
+        self,
+        specs: dict[str, GeneratorSpec],
+        unique_columns: set[str],
+        count: int,
+        column_infos: list[Any] | None = None,
+    ) -> dict[str, GeneratorSpec]:
+        for col_name in unique_columns:
+            if col_name not in specs:
+                continue
+            spec = specs[col_name]
+            if spec.generator_name == "skip":
+                continue
+            if spec.generator_name == "string":
+                params = dict(spec.params)
+                charset_size = 62
+                if params.get("charset") == "digits":
+                    charset_size = 10
+                elif params.get("charset") == "alpha":
+                    charset_size = 52
+                max_length = params.get("max_length", 50)
+                min_needed = max(1, math.ceil(math.log(max(count * count * 50, 1)) / math.log(charset_size)))
+                current_min = params.get("min_length", 1)
+                params["min_length"] = max(current_min, min_needed)
+                if params["min_length"] > max_length:
+                    if params.get("charset") is None:
+                        params["charset"] = "alphanumeric"
+                        charset_size = 62
+                        min_needed = max(1, math.ceil(math.log(max(count * count * 50, 1)) / math.log(charset_size)))
+                        params["min_length"] = max(current_min, min_needed)
+                    if params["min_length"] > max_length:
+                        logger.warning(
+                            "Cannot guarantee uniqueness for VARCHAR(%d) with count=%d",
+                            max_length,
+                            count,
+                            column=col_name,
+                        )
+                        params["max_length"] = max(params["min_length"], max_length)
+                elif params["max_length"] < params["min_length"]:
+                    params["max_length"] = params["min_length"]
+                specs[col_name] = GeneratorSpec(
+                    generator_name=spec.generator_name,
+                    params=params,
+                    null_ratio=spec.null_ratio,
+                    provider=spec.provider,
+                )
+            elif spec.generator_name == "integer":
+                params = dict(spec.params)
+                min_val = params.get("min_value", 0)
+                max_val = params.get("max_value", 999999)
+                if max_val - min_val < count * 10:
+                    col_info = next((c for c in (column_infos or []) if c.name == col_name), None)
+                    if col_info:
+                        col_type_upper = col_info.type.upper()
+                        if "INT8" in col_type_upper and count > 255:
+                            logger.warning(
+                                "INT8 column with UNIQUE constraint cannot guarantee uniqueness for count > 255",
+                                column=col_name,
+                                count=count,
+                            )
+                        elif "INT16" in col_type_upper and count > 65535:
+                            logger.warning(
+                                "INT16 column with UNIQUE constraint cannot guarantee uniqueness for count > 65535",
+                                column=col_name,
+                                count=count,
+                            )
+                    params["max_value"] = min_val + count * 10
+                specs[col_name] = GeneratorSpec(
+                    generator_name=spec.generator_name,
+                    params=params,
+                    null_ratio=spec.null_ratio,
+                    provider=spec.provider,
+                )
+            elif spec.generator_name == "choice":
+                choices = spec.params.get("choices", [])
+                if len(choices) < count:
+                    col_info = None
+                    if column_infos:
+                        col_info = next((c for c in column_infos if c.name == col_name), None)
+                    if col_info:
+                        fallback = self._mapper.map_column(col_info, force_type_infer=True)
+                        if fallback.generator_name not in ("skip", "choice"):
+                            specs[col_name] = GeneratorSpec(
+                                generator_name=fallback.generator_name,
+                                params=fallback.params,
+                                null_ratio=spec.null_ratio,
+                                provider=fallback.provider,
+                            )
+                            specs = self._adjust_specs_for_unique(specs, {col_name}, count, column_infos)
+        return specs
     def _resolve_user_configs(
         self,
         columns: dict[str, Any] | None,
@@ -551,6 +822,7 @@ class DataOrchestrator:
         clear_before: bool = False,
         column_configs: list[Any] | None = None,
         transform: str | None = None,
+        enrich: bool = False,
     ) -> GenerationResult:
         return self.fill_table(
             table_name=table_name,
@@ -561,6 +833,7 @@ class DataOrchestrator:
             clear_before=clear_before,
             column_configs=column_configs,
             transform=transform,
+            enrich=enrich,
         )
     def close(self) -> None:

{sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/database/sqlite_utils_adapter.py RENAMED Viewed

@@ -134,17 +134,31 @@ class SQLiteUtilsAdapter:
     ) -> int:
         inserted = 0
         batch: list[dict[str, Any]] = []
-        for row in data:
+        for item in data:
+            row = item
+            if not row:
+                row = {}
             batch.append(row)
             if len(batch) >= batch_size:
-                self._db[table_name].insert_all(batch)
-                inserted += len(batch)
+                inserted += self._insert_batch(table_name, batch)
                 batch = []
         if batch:
-            self._db[table_name].insert_all(batch)
-            inserted += len(batch)
+            inserted += self._insert_batch(table_name, batch)
         return inserted
+    def _insert_batch(self, table_name: str, batch: list[dict[str, Any]]) -> int:
+        if not batch:
+            return 0
+        if batch[0]:
+            self._db[table_name].insert_all(batch)
+            return len(batch)
+        safe_table = quote_identifier(table_name)
+        conn = self._db.conn
+        for _ in batch:
+            conn.execute(f"INSERT INTO {safe_table} DEFAULT VALUES")
+        conn.commit()
+        return len(batch)
     def clear_table(self, table_name: str) -> None:
         safe_table = quote_identifier(table_name)
         self._db.execute(f"DELETE FROM {safe_table}")

{sqlseed-0.1.9 → sqlseed-0.1.10}/src/sqlseed/generators/stream.py RENAMED Viewed

@@ -117,7 +117,7 @@ class DataStream:
                 total_retries += 1
                 continue
-            if generated_values:
+            if generated_values or not any(not n.is_skip for n in self._nodes):
                 if self._transform_fn:
                     ctx = {"row_number": total_retries}
                     row = self._transform_fn(row, ctx)