PyPI - sqlseed - Versions diffs - 0.1.0__py3-none-any.whl - Mend

sqlseed 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

sqlseed/__init__.py +121 -0
sqlseed/_utils/__init__.py +11 -0
sqlseed/_utils/logger.py +30 -0
sqlseed/_utils/metrics.py +45 -0
sqlseed/_utils/progress.py +14 -0
sqlseed/_utils/schema_helpers.py +51 -0
sqlseed/_utils/sql_safe.py +45 -0
sqlseed/_version.py +1 -0
sqlseed/cli/__init__.py +3 -0
sqlseed/cli/main.py +316 -0
sqlseed/config/__init__.py +14 -0
sqlseed/config/loader.py +66 -0
sqlseed/config/models.py +99 -0
sqlseed/config/snapshot.py +91 -0
sqlseed/core/__init__.py +14 -0
sqlseed/core/column_dag.py +108 -0
sqlseed/core/constraints.py +116 -0
sqlseed/core/expression.py +71 -0
sqlseed/core/mapper.py +257 -0
sqlseed/core/orchestrator.py +578 -0
sqlseed/core/relation.py +124 -0
sqlseed/core/result.py +23 -0
sqlseed/core/schema.py +97 -0
sqlseed/core/transform.py +27 -0
sqlseed/database/__init__.py +14 -0
sqlseed/database/_protocol.py +72 -0
sqlseed/database/optimizer.py +96 -0
sqlseed/database/raw_sqlite_adapter.py +197 -0
sqlseed/database/sqlite_utils_adapter.py +183 -0
sqlseed/generators/__init__.py +11 -0
sqlseed/generators/_protocol.py +73 -0
sqlseed/generators/base_provider.py +448 -0
sqlseed/generators/faker_provider.py +157 -0
sqlseed/generators/mimesis_provider.py +203 -0
sqlseed/generators/registry.py +86 -0
sqlseed/generators/stream.py +157 -0
sqlseed/py.typed +0 -0
sqlseed-0.1.0.dist-info/METADATA +934 -0
sqlseed-0.1.0.dist-info/RECORD +42 -0
sqlseed-0.1.0.dist-info/WHEEL +4 -0
sqlseed-0.1.0.dist-info/entry_points.txt +6 -0
sqlseed-0.1.0.dist-info/licenses/LICENSE +17 -0

sqlseed/core/result.py ADDED Viewed

@@ -0,0 +1,23 @@
+from __future__ import annotations
+from dataclasses import dataclass, field
+@dataclass
+class GenerationResult:
+    table_name: str
+    count: int
+    elapsed: float
+    rows_per_second: float = 0.0
+    batch_count: int = 0
+    errors: list[str] = field(default_factory=list)
+    def __post_init__(self) -> None:
+        if self.count > 0 and self.elapsed > 0:
+            self.rows_per_second = self.count / self.elapsed
+    def __str__(self) -> str:
+        return (
+            f"GenerationResult(table={self.table_name}, count={self.count}, "
+            f"elapsed={self.elapsed:.2f}s, speed={self.rows_per_second:.0f} rows/s)"
+        )

sqlseed/core/schema.py ADDED Viewed

@@ -0,0 +1,97 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING, Any, cast
+if TYPE_CHECKING:
+    from sqlseed.database._protocol import ColumnInfo, ForeignKeyInfo, IndexInfo
+class SchemaInferrer:
+    def __init__(self, db_adapter: Any) -> None:
+        self._db = db_adapter
+    def get_column_info(self, table_name: str) -> list[ColumnInfo]:
+        return list(self._db.get_column_info(table_name))
+    def get_foreign_keys(self, table_name: str) -> list[ForeignKeyInfo]:
+        return list(self._db.get_foreign_keys(table_name))
+    def get_table_names(self) -> list[str]:
+        return list(self._db.get_table_names())
+    def get_primary_keys(self, table_name: str) -> list[str]:
+        return list(self._db.get_primary_keys(table_name))
+    def get_table_schema(self, table_name: str) -> dict[str, ColumnInfo]:
+        columns = self.get_column_info(table_name)
+        return {col.name: col for col in columns}
+    def get_index_info(self, table_name: str) -> list[IndexInfo]:
+        return list(self._db.get_index_info(table_name))
+    def get_sample_data(self, table_name: str, limit: int = 5) -> list[dict[str, Any]]:
+        result = self._db.get_sample_rows(table_name, limit=limit)
+        return cast("list[dict[str, Any]]", result)
+    def profile_column_distribution(
+        self,
+        table_name: str,
+        limit: int = 1000,
+    ) -> list[dict[str, Any]]:
+        columns = self.get_column_info(table_name)
+        row_count = self._db.get_row_count(table_name)
+        if row_count == 0:
+            return []
+        profiles: list[dict[str, Any]] = []
+        for col in columns:
+            if col.is_primary_key and col.is_autoincrement:
+                continue
+            profile = self._profile_single_column(table_name, col.name, row_count, limit)
+            profiles.append(profile)
+        return profiles
+    def _profile_single_column(
+        self,
+        table_name: str,
+        column_name: str,
+        total_rows: int,
+        limit: int,
+    ) -> dict[str, Any]:
+        profile: dict[str, Any] = {"column": column_name}
+        try:
+            values = self._db.get_column_values(table_name, column_name, limit=limit)
+            null_count = sum(1 for v in values if v is None)
+            non_null_values = [v for v in values if v is not None]
+            profile["null_ratio"] = round(null_count / len(values), 3) if values else 0.0
+            profile["distinct_count"] = len(set(non_null_values))
+            profile["sample_size"] = len(values)
+            profile["total_rows"] = total_rows
+            if non_null_values:
+                from collections import Counter
+                counter = Counter(non_null_values)
+                top5 = counter.most_common(5)
+                profile["top_values"] = [
+                    {"value": str(v)[:50], "frequency": round(c / len(non_null_values), 3)} for v, c in top5
+                ]
+            else:
+                profile["top_values"] = []
+            numeric_values = [v for v in non_null_values if isinstance(v, (int, float))]
+            if numeric_values:
+                profile["value_range"] = {"min": min(numeric_values), "max": max(numeric_values)}
+            else:
+                profile["value_range"] = None
+        except Exception:
+            profile["error"] = "failed to profile"
+        return profile

sqlseed/core/transform.py ADDED Viewed

@@ -0,0 +1,27 @@
+from __future__ import annotations
+import importlib.util
+from pathlib import Path
+from typing import Any, Protocol, cast
+class RowTransformFn(Protocol):
+    """Structural type of a row transform: called with a row dict and a context dict, returns a row dict."""
+    def __call__(self, row: dict[str, Any], ctx: dict[str, Any]) -> dict[str, Any]: ...
+def load_transform(script_path: str) -> RowTransformFn:
+    path = Path(script_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Transform script not found: {script_path}")
+    spec = importlib.util.spec_from_file_location("user_transform", str(path))
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load transform script: {script_path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    fn = getattr(module, "transform_row", None)
+    if fn is None:
+        raise AttributeError(f"Transform script must define a 'transform_row(row, ctx)' function: {script_path}")
+    return cast("RowTransformFn", fn)

sqlseed/database/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from sqlseed.database._protocol import ColumnInfo, DatabaseAdapter, ForeignKeyInfo
+from sqlseed.database.optimizer import PragmaOptimizer, PragmaProfile
+from sqlseed.database.raw_sqlite_adapter import RawSQLiteAdapter
+from sqlseed.database.sqlite_utils_adapter import SQLiteUtilsAdapter
+# Names this package re-exports from its submodules.
+__all__ = [
+    "ColumnInfo",
+    "DatabaseAdapter",
+    "ForeignKeyInfo",
+    "PragmaOptimizer",
+    "PragmaProfile",
+    "RawSQLiteAdapter",
+    "SQLiteUtilsAdapter",
+]

sqlseed/database/_protocol.py ADDED Viewed

@@ -0,0 +1,72 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+@dataclass(frozen=True)
+class ColumnInfo:
+    """Immutable description of one table column as reported by a database adapter."""
+    name: str  # column name
+    type: str  # column type string
+    nullable: bool  # whether the adapter treats the column as accepting NULL
+    default: Any  # default value reported for the column
+    is_primary_key: bool  # True when the column belongs to the primary key
+    is_autoincrement: bool  # True when the adapter flagged the column as autoincrement
+@dataclass(frozen=True)
+class ForeignKeyInfo:
+    """Immutable description of a single-column foreign-key reference."""
+    column: str  # referencing column in the table that owns the foreign key
+    ref_table: str  # table the foreign key points to
+    ref_column: str  # referenced column in ref_table
+@dataclass(frozen=True)
+class IndexInfo:
+    """Immutable description of one index on a table."""
+    name: str  # index name
+    table: str  # table the index belongs to
+    # NOTE: because this field is a list, hash() on an instance raises
+    # TypeError even though the dataclass is frozen.
+    columns: list[str]  # names of the indexed columns
+    unique: bool  # True for a unique index
+@runtime_checkable
+class DatabaseAdapter(Protocol):
+    """Structural interface that database backends implement.
+    ``runtime_checkable`` allows ``isinstance`` checks against this protocol;
+    such checks only verify that the methods exist, not their signatures.
+    """
+    # Connection lifecycle.
+    def connect(self, db_path: str) -> None: ...
+    def close(self) -> None: ...
+    # Schema introspection.
+    def get_table_names(self) -> list[str]: ...
+    def get_column_info(self, table_name: str) -> list[ColumnInfo]: ...
+    def get_primary_keys(self, table_name: str) -> list[str]: ...
+    def get_foreign_keys(self, table_name: str) -> list[ForeignKeyInfo]: ...
+    # Data inspection.
+    def get_row_count(self, table_name: str) -> int: ...
+    def get_column_values(self, table_name: str, column_name: str, limit: int = 1000) -> list[Any]: ...
+    def get_index_info(self, table_name: str) -> list[IndexInfo]: ...
+    def get_sample_rows(self, table_name: str, limit: int = 5) -> list[dict[str, Any]]: ...
+    # Writes; batch_insert returns an int (presumably the number of rows inserted).
+    def batch_insert(
+        self,
+        table_name: str,
+        data: Iterator[dict[str, Any]],
+        batch_size: int = 5000,
+    ) -> int: ...
+    def clear_table(self, table_name: str) -> None: ...
+    # Bulk-write tuning and its rollback.
+    def optimize_for_bulk_write(self, expected_rows: int | None = None) -> None: ...
+    def restore_settings(self) -> None: ...
+    # Context-manager support.
+    def __enter__(self) -> DatabaseAdapter: ...
+    def __exit__(self, exc_type: type | None, exc_val: Exception | None, exc_tb: Any) -> None: ...

sqlseed/database/optimizer.py ADDED Viewed

@@ -0,0 +1,96 @@
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+from typing import Any
+from sqlseed._utils.logger import get_logger
+logger = get_logger(__name__)
+@dataclass
+class PragmaProfile:
+    """Snapshot of SQLite PRAGMA values; a field left at ``None`` holds no recorded value."""
+    synchronous: Any = None
+    journal_mode: Any = None
+    cache_size: Any = None
+    temp_store: Any = None
+    auto_vacuum: Any = None
+    page_size: Any = None
+    mmap_size: Any = None
+    # NOTE(review): declared here, but PragmaOptimizer in this module neither
+    # captures nor restores this field.
+    wal_autocheckpoint: Any = None
+class PragmaOptimizer:
+    def __init__(self, execute_fn: Any, fetch_pragma_fn: Any) -> None:
+        self._execute = execute_fn
+        self._fetch_pragma = fetch_pragma_fn
+        self._original: PragmaProfile | None = None
+    def preserve(self) -> None:
+        self._original = PragmaProfile(
+            synchronous=self._fetch_pragma("synchronous"),
+            journal_mode=self._fetch_pragma("journal_mode"),
+            cache_size=self._fetch_pragma("cache_size"),
+            temp_store=self._fetch_pragma("temp_store"),
+            auto_vacuum=self._fetch_pragma("auto_vacuum"),
+            page_size=self._fetch_pragma("page_size"),
+            mmap_size=self._fetch_pragma("mmap_size"),
+        )
+        logger.debug("Preserved PRAGMA config", config=self._original)
+    def optimize(self, expected_rows: int | None = None) -> None:
+        if expected_rows is None:
+            expected_rows = 10000
+        if expected_rows > 100000:
+            self._apply_aggressive()
+        elif expected_rows > 10000:
+            self._apply_moderate()
+        else:
+            self._apply_light()
+    def _apply_light(self) -> None:
+        self._execute("PRAGMA synchronous = NORMAL")
+        self._execute("PRAGMA temp_store = MEMORY")
+        self._execute("PRAGMA cache_size = -8000")
+        logger.debug("Applied LIGHT PRAGMA optimization")
+    def _apply_moderate(self) -> None:
+        self._execute("PRAGMA synchronous = OFF")
+        self._execute("PRAGMA journal_mode = MEMORY")
+        self._execute("PRAGMA temp_store = MEMORY")
+        self._execute("PRAGMA cache_size = -16000")
+        self._execute("PRAGMA mmap_size = 268435456")
+        logger.debug("Applied MODERATE PRAGMA optimization")
+    def _apply_aggressive(self) -> None:
+        self._execute("PRAGMA synchronous = OFF")
+        self._execute("PRAGMA journal_mode = OFF")
+        self._execute("PRAGMA temp_store = MEMORY")
+        self._execute("PRAGMA cache_size = -32000")
+        self._execute("PRAGMA mmap_size = 536870912")
+        self._execute("PRAGMA page_size = 4096")
+        logger.debug("Applied AGGRESSIVE PRAGMA optimization")
+    def restore(self) -> None:
+        if self._original is None:
+            return
+        for attr in [
+            "synchronous",
+            "journal_mode",
+            "cache_size",
+            "temp_store",
+            "auto_vacuum",
+            "page_size",
+            "mmap_size",
+        ]:
+            value = getattr(self._original, attr)
+            if value is not None and (
+                isinstance(value, (int, float)) or (isinstance(value, str) and re.match(r"^[a-zA-Z0-9_-]+$", value))
+            ):
+                self._execute(f"PRAGMA {attr} = {value}")
+        logger.debug("Restored original PRAGMA config")
+        self._original = None

sqlseed/database/raw_sqlite_adapter.py ADDED Viewed

@@ -0,0 +1,197 @@
+from __future__ import annotations
+import sqlite3
+from typing import TYPE_CHECKING, Any
+from typing_extensions import Self
+from sqlseed._utils.logger import get_logger
+from sqlseed._utils.sql_safe import build_insert_sql, quote_identifier
+from sqlseed.database._protocol import ColumnInfo, ForeignKeyInfo, IndexInfo
+from sqlseed.database.optimizer import PragmaOptimizer
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+logger = get_logger(__name__)
+class RawSQLiteAdapter:
+    def __init__(self) -> None:
+        self._conn: sqlite3.Connection | None = None
+        self._optimizer: PragmaOptimizer | None = None
+        self._db_path: str = ""
+    @property
+    def conn(self) -> sqlite3.Connection:
+        assert self._conn is not None, "Database not connected. Call connect() first."
+        return self._conn
+    def connect(self, db_path: str) -> None:
+        self._db_path = db_path
+        self._conn = sqlite3.connect(db_path)
+        self._conn.execute("PRAGMA foreign_keys = ON")
+        self._optimizer = PragmaOptimizer(
+            execute_fn=self._execute_pragma,
+            fetch_pragma_fn=self._fetch_pragma,
+        )
+        logger.debug("Connected to database via raw sqlite3", db_path=db_path)
+    def close(self) -> None:
+        if self._conn is not None:
+            self._conn.close()
+            self._conn = None
+            logger.debug("Closed raw sqlite3 connection", db_path=self._db_path)
+    def get_table_names(self) -> list[str]:
+        cursor = self.conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
+        return [row[0] for row in cursor.fetchall()]
+    def get_column_info(self, table_name: str) -> list[ColumnInfo]:
+        pks = set(self.get_primary_keys(table_name))
+        fks = {fk.column for fk in self.get_foreign_keys(table_name)}
+        cursor = self.conn.execute(f"PRAGMA table_info({quote_identifier(table_name)})")
+        result: list[ColumnInfo] = []
+        for row in cursor.fetchall():
+            _cid, name, col_type, notnull, default_val, _is_pk = row
+            is_pk_flag = name in pks
+            is_autoincrement = is_pk_flag and self._is_autoincrement(table_name, name)
+            result.append(
+                ColumnInfo(
+                    name=name,
+                    type=col_type.upper() if col_type else "TEXT",
+                    nullable=not notnull and name not in fks,
+                    default=default_val,
+                    is_primary_key=is_pk_flag,
+                    is_autoincrement=is_autoincrement,
+                )
+            )
+        return result
+    def get_primary_keys(self, table_name: str) -> list[str]:
+        cursor = self.conn.execute(f"PRAGMA table_info({quote_identifier(table_name)})")
+        pks: list[str] = []
+        for row in cursor.fetchall():
+            _, name, _, _, _, is_pk = row
+            if is_pk:
+                pks.append(name)
+        return pks
+    def get_foreign_keys(self, table_name: str) -> list[ForeignKeyInfo]:
+        cursor = self.conn.execute(f"PRAGMA foreign_key_list({quote_identifier(table_name)})")
+        result: list[ForeignKeyInfo] = []
+        for row in cursor.fetchall():
+            _, _, ref_table, from_col, to_col, *_ = row
+            result.append(
+                ForeignKeyInfo(
+                    column=from_col,
+                    ref_table=ref_table,
+                    ref_column=to_col,
+                )
+            )
+        return result
+    def get_row_count(self, table_name: str) -> int:
+        safe_table = quote_identifier(table_name)
+        cursor = self.conn.execute(f"SELECT COUNT(*) FROM {safe_table}")
+        return int(cursor.fetchone()[0])
+    def get_column_values(self, table_name: str, column_name: str, limit: int = 1000) -> list[Any]:
+        safe_table = quote_identifier(table_name)
+        safe_column = quote_identifier(column_name)
+        cursor = self.conn.execute(
+            f"SELECT {safe_column} FROM {safe_table} LIMIT ?",
+            [limit],
+        )
+        return [row[0] for row in cursor.fetchall()]
+    def get_index_info(self, table_name: str) -> list[IndexInfo]:
+        safe_table = quote_identifier(table_name)
+        cursor = self.conn.execute(f"PRAGMA index_list({safe_table})")
+        result: list[IndexInfo] = []
+        for row in cursor.fetchall():
+            idx_name = row[1]
+            is_unique = bool(row[2])
+            if idx_name.startswith("sqlite_autoindex_"):
+                continue
+            col_cursor = self.conn.execute(f"PRAGMA index_info({quote_identifier(idx_name)})")
+            columns = [cr[2] for cr in col_cursor.fetchall() if cr[2] is not None]
+            result.append(IndexInfo(name=idx_name, table=table_name, columns=columns, unique=is_unique))
+        return result
+    def get_sample_rows(self, table_name: str, limit: int = 5) -> list[dict[str, Any]]:
+        safe_table = quote_identifier(table_name)
+        columns = self.get_column_info(table_name)
+        col_names = [quote_identifier(c.name) for c in columns]
+        cols_sql = ", ".join(col_names)
+        cursor = self.conn.execute(f"SELECT {cols_sql} FROM {safe_table} LIMIT ?", [limit])
+        col_name_list = [c.name for c in columns]
+        return [dict(zip(col_name_list, row, strict=False)) for row in cursor.fetchall()]
+    def batch_insert(
+        self,
+        table_name: str,
+        data: Iterator[dict[str, Any]],
+        batch_size: int = 5000,
+    ) -> int:
+        inserted = 0
+        batch: list[dict[str, Any]] = []
+        for row in data:
+            batch.append(row)
+            if len(batch) >= batch_size:
+                inserted += self._insert_batch(table_name, batch)
+                batch = []
+        if batch:
+            inserted += self._insert_batch(table_name, batch)
+        return inserted
+    def _insert_batch(self, table_name: str, batch: list[dict[str, Any]]) -> int:
+        if not batch:
+            return 0
+        column_names = list(batch[0].keys())
+        sql = build_insert_sql(table_name, column_names)
+        values = [tuple(row[col] for col in column_names) for row in batch]
+        self.conn.executemany(sql, values)
+        self.conn.commit()
+        return len(batch)
+    def clear_table(self, table_name: str) -> None:
+        safe_table = quote_identifier(table_name)
+        self.conn.execute(f"DELETE FROM {safe_table}")
+        self.conn.commit()
+        logger.debug("Cleared table", table_name=table_name)
+    def optimize_for_bulk_write(self, expected_rows: int | None = None) -> None:
+        if self._optimizer is not None:
+            self._optimizer.preserve()
+            self._optimizer.optimize(expected_rows)
+    def restore_settings(self) -> None:
+        if self._optimizer is not None:
+            self._optimizer.restore()
+            self.conn.commit()
+    def _is_autoincrement(self, table_name: str, column_name: str) -> bool:
+        from sqlseed._utils.schema_helpers import detect_autoincrement
+        return detect_autoincrement(self.conn.execute, table_name, column_name)
+    def _execute_pragma(self, sql: str) -> None:
+        self.conn.execute(sql)
+    def _fetch_pragma(self, name: str) -> Any:
+        cursor = self.conn.execute(f"PRAGMA {name}")
+        row = cursor.fetchone()
+        return row[0] if row else None
+    def __enter__(self) -> Self:
+        return self
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: Any,
+    ) -> None:
+        self.close()