sqlseed-ai 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,69 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ .venv/
25
+ env/
26
+ ENV/
27
+
28
+ # IDE
29
+ .vscode/
30
+ .idea/
31
+ *.swp
32
+ *.swo
33
+ *~
34
+
35
+ # Testing
36
+ .pytest_cache/
37
+ .coverage
38
+ htmlcov/
39
+
40
+ # Type checking
41
+ .mypy_cache/
42
+ .dmypy.json
43
+ dmypy.json
44
+
45
+ # Linting
46
+ .ruff_cache/
47
+
48
+ # Project specific
49
+ *.db
50
+ *.sqlite
51
+ *.sqlite3
52
+ snapshots/
53
+
54
+ # AI cache
55
+ .sqlseed_cache/
56
+
57
+ # Archived temp files
58
+ _archived_temp/
59
+
60
+ # macOS
61
+ .DS_Store
62
+
63
+ # Trae IDE
64
+ .trae/
65
+
66
+ # Build artifacts
67
+ dist/
68
+ *.whl
69
+ *.tar.gz
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: sqlseed-ai
3
+ Version: 0.1.0
4
+ Summary: AI-powered data generation plugin for sqlseed
5
+ Project-URL: Homepage, https://github.com/sunbos/sqlseed
6
+ Project-URL: Repository, https://github.com/sunbos/sqlseed/tree/main/plugins/sqlseed-ai
7
+ Author-email: SunBo <1443584939@qq.com>
8
+ License-Expression: AGPL-3.0-or-later
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Python: >=3.10
17
+ Requires-Dist: openai>=1.0
18
+ Requires-Dist: pydantic>=2.0
19
+ Requires-Dist: sqlseed
@@ -0,0 +1,37 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "sqlseed-ai"
7
+ version = "0.1.0"
8
+ requires-python = ">=3.10"
9
+ description = "AI-powered data generation plugin for sqlseed"
10
+ license = "AGPL-3.0-or-later"
11
+ authors = [
12
+ {name = "SunBo", email = "1443584939@qq.com"},
13
+ ]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ ]
23
+ dependencies = [
24
+ "sqlseed",
25
+ "openai>=1.0",
26
+ "pydantic>=2.0",
27
+ ]
28
+
29
+ [project.urls]
30
+ Homepage = "https://github.com/sunbos/sqlseed"
31
+ Repository = "https://github.com/sunbos/sqlseed/tree/main/plugins/sqlseed-ai"
32
+
33
+ [project.entry-points."sqlseed"]
34
+ ai = "sqlseed_ai:plugin"
35
+
36
+ [tool.hatch.build.targets.wheel]
37
+ packages = ["src/sqlseed_ai"]
@@ -0,0 +1,88 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Any
5
+
6
+ from sqlseed.plugins.hookspecs import hookimpl
7
+ from sqlseed_ai.analyzer import SchemaAnalyzer
8
+
9
# Pre-compiled, case-insensitive pattern of common "simple" column tokens
# (name/email/date/int/...), each delimited by string start/end, an
# underscore, or whitespace.  Columns whose name or declared type matches
# are assumed cheap to generate with built-in providers, so the AI template
# hook skips them (see AISqlseedPlugin._is_simple_column).
_SIMPLE_COL_RE = re.compile(
    r'(^|[_\s])('
    r'name|email|phone|address|url|uuid|'
    r'date|time|datetime|timestamp|boolean|'
    r'int|float|double|real|text|string|'
    r'char|varchar|blob|byte|id|code|title|'
    r'description|status|type|category|count|'
    r'amount|price|value|number|index|order|level'
    r')($|[_\s])',
    re.IGNORECASE,
)
20
+
21
+
22
class AISqlseedPlugin:
    """sqlseed plugin object wiring AI-backed hooks into the host toolkit.

    A single SchemaAnalyzer is created on demand and reused across all
    hook invocations.
    """

    def __init__(self) -> None:
        # Created lazily by _get_analyzer() so importing the plugin is cheap.
        self._analyzer: SchemaAnalyzer | None = None

    def _get_analyzer(self) -> SchemaAnalyzer:
        """Return the shared analyzer, instantiating it on first use."""
        if self._analyzer is None:
            self._analyzer = SchemaAnalyzer()
        return self._analyzer

    def _is_simple_column(self, column_name: str, column_type: str) -> bool:
        """Report whether the column looks "simple" by name or declared type."""
        return any(
            _SIMPLE_COL_RE.search(candidate) is not None
            for candidate in (column_name, column_type)
        )

    @hookimpl
    def sqlseed_ai_analyze_table(
        self,
        table_name: str,
        columns: list[Any],
        indexes: list[dict[str, Any]],
        sample_data: list[dict[str, Any]],
        foreign_keys: list[Any],
        all_table_names: list[str],
    ) -> dict[str, Any] | None:
        """Delegate whole-table schema analysis to the shared analyzer."""
        return self._get_analyzer().analyze_table(
            table_name=table_name,
            columns=columns,
            indexes=indexes,
            sample_data=sample_data,
            foreign_keys=foreign_keys,
            all_table_names=all_table_names,
        )

    @hookimpl
    def sqlseed_pre_generate_templates(
        self,
        table_name: str,
        column_name: str,
        column_type: str,
        count: int,
        sample_data: list[Any],
    ) -> list[Any] | None:
        """Produce AI-generated template values for one column.

        Returns None — meaning "let the host's built-in generators handle
        it" — both for simple columns and on any AI failure (best effort).
        """
        if self._is_simple_column(column_name, column_type):
            return None

        try:
            # Cap the request at 50 values to bound token usage per column.
            return self._get_analyzer().generate_template_values(
                column_name=column_name,
                column_type=column_type,
                count=min(count, 50),
                sample_data=sample_data,
            )
        except Exception:
            return None

    @hookimpl
    def sqlseed_register_providers(self, registry: Any) -> None:
        """This plugin registers no custom providers."""
        pass

    @hookimpl
    def sqlseed_register_column_mappers(self, mapper: Any) -> None:
        """This plugin registers no custom column mappers."""
        pass
86
+
87
+
88
# Module-level singleton exposed via the "sqlseed" entry point
# (ai = "sqlseed_ai:plugin" in pyproject.toml).
plugin = AISqlseedPlugin()
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from sqlseed._utils.logger import get_logger
6
+
7
+ logger = get_logger(__name__)
8
+
9
+
10
+ def get_openai_client(config: Any | None = None) -> Any:
11
+ try:
12
+ from openai import OpenAI
13
+
14
+ from sqlseed_ai.config import AIConfig
15
+
16
+ if config is None:
17
+ config = AIConfig.from_env()
18
+
19
+ api_key = config.api_key if hasattr(config, "api_key") else None
20
+ base_url = config.base_url if hasattr(config, "base_url") else None
21
+
22
+ if not api_key:
23
+ raise ValueError(
24
+ "AI API key not configured. Set SQLSEED_AI_API_KEY or OPENAI_API_KEY environment variable."
25
+ )
26
+
27
+ return OpenAI(api_key=api_key, base_url=base_url)
28
+ except ImportError:
29
+ raise ImportError(
30
+ "openai is not installed. Install it with: pip install sqlseed-ai"
31
+ ) from None
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import Any
5
+
6
+
7
def parse_json_response(content: str) -> dict[str, Any]:
    """Parse an LLM reply as a JSON object, tolerating markdown fences.

    Strips a leading ```json / ``` fence and a trailing ``` fence, then
    attempts ``json.loads``.  Returns the parsed mapping, or an empty dict
    when the content is not valid JSON or is not a JSON object.
    """
    text = content.strip()
    # Remove an opening markdown fence (with or without the "json" tag).
    if text.startswith("```json"):
        text = text[len("```json"):]
    if text.startswith("```"):
        text = text[len("```"):]
    # Remove a closing fence, if present.
    if text.endswith("```"):
        text = text[: -len("```")]
    text = text.strip()

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return {}
    return parsed if isinstance(parsed, dict) else {}
@@ -0,0 +1,304 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from sqlseed._utils.logger import get_logger
6
+ from sqlseed_ai._client import get_openai_client
7
+ from sqlseed_ai.config import AIConfig
8
+
9
+ logger = get_logger(__name__)
10
+
11
+ SYSTEM_PROMPT = """You are an expert database test data engineer.
12
+ You analyze SQLite table schemas and recommend data generation configurations for the sqlseed toolkit.
13
+
14
+ ## Available Generators
15
+ - string (params: min_length, max_length, charset)
16
+ - integer (params: min_value, max_value)
17
+ - float (params: min_value, max_value, precision)
18
+ - boolean
19
+ - bytes (params: length)
20
+ - name, first_name, last_name
21
+ - email, phone, address, company
22
+ - url, ipv4, uuid
23
+ - date (params: start_year, end_year)
24
+ - datetime (params: start_year, end_year)
25
+ - timestamp
26
+ - text (params: min_length, max_length)
27
+ - sentence, password
28
+ - choice (params: choices)
29
+ - json (params: schema)
30
+ - pattern (params: regex) — generates strings matching a regex pattern
31
+
32
+ ## Key Rules
33
+ 1. INTEGER PRIMARY KEY AUTOINCREMENT columns → do NOT include (auto-skip)
34
+ 2. Columns with DEFAULT values → do NOT include (auto-skip)
35
+ 3. Nullable columns → do NOT include unless they have semantic meaning
36
+ 4. Use `pattern` generator with regex for card numbers, codes, IDs with specific formats
37
+ 5. Use `derive_from` + `expression` when one column is computed from another
38
+ 6. Use `constraints.unique: true` for columns that must be unique
39
+ 7. Detect cross-column dependencies: if CutCard4byte = last 8 chars of sCardNo, use derive_from
40
+ 8. Detect implicit business associations: if sUserNo appears in multiple tables, note it
41
+
42
+ ## Output Format
43
+ You MUST respond with a valid JSON object (NOT YAML, NOT markdown fences).
44
+ The JSON object must have this exact structure:
45
+ {
46
+ "name": "table_name",
47
+ "count": 1000,
48
+ "columns": [
49
+ {
50
+ "name": "column_name",
51
+ "generator": "generator_name",
52
+ "params": {"key": "value"}
53
+ },
54
+ {
55
+ "name": "derived_column",
56
+ "derive_from": "source_column",
57
+ "expression": "value[-8:]",
58
+ "constraints": {"unique": true}
59
+ }
60
+ ]
61
+ }
62
+
63
+ IMPORTANT: Do NOT include columns that are PRIMARY KEY AUTOINCREMENT or have DEFAULT values."""
64
+
65
+
66
+ class SchemaAnalyzer:
67
+ def __init__(self, config: AIConfig | None = None) -> None:
68
+ self._config = config
69
+
70
+ def analyze_table(
71
+ self,
72
+ table_name: str,
73
+ columns: list[Any],
74
+ indexes: list[dict[str, Any]],
75
+ sample_data: list[dict[str, Any]],
76
+ foreign_keys: list[Any],
77
+ all_table_names: list[str],
78
+ ) -> dict[str, Any] | None:
79
+ if self._config is None:
80
+ self._config = AIConfig.from_env()
81
+
82
+ if not self._config.api_key:
83
+ logger.warning("AI API key not configured, skipping analysis")
84
+ return None
85
+
86
+ messages = self.build_initial_messages(
87
+ table_name=table_name,
88
+ columns=columns,
89
+ indexes=indexes,
90
+ sample_data=sample_data,
91
+ foreign_keys=foreign_keys,
92
+ all_table_names=all_table_names,
93
+ )
94
+
95
+ try:
96
+ return self.call_llm(messages)
97
+ except Exception as e:
98
+ logger.warning("AI analysis failed", table_name=table_name, error=str(e))
99
+ return None
100
+
101
+ def build_initial_messages(
102
+ self,
103
+ table_name: str,
104
+ columns: list[Any],
105
+ indexes: list[dict[str, Any]],
106
+ sample_data: list[dict[str, Any]],
107
+ foreign_keys: list[Any],
108
+ all_table_names: list[str],
109
+ distribution_profiles: list[dict[str, Any]] | None = None,
110
+ ) -> list[dict[str, str]]:
111
+ context = self._build_context(
112
+ table_name=table_name,
113
+ columns=columns,
114
+ indexes=indexes,
115
+ sample_data=sample_data,
116
+ foreign_keys=foreign_keys,
117
+ all_table_names=all_table_names,
118
+ distribution_profiles=distribution_profiles,
119
+ )
120
+ messages: list[dict[str, str]] = [
121
+ {"role": "system", "content": SYSTEM_PROMPT},
122
+ ]
123
+
124
+ from sqlseed_ai.examples import FEW_SHOT_EXAMPLES
125
+ for example in FEW_SHOT_EXAMPLES:
126
+ messages.append({"role": "user", "content": example["input"]})
127
+ messages.append({"role": "assistant", "content": example["output"]})
128
+
129
+ messages.append({"role": "user", "content": context})
130
+
131
+ return messages
132
+
133
+ def call_llm(self, messages: list[dict[str, str]]) -> dict[str, Any]:
134
+ if self._config is None:
135
+ self._config = AIConfig.from_env()
136
+ if not self._config.api_key:
137
+ raise ValueError("AI API key not configured")
138
+
139
+ client = get_openai_client(self._config)
140
+ try:
141
+ response = client.chat.completions.create(
142
+ model=self._config.model,
143
+ messages=messages,
144
+ max_tokens=self._config.max_tokens,
145
+ temperature=self._config.temperature,
146
+ response_format={"type": "json_object"},
147
+ )
148
+ except Exception as e:
149
+ raise RuntimeError(
150
+ f"LLM API call failed (model={self._config.model}): {e}"
151
+ ) from e
152
+
153
+ if not response.choices:
154
+ raise RuntimeError(
155
+ f"LLM returned no choices (model={self._config.model}). "
156
+ "The API key or model may be invalid."
157
+ )
158
+ content = response.choices[0].message.content
159
+ if content is None:
160
+ return {}
161
+ return self._parse_json_response(content)
162
+
163
+ TEMPLATE_SYSTEM_PROMPT = (
164
+ "You are a data generation assistant. Generate realistic sample values "
165
+ "for the given database column. Return a JSON object with a 'values' "
166
+ "array containing the requested number of unique, realistic values. "
167
+ "Each value must be valid for the column type. Do NOT include explanations."
168
+ )
169
+
170
+ def generate_template_values(
171
+ self,
172
+ column_name: str,
173
+ column_type: str,
174
+ count: int,
175
+ sample_data: list[Any],
176
+ ) -> list[Any]:
177
+ prompt = (
178
+ f"Generate {count} realistic sample values for a database column "
179
+ f"named '{column_name}' with type '{column_type}'."
180
+ )
181
+ if sample_data:
182
+ prompt += f"\nExisting sample values: {sample_data[:5]}"
183
+ prompt += (
184
+ f"\nRespond with a JSON object: {{\"values\": [...]}}."
185
+ f"\nEach value should be a valid {column_type} value."
186
+ )
187
+
188
+ messages = [
189
+ {"role": "system", "content": self.TEMPLATE_SYSTEM_PROMPT},
190
+ {"role": "user", "content": prompt},
191
+ ]
192
+ result = self.call_llm(messages)
193
+ return result.get("values", [])
194
+
195
+ def _build_context(
196
+ self,
197
+ table_name: str,
198
+ columns: list[Any],
199
+ indexes: list[dict[str, Any]],
200
+ sample_data: list[dict[str, Any]],
201
+ foreign_keys: list[Any],
202
+ all_table_names: list[str],
203
+ distribution_profiles: list[dict[str, Any]] | None = None,
204
+ ) -> str:
205
+ lines: list[str] = []
206
+ lines.append(f"# Table: {table_name}")
207
+ lines.append("")
208
+
209
+ self._append_columns_info(lines, columns)
210
+
211
+ if indexes:
212
+ self._append_indexes_info(lines, indexes)
213
+
214
+ if foreign_keys:
215
+ lines.append("")
216
+ lines.append("## Foreign Keys")
217
+ for fk in foreign_keys:
218
+ lines.append(f"- {fk.column} → {fk.ref_table}.{fk.ref_column}")
219
+
220
+ if all_table_names:
221
+ lines.append("")
222
+ lines.append("## All Tables in Database")
223
+ lines.append(", ".join(all_table_names))
224
+
225
+ if sample_data:
226
+ lines.append("")
227
+ lines.append("## Sample Data (existing rows)")
228
+ for i, row in enumerate(sample_data[:5]):
229
+ row_str = ", ".join(f"{k}={v}" for k, v in row.items())
230
+ lines.append(f" Row {i + 1}: {row_str}")
231
+
232
+ if distribution_profiles:
233
+ self._append_distribution_info(lines, distribution_profiles)
234
+
235
+ lines.append("")
236
+ lines.append(
237
+ "Please analyze this table schema and recommend "
238
+ "a complete sqlseed JSON configuration for generating test data."
239
+ )
240
+
241
+ return "\n".join(lines)
242
+
243
+ def _append_columns_info(
244
+ self,
245
+ lines: list[str],
246
+ columns: list[Any],
247
+ ) -> None:
248
+ lines.append("## Columns")
249
+ for col in columns:
250
+ parts = [f"- {col.name}: {col.type}"]
251
+ if col.is_primary_key:
252
+ parts.append("PRIMARY KEY")
253
+ if col.is_autoincrement:
254
+ parts.append("AUTOINCREMENT")
255
+ if col.nullable:
256
+ parts.append("NULLABLE")
257
+ if col.default is not None:
258
+ parts.append(f"DEFAULT={col.default}")
259
+ if not col.nullable and col.default is None and not col.is_primary_key:
260
+ parts.append("NOT NULL")
261
+ lines.append(" ".join(parts))
262
+
263
+ def _append_indexes_info(
264
+ self,
265
+ lines: list[str],
266
+ indexes: list[dict[str, Any]],
267
+ ) -> None:
268
+ lines.append("")
269
+ lines.append("## Indexes")
270
+ for idx in indexes:
271
+ unique_str = "UNIQUE " if idx.get("unique") else ""
272
+ cols_str = ", ".join(idx.get("columns", []))
273
+ lines.append(f"- {unique_str}INDEX ({cols_str})")
274
+
275
+ def _append_distribution_info(
276
+ self,
277
+ lines: list[str],
278
+ distribution_profiles: list[dict[str, Any]],
279
+ ) -> None:
280
+ lines.append("")
281
+ lines.append("## Column Distribution (from existing data)")
282
+ for profile in distribution_profiles:
283
+ col = profile["column"]
284
+ distinct = profile.get("distinct_count", "?")
285
+ null_ratio = profile.get("null_ratio", 0)
286
+ lines.append(
287
+ f"- {col}: {distinct} distinct values, {null_ratio:.1%} null"
288
+ )
289
+ top_values = profile.get("top_values", [])
290
+ if top_values:
291
+ top_str = ", ".join(
292
+ f"{tv['value']}({tv['frequency']:.0%})"
293
+ for tv in top_values[:3]
294
+ )
295
+ lines.append(f" Top values: {top_str}")
296
+ vr = profile.get("value_range")
297
+ if vr:
298
+ lines.append(f" Range: [{vr['min']}, {vr['max']}]")
299
+
300
+ def _parse_json_response(self, content: str) -> dict[str, Any]:
301
+ from sqlseed_ai._json_utils import parse_json_response
302
+
303
+ return parse_json_response(content)
304
+
@@ -0,0 +1,20 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class AIConfig(BaseModel):
    """Runtime configuration for the AI plugin (pydantic-validated)."""

    # API key for the OpenAI-compatible endpoint; None disables AI features.
    api_key: str | None = None
    # Default model name sent to the endpoint.
    model: str = "qwen3-coder-plus"
    # Optional custom endpoint (e.g. a proxy or an alternative provider).
    base_url: str | None = None
    # Bounded to the API's accepted range [0.0, 2.0].
    temperature: float = Field(default=0.3, ge=0.0, le=2.0)
    # Must be positive.
    max_tokens: int = Field(default=4096, gt=0)

    @classmethod
    def from_env(cls) -> AIConfig:
        """Build a config from environment variables.

        SQLSEED_AI_* variables take precedence over the OPENAI_* fallbacks.
        Note: temperature and max_tokens are not read from the environment
        here; they keep their field defaults.
        """
        api_key = os.environ.get("SQLSEED_AI_API_KEY") or os.environ.get("OPENAI_API_KEY")
        base_url = os.environ.get("SQLSEED_AI_BASE_URL") or os.environ.get("OPENAI_BASE_URL")
        # Fall back to the field's declared default when the env var is unset.
        model = os.environ.get("SQLSEED_AI_MODEL", cls.model_fields["model"].default)
        return cls(api_key=api_key, base_url=base_url, model=model)