PyPI - schema-search - Versions diffs - 0.1.10__py3-none-any.whl - Mend

schema-search 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

schema_search/__init__.py +26 -0
schema_search/chunkers/__init__.py +6 -0
schema_search/chunkers/base.py +95 -0
schema_search/chunkers/factory.py +31 -0
schema_search/chunkers/llm.py +54 -0
schema_search/chunkers/markdown.py +25 -0
schema_search/embedding_cache/__init__.py +5 -0
schema_search/embedding_cache/base.py +40 -0
schema_search/embedding_cache/bm25.py +63 -0
schema_search/embedding_cache/factory.py +20 -0
schema_search/embedding_cache/inmemory.py +122 -0
schema_search/graph_builder.py +69 -0
schema_search/mcp_server.py +81 -0
schema_search/metrics.py +33 -0
schema_search/rankers/__init__.py +5 -0
schema_search/rankers/base.py +45 -0
schema_search/rankers/cross_encoder.py +40 -0
schema_search/rankers/factory.py +11 -0
schema_search/schema_extractor.py +135 -0
schema_search/schema_search.py +276 -0
schema_search/search/__init__.py +15 -0
schema_search/search/base.py +85 -0
schema_search/search/bm25.py +48 -0
schema_search/search/factory.py +61 -0
schema_search/search/fuzzy.py +56 -0
schema_search/search/hybrid.py +82 -0
schema_search/search/semantic.py +49 -0
schema_search/types.py +57 -0
schema_search/utils/__init__.py +0 -0
schema_search/utils/lazy_import.py +26 -0
schema_search-0.1.10.dist-info/METADATA +308 -0
schema_search-0.1.10.dist-info/RECORD +40 -0
schema_search-0.1.10.dist-info/WHEEL +5 -0
schema_search-0.1.10.dist-info/entry_points.txt +2 -0
schema_search-0.1.10.dist-info/licenses/LICENSE +21 -0
schema_search-0.1.10.dist-info/top_level.txt +2 -0
tests/__init__.py +0 -0
tests/test_integration.py +352 -0
tests/test_llm_sql_generation.py +320 -0
tests/test_spider_eval.py +488 -0

schema_search/rankers/base.py ADDED Viewed

@@ -0,0 +1,45 @@
+from typing import Dict, List, Tuple
+from collections import defaultdict
+from abc import ABC, abstractmethod
+from schema_search.chunkers import Chunk
+class BaseRanker(ABC):
+    """Abstract base for rerankers that score chunks against a query.
+    Subclasses implement ``build`` (receive the candidate chunks) and ``rank``
+    (return ``(chunk_idx, score)`` pairs); the indices refer to ``self.chunks``.
+    """
+    def __init__(self):
+        # Assign a real list (not just an annotation) so helpers that read
+        # ``self.chunks`` work even when ``build`` has not been called yet.
+        self.chunks: List[Chunk] = []
+    @abstractmethod
+    def build(self, chunks: List[Chunk]) -> None:
+        """Prepare the ranker for the given candidate chunks."""
+        pass
+    @abstractmethod
+    def rank(self, query: str) -> List[Tuple[int, float]]:
+        """Returns: List of (chunk_idx, score)"""
+        pass
+    def get_top_tables_from_chunks(
+        self, ranked_chunks: List[Tuple[int, float]], top_k: int
+    ) -> Dict[str, List[int]]:
+        """Group ranked chunks by table and keep the ``top_k`` best tables.
+        A table's score is the highest score among its chunks. The returned
+        dict maps each kept table name to the indices of its chunks (in the
+        order they appear in ``ranked_chunks``), best table first.
+        Args:
+            ranked_chunks: ``(chunk_idx, score)`` pairs indexing ``self.chunks``.
+            top_k: Maximum number of tables to keep; values <= 0 yield ``{}``.
+        """
+        # A negative slice bound would silently drop tables from the end,
+        # so treat every non-positive ``top_k`` as "keep nothing".
+        if top_k <= 0:
+            return {}
+        table_to_chunk_indices: Dict[str, List[int]] = defaultdict(list)
+        table_scores: Dict[str, float] = {}
+        for chunk_idx, score in ranked_chunks:
+            table_name = self.chunks[chunk_idx].table_name
+            table_to_chunk_indices[table_name].append(chunk_idx)
+            # Track the running maximum instead of re-scanning every chunk later.
+            if table_name not in table_scores or score > table_scores[table_name]:
+                table_scores[table_name] = score
+        # ``sorted`` is stable, so tables with equal scores keep first-seen order.
+        top_tables = sorted(table_scores, key=table_scores.__getitem__, reverse=True)
+        return {
+            table_name: table_to_chunk_indices[table_name]
+            for table_name in top_tables[:top_k]
+        }

schema_search/rankers/cross_encoder.py ADDED Viewed

@@ -0,0 +1,40 @@
+from typing import List, Tuple, Optional, TYPE_CHECKING
+import logging
+from schema_search.chunkers import Chunk
+from schema_search.rankers.base import BaseRanker
+from schema_search.utils.lazy_import import lazy_import_check
+if TYPE_CHECKING:
+    from sentence_transformers import CrossEncoder
+logger = logging.getLogger(__name__)
+class CrossEncoderRanker(BaseRanker):
+    """Reranker that scores ``(query, chunk.content)`` pairs with a CrossEncoder.
+    The model named by ``model_name`` is loaded lazily on the first ``rank``
+    call, so constructing the ranker does not import sentence_transformers.
+    """
+    def __init__(self, model_name: str):
+        super().__init__()
+        self.model_name = model_name
+        self.model: Optional["CrossEncoder"] = None
+        # Defined up front so ``rank`` is safe to call before ``build``.
+        self.chunks: List[Chunk] = []
+    def _load_model(self) -> "CrossEncoder":
+        """Return the CrossEncoder, importing and instantiating it on first use."""
+        if self.model is not None:
+            return self.model
+        sentence_transformers = lazy_import_check(
+            "sentence_transformers", "semantic", "reranking with CrossEncoder"
+        )
+        # Keep the library's own logging quiet below WARNING.
+        logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
+        model = sentence_transformers.CrossEncoder(self.model_name)
+        self.model = model
+        logger.info(f"Loaded CrossEncoder: {self.model_name}")
+        return model
+    def build(self, chunks: List[Chunk]) -> None:
+        """Store the candidate chunks that ``rank`` will score."""
+        self.chunks = chunks
+        logger.debug(f"Initialized CrossEncoder reranker with {len(chunks)} chunks")
+    def rank(self, query: str) -> List[Tuple[int, float]]:
+        """Score every stored chunk against ``query``.
+        Returns:
+            ``(chunk_idx, score)`` pairs sorted by descending score; an empty
+            list when there are no chunks to score.
+        """
+        # Nothing to score: skip the model load and avoid calling ``predict``
+        # on an empty batch.
+        if not self.chunks:
+            return []
+        model = self._load_model()
+        pairs = [(query, chunk.content) for chunk in self.chunks]
+        # NOTE(review): ``predict`` is assumed to return a NumPy array, since
+        # ``argsort`` is called on the result -- confirm against the library.
+        scores = model.predict(pairs, show_progress_bar=False)
+        ranked_indices = scores.argsort()[::-1]
+        return [(int(idx), float(scores[idx])) for idx in ranked_indices]

schema_search/rankers/factory.py ADDED Viewed

@@ -0,0 +1,11 @@
+from typing import Dict, Optional
+from schema_search.rankers.base import BaseRanker
+from schema_search.rankers.cross_encoder import CrossEncoderRanker
+def create_ranker(config: Dict) -> Optional[BaseRanker]:
+    """Build the reranker named by ``config["reranker"]["model"]``.
+    Returns ``None`` when no model is configured. Any falsy value (``None`` or
+    an empty string) counts as "not configured", matching the truthiness
+    checks ``SchemaSearch`` applies to the same setting.
+    """
+    reranker_model = config["reranker"]["model"]
+    if not reranker_model:
+        return None
+    return CrossEncoderRanker(model_name=reranker_model)

schema_search/schema_extractor.py ADDED Viewed

@@ -0,0 +1,135 @@
+from typing import Dict, List, Any
+from sqlalchemy import inspect
+from sqlalchemy.engine import Engine
+from schema_search.types import (
+    TableSchema,
+    ColumnInfo,
+    ForeignKeyInfo,
+    IndexInfo,
+    ConstraintInfo,
+)
+class SchemaExtractor:
+    """Reflect table metadata from a SQLAlchemy engine into ``TableSchema`` dicts.
+    Which sections are reflected (columns, foreign keys, indices, constraints)
+    is controlled by the ``include_*`` flags under ``config["schema"]``.
+    """
+    # Schemas that are never extracted; compared case-insensitively.
+    _SKIPPED_SCHEMAS = frozenset(
+        {
+            "information_schema",
+            "pg_catalog",
+            "pg_toast",
+            "performance_schema",
+            "mysql",
+            "sys",
+        }
+    )
+    def __init__(self, engine: Engine, config: Dict[str, Any]):
+        self.engine = engine
+        self.config = config
+    def extract(self) -> Dict[str, TableSchema]:
+        """Return a mapping of table name to its reflected schema.
+        NOTE(review): results are keyed by the bare table name, so same-named
+        tables in different schemas overwrite each other (the last one wins).
+        """
+        inspector = inspect(self.engine)
+        schemas: Dict[str, TableSchema] = {}
+        for schema_name in inspector.get_schema_names():
+            if self._should_skip_schema(schema_name):
+                continue
+            for table_name in inspector.get_table_names(schema=schema_name):
+                schemas[table_name] = self._extract_table(
+                    inspector, table_name, schema_name
+                )
+        return schemas
+    def _should_skip_schema(self, schema_name: str) -> bool:
+        """Return True for schemas listed in ``_SKIPPED_SCHEMAS``."""
+        return schema_name.lower() in self._SKIPPED_SCHEMAS
+    def _extract_table(
+        self, inspector, table_name: str, schema_name: str
+    ) -> TableSchema:
+        """Reflect one table; sections disabled in the config are ``None``."""
+        options = self.config["schema"]
+        pk_constraint = inspector.get_pk_constraint(table_name, schema=schema_name)
+        include_constraints = options["include_constraints"]
+        schema: TableSchema = {
+            "name": table_name,
+            "columns": (
+                self._extract_columns(
+                    inspector.get_columns(table_name, schema=schema_name)
+                )
+                if options["include_columns"]
+                else None
+            ),
+            "primary_keys": pk_constraint["constrained_columns"],
+            "foreign_keys": (
+                self._extract_foreign_keys(
+                    inspector.get_foreign_keys(table_name, schema=schema_name)
+                )
+                if options["include_foreign_keys"]
+                else None
+            ),
+            "indices": (
+                self._extract_indices(
+                    inspector.get_indexes(table_name, schema=schema_name)
+                )
+                if options["include_indices"]
+                else None
+            ),
+            "unique_constraints": (
+                self._extract_constraints(
+                    inspector.get_unique_constraints(table_name, schema=schema_name)
+                )
+                if include_constraints
+                else None
+            ),
+            "check_constraints": (
+                self._extract_constraints(
+                    inspector.get_check_constraints(table_name, schema=schema_name)
+                )
+                if include_constraints
+                else None
+            ),
+        }
+        return schema
+    def _extract_columns(self, columns: List[Dict[str, Any]]) -> List[ColumnInfo]:
+        """Convert reflected column records to ``ColumnInfo`` dicts."""
+        return [
+            {
+                "name": col["name"],
+                "type": str(col["type"]),
+                "nullable": col["nullable"],
+                # Only a missing default maps to None; falsy values such as
+                # 0 or "" are real defaults and are kept as strings.
+                "default": (
+                    str(col["default"]) if col["default"] is not None else None
+                ),
+            }
+            for col in columns
+        ]
+    def _extract_foreign_keys(
+        self, foreign_keys: List[Dict[str, Any]]
+    ) -> List[ForeignKeyInfo]:
+        """Convert reflected foreign-key records to ``ForeignKeyInfo`` dicts."""
+        return [
+            {
+                "constrained_columns": fk["constrained_columns"],
+                "referred_table": fk["referred_table"],
+                "referred_columns": fk["referred_columns"],
+            }
+            for fk in foreign_keys
+        ]
+    def _extract_indices(self, indices: List[Dict[str, Any]]) -> List[IndexInfo]:
+        """Convert reflected index records to ``IndexInfo`` dicts."""
+        return [
+            {
+                "name": idx["name"],
+                "columns": idx["column_names"],
+                "unique": idx["unique"],
+            }
+            for idx in indices
+        ]
+    def _extract_constraints(
+        self, constraints: List[Dict[str, Any]]
+    ) -> List[ConstraintInfo]:
+        """Convert unique/check constraint records to ``ConstraintInfo`` dicts.
+        Check-constraint records reflected by SQLAlchemy carry ``sqltext``
+        rather than ``column_names``, so a missing column list becomes ``[]``
+        instead of raising ``KeyError``. The SQL text itself is not captured.
+        """
+        return [
+            {
+                "name": constraint["name"],
+                "columns": constraint.get("column_names") or [],
+            }
+            for constraint in constraints
+        ]

schema_search/schema_search.py ADDED Viewed

@@ -0,0 +1,276 @@
+import json
+import logging
+import time
+from functools import wraps
+from pathlib import Path
+from typing import Dict, List, Optional
+import yaml
+from sqlalchemy.engine import Engine
+from schema_search.schema_extractor import SchemaExtractor
+from schema_search.chunkers import Chunk, create_chunker
+from schema_search.embedding_cache import create_embedding_cache
+from schema_search.embedding_cache.bm25 import BM25Cache
+from schema_search.graph_builder import GraphBuilder
+from schema_search.search import create_search_strategy
+from schema_search.types import IndexResult, SearchResult, SearchType, TableSchema
+from schema_search.rankers import create_ranker
+logger = logging.getLogger(__name__)
+def time_it(func):
+    """Decorator that records how long ``func`` took to run.
+    When the wrapped call returns a ``dict``, its ``"latency_sec"`` key is set
+    to the elapsed seconds rounded to three decimals; any other return value
+    is passed through untouched.
+    """
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        # perf_counter is monotonic, so the measurement cannot go negative or
+        # jump when the system clock is adjusted (unlike time.time()).
+        start = time.perf_counter()
+        result = func(*args, **kwargs)
+        elapsed = time.perf_counter() - start
+        if isinstance(result, dict):
+            result["latency_sec"] = round(elapsed, 3)
+        return result
+    return wrapper
+class SchemaSearch:
+    """Index a database schema and search it for tables relevant to a query.
+    ``index()`` extracts the schema, builds the table graph and the chunks
+    (reusing on-disk caches when the schema is unchanged); ``search()`` runs
+    the configured strategy over those chunks. Embedding, BM25 and reranker
+    objects are created lazily on first use.
+    """
+    def __init__(
+        self,
+        engine: Engine,
+        config_path: Optional[str] = None,
+        llm_api_key: Optional[str] = None,
+        llm_base_url: Optional[str] = None,
+    ):
+        self.config = self._load_config(config_path)
+        self._setup_logging()
+        base_cache_dir = Path(self.config["embedding"]["cache_dir"])
+        cache_dir = self._resolve_cache_dir(base_cache_dir, engine.url.database)
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        self.schemas: Dict[str, TableSchema] = {}
+        self.chunks: List[Chunk] = []
+        self.cache_dir = cache_dir
+        # Set here so ``search()`` does not hit an AttributeError when it is
+        # called before ``index()``; ``index()`` overwrites it.
+        self._index_force = False
+        self._validate_dependencies()
+        self.schema_extractor = SchemaExtractor(engine, self.config)
+        self.chunker = create_chunker(self.config, llm_api_key, llm_base_url)
+        self._embedding_cache = None
+        self._bm25_cache = None
+        self.graph_builder = GraphBuilder(cache_dir)
+        self._reranker = None
+        self._reranker_config = self.config["reranker"]["model"]
+        self._search_strategies = {}
+    @staticmethod
+    def _resolve_cache_dir(base_cache_dir: Path, database: Optional[str]) -> Path:
+        """Return the per-database cache directory under ``base_cache_dir``.
+        Plain names and relative paths are joined unchanged. An absolute path
+        (e.g. from a SQLite file URL) has its root stripped first, because
+        joining an absolute path would discard ``base_cache_dir`` and point
+        the cache at the database file itself.
+        """
+        db_path = Path(database or "default")
+        if db_path.is_absolute():
+            relative_parts = db_path.parts[1:]
+            db_path = Path(*relative_parts) if relative_parts else Path("default")
+        return base_cache_dir / db_path
+    def _setup_logging(self) -> None:
+        """Configure root logging from ``config["logging"]["level"]``.
+        NOTE(review): ``force=True`` replaces any handlers the host
+        application already installed on the root logger.
+        """
+        level = getattr(logging, self.config["logging"]["level"])
+        logging.basicConfig(
+            level=level,
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+            force=True,
+        )
+        logger.setLevel(level)
+    def _load_config(self, config_path: Optional[str]) -> Dict:
+        """Load the YAML config, defaulting to ``config.yml`` above the package."""
+        if config_path is None:
+            config_path = str(Path(__file__).parent.parent / "config.yml")
+        with open(config_path, encoding="utf-8") as f:
+            return yaml.safe_load(f)
+    def _validate_dependencies(self) -> None:
+        """Fail early when the configured features need a missing optional package."""
+        from schema_search.utils.lazy_import import lazy_import_check
+        strategy = self.config["search"]["strategy"]
+        reranker_model = self.config["reranker"]["model"]
+        chunking_strategy = self.config["chunking"]["strategy"]
+        needs_semantic = strategy in ("semantic", "hybrid") or bool(reranker_model)
+        if needs_semantic:
+            lazy_import_check(
+                "sentence_transformers",
+                "semantic",
+                f"{strategy} search or reranking"
+            )
+        if chunking_strategy == "llm":
+            lazy_import_check("openai", "llm", "LLM-based chunking")
+    @time_it
+    def index(self, force: bool = False) -> IndexResult:
+        """Extract the schema and (re)build the graph and chunks.
+        Args:
+            force: Rebuild everything even when the cached schema matches.
+        Returns:
+            Table and chunk counts; ``latency_sec`` is filled in by ``time_it``.
+        """
+        logger.info("Starting schema indexing" + (" (force)" if force else ""))
+        current_schema = self._extract_current_schema()
+        schema_changed = False
+        if not force:
+            cached_schema = self._load_cached_schema()
+            schema_changed = self._schema_has_changed(cached_schema, current_schema)
+            if schema_changed:
+                logger.info("Schema change detected; forcing reindex")
+        self._cache_schema(current_schema)
+        effective_force = force or schema_changed
+        self.schemas = current_schema
+        self.graph_builder.build(self.schemas, effective_force)
+        self.chunks = self._load_or_generate_chunks(self.schemas, effective_force)
+        # Remembered so the lazily loaded embeddings honour the same decision.
+        self._index_force = effective_force
+        logger.info(
+            f"Indexing complete: {len(self.schemas)} tables, {len(self.chunks)} chunks"
+        )
+        return {
+            "tables": len(self.schemas),
+            "chunks": len(self.chunks),
+            "latency_sec": 0.0,
+        }
+    def _extract_current_schema(self) -> Dict[str, TableSchema]:
+        """Reflect the live database schema."""
+        logger.info("Extracting schema from database")
+        return self.schema_extractor.extract()
+    def _load_cached_schema(self) -> Optional[Dict[str, TableSchema]]:
+        """Return the schema saved by the previous run, or ``None``.
+        A missing or unreadable ``metadata.json`` returns ``None``, which the
+        caller treats as a schema change (full reindex).
+        """
+        schema_cache = self.cache_dir / "metadata.json"
+        if not schema_cache.exists():
+            logger.debug("Schema cache missing; treating as schema change")
+            return None
+        try:
+            with open(schema_cache, encoding="utf-8") as f:
+                return json.load(f)
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            # A truncated or corrupt cache should trigger a rebuild, not a crash.
+            logger.warning("Schema cache unreadable; treating as schema change")
+            return None
+    def _cache_schema(self, schema: Dict[str, TableSchema]) -> None:
+        """Write ``schema`` to ``metadata.json`` in the cache directory."""
+        schema_cache = self.cache_dir / "metadata.json"
+        with open(schema_cache, "w", encoding="utf-8") as f:
+            json.dump(schema, f, indent=2)
+    def _schema_has_changed(
+        self,
+        cached_schema: Optional[Dict[str, TableSchema]],
+        current_schema: Dict[str, TableSchema],
+    ) -> bool:
+        """Return True when there is no cached schema or it differs from the current one."""
+        if cached_schema is None:
+            return True
+        if cached_schema != current_schema:
+            logger.debug("Cached schema differs from current schema")
+            return True
+        logger.debug("Schema matches cached version; reuse existing index")
+        return False
+    def _load_or_generate_chunks(
+        self, schemas: Dict[str, TableSchema], force: bool
+    ) -> List[Chunk]:
+        """Load chunks from ``chunk_metadata.json`` or regenerate and cache them."""
+        chunks_cache = self.cache_dir / "chunk_metadata.json"
+        if not force and chunks_cache.exists():
+            logger.info(f"Loading chunks from cache: {chunks_cache}")
+            with open(chunks_cache, encoding="utf-8") as f:
+                chunk_data = json.load(f)
+            return [
+                Chunk(
+                    table_name=c["table_name"],
+                    content=c["content"],
+                    chunk_id=c["chunk_id"],
+                    token_count=c["token_count"],
+                )
+                for c in chunk_data
+            ]
+        logger.info("Generating chunks from schemas")
+        chunks = self.chunker.chunk_schemas(schemas)
+        chunk_data = [
+            {
+                "table_name": c.table_name,
+                "content": c.content,
+                "chunk_id": c.chunk_id,
+                "token_count": c.token_count,
+            }
+            for c in chunks
+        ]
+        with open(chunks_cache, "w", encoding="utf-8") as f:
+            json.dump(chunk_data, f, indent=2)
+        return chunks
+    def _get_embedding_cache(self):
+        """Create the embedding cache on first use and return it."""
+        if self._embedding_cache is None:
+            self._embedding_cache = create_embedding_cache(self.config, self.cache_dir)
+        return self._embedding_cache
+    def _get_reranker(self):
+        """Create the reranker on first use; ``None`` when no model is configured."""
+        if self._reranker is None and self._reranker_config:
+            self._reranker = create_ranker(self.config)
+        return self._reranker
+    @property
+    def embedding_cache(self):
+        return self._get_embedding_cache()
+    @property
+    def reranker(self):
+        return self._get_reranker()
+    def _get_bm25_cache(self):
+        """Create the BM25 cache on first use and return it."""
+        if self._bm25_cache is None:
+            self._bm25_cache = BM25Cache()
+        return self._bm25_cache
+    def _ensure_embeddings_loaded(self):
+        """Load or generate embeddings for the current chunks if not done yet."""
+        cache = self._get_embedding_cache()
+        if cache.embeddings is None:
+            cache.load_or_generate(
+                self.chunks, self._index_force, self.config["chunking"]
+            )
+    def _ensure_bm25_built(self):
+        """Build the BM25 index over the current chunks if not done yet."""
+        cache = self._get_bm25_cache()
+        if cache.bm25 is None:
+            logger.info("Building BM25 index")
+            cache.build(self.chunks)
+    def _get_search_strategy(self, search_type: str):
+        """Return the strategy for ``search_type``, creating and memoising it."""
+        if search_type not in self._search_strategies:
+            self._search_strategies[search_type] = create_search_strategy(
+                self.config,
+                self._get_embedding_cache,
+                self._get_bm25_cache,
+                self._get_reranker,
+                search_type,
+            )
+        return self._search_strategies[search_type]
+    @time_it
+    def search(
+        self,
+        query: str,
+        hops: Optional[int] = None,
+        limit: int = 5,
+        search_type: Optional[SearchType] = None,
+    ) -> SearchResult:
+        """Search the indexed schema for tables matching ``query``.
+        Args:
+            query: Free-text query.
+            hops: Graph distance passed to the strategy for related tables;
+                defaults to ``config["search"]["hops"]``.
+            limit: Maximum number of results passed to the strategy.
+            search_type: Strategy name; defaults to ``config["search"]["strategy"]``.
+        Returns:
+            The results; ``latency_sec`` is filled in by ``time_it``.
+        """
+        if hops is None:
+            hops = int(self.config["search"]["hops"])
+        # Resolve the default first so the debug line shows the strategy
+        # actually used rather than ``None``.
+        search_type = search_type or self.config["search"]["strategy"]
+        logger.debug(f"Searching: {query} (hops={hops}, search_type={search_type})")
+        if search_type in ["semantic", "hybrid"]:
+            self._ensure_embeddings_loaded()
+        if search_type in ["bm25", "hybrid"]:
+            self._ensure_bm25_built()
+        strategy = self._get_search_strategy(search_type)
+        results = strategy.search(
+            query, self.schemas, self.chunks, self.graph_builder, hops, limit
+        )
+        logger.debug(f"Found {len(results)} results")
+        return {"results": results, "latency_sec": 0.0}

schema_search/search/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+from schema_search.search.base import BaseSearchStrategy
+from schema_search.search.semantic import SemanticSearchStrategy
+from schema_search.search.fuzzy import FuzzySearchStrategy
+from schema_search.search.bm25 import BM25SearchStrategy
+from schema_search.search.hybrid import HybridSearchStrategy
+from schema_search.search.factory import create_search_strategy
+# Public names of the search package: the strategy base class, the four
+# strategy classes and the factory function, all imported above.
+__all__ = [
+    "BaseSearchStrategy",
+    "SemanticSearchStrategy",
+    "FuzzySearchStrategy",
+    "BM25SearchStrategy",
+    "HybridSearchStrategy",
+    "create_search_strategy",
+]

schema_search/search/base.py ADDED Viewed

@@ -0,0 +1,85 @@
+from typing import Dict, List, Optional
+from abc import ABC, abstractmethod
+from schema_search.types import TableSchema, SearchResultItem
+from schema_search.chunkers import Chunk
+from schema_search.graph_builder import GraphBuilder
+from schema_search.rankers.base import BaseRanker
+class BaseSearchStrategy(ABC):
+    """Template for search strategies: an initial ranking plus optional reranking.
+    Subclasses implement ``_initial_ranking``. ``search`` then either returns
+    the first ``limit`` initial results or, when a reranker is set, rescores
+    one chunk per initial result and returns the reranked items.
+    """
+    def __init__(
+        self, reranker: Optional[BaseRanker], initial_top_k: int, rerank_top_k: int
+    ):
+        self.reranker = reranker
+        self.initial_top_k = initial_top_k
+        self.rerank_top_k = rerank_top_k
+    def search(
+        self,
+        query: str,
+        schemas: Dict[str, TableSchema],
+        chunks: List[Chunk],
+        graph_builder: GraphBuilder,
+        hops: int,
+        limit: int,
+    ) -> List[SearchResultItem]:
+        """Run the strategy and return at most ``limit`` result items.
+        Without a reranker the initial ranking is returned as is. With one,
+        the first chunk of each initially ranked table is rescored, the top
+        ``rerank_top_k`` are rebuilt into result items, and the list is
+        truncated to ``limit``.
+        """
+        initial_results = self._initial_ranking(
+            query, schemas, chunks, graph_builder, hops
+        )
+        if self.reranker is None:
+            return initial_results[:limit]
+        # Index the first chunk of every table once, instead of rescanning
+        # ``chunks`` for each initial result.
+        first_chunk_by_table: Dict[str, Chunk] = {}
+        for chunk in chunks:
+            first_chunk_by_table.setdefault(chunk.table_name, chunk)
+        initial_chunks = [
+            first_chunk_by_table[result["table"]]
+            for result in initial_results
+            if result["table"] in first_chunk_by_table
+        ]
+        # Nothing to rescore: do not hand the reranker an empty batch.
+        if not initial_chunks:
+            return []
+        self.reranker.build(initial_chunks)
+        ranked = self.reranker.rank(query)
+        reranked_results: List[SearchResultItem] = []
+        for chunk_idx, score in ranked[: self.rerank_top_k]:
+            chunk = initial_chunks[chunk_idx]
+            reranked_results.append(
+                self._build_result_item(
+                    table_name=chunk.table_name,
+                    score=score,
+                    schema=schemas[chunk.table_name],
+                    matched_chunks=[chunk.content],
+                    graph_builder=graph_builder,
+                    hops=hops,
+                )
+            )
+        return reranked_results[:limit]
+    @abstractmethod
+    def _initial_ranking(
+        self,
+        query: str,
+        schemas: Dict[str, TableSchema],
+        chunks: List[Chunk],
+        graph_builder: GraphBuilder,
+        hops: int,
+    ) -> List[SearchResultItem]:
+        """Return the strategy-specific first-pass results."""
+        pass
+    def _build_result_item(
+        self,
+        table_name: str,
+        score: float,
+        schema: TableSchema,
+        matched_chunks: List[str],
+        graph_builder: GraphBuilder,
+        hops: int,
+    ) -> SearchResultItem:
+        """Assemble one result dict, adding the table's neighbours within ``hops``."""
+        return {
+            "table": table_name,
+            "score": score,
+            "schema": schema,
+            "matched_chunks": matched_chunks,
+            "related_tables": list(graph_builder.get_neighbors(table_name, hops)),
+        }

schema_search/search/bm25.py ADDED Viewed

@@ -0,0 +1,48 @@
+from typing import Dict, List, Optional, TYPE_CHECKING
+from schema_search.search.base import BaseSearchStrategy
+from schema_search.types import TableSchema, SearchResultItem
+from schema_search.chunkers import Chunk
+from schema_search.graph_builder import GraphBuilder
+from schema_search.rankers.base import BaseRanker
+if TYPE_CHECKING:
+    from schema_search.embedding_cache.bm25 import BM25Cache
+class BM25SearchStrategy(BaseSearchStrategy):
+    """Search strategy whose first-pass ranking comes from BM25 chunk scores."""
+    def __init__(
+        self,
+        bm25_cache: "BM25Cache",
+        initial_top_k: int,
+        rerank_top_k: int,
+        reranker: Optional[BaseRanker],
+    ):
+        super().__init__(reranker, initial_top_k, rerank_top_k)
+        self.bm25_cache = bm25_cache
+    def _initial_ranking(
+        self,
+        query: str,
+        schemas: Dict[str, TableSchema],
+        chunks: List[Chunk],
+        graph_builder: GraphBuilder,
+        hops: int,
+    ) -> List[SearchResultItem]:
+        """Return one result item per chunk for the ``initial_top_k`` best scores."""
+        chunk_scores = self.bm25_cache.get_scores(query)
+        # Highest score first, cut to the configured candidate count.
+        descending = chunk_scores.argsort()[::-1]
+        best_positions = descending[: self.initial_top_k]
+        def to_item(position) -> SearchResultItem:
+            matched = chunks[position]
+            return self._build_result_item(
+                table_name=matched.table_name,
+                score=float(chunk_scores[position]),
+                schema=schemas[matched.table_name],
+                matched_chunks=[matched.content],
+                graph_builder=graph_builder,
+                hops=hops,
+            )
+        return [to_item(position) for position in best_positions]