schema-search 0.1.2 (schema_search-0.1.2-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of schema-search has been flagged as a potentially problematic release.

Files changed (38)
  1. schema_search/__init__.py +26 -0
  2. schema_search/chunkers/__init__.py +6 -0
  3. schema_search/chunkers/base.py +95 -0
  4. schema_search/chunkers/factory.py +31 -0
  5. schema_search/chunkers/llm.py +51 -0
  6. schema_search/chunkers/markdown.py +25 -0
  7. schema_search/embedding_cache/__init__.py +5 -0
  8. schema_search/embedding_cache/base.py +40 -0
  9. schema_search/embedding_cache/bm25.py +63 -0
  10. schema_search/embedding_cache/factory.py +20 -0
  11. schema_search/embedding_cache/inmemory.py +112 -0
  12. schema_search/graph_builder.py +69 -0
  13. schema_search/mcp_server.py +82 -0
  14. schema_search/metrics.py +33 -0
  15. schema_search/rankers/__init__.py +5 -0
  16. schema_search/rankers/base.py +45 -0
  17. schema_search/rankers/cross_encoder.py +34 -0
  18. schema_search/rankers/factory.py +11 -0
  19. schema_search/schema_extractor.py +135 -0
  20. schema_search/schema_search.py +263 -0
  21. schema_search/search/__init__.py +15 -0
  22. schema_search/search/base.py +85 -0
  23. schema_search/search/bm25.py +48 -0
  24. schema_search/search/factory.py +61 -0
  25. schema_search/search/fuzzy.py +56 -0
  26. schema_search/search/hybrid.py +82 -0
  27. schema_search/search/semantic.py +49 -0
  28. schema_search/types.py +57 -0
  29. schema_search-0.1.2.dist-info/METADATA +275 -0
  30. schema_search-0.1.2.dist-info/RECORD +38 -0
  31. schema_search-0.1.2.dist-info/WHEEL +5 -0
  32. schema_search-0.1.2.dist-info/entry_points.txt +2 -0
  33. schema_search-0.1.2.dist-info/licenses/LICENSE +21 -0
  34. schema_search-0.1.2.dist-info/top_level.txt +2 -0
  35. tests/__init__.py +0 -0
  36. tests/test_integration.py +352 -0
  37. tests/test_llm_sql_generation.py +320 -0
  38. tests/test_spider_eval.py +484 -0
--- /dev/null
+++ b/schema_search/__init__.py
@@ -0,0 +1,26 @@
+from schema_search.schema_search import SchemaSearch
+from schema_search.types import (
+    IndexResult,
+    SearchResult,
+    SearchResultItem,
+    SearchType,
+    TableSchema,
+    ColumnInfo,
+    ForeignKeyInfo,
+    IndexInfo,
+    ConstraintInfo,
+)
+
+__version__ = "0.1.0"
+__all__ = [
+    "SchemaSearch",
+    "IndexResult",
+    "SearchResult",
+    "SearchResultItem",
+    "SearchType",
+    "TableSchema",
+    "ColumnInfo",
+    "ForeignKeyInfo",
+    "IndexInfo",
+    "ConstraintInfo",
+]
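
For orientation, a minimal usage sketch of the public API exported above (not part of the diff). It mirrors how mcp_server.py drives SchemaSearch later in this release; the SQLite URL and query are illustrative, and it assumes the optional config/LLM arguments default to None:

from sqlalchemy import create_engine
from schema_search import SchemaSearch

engine = create_engine("sqlite:///example.db")  # illustrative database
search = SchemaSearch(engine)                   # config/LLM kwargs assumed optional
search.index()                                  # extract schemas, chunk, embed, build FK graph
result = search.search("tables related to payments", hops=1, limit=5)
print(result["results"], result["latency_sec"])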
--- /dev/null
+++ b/schema_search/chunkers/__init__.py
@@ -0,0 +1,6 @@
+from schema_search.chunkers.base import Chunk, BaseChunker
+from schema_search.chunkers.markdown import MarkdownChunker
+from schema_search.chunkers.llm import LLMChunker
+from schema_search.chunkers.factory import create_chunker
+
+__all__ = ["Chunk", "BaseChunker", "MarkdownChunker", "LLMChunker", "create_chunker"]
--- /dev/null
+++ b/schema_search/chunkers/base.py
@@ -0,0 +1,95 @@
+from typing import Dict, List
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
+
+from tqdm import tqdm
+
+from schema_search.types import TableSchema
+
+
+@dataclass
+class Chunk:
+    table_name: str
+    content: str
+    chunk_id: int
+    token_count: int
+
+
+class BaseChunker(ABC):
+    def __init__(self, max_tokens: int, overlap_tokens: int, show_progress: bool = False):
+        self.max_tokens = max_tokens
+        self.overlap_tokens = overlap_tokens
+        self.show_progress = show_progress
+
+    def chunk_schemas(self, schemas: Dict[str, TableSchema]) -> List[Chunk]:
+        chunks: List[Chunk] = []
+        chunk_id = 0
+
+        iterator = schemas.items()
+        if self.show_progress:
+            iterator = tqdm(iterator, desc="Chunking tables", unit="table")
+
+        for table_name, schema in iterator:
+            table_chunks = self._chunk_table(table_name, schema, chunk_id)
+            chunks.extend(table_chunks)
+            chunk_id += len(table_chunks)
+
+        return chunks
+
+    @abstractmethod
+    def _generate_content(self, table_name: str, schema: TableSchema) -> str:
+        pass
+
+    def _chunk_table(
+        self, table_name: str, schema: TableSchema, start_id: int
+    ) -> List[Chunk]:
+        content = self._generate_content(table_name, schema)
+        lines = content.split("\n")
+
+        header = f"Table: {table_name}"
+        header_tokens = self._estimate_tokens(header)
+
+        chunks: List[Chunk] = []
+        current_chunk_lines = [header]
+        current_tokens = header_tokens
+        chunk_id = start_id
+
+        for line in lines[1:]:
+            line_tokens = self._estimate_tokens(line)
+
+            if (
+                current_tokens + line_tokens > self.max_tokens
+                and len(current_chunk_lines) > 1
+            ):
+                chunk_content = "\n".join(current_chunk_lines)
+                chunks.append(
+                    Chunk(
+                        table_name=table_name,
+                        content=chunk_content,
+                        chunk_id=chunk_id,
+                        token_count=current_tokens,
+                    )
+                )
+                chunk_id += 1
+
+                current_chunk_lines = [header]
+                current_tokens = header_tokens
+
+            current_chunk_lines.append(line)
+            current_tokens += line_tokens
+
+        if len(current_chunk_lines) > 1:
+            chunk_content = "\n".join(current_chunk_lines)
+            chunks.append(
+                Chunk(
+                    table_name=table_name,
+                    content=chunk_content,
+                    chunk_id=chunk_id,
+                    token_count=current_tokens,
+                )
+            )
+
+        return chunks
+
+    def _estimate_tokens(self, text: str) -> int:
+        return len(text.split()) + len(text) // 4
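
The `_estimate_tokens` heuristic above blends a word count with a characters/4 approximation. A quick illustrative check (not from the package):

text = "Columns: id, user_id, amount, created_at"
len(text.split()) + len(text) // 4  # 5 words + 40 chars // 4 = 15 estimated tokens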
--- /dev/null
+++ b/schema_search/chunkers/factory.py
@@ -0,0 +1,31 @@
+from typing import Dict, Optional
+
+from schema_search.chunkers.base import BaseChunker
+from schema_search.chunkers.markdown import MarkdownChunker
+from schema_search.chunkers.llm import LLMChunker
+
+
+def create_chunker(
+    config: Dict, llm_api_key: Optional[str], llm_base_url: Optional[str]
+) -> BaseChunker:
+    chunking_config = config["chunking"]
+    strategy = chunking_config["strategy"]
+    show_progress = config["embedding"].get("show_progress", False)
+
+    if strategy == "llm":
+        return LLMChunker(
+            max_tokens=chunking_config["max_tokens"],
+            overlap_tokens=chunking_config["overlap_tokens"],
+            model=chunking_config["model"],
+            llm_api_key=llm_api_key,
+            llm_base_url=llm_base_url,
+            show_progress=show_progress,
+        )
+    elif strategy == "raw":
+        return MarkdownChunker(
+            max_tokens=chunking_config["max_tokens"],
+            overlap_tokens=chunking_config["overlap_tokens"],
+            show_progress=show_progress,
+        )
+    else:
+        raise ValueError(f"Unknown chunking strategy: {strategy}")
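
The factory only reads a handful of keys. A hypothetical config shape that satisfies it, with the key names taken from the lookups above and the values purely illustrative:

config = {
    "chunking": {"strategy": "raw", "max_tokens": 256, "overlap_tokens": 32},
    "embedding": {"show_progress": False},
}
chunker = create_chunker(config, llm_api_key=None, llm_base_url=None)  # -> MarkdownChunker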
--- /dev/null
+++ b/schema_search/chunkers/llm.py
@@ -0,0 +1,51 @@
+import json
+import logging
+from typing import Optional
+
+from openai import OpenAI
+
+from schema_search.chunkers.base import BaseChunker
+from schema_search.types import TableSchema
+
+logger = logging.getLogger(__name__)
+
+
+class LLMChunker(BaseChunker):
+    def __init__(
+        self,
+        max_tokens: int,
+        overlap_tokens: int,
+        model: str,
+        llm_api_key: Optional[str],
+        llm_base_url: Optional[str],
+        show_progress: bool = False,
+    ):
+        super().__init__(max_tokens, overlap_tokens, show_progress)
+        self.model = model
+        self.llm_client = OpenAI(api_key=llm_api_key, base_url=llm_base_url)
+        logger.info(f"Schema Summarizer Model: {self.model}")
+
+    def _generate_content(self, table_name: str, schema: TableSchema) -> str:
+        prompt = f"""Generate a concise 250 tokens or less semantic summary of this database table schema. Focus on:
+1. What entity or concept this table represents
+2. Key data it stores (main columns)
+3. How it relates to other tables
+4. Any important constraints or indices
+
+Keep it brief and semantic, optimized for embedding-based search.
+
+Schema:
+{json.dumps(schema, indent=2)}
+
+Return ONLY the summary text, no preamble."""
+
+        response = self.llm_client.chat.completions.create(
+            model=self.model,
+            max_tokens=500,
+            messages=[{"role": "user", "content": prompt}],
+        )
+
+        summary = response.choices[0].message.content.strip()  # type: ignore
+        logger.debug(f"Generated LLM summary for {table_name}: {summary[:100]}...")
+
+        return f"Table: {table_name}\n{summary}"
--- /dev/null
+++ b/schema_search/chunkers/markdown.py
@@ -0,0 +1,25 @@
+from schema_search.chunkers.base import BaseChunker
+from schema_search.types import TableSchema
+
+
+class MarkdownChunker(BaseChunker):
+    def _generate_content(self, table_name: str, schema: TableSchema) -> str:
+        lines = [f"Table: {table_name}"]
+
+        if schema["primary_keys"]:
+            lines.append(f"Primary keys: {', '.join(schema['primary_keys'])}")
+
+        if schema["columns"]:
+            col_names = [col["name"] for col in schema["columns"]]
+            lines.append(f"Columns: {', '.join(col_names)}")
+
+        if schema["foreign_keys"]:
+            related = [fk["referred_table"] for fk in schema["foreign_keys"]]
+            lines.append(f"Related to: {', '.join(related)}")
+
+        if schema["indices"]:
+            idx_names = [idx["name"] for idx in schema["indices"] if idx["name"]]
+            if idx_names:
+                lines.append(f"Indexes: {', '.join(idx_names)}")
+
+        return "\n".join(lines)
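
As a sketch of the output (not part of the diff), a toy schema dict carrying only the keys `_generate_content` actually reads renders as follows; the table and column names are invented:

toy_schema = {
    "primary_keys": ["id"],
    "columns": [{"name": "id"}, {"name": "user_id"}, {"name": "amount"}],
    "foreign_keys": [{"referred_table": "users"}],
    "indices": [{"name": "ix_payments_user_id"}],
}
print(MarkdownChunker(max_tokens=256, overlap_tokens=0)._generate_content("payments", toy_schema))
# Table: payments
# Primary keys: id
# Columns: id, user_id, amount
# Related to: users
# Indexes: ix_payments_user_id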
--- /dev/null
+++ b/schema_search/embedding_cache/__init__.py
@@ -0,0 +1,5 @@
+from schema_search.embedding_cache.base import BaseEmbeddingCache
+from schema_search.embedding_cache.inmemory import InMemoryEmbeddingCache
+from schema_search.embedding_cache.factory import create_embedding_cache
+
+__all__ = ["BaseEmbeddingCache", "InMemoryEmbeddingCache", "create_embedding_cache"]
--- /dev/null
+++ b/schema_search/embedding_cache/base.py
@@ -0,0 +1,40 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Dict, List
+
+import numpy as np
+
+from schema_search.chunkers import Chunk
+
+
+class BaseEmbeddingCache(ABC):
+    def __init__(
+        self,
+        cache_dir: Path,
+        model_name: str,
+        metric: str,
+        batch_size: int,
+        show_progress: bool,
+    ):
+        self.cache_dir = cache_dir
+        self.cache_dir.mkdir(exist_ok=True)
+        self.model_name = model_name
+        self.model = None
+        self.metric = metric
+        self.batch_size = batch_size
+        self.show_progress = show_progress
+        self.embeddings = None
+
+    @abstractmethod
+    def load_or_generate(
+        self, chunks: List[Chunk], force: bool, chunking_config: Dict
+    ) -> None:
+        pass
+
+    @abstractmethod
+    def encode_query(self, query: str) -> np.ndarray:
+        pass
+
+    @abstractmethod
+    def compute_similarities(self, query_embedding: np.ndarray) -> np.ndarray:
+        pass
--- /dev/null
+++ b/schema_search/embedding_cache/bm25.py
@@ -0,0 +1,63 @@
+from typing import List
+import re
+import logging
+import numpy as np
+
+import bm25s
+
+from schema_search.chunkers import Chunk
+
+logging.getLogger("bm25s").setLevel(logging.WARNING)
+
+
+def light_stem(token: str) -> str:
+    """Tiny rule-based stemmer for schema tokens."""
+    for suf in ("ing", "ers", "ies", "ied", "ed", "es", "s"):
+        if token.endswith(suf) and len(token) > len(suf) + 2:
+            if suf == "ies":
+                return token[:-3] + "y"
+            return token[: -len(suf)]
+    return token
+
+
+def _tokenize(text: str) -> List[str]:
+    """Tokenize and normalize database-like text."""
+    text = text.lower()
+    text = text.replace("\n", " ")
+    text = re.sub(r"[_\-]+", " ", text)
+    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
+    text = re.sub(r"([a-z])([0-9])", r"\1 \2", text)
+    text = re.sub(r"([0-9])([a-z])", r"\1 \2", text)
+
+    tokens = re.findall(r"[a-z0-9]+", text)
+    normalized = []
+    for t in tokens:
+        if t in {"pk", "pkey", "key"}:
+            t = "id"
+        elif t in {"ts", "time", "timestamp"}:
+            t = "timestamp"
+        elif t.endswith("id") and len(t) > 2:
+            t = "id"
+        elif t in {"ix", "index", "idx"}:
+            t = "index"
+        normalized.append(light_stem(t))
+    return normalized
+
+
+class BM25Cache:
+    def __init__(self):
+        self.bm25 = None
+        self.tokenized_docs = None
+
+    def build(self, chunks: List[Chunk]) -> None:
+        if self.bm25 is None:
+            self.tokenized_docs = [_tokenize(chunk.content) for chunk in chunks]
+            self.bm25 = bm25s.BM25()
+            self.bm25.index(self.tokenized_docs)
+
+    def get_scores(self, query: str) -> np.ndarray:
+        if self.bm25 is None or self.tokenized_docs is None:
+            raise RuntimeError("BM25 cache not built. Call build() first.")
+        query_tokens = _tokenize(query)
+        scores = self.bm25.get_scores(query_tokens)
+        return scores
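
Illustrative traces of `_tokenize` (not from the package): `pkey` normalizes to `id`, `ts` to `timestamp`, and `light_stem` folds plural/verb suffixes. Note that `_tokenize` lowercases before the camelCase regex runs, so that split is effectively a no-op on camelCase input.

_tokenize("order_pkey")      # -> ["order", "id"]
_tokenize("categories")      # -> ["category"]
_tokenize("customer_id ts")  # -> ["customer", "id", "timestamp"]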
--- /dev/null
+++ b/schema_search/embedding_cache/factory.py
@@ -0,0 +1,20 @@
+from pathlib import Path
+from typing import Dict
+
+from schema_search.embedding_cache.base import BaseEmbeddingCache
+from schema_search.embedding_cache.inmemory import InMemoryEmbeddingCache
+
+
+def create_embedding_cache(config: Dict, cache_dir: Path) -> BaseEmbeddingCache:
+    location = config["embedding"]["location"]
+
+    if location == "memory":
+        return InMemoryEmbeddingCache(
+            cache_dir=cache_dir,
+            model_name=config["embedding"]["model"],
+            metric=config["embedding"]["metric"],
+            batch_size=config["embedding"]["batch_size"],
+            show_progress=config["embedding"]["show_progress"],
+        )
+    else:
+        raise ValueError(f"Unsupported embedding location: {location}")
--- /dev/null
+++ b/schema_search/embedding_cache/inmemory.py
@@ -0,0 +1,112 @@
+import json
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+from schema_search.chunkers import Chunk
+from schema_search.embedding_cache.base import BaseEmbeddingCache
+from schema_search.metrics import get_metric
+
+logger = logging.getLogger(__name__)
+
+
+class InMemoryEmbeddingCache(BaseEmbeddingCache):
+    def __init__(
+        self,
+        cache_dir: Path,
+        model_name: str,
+        metric: str,
+        batch_size: int,
+        show_progress: bool,
+    ):
+        super().__init__(cache_dir, model_name, metric, batch_size, show_progress)
+        self.model: SentenceTransformer
+
+    def load_or_generate(
+        self, chunks: List[Chunk], force: bool, chunking_config: Dict
+    ) -> None:
+        cache_file = self.cache_dir / "embeddings.npz"
+        config_file = self.cache_dir / "cache_config.json"
+
+        if not force and self._is_cache_valid(cache_file, config_file, chunking_config):
+            self._load_from_cache(cache_file)
+        else:
+            self._generate_and_cache(chunks, cache_file, config_file, chunking_config)
+
+    def _load_from_cache(self, cache_file: Path) -> None:
+        logger.info("Loading embeddings from cache")
+        self.embeddings = np.load(cache_file)["embeddings"]
+
+    def _is_cache_valid(
+        self, cache_file: Path, config_file: Path, chunking_config: Dict
+    ) -> bool:
+        if not (cache_file.exists() and config_file.exists()):
+            return False
+
+        with open(config_file) as f:
+            cached_config = json.load(f)
+
+        current_config = {
+            "strategy": chunking_config["strategy"],
+            "max_tokens": chunking_config["max_tokens"],
+            "embedding_model": self.model_name,
+        }
+
+        if cached_config != current_config:
+            logger.info("Cache invalidated: chunking config changed")
+            return False
+
+        return True
+
+    def _generate_and_cache(
+        self,
+        chunks: List[Chunk],
+        cache_file: Path,
+        config_file: Path,
+        chunking_config: Dict,
+    ) -> None:
+        self._load_model()
+
+        logger.info(f"Generating embeddings for {len(chunks)} chunks")
+        texts = [chunk.content for chunk in chunks]
+
+        self.embeddings = self.model.encode(
+            texts,
+            batch_size=self.batch_size,
+            normalize_embeddings=True,
+            show_progress_bar=self.show_progress,
+        )
+
+        np.savez_compressed(cache_file, embeddings=self.embeddings)
+
+        cache_config = {
+            "strategy": chunking_config["strategy"],
+            "max_tokens": chunking_config["max_tokens"],
+            "embedding_model": self.model_name,
+        }
+        with open(config_file, "w") as f:
+            json.dump(cache_config, f, indent=2)
+
+    def _load_model(self) -> None:
+        if self.model is None:
+            logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
+            self.model = SentenceTransformer(self.model_name)
+            logger.info(f"Loaded embedding model: {self.model_name}")
+
+    def encode_query(self, query: str) -> np.ndarray:
+        self._load_model()
+
+        query_emb = self.model.encode(
+            [query],
+            batch_size=self.batch_size,
+            normalize_embeddings=True,
+        )
+
+        return query_emb
+
+    def compute_similarities(self, query_embedding: np.ndarray) -> np.ndarray:
+        metric_fn = get_metric(self.metric)
+        return metric_fn(self.embeddings, query_embedding).flatten()
--- /dev/null
+++ b/schema_search/graph_builder.py
@@ -0,0 +1,69 @@
+import logging
+import pickle
+from pathlib import Path
+from typing import Dict, Set
+
+import networkx as nx
+
+from schema_search.types import TableSchema
+
+logger = logging.getLogger(__name__)
+
+
+class GraphBuilder:
+    def __init__(self, cache_dir: Path):
+        self.cache_dir = cache_dir
+        self.cache_dir.mkdir(exist_ok=True)
+        self.graph: nx.DiGraph
+
+    def build(self, schemas: Dict[str, TableSchema], force: bool) -> None:
+        cache_file = self.cache_dir / "graph.pkl"
+
+        if not force and cache_file.exists():
+            self._load_from_cache(cache_file)
+        else:
+            self._build_and_cache(schemas, cache_file)
+
+    def _load_from_cache(self, cache_file: Path) -> None:
+        logger.debug(f"Loading graph from cache: {cache_file}")
+        with open(cache_file, "rb") as f:
+            self.graph = pickle.load(f)
+
+    def _build_and_cache(
+        self, schemas: Dict[str, TableSchema], cache_file: Path
+    ) -> None:
+        logger.info("Building foreign key relationship graph")
+        self.graph = nx.DiGraph()
+
+        for table_name, schema in schemas.items():
+            self.graph.add_node(table_name, **schema)
+
+        for table_name, schema in schemas.items():
+            if schema["foreign_keys"]:
+                for fk in schema["foreign_keys"]:
+                    referred_table = fk["referred_table"]
+                    if referred_table in self.graph:
+                        self.graph.add_edge(table_name, referred_table, **fk)
+
+        with open(cache_file, "wb") as f:
+            pickle.dump(self.graph, f)
+
+    def get_neighbors(self, table_name: str, hops: int) -> Set[str]:
+        if table_name not in self.graph:
+            return set()
+
+        neighbors: Set[str] = set()
+
+        forward = nx.single_source_shortest_path_length(
+            self.graph, table_name, cutoff=hops
+        )
+        neighbors.update(forward.keys())
+
+        backward = nx.single_source_shortest_path_length(
+            self.graph.reverse(), table_name, cutoff=hops
+        )
+        neighbors.update(backward.keys())
+
+        neighbors.discard(table_name)
+
+        return neighbors
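
A hypothetical sketch of the bidirectional expansion (not part of the diff). Table names and the cache path are illustrative; only the `foreign_keys` key is read during build:

from pathlib import Path

schemas = {
    "accounts": {"foreign_keys": []},
    "users": {"foreign_keys": [{"referred_table": "accounts"}]},
    "orders": {"foreign_keys": [{"referred_table": "users"}]},
}
builder = GraphBuilder(Path("/tmp/schema_graph_cache"))
builder.build(schemas, force=True)
builder.get_neighbors("users", hops=1)  # -> {"accounts", "orders"}: FK edges in both directions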
--- /dev/null
+++ b/schema_search/mcp_server.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+import logging
+from typing import Optional
+
+from fastmcp import FastMCP
+from sqlalchemy import create_engine
+
+from schema_search import SchemaSearch
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+mcp = FastMCP("schema-search")
+
+
+@mcp.tool()
+def schema_search(
+    query: str,
+    hops: Optional[int] = None,
+    limit: int = 5,
+) -> dict:
+    """Search database schema using natural language.
+
+    Finds relevant database tables and their relationships by searching through schema metadata
+    using semantic similarity. Expands results by traversing foreign key relationships.
+
+    Args:
+        query: Natural language question about database schema (e.g., 'where are user refunds stored?', 'tables related to payments')
+        hops: Number of foreign key relationship hops for graph expansion. Use 0 for exact matches only, 1-2 to include related tables. If not specified, uses value from config.yml (default: 1)
+        limit: Maximum number of table schemas to return in results. Default: 5
+
+    Returns:
+        Dictionary with 'results' (list of table schemas with columns, types, constraints, and relationships) and 'latency_sec' (query execution time)
+    """
+    search_result = mcp.search_engine.search(query, hops=hops, limit=limit)  # type: ignore
+    return {
+        "results": search_result["results"],
+        "latency_sec": search_result["latency_sec"],
+    }
+
+
+def run_server(
+    database_url: str,
+    config_path: Optional[str] = None,
+    llm_api_key: Optional[str] = None,
+    llm_base_url: Optional[str] = None,
+):
+    engine = create_engine(database_url)
+
+    mcp.search_engine = SchemaSearch(  # type: ignore
+        engine,
+        config_path=config_path,
+        llm_api_key=llm_api_key,
+        llm_base_url=llm_base_url,
+    )
+
+    logger.info("Indexing database schema...")
+    mcp.search_engine.index()  # type: ignore
+    logger.info("Index ready")
+
+    mcp.run()
+
+
+def main():
+    import sys
+
+    if len(sys.argv) < 2:
+        print(
+            "Usage: schema-search-mcp <database_url> [config_path] [llm_api_key] [llm_base_url]"
+        )
+        sys.exit(1)
+
+    database_url = sys.argv[1]
+    config_path = sys.argv[2] if len(sys.argv) > 2 else None
+    llm_api_key = sys.argv[3] if len(sys.argv) > 3 else None
+    llm_base_url = sys.argv[4] if len(sys.argv) > 4 else None
+
+    run_server(database_url, config_path, llm_api_key, llm_base_url)
+
+
+if __name__ == "__main__":
+    main()
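
Judging from the usage string above, the release exposes a `schema-search-mcp` console script; equivalently the server can be started in-process. A hedged sketch (the Postgres URL is illustrative):

# Console script:
#   schema-search-mcp postgresql://user:pass@localhost:5432/shop
# Or in-process:
from schema_search.mcp_server import run_server

run_server("postgresql://user:pass@localhost:5432/shop")  # indexes the schema, then serves MCP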
--- /dev/null
+++ b/schema_search/metrics.py
@@ -0,0 +1,33 @@
+import numpy as np
+
+
+def cosine_similarity(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+    a_norm = a / (np.linalg.norm(a, axis=-1, keepdims=True) + 1e-8)
+    b_norm = b / (np.linalg.norm(b, axis=-1, keepdims=True) + 1e-8)
+    return a_norm @ b_norm.T
+
+
+def dot_product(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+    return a @ b.T
+
+
+def euclidean_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+    return -np.linalg.norm(a[:, None] - b[None, :], axis=-1)
+
+
+def manhattan_distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+    return -np.sum(np.abs(a[:, None] - b[None, :]), axis=-1)
+
+
+METRICS = {
+    "cosine": cosine_similarity,
+    "dot": dot_product,
+    "euclidean": euclidean_distance,
+    "manhattan": manhattan_distance,
+}
+
+
+def get_metric(name: str):
+    if name not in METRICS:
+        raise ValueError(f"Unknown metric: {name}. Available: {list(METRICS.keys())}")
+    return METRICS[name]
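
All four metrics return higher-is-better scores, since the two distances are negated above. A quick illustrative check (not from the package):

import numpy as np
from schema_search.metrics import get_metric

docs = np.array([[1.0, 0.0], [0.0, 1.0]])  # two unit document embeddings
query = np.array([[1.0, 0.0]])             # one query embedding
get_metric("cosine")(docs, query)          # ~array([[1.0], [0.0]])
get_metric("euclidean")(docs, query)       # -> array([[-0.0], [-1.414...]])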
--- /dev/null
+++ b/schema_search/rankers/__init__.py
@@ -0,0 +1,5 @@
+from schema_search.rankers.base import BaseRanker
+from schema_search.rankers.cross_encoder import CrossEncoderRanker
+from schema_search.rankers.factory import create_ranker
+
+__all__ = ["BaseRanker", "CrossEncoderRanker", "create_ranker"]