schema-search 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of schema-search might be problematic.

Files changed (38)
  1. schema_search/__init__.py +26 -0
  2. schema_search/chunkers/__init__.py +6 -0
  3. schema_search/chunkers/base.py +95 -0
  4. schema_search/chunkers/factory.py +31 -0
  5. schema_search/chunkers/llm.py +51 -0
  6. schema_search/chunkers/markdown.py +25 -0
  7. schema_search/embedding_cache/__init__.py +5 -0
  8. schema_search/embedding_cache/base.py +40 -0
  9. schema_search/embedding_cache/bm25.py +63 -0
  10. schema_search/embedding_cache/factory.py +20 -0
  11. schema_search/embedding_cache/inmemory.py +112 -0
  12. schema_search/graph_builder.py +69 -0
  13. schema_search/mcp_server.py +82 -0
  14. schema_search/metrics.py +33 -0
  15. schema_search/rankers/__init__.py +5 -0
  16. schema_search/rankers/base.py +45 -0
  17. schema_search/rankers/cross_encoder.py +34 -0
  18. schema_search/rankers/factory.py +11 -0
  19. schema_search/schema_extractor.py +135 -0
  20. schema_search/schema_search.py +263 -0
  21. schema_search/search/__init__.py +15 -0
  22. schema_search/search/base.py +85 -0
  23. schema_search/search/bm25.py +48 -0
  24. schema_search/search/factory.py +61 -0
  25. schema_search/search/fuzzy.py +56 -0
  26. schema_search/search/hybrid.py +82 -0
  27. schema_search/search/semantic.py +49 -0
  28. schema_search/types.py +57 -0
  29. schema_search-0.1.2.dist-info/METADATA +275 -0
  30. schema_search-0.1.2.dist-info/RECORD +38 -0
  31. schema_search-0.1.2.dist-info/WHEEL +5 -0
  32. schema_search-0.1.2.dist-info/entry_points.txt +2 -0
  33. schema_search-0.1.2.dist-info/licenses/LICENSE +21 -0
  34. schema_search-0.1.2.dist-info/top_level.txt +2 -0
  35. tests/__init__.py +0 -0
  36. tests/test_integration.py +352 -0
  37. tests/test_llm_sql_generation.py +320 -0
  38. tests/test_spider_eval.py +484 -0
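Before the diffs themselves, here is a minimal usage sketch assembled only from the calls the test suite below makes (SchemaSearch, index, search, and the result fields the tests assert on). The connection string, API key, and search parameters are illustrative placeholders, not values shipped with the package.

# Sketch of the API as exercised by tests/test_integration.py below.
# DATABASE_URL and llm_* values are placeholders.
from sqlalchemy import create_engine

from schema_search import SchemaSearch

engine = create_engine("postgresql://user:pass@localhost:5432/mydb")  # placeholder URL
search = SchemaSearch(
    engine,
    llm_api_key="sk-...",                        # placeholder key
    llm_base_url="https://api.anthropic.com/v1/",
)

search.index(force=True)                         # builds schemas, chunks, and caches
response = search.search(
    "which table has user email address?",
    search_type="hybrid",                        # tests also use "bm25", "fuzzy", "semantic"
    hops=1,
    limit=5,
)

for result in response["results"]:
    # Each result carries table, score, schema, matched_chunks, related_tables.
    print(result["table"], result["score"], result["related_tables"])
print("latency:", response["latency_sec"])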
tests/test_integration.py
@@ -0,0 +1,352 @@
+ import os
+ from pathlib import Path
+ import gc
+ from typing import cast
+
+ import pytest
+ from dotenv import load_dotenv
+ from sqlalchemy import create_engine
+ import psutil
+
+ from schema_search import SchemaSearch
+ from schema_search.types import SearchType
+
+
+ @pytest.fixture(scope="module")
+ def database_url():
+     env_path = Path(__file__).parent / ".env"
+     load_dotenv(env_path)
+
+     url = os.getenv("DATABASE_URL")
+     if not url:
+         pytest.skip("DATABASE_URL not set in tests/.env file")
+
+     return url
+
+
+ @pytest.fixture(scope="module")
+ def llm_config():
+     env_path = Path(__file__).parent / ".env"
+     load_dotenv(env_path)
+
+     api_key = os.getenv("LLM_API_KEY")
+     base_url = "https://api.anthropic.com/v1/"
+
+     if not api_key:
+         pytest.skip("LLM_API_KEY not set in tests/.env file")
+
+     return {"api_key": api_key, "base_url": base_url}
+
+
+ @pytest.fixture(scope="module")
+ def search_engine(database_url, llm_config):
+     engine = create_engine(database_url)
+     search = SchemaSearch(
+         engine,
+         llm_api_key=llm_config["api_key"],
+         llm_base_url=llm_config["base_url"],
+     )
+     return search
+
+
+ def test_index_creation(search_engine):
+     """Test that the index can be built successfully."""
+     stats = search_engine.index(force=True)
+
+     assert len(search_engine.schemas) > 0, "No tables found in database"
+     assert len(search_engine.chunks) > 0, "No chunks generated"
+
+     print(f"\nIndexing: {stats}")
+
+
+ def test_search_user_information(search_engine):
+     """Test searching for user-related information in the schema."""
+     search_engine.index(force=False)
+
+     query = "which table has user email address?"
+     response = search_engine.search(query)
+
+     results = response["results"]
+
+     for result in results:
+         print(f"Result: {result['table']} (score: {result['score']:.3f})")
+         # print(f"Related tables: {result['related_tables']}")
+         # print("-" * 100)
+
+     assert len(results) > 0, "No search results returned"
+
+     top_result = results[0]
+     assert "table" in top_result, "Result missing 'table' field"
+     assert "score" in top_result, "Result missing 'score' field"
+     assert "schema" in top_result, "Result missing 'schema' field"
+     assert "matched_chunks" in top_result, "Result missing 'matched_chunks' field"
+     assert "related_tables" in top_result, "Result missing 'related_tables' field"
+
+     assert top_result["score"] > 0, "Top result has invalid score"
+
+     print(f"\nTop result: {top_result['table']} (score: {top_result['score']:.3f})")
+     print(f"Related tables: {top_result['related_tables']}")
+     print(f"Search latency: {response['latency_sec']}s")
+
+
+ def _calculate_score(results, correct_table):
+     """Calculate score based on position. Top=5, 2nd=4, 3rd=3, 4th=2, 5th=1, not found=0"""
+     for position, result in enumerate(results[:5], 1):
+         if result["table"] == correct_table:
+             return 6 - position
+     return 0
+
+
+ def _get_eval_data():
+     """Return evaluation dataset."""
+     return [
+         {
+             "question": "which table has user email address?",
+             "correct_table": "user_metadata",
+         },
+         {
+             "question": "which table has scrapped project content?",
+             "correct_table": "project_content",
+         },
+         {
+             "question": "where can I find complete list of twitter bot accounts?",
+             "correct_table": "agent_metadata",
+         },
+         {
+             "question": "which table user api keys??",
+             "correct_table": "api_token",
+         },
+         {
+             "question": "which table has user deposits?",
+             "correct_table": "user_deposits",
+         },
+         {
+             "question": "which table has information about infrastructure?",
+             "correct_table": "node_metadata",
+         },
+         {
+             "question": "which table has information about user balances?",
+             "correct_table": "user_balances",
+         },
+         {
+             "question": "which table maps news to topics?",
+             "correct_table": "news_to_topic_map",
+         },
+         {
+             "question": "which table has information about projects?",
+             "correct_table": "project_metadata",
+         },
+         {
+             "question": "which table user query metrics?",
+             "correct_table": "query_metrics",
+         },
+     ]
+
+
+ def test_memory_bm25_isolated(database_url, llm_config):
+     """Measure BM25 in complete isolation."""
+     _run_memory_test_for_strategy(database_url, llm_config, "bm25")
+
+
+ def test_memory_fuzzy_isolated(database_url, llm_config):
+     """Measure Fuzzy in complete isolation."""
+     _run_memory_test_for_strategy(database_url, llm_config, "fuzzy")
+
+
+ def test_memory_semantic_isolated(database_url, llm_config):
+     """Measure Semantic in complete isolation."""
+     _run_memory_test_for_strategy(database_url, llm_config, "semantic")
+
+
+ def test_memory_hybrid_isolated(database_url, llm_config):
+     """Measure Hybrid in complete isolation."""
+     _run_memory_test_for_strategy(database_url, llm_config, "hybrid")
+
+
+ def _run_memory_test_for_strategy(database_url, llm_config, strategy):
+     """Run memory test for a single strategy."""
+     gc.collect()
+
+     engine = create_engine(database_url)
+     search_engine = SchemaSearch(
+         engine,
+         llm_api_key=llm_config["api_key"],
+         llm_base_url=llm_config["base_url"],
+     )
+
+     search_engine.index(force=False)
+
+     process = psutil.Process()
+     after_index_mem = process.memory_info().rss / 1024 / 1024
+     peak_memory = after_index_mem
+
+     eval_data = _get_eval_data()
+     memory_samples = []
+     latency_samples = []
+     total_score = 0
+
+     print(f"\n{'='*50} {strategy.upper()} {'='*50}")
+     print(f"After index: {after_index_mem:.2f} MB")
+     print(f"Embedding cache created: {search_engine._embedding_cache is not None}")
+     print(f"BM25 cache created: {search_engine._bm25_cache is not None}")
+
+     for idx, eval_item in enumerate(eval_data, 1):
+         question = eval_item["question"]
+         correct_table = eval_item["correct_table"]
+
+         before_mem = process.memory_info().rss / 1024 / 1024
+         response = search_engine.search(
+             question, search_type=cast(SearchType, strategy), hops=1
+         )
+         after_mem = process.memory_info().rss / 1024 / 1024
+
+         peak_memory = max(peak_memory, after_mem)
+         memory_samples.append(after_mem)
+         latency_samples.append(response["latency_sec"])
+
+         score = _calculate_score(response["results"], correct_table)
+         total_score += score
+
+         marker = "✓" if score > 0 else "✗"
+         print(
+             f" Q{idx}: {marker} Score: {score} | "
+             f"Latency: {response['latency_sec']:.3f}s | "
+             f"Mem: {after_mem:.1f}MB ({after_mem - before_mem:+.1f})"
+         )
+
+     avg_memory = sum(memory_samples) / len(memory_samples)
+     avg_latency = sum(latency_samples) / len(latency_samples)
+     memory_increase = peak_memory - after_index_mem
+     max_score = len(eval_data) * 5
+
+     print(f"\n{'='*50} SUMMARY {'='*50}")
+     print(f"Score: {total_score}/{max_score}")
+     print(f"Avg Latency: {avg_latency:.3f}s")
+     print(f"Peak Memory: {peak_memory:.2f} MB")
+     print(f"Avg Memory: {avg_memory:.2f} MB")
+     print(f"Memory Increase: +{memory_increase:.2f} MB")
+     if search_engine._embedding_cache:
+         print(
+             f"Embeddings loaded: {search_engine._embedding_cache.embeddings is not None}"
+         )
+     if search_engine._bm25_cache:
+         print(f"BM25 built: {search_engine._bm25_cache.bm25 is not None}")
+     print("=" * 100)
+
+
+ def test_bm25_no_embeddings(database_url, llm_config):
+     """Test that BM25 search does NOT load embedding models or cache."""
+     engine = create_engine(database_url)
+     search = SchemaSearch(
+         engine,
+         llm_api_key=llm_config["api_key"],
+         llm_base_url=llm_config["base_url"],
+     )
+
+     search.index(force=False)
+
+     assert search._embedding_cache is None, "Embedding cache should not be created yet"
+     assert search._reranker is None, "Reranker should not be created yet"
+
+     result = search.search("user email", search_type="bm25", limit=5)
+
+     assert search._embedding_cache is None, "BM25 should not load embedding cache"
+     assert len(result["results"]) > 0, "Should have results"
+
+     print("\n✓ BM25 search verified: no embeddings loaded")
+
+
+ def test_fuzzy_no_embeddings(database_url, llm_config):
+     """Test that fuzzy search does NOT load embedding models or cache."""
+     engine = create_engine(database_url)
+     search = SchemaSearch(
+         engine,
+         llm_api_key=llm_config["api_key"],
+         llm_base_url=llm_config["base_url"],
+     )
+
+     search.index(force=False)
+
+     assert search._embedding_cache is None, "Embedding cache should not be created yet"
+     assert search._reranker is None, "Reranker should not be created yet"
+
+     result = search.search("user email", search_type="fuzzy", limit=5)
+
+     assert search._embedding_cache is None, "Fuzzy should not load embedding cache"
+     assert len(result["results"]) > 0, "Should have results"
+
+     print("\n✓ Fuzzy search verified: no embeddings loaded")
+
+
+ def test_semantic_loads_embeddings(database_url, llm_config):
+     """Test that semantic search DOES load embedding models and cache."""
+     engine = create_engine(database_url)
+     search = SchemaSearch(
+         engine,
+         llm_api_key=llm_config["api_key"],
+         llm_base_url=llm_config["base_url"],
+     )
+
+     search.index(force=False)
+
+     assert search._embedding_cache is None, "Embedding cache should not be created yet"
+
+     result = search.search("user email", search_type="semantic", limit=5)
+
+     assert search._embedding_cache is not None, "Semantic should create embedding cache"
+     assert search.embedding_cache.embeddings is not None, "Embeddings should be loaded"
+     assert len(result["results"]) > 0, "Should have results"
+
+     print("\n✓ Semantic search verified: embeddings loaded correctly")
+
+
+ def test_hybrid_loads_embeddings(database_url, llm_config):
+     """Test that hybrid search DOES load embedding models and cache."""
+     engine = create_engine(database_url)
+     search = SchemaSearch(
+         engine,
+         llm_api_key=llm_config["api_key"],
+         llm_base_url=llm_config["base_url"],
+     )
+
+     search.index(force=False)
+
+     assert search._embedding_cache is None, "Embedding cache should not be created yet"
+
+     result = search.search("user email", search_type="hybrid", limit=5)
+
+     assert search._embedding_cache is not None, "Hybrid should create embedding cache"
+     assert search.embedding_cache.embeddings is not None, "Embeddings should be loaded"
+     assert len(result["results"]) > 0, "Should have results"
+
+     print("\n✓ Hybrid search verified: embeddings loaded correctly")
+
+
+ def test_strategy_caching(database_url, llm_config):
+     """Test that search strategies are cached and reused."""
+     engine = create_engine(database_url)
+     search = SchemaSearch(
+         engine,
+         llm_api_key=llm_config["api_key"],
+         llm_base_url=llm_config["base_url"],
+     )
+
+     search.index(force=False)
+
+     assert len(search._search_strategies) == 0, "No strategies cached initially"
+
+     search.search("test query", search_type="bm25", limit=5)
+     assert "bm25" in search._search_strategies, "BM25 strategy should be cached"
+     assert len(search._search_strategies) == 1, "Only one strategy cached"
+
+     bm25_strategy = search._search_strategies["bm25"]
+     search.search("another query", search_type="bm25", limit=5)
+     assert (
+         search._search_strategies["bm25"] is bm25_strategy
+     ), "Same strategy instance should be reused"
+
+     search.search("test query", search_type="fuzzy", limit=5)
+     assert "fuzzy" in search._search_strategies, "Fuzzy strategy should be cached"
+     assert len(search._search_strategies) == 2, "Two strategies cached now"
+
+     print("\n✓ Strategy caching verified: strategies are reused")
tests/test_llm_sql_generation.py
@@ -0,0 +1,320 @@
+ import os
+ from pathlib import Path
+
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ import anthropic
+ import pytest
+ from dotenv import load_dotenv
+ from sqlalchemy import create_engine
+
+ from schema_search import SchemaSearch
+
+
+ @pytest.fixture(scope="module")
+ def database_url():
+     env_path = Path(__file__).parent / ".env"
+     load_dotenv(env_path)
+
+     url = os.getenv("DATABASE_URL")
+     if not url:
+         pytest.skip("DATABASE_URL not set in tests/.env file")
+
+     return url
+
+
+ @pytest.fixture(scope="module")
+ def llm_config():
+     env_path = Path(__file__).parent / ".env"
+     load_dotenv(env_path)
+
+     api_key = os.getenv("LLM_API_KEY")
+     base_url = os.getenv("LLM_BASE_URL")
+
+     if not api_key:
+         pytest.skip("LLM_API_KEY not set in tests/.env file")
+
+     return {"api_key": api_key, "base_url": base_url}
+
+
+ @pytest.fixture(scope="module")
+ def search_engine(database_url, llm_config):
+     engine = create_engine(database_url)
+     search = SchemaSearch(
+         engine,
+         llm_api_key=llm_config["api_key"],
+         llm_base_url=llm_config["base_url"],
+     )
+     search.index(force=False)
+     return search
+
+
+ def test_table_identification_with_schema_search(search_engine, llm_config):
+     """
+     Compare table identification quality when LLM has:
+     1. Full schema context (all tables and indices)
+     2. Limited context from schema search with graph hops
+
+     For each natural language question, we:
+     - Ask LLM which tables are needed with full schema context (baseline)
+     - Ask LLM which tables are needed with schema search context (our approach)
+     - Compare both against the objective list of required tables
+     """
+
+     eval_data = [
+         {
+             "question": "how many unique users do we have?",
+             "required_tables": ["user_metadata"],
+             "searches": ["user table"],
+             "hops": 1,
+         },
+         {
+             "question": "what is the email of the user who deposited the most last month",
+             "required_tables": ["user_metadata", "user_deposits"],
+             "searches": ["user email deposit"],
+             "hops": 1,
+         },
+         {
+             "question": "what is the twitter handle of the agent that posted the most?",
+             "required_tables": ["agent_metadata", "agent_content"],
+             "searches": ["agent metadata content"],
+             "hops": 1,
+         },
+         {
+             "question": "which topic was covered the most in news articles last month?",
+             "required_tables": ["news_to_topic_map", "topic_metadata"],
+             "searches": ["topic metadata news map"],
+             "hops": 1,
+         },
+         {
+             "question": "which coin's price increased the most last month?",
+             "required_tables": ["historical_market_data"],
+             "searches": ["historical market data"],
+             "hops": 1,
+         },
+         {
+             "question": "find the 5 most recent news about the coin that increased the most last month?",
+             "required_tables": [
+                 "historical_market_data",
+                 "news_to_topic_map",
+                 "topic_metadata",
+                 "news_summary",
+             ],
+             "searches": ["historical market data news topic"],
+             "hops": 1,
+         },
+         {
+             "question": "which model did the top user of last month use?",
+             "required_tables": ["user_metadata", "model_metadata", "query_metrics"],
+             "searches": ["user metadata model query metrics"],
+             "hops": 1,
+         },
+         {
+             "question": "which agent gained the most followers last month?",
+             "required_tables": ["agent_metadata", "twitter_follow_activity"],
+             "searches": ["agent metadata twitter follow activity"],
+             "hops": 1,
+         },
+         {
+             "question": "which agent posted the most content last month?",
+             "required_tables": ["agent_metadata", "agent_content"],
+             "searches": ["agent metadata agent content"],
+             "hops": 1,
+         },
+         {
+             "question": "which api key was most used during last month?",
+             "required_tables": ["api_token", "query_metrics", "user_metadata"],
+             "searches": ["api token query metrics user metadata"],
+             "hops": 1,
+         },
+     ]
+
+     def get_baseline_context(search_engine):
+         """Get minimal context: just table names and indices."""
+         context_parts = []
+
+         for table_name, table_schema in search_engine.schemas.items():
+             context_parts.append(f"Table: {table_name}")
+
+             indices = table_schema.get("indices")
+             if indices:
+                 idx_list = ", ".join([idx["name"] for idx in indices])
+                 context_parts.append(f"Indices: {idx_list}")
+
+         return "\n\n".join(context_parts)
+
+     def get_search_results_context(search_engine, searches, hops):
+         """Get detailed schema from search results to add to baseline."""
+         context_parts = []
+         seen_tables = set()
+
+         for search_query in searches:
+             response = search_engine.search(
+                 search_query, hops=hops, limit=5, search_type="semantic"
+             )
+             for result in response["results"]:
+                 table_name = result["table"]
+                 if table_name in seen_tables:
+                     continue
+                 seen_tables.add(table_name)
+
+                 columns = result["schema"].get("columns")
+                 if columns:
+                     col_list = ", ".join(
+                         [f"{col['name']} ({col['type']})" for col in columns]
+                     )
+                     context_parts.append(f"Table: {table_name}\nColumns: {col_list}")
+         print("Search results tables: ", list(seen_tables))
+
+         return "\n\n".join(context_parts)
+
+     def call_llm_for_tables(question, schema_context, llm_config):
+         """Call LLM to identify which tables are needed."""
+         client = anthropic.Anthropic(api_key=llm_config["api_key"])
+
+         prompt = f"""Given the following database schema:
+
+ {schema_context}
+
+ Which tables are necessary to answer this question: {question}
+
+ Return ONLY a comma-separated list of table names, nothing else. No explanations or additional text.
+ Example format: table1, table2, table3"""
+
+         response = client.messages.create(
+             model="claude-sonnet-4-5-20250929",
+             max_tokens=512,
+             system="You are a database expert. Identify only the tables needed to answer the question.",
+             messages=[
+                 {"role": "user", "content": prompt},
+             ],
+             temperature=0,
+         )
+
+         tables_str = response.content[0].text.strip()  # type: ignore
+         tables = [t.strip() for t in tables_str.split(",") if t.strip()]
+         # Remove schema prefix if present
+         tables = [t.split(".")[-1] for t in tables]
+         return tables
+
+     def compare_tables(identified_tables, required_tables):
+         """Compare identified tables with required tables."""
+         identified_set = set(t.lower() for t in identified_tables)
+         required_set = set(t.lower() for t in required_tables)
+
+         correct = identified_set & required_set
+         missing = required_set - identified_set
+         extra = identified_set - required_set
+
+         is_perfect = len(missing) == 0 and len(extra) == 0
+
+         return {
+             "is_perfect": is_perfect,
+             "correct": correct,
+             "missing": missing,
+             "extra": extra,
+             "precision": len(correct) / len(identified_set) if identified_set else 0,
+             "recall": len(correct) / len(required_set) if required_set else 0,
+         }
+
+     if len(eval_data) == 0:
+         pytest.skip("No evaluation data provided")
+
+     print("\n" + "=" * 100)
+     print("EVALUATION: Table Identification - Baseline vs Baseline + Search Results")
+     print("=" * 100)
+
+     baseline_context = get_baseline_context(search_engine)
+
+     baseline_perfect = 0
+     baseline_total_precision = 0
+     baseline_total_recall = 0
+
+     search_perfect = 0
+     search_total_precision = 0
+     search_total_recall = 0
+
+     for idx, eval_item in enumerate(eval_data, 1):
+         question = eval_item["question"]
+         required_tables = eval_item.get("required_tables", [])
+         searches = eval_item.get("searches", [question])
+         hops = eval_item.get("hops", 1)
+
+         print(f"\n{'='*100}")
+         print(f"Question {idx}: {question}")
+         print(f"Required tables: {required_tables}")
+         print(f"{'='*100}")
+
+         # Get search results and combine with baseline
+         search_results_context = get_search_results_context(
+             search_engine, searches, hops
+         )
+         enhanced_context = baseline_context + "\n\n" + search_results_context
+
+         print(f"\n[Baseline only] Context: {len(baseline_context)} chars")
+         print(f"[Baseline + Search] Context: {len(enhanced_context)} chars")
+         print(f"Additional context from search: {len(search_results_context)} chars")
+
+         # Identify tables with baseline only
+         print("\n--- Identifying tables with BASELINE ONLY ---")
+         tables_baseline = call_llm_for_tables(question, baseline_context, llm_config)
+         print(f"Identified tables: {tables_baseline}")
+
+         comparison_baseline = compare_tables(tables_baseline, required_tables)
+         print(
+             f"Precision: {comparison_baseline['precision']:.2f}, Recall: {comparison_baseline['recall']:.2f}"
+         )
+         if comparison_baseline["missing"]:
+             print(f"Missing: {comparison_baseline['missing']}")
+         if comparison_baseline["extra"]:
+             print(f"Extra: {comparison_baseline['extra']}")
+
+         # Identify tables with baseline + search results
+         print("\n--- Identifying tables with BASELINE + SEARCH ---")
+         tables_search = call_llm_for_tables(question, enhanced_context, llm_config)
+         print(f"Identified tables: {tables_search}")
+
+         comparison_search = compare_tables(tables_search, required_tables)
+         print(
+             f"Precision: {comparison_search['precision']:.2f}, Recall: {comparison_search['recall']:.2f}"
+         )
+         if comparison_search["missing"]:
+             print(f"Missing: {comparison_search['missing']}")
+         if comparison_search["extra"]:
+             print(f"Extra: {comparison_search['extra']}")
+
+         # Track metrics
+         if comparison_baseline["is_perfect"]:
+             baseline_perfect += 1
+             print("\n✓ Baseline: PERFECT")
+         else:
+             print("\n✗ Baseline: Not perfect")
+
+         if comparison_search["is_perfect"]:
+             search_perfect += 1
+             print("✓ Schema Search: PERFECT")
+         else:
+             print("✗ Schema Search: Not perfect")
+
+         baseline_total_precision += comparison_baseline["precision"]
+         baseline_total_recall += comparison_baseline["recall"]
+         search_total_precision += comparison_search["precision"]
+         search_total_recall += comparison_search["recall"]
+
+     print("\n" + "=" * 100)
+     print("FINAL RESULTS")
+     print("=" * 100)
+     total_questions = len(eval_data)
+     print(f"Total questions: {total_questions}")
+     print(f"\nBaseline Only:")
+     print(f" Perfect matches: {baseline_perfect}/{total_questions}")
+     print(f" Avg Precision: {baseline_total_precision/total_questions:.2f}")
+     print(f" Avg Recall: {baseline_total_recall/total_questions:.2f}")
+
+     print(f"\nBaseline + Search Results:")
+     print(f" Perfect matches: {search_perfect}/{total_questions}")
+     print(f" Avg Precision: {search_total_precision/total_questions:.2f}")
+     print(f" Avg Recall: {search_total_recall/total_questions:.2f}")
+
+     print(f"\nImprovement: {search_perfect - baseline_perfect} more perfect matches")
+     print("=" * 100)