schema_search-0.1.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- schema_search/__init__.py +26 -0
- schema_search/chunkers/__init__.py +6 -0
- schema_search/chunkers/base.py +95 -0
- schema_search/chunkers/factory.py +31 -0
- schema_search/chunkers/llm.py +54 -0
- schema_search/chunkers/markdown.py +25 -0
- schema_search/embedding_cache/__init__.py +5 -0
- schema_search/embedding_cache/base.py +40 -0
- schema_search/embedding_cache/bm25.py +63 -0
- schema_search/embedding_cache/factory.py +20 -0
- schema_search/embedding_cache/inmemory.py +122 -0
- schema_search/graph_builder.py +69 -0
- schema_search/mcp_server.py +81 -0
- schema_search/metrics.py +33 -0
- schema_search/rankers/__init__.py +5 -0
- schema_search/rankers/base.py +45 -0
- schema_search/rankers/cross_encoder.py +40 -0
- schema_search/rankers/factory.py +11 -0
- schema_search/schema_extractor.py +135 -0
- schema_search/schema_search.py +276 -0
- schema_search/search/__init__.py +15 -0
- schema_search/search/base.py +85 -0
- schema_search/search/bm25.py +48 -0
- schema_search/search/factory.py +61 -0
- schema_search/search/fuzzy.py +56 -0
- schema_search/search/hybrid.py +82 -0
- schema_search/search/semantic.py +49 -0
- schema_search/types.py +57 -0
- schema_search/utils/__init__.py +0 -0
- schema_search/utils/lazy_import.py +26 -0
- schema_search-0.1.10.dist-info/METADATA +308 -0
- schema_search-0.1.10.dist-info/RECORD +40 -0
- schema_search-0.1.10.dist-info/WHEEL +5 -0
- schema_search-0.1.10.dist-info/entry_points.txt +2 -0
- schema_search-0.1.10.dist-info/licenses/LICENSE +21 -0
- schema_search-0.1.10.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/test_integration.py +352 -0
- tests/test_llm_sql_generation.py +320 -0
- tests/test_spider_eval.py +488 -0
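Taken together, the modules above and the integration tests reproduced later in this diff outline the package's public surface: a `SchemaSearch` object wraps a SQLAlchemy engine, indexes the database schema, and answers natural-language questions with a bm25, fuzzy, semantic, or hybrid strategy. The following is a minimal usage sketch, not part of the published wheel; it is inferred from tests/test_integration.py below, and the connection string and API key are placeholders.

```python
# Usage sketch inferred from tests/test_integration.py in this diff.
# The DSN and API key below are placeholders; the tests read DATABASE_URL
# and LLM_API_KEY from tests/.env instead.
from sqlalchemy import create_engine

from schema_search import SchemaSearch

engine = create_engine("postgresql://user:pass@localhost/mydb")  # placeholder DSN
search = SchemaSearch(
    engine,
    llm_api_key="sk-...",  # placeholder key
    llm_base_url="https://api.anthropic.com/v1/",
)

# Build (or rebuild) the schema index, as test_index_creation does.
search.index(force=True)

# search_type can be "bm25", "fuzzy", "semantic", or "hybrid", as exercised by the tests.
response = search.search(
    "which table has user email address?", search_type="hybrid", limit=5
)
for result in response["results"]:
    print(result["table"], result["score"], result["related_tables"])
print(f"latency: {response['latency_sec']}s")
```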
schema_search-0.1.10.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Adib Hasan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
tests/__init__.py
ADDED
File without changes
tests/test_integration.py
ADDED
@@ -0,0 +1,352 @@
+import os
+from pathlib import Path
+import gc
+from typing import cast
+
+import pytest
+from dotenv import load_dotenv
+from sqlalchemy import create_engine
+import psutil
+
+from schema_search import SchemaSearch
+from schema_search.types import SearchType
+
+
+@pytest.fixture(scope="module")
+def database_url():
+    env_path = Path(__file__).parent / ".env"
+    load_dotenv(env_path)
+
+    url = os.getenv("DATABASE_URL")
+    if not url:
+        pytest.skip("DATABASE_URL not set in tests/.env file")
+
+    return url
+
+
+@pytest.fixture(scope="module")
+def llm_config():
+    env_path = Path(__file__).parent / ".env"
+    load_dotenv(env_path)
+
+    api_key = os.getenv("LLM_API_KEY")
+    base_url = "https://api.anthropic.com/v1/"
+
+    if not api_key:
+        pytest.skip("LLM_API_KEY not set in tests/.env file")
+
+    return {"api_key": api_key, "base_url": base_url}
+
+
+@pytest.fixture(scope="module")
+def search_engine(database_url, llm_config):
+    engine = create_engine(database_url)
+    search = SchemaSearch(
+        engine,
+        llm_api_key=llm_config["api_key"],
+        llm_base_url=llm_config["base_url"],
+    )
+    return search
+
+
+def test_index_creation(search_engine):
+    """Test that the index can be built successfully."""
+    stats = search_engine.index(force=True)
+
+    assert len(search_engine.schemas) > 0, "No tables found in database"
+    assert len(search_engine.chunks) > 0, "No chunks generated"
+
+    print(f"\nIndexing: {stats}")
+
+
+def test_search_user_information(search_engine):
+    """Test searching for user-related information in the schema."""
+    search_engine.index(force=False)
+
+    query = "which table has user email address?"
+    response = search_engine.search(query)
+
+    results = response["results"]
+
+    for result in results:
+        print(f"Result: {result['table']} (score: {result['score']:.3f})")
+        # print(f"Related tables: {result['related_tables']}")
+        # print("-" * 100)
+
+    assert len(results) > 0, "No search results returned"
+
+    top_result = results[0]
+    assert "table" in top_result, "Result missing 'table' field"
+    assert "score" in top_result, "Result missing 'score' field"
+    assert "schema" in top_result, "Result missing 'schema' field"
+    assert "matched_chunks" in top_result, "Result missing 'matched_chunks' field"
+    assert "related_tables" in top_result, "Result missing 'related_tables' field"
+
+    assert top_result["score"] > 0, "Top result has invalid score"
+
+    print(f"\nTop result: {top_result['table']} (score: {top_result['score']:.3f})")
+    print(f"Related tables: {top_result['related_tables']}")
+    print(f"Search latency: {response['latency_sec']}s")
+
+
+def _calculate_score(results, correct_table):
+    """Calculate score based on position. Top=5, 2nd=4, 3rd=3, 4th=2, 5th=1, not found=0"""
+    for position, result in enumerate(results[:5], 1):
+        if result["table"] == correct_table:
+            return 6 - position
+    return 0
+
+
+def _get_eval_data():
+    """Return evaluation dataset."""
+    return [
+        {
+            "question": "which table has user email address?",
+            "correct_table": "user_metadata",
+        },
+        {
+            "question": "which table has scrapped project content?",
+            "correct_table": "project_content",
+        },
+        {
+            "question": "where can I find complete list of twitter bot accounts?",
+            "correct_table": "agent_metadata",
+        },
+        {
+            "question": "which table user api keys??",
+            "correct_table": "api_token",
+        },
+        {
+            "question": "which table has user deposits?",
+            "correct_table": "user_deposits",
+        },
+        {
+            "question": "which table has information about infrastructure?",
+            "correct_table": "node_metadata",
+        },
+        {
+            "question": "which table has information about user balances?",
+            "correct_table": "user_balances",
+        },
+        {
+            "question": "which table maps news to topics?",
+            "correct_table": "news_to_topic_map",
+        },
+        {
+            "question": "which table has information about projects?",
+            "correct_table": "project_metadata",
+        },
+        {
+            "question": "which table user query metrics?",
+            "correct_table": "query_metrics",
+        },
+    ]
+
+
+def test_memory_bm25_isolated(database_url, llm_config):
+    """Measure BM25 in complete isolation."""
+    _run_memory_test_for_strategy(database_url, llm_config, "bm25")
+
+
+def test_memory_fuzzy_isolated(database_url, llm_config):
+    """Measure Fuzzy in complete isolation."""
+    _run_memory_test_for_strategy(database_url, llm_config, "fuzzy")
+
+
+def test_memory_semantic_isolated(database_url, llm_config):
+    """Measure Semantic in complete isolation."""
+    _run_memory_test_for_strategy(database_url, llm_config, "semantic")
+
+
+def test_memory_hybrid_isolated(database_url, llm_config):
+    """Measure Hybrid in complete isolation."""
+    _run_memory_test_for_strategy(database_url, llm_config, "hybrid")
+
+
+def _run_memory_test_for_strategy(database_url, llm_config, strategy):
+    """Run memory test for a single strategy."""
+    gc.collect()
+
+    engine = create_engine(database_url)
+    search_engine = SchemaSearch(
+        engine,
+        llm_api_key=llm_config["api_key"],
+        llm_base_url=llm_config["base_url"],
+    )
+
+    search_engine.index(force=False)
+
+    process = psutil.Process()
+    after_index_mem = process.memory_info().rss / 1024 / 1024
+    peak_memory = after_index_mem
+
+    eval_data = _get_eval_data()
+    memory_samples = []
+    latency_samples = []
+    total_score = 0
+
+    print(f"\n{'='*50} {strategy.upper()} {'='*50}")
+    print(f"After index: {after_index_mem:.2f} MB")
+    print(f"Embedding cache created: {search_engine._embedding_cache is not None}")
+    print(f"BM25 cache created: {search_engine._bm25_cache is not None}")
+
+    for idx, eval_item in enumerate(eval_data, 1):
+        question = eval_item["question"]
+        correct_table = eval_item["correct_table"]
+
+        before_mem = process.memory_info().rss / 1024 / 1024
+        response = search_engine.search(
+            question, search_type=cast(SearchType, strategy), hops=1
+        )
+        after_mem = process.memory_info().rss / 1024 / 1024
+
+        peak_memory = max(peak_memory, after_mem)
+        memory_samples.append(after_mem)
+        latency_samples.append(response["latency_sec"])
+
+        score = _calculate_score(response["results"], correct_table)
+        total_score += score
+
+        marker = "✓" if score > 0 else "✗"
+        print(
+            f" Q{idx}: {marker} Score: {score} | "
+            f"Latency: {response['latency_sec']:.3f}s | "
+            f"Mem: {after_mem:.1f}MB ({after_mem - before_mem:+.1f})"
+        )
+
+    avg_memory = sum(memory_samples) / len(memory_samples)
+    avg_latency = sum(latency_samples) / len(latency_samples)
+    memory_increase = peak_memory - after_index_mem
+    max_score = len(eval_data) * 5
+
+    print(f"\n{'='*50} SUMMARY {'='*50}")
+    print(f"Score: {total_score}/{max_score}")
+    print(f"Avg Latency: {avg_latency:.3f}s")
+    print(f"Peak Memory: {peak_memory:.2f} MB")
+    print(f"Avg Memory: {avg_memory:.2f} MB")
+    print(f"Memory Increase: +{memory_increase:.2f} MB")
+    if search_engine._embedding_cache:
+        print(
+            f"Embeddings loaded: {search_engine._embedding_cache.embeddings is not None}"
+        )
+    if search_engine._bm25_cache:
+        print(f"BM25 built: {search_engine._bm25_cache.bm25 is not None}")
+    print("=" * 100)
+
+
+def test_bm25_no_embeddings(database_url, llm_config):
+    """Test that BM25 search does NOT load embedding models or cache."""
+    engine = create_engine(database_url)
+    search = SchemaSearch(
+        engine,
+        llm_api_key=llm_config["api_key"],
+        llm_base_url=llm_config["base_url"],
+    )
+
+    search.index(force=False)
+
+    assert search._embedding_cache is None, "Embedding cache should not be created yet"
+    assert search._reranker is None, "Reranker should not be created yet"
+
+    result = search.search("user email", search_type="bm25", limit=5)
+
+    assert search._embedding_cache is None, "BM25 should not load embedding cache"
+    assert len(result["results"]) > 0, "Should have results"
+
+    print("\n✓ BM25 search verified: no embeddings loaded")
+
+
+def test_fuzzy_no_embeddings(database_url, llm_config):
+    """Test that fuzzy search does NOT load embedding models or cache."""
+    engine = create_engine(database_url)
+    search = SchemaSearch(
+        engine,
+        llm_api_key=llm_config["api_key"],
+        llm_base_url=llm_config["base_url"],
+    )
+
+    search.index(force=False)
+
+    assert search._embedding_cache is None, "Embedding cache should not be created yet"
+    assert search._reranker is None, "Reranker should not be created yet"
+
+    result = search.search("user email", search_type="fuzzy", limit=5)
+
+    assert search._embedding_cache is None, "Fuzzy should not load embedding cache"
+    assert len(result["results"]) > 0, "Should have results"
+
+    print("\n✓ Fuzzy search verified: no embeddings loaded")
+
+
+def test_semantic_loads_embeddings(database_url, llm_config):
+    """Test that semantic search DOES load embedding models and cache."""
+    engine = create_engine(database_url)
+    search = SchemaSearch(
+        engine,
+        llm_api_key=llm_config["api_key"],
+        llm_base_url=llm_config["base_url"],
+    )
+
+    search.index(force=False)
+
+    assert search._embedding_cache is None, "Embedding cache should not be created yet"
+
+    result = search.search("user email", search_type="semantic", limit=5)
+
+    assert search._embedding_cache is not None, "Semantic should create embedding cache"
+    assert search.embedding_cache.embeddings is not None, "Embeddings should be loaded"
+    assert len(result["results"]) > 0, "Should have results"
+
+    print("\n✓ Semantic search verified: embeddings loaded correctly")
+
+
+def test_hybrid_loads_embeddings(database_url, llm_config):
+    """Test that hybrid search DOES load embedding models and cache."""
+    engine = create_engine(database_url)
+    search = SchemaSearch(
+        engine,
+        llm_api_key=llm_config["api_key"],
+        llm_base_url=llm_config["base_url"],
+    )
+
+    search.index(force=False)
+
+    assert search._embedding_cache is None, "Embedding cache should not be created yet"
+
+    result = search.search("user email", search_type="hybrid", limit=5)
+
+    assert search._embedding_cache is not None, "Hybrid should create embedding cache"
+    assert search.embedding_cache.embeddings is not None, "Embeddings should be loaded"
+    assert len(result["results"]) > 0, "Should have results"
+
+    print("\n✓ Hybrid search verified: embeddings loaded correctly")
+
+
+def test_strategy_caching(database_url, llm_config):
+    """Test that search strategies are cached and reused."""
+    engine = create_engine(database_url)
+    search = SchemaSearch(
+        engine,
+        llm_api_key=llm_config["api_key"],
+        llm_base_url=llm_config["base_url"],
+    )
+
+    search.index(force=False)
+
+    assert len(search._search_strategies) == 0, "No strategies cached initially"
+
+    search.search("test query", search_type="bm25", limit=5)
+    assert "bm25" in search._search_strategies, "BM25 strategy should be cached"
+    assert len(search._search_strategies) == 1, "Only one strategy cached"
+
+    bm25_strategy = search._search_strategies["bm25"]
+    search.search("another query", search_type="bm25", limit=5)
+    assert (
+        search._search_strategies["bm25"] is bm25_strategy
+    ), "Same strategy instance should be reused"
+
+    search.search("test query", search_type="fuzzy", limit=5)
+    assert "fuzzy" in search._search_strategies, "Fuzzy strategy should be cached"
+    assert len(search._search_strategies) == 2, "Two strategies cached now"
+
+    print("\n✓ Strategy caching verified: strategies are reused")