schema-search 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of schema-search might be problematic. Click here for more details.

Files changed (45) hide show
  1. schema_search-0.1.2/LICENSE +21 -0
  2. schema_search-0.1.2/MANIFEST.in +4 -0
  3. schema_search-0.1.2/PKG-INFO +275 -0
  4. schema_search-0.1.2/README.md +223 -0
  5. schema_search-0.1.2/config.yml +34 -0
  6. schema_search-0.1.2/schema_search/__init__.py +26 -0
  7. schema_search-0.1.2/schema_search/chunkers/__init__.py +6 -0
  8. schema_search-0.1.2/schema_search/chunkers/base.py +95 -0
  9. schema_search-0.1.2/schema_search/chunkers/factory.py +31 -0
  10. schema_search-0.1.2/schema_search/chunkers/llm.py +51 -0
  11. schema_search-0.1.2/schema_search/chunkers/markdown.py +25 -0
  12. schema_search-0.1.2/schema_search/embedding_cache/__init__.py +5 -0
  13. schema_search-0.1.2/schema_search/embedding_cache/base.py +40 -0
  14. schema_search-0.1.2/schema_search/embedding_cache/bm25.py +63 -0
  15. schema_search-0.1.2/schema_search/embedding_cache/factory.py +20 -0
  16. schema_search-0.1.2/schema_search/embedding_cache/inmemory.py +112 -0
  17. schema_search-0.1.2/schema_search/graph_builder.py +69 -0
  18. schema_search-0.1.2/schema_search/mcp_server.py +82 -0
  19. schema_search-0.1.2/schema_search/metrics.py +33 -0
  20. schema_search-0.1.2/schema_search/rankers/__init__.py +5 -0
  21. schema_search-0.1.2/schema_search/rankers/base.py +45 -0
  22. schema_search-0.1.2/schema_search/rankers/cross_encoder.py +34 -0
  23. schema_search-0.1.2/schema_search/rankers/factory.py +11 -0
  24. schema_search-0.1.2/schema_search/schema_extractor.py +135 -0
  25. schema_search-0.1.2/schema_search/schema_search.py +263 -0
  26. schema_search-0.1.2/schema_search/search/__init__.py +15 -0
  27. schema_search-0.1.2/schema_search/search/base.py +85 -0
  28. schema_search-0.1.2/schema_search/search/bm25.py +48 -0
  29. schema_search-0.1.2/schema_search/search/factory.py +61 -0
  30. schema_search-0.1.2/schema_search/search/fuzzy.py +56 -0
  31. schema_search-0.1.2/schema_search/search/hybrid.py +82 -0
  32. schema_search-0.1.2/schema_search/search/semantic.py +49 -0
  33. schema_search-0.1.2/schema_search/types.py +57 -0
  34. schema_search-0.1.2/schema_search.egg-info/PKG-INFO +275 -0
  35. schema_search-0.1.2/schema_search.egg-info/SOURCES.txt +43 -0
  36. schema_search-0.1.2/schema_search.egg-info/dependency_links.txt +1 -0
  37. schema_search-0.1.2/schema_search.egg-info/entry_points.txt +2 -0
  38. schema_search-0.1.2/schema_search.egg-info/requires.txt +31 -0
  39. schema_search-0.1.2/schema_search.egg-info/top_level.txt +2 -0
  40. schema_search-0.1.2/setup.cfg +4 -0
  41. schema_search-0.1.2/setup.py +63 -0
  42. schema_search-0.1.2/tests/__init__.py +0 -0
  43. schema_search-0.1.2/tests/test_integration.py +352 -0
  44. schema_search-0.1.2/tests/test_llm_sql_generation.py +320 -0
  45. schema_search-0.1.2/tests/test_spider_eval.py +484 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Adib Hasan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,4 @@
1
+ include README.md
2
+ include LICENSE
3
+ include config.yml
4
+
@@ -0,0 +1,275 @@
1
+ Metadata-Version: 2.4
2
+ Name: schema-search
3
+ Version: 0.1.2
4
+ Summary: Natural language search for database schemas with graph-aware semantic retrieval
5
+ Home-page: https://github.com/neehan/schema-search
6
+ Author:
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.8
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: sqlalchemy>=1.4.0
19
+ Requires-Dist: sentence-transformers>=2.2.0
20
+ Requires-Dist: networkx>=2.8.0
21
+ Requires-Dist: bm25s>=0.2.0
22
+ Requires-Dist: numpy>=1.21.0
23
+ Requires-Dist: pyyaml>=6.0
24
+ Requires-Dist: tqdm>=4.65.0
25
+ Requires-Dist: openai>=1.0.0
26
+ Requires-Dist: rapidfuzz>=3.0.0
27
+ Provides-Extra: mcp
28
+ Requires-Dist: fastmcp>=2.0.0; extra == "mcp"
29
+ Provides-Extra: test
30
+ Requires-Dist: pytest>=7.0.0; extra == "test"
31
+ Requires-Dist: python-dotenv>=1.0.0; extra == "test"
32
+ Requires-Dist: psutil>=5.9.0; extra == "test"
33
+ Requires-Dist: datasets>=2.0.0; extra == "test"
34
+ Provides-Extra: postgres
35
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == "postgres"
36
+ Provides-Extra: mysql
37
+ Requires-Dist: pymysql>=1.0.0; extra == "mysql"
38
+ Provides-Extra: snowflake
39
+ Requires-Dist: snowflake-sqlalchemy>=1.4.0; extra == "snowflake"
40
+ Requires-Dist: snowflake-connector-python>=3.0.0; extra == "snowflake"
41
+ Provides-Extra: bigquery
42
+ Requires-Dist: sqlalchemy-bigquery>=1.6.0; extra == "bigquery"
43
+ Dynamic: classifier
44
+ Dynamic: description
45
+ Dynamic: description-content-type
46
+ Dynamic: home-page
47
+ Dynamic: license-file
48
+ Dynamic: provides-extra
49
+ Dynamic: requires-dist
50
+ Dynamic: requires-python
51
+ Dynamic: summary
52
+
53
+ # Schema Search
54
+
55
+ An MCP Server for Natural Language Search over RDBMS Schemas. Find exact tables you need, with all their relationships mapped out, in milliseconds. No vector database setup is required.
56
+
57
+ ## Why
58
+
59
+ You have 200 tables in your database. Someone asks "where are user refunds stored?"
60
+
61
+ You could:
62
+ - Grep through SQL files for 20 minutes
63
+ - Pass the full schema to an LLM and watch it struggle with 200 tables
64
+
65
+ Or **build schematic embeddings of your tables, store in-memory, and query in natural language in an MCP server**.
66
+
67
+ ### Benefits
68
+ - No vector database setup is required
69
+ - Small memory footprint -- easily scales up to 1000 tables and 10,000+ columns.
70
+ - Millisecond query latency
71
+
72
+ ## Install
73
+
74
+ ```bash
75
+ # With uv - PostgreSQL (recommended)
76
+ uv pip install "schema-search[postgres,mcp]"
77
+
78
+ # With pip - PostgreSQL
79
+ pip install "schema-search[postgres,mcp]"
80
+
81
+ # Other databases
82
+ uv pip install "schema-search[mysql,mcp]" # MySQL
83
+ uv pip install "schema-search[snowflake,mcp]" # Snowflake
84
+ uv pip install "schema-search[bigquery,mcp]" # BigQuery
85
+ ```
86
+
87
+ ## MCP Server
88
+
89
+ Integrate with Claude Desktop or any MCP client.
90
+
91
+ ### Setup
92
+
93
+ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
94
+
95
+ **Using uv (Recommended):**
96
+ ```json
97
+ {
98
+ "mcpServers": {
99
+ "schema-search": {
100
+ "command": "uvx",
101
+ "args": ["schema-search[postgres,mcp]", "postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
102
+ }
103
+ }
104
+ }
105
+ ```
106
+
107
+ **Using pip:**
108
+ ```json
109
+ {
110
+ "mcpServers": {
111
+ "schema-search": {
112
+ "command": "path/to/schema-search-mcp", // conda: /Users/<username>/opt/miniconda3/envs/<your env>/bin/schema-search-mcp",
113
+ "args": ["postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
114
+ }
115
+ }
116
+ }
117
+ ```
118
+
119
+
120
+ The LLM API key and base url are only required if you use LLM-generated schema summaries (`config.chunking.strategy = 'llm'`).
121
+
122
+ ### CLI Usage
123
+
124
+ ```bash
125
+ schema-search-mcp "postgresql://user:pass@localhost/db"
126
+ ```
127
+
128
+ Optional args: `[config_path] [llm_api_key] [llm_base_url]`
129
+
130
+ The server exposes `schema_search(query, hops, limit)` for natural language schema queries.
131
+
132
+ ## Python Use
133
+
134
+ ```python
135
+ from sqlalchemy import create_engine
136
+ from schema_search import SchemaSearch
137
+
138
+ engine = create_engine("postgresql://user:pass@localhost/db")
139
+ search = SchemaSearch(engine)
140
+
141
+ search.index(force=False) # default is False
142
+ results = search.search("where are user refunds stored?")
143
+
144
+ for result in results['results']:
145
+ print(result['table']) # "refund_transactions"
146
+ print(result['schema']) # Full column info, types, constraints
147
+ print(result['related_tables']) # ["users", "payments", "transactions"]
148
+
149
+ # Override hops, limit, search strategy
150
+ results = search.search("user_table", hops=0, limit=5, search_type="semantic")
151
+
152
+ ```
153
+
154
+ `SchemaSearch.index()` automatically detects schema changes and refreshes cached metadata, so you rarely need to force a reindex manually.
155
+
156
+ ## Configuration
157
+
158
+ Edit [config.yml](config.yml):
159
+
160
+ ```yaml
161
+ logging:
162
+ level: "WARNING"
163
+
164
+ embedding:
165
+ location: "memory" # Options: "memory", "vectordb" (coming soon)
166
+ model: "multi-qa-MiniLM-L6-cos-v1"
167
+ metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
168
+ batch_size: 32
169
+ show_progress: false
170
+ cache_dir: "/tmp/.schema_search_cache"
171
+
172
+ chunking:
173
+ strategy: "raw" # Options: "raw", "llm"
174
+ max_tokens: 256
175
+ overlap_tokens: 50
176
+ model: "gpt-4o-mini"
177
+
178
+ search:
179
+ # Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
180
+ strategy: "hybrid"
181
+ initial_top_k: 20
182
+ rerank_top_k: 5
183
+ semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
184
+ hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
185
+
186
+ reranker:
187
+ # CrossEncoder model for reranking. Set to null to disable reranking
188
+ model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
189
+
190
+ schema:
191
+ include_columns: true
192
+ include_indices: true
193
+ include_foreign_keys: true
194
+ include_constraints: true
195
+ ```
196
+
197
+ ## Search Strategies
198
+
199
+ Schema Search supports four search strategies:
200
+
201
+ - **semantic**: Embedding-based similarity search using sentence transformers
202
+ - **bm25**: Lexical search using BM25 ranking algorithm
203
+ - **fuzzy**: String matching on table/column names using fuzzy matching
204
+ - **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33% bm25)
205
+
206
+ Each strategy performs its own initial ranking, then optionally applies CrossEncoder reranking if `reranker.model` is configured. Set `reranker.model` to `null` to disable reranking.
207
+
208
+ ## Performance Comparison
209
+ We [benchmarked](/tests/test_spider_eval.py) on the Spider dataset (1,234 train queries across 18 databases) using the default `config.yml`.
210
+
211
+ **Memory:** The embedding model requires ~90 MB and the optional reranker adds ~155 MB. Actual process memory depends on your Python runtime.
212
+
213
+ ### Without Reranker (`reranker.model: null`)
214
+ ![Without Reranker](img/spider_benchmark_without_reranker.png)
215
+ - **Indexing:** 0.22s ± 0.08s per database (18 total).
216
+ - **Accuracy:** Hybrid leads with Recall@1 62% / MRR 0.93; Semantic follows at Recall@1 58% / MRR 0.89.
217
+ - **Latency:** BM25 and Fuzzy return in ~5ms; Semantic spends ~15ms; Hybrid (semantic + bm25) averages 52ms.
218
+ - **Fuzzy baseline:** Recall@1 22%, highlighting the need for semantic signals on natural-language queries.
219
+
220
+ ### With Reranker (`Alibaba-NLP/gte-reranker-modernbert-base`)
221
+ ![With Reranker](img/spider_benchmark_with_reranker.png)
222
+ - **Indexing:** 0.25s ± 0.05s per database (same 18 DBs).
223
+ - **Accuracy:** All strategies converge around Recall@1 62% and MRR ≈ 0.92; Fuzzy jumps from 51% → 92% MRR.
224
+ - **Latency trade-off:** Extra CrossEncoder pass lifts per-query latency to ~0.18–0.29s depending on strategy.
225
+ - **Recommendation:** Enable the reranker when accuracy matters most; disable it for ultra-low-latency lookups.
226
+
227
+
228
+ You can override the search strategy, hops, and limit at query time:
229
+
230
+ ```python
231
+ # Use fuzzy search instead of default
232
+ results = search.search("user_table", search_type="fuzzy")
233
+
234
+ # Use BM25 for keyword-based search
235
+ results = search.search("transactions payments", search_type="bm25")
236
+
237
+ # Use hybrid for best of both worlds
238
+ results = search.search("where are user refunds?", search_type="hybrid")
239
+
240
+ # Override hops and limit
241
+ results = search.search("user refunds", hops=2, limit=10) # Expand 2 hops, return 10 tables
242
+
243
+ # Disable graph expansion
244
+ results = search.search("user_table", hops=0) # Only direct matches, no foreign key traversal
245
+ ```
246
+
247
+ ### LLM Chunking
248
+
249
+ Use LLM to generate semantic summaries instead of raw schema text:
250
+
251
+ 1. Set `strategy: "llm"` in `config.yml`
252
+ 2. Pass API credentials:
253
+
254
+ ```python
255
+ search = SchemaSearch(
256
+ engine,
257
+ llm_api_key="sk-...",
258
+ llm_base_url="https://api.openai.com/v1/" # optional
259
+ )
260
+ ```
261
+
262
+ ## How It Works
263
+
264
+ 1. **Extract schemas** from database using SQLAlchemy inspector
265
+ 2. **Chunk schemas** into digestible pieces (markdown or LLM-generated summaries)
266
+ 3. **Initial search** using selected strategy (semantic/BM25/fuzzy)
267
+ 4. **Expand via foreign keys** to find related tables (configurable hops)
268
+ 5. **Optional reranking** with CrossEncoder to refine results
269
+ 6. Return top tables with full schema and relationships
270
+
271
+ Cache stored in `.schema_search_cache/` (configurable in `config.yml`)
272
+
273
+ ## License
274
+
275
+ MIT
@@ -0,0 +1,223 @@
1
+ # Schema Search
2
+
3
+ An MCP Server for Natural Language Search over RDBMS Schemas. Find exact tables you need, with all their relationships mapped out, in milliseconds. No vector database setup is required.
4
+
5
+ ## Why
6
+
7
+ You have 200 tables in your database. Someone asks "where are user refunds stored?"
8
+
9
+ You could:
10
+ - Grep through SQL files for 20 minutes
11
+ - Pass the full schema to an LLM and watch it struggle with 200 tables
12
+
13
+ Or **build schematic embeddings of your tables, store in-memory, and query in natural language in an MCP server**.
14
+
15
+ ### Benefits
16
+ - No vector database setup is required
17
+ - Small memory footprint -- easily scales up to 1000 tables and 10,000+ columns.
18
+ - Millisecond query latency
19
+
20
+ ## Install
21
+
22
+ ```bash
23
+ # With uv - PostgreSQL (recommended)
24
+ uv pip install "schema-search[postgres,mcp]"
25
+
26
+ # With pip - PostgreSQL
27
+ pip install "schema-search[postgres,mcp]"
28
+
29
+ # Other databases
30
+ uv pip install "schema-search[mysql,mcp]" # MySQL
31
+ uv pip install "schema-search[snowflake,mcp]" # Snowflake
32
+ uv pip install "schema-search[bigquery,mcp]" # BigQuery
33
+ ```
34
+
35
+ ## MCP Server
36
+
37
+ Integrate with Claude Desktop or any MCP client.
38
+
39
+ ### Setup
40
+
41
+ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
42
+
43
+ **Using uv (Recommended):**
44
+ ```json
45
+ {
46
+ "mcpServers": {
47
+ "schema-search": {
48
+ "command": "uvx",
49
+ "args": ["schema-search[postgres,mcp]", "postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
50
+ }
51
+ }
52
+ }
53
+ ```
54
+
55
+ **Using pip:**
56
+ ```json
57
+ {
58
+ "mcpServers": {
59
+ "schema-search": {
60
+ "command": "path/to/schema-search-mcp", // conda: /Users/<username>/opt/miniconda3/envs/<your env>/bin/schema-search-mcp",
61
+ "args": ["postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
62
+ }
63
+ }
64
+ }
65
+ ```
66
+
67
+
68
+ The LLM API key and base url are only required if you use LLM-generated schema summaries (`config.chunking.strategy = 'llm'`).
69
+
70
+ ### CLI Usage
71
+
72
+ ```bash
73
+ schema-search-mcp "postgresql://user:pass@localhost/db"
74
+ ```
75
+
76
+ Optional args: `[config_path] [llm_api_key] [llm_base_url]`
77
+
78
+ The server exposes `schema_search(query, hops, limit)` for natural language schema queries.
79
+
80
+ ## Python Use
81
+
82
+ ```python
83
+ from sqlalchemy import create_engine
84
+ from schema_search import SchemaSearch
85
+
86
+ engine = create_engine("postgresql://user:pass@localhost/db")
87
+ search = SchemaSearch(engine)
88
+
89
+ search.index(force=False) # default is False
90
+ results = search.search("where are user refunds stored?")
91
+
92
+ for result in results['results']:
93
+ print(result['table']) # "refund_transactions"
94
+ print(result['schema']) # Full column info, types, constraints
95
+ print(result['related_tables']) # ["users", "payments", "transactions"]
96
+
97
+ # Override hops, limit, search strategy
98
+ results = search.search("user_table", hops=0, limit=5, search_type="semantic")
99
+
100
+ ```
101
+
102
+ `SchemaSearch.index()` automatically detects schema changes and refreshes cached metadata, so you rarely need to force a reindex manually.
103
+
104
+ ## Configuration
105
+
106
+ Edit [config.yml](config.yml):
107
+
108
+ ```yaml
109
+ logging:
110
+ level: "WARNING"
111
+
112
+ embedding:
113
+ location: "memory" # Options: "memory", "vectordb" (coming soon)
114
+ model: "multi-qa-MiniLM-L6-cos-v1"
115
+ metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
116
+ batch_size: 32
117
+ show_progress: false
118
+ cache_dir: "/tmp/.schema_search_cache"
119
+
120
+ chunking:
121
+ strategy: "raw" # Options: "raw", "llm"
122
+ max_tokens: 256
123
+ overlap_tokens: 50
124
+ model: "gpt-4o-mini"
125
+
126
+ search:
127
+ # Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
128
+ strategy: "hybrid"
129
+ initial_top_k: 20
130
+ rerank_top_k: 5
131
+ semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
132
+ hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
133
+
134
+ reranker:
135
+ # CrossEncoder model for reranking. Set to null to disable reranking
136
+ model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
137
+
138
+ schema:
139
+ include_columns: true
140
+ include_indices: true
141
+ include_foreign_keys: true
142
+ include_constraints: true
143
+ ```
144
+
145
+ ## Search Strategies
146
+
147
+ Schema Search supports four search strategies:
148
+
149
+ - **semantic**: Embedding-based similarity search using sentence transformers
150
+ - **bm25**: Lexical search using BM25 ranking algorithm
151
+ - **fuzzy**: String matching on table/column names using fuzzy matching
152
+ - **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33% fuzzy)
153
+
154
+ Each strategy performs its own initial ranking, then optionally applies CrossEncoder reranking if `reranker.model` is configured. Set `reranker.model` to `null` to disable reranking.
155
+
156
+ ## Performance Comparison
157
+ We [benchmarked](/tests/test_spider_eval.py) on the Spider dataset (1,234 train queries across 18 databases) using the default `config.yml`.
158
+
159
+ **Memory:** The embedding model requires ~90 MB and the optional reranker adds ~155 MB. Actual process memory depends on your Python runtime.
160
+
161
+ ### Without Reranker (`reranker.model: null`)
162
+ ![Without Reranker](img/spider_benchmark_without_reranker.png)
163
+ - **Indexing:** 0.22s ± 0.08s per database (18 total).
164
+ - **Accuracy:** Hybrid leads with Recall@1 62% / MRR 0.93; Semantic follows at Recall@1 58% / MRR 0.89.
165
+ - **Latency:** BM25 and Fuzzy return in ~5ms; Semantic spends ~15ms; Hybrid (semantic + fuzzy) averages 52ms.
166
+ - **Fuzzy baseline:** Recall@1 22%, highlighting the need for semantic signals on natural-language queries.
167
+
168
+ ### With Reranker (`Alibaba-NLP/gte-reranker-modernbert-base`)
169
+ ![With Reranker](img/spider_benchmark_with_reranker.png)
170
+ - **Indexing:** 0.25s ± 0.05s per database (same 18 DBs).
171
+ - **Accuracy:** All strategies converge around Recall@1 62% and MRR ≈ 0.92; Fuzzy jumps from 51% → 92% MRR.
172
+ - **Latency trade-off:** Extra CrossEncoder pass lifts per-query latency to ~0.18–0.29s depending on strategy.
173
+ - **Recommendation:** Enable the reranker when accuracy matters most; disable it for ultra-low-latency lookups.
174
+
175
+
176
+ You can override the search strategy, hops, and limit at query time:
177
+
178
+ ```python
179
+ # Use fuzzy search instead of default
180
+ results = search.search("user_table", search_type="fuzzy")
181
+
182
+ # Use BM25 for keyword-based search
183
+ results = search.search("transactions payments", search_type="bm25")
184
+
185
+ # Use hybrid for best of both worlds
186
+ results = search.search("where are user refunds?", search_type="hybrid")
187
+
188
+ # Override hops and limit
189
+ results = search.search("user refunds", hops=2, limit=10) # Expand 2 hops, return 10 tables
190
+
191
+ # Disable graph expansion
192
+ results = search.search("user_table", hops=0) # Only direct matches, no foreign key traversal
193
+ ```
194
+
195
+ ### LLM Chunking
196
+
197
+ Use LLM to generate semantic summaries instead of raw schema text:
198
+
199
+ 1. Set `strategy: "llm"` in `config.yml`
200
+ 2. Pass API credentials:
201
+
202
+ ```python
203
+ search = SchemaSearch(
204
+ engine,
205
+ llm_api_key="sk-...",
206
+ llm_base_url="https://api.openai.com/v1/" # optional
207
+ )
208
+ ```
209
+
210
+ ## How It Works
211
+
212
+ 1. **Extract schemas** from database using SQLAlchemy inspector
213
+ 2. **Chunk schemas** into digestible pieces (markdown or LLM-generated summaries)
214
+ 3. **Initial search** using selected strategy (semantic/BM25/fuzzy)
215
+ 4. **Expand via foreign keys** to find related tables (configurable hops)
216
+ 5. **Optional reranking** with CrossEncoder to refine results
217
+ 6. Return top tables with full schema and relationships
218
+
219
+ Cache stored in `.schema_search_cache/` (configurable in `config.yml`)
220
+
221
+ ## License
222
+
223
+ MIT
@@ -0,0 +1,34 @@
1
+ logging:
2
+ level: "WARNING"
3
+
4
+ embedding:
5
+ location: "memory" # Options: "memory", "vectordb" (coming soon)
6
+ model: "multi-qa-MiniLM-L6-cos-v1"
7
+ metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
8
+ batch_size: 32
9
+ show_progress: false
10
+ cache_dir: "/tmp/.schema_search_cache"
11
+
12
+ chunking:
13
+ strategy: "raw" # Options: "raw", "llm"
14
+ max_tokens: 256
15
+ overlap_tokens: 50
16
+ model: "gpt-4o-mini"
17
+
18
+ search:
19
+ # Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
20
+ strategy: "bm25"
21
+ initial_top_k: 20
22
+ rerank_top_k: 5
23
+ semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
24
+ hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
25
+
26
+ reranker:
27
+ # CrossEncoder model for reranking. Set to null to disable reranking
28
+ model: # "Alibaba-NLP/gte-reranker-modernbert-base"
29
+
30
+ schema:
31
+ include_columns: true
32
+ include_indices: true
33
+ include_foreign_keys: true
34
+ include_constraints: true
@@ -0,0 +1,26 @@
1
"""Public API for the schema_search package.

Re-exports the main :class:`SchemaSearch` entry point and the result/typing
helpers so users can ``from schema_search import SchemaSearch``.
"""

from schema_search.schema_search import SchemaSearch
from schema_search.types import (
    IndexResult,
    SearchResult,
    SearchResultItem,
    SearchType,
    TableSchema,
    ColumnInfo,
    ForeignKeyInfo,
    IndexInfo,
    ConstraintInfo,
)

# Fix: keep in sync with the distribution version (this release is 0.1.2;
# the previous value "0.1.0" was stale).
__version__ = "0.1.2"
__all__ = [
    "SchemaSearch",
    "IndexResult",
    "SearchResult",
    "SearchResultItem",
    "SearchType",
    "TableSchema",
    "ColumnInfo",
    "ForeignKeyInfo",
    "IndexInfo",
    "ConstraintInfo",
]
@@ -0,0 +1,6 @@
1
"""Chunker implementations that turn table schemas into embeddable text chunks."""

from schema_search.chunkers.base import Chunk, BaseChunker
from schema_search.chunkers.markdown import MarkdownChunker
from schema_search.chunkers.llm import LLMChunker
from schema_search.chunkers.factory import create_chunker

# Public API of the chunkers subpackage.
__all__ = ["Chunk", "BaseChunker", "MarkdownChunker", "LLMChunker", "create_chunker"]
@@ -0,0 +1,95 @@
1
+ from typing import Dict, List
2
+ from dataclasses import dataclass
3
+ from abc import ABC, abstractmethod
4
+
5
+ from tqdm import tqdm
6
+
7
+ from schema_search.types import TableSchema
8
+
9
+
10
@dataclass
class Chunk:
    """One token-bounded slice of a table's rendered schema text.

    Produced by :class:`BaseChunker`; each chunk is embedded/indexed
    independently, so it carries the owning table name for attribution.
    """

    table_name: str  # table this chunk describes
    content: str  # chunk text, beginning with a "Table: <name>" header line
    chunk_id: int  # globally sequential id across all tables in one indexing run
    token_count: int  # estimated token count (see BaseChunker._estimate_tokens)
16
+
17
+
18
class BaseChunker(ABC):
    """Split table schemas into token-bounded chunks for embedding.

    Subclasses implement :meth:`_generate_content` to render one table's
    schema as text (e.g. raw markdown or an LLM-generated summary); this base
    class splits that text into chunks of at most ``max_tokens`` estimated
    tokens, each prefixed with a canonical ``Table: <name>`` header line so
    every chunk remains attributable to its table.
    """

    def __init__(self, max_tokens: int, overlap_tokens: int, show_progress: bool = False):
        self.max_tokens = max_tokens
        # NOTE(review): overlap_tokens is stored but never consumed by
        # _chunk_table, so chunks currently do not overlap -- confirm whether
        # overlap support is still planned (config.yml exposes overlap_tokens).
        self.overlap_tokens = overlap_tokens
        self.show_progress = show_progress

    def chunk_schemas(self, schemas: Dict[str, TableSchema]) -> List[Chunk]:
        """Chunk every table schema, assigning globally sequential chunk ids.

        Args:
            schemas: Mapping of table name to its extracted schema.

        Returns:
            All chunks for all tables, in table iteration order.
        """
        chunks: List[Chunk] = []
        chunk_id = 0

        iterator = schemas.items()
        if self.show_progress:
            iterator = tqdm(iterator, desc="Chunking tables", unit="table")

        for table_name, schema in iterator:
            table_chunks = self._chunk_table(table_name, schema, chunk_id)
            chunks.extend(table_chunks)
            chunk_id += len(table_chunks)

        return chunks

    @abstractmethod
    def _generate_content(self, table_name: str, schema: TableSchema) -> str:
        """Render the table schema as text; the first line is treated as a header."""

    def _chunk_table(
        self, table_name: str, schema: TableSchema, start_id: int
    ) -> List[Chunk]:
        """Split one table's rendered schema into chunks of <= max_tokens.

        Args:
            table_name: Table being chunked.
            schema: Extracted schema passed through to ``_generate_content``.
            start_id: First chunk id to assign (ids increase by one per chunk).

        Returns:
            At least one chunk per table.
        """
        content = self._generate_content(table_name, schema)
        lines = content.split("\n")

        # NOTE(review): lines[0] is assumed to be the header emitted by
        # _generate_content and is replaced by the canonical header below --
        # confirm all subclasses emit the table header as their first line.
        header = f"Table: {table_name}"
        header_tokens = self._estimate_tokens(header)

        chunks: List[Chunk] = []
        current_chunk_lines = [header]
        current_tokens = header_tokens
        chunk_id = start_id

        for line in lines[1:]:
            line_tokens = self._estimate_tokens(line)

            # Flush the current chunk once adding this line would exceed the
            # token budget -- but never flush a header-only chunk.
            if (
                current_tokens + line_tokens > self.max_tokens
                and len(current_chunk_lines) > 1
            ):
                chunks.append(
                    Chunk(
                        table_name=table_name,
                        content="\n".join(current_chunk_lines),
                        chunk_id=chunk_id,
                        token_count=current_tokens,
                    )
                )
                chunk_id += 1

                current_chunk_lines = [header]
                current_tokens = header_tokens

            current_chunk_lines.append(line)
            current_tokens += line_tokens

        # Flush the trailing chunk. Also emit a header-only chunk when the
        # content had no body lines (bug fix: previously such tables produced
        # zero chunks and were therefore invisible to search).
        if len(current_chunk_lines) > 1 or not chunks:
            chunks.append(
                Chunk(
                    table_name=table_name,
                    content="\n".join(current_chunk_lines),
                    chunk_id=chunk_id,
                    token_count=current_tokens,
                )
            )

        return chunks

    def _estimate_tokens(self, text: str) -> int:
        """Cheap token estimate: word count plus one token per 4 characters."""
        return len(text.split()) + len(text) // 4