schema-search 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of schema-search might be problematic. Click here for more details.
- schema_search/mcp_server.py +4 -5
- {schema_search-0.1.6.dist-info → schema_search-0.1.8.dist-info}/METADATA +66 -48
- {schema_search-0.1.6.dist-info → schema_search-0.1.8.dist-info}/RECORD +7 -7
- {schema_search-0.1.6.dist-info → schema_search-0.1.8.dist-info}/WHEEL +0 -0
- {schema_search-0.1.6.dist-info → schema_search-0.1.8.dist-info}/entry_points.txt +0 -0
- {schema_search-0.1.6.dist-info → schema_search-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {schema_search-0.1.6.dist-info → schema_search-0.1.8.dist-info}/top_level.txt +0 -0
schema_search/mcp_server.py
CHANGED
|
@@ -16,7 +16,6 @@ mcp = FastMCP("schema-search")
|
|
|
16
16
|
@mcp.tool()
|
|
17
17
|
def schema_search(
|
|
18
18
|
query: str,
|
|
19
|
-
hops: Optional[int] = None,
|
|
20
19
|
limit: int = 5,
|
|
21
20
|
) -> dict:
|
|
22
21
|
"""Search database schema using natural language.
|
|
@@ -25,14 +24,14 @@ def schema_search(
|
|
|
25
24
|
using semantic similarity. Expands results by traversing foreign key relationships.
|
|
26
25
|
|
|
27
26
|
Args:
|
|
28
|
-
query: Natural language question about database schema (e.g., '
|
|
29
|
-
|
|
30
|
-
limit: Maximum number of table schemas to return in results. Default: 5
|
|
27
|
+
query: Natural language question about database schema (e.g., 'tables related to payments')
|
|
28
|
+
limit: Maximum number of table schemas to return in results. Default: 5; Max: 10.
|
|
31
29
|
|
|
32
30
|
Returns:
|
|
33
31
|
Dictionary with 'results' (list of table schemas with columns, types, constraints, and relationships) and 'latency_sec' (query execution time)
|
|
34
32
|
"""
|
|
35
|
-
|
|
33
|
+
limit = min(limit, 10)
|
|
34
|
+
search_result = mcp.search_engine.search(query, limit=limit) # type: ignore
|
|
36
35
|
return {
|
|
37
36
|
"results": search_result["results"],
|
|
38
37
|
"latency_sec": search_result["latency_sec"],
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: schema-search
|
|
3
|
-
Version: 0.1.6
|
|
3
|
+
Version: 0.1.8
|
|
4
4
|
Summary: Natural language database schema search with graph-aware semantic retrieval
|
|
5
5
|
Home-page: https://adibhasan.com/blog/schema-search/
|
|
6
6
|
Author: Adib Hasan
|
|
@@ -83,6 +83,48 @@ uv pip install "schema-search[snowflake,mcp]" # Snowflake
|
|
|
83
83
|
uv pip install "schema-search[bigquery,mcp]" # BigQuery
|
|
84
84
|
```
|
|
85
85
|
|
|
86
|
+
## Configuration
|
|
87
|
+
|
|
88
|
+
Edit [`config.yml`](https://github.com/Neehan/schema-search/blob/main/config.yml):
|
|
89
|
+
|
|
90
|
+
```yaml
|
|
91
|
+
logging:
|
|
92
|
+
level: "WARNING"
|
|
93
|
+
|
|
94
|
+
embedding:
|
|
95
|
+
location: "memory" # Options: "memory", "vectordb" (coming soon)
|
|
96
|
+
model: "multi-qa-MiniLM-L6-cos-v1"
|
|
97
|
+
metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
|
|
98
|
+
batch_size: 32
|
|
99
|
+
show_progress: false
|
|
100
|
+
cache_dir: "/tmp/.schema_search_cache"
|
|
101
|
+
|
|
102
|
+
chunking:
|
|
103
|
+
strategy: "raw" # Options: "raw", "llm"
|
|
104
|
+
max_tokens: 256
|
|
105
|
+
overlap_tokens: 50
|
|
106
|
+
model: "gpt-4o-mini"
|
|
107
|
+
|
|
108
|
+
search:
|
|
109
|
+
# Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
|
|
110
|
+
strategy: "hybrid"
|
|
111
|
+
initial_top_k: 20
|
|
112
|
+
rerank_top_k: 5
|
|
113
|
+
semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
|
|
114
|
+
hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
|
|
115
|
+
|
|
116
|
+
reranker:
|
|
117
|
+
# CrossEncoder model for reranking. Set to null to disable reranking
|
|
118
|
+
model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
|
|
119
|
+
|
|
120
|
+
schema:
|
|
121
|
+
include_columns: true
|
|
122
|
+
include_indices: true
|
|
123
|
+
include_foreign_keys: true
|
|
124
|
+
include_constraints: true
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
|
|
86
128
|
## MCP Server
|
|
87
129
|
|
|
88
130
|
Integrate with Claude Desktop or any MCP client.
|
|
@@ -97,7 +139,13 @@ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
|
|
|
97
139
|
"mcpServers": {
|
|
98
140
|
"schema-search": {
|
|
99
141
|
"command": "uvx",
|
|
100
|
-
"args": [
|
|
142
|
+
"args": [
|
|
143
|
+
"schema-search[postgres,mcp]",
|
|
144
|
+
"postgresql://user:pass@localhost/db",
|
|
145
|
+
"optional/path/to/config.yml",
|
|
146
|
+
"optional llm_api_key",
|
|
147
|
+
"optional llm_base_url"
|
|
148
|
+
]
|
|
101
149
|
}
|
|
102
150
|
}
|
|
103
151
|
}
|
|
@@ -108,8 +156,14 @@ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
|
|
|
108
156
|
{
|
|
109
157
|
"mcpServers": {
|
|
110
158
|
"schema-search": {
|
|
111
|
-
|
|
112
|
-
"
|
|
159
|
+
// conda: /Users/<username>/opt/miniconda3/envs/<your env>/bin/schema-search",
|
|
160
|
+
"command": "path/to/schema-search",
|
|
161
|
+
"args": [
|
|
162
|
+
"postgresql://user:pass@localhost/db",
|
|
163
|
+
"optional/path/to/config.yml",
|
|
164
|
+
"optional llm_api_key",
|
|
165
|
+
"optional llm_base_url"
|
|
166
|
+
]
|
|
113
167
|
}
|
|
114
168
|
}
|
|
115
169
|
}
|
|
@@ -121,7 +175,7 @@ The LLM API key and base url are only required if you use LLM-generated schema s
|
|
|
121
175
|
### CLI Usage
|
|
122
176
|
|
|
123
177
|
```bash
|
|
124
|
-
schema-search "postgresql://user:pass@localhost/db"
|
|
178
|
+
schema-search "postgresql://user:pass@localhost/db" "optional/path/to/config.yml"
|
|
125
179
|
```
|
|
126
180
|
|
|
127
181
|
Optional args: `[config_path] [llm_api_key] [llm_base_url]`
|
|
@@ -135,7 +189,12 @@ from sqlalchemy import create_engine
|
|
|
135
189
|
from schema_search import SchemaSearch
|
|
136
190
|
|
|
137
191
|
engine = create_engine("postgresql://user:pass@localhost/db")
|
|
138
|
-
search = SchemaSearch(
|
|
192
|
+
search = SchemaSearch(
|
|
193
|
+
engine=engine,
|
|
194
|
+
config_path="optional/path/to/config.yml", # default: config.yml
|
|
195
|
+
llm_api_key="optional llm api key",
|
|
196
|
+
llm_base_url="optional llm base url"
|
|
197
|
+
)
|
|
139
198
|
|
|
140
199
|
search.index(force=False) # default is False
|
|
141
200
|
results = search.search("where are user refunds stored?")
|
|
@@ -152,47 +211,6 @@ results = search.search("user_table", hops=0, limit=5, search_type="semantic")
|
|
|
152
211
|
|
|
153
212
|
`SchemaSearch.index()` automatically detects schema changes and refreshes cached metadata, so you rarely need to force a reindex manually.
|
|
154
213
|
|
|
155
|
-
## Configuration
|
|
156
|
-
|
|
157
|
-
Edit `[config.yml](config.yml)`:
|
|
158
|
-
|
|
159
|
-
```yaml
|
|
160
|
-
logging:
|
|
161
|
-
level: "WARNING"
|
|
162
|
-
|
|
163
|
-
embedding:
|
|
164
|
-
location: "memory" # Options: "memory", "vectordb" (coming soon)
|
|
165
|
-
model: "multi-qa-MiniLM-L6-cos-v1"
|
|
166
|
-
metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
|
|
167
|
-
batch_size: 32
|
|
168
|
-
show_progress: false
|
|
169
|
-
cache_dir: "/tmp/.schema_search_cache"
|
|
170
|
-
|
|
171
|
-
chunking:
|
|
172
|
-
strategy: "raw" # Options: "raw", "llm"
|
|
173
|
-
max_tokens: 256
|
|
174
|
-
overlap_tokens: 50
|
|
175
|
-
model: "gpt-4o-mini"
|
|
176
|
-
|
|
177
|
-
search:
|
|
178
|
-
# Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
|
|
179
|
-
strategy: "hybrid"
|
|
180
|
-
initial_top_k: 20
|
|
181
|
-
rerank_top_k: 5
|
|
182
|
-
semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
|
|
183
|
-
hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
|
|
184
|
-
|
|
185
|
-
reranker:
|
|
186
|
-
# CrossEncoder model for reranking. Set to null to disable reranking
|
|
187
|
-
model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
|
|
188
|
-
|
|
189
|
-
schema:
|
|
190
|
-
include_columns: true
|
|
191
|
-
include_indices: true
|
|
192
|
-
include_foreign_keys: true
|
|
193
|
-
include_constraints: true
|
|
194
|
-
```
|
|
195
|
-
|
|
196
214
|
## Search Strategies
|
|
197
215
|
|
|
198
216
|
Schema Search supports four search strategies:
|
|
@@ -200,7 +218,7 @@ Schema Search supports four search strategies:
|
|
|
200
218
|
- **semantic**: Embedding-based similarity search using sentence transformers
|
|
201
219
|
- **bm25**: Lexical search using BM25 ranking algorithm
|
|
202
220
|
- **fuzzy**: String matching on table/column names using fuzzy matching
|
|
203
|
-
- **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33%
|
|
221
|
+
- **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33% bm25)
|
|
204
222
|
|
|
205
223
|
Each strategy performs its own initial ranking, then optionally applies CrossEncoder reranking if `reranker.model` is configured. Set `reranker.model` to `null` to disable reranking.
|
|
206
224
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
schema_search/__init__.py,sha256=06680k1q7pUf1m-1MNhKJGgHyT2NYiyJTLUIOP74dJY,486
|
|
2
2
|
schema_search/graph_builder.py,sha256=oKiVdVI_EB_ZmnxNiIV7Dt-jyKjV8B1RlbiSWpOSe30,2140
|
|
3
|
-
schema_search/mcp_server.py,sha256=
|
|
3
|
+
schema_search/mcp_server.py,sha256=uFTGONeQ8Zib9r2zw-YO_uzZgVdIVh-_o8deMmNA2i0,2241
|
|
4
4
|
schema_search/metrics.py,sha256=veyPo23aysiU_1MCwTVbBcVNreZFr_RGJwMCKBq1RAs,913
|
|
5
5
|
schema_search/schema_extractor.py,sha256=tpFF5FNPT694qZNoPZoRBjMSZySDt0CxUU0Ljtno6Z8,4280
|
|
6
6
|
schema_search/schema_search.py,sha256=60Xk8R3K--Sjsg4TUOiciEKqVe-lNns7K7O1iFsoxq0,8845
|
|
@@ -26,13 +26,13 @@ schema_search/search/factory.py,sha256=wgcx-xnZ8c7uSvu6oP3Fpoabd2Gl8FyJxn7zu3zZY
|
|
|
26
26
|
schema_search/search/fuzzy.py,sha256=Urn2GtJ5h6j0R3HsRkrMfQCLSTU8jtGaHdfYXL_Nb3A,1865
|
|
27
27
|
schema_search/search/hybrid.py,sha256=T1O46SLCPgpCOnTw2bznnCWmqP9EUkUBLqu5AeQu7oQ,2864
|
|
28
28
|
schema_search/search/semantic.py,sha256=brw7x2hZMCep6QK7WWMT451RnpVcSMuNIZtp51kC6Bo,1673
|
|
29
|
-
schema_search-0.1.6.dist-info/licenses/LICENSE,sha256=jOHFAJEjJCD7iBjS2dBe73X5IGDJdAWGosGOUxfCHTM,1067
|
|
29
|
+
schema_search-0.1.8.dist-info/licenses/LICENSE,sha256=jOHFAJEjJCD7iBjS2dBe73X5IGDJdAWGosGOUxfCHTM,1067
|
|
30
30
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
31
|
tests/test_integration.py,sha256=8Iiq9NAwAxMoZcnfR19oOcBEGTyIOmt6nSafG6LWpj0,11959
|
|
32
32
|
tests/test_llm_sql_generation.py,sha256=bj6iwTqXfNEvlrSXnbPxbrgEM2nscbrmYHbT-rNBJZ4,11834
|
|
33
33
|
tests/test_spider_eval.py,sha256=xQwrNXpipaDxk-vIKqSy0nOIl-3Nadtof58nZpsAsZA,15333
|
|
34
|
-
schema_search-0.1.
|
|
35
|
-
schema_search-0.1.
|
|
36
|
-
schema_search-0.1.
|
|
37
|
-
schema_search-0.1.
|
|
38
|
-
schema_search-0.1.6.dist-info/RECORD,,
|
|
34
|
+
schema_search-0.1.8.dist-info/METADATA,sha256=RoR680IsT6V-LHMffqYrKrHgbdnYag37D3RAqPS7mio,9674
|
|
35
|
+
schema_search-0.1.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
36
|
+
schema_search-0.1.8.dist-info/entry_points.txt,sha256=9FAtZWOuIlmRNBPX_v7bn8x_aUcfojAKWU6ruSo48GM,64
|
|
37
|
+
schema_search-0.1.8.dist-info/top_level.txt,sha256=NZTdQFHoJMezNIhtZICGPOuXlCXQkQduQV925Oqf4sk,20
|
|
38
|
+
schema_search-0.1.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|