schema-search 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of schema-search might be problematic. Click here for more details.

@@ -16,7 +16,6 @@ mcp = FastMCP("schema-search")
16
16
  @mcp.tool()
17
17
  def schema_search(
18
18
  query: str,
19
- hops: Optional[int] = None,
20
19
  limit: int = 5,
21
20
  ) -> dict:
22
21
  """Search database schema using natural language.
@@ -25,14 +24,14 @@ def schema_search(
25
24
  using semantic similarity. Expands results by traversing foreign key relationships.
26
25
 
27
26
  Args:
28
- query: Natural language question about database schema (e.g., 'where are user refunds stored?', 'tables related to payments')
29
- hops: Number of foreign key relationship hops for graph expansion. Use 0 for exact matches only, 1-2 to include related tables. If not specified, uses value from config.yml (default: 1)
30
- limit: Maximum number of table schemas to return in results. Default: 5
27
+ query: Natural language question about database schema (e.g., 'tables related to payments')
28
+ limit: Maximum number of table schemas to return in results. Default: 5; Max: 10.
31
29
 
32
30
  Returns:
33
31
  Dictionary with 'results' (list of table schemas with columns, types, constraints, and relationships) and 'latency_sec' (query execution time)
34
32
  """
35
- search_result = mcp.search_engine.search(query, hops=hops, limit=limit) # type: ignore
33
+ limit = min(limit, 10)
34
+ search_result = mcp.search_engine.search(query, limit=limit) # type: ignore
36
35
  return {
37
36
  "results": search_result["results"],
38
37
  "latency_sec": search_result["latency_sec"],
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: schema-search
3
- Version: 0.1.5
4
- Summary: Natural language search for database schemas with graph-aware semantic retrieval
5
- Home-page: https://github.com/neehan/schema-search
6
- Author:
3
+ Version: 0.1.7
4
+ Summary: Natural language database schema search with graph-aware semantic retrieval
5
+ Home-page: https://adibhasan.com/blog/schema-search/
6
+ Author: Adib Hasan
7
7
  Classifier: Development Status :: 3 - Alpha
8
8
  Classifier: Intended Audience :: Developers
9
9
  Classifier: License :: OSI Approved :: MIT License
@@ -38,6 +38,7 @@ Requires-Dist: snowflake-sqlalchemy>=1.4.0; extra == "snowflake"
38
38
  Requires-Dist: snowflake-connector-python>=3.0.0; extra == "snowflake"
39
39
  Provides-Extra: bigquery
40
40
  Requires-Dist: sqlalchemy-bigquery>=1.6.0; extra == "bigquery"
41
+ Dynamic: author
41
42
  Dynamic: classifier
42
43
  Dynamic: description
43
44
  Dynamic: description-content-type
@@ -82,6 +83,48 @@ uv pip install "schema-search[snowflake,mcp]" # Snowflake
82
83
  uv pip install "schema-search[bigquery,mcp]" # BigQuery
83
84
  ```
84
85
 
86
+ ## Configuration
87
+
88
+ Edit [`config.yml`](https://github.com/Neehan/schema-search/blob/main/config.yml):
89
+
90
+ ```yaml
91
+ logging:
92
+ level: "WARNING"
93
+
94
+ embedding:
95
+ location: "memory" # Options: "memory", "vectordb" (coming soon)
96
+ model: "multi-qa-MiniLM-L6-cos-v1"
97
+ metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
98
+ batch_size: 32
99
+ show_progress: false
100
+ cache_dir: "/tmp/.schema_search_cache"
101
+
102
+ chunking:
103
+ strategy: "raw" # Options: "raw", "llm"
104
+ max_tokens: 256
105
+ overlap_tokens: 50
106
+ model: "gpt-4o-mini"
107
+
108
+ search:
109
+ # Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
110
+ strategy: "hybrid"
111
+ initial_top_k: 20
112
+ rerank_top_k: 5
113
+ semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
114
+ hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
115
+
116
+ reranker:
117
+ # CrossEncoder model for reranking. Set to null to disable reranking
118
+ model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
119
+
120
+ schema:
121
+ include_columns: true
122
+ include_indices: true
123
+ include_foreign_keys: true
124
+ include_constraints: true
125
+ ```
126
+
127
+
85
128
  ## MCP Server
86
129
 
87
130
  Integrate with Claude Desktop or any MCP client.
@@ -96,7 +139,13 @@ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
96
139
  "mcpServers": {
97
140
  "schema-search": {
98
141
  "command": "uvx",
99
- "args": ["schema-search[postgres,mcp]", "postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
142
+ "args": [
143
+ "schema-search[postgres,mcp]",
144
+ "postgresql://user:pass@localhost/db",
145
+ "optional/path/to/config.yml",
146
+ "optional llm_api_key",
147
+ "optional llm_base_url"
148
+ ]
100
149
  }
101
150
  }
102
151
  }
@@ -107,8 +156,14 @@ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
107
156
  {
108
157
  "mcpServers": {
109
158
  "schema-search": {
110
- "command": "path/to/schema-search", // conda: /Users/<username>/opt/miniconda3/envs/<your env>/bin/schema-search",
111
- "args": ["postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
159
+ // conda: /Users/<username>/opt/miniconda3/envs/<your env>/bin/schema-search
160
+ "command": "path/to/schema-search",
161
+ "args": [
162
+ "postgresql://user:pass@localhost/db",
163
+ "optional/path/to/config.yml",
164
+ "optional llm_api_key",
165
+ "optional llm_base_url"
166
+ ]
112
167
  }
113
168
  }
114
169
  }
@@ -120,7 +175,7 @@ The LLM API key and base url are only required if you use LLM-generated schema s
120
175
  ### CLI Usage
121
176
 
122
177
  ```bash
123
- schema-search "postgresql://user:pass@localhost/db"
178
+ schema-search "postgresql://user:pass@localhost/db" "optional/path/to/config.yml"
124
179
  ```
125
180
 
126
181
  Optional args: `[config_path] [llm_api_key] [llm_base_url]`
@@ -151,47 +206,6 @@ results = search.search("user_table", hops=0, limit=5, search_type="semantic")
151
206
 
152
207
  `SchemaSearch.index()` automatically detects schema changes and refreshes cached metadata, so you rarely need to force a reindex manually.
153
208
 
154
- ## Configuration
155
-
156
- Edit `[config.yml](config.yml)`:
157
-
158
- ```yaml
159
- logging:
160
- level: "WARNING"
161
-
162
- embedding:
163
- location: "memory" # Options: "memory", "vectordb" (coming soon)
164
- model: "multi-qa-MiniLM-L6-cos-v1"
165
- metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
166
- batch_size: 32
167
- show_progress: false
168
- cache_dir: "/tmp/.schema_search_cache"
169
-
170
- chunking:
171
- strategy: "raw" # Options: "raw", "llm"
172
- max_tokens: 256
173
- overlap_tokens: 50
174
- model: "gpt-4o-mini"
175
-
176
- search:
177
- # Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
178
- strategy: "hybrid"
179
- initial_top_k: 20
180
- rerank_top_k: 5
181
- semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
182
- hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
183
-
184
- reranker:
185
- # CrossEncoder model for reranking. Set to null to disable reranking
186
- model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
187
-
188
- schema:
189
- include_columns: true
190
- include_indices: true
191
- include_foreign_keys: true
192
- include_constraints: true
193
- ```
194
-
195
209
  ## Search Strategies
196
210
 
197
211
  Schema Search supports four search strategies:
@@ -199,7 +213,7 @@ Schema Search supports four search strategies:
199
213
  - **semantic**: Embedding-based similarity search using sentence transformers
200
214
  - **bm25**: Lexical search using BM25 ranking algorithm
201
215
  - **fuzzy**: String matching on table/column names using fuzzy matching
202
- - **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33% fuzzy)
216
+ - **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33% bm25)
203
217
 
204
218
  Each strategy performs its own initial ranking, then optionally applies CrossEncoder reranking if `reranker.model` is configured. Set `reranker.model` to `null` to disable reranking.
205
219
 
@@ -209,14 +223,14 @@ We [benchmarked](/tests/test_spider_eval.py) on the Spider dataset (1,234 train
209
223
  **Memory:** The embedding model requires ~90 MB and the optional reranker adds ~155 MB. Actual process memory depends on your Python runtime.
210
224
 
211
225
  ### Without Reranker (`reranker.model: null`)
212
- ![Without Reranker](img/spider_benchmark_without_reranker.png)
226
+ ![Without Reranker](https://raw.githubusercontent.com/Neehan/schema-search/refs/heads/main/img/spider_benchmark_without_reranker.png)
213
227
  - **Indexing:** 0.22s ± 0.08s per database (18 total).
214
228
  - **Accuracy:** Hybrid leads with Recall@1 62% / MRR 0.93; Semantic follows at Recall@1 58% / MRR 0.89.
215
229
  - **Latency:** BM25 and Fuzzy return in ~5ms; Semantic spends ~15ms; Hybrid (semantic + bm25) averages 52ms.
216
230
  - **Fuzzy baseline:** Recall@1 22%, highlighting the need for semantic signals on natural-language queries.
217
231
 
218
232
  ### With Reranker (`Alibaba-NLP/gte-reranker-modernbert-base`)
219
- ![With Reranker](img/spider_benchmark_with_reranker.png)
233
+ ![With Reranker](https://raw.githubusercontent.com/Neehan/schema-search/refs/heads/main/img/spider_benchmark_with_reranker.png)
220
234
  - **Indexing:** 0.25s ± 0.05s per database (same 18 DBs).
221
235
  - **Accuracy:** All strategies converge around Recall@1 62% and MRR ≈ 0.92; Fuzzy MRR jumps from 0.51 → 0.92.
222
236
  - **Latency trade-off:** Extra CrossEncoder pass lifts per-query latency to ~0.18–0.29s depending on strategy.
@@ -266,7 +280,7 @@ search = SchemaSearch(
266
280
  5. **Optional reranking** with CrossEncoder to refine results
267
281
  6. Return top tables with full schema and relationships
268
282
 
269
- Cache stored in `.schema_search_cache/` (configurable in `config.yml`)
283
+ Cache stored in `/tmp/.schema_search_cache/` (configurable in `config.yml`)
270
284
 
271
285
  ## License
272
286
 
@@ -1,6 +1,6 @@
1
1
  schema_search/__init__.py,sha256=06680k1q7pUf1m-1MNhKJGgHyT2NYiyJTLUIOP74dJY,486
2
2
  schema_search/graph_builder.py,sha256=oKiVdVI_EB_ZmnxNiIV7Dt-jyKjV8B1RlbiSWpOSe30,2140
3
- schema_search/mcp_server.py,sha256=pl5PWdqAbSkEuzoxEVkB_nNMt5A3AonlCV-FbwrdWZ0,2475
3
+ schema_search/mcp_server.py,sha256=uFTGONeQ8Zib9r2zw-YO_uzZgVdIVh-_o8deMmNA2i0,2241
4
4
  schema_search/metrics.py,sha256=veyPo23aysiU_1MCwTVbBcVNreZFr_RGJwMCKBq1RAs,913
5
5
  schema_search/schema_extractor.py,sha256=tpFF5FNPT694qZNoPZoRBjMSZySDt0CxUU0Ljtno6Z8,4280
6
6
  schema_search/schema_search.py,sha256=60Xk8R3K--Sjsg4TUOiciEKqVe-lNns7K7O1iFsoxq0,8845
@@ -26,13 +26,13 @@ schema_search/search/factory.py,sha256=wgcx-xnZ8c7uSvu6oP3Fpoabd2Gl8FyJxn7zu3zZY
26
26
  schema_search/search/fuzzy.py,sha256=Urn2GtJ5h6j0R3HsRkrMfQCLSTU8jtGaHdfYXL_Nb3A,1865
27
27
  schema_search/search/hybrid.py,sha256=T1O46SLCPgpCOnTw2bznnCWmqP9EUkUBLqu5AeQu7oQ,2864
28
28
  schema_search/search/semantic.py,sha256=brw7x2hZMCep6QK7WWMT451RnpVcSMuNIZtp51kC6Bo,1673
29
- schema_search-0.1.5.dist-info/licenses/LICENSE,sha256=jOHFAJEjJCD7iBjS2dBe73X5IGDJdAWGosGOUxfCHTM,1067
29
+ schema_search-0.1.7.dist-info/licenses/LICENSE,sha256=jOHFAJEjJCD7iBjS2dBe73X5IGDJdAWGosGOUxfCHTM,1067
30
30
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
31
  tests/test_integration.py,sha256=8Iiq9NAwAxMoZcnfR19oOcBEGTyIOmt6nSafG6LWpj0,11959
32
32
  tests/test_llm_sql_generation.py,sha256=bj6iwTqXfNEvlrSXnbPxbrgEM2nscbrmYHbT-rNBJZ4,11834
33
33
  tests/test_spider_eval.py,sha256=xQwrNXpipaDxk-vIKqSy0nOIl-3Nadtof58nZpsAsZA,15333
34
- schema_search-0.1.5.dist-info/METADATA,sha256=oSfANTlqkUd-yOFntVULaP4y9hHjfqXxO8wiPoZVW4Q,9157
35
- schema_search-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
36
- schema_search-0.1.5.dist-info/entry_points.txt,sha256=9FAtZWOuIlmRNBPX_v7bn8x_aUcfojAKWU6ruSo48GM,64
37
- schema_search-0.1.5.dist-info/top_level.txt,sha256=NZTdQFHoJMezNIhtZICGPOuXlCXQkQduQV925Oqf4sk,20
38
- schema_search-0.1.5.dist-info/RECORD,,
34
+ schema_search-0.1.7.dist-info/METADATA,sha256=rME6mBAfrGDD7hU4ZnSUo3ceKBMSSh22CvIHOD7djyM,9514
35
+ schema_search-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
36
+ schema_search-0.1.7.dist-info/entry_points.txt,sha256=9FAtZWOuIlmRNBPX_v7bn8x_aUcfojAKWU6ruSo48GM,64
37
+ schema_search-0.1.7.dist-info/top_level.txt,sha256=NZTdQFHoJMezNIhtZICGPOuXlCXQkQduQV925Oqf4sk,20
38
+ schema_search-0.1.7.dist-info/RECORD,,