schema-search 0.1.6__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of schema-search has been flagged as potentially problematic; see the registry's release details for more information.

Files changed (45)
  1. {schema_search-0.1.6/schema_search.egg-info → schema_search-0.1.7}/PKG-INFO +60 -47
  2. {schema_search-0.1.6 → schema_search-0.1.7}/README.md +59 -46
  3. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/mcp_server.py +4 -5
  4. {schema_search-0.1.6 → schema_search-0.1.7/schema_search.egg-info}/PKG-INFO +60 -47
  5. {schema_search-0.1.6 → schema_search-0.1.7}/setup.py +1 -1
  6. {schema_search-0.1.6 → schema_search-0.1.7}/LICENSE +0 -0
  7. {schema_search-0.1.6 → schema_search-0.1.7}/MANIFEST.in +0 -0
  8. {schema_search-0.1.6 → schema_search-0.1.7}/config.yml +0 -0
  9. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/__init__.py +0 -0
  10. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/chunkers/__init__.py +0 -0
  11. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/chunkers/base.py +0 -0
  12. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/chunkers/factory.py +0 -0
  13. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/chunkers/llm.py +0 -0
  14. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/chunkers/markdown.py +0 -0
  15. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/embedding_cache/__init__.py +0 -0
  16. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/embedding_cache/base.py +0 -0
  17. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/embedding_cache/bm25.py +0 -0
  18. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/embedding_cache/factory.py +0 -0
  19. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/embedding_cache/inmemory.py +0 -0
  20. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/graph_builder.py +0 -0
  21. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/metrics.py +0 -0
  22. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/rankers/__init__.py +0 -0
  23. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/rankers/base.py +0 -0
  24. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/rankers/cross_encoder.py +0 -0
  25. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/rankers/factory.py +0 -0
  26. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/schema_extractor.py +0 -0
  27. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/schema_search.py +0 -0
  28. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/search/__init__.py +0 -0
  29. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/search/base.py +0 -0
  30. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/search/bm25.py +0 -0
  31. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/search/factory.py +0 -0
  32. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/search/fuzzy.py +0 -0
  33. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/search/hybrid.py +0 -0
  34. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/search/semantic.py +0 -0
  35. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search/types.py +0 -0
  36. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search.egg-info/SOURCES.txt +0 -0
  37. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search.egg-info/dependency_links.txt +0 -0
  38. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search.egg-info/entry_points.txt +0 -0
  39. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search.egg-info/requires.txt +0 -0
  40. {schema_search-0.1.6 → schema_search-0.1.7}/schema_search.egg-info/top_level.txt +0 -0
  41. {schema_search-0.1.6 → schema_search-0.1.7}/setup.cfg +0 -0
  42. {schema_search-0.1.6 → schema_search-0.1.7}/tests/__init__.py +0 -0
  43. {schema_search-0.1.6 → schema_search-0.1.7}/tests/test_integration.py +0 -0
  44. {schema_search-0.1.6 → schema_search-0.1.7}/tests/test_llm_sql_generation.py +0 -0
  45. {schema_search-0.1.6 → schema_search-0.1.7}/tests/test_spider_eval.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: schema-search
3
- Version: 0.1.6
3
+ Version: 0.1.7
4
4
  Summary: Natural language database schema search with graph-aware semantic retrieval
5
5
  Home-page: https://adibhasan.com/blog/schema-search/
6
6
  Author: Adib Hasan
@@ -83,6 +83,48 @@ uv pip install "schema-search[snowflake,mcp]" # Snowflake
83
83
  uv pip install "schema-search[bigquery,mcp]" # BigQuery
84
84
  ```
85
85
 
86
+ ## Configuration
87
+
88
+ Edit [`config.yml`](https://github.com/Neehan/schema-search/blob/main/config.yml):
89
+
90
+ ```yaml
91
+ logging:
92
+ level: "WARNING"
93
+
94
+ embedding:
95
+ location: "memory" # Options: "memory", "vectordb" (coming soon)
96
+ model: "multi-qa-MiniLM-L6-cos-v1"
97
+ metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
98
+ batch_size: 32
99
+ show_progress: false
100
+ cache_dir: "/tmp/.schema_search_cache"
101
+
102
+ chunking:
103
+ strategy: "raw" # Options: "raw", "llm"
104
+ max_tokens: 256
105
+ overlap_tokens: 50
106
+ model: "gpt-4o-mini"
107
+
108
+ search:
109
+ # Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
110
+ strategy: "hybrid"
111
+ initial_top_k: 20
112
+ rerank_top_k: 5
113
+ semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
114
+ hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
115
+
116
+ reranker:
117
+ # CrossEncoder model for reranking. Set to null to disable reranking
118
+ model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
119
+
120
+ schema:
121
+ include_columns: true
122
+ include_indices: true
123
+ include_foreign_keys: true
124
+ include_constraints: true
125
+ ```
126
+
127
+
86
128
  ## MCP Server
87
129
 
88
130
  Integrate with Claude Desktop or any MCP client.
@@ -97,7 +139,13 @@ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
97
139
  "mcpServers": {
98
140
  "schema-search": {
99
141
  "command": "uvx",
100
- "args": ["schema-search[postgres,mcp]", "postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
142
+ "args": [
143
+ "schema-search[postgres,mcp]",
144
+ "postgresql://user:pass@localhost/db",
145
+ "optional/path/to/config.yml",
146
+ "optional llm_api_key",
147
+ "optional llm_base_url"
148
+ ]
101
149
  }
102
150
  }
103
151
  }
@@ -108,8 +156,14 @@ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
108
156
  {
109
157
  "mcpServers": {
110
158
  "schema-search": {
111
- "command": "path/to/schema-search", // conda: /Users/<username>/opt/miniconda3/envs/<your env>/bin/schema-search",
112
- "args": ["postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
159
+ // conda: /Users/<username>/opt/miniconda3/envs/<your env>/bin/schema-search",
160
+ "command": "path/to/schema-search",
161
+ "args": [
162
+ "postgresql://user:pass@localhost/db",
163
+ "optional/path/to/config.yml",
164
+ "optional llm_api_key",
165
+ "optional llm_base_url"
166
+ ]
113
167
  }
114
168
  }
115
169
  }
@@ -121,7 +175,7 @@ The LLM API key and base url are only required if you use LLM-generated schema s
121
175
  ### CLI Usage
122
176
 
123
177
  ```bash
124
- schema-search "postgresql://user:pass@localhost/db"
178
+ schema-search "postgresql://user:pass@localhost/db" "optional/path/to/config.yml"
125
179
  ```
126
180
 
127
181
  Optional args: `[config_path] [llm_api_key] [llm_base_url]`
@@ -152,47 +206,6 @@ results = search.search("user_table", hops=0, limit=5, search_type="semantic")
152
206
 
153
207
  `SchemaSearch.index()` automatically detects schema changes and refreshes cached metadata, so you rarely need to force a reindex manually.
154
208
 
155
- ## Configuration
156
-
157
- Edit `[config.yml](config.yml)`:
158
-
159
- ```yaml
160
- logging:
161
- level: "WARNING"
162
-
163
- embedding:
164
- location: "memory" # Options: "memory", "vectordb" (coming soon)
165
- model: "multi-qa-MiniLM-L6-cos-v1"
166
- metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
167
- batch_size: 32
168
- show_progress: false
169
- cache_dir: "/tmp/.schema_search_cache"
170
-
171
- chunking:
172
- strategy: "raw" # Options: "raw", "llm"
173
- max_tokens: 256
174
- overlap_tokens: 50
175
- model: "gpt-4o-mini"
176
-
177
- search:
178
- # Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
179
- strategy: "hybrid"
180
- initial_top_k: 20
181
- rerank_top_k: 5
182
- semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
183
- hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
184
-
185
- reranker:
186
- # CrossEncoder model for reranking. Set to null to disable reranking
187
- model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
188
-
189
- schema:
190
- include_columns: true
191
- include_indices: true
192
- include_foreign_keys: true
193
- include_constraints: true
194
- ```
195
-
196
209
  ## Search Strategies
197
210
 
198
211
  Schema Search supports four search strategies:
@@ -200,7 +213,7 @@ Schema Search supports four search strategies:
200
213
  - **semantic**: Embedding-based similarity search using sentence transformers
201
214
  - **bm25**: Lexical search using BM25 ranking algorithm
202
215
  - **fuzzy**: String matching on table/column names using fuzzy matching
203
- - **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33% fuzzy)
216
+ - **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33% bm25)
204
217
 
205
218
  Each strategy performs its own initial ranking, then optionally applies CrossEncoder reranking if `reranker.model` is configured. Set `reranker.model` to `null` to disable reranking.
206
219
 
@@ -32,6 +32,48 @@ uv pip install "schema-search[snowflake,mcp]" # Snowflake
32
32
  uv pip install "schema-search[bigquery,mcp]" # BigQuery
33
33
  ```
34
34
 
35
+ ## Configuration
36
+
37
+ Edit [`config.yml`](https://github.com/Neehan/schema-search/blob/main/config.yml):
38
+
39
+ ```yaml
40
+ logging:
41
+ level: "WARNING"
42
+
43
+ embedding:
44
+ location: "memory" # Options: "memory", "vectordb" (coming soon)
45
+ model: "multi-qa-MiniLM-L6-cos-v1"
46
+ metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
47
+ batch_size: 32
48
+ show_progress: false
49
+ cache_dir: "/tmp/.schema_search_cache"
50
+
51
+ chunking:
52
+ strategy: "raw" # Options: "raw", "llm"
53
+ max_tokens: 256
54
+ overlap_tokens: 50
55
+ model: "gpt-4o-mini"
56
+
57
+ search:
58
+ # Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
59
+ strategy: "hybrid"
60
+ initial_top_k: 20
61
+ rerank_top_k: 5
62
+ semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
63
+ hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
64
+
65
+ reranker:
66
+ # CrossEncoder model for reranking. Set to null to disable reranking
67
+ model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
68
+
69
+ schema:
70
+ include_columns: true
71
+ include_indices: true
72
+ include_foreign_keys: true
73
+ include_constraints: true
74
+ ```
75
+
76
+
35
77
  ## MCP Server
36
78
 
37
79
  Integrate with Claude Desktop or any MCP client.
@@ -46,7 +88,13 @@ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
46
88
  "mcpServers": {
47
89
  "schema-search": {
48
90
  "command": "uvx",
49
- "args": ["schema-search[postgres,mcp]", "postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
91
+ "args": [
92
+ "schema-search[postgres,mcp]",
93
+ "postgresql://user:pass@localhost/db",
94
+ "optional/path/to/config.yml",
95
+ "optional llm_api_key",
96
+ "optional llm_base_url"
97
+ ]
50
98
  }
51
99
  }
52
100
  }
@@ -57,8 +105,14 @@ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
57
105
  {
58
106
  "mcpServers": {
59
107
  "schema-search": {
60
- "command": "path/to/schema-search", // conda: /Users/<username>/opt/miniconda3/envs/<your env>/bin/schema-search",
61
- "args": ["postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
108
+ // conda: /Users/<username>/opt/miniconda3/envs/<your env>/bin/schema-search",
109
+ "command": "path/to/schema-search",
110
+ "args": [
111
+ "postgresql://user:pass@localhost/db",
112
+ "optional/path/to/config.yml",
113
+ "optional llm_api_key",
114
+ "optional llm_base_url"
115
+ ]
62
116
  }
63
117
  }
64
118
  }
@@ -70,7 +124,7 @@ The LLM API key and base url are only required if you use LLM-generated schema s
70
124
  ### CLI Usage
71
125
 
72
126
  ```bash
73
- schema-search "postgresql://user:pass@localhost/db"
127
+ schema-search "postgresql://user:pass@localhost/db" "optional/path/to/config.yml"
74
128
  ```
75
129
 
76
130
  Optional args: `[config_path] [llm_api_key] [llm_base_url]`
@@ -101,47 +155,6 @@ results = search.search("user_table", hops=0, limit=5, search_type="semantic")
101
155
 
102
156
  `SchemaSearch.index()` automatically detects schema changes and refreshes cached metadata, so you rarely need to force a reindex manually.
103
157
 
104
- ## Configuration
105
-
106
- Edit `[config.yml](config.yml)`:
107
-
108
- ```yaml
109
- logging:
110
- level: "WARNING"
111
-
112
- embedding:
113
- location: "memory" # Options: "memory", "vectordb" (coming soon)
114
- model: "multi-qa-MiniLM-L6-cos-v1"
115
- metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
116
- batch_size: 32
117
- show_progress: false
118
- cache_dir: "/tmp/.schema_search_cache"
119
-
120
- chunking:
121
- strategy: "raw" # Options: "raw", "llm"
122
- max_tokens: 256
123
- overlap_tokens: 50
124
- model: "gpt-4o-mini"
125
-
126
- search:
127
- # Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
128
- strategy: "hybrid"
129
- initial_top_k: 20
130
- rerank_top_k: 5
131
- semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
132
- hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
133
-
134
- reranker:
135
- # CrossEncoder model for reranking. Set to null to disable reranking
136
- model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
137
-
138
- schema:
139
- include_columns: true
140
- include_indices: true
141
- include_foreign_keys: true
142
- include_constraints: true
143
- ```
144
-
145
158
  ## Search Strategies
146
159
 
147
160
  Schema Search supports four search strategies:
@@ -149,7 +162,7 @@ Schema Search supports four search strategies:
149
162
  - **semantic**: Embedding-based similarity search using sentence transformers
150
163
  - **bm25**: Lexical search using BM25 ranking algorithm
151
164
  - **fuzzy**: String matching on table/column names using fuzzy matching
152
- - **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33% fuzzy)
165
+ - **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33% bm25)
153
166
 
154
167
  Each strategy performs its own initial ranking, then optionally applies CrossEncoder reranking if `reranker.model` is configured. Set `reranker.model` to `null` to disable reranking.
155
168
 
@@ -16,7 +16,6 @@ mcp = FastMCP("schema-search")
16
16
  @mcp.tool()
17
17
  def schema_search(
18
18
  query: str,
19
- hops: Optional[int] = None,
20
19
  limit: int = 5,
21
20
  ) -> dict:
22
21
  """Search database schema using natural language.
@@ -25,14 +24,14 @@ def schema_search(
25
24
  using semantic similarity. Expands results by traversing foreign key relationships.
26
25
 
27
26
  Args:
28
- query: Natural language question about database schema (e.g., 'where are user refunds stored?', 'tables related to payments')
29
- hops: Number of foreign key relationship hops for graph expansion. Use 0 for exact matches only, 1-2 to include related tables. If not specified, uses value from config.yml (default: 1)
30
- limit: Maximum number of table schemas to return in results. Default: 5
27
+ query: Natural language question about database schema (e.g., 'tables related to payments')
28
+ limit: Maximum number of table schemas to return in results. Default: 5; Max: 10.
31
29
 
32
30
  Returns:
33
31
  Dictionary with 'results' (list of table schemas with columns, types, constraints, and relationships) and 'latency_sec' (query execution time)
34
32
  """
35
- search_result = mcp.search_engine.search(query, hops=hops, limit=limit) # type: ignore
33
+ limit = min(limit, 10)
34
+ search_result = mcp.search_engine.search(query, limit=limit) # type: ignore
36
35
  return {
37
36
  "results": search_result["results"],
38
37
  "latency_sec": search_result["latency_sec"],
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: schema-search
3
- Version: 0.1.6
3
+ Version: 0.1.7
4
4
  Summary: Natural language database schema search with graph-aware semantic retrieval
5
5
  Home-page: https://adibhasan.com/blog/schema-search/
6
6
  Author: Adib Hasan
@@ -83,6 +83,48 @@ uv pip install "schema-search[snowflake,mcp]" # Snowflake
83
83
  uv pip install "schema-search[bigquery,mcp]" # BigQuery
84
84
  ```
85
85
 
86
+ ## Configuration
87
+
88
+ Edit [`config.yml`](https://github.com/Neehan/schema-search/blob/main/config.yml):
89
+
90
+ ```yaml
91
+ logging:
92
+ level: "WARNING"
93
+
94
+ embedding:
95
+ location: "memory" # Options: "memory", "vectordb" (coming soon)
96
+ model: "multi-qa-MiniLM-L6-cos-v1"
97
+ metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
98
+ batch_size: 32
99
+ show_progress: false
100
+ cache_dir: "/tmp/.schema_search_cache"
101
+
102
+ chunking:
103
+ strategy: "raw" # Options: "raw", "llm"
104
+ max_tokens: 256
105
+ overlap_tokens: 50
106
+ model: "gpt-4o-mini"
107
+
108
+ search:
109
+ # Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
110
+ strategy: "hybrid"
111
+ initial_top_k: 20
112
+ rerank_top_k: 5
113
+ semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
114
+ hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
115
+
116
+ reranker:
117
+ # CrossEncoder model for reranking. Set to null to disable reranking
118
+ model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
119
+
120
+ schema:
121
+ include_columns: true
122
+ include_indices: true
123
+ include_foreign_keys: true
124
+ include_constraints: true
125
+ ```
126
+
127
+
86
128
  ## MCP Server
87
129
 
88
130
  Integrate with Claude Desktop or any MCP client.
@@ -97,7 +139,13 @@ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
97
139
  "mcpServers": {
98
140
  "schema-search": {
99
141
  "command": "uvx",
100
- "args": ["schema-search[postgres,mcp]", "postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
142
+ "args": [
143
+ "schema-search[postgres,mcp]",
144
+ "postgresql://user:pass@localhost/db",
145
+ "optional/path/to/config.yml",
146
+ "optional llm_api_key",
147
+ "optional llm_base_url"
148
+ ]
101
149
  }
102
150
  }
103
151
  }
@@ -108,8 +156,14 @@ Add to your MCP config (e.g., `~/.cursor/mcp.json` or Claude Desktop config):
108
156
  {
109
157
  "mcpServers": {
110
158
  "schema-search": {
111
- "command": "path/to/schema-search", // conda: /Users/<username>/opt/miniconda3/envs/<your env>/bin/schema-search",
112
- "args": ["postgresql://user:pass@localhost/db", "optional config.yml path", "optional llm_api_key", "optional llm_base_url"]
159
+ // conda: /Users/<username>/opt/miniconda3/envs/<your env>/bin/schema-search",
160
+ "command": "path/to/schema-search",
161
+ "args": [
162
+ "postgresql://user:pass@localhost/db",
163
+ "optional/path/to/config.yml",
164
+ "optional llm_api_key",
165
+ "optional llm_base_url"
166
+ ]
113
167
  }
114
168
  }
115
169
  }
@@ -121,7 +175,7 @@ The LLM API key and base url are only required if you use LLM-generated schema s
121
175
  ### CLI Usage
122
176
 
123
177
  ```bash
124
- schema-search "postgresql://user:pass@localhost/db"
178
+ schema-search "postgresql://user:pass@localhost/db" "optional/path/to/config.yml"
125
179
  ```
126
180
 
127
181
  Optional args: `[config_path] [llm_api_key] [llm_base_url]`
@@ -152,47 +206,6 @@ results = search.search("user_table", hops=0, limit=5, search_type="semantic")
152
206
 
153
207
  `SchemaSearch.index()` automatically detects schema changes and refreshes cached metadata, so you rarely need to force a reindex manually.
154
208
 
155
- ## Configuration
156
-
157
- Edit `[config.yml](config.yml)`:
158
-
159
- ```yaml
160
- logging:
161
- level: "WARNING"
162
-
163
- embedding:
164
- location: "memory" # Options: "memory", "vectordb" (coming soon)
165
- model: "multi-qa-MiniLM-L6-cos-v1"
166
- metric: "cosine" # Options: "cosine", "euclidean", "manhattan", "dot"
167
- batch_size: 32
168
- show_progress: false
169
- cache_dir: "/tmp/.schema_search_cache"
170
-
171
- chunking:
172
- strategy: "raw" # Options: "raw", "llm"
173
- max_tokens: 256
174
- overlap_tokens: 50
175
- model: "gpt-4o-mini"
176
-
177
- search:
178
- # Search strategy: "semantic" (embeddings), "bm25" (BM25 lexical), "fuzzy" (fuzzy string matching), "hybrid" (semantic + bm25)
179
- strategy: "hybrid"
180
- initial_top_k: 20
181
- rerank_top_k: 5
182
- semantic_weight: 0.67 # For hybrid search (bm25_weight = 1 - semantic_weight)
183
- hops: 1 # Number of foreign key hops for graph expansion (0-2 recommended)
184
-
185
- reranker:
186
- # CrossEncoder model for reranking. Set to null to disable reranking
187
- model: null # "Alibaba-NLP/gte-reranker-modernbert-base"
188
-
189
- schema:
190
- include_columns: true
191
- include_indices: true
192
- include_foreign_keys: true
193
- include_constraints: true
194
- ```
195
-
196
209
  ## Search Strategies
197
210
 
198
211
  Schema Search supports four search strategies:
@@ -200,7 +213,7 @@ Schema Search supports four search strategies:
200
213
  - **semantic**: Embedding-based similarity search using sentence transformers
201
214
  - **bm25**: Lexical search using BM25 ranking algorithm
202
215
  - **fuzzy**: String matching on table/column names using fuzzy matching
203
- - **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33% fuzzy)
216
+ - **hybrid**: Combines semantic and bm25 scores (default: 67% semantic, 33% bm25)
204
217
 
205
218
  Each strategy performs its own initial ranking, then optionally applies CrossEncoder reranking if `reranker.model` is configured. Set `reranker.model` to `null` to disable reranking.
206
219
 
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="schema-search",
5
- version="0.1.6",
5
+ version="0.1.7",
6
6
  description="Natural language database schema search with graph-aware semantic retrieval",
7
7
  author="Adib Hasan",
8
8
  long_description=open("README.md").read(),
File without changes
File without changes
File without changes
File without changes