syrch 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syrch-0.1.0/LICENSE +21 -0
- syrch-0.1.0/PKG-INFO +590 -0
- syrch-0.1.0/README.md +543 -0
- syrch-0.1.0/pyproject.toml +57 -0
- syrch-0.1.0/setup.cfg +4 -0
- syrch-0.1.0/src/syrch/__init__.py +19 -0
- syrch-0.1.0/src/syrch/api.py +118 -0
- syrch-0.1.0/src/syrch/cli/__init__.py +0 -0
- syrch-0.1.0/src/syrch/cli/app.py +360 -0
- syrch-0.1.0/src/syrch/core/__init__.py +0 -0
- syrch-0.1.0/src/syrch/core/config.py +163 -0
- syrch-0.1.0/src/syrch/core/logging.py +58 -0
- syrch-0.1.0/src/syrch/core/models.py +92 -0
- syrch-0.1.0/src/syrch/eval/__init__.py +0 -0
- syrch-0.1.0/src/syrch/eval/metrics.py +80 -0
- syrch-0.1.0/src/syrch/eval/report.py +135 -0
- syrch-0.1.0/src/syrch/eval/runner.py +135 -0
- syrch-0.1.0/src/syrch/executors/__init__.py +0 -0
- syrch-0.1.0/src/syrch/executors/base.py +25 -0
- syrch-0.1.0/src/syrch/executors/cached_executor.py +36 -0
- syrch-0.1.0/src/syrch/executors/databricks_executor.py +110 -0
- syrch-0.1.0/src/syrch/executors/jdbc_executor.py +51 -0
- syrch-0.1.0/src/syrch/executors/sqlite_executor.py +56 -0
- syrch-0.1.0/src/syrch/llm/__init__.py +0 -0
- syrch-0.1.0/src/syrch/llm/anthropic_llm.py +50 -0
- syrch-0.1.0/src/syrch/llm/base.py +21 -0
- syrch-0.1.0/src/syrch/llm/cache.py +101 -0
- syrch-0.1.0/src/syrch/llm/openai_llm.py +101 -0
- syrch-0.1.0/src/syrch/search/__init__.py +0 -0
- syrch-0.1.0/src/syrch/search/aggregator.py +189 -0
- syrch-0.1.0/src/syrch/search/calibrator.py +62 -0
- syrch-0.1.0/src/syrch/search/clarify.py +100 -0
- syrch-0.1.0/src/syrch/search/grid.py +239 -0
- syrch-0.1.0/src/syrch/search/pipeline.py +26 -0
- syrch-0.1.0/src/syrch/search/planner.py +219 -0
- syrch-0.1.0/src/syrch/search/rlm_engine.py +303 -0
- syrch-0.1.0/src/syrch/search/scheduler.py +134 -0
- syrch-0.1.0/src/syrch.egg-info/PKG-INFO +590 -0
- syrch-0.1.0/src/syrch.egg-info/SOURCES.txt +49 -0
- syrch-0.1.0/src/syrch.egg-info/dependency_links.txt +1 -0
- syrch-0.1.0/src/syrch.egg-info/entry_points.txt +2 -0
- syrch-0.1.0/src/syrch.egg-info/requires.txt +34 -0
- syrch-0.1.0/src/syrch.egg-info/top_level.txt +1 -0
- syrch-0.1.0/tests/test_cache.py +140 -0
- syrch-0.1.0/tests/test_clarify.py +219 -0
- syrch-0.1.0/tests/test_e2e.py +285 -0
- syrch-0.1.0/tests/test_eval.py +339 -0
- syrch-0.1.0/tests/test_integration.py +802 -0
- syrch-0.1.0/tests/test_planner.py +271 -0
- syrch-0.1.0/tests/test_rlm_engine.py +290 -0
- syrch-0.1.0/tests/test_scheduler.py +72 -0
syrch-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 emeraldgoose
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
syrch-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,590 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: syrch
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: NL -> ProblemSpec -> Search(D&C+RLM) -> SQL Evaluation -> Optimal Solution
|
|
5
|
+
Author-email: emeraldgoose <emeraldgoose@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Keywords: sql,nlp,search,llm,databricks
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.11
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: typer>=0.12
|
|
17
|
+
Requires-Dist: pydantic>=2.0
|
|
18
|
+
Requires-Dist: openai>=1.0
|
|
19
|
+
Requires-Dist: anthropic>=0.30
|
|
20
|
+
Requires-Dist: databricks-sql-connector>=4.0
|
|
21
|
+
Requires-Dist: rich>=13.0
|
|
22
|
+
Requires-Dist: pandas>=2.0
|
|
23
|
+
Requires-Dist: diskcache>=5.6
|
|
24
|
+
Requires-Dist: sqlglot>=25.0
|
|
25
|
+
Requires-Dist: pyyaml>=6.0
|
|
26
|
+
Provides-Extra: databricks
|
|
27
|
+
Requires-Dist: databricks-sql-connector>=4.0; extra == "databricks"
|
|
28
|
+
Requires-Dist: databricks-sdk>=0.20; extra == "databricks"
|
|
29
|
+
Requires-Dist: azure-identity>=1.15; extra == "databricks"
|
|
30
|
+
Provides-Extra: pyspark
|
|
31
|
+
Requires-Dist: pyspark>=3.5; extra == "pyspark"
|
|
32
|
+
Provides-Extra: all
|
|
33
|
+
Requires-Dist: databricks-sql-connector>=4.0; extra == "all"
|
|
34
|
+
Requires-Dist: databricks-sdk>=0.20; extra == "all"
|
|
35
|
+
Requires-Dist: azure-identity>=1.15; extra == "all"
|
|
36
|
+
Requires-Dist: pyspark>=3.5; extra == "all"
|
|
37
|
+
Requires-Dist: pytest>=8; extra == "all"
|
|
38
|
+
Requires-Dist: pytest-cov>=5; extra == "all"
|
|
39
|
+
Requires-Dist: mypy>=1.0; extra == "all"
|
|
40
|
+
Requires-Dist: ruff>=0.3; extra == "all"
|
|
41
|
+
Provides-Extra: dev
|
|
42
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
43
|
+
Requires-Dist: pytest-cov>=5; extra == "dev"
|
|
44
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
45
|
+
Requires-Dist: ruff>=0.3; extra == "dev"
|
|
46
|
+
Dynamic: license-file
|
|
47
|
+
|
|
48
|
+
# syrch — Symbolic Search Runtime
|
|
49
|
+
|
|
50
|
+
**English** | [[한국어]](README_ko.md)
|
|
51
|
+
|
|
52
|
+
NL Problem → ProblemSpec → Search(D&C+RLM) → SQL Executor → Optimal Solution
|
|
53
|
+
|
|
54
|
+
## Project Goal
|
|
55
|
+
|
|
56
|
+
`symbolic-search-runtime` (syrch) is a **search harness** that finds optimal solutions to natural language problems over structured data. Unlike a simple QA agent that answers in one shot, syrch **explores multiple reasoning paths** using Divide & Conquer decomposition and Recursive Language Models, executing candidate solutions against real databases to select the best result.
|
|
57
|
+
|
|
58
|
+
### Key Ideas
|
|
59
|
+
|
|
60
|
+
- **Divide & Conquer**: Decompose a problem into logically independent sub-problems (sub-tasks), solve each independently, then merge results. Sub-problems can depend on each other, forming a DAG.
|
|
61
|
+
- **RLM (Recursive Language Model)**: Each sub-task runs its own REPL loop — generate code → validate syntax → validate schema → execute SQL → check quality → evaluate confidence → refine or stop. Multiple reasoning paths are explored per node.
|
|
62
|
+
- **Confidence Calibration**: LLM self-assessed confidence is discounted by execution signals (retries, errors, empty results) for more reliable scoring.
|
|
63
|
+
- **Grid Search**: Systematic hyperparameter testing (`max_depth`, `high_confidence`, `max_attempts`, `calibration_enabled`) to find optimal configs.
|
|
64
|
+
- **Multi-table Schema**: Planner and RLM see all database tables, not just one.
|
|
65
|
+
- **Search over reasoning, not execution**: D&C splits the *problem space*, not the SQL. Each sub-problem is a complete reasoning unit (think → code → validate → execute → evaluate).
|
|
66
|
+
- **Pluggable Executors**: Abstract `BaseExecutor` with SQLite, JDBC, and Databricks implementations — PEP 249 compatible.
|
|
67
|
+
|
|
68
|
+
## Architecture
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
User Question
|
|
72
|
+
│
|
|
73
|
+
▼
|
|
74
|
+
┌──────────────────┐
|
|
75
|
+
│ Planner │ ← LLM decomposes question into sub-task DAG
|
|
76
|
+
│ (D&C) │ (depends_on, is_atomic, expected_output)
|
|
77
|
+
│ │ Multi-table schema: all tables visible
|
|
78
|
+
└──────┬───────────┘
|
|
79
|
+
│ TaskDAG (topo_layers)
|
|
80
|
+
▼
|
|
81
|
+
┌──────────────────┐
|
|
82
|
+
│ Scheduler │ ← Layer-by-layer DAG execution
|
|
83
|
+
│ │
|
|
84
|
+
│ For each node: │
|
|
85
|
+
│ ┌─────────────┐ │
|
|
86
|
+
│ │ RLM Agent │ │ ← 3-step validation loop:
|
|
87
|
+
│ │ 1. SQLGlot │ │ 1. Syntax check (sqlglot.parse_one)
|
|
88
|
+
│ │ syntax │ │ 2. Schema AST check (valid columns)
|
|
89
|
+
│ │ 2. Schema │ │ 3. Execute + quality check
|
|
90
|
+
│ │ AST check │ │ Confidence calibration applied
|
|
91
|
+
│ │ 3. Execute │ │
|
|
92
|
+
│ │ 4. Quality │ │
|
|
93
|
+
│ │ 5. Calibrate │ │
|
|
94
|
+
│ └─────────────┘ │
|
|
95
|
+
│ │
|
|
96
|
+
│ Pruning: │
|
|
97
|
+
│ conf ≥ threshold → greedy stop
|
|
98
|
+
│ max_attempts hit → best path selected
|
|
99
|
+
└──────┬───────────┘
|
|
100
|
+
│ NodeResults (DataFrames + SQL + confidence)
|
|
101
|
+
▼
|
|
102
|
+
┌──────────────────┐
|
|
103
|
+
│ Aggregator │ ← Merge leaf results → final answer
|
|
104
|
+
│ │ Tiebreaker: equal confidence → lower token_cost
|
|
105
|
+
└──────┬───────────┘
|
|
106
|
+
│ FinalSolution
|
|
107
|
+
▼
|
|
108
|
+
Optimal Answer + SQL + Reasoning Trace
|
|
109
|
+
|
|
110
|
+
═══════ Optional: RLM Clarification ═══════
|
|
111
|
+
│
|
|
112
|
+
▼ (if --interactive)
|
|
113
|
+
┌──────────────────┐
|
|
114
|
+
│ RLM Clarifier │ ← RLM exhaustion detected
|
|
115
|
+
│ │ ambiguity score >= threshold
|
|
116
|
+
│ Node-level: │ → ask user question
|
|
117
|
+
│ ┌─────────────┐ │ → refine ProblemSpec
|
|
118
|
+
│ │ no_sql │ │ → re-run pipeline
|
|
119
|
+
│ │ empty_result │ │
|
|
120
|
+
│ │ quality_fail │ │
|
|
121
|
+
│ │ low_confidence│ │
|
|
122
|
+
│ └─────────────┘ │
|
|
123
|
+
└──────┬───────────┘
|
|
124
|
+
│ Clarification answer → refined problem
|
|
125
|
+
▼
|
|
126
|
+
Back to Planner (retry)
|
|
127
|
+
|
|
128
|
+
═══════ Optional: Grid Search ═══════
|
|
129
|
+
│
|
|
130
|
+
▼
|
|
131
|
+
┌──────────────────┐
|
|
132
|
+
│ Grid Search │ ← 27-54 cells (Product of params)
|
|
133
|
+
│ │ ProcessPoolExecutor (max_workers=3)
|
|
134
|
+
│ │ Reports: config.json, results.json,
|
|
135
|
+
│ │ best.json, summary.md
|
|
136
|
+
└──────┬───────────┘
|
|
137
|
+
│ Best config → run_pipeline again
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### How a Sub-Task Executes (RLM Node)
|
|
141
|
+
|
|
142
|
+
```
|
|
143
|
+
Node "Find top 10% customers"
|
|
144
|
+
│
|
|
145
|
+
├── Attempt 1: SQL path A
|
|
146
|
+
│ ├── [PASS] Syntax check (sqlglot)
|
|
147
|
+
│ ├── [PASS] Schema column check
|
|
148
|
+
│ ├── [PASS] Execute → 5,234 rows
|
|
149
|
+
│ ├── [WARN] Quality: returned 5234 rows (>1000)
|
|
150
|
+
│ └── confidence: 0.72 (below threshold, retry)
|
|
151
|
+
│
|
|
152
|
+
├── Attempt 2: SQL path B
|
|
153
|
+
│ ├── [PASS] Syntax check
|
|
154
|
+
│ ├── [PASS] Schema column check
|
|
155
|
+
│ ├── [PASS] Execute → 534 rows
|
|
156
|
+
│ ├── [PASS] Quality: OK
|
|
157
|
+
│ └── confidence: 0.91 → calibrated: 0.86 (above threshold, stop)
|
|
158
|
+
│
|
|
159
|
+
└── Return best (calibrated) result to parent
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Directory Structure
|
|
163
|
+
|
|
164
|
+
```
|
|
165
|
+
syrch/
|
|
166
|
+
├── pyproject.toml
|
|
167
|
+
├── README.md
|
|
168
|
+
├── AGENTS.md
|
|
169
|
+
├── PLAN.md
|
|
170
|
+
├── LICENSE
|
|
171
|
+
├── .gitignore
|
|
172
|
+
├── benchmarks/example.jsonl
|
|
173
|
+
├── src/syrch/
|
|
174
|
+
│ ├── __init__.py
|
|
175
|
+
│ ├── cli/app.py # Typer CLI
|
|
176
|
+
│ ├── core/
|
|
177
|
+
│ │ ├── models.py # Data types (dataclasses)
|
|
178
|
+
│ │ ├── config.py # ExecutionConfig + config loader
|
|
179
|
+
│ │ └── logging.py # Structured logging
|
|
180
|
+
│ ├── executors/
|
|
181
|
+
│ │ ├── base.py # BaseExecutor (ABC)
|
|
182
|
+
│ │ ├── sqlite_executor.py # SQLite
|
|
183
|
+
│ │ ├── jdbc_executor.py # JDBC via SQLAlchemy
|
|
184
|
+
│ │ ├── databricks_executor.py # Databricks SQL
|
|
185
|
+
│ │ └── cached_executor.py # diskcache-backed SQL cache
|
|
186
|
+
│ ├── llm/
|
|
187
|
+
│ │ ├── base.py # BaseLLM (ABC)
|
|
188
|
+
│ │ ├── openai_llm.py # OpenAI
|
|
189
|
+
│ │ ├── anthropic_llm.py # Anthropic Claude
|
|
190
|
+
│ │ └── cache.py # CachedLLM + CentralCache
|
|
191
|
+
│ ├── search/
|
|
192
|
+
│ │ ├── planner.py # D&C: NL -> TaskDAG
|
|
193
|
+
│ │ ├── scheduler.py # DAG execution engine
|
|
194
|
+
│ │ ├── rlm_engine.py # RLM REPL loop
|
|
195
|
+
│ │ ├── aggregator.py # Result merge
|
|
196
|
+
│ │ ├── calibrator.py # Confidence calibration
|
|
197
|
+
│ │ ├── clarify.py # Ambiguity detection
|
|
198
|
+
│ │ ├── grid.py # Grid search
|
|
199
|
+
│ │ └── pipeline.py # Orchestrator
|
|
200
|
+
│ └── eval/
|
|
201
|
+
│ ├── runner.py # Benchmark harness
|
|
202
|
+
│ ├── metrics.py # Evaluation metrics
|
|
203
|
+
│ └── report.py # Report export
|
|
204
|
+
├── validate_real.py # Real LLM validation
|
|
205
|
+
├── orders_10dim.sqlite # TPC-H derived (7.5M rows)
|
|
206
|
+
├── wikipedia_clickstream.sqlite # Clickstream data (3K rows)
|
|
207
|
+
└── tests/
|
|
208
|
+
├── test_cache.py
|
|
209
|
+
├── test_clarify.py
|
|
210
|
+
├── test_e2e.py
|
|
211
|
+
├── test_eval.py
|
|
212
|
+
├── test_integration.py
|
|
213
|
+
├── test_planner.py
|
|
214
|
+
├── test_rlm_engine.py
|
|
215
|
+
└── test_scheduler.py
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## Data Model
|
|
219
|
+
|
|
220
|
+
```
|
|
221
|
+
ProblemSpec { question, schema, all_schemas, goal_metric }
|
|
222
|
+
│
|
|
223
|
+
▼
|
|
224
|
+
TaskDAG { nodes: {A, B, C, ...}, root_id, topo_layers }
|
|
225
|
+
│ Each TaskNode: { id, description, depends_on, is_atomic, join_type }
|
|
226
|
+
▼
|
|
227
|
+
Scheduler → NodeResult { node_id, data(DataFrame), sql, confidence,
|
|
228
|
+
reasoning_paths, cost_tokens, error }
|
|
229
|
+
│
|
|
230
|
+
▼
|
|
231
|
+
Aggregator → FinalSolution { answer, sql, confidence, data, token_cost, tree }
|
|
232
|
+
(tiebreaker: equal confidence → lower cost_tokens wins)
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Installation
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
# Core (CLI + SQLite)
|
|
239
|
+
pip install syrch
|
|
240
|
+
|
|
241
|
+
# Databricks SQL Warehouse
|
|
242
|
+
pip install "syrch[databricks]"
|
|
243
|
+
|
|
244
|
+
# PySpark executor (inside Databricks Runtime)
|
|
245
|
+
pip install "syrch[pyspark]"
|
|
246
|
+
|
|
247
|
+
# Development (tests + lint)
|
|
248
|
+
pip install -e ".[dev]"
|
|
249
|
+
|
|
250
|
+
# Everything
|
|
251
|
+
pip install "syrch[all]"
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
## Python API (Library Mode)
|
|
255
|
+
|
|
256
|
+
Use directly from Databricks notebooks or Python scripts:
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
from syrch import query
|
|
260
|
+
|
|
261
|
+
result = query(
|
|
262
|
+
question="What discount × shipping combo maximizes revenue?",
|
|
263
|
+
executor_type="databricks",
|
|
264
|
+
model="gpt-4o",
|
|
265
|
+
)
|
|
266
|
+
print(result.answer)
|
|
267
|
+
print(result.sql)
|
|
268
|
+
print(result.confidence)
|
|
269
|
+
print(result.data)
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
## CLI Usage
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
# Install
|
|
276
|
+
pip install syrch
|
|
277
|
+
|
|
278
|
+
# Inspect database schema
|
|
279
|
+
syrch schema wikipedia_clickstream.sqlite
|
|
280
|
+
syrch schema orders_10dim.sqlite -t orders_10dim
|
|
281
|
+
|
|
282
|
+
# Show default config
|
|
283
|
+
syrch config
|
|
284
|
+
|
|
285
|
+
# Solve a problem (requires LLM API key)
|
|
286
|
+
export OPENAI_API_KEY="sk-..."
|
|
287
|
+
syrch search -q "What discount × shipping combo maximizes revenue for top 10% customers?"
|
|
288
|
+
|
|
289
|
+
# With config file
|
|
290
|
+
syrch search -q "..." --config syrch.yml
|
|
291
|
+
|
|
292
|
+
# With options
|
|
293
|
+
syrch search -q "Which click type generates the most traffic?" \
|
|
294
|
+
--db wikipedia_clickstream.sqlite \
|
|
295
|
+
--max-depth 3 \
|
|
296
|
+
--high-conf 0.85 \
|
|
297
|
+
--max-attempts 3 \
|
|
298
|
+
--verbose
|
|
299
|
+
|
|
300
|
+
# Grid search over hyperparameters (54 cells)
|
|
301
|
+
syrch search -q "..." --db orders_10dim.sqlite --grid
|
|
302
|
+
|
|
303
|
+
# Benchmark against expected results
|
|
304
|
+
syrch eval -q "..." --db orders_10dim.sqlite --expected expected.csv
|
|
305
|
+
|
|
306
|
+
# Run benchmark suite
|
|
307
|
+
syrch benchmark benchmarks/orders.jsonl
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
### CLI Reference
|
|
311
|
+
|
|
312
|
+
| Command | Option | Description |
|
|
313
|
+
|---------|--------|-------------|
|
|
314
|
+
| `search` | `-q` / `--question` | Natural language problem (required) |
|
|
315
|
+
| | `--db` | Database path (default: `orders_10dim.sqlite`) |
|
|
316
|
+
| | `--max-depth` | Max D&C recursion depth (default: 3) |
|
|
317
|
+
| | `--executor` | `sqlite` / `databricks` / `jdbc` |
|
|
318
|
+
| | `--max-attempts` | Max RLM attempts per node (default: 3) |
|
|
319
|
+
| | `--high-conf` | Confidence threshold for greedy stop (default: 0.85) |
|
|
320
|
+
| | `--budget` | Token budget (default: 100000) |
|
|
321
|
+
| | `--llm` | `openai` / `anthropic` |
|
|
322
|
+
| | `--model` | LLM model name (default: `qwen3.5-4b-4bit`) |
|
|
323
|
+
| | `-v` / `--verbose` | Show reasoning traces |
|
|
324
|
+
| | `--cache/--no-cache` | Enable/disable LLM + SQL cache (default: on) |
|
|
325
|
+
| | `--cache-ttl` | Cache TTL in seconds (default: 86400) |
|
|
326
|
+
| | `--grid` | Run grid search over hyperparameters |
|
|
327
|
+
| | `--grid-parallel/--grid-sequential` | Parallel vs sequential grid execution |
|
|
328
|
+
| | `--grid-max-workers` | Max concurrent API calls (default: 3) |
|
|
329
|
+
| | `--max-concurrency` | Max concurrent LLM calls (default: 5; use 1 for local models) |
|
|
330
|
+
| | `--interactive` | Ask clarification questions when SQL cannot solve the task |
|
|
331
|
+
| | `--non-interactive` | One-shot mode with no clarification (default) |
|
|
332
|
+
| | `--config` | Path to YAML config file (`syrch.yml` or `~/.syrch/config.yml`) |
|
|
333
|
+
| `eval` | `-q` | Question |
|
|
334
|
+
| | `--db` | Database path |
|
|
335
|
+
| | `--executor` | Executor type |
|
|
336
|
+
| | `--expected` | Expected results CSV |
|
|
337
|
+
| | `--report-format` | `md` / `json` |
|
|
338
|
+
| `benchmark` | `PATH` | JSONL benchmark file (positional) |
|
|
339
|
+
| | `--executor` | Executor type |
|
|
340
|
+
| | `--report` | Output report path |
|
|
341
|
+
| `schema` | `DB` | Database path (positional) |
|
|
342
|
+
| | `-t` / `--table` | Specific table |
|
|
343
|
+
| `config` | `--db` | Database path |
|
|
344
|
+
|
|
345
|
+
## Configuration
|
|
346
|
+
|
|
347
|
+
Config is loaded in the following priority order: **CLI args > env vars (`SYRCH_*`) > config file > Databricks Secrets > defaults**.
|
|
348
|
+
|
|
349
|
+
### Config File (`syrch.yml`)
|
|
350
|
+
|
|
351
|
+
```yaml
|
|
352
|
+
llm:
|
|
353
|
+
provider: openai
|
|
354
|
+
model: qwen3.5-4b-4bit
|
|
355
|
+
base_url: http://100.88.35.18:11434/v1
|
|
356
|
+
temperature: 0.7
|
|
357
|
+
max_tokens_per_call: 4096
|
|
358
|
+
timeout_seconds: 120
|
|
359
|
+
|
|
360
|
+
execution:
|
|
361
|
+
executor_type: sqlite
|
|
362
|
+
max_depth: 3
|
|
363
|
+
max_attempts_per_node: 3
|
|
364
|
+
high_confidence: 0.85
|
|
365
|
+
token_budget: 100000
|
|
366
|
+
cache_enabled: true
|
|
367
|
+
cache_ttl: 86400
|
|
368
|
+
verbose: false
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
Search paths: `./syrch.yml`, then `~/.syrch/config.yml`; an explicit `--config <path>` overrides both.
|
|
372
|
+
|
|
373
|
+
### Environment Variables
|
|
374
|
+
|
|
375
|
+
| Variable | Maps to | Example |
|
|
376
|
+
|----------|---------|---------|
|
|
377
|
+
| `SYRCH_MODEL` | `llm.model` | `gpt-4o` |
|
|
378
|
+
| `SYRCH_API_KEY` | `llm.api_key` | `sk-...` |
|
|
379
|
+
| `SYRCH_BASE_URL` | `llm.base_url` | `http://100.88.35.18:11434/v1` |
|
|
380
|
+
| `SYRCH_MAX_DEPTH` | `execution.max_depth` | `3` |
|
|
381
|
+
| `SYRCH_VERBOSE` | `execution.verbose` | `true` |
|
|
382
|
+
|
|
383
|
+
### Databricks Connection
|
|
384
|
+
|
|
385
|
+
| Variable | Auth type | Description |
|
|
386
|
+
|----------|-----------|-------------|
|
|
387
|
+
| `DATABRICKS_SERVER_HOSTNAME` | all | Databricks workspace URL |
|
|
388
|
+
| `DATABRICKS_HTTP_PATH` | all | SQL Warehouse HTTP path |
|
|
389
|
+
| `DATABRICKS_TOKEN` | `pat` | Personal Access Token |
|
|
390
|
+
| `DATABRICKS_AUTH_TYPE` | all | `pat` (default), `databricks-oauth`, or `azure` |
|
|
391
|
+
| `DATABRICKS_CLIENT_ID` | oauth/azure | OAuth client ID |
|
|
392
|
+
| `DATABRICKS_CLIENT_SECRET` | oauth/azure | OAuth client secret |
|
|
393
|
+
| `AZURE_TENANT_ID` | azure | Azure AD tenant ID |
|
|
394
|
+
|
|
395
|
+
## Structured Logging
|
|
396
|
+
|
|
397
|
+
Internal diagnostics go to **stderr** via `logging`. User-facing output (Solution, SQL) goes to **stdout** via `rich`.
|
|
398
|
+
|
|
399
|
+
```bash
|
|
400
|
+
# Default: WARNING+ only to stderr
|
|
401
|
+
syrch search -q "..."
|
|
402
|
+
|
|
403
|
+
# Verbose: INFO level
|
|
404
|
+
syrch search -q "..." -v
|
|
405
|
+
|
|
406
|
+
# Library mode
|
|
407
|
+
python -c "
|
|
408
|
+
from syrch import query
|
|
409
|
+
result = query('Total revenue?', verbose=True)
|
|
410
|
+
"
|
|
411
|
+
```
|
|
412
|
+
|
|
413
|
+
Log format: `LEVEL:logger_name:message` (stdlib `logging` default)
|
|
414
|
+
|
|
415
|
+
```
|
|
416
|
+
INFO:syrch.scheduler:Layer 0: dispatching 2 nodes
|
|
417
|
+
WARNING:syrch.rlm_engine:Empty result, confidence penalized
|
|
418
|
+
```
|
|
419
|
+
|
|
420
|
+
## CI
|
|
421
|
+
|
|
422
|
+
GitHub Actions (`push`/`PR` → `main`):
|
|
423
|
+
|
|
424
|
+
| Step | Command |
|
|
425
|
+
|------|---------|
|
|
426
|
+
| Lint | `ruff check src/syrch/` |
|
|
427
|
+
| Type check | `mypy src/syrch/ --ignore-missing-imports` |
|
|
428
|
+
| Test | `pytest tests/ -v --cov=src/syrch/` (Python 3.11 + 3.12) |
|
|
429
|
+
|
|
430
|
+
## Confidence Calibration
|
|
431
|
+
|
|
432
|
+
LLM self-assessed confidence is adjusted by execution signals:
|
|
433
|
+
|
|
434
|
+
| Signal | Weight | Effect |
|
|
435
|
+
|--------|--------|--------|
|
|
436
|
+
| `syntax_error` | 0.10 | ×0.90 per occurrence |
|
|
437
|
+
| `execution_error` | 0.10 | ×0.90 per occurrence |
|
|
438
|
+
| `empty_result` | 0.15 | ×0.85 if result is empty |
|
|
439
|
+
| `schema_error` | 0.05 | ×0.95 per occurrence |
|
|
440
|
+
| `null_column` | 0.05 | ×0.95 if result has all-NULL columns |
|
|
441
|
+
| `retry_ratio` | 0.05 | Scales with attempts used |
|
|
442
|
+
|
|
443
|
+
**Heuristic penalties** (aggregator):
|
|
444
|
+
- Empty result: +0.15 per node
|
|
445
|
+
- Error present: +0.15 per node
|
|
446
|
+
- TOP-N mismatch: +0.05 per node
|
|
447
|
+
- "by year" without year column: +0.10 (once, global)
|
|
448
|
+
- **Capped at 0.40 total**
|
|
449
|
+
|
|
450
|
+
Formula: `calibrated = raw × Π(1 - weight_if_applicable)`
|
|
451
|
+
|
|
452
|
+
Disabled by passing `--no-cache` (sets `calibration_enabled=False` in `ExecutionConfig`).
|
|
453
|
+
|
|
454
|
+
## Grid Search
|
|
455
|
+
|
|
456
|
+
Automated hyperparameter search for optimal configuration:
|
|
457
|
+
|
|
458
|
+
```bash
|
|
459
|
+
syrch search -q "What discount × shipping combo maximizes revenue?" \
|
|
460
|
+
--db orders_10dim.sqlite --grid
|
|
461
|
+
```
|
|
462
|
+
|
|
463
|
+
Default parameter grid (54 cells):
|
|
464
|
+
| Parameter | Values |
|
|
465
|
+
|-----------|--------|
|
|
466
|
+
| max_depth | 1, 3, 5 |
|
|
467
|
+
| high_confidence | 0.7, 0.85, 0.95 |
|
|
468
|
+
| max_attempts_per_node | 1, 3, 5 |
|
|
469
|
+
| calibration_enabled | True, False |
|
|
470
|
+
|
|
471
|
+
Output: `autoresearch/reports/{YYYYMMDD-HHMMSS}/{config,results,best}.json` + `summary.md`
|
|
472
|
+
|
|
473
|
+
Best config selection: `exact_match > confidence` (cells with errors are skipped).
|
|
474
|
+
|
|
475
|
+
## Pruning Strategy
|
|
476
|
+
|
|
477
|
+
The RLM engine uses a confidence-based pruning strategy:
|
|
478
|
+
|
|
479
|
+
1. Generate first reasoning path → SQL → 3-step validation (syntax → schema → quality)
|
|
480
|
+
2. Execute → score
|
|
481
|
+
3. Apply confidence calibration (if enabled)
|
|
482
|
+
4. If calibrated confidence ≥ `HIGH_CONFIDENCE` (0.85) → **greedy accept**, stop
|
|
483
|
+
5. If below threshold → generate alternative path
|
|
484
|
+
6. After `max_attempts` → pick **best path** by calibrated confidence
|
|
485
|
+
|
|
486
|
+
This balances search thoroughness against the token budget. Simple problems resolve quickly (greedy path), while complex ones explore multiple candidates.
|
|
487
|
+
|
|
488
|
+
## Executor Abstraction
|
|
489
|
+
|
|
490
|
+
All executors conform to `BaseExecutor`:
|
|
491
|
+
|
|
492
|
+
```python
|
|
493
|
+
class BaseExecutor(ABC):
|
|
494
|
+
def execute(sql: str) -> DataFrame: ...
|
|
495
|
+
def get_schema(table_name: str | None = None) -> TableSchema: ...
|
|
496
|
+
def list_tables() -> list[str]: ...
|
|
497
|
+
def close(): ...
|
|
498
|
+
```
|
|
499
|
+
|
|
500
|
+
| Executor | Backend | Connection |
|
|
501
|
+
|----------|---------|------------|
|
|
502
|
+
| `SQLiteExecutor` | SQLite | `sqlite3` (thread-safe via `threading.local`) |
|
|
503
|
+
| `JDBCExecutor` | Any JDBC | SQLAlchemy |
|
|
504
|
+
| `DatabricksExecutor` | Databricks SQL | `databricks-sql-connector` (PEP 249) |
|
|
505
|
+
|
|
506
|
+
## Caching
|
|
507
|
+
|
|
508
|
+
All LLM and SQL calls are cached via `diskcache` (24h TTL):
|
|
509
|
+
|
|
510
|
+
| Layer | Cache | Key |
|
|
511
|
+
|-------|-------|-----|
|
|
512
|
+
| LLM `generate()` | `CachedLLM` | SHA256(system + user + model + temperature) |
|
|
513
|
+
| LLM `generate_json()` | `CachedLLM` | SHA256(system + user + model + temperature) |
|
|
514
|
+
| SQL `execute()` | `CachedExecutor` | SHA256(sql) |
|
|
515
|
+
|
|
516
|
+
Toggle with `--cache/--no-cache` flag; TTL configurable with `--cache-ttl`.
|
|
517
|
+
|
|
518
|
+
## Datasets
|
|
519
|
+
|
|
520
|
+
| Dataset | Rows | Size | Description |
|
|
521
|
+
|---------|------|------|-------------|
|
|
522
|
+
| `wikipedia_clickstream.sqlite` | 3,138 | 280 KB | Aggregated Wikipedia clickstream data with mutual information metadata |
|
|
523
|
+
| `orders_10dim.sqlite` | 7,500,000 | 1.3 GB | TPC-H derived synthetic orders with 10 dimension columns |
|
|
524
|
+
|
|
525
|
+
## Testing
|
|
526
|
+
|
|
527
|
+
```bash
|
|
528
|
+
# Unit tests (FakeLLM, no API key required)
|
|
529
|
+
pytest tests/ -v
|
|
530
|
+
|
|
531
|
+
# All 69 tests pass:
|
|
532
|
+
# 7 cache tests (CentralCache, CachedLLM, CachedExecutor)
|
|
533
|
+
# 9 clarify tests (ambiguity score, question generation, worst detection)
|
|
534
|
+
# 9 e2e tests (real SQLite DBs + pipeline)
|
|
535
|
+
# 14 eval tests (runner, metrics, benchmark, join merge)
|
|
536
|
+
# 8 integration tests (DAG, grid, clarification loop, multi-table)
|
|
537
|
+
# 8 planner tests (decompose, cycle, join keys, recursive)
|
|
538
|
+
# 10 rlm_engine tests (validation, calibration, quality, calibrator)
|
|
539
|
+
# 4 scheduler tests (DAG execution)
|
|
540
|
+
```
|
|
541
|
+
|
|
542
|
+
### Real-world Validation
|
|
543
|
+
|
|
544
|
+
```bash
|
|
545
|
+
# Run full validation (requires LLM API key)
|
|
546
|
+
python validate_real.py
|
|
547
|
+
|
|
548
|
+
# Specific level
|
|
549
|
+
python validate_real.py --level 3 --verbose
|
|
550
|
+
|
|
551
|
+
# Custom question
|
|
552
|
+
python validate_real.py --question "Total revenue by year?" --db orders_10dim.sqlite
|
|
553
|
+
|
|
554
|
+
# With local model
|
|
555
|
+
python validate_real.py --model qwen3.5-4b --max-concurrency 1
|
|
556
|
+
|
|
557
|
+
# Results (2026-06-15, minimax-m3:cloud):
|
|
558
|
+
# L1 Easy 3/3 PASS
|
|
559
|
+
# L2 Medium 3/3 PASS
|
|
560
|
+
# L3 Complex 2/2 PASS
|
|
561
|
+
# L4 Very Complex 2/2 PASS
|
|
562
|
+
# L5 Ambiguous 2/2 AMBIGUOUS (expected)
|
|
563
|
+
# ─────────────────────────────
|
|
564
|
+
# Total 10/10 PASS 100% (2 AMBIGUOUS)
|
|
565
|
+
```
|
|
566
|
+
|
|
567
|
+
## Research Background
|
|
568
|
+
|
|
569
|
+
- **RLM (Recursive Language Model)**: MIT CSAIL OASYS Lab, 2025. Inference paradigm where LLMs recursively decompose input via REPL environments. [`paper`](https://arxiv.org/abs/2512.24601) [`code`](https://github.com/alexzhang13/rlm)
|
|
570
|
+
- **RDD (Recursive Decomposition with Dependencies)**: Formal D&C framework with dependency DAGs. [`paper`](https://arxiv.org/abs/2505.02576)
|
|
571
|
+
- **PAC-MCTS**: Bias-aware pruning with formal guarantees for tree search. [`paper`](https://arxiv.org/abs/2604.14345)
|
|
572
|
+
- **ROMA**: Recursive meta-agent framework with Atomizer/Planner/Executor/Aggregator roles. [`paper`](https://arxiv.org/abs/2602.01848)
|
|
573
|
+
- **Graph Harness**: Structured DAG execution with immutable plan versions. [`paper`](https://arxiv.org/abs/2604.11378)
|
|
574
|
+
- **AdaptOrch**: Topology-aware multi-agent orchestration (parallel/sequential/hierarchical/hybrid). [`paper`](https://arxiv.org/abs/2602.16873)
|
|
575
|
+
- **DST**: Adaptive tree search with confidence-based pruning (26-75% computation reduction). [`paper`](https://arxiv.org/abs/2603.20267)
|
|
576
|
+
- **LLM Compiler**: Parallel task scheduling via dependency graphs; closely related to syrch's DAG scheduler and layer-by-layer execution. [`paper`](https://arxiv.org/abs/2312.13311)
|
|
577
|
+
|
|
578
|
+
## Open Research Questions
|
|
579
|
+
|
|
580
|
+
| Question | Approach |
|
|
581
|
+
|----------|----------|
|
|
582
|
+
| **When to stop dividing?** (Unit case detection) | Experiment with LLM self-assessment + complexity heuristics |
|
|
583
|
+
| **How to merge sub-task results?** | DAG-based REPL variable passing + Aggregator role |
|
|
584
|
+
| **How to prune search space?** | Confidence-based pruning + uncertainty-aware allocation |
|
|
585
|
+
| **Optimal D&C strategy?** | Topology routing (AdaptOrch) based on DAG structure metrics |
|
|
586
|
+
| **Optimal calibration weights?** | Grid search over penalty coefficients per signal |
|
|
587
|
+
| **Join key inference?** | Planner emits join_keys between sub-tasks |
|
|
588
|
+
| **Recursive decomposition?** | Planner recurses on non-atomic sub-tasks |
|
|
589
|
+
| **When SQL cannot solve?** | RLM clarification: ambiguity score → interactive feedback → re-decompose |
|
|
590
|
+
| **Optimal clarification threshold?** | Grid search over score weights + decision boundary |
|