onetool-mcp 1.0.0b1 (onetool_mcp-1.0.0b1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bench/__init__.py +5 -0
- bench/cli.py +69 -0
- bench/harness/__init__.py +66 -0
- bench/harness/client.py +692 -0
- bench/harness/config.py +397 -0
- bench/harness/csv_writer.py +109 -0
- bench/harness/evaluate.py +512 -0
- bench/harness/metrics.py +283 -0
- bench/harness/runner.py +899 -0
- bench/py.typed +0 -0
- bench/reporter.py +629 -0
- bench/run.py +487 -0
- bench/secrets.py +101 -0
- bench/utils.py +16 -0
- onetool/__init__.py +4 -0
- onetool/cli.py +391 -0
- onetool/py.typed +0 -0
- onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
- onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
- onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
- onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
- ot/__init__.py +37 -0
- ot/__main__.py +6 -0
- ot/_cli.py +107 -0
- ot/_tui.py +53 -0
- ot/config/__init__.py +46 -0
- ot/config/defaults/bench.yaml +4 -0
- ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
- ot/config/defaults/diagram-templates/c4-context.puml +30 -0
- ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
- ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
- ot/config/defaults/diagram-templates/microservices.d2 +81 -0
- ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
- ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
- ot/config/defaults/onetool.yaml +25 -0
- ot/config/defaults/prompts.yaml +97 -0
- ot/config/defaults/servers.yaml +7 -0
- ot/config/defaults/snippets.yaml +4 -0
- ot/config/defaults/tool_templates/__init__.py +7 -0
- ot/config/defaults/tool_templates/extension.py +52 -0
- ot/config/defaults/tool_templates/isolated.py +61 -0
- ot/config/dynamic.py +121 -0
- ot/config/global_templates/__init__.py +2 -0
- ot/config/global_templates/bench-secrets-template.yaml +6 -0
- ot/config/global_templates/bench.yaml +9 -0
- ot/config/global_templates/onetool.yaml +27 -0
- ot/config/global_templates/secrets-template.yaml +44 -0
- ot/config/global_templates/servers.yaml +18 -0
- ot/config/global_templates/snippets.yaml +235 -0
- ot/config/loader.py +1087 -0
- ot/config/mcp.py +145 -0
- ot/config/secrets.py +190 -0
- ot/config/tool_config.py +125 -0
- ot/decorators.py +116 -0
- ot/executor/__init__.py +35 -0
- ot/executor/base.py +16 -0
- ot/executor/fence_processor.py +83 -0
- ot/executor/linter.py +142 -0
- ot/executor/pack_proxy.py +260 -0
- ot/executor/param_resolver.py +140 -0
- ot/executor/pep723.py +288 -0
- ot/executor/result_store.py +369 -0
- ot/executor/runner.py +496 -0
- ot/executor/simple.py +163 -0
- ot/executor/tool_loader.py +396 -0
- ot/executor/validator.py +398 -0
- ot/executor/worker_pool.py +388 -0
- ot/executor/worker_proxy.py +189 -0
- ot/http_client.py +145 -0
- ot/logging/__init__.py +37 -0
- ot/logging/config.py +315 -0
- ot/logging/entry.py +213 -0
- ot/logging/format.py +188 -0
- ot/logging/span.py +349 -0
- ot/meta.py +1555 -0
- ot/paths.py +453 -0
- ot/prompts.py +218 -0
- ot/proxy/__init__.py +21 -0
- ot/proxy/manager.py +396 -0
- ot/py.typed +0 -0
- ot/registry/__init__.py +189 -0
- ot/registry/models.py +57 -0
- ot/registry/parser.py +269 -0
- ot/registry/registry.py +413 -0
- ot/server.py +315 -0
- ot/shortcuts/__init__.py +15 -0
- ot/shortcuts/aliases.py +87 -0
- ot/shortcuts/snippets.py +258 -0
- ot/stats/__init__.py +35 -0
- ot/stats/html.py +250 -0
- ot/stats/jsonl_writer.py +283 -0
- ot/stats/reader.py +354 -0
- ot/stats/timing.py +57 -0
- ot/support.py +63 -0
- ot/tools.py +114 -0
- ot/utils/__init__.py +81 -0
- ot/utils/batch.py +161 -0
- ot/utils/cache.py +120 -0
- ot/utils/deps.py +403 -0
- ot/utils/exceptions.py +23 -0
- ot/utils/factory.py +179 -0
- ot/utils/format.py +65 -0
- ot/utils/http.py +202 -0
- ot/utils/platform.py +45 -0
- ot/utils/sanitize.py +130 -0
- ot/utils/truncate.py +69 -0
- ot_tools/__init__.py +4 -0
- ot_tools/_convert/__init__.py +12 -0
- ot_tools/_convert/excel.py +279 -0
- ot_tools/_convert/pdf.py +254 -0
- ot_tools/_convert/powerpoint.py +268 -0
- ot_tools/_convert/utils.py +358 -0
- ot_tools/_convert/word.py +283 -0
- ot_tools/brave_search.py +604 -0
- ot_tools/code_search.py +736 -0
- ot_tools/context7.py +495 -0
- ot_tools/convert.py +614 -0
- ot_tools/db.py +415 -0
- ot_tools/diagram.py +1604 -0
- ot_tools/diagram.yaml +167 -0
- ot_tools/excel.py +1372 -0
- ot_tools/file.py +1348 -0
- ot_tools/firecrawl.py +732 -0
- ot_tools/grounding_search.py +646 -0
- ot_tools/package.py +604 -0
- ot_tools/py.typed +0 -0
- ot_tools/ripgrep.py +544 -0
- ot_tools/scaffold.py +471 -0
- ot_tools/transform.py +213 -0
- ot_tools/web_fetch.py +384 -0
ot_tools/code_search.py
ADDED
@@ -0,0 +1,736 @@
"""Semantic code search using ChunkHound indexes.
|
|
2
|
+
|
|
3
|
+
Queries existing ChunkHound DuckDB databases for semantic code search.
|
|
4
|
+
Requires projects to be indexed externally with `chunkhound index <project>`.
|
|
5
|
+
Requires OPENAI_API_KEY in secrets.yaml for embedding generation.
|
|
6
|
+
|
|
7
|
+
Reference: https://github.com/chunkhound/chunkhound
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import threading
|
|
14
|
+
from functools import lru_cache
|
|
15
|
+
from typing import TYPE_CHECKING, Any
|
|
16
|
+
|
|
17
|
+
# Pack for dot notation: code.search(), code.status()
|
|
18
|
+
pack = "code"
|
|
19
|
+
|
|
20
|
+
__all__ = ["search", "search_batch", "status"]
|
|
21
|
+
|
|
22
|
+
# Dependency declarations for CLI validation
|
|
23
|
+
__ot_requires__ = {
|
|
24
|
+
"lib": [
|
|
25
|
+
("duckdb", "pip install duckdb"),
|
|
26
|
+
("openai", "pip install openai"),
|
|
27
|
+
],
|
|
28
|
+
"secrets": ["OPENAI_API_KEY"],
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
from pydantic import BaseModel, Field
|
|
32
|
+
|
|
33
|
+
from ot.config import get_tool_config
|
|
34
|
+
from ot.config.secrets import get_secret
|
|
35
|
+
from ot.logging import LogSpan
|
|
36
|
+
from ot.paths import resolve_cwd_path
|
|
37
|
+
|
|
38
|
+
if TYPE_CHECKING:
|
|
39
|
+
from pathlib import Path
|
|
40
|
+
from types import ModuleType
|
|
41
|
+
|
|
42
|
+
from openai import OpenAI
|
|
43
|
+
|
|
44
|
+
logger = logging.getLogger(__name__)
|
|
45
|
+
|
|
46
|
+
# Thread lock for connection cache operations
|
|
47
|
+
_connection_lock = threading.Lock()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class Config(BaseModel):
    """Pack configuration - discovered by registry."""

    limit: int = Field(
        default=10,
        ge=1,
        le=100,
        description="Maximum number of search results to return",
    )
    base_url: str = Field(
        default="https://openrouter.ai/api/v1",
        description="OpenAI-compatible API base URL for embeddings",
    )
    model: str = Field(
        default="text-embedding-3-small",
        description="Embedding model (must match ChunkHound index)",
    )
    db_path: str = Field(
        default=".chunkhound/chunks.db",
        description="Path to ChunkHound DuckDB database relative to project root",
    )
    provider: str = Field(
        default="openai",
        description="Embedding provider stored in ChunkHound index",
    )
    dimensions: int = Field(
        default=1536,
        description="Embedding dimensions (must match model)",
    )
    content_limit: int = Field(
        default=500,
        ge=100,
        le=10000,
        description="Maximum characters of code content to return (without expand)",
    )
    content_limit_expanded: int = Field(
        default=2000,
        ge=500,
        le=20000,
        description="Maximum characters of code content to return (with expand)",
    )


def _get_config() -> Config:
    """Get code pack configuration."""
    return get_tool_config("code", Config)

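# The fields above are read through get_tool_config("code", Config), so a
# project can override any of them in its pack config. A hypothetical override
# (the exact file layout depends on the onetool config loader):
#
#   code:
#     base_url: "https://api.openai.com/v1"
#     model: "text-embedding-3-small"
#     dimensions: 1536
#
# Whatever model/dimensions pair is configured must match the one used by
# `chunkhound index`: _validate_and_connect() looks up the
# embeddings_<dimensions> table, and the search SQL filters on e.model.
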
def _get_db_path(path: str | None = None, db: str | None = None) -> tuple[Path, Path]:
    """Get the ChunkHound DuckDB path and project root.

    Uses SDK resolve_cwd_path() for consistent path resolution.

    Path resolution follows project conventions:
    - If path is None: uses project directory (OT_CWD)
    - If path provided: resolves with prefix/tilde expansion
    - If db is None: uses config.db_path (default: .chunkhound/chunks.db)
    - If db provided: uses that path relative to project root

    Args:
        path: Path to project root (default: OT_CWD)
        db: Path to database file relative to project root (default: config.db_path)

    Returns:
        Tuple of (db_path, project_root)
    """
    config = _get_config()
    project_root = resolve_cwd_path(".") if path is None else resolve_cwd_path(path)
    db_rel = db if db is not None else config.db_path
    db_path = project_root / db_rel
    return db_path, project_root

def _get_openai_client() -> OpenAI:
    """Get OpenAI client for embedding generation."""
    try:
        from openai import OpenAI
    except ImportError as e:
        raise ImportError(
            "openai is required for code_search. Install with: pip install openai"
        ) from e

    api_key = get_secret("OPENAI_API_KEY") or ""
    if not api_key:
        raise ValueError(
            "OPENAI_API_KEY not configured in secrets.yaml (required for code search embeddings)"
        )
    config = _get_config()
    return OpenAI(api_key=api_key, base_url=config.base_url or None)


def _import_duckdb() -> ModuleType:
    """Lazy import duckdb module."""
    try:
        import duckdb
    except ImportError as e:
        raise ImportError(
            "duckdb is required for code_search. Install with: pip install duckdb"
        ) from e
    return duckdb

@lru_cache(maxsize=4)
def _get_cached_connection(db_path: str) -> Any:
    """Get cached read-only connection to ChunkHound database.

    Connections are cached by path and reused. Call _clear_connection_cache()
    if database is rebuilt.

    Args:
        db_path: Path to the DuckDB database file.

    Returns:
        DuckDB connection with vss extension loaded.

    Raises:
        RuntimeError: If VSS extension cannot be loaded.
    """
    duckdb = _import_duckdb()
    conn = duckdb.connect(db_path, read_only=True)
    try:
        conn.execute("LOAD vss")
    except Exception as e:
        conn.close()
        if "vss" in str(e).lower() or "extension" in str(e).lower():
            raise RuntimeError(
                "DuckDB VSS extension not available.\n"
                "Install with: pip install duckdb # Version 0.9+ includes vss"
            ) from e
        raise
    return conn


def _clear_connection_cache() -> None:
    """Clear cached connections (call after index rebuild)."""
    with _connection_lock:
        _get_cached_connection.cache_clear()

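# Note: lru_cache(maxsize=4) holds read-only connections for up to four
# distinct database paths; opening a fifth evicts the least recently used
# entry. Eviction only drops the cached reference, so after rebuilding an
# index call _clear_connection_cache() explicitly rather than relying on
# eviction to release the stale handle.
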
def _validate_and_connect(
    db_path: Path,
    project_root: Path,
    config: Config,
) -> tuple[Any, str]:
    """Validate database and return connection + embeddings table name.

    Args:
        db_path: Path to the DuckDB database file.
        project_root: Path to the project root directory.
        config: Pack configuration.

    Returns:
        Tuple of (connection, embeddings_table_name).

    Raises:
        ValueError: If validation fails with user-friendly message.
    """
    if not db_path.exists():
        raise ValueError(
            f"Project not indexed. Run: chunkhound index {project_root}\n"
            f"Expected database at: {db_path}"
        )

    with _connection_lock:
        conn = _get_cached_connection(str(db_path))

    tables = [row[0] for row in conn.execute("SHOW TABLES").fetchall()]
    embeddings_table = f"embeddings_{config.dimensions}"

    if "chunks" not in tables:
        raise ValueError(
            f"Database missing 'chunks' table. Re-index with: chunkhound index {project_root}"
        )
    if embeddings_table not in tables:
        raise ValueError(
            f"Database missing '{embeddings_table}' table. Re-index with: chunkhound index {project_root}"
        )

    return conn, embeddings_table

def _build_search_sql(
    embeddings_table: str,
    dimensions: int,
    provider: str,
    model: str,
    language: str | None = None,
    chunk_type: str | None = None,
    exclude: str | None = None,
) -> tuple[str, list[Any]]:
    """Build semantic search SQL query.

    Args:
        embeddings_table: Name of the embeddings table.
        dimensions: Embedding dimensions.
        provider: Embedding provider.
        model: Embedding model.
        language: Optional language filter.
        chunk_type: Optional chunk type filter.
        exclude: Optional pipe-separated exclude patterns.

    Returns:
        Tuple of (sql_template, params). Caller must prepend embedding param
        and append limit param.
    """
    sql = f"""
        SELECT
            c.id as chunk_id,
            c.symbol,
            c.code as content,
            c.chunk_type,
            c.start_line,
            c.end_line,
            f.path as file_path,
            f.language,
            array_cosine_similarity(e.embedding, ?::FLOAT[{dimensions}]) as similarity
        FROM {embeddings_table} e
        JOIN chunks c ON e.chunk_id = c.id
        JOIN files f ON c.file_id = f.id
        WHERE e.provider = ? AND e.model = ?
    """
    params: list[Any] = [provider, model]

    if language:
        sql += " AND LOWER(f.language) = LOWER(?)"
        params.append(language)

    if chunk_type:
        sql += " AND LOWER(c.chunk_type) = LOWER(?)"
        params.append(chunk_type)

    if exclude:
        for pattern in (p.strip() for p in exclude.split("|") if p.strip()):
            sql += " AND f.path NOT LIKE ?"
            params.append(f"%{pattern}%")

    return sql, params

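# For reference, once search() prepends the embedding parameter and appends
# " ORDER BY similarity DESC LIMIT ?", the executed statement (with default
# config and no filters) is effectively:
#
#   SELECT c.id, c.symbol, c.code, c.chunk_type, c.start_line, c.end_line,
#          f.path, f.language,
#          array_cosine_similarity(e.embedding, ?::FLOAT[1536]) as similarity
#   FROM embeddings_1536 e
#   JOIN chunks c ON e.chunk_id = c.id
#   JOIN files f ON c.file_id = f.id
#   WHERE e.provider = ? AND e.model = ?
#   ORDER BY similarity DESC LIMIT ?
#
# with parameters [embedding, provider, model, limit]. Each language,
# chunk_type, or exclude filter adds one AND clause and one positional
# parameter between model and limit.
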
def _row_to_result(row: tuple, matched_query: str | None = None) -> dict[str, Any]:
    """Convert a database row to a result dictionary.

    Args:
        row: Tuple from database query.
        matched_query: Optional query that matched this result (for batch).

    Returns:
        Result dictionary with standardized keys.
    """
    result = {
        "chunk_id": row[0],
        "symbol": row[1],
        "content": row[2],
        "chunk_type": row[3],
        "start_line": row[4],
        "end_line": row[5],
        "file_path": row[6],
        "language": row[7],
        "similarity": row[8],
    }
    if matched_query is not None:
        result["matched_query"] = matched_query
    return result

def _generate_embedding(query: str) -> list[float]:
    """Generate embedding vector for a search query."""
    config = _get_config()
    with LogSpan(span="code.embedding", model=config.model, queryLen=len(query)) as span:
        client = _get_openai_client()
        response = client.embeddings.create(
            model=config.model,
            input=query,
        )
        span.add(dimensions=len(response.data[0].embedding))
        return response.data[0].embedding


def _generate_embeddings_batch(queries: list[str]) -> list[list[float]]:
    """Generate embedding vectors for multiple queries in a single API call."""
    config = _get_config()
    with LogSpan(
        span="code.embedding_batch", model=config.model, queryCount=len(queries)
    ) as span:
        client = _get_openai_client()
        response = client.embeddings.create(
            model=config.model,
            input=queries,
        )
        embeddings = [item.embedding for item in response.data]
        span.add(dimensions=len(embeddings[0]) if embeddings else 0)
        return embeddings

def _format_result(
    result: dict[str, Any],
    project_root: Path | None = None,
    expand: int | None = None,
) -> dict[str, Any]:
    """Format a search result for output.

    Args:
        result: Raw search result from database
        project_root: Project root for file reading (needed for expand)
        expand: Number of context lines to include around match

    Returns:
        Formatted result dict. Content is truncated to `content_limit` chars
        (default 500) or `content_limit_expanded` chars (default 2000) when
        expand is used. These limits are configurable via pack config.
    """
    config = _get_config()
    content = result.get("content", "")
    start_line = result.get("start_line")
    end_line = result.get("end_line")

    # Expand content if requested and we have valid line numbers
    if expand and project_root and start_line and end_line:
        file_path = project_root / result.get("file_path", "")
        if file_path.exists():
            try:
                lines = file_path.read_text().splitlines()
                # Calculate expanded range (1-indexed to 0-indexed)
                exp_start = max(0, start_line - 1 - expand)
                exp_end = min(len(lines), end_line + expand)
                content = "\n".join(lines[exp_start:exp_end])
                start_line = exp_start + 1
                end_line = exp_end
            except Exception as e:
                # Log but don't fail - expansion is optional enhancement
                logger.debug("Failed to expand content from %s: %s", file_path, e)

    # Apply content truncation from config
    content_limit = config.content_limit_expanded if expand else config.content_limit

    return {
        "file": result.get("file_path", "unknown"),
        "name": result.get("symbol", ""),
        "type": result.get("chunk_type", ""),
        "language": result.get("language", ""),
        "lines": f"{start_line or '?'}-{end_line or '?'}",
        "score": round(result.get("similarity", 0.0), 4),
        "content": content[:content_limit],
    }

def search(
    *,
    query: str,
    limit: int | None = None,
    language: str | None = None,
    chunk_type: str | None = None,
    expand: int | None = None,
    exclude: str | None = None,
    path: str | None = None,
    db: str | None = None,
) -> str:
    """Search for code semantically in a ChunkHound-indexed project.

    Finds code by meaning rather than exact keyword matches. For example,
    searching for "authentication" can find functions named `verify_jwt_token`.

    Requires the project to be indexed first with:
        chunkhound index /path/to/project

    Args:
        query: Natural language search query (e.g., "error handling", "database connection")
        limit: Maximum number of results to return (defaults to config)
        language: Filter results by language (e.g., "python", "typescript")
        chunk_type: Filter by type (e.g., "function", "class", "method", "comment")
        expand: Number of context lines to include around each match
        exclude: Pipe-separated patterns to exclude (e.g., "test|mock|fixture")
        path: Path to project root (default: cwd)
        db: Path to database file relative to project root (default: .chunkhound/chunks.db)

    Returns:
        Formatted search results with file paths, line numbers, code snippets,
        and relevance scores. Returns error message if project not indexed.

    Example:
        # Search in current directory
        code.search(query="authentication logic")

        # Find Python functions only
        code.search(query="database queries", language="python", chunk_type="function")

        # Get expanded context
        code.search(query="error handling", expand=10)

        # Exclude test files
        code.search(query="validation", exclude="test|mock")
    """
    if limit is None:
        limit = get_tool_config("code", Config).limit
    db_path, project_root = _get_db_path(path, db)

    with LogSpan(
        span="code.search",
        project=str(project_root),
        query=query,
        limit=limit,
        language=language,
        chunk_type=chunk_type,
        expand=expand,
        exclude=exclude,
    ) as s:
        try:
            # Validate database and get connection
            config = _get_config()
            conn, embeddings_table = _validate_and_connect(db_path, project_root, config)

            # Generate query embedding
            embedding = _generate_embedding(query)

            # Build semantic search query
            sql, params = _build_search_sql(
                embeddings_table=embeddings_table,
                dimensions=config.dimensions,
                provider=config.provider,
                model=config.model,
                language=language,
                chunk_type=chunk_type,
                exclude=exclude,
            )

            # Prepend embedding and append limit
            params = [embedding, *params, limit]
            sql += " ORDER BY similarity DESC LIMIT ?"

            # Execute search
            results = conn.execute(sql, params).fetchall()

            if not results:
                s.add("resultCount", 0)
                return f"No results found for: {query}"

            # Format results
            formatted = [
                _format_result(_row_to_result(row), project_root, expand)
                for row in results
            ]

            # Build output
            output_lines = [f"Found {len(formatted)} results for: {query}\n"]
            for i, r in enumerate(formatted, 1):
                output_lines.append(
                    f"{i}. [{r['type']}] {r['name']} ({r['language']})\n"
                    f" File: {r['file']}:{r['lines']}\n"
                    f" Score: {r['score']}\n"
                    f" ```\n{r['content']}\n ```\n"
                )

            output = "\n".join(output_lines)
            s.add("resultCount", len(formatted))
            s.add("outputLen", len(output))
            return output

        except ValueError as e:
            # Validation errors (not indexed, missing tables)
            s.add("error", "validation_failed")
            return f"Error: {e}"
        except Exception as e:
            s.add("error", str(e))
            return f"Error searching code: {e}"

def search_batch(
    *,
    queries: str,
    limit: int | None = None,
    language: str | None = None,
    chunk_type: str | None = None,
    expand: int | None = None,
    exclude: str | None = None,
    path: str | None = None,
    db: str | None = None,
) -> str:
    """Run multiple semantic searches and return merged, deduplicated results.

    Uses batch embedding API (single call) for efficiency. Results are
    deduplicated by file+lines, keeping the highest score.

    Args:
        queries: Pipe-separated search queries (e.g., "auth logic|token validation|session")
        limit: Maximum results per query (defaults to config)
        language: Filter by language (e.g., "python")
        chunk_type: Filter by type (e.g., "function", "class")
        expand: Number of context lines to include around each match
        exclude: Pipe-separated patterns to exclude (e.g., "test|mock")
        path: Path to project root (default: cwd)
        db: Path to database file relative to project root (default: .chunkhound/chunks.db)

    Returns:
        Merged results sorted by score, with duplicates removed.

    Example:
        # Multiple related queries
        code.search_batch(queries="authentication|login|session handling")

        # Exclude test files
        code.search_batch(queries="error handling|validation", exclude="test|mock")
    """
    if limit is None:
        limit = get_tool_config("code", Config).limit
    db_path, project_root = _get_db_path(path, db)

    # Parse pipe-separated queries
    query_list = [q.strip() for q in queries.split("|") if q.strip()]
    if not query_list:
        return "Error: No valid queries provided"

    with LogSpan(
        span="code.search_batch",
        project=str(project_root),
        queryCount=len(query_list),
        limit=limit,
        exclude=exclude,
    ) as s:
        try:
            # Validate database and get connection
            config = _get_config()
            conn, embeddings_table = _validate_and_connect(db_path, project_root, config)

            # Generate all embeddings in a single API call
            embeddings = _generate_embeddings_batch(query_list)

            # Build base SQL query (reused for all queries)
            base_sql, base_params = _build_search_sql(
                embeddings_table=embeddings_table,
                dimensions=config.dimensions,
                provider=config.provider,
                model=config.model,
                language=language,
                chunk_type=chunk_type,
                exclude=exclude,
            )
            base_sql += " ORDER BY similarity DESC LIMIT ?"

            # Collect all results
            all_results: dict[str, dict[str, Any]] = {}  # key: file:lines

            for query, embedding in zip(query_list, embeddings, strict=True):
                # Prepend embedding and append limit
                params = [embedding, *base_params, limit]
                results = conn.execute(base_sql, params).fetchall()

                for row in results:
                    result = _row_to_result(row, matched_query=query)
                    # Dedupe key: file path + line range
                    key = f"{row[6]}:{row[4]}-{row[5]}"
                    if key not in all_results or row[8] > all_results[key]["similarity"]:
                        all_results[key] = result

            if not all_results:
                s.add("resultCount", 0)
                return f"No results found for queries: {', '.join(query_list)}"

            # Sort by similarity and format
            sorted_results = sorted(
                all_results.values(), key=lambda x: x["similarity"], reverse=True
            )
            formatted = [
                _format_result(r, project_root, expand) for r in sorted_results
            ]

            # Build output
            output_lines = [
                f"Found {len(formatted)} results for {len(query_list)} queries\n"
            ]
            for i, r in enumerate(formatted, 1):
                output_lines.append(
                    f"{i}. [{r['type']}] {r['name']} ({r['language']})\n"
                    f" File: {r['file']}:{r['lines']}\n"
                    f" Score: {r['score']}\n"
                    f" ```\n{r['content']}\n ```\n"
                )

            output = "\n".join(output_lines)
            s.add("resultCount", len(formatted))
            s.add("outputLen", len(output))
            return output

        except ValueError as e:
            # Validation errors (not indexed, missing tables)
            s.add("error", "validation_failed")
            return f"Error: {e}"
        except Exception as e:
            s.add("error", str(e))
            return f"Error in batch search: {e}"

def status(*, path: str | None = None, db: str | None = None) -> str:
    """Check if a project has a ChunkHound index and show statistics.

    Args:
        path: Path to project root (default: cwd)
        db: Path to database file relative to project root (default: .chunkhound/chunks.db)

    Returns:
        Index statistics (file count, chunk count, languages) or
        instructions for indexing if not indexed.

    Example:
        # Current directory
        code.status()

        # Explicit path
        code.status(path="/path/to/project")
    """
    db_path, project_root = _get_db_path(path, db)

    with LogSpan(span="code.status", project=str(project_root)) as s:
        if not db_path.exists():
            s.add("indexed", False)
            return (
                f"Project not indexed.\n\n"
                f"To enable semantic code search, run:\n"
                f" chunkhound index {project_root}\n\n"
                f"This creates a searchable index at:\n"
                f" {db_path}"
            )

        try:
            with _connection_lock:
                conn = _get_cached_connection(str(db_path))
            tables = [row[0] for row in conn.execute("SHOW TABLES").fetchall()]

            stats: dict[str, object] = {"tables": tables, "indexed": True}

            # Get chunk statistics
            if "chunks" in tables:
                chunk_count = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
                stats["chunk_count"] = chunk_count

                # Get language distribution
                try:
                    lang_results = conn.execute("""
                        SELECT f.language, COUNT(*) as cnt
                        FROM chunks c
                        JOIN files f ON c.file_id = f.id
                        GROUP BY f.language
                        ORDER BY cnt DESC
                    """).fetchall()
                    stats["languages"] = {row[0]: row[1] for row in lang_results}
                except Exception:
                    pass  # Language stats are optional

            # Get file statistics
            if "files" in tables:
                file_count = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
                stats["file_count"] = file_count

            # Get embedding statistics
            # Note: embeddings_table is safe - derived from validated config.dimensions (int)
            config = _get_config()
            embeddings_table = f"embeddings_{config.dimensions}"
            if embeddings_table in tables:
                emb_count = conn.execute(
                    f"SELECT COUNT(*) FROM {embeddings_table}"
                ).fetchone()[0]
                stats["embedding_count"] = emb_count

            # Format output
            output_lines = [
                f"Project indexed: {project_root}\n",
                f"Database: {db_path}\n",
            ]

            if "file_count" in stats:
                output_lines.append(f"Files: {stats['file_count']}")
            if "chunk_count" in stats:
                output_lines.append(f"Chunks: {stats['chunk_count']}")
            if "embedding_count" in stats:
                output_lines.append(f"Embeddings: {stats['embedding_count']}")
            if "languages" in stats:
                langs = ", ".join(f"{k}: {v}" for k, v in stats["languages"].items())
                output_lines.append(f"Languages: {langs}")

            output_lines.append(f"\nTables: {', '.join(tables)}")

            for key, value in stats.items():
                s.add(key, value)
            return "\n".join(output_lines)

        except Exception as e:
            s.add("error", str(e))
            return f"Error reading index: {e}"
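
End-to-end, the pack is driven through the three public functions. A minimal sketch of a session (hypothetical paths; assumes the project was indexed with `chunkhound index` using the same model/dimensions as the pack config, and that OPENAI_API_KEY is set in secrets.yaml):

from ot_tools import code_search

# Verify the index exists and inspect its contents.
print(code_search.status(path="/path/to/project"))

# One semantic query, restricted to Python functions, skipping tests.
print(code_search.search(
    query="retry with exponential backoff",
    language="python",
    chunk_type="function",
    exclude="test|mock",
    path="/path/to/project",
))

# Several related queries in a single embeddings call; results are
# deduplicated by file and line range, keeping the best score.
print(code_search.search_batch(
    queries="authentication|token validation|session handling",
    limit=5,
    path="/path/to/project",
))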