deepy-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepy/__init__.py +9 -0
- deepy/__main__.py +7 -0
- deepy/cli.py +413 -0
- deepy/config/__init__.py +21 -0
- deepy/config/settings.py +237 -0
- deepy/data/__init__.py +1 -0
- deepy/data/tools/AskUserQuestion.md +10 -0
- deepy/data/tools/WebFetch.md +9 -0
- deepy/data/tools/WebSearch.md +9 -0
- deepy/data/tools/__init__.py +1 -0
- deepy/data/tools/bash.md +7 -0
- deepy/data/tools/edit.md +13 -0
- deepy/data/tools/modify.md +17 -0
- deepy/data/tools/read.md +8 -0
- deepy/data/tools/write.md +12 -0
- deepy/errors.py +63 -0
- deepy/llm/__init__.py +13 -0
- deepy/llm/agent.py +31 -0
- deepy/llm/context.py +109 -0
- deepy/llm/events.py +187 -0
- deepy/llm/model_capabilities.py +7 -0
- deepy/llm/provider.py +81 -0
- deepy/llm/replay.py +120 -0
- deepy/llm/runner.py +412 -0
- deepy/llm/thinking.py +30 -0
- deepy/prompts/__init__.py +6 -0
- deepy/prompts/compact.py +100 -0
- deepy/prompts/rules.py +24 -0
- deepy/prompts/runtime_context.py +98 -0
- deepy/prompts/system.py +72 -0
- deepy/prompts/tool_docs.py +21 -0
- deepy/sessions/__init__.py +17 -0
- deepy/sessions/jsonl.py +306 -0
- deepy/sessions/manager.py +202 -0
- deepy/skills.py +202 -0
- deepy/status.py +65 -0
- deepy/tools/__init__.py +6 -0
- deepy/tools/agents.py +343 -0
- deepy/tools/builtin.py +2113 -0
- deepy/tools/file_state.py +85 -0
- deepy/tools/result.py +54 -0
- deepy/tools/shell_utils.py +83 -0
- deepy/ui/__init__.py +5 -0
- deepy/ui/app.py +118 -0
- deepy/ui/ask_user_question.py +182 -0
- deepy/ui/exit_summary.py +142 -0
- deepy/ui/loading_text.py +87 -0
- deepy/ui/markdown.py +152 -0
- deepy/ui/message_view.py +546 -0
- deepy/ui/prompt_buffer.py +176 -0
- deepy/ui/prompt_input.py +286 -0
- deepy/ui/session_list.py +140 -0
- deepy/ui/session_picker.py +179 -0
- deepy/ui/slash_commands.py +67 -0
- deepy/ui/styles.py +21 -0
- deepy/ui/terminal.py +959 -0
- deepy/ui/thinking_state.py +29 -0
- deepy/ui/welcome.py +195 -0
- deepy/update_check.py +195 -0
- deepy/usage.py +192 -0
- deepy/utils/__init__.py +15 -0
- deepy/utils/debug_logger.py +62 -0
- deepy/utils/error_logger.py +107 -0
- deepy/utils/json.py +29 -0
- deepy/utils/notify.py +66 -0
- deepy_cli-0.1.1.dist-info/METADATA +205 -0
- deepy_cli-0.1.1.dist-info/RECORD +69 -0
- deepy_cli-0.1.1.dist-info/WHEEL +4 -0
- deepy_cli-0.1.1.dist-info/entry_points.txt +3 -0
deepy/tools/builtin.py
ADDED
|
@@ -0,0 +1,2113 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import math
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import signal
|
|
8
|
+
import shlex
|
|
9
|
+
import subprocess
|
|
10
|
+
import tempfile
|
|
11
|
+
import time
|
|
12
|
+
import urllib.parse
|
|
13
|
+
import urllib.request
|
|
14
|
+
import uuid
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from difflib import unified_diff
|
|
17
|
+
from fnmatch import fnmatch
|
|
18
|
+
from html.parser import HTMLParser
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from deepy.config import Settings
|
|
22
|
+
from deepy.utils import json as json_utils
|
|
23
|
+
|
|
24
|
+
from .file_state import FileSnippet, FileState
|
|
25
|
+
from .result import ToolResult
|
|
26
|
+
from .shell_utils import build_disable_extglob_command
|
|
27
|
+
from .shell_utils import build_shell_init_command
|
|
28
|
+
from .shell_utils import rewrite_windows_null_redirect
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
DEFAULT_LINE_LIMIT = 2_000
|
|
32
|
+
MAX_LINE_LENGTH = 2_000
|
|
33
|
+
MAX_BASH_OUTPUT_CHARS = 30_000
|
|
34
|
+
MAX_BASH_CAPTURE_CHARS = 10 * 1024 * 1024
|
|
35
|
+
MAX_WEB_FETCH_BYTES = 2 * 1024 * 1024
|
|
36
|
+
MAX_WEB_FETCH_OUTPUT_CHARS = 30_000
|
|
37
|
+
DEFAULT_WEB_SEARCH_URL = "https://html.duckduckgo.com/html/"
|
|
38
|
+
DEFAULT_WEB_SEARCH_RESULTS = 8
|
|
39
|
+
PDF_LARGE_PAGE_THRESHOLD = 10
|
|
40
|
+
PDF_MAX_PAGE_RANGE = 20
|
|
41
|
+
MAX_CANDIDATE_COUNT = 5
|
|
42
|
+
MIN_FUZZY_SCORE = 0.45
|
|
43
|
+
IGNORED_DIRECTORY_ENTRIES = {
|
|
44
|
+
".git",
|
|
45
|
+
".mypy_cache",
|
|
46
|
+
".pytest_cache",
|
|
47
|
+
".ruff_cache",
|
|
48
|
+
".venv",
|
|
49
|
+
"__pycache__",
|
|
50
|
+
"build",
|
|
51
|
+
"dist",
|
|
52
|
+
"node_modules",
|
|
53
|
+
"wheels",
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _resolve_in_cwd(cwd: Path, path: str) -> Path:
    """Return *path* fully resolved, anchoring relative input at *cwd*.

    A leading ``~`` is expanded first. Absolute paths are resolved as they
    are; relative paths are joined onto *cwd* before resolution.
    """
    expanded = Path(path).expanduser()
    anchored = expanded if expanded.is_absolute() else cwd / expanded
    return anchored.resolve()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _resolve_read_target(cwd: Path, path: str) -> tuple[Path | None, str | None]:
    """Work out which file a read request refers to.

    Returns a ``(target, error)`` pair in which exactly one element is
    meaningful: a resolved path with ``None`` as the error, or ``None`` with a
    human-readable message.

    Existing paths and absolute paths are returned directly. A missing relative
    path that starts with ``..`` is rejected. Any other missing relative path
    is looked up by suffix under *cwd*: one hit is used, several hits are
    reported as ambiguous, and no hit falls back to the plain resolved path.
    """
    requested = Path(path).expanduser()
    resolved = _resolve_in_cwd(cwd, path)
    if resolved.exists() or requested.is_absolute():
        return resolved, None

    leading_parts = requested.parts[:1]
    if leading_parts and leading_parts[0] == "..":
        return None, "Relative read paths must stay within the current project."

    suffix = _normalize_relative_suffix(path)
    matches = _find_suffix_matches(cwd, suffix) if suffix else []
    if not matches:
        return resolved, None
    if len(matches) == 1:
        return matches[0], None

    # Several candidates: list at most three and summarise the remainder.
    listing = "\n".join(str(match) for match in matches[:3])
    hidden = len(matches) - 3
    tail = f"\n...and {hidden} more." if hidden > 0 else ""
    return (
        None,
        "File path is ambiguous and may refer to multiple files:\n" + listing + tail,
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _snippet_metadata(snippet: FileSnippet) -> dict[str, object]:
    """Describe *snippet* as a plain dict.

    The path and line bounds are emitted under both camelCase and snake_case
    keys, alongside the snippet id.
    """
    file_path = str(snippet.path)
    first_line = snippet.start_line
    last_line = snippet.end_line
    return {
        "id": snippet.id,
        "filePath": file_path,
        "file_path": file_path,
        "startLine": first_line,
        "endLine": last_line,
        "start_line": first_line,
        "end_line": last_line,
    }
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _edit_scope(text: str, snippet: FileSnippet | None) -> tuple[int, int]:
    """Return the ``(start, end)`` character range an edit may touch.

    With a snippet the range covers the snippet's line span; without one it
    covers the whole of *text*.
    """
    if snippet is not None:
        return _line_scope_offsets(text, snippet.start_line, snippet.end_line)
    return 0, len(text)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _line_scope_offsets(text: str, start_line: int, end_line: int) -> tuple[int, int]:
    """Convert a 1-based inclusive line range into character offsets.

    Both bounds are clamped to the lines that actually exist, and the end is
    never allowed to precede the start. Empty text yields ``(0, 0)``.
    """
    line_lengths = [len(line) for line in text.splitlines(keepends=True)]
    total_lines = len(line_lengths)
    if total_lines == 0:
        return 0, 0

    first_index = max(start_line - 1, 0)
    if first_index > total_lines:
        first_index = total_lines
    last_index = max(end_line, first_index)
    if last_index > total_lines:
        last_index = total_lines

    start = sum(line_lengths[:first_index])
    # last_index >= first_index, so the end is the start plus the covered lines.
    return start, start + sum(line_lengths[first_index:last_index])
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@dataclass(frozen=True)
class MatchOccurrence:
    """Location of one matched span inside a file's text."""

    # Character offset in the full text where the match begins.
    start_offset: int
    # Character offset just past the last matched character (exclusive).
    end_offset: int
    # 1-based line number of the first matched character.
    start_line: int
    # 1-based line number of the last matched character.
    end_line: int
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@dataclass(frozen=True)
class ClosestMatch:
    """Best near-miss found for a search string that had no exact match."""

    # The text of the near-miss region.
    text: str
    # 1-based first line of the region.
    start_line: int
    # 1-based last line of the region.
    end_line: int
    # Similarity between the search string and ``text``; higher is closer.
    score: float
    # Name of the search strategy that produced this match.
    strategy: str
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@dataclass(frozen=True)
class TextFileMetadata:
    """Text content of a file together with how it was stored."""

    # File content as text.
    content: str
    # Name of the text encoding; NOTE(review): presumably the codec used to decode the file - confirm at the producer.
    encoding: str
    # Line-ending style; NOTE(review): the exact set of values is not visible here - confirm at the producer.
    line_endings: str
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@dataclass(frozen=True)
class WebSearchPreparation:
    """Result of preparing a web-search query before it is sent.

    Holds the query as given, the query that will actually be used, the
    language chosen for the search with its justification, and whether the
    query was translated along the way.
    """

    original_query: str
    resolved_query: str
    dominant_language: str
    language_reason: str
    translated: bool = False

    def metadata(self) -> dict[str, object]:
        """Return the preparation as a dict; ``query`` mirrors the resolved query."""
        resolved = self.resolved_query
        return dict(
            query=resolved,
            originalQuery=self.original_query,
            resolvedQuery=resolved,
            translated=self.translated,
            dominantLanguage=self.dominant_language,
            languageReason=self.language_reason,
        )
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@dataclass(frozen=True)
class WebSearchResult:
    """A single entry from a web-search results page."""

    # Link text of the result.
    title: str
    # Destination URL of the result.
    url: str
    # Summary text shown with the result; empty when none was captured.
    snippet: str = ""
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _find_occurrences(text: str, needle: str, scope: tuple[int, int]) -> list[MatchOccurrence]:
    """Find every non-overlapping exact occurrence of *needle* within *scope*.

    Args:
        text: Full text being searched.
        needle: Literal string to look for.
        scope: ``(start, end)`` character offsets limiting the search.

    Returns:
        Matches in order of appearance. Offsets are relative to the start of
        *text* (not the scope) and line numbers are 1-based. An empty *needle*
        yields no matches.
    """
    if not needle:
        # str.find("") succeeds at every index without advancing, so the scan
        # below would never terminate and the result list would grow forever.
        return []

    scope_start = scope[0]
    scoped_text = text[scope_start : scope[1]]
    step = len(needle)
    matches: list[MatchOccurrence] = []
    found = scoped_text.find(needle)
    while found != -1:
        start_offset = scope_start + found
        end_offset = start_offset + step
        matches.append(
            MatchOccurrence(
                start_offset=start_offset,
                end_offset=end_offset,
                start_line=_offset_to_line(text, start_offset),
                # end_offset is exclusive; step back one to land on the last matched char.
                end_line=_offset_to_line(text, max(start_offset, end_offset - 1)),
            )
        )
        # Resume after this match so occurrences never overlap.
        found = scoped_text.find(needle, found + step)
    return matches
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _offset_to_line(text: str, offset: int) -> int:
    """Return the 1-based line number containing character *offset*.

    Lines are counted by ``"\\n"`` characters before the offset. Offsets at or
    below zero map to line 1, and offsets past the end are clamped to the text
    length.
    """
    clamped = max(0, min(offset, len(text)))
    return 1 + text.count("\n", 0, clamped)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _build_candidate_metadata(
    file_state: FileState,
    path: Path,
    text: str,
    matches: list[MatchOccurrence],
) -> list[dict[str, object]]:
    """Describe the first few matches as candidate snippets.

    At most ``MAX_CANDIDATE_COUNT`` matches are processed. For each one a
    line-numbered preview is built and registered with *file_state* as a
    snippet; the returned dicts carry the snippet id, line span and preview.
    """
    described: list[dict[str, object]] = []
    for occurrence in matches[:MAX_CANDIDATE_COUNT]:
        first_line = occurrence.start_line
        last_line = occurrence.end_line
        preview = _build_candidate_preview(text, first_line, last_line)
        snippet = file_state.create_snippet(
            path,
            start_line=first_line,
            end_line=last_line,
            text=preview,
        )
        entry: dict[str, object] = {
            "snippet_id": snippet.id,
            "start_line": first_line,
            "end_line": last_line,
            "preview": preview,
        }
        described.append(entry)
    return described
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _build_candidate_preview(text: str, start_line: int, end_line: int) -> str:
    """Render lines *start_line*..*end_line* of *text* with line numbers.

    Each output line is the line number right-aligned to six columns, a tab,
    then the line content. Lines beyond the end of *text* are omitted.
    """
    wanted = text.splitlines()[start_line - 1 : end_line]
    rendered = []
    for number, content in enumerate(wanted, start=start_line):
        rendered.append(f"{str(number).rjust(6)}\t{content}")
    return "\n".join(rendered)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _build_closest_match_metadata(
    file_state: FileState,
    path: Path,
    closest_match: ClosestMatch,
) -> dict[str, object]:
    """Describe a near-miss match as a snippet with a numbered preview.

    The preview shows up to the first ten lines of the matched text, numbered
    from the match's starting line in the file. The region is registered with
    *file_state* as a snippet, and the similarity is rounded to three decimals.
    """
    first_line = closest_match.start_line
    last_line = closest_match.end_line

    preview = _build_candidate_preview(text=closest_match.text, start_line=1, end_line=10)
    if preview:
        # The preview was numbered from 1; shift it to the real file position.
        preview = _renumber_preview(preview, first_line)

    snippet = file_state.create_snippet(
        path,
        start_line=first_line,
        end_line=last_line,
        text=preview,
    )
    return {
        "snippet_id": snippet.id,
        "start_line": first_line,
        "end_line": last_line,
        "similarity": round(closest_match.score, 3),
        "strategy": closest_match.strategy,
        "preview": preview,
    }
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _renumber_preview(preview: str, start_line: int) -> str:
    """Re-number a line-numbered preview so it starts at *start_line*.

    Everything up to and including the first tab on each line is treated as
    the old number and replaced; lines without a tab are kept whole.
    """
    renumbered = []
    for number, row in enumerate(preview.splitlines(), start=start_line):
        _, separator, remainder = row.partition("\t")
        content = remainder if separator else row
        renumbered.append(f"{str(number).rjust(6)}\t{content}")
    return "\n".join(renumbered)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _format_scope_metadata(
    path: Path,
    snippet: FileSnippet | None,
    scope: tuple[int, int],
    text: str,
) -> dict[str, object]:
    """Describe the region an edit was restricted to.

    With a snippet the result is the snippet's metadata tagged as type
    ``"snippet"``. Without one it describes the whole file from line 1 to the
    line holding the last character of *scope*, tagged as type ``"full"``.
    """
    if snippet is None:
        file_path = str(path)
        # scope[1] is exclusive; step back one char to find the last covered line.
        last_line = _offset_to_line(text, max(scope[0], scope[1] - 1))
        return {
            "type": "full",
            "filePath": file_path,
            "file_path": file_path,
            "startLine": 1,
            "endLine": last_line,
            "start_line": 1,
            "end_line": last_line,
            "snippet_id": None,
        }

    described: dict[str, object] = dict(_snippet_metadata(snippet))
    described["type"] = "snippet"
    described["snippet_id"] = snippet.id
    return described
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _apply_replacements(
    text: str,
    matches: list[MatchOccurrence],
    replacement: str,
    replace_all: bool,
) -> str:
    """Return *text* with matched spans swapped for *replacement*.

    Only the first match is replaced unless *replace_all* is true. Matches are
    applied in the order given, so they are expected to be sorted and
    non-overlapping.
    """
    targets = matches if replace_all else matches[:1]
    pieces: list[str] = []
    resume_at = 0
    for occurrence in targets:
        # Keep the untouched text before the match, then substitute.
        pieces.extend((text[resume_at : occurrence.start_offset], replacement))
        resume_at = occurrence.end_offset
    pieces.append(text[resume_at:])
    return "".join(pieces)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _find_loose_escape_occurrences(
    text: str,
    needle: str,
    scope: tuple[int, int],
) -> list[tuple[MatchOccurrence, float, str]]:
    """Find spans in *scope* that match *needle* up to backslash escaping.

    Returns one ``(occurrence, similarity, matched_text)`` tuple per regex hit,
    in order of appearance. Offsets are relative to the whole *text*. An empty
    list is returned when no pattern can be built for *needle*.
    """
    pattern = _build_loose_escape_pattern(needle)
    if pattern is None:
        return []

    base = scope[0]
    window = text[base : scope[1]]
    wanted = _normalize_loose_text(needle)

    hits: list[tuple[MatchOccurrence, float, str]] = []
    for regex_hit in pattern.finditer(window):
        relative_start, relative_end = regex_hit.span()
        begin = base + relative_start
        finish = base + relative_end
        occurrence = MatchOccurrence(
            start_offset=begin,
            end_offset=finish,
            start_line=_offset_to_line(text, begin),
            end_line=_offset_to_line(text, max(begin, finish - 1)),
        )
        hit_text = regex_hit.group(0)
        similarity = _similarity_score(wanted, _normalize_loose_text(hit_text))
        hits.append((occurrence, similarity, hit_text))
    return hits
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def _build_loose_escape_pattern(source: str) -> re.Pattern[str] | None:
    """Compile a regex matching *source* with flexible quote escaping.

    A run of backslashes directly before ``"``, ``'`` or a backtick is replaced
    by "any number of backslashes", so differently escaped quotes still match.
    Everything else, including backslashes not followed by a quote, must match
    literally. Returns ``None`` for an empty *source*.
    """
    if not source:
        return None

    escapable = "\"'`\\"
    length = len(source)
    fragments: list[str] = []
    position = 0
    while position < length:
        char = source[position]
        if char != "\\":
            fragments.append(re.escape(char))
            position += 1
            continue

        # Measure the whole run of consecutive backslashes.
        run_end = position
        while run_end < length and source[run_end] == "\\":
            run_end += 1

        if run_end < length and source[run_end] in escapable:
            # Backslashes escaping a quote: accept any amount of them.
            fragments.append(r"\\*")
            fragments.append(re.escape(source[run_end]))
            position = run_end + 1
        else:
            fragments.append(re.escape(source[position:run_end]))
            position = run_end
    return re.compile("".join(fragments))
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def _find_closest_match(
    text: str,
    needle: str,
    scope: tuple[int, int],
) -> ClosestMatch | None:
    """Find the region of *text* within *scope* that most resembles *needle*.

    Two strategies are tried in order:

    1. ``loose_escape`` - spans equal to *needle* apart from backslash
       escaping. If any exist, the highest-scoring one is returned.
    2. ``fuzzy_window`` - windows of whole lines, sized to the needle's line
       count and one line either side, scored by similarity. Windows scoring
       below ``MIN_FUZZY_SCORE`` are ignored.

    Ties go to the earliest candidate. Returns ``None`` when neither strategy
    finds a candidate.
    """
    best_loose: ClosestMatch | None = None
    for occurrence, score, matched_text in _find_loose_escape_occurrences(text, needle, scope):
        if best_loose is None or score > best_loose.score:
            best_loose = ClosestMatch(
                text=matched_text,
                start_line=occurrence.start_line,
                end_line=occurrence.end_line,
                score=score,
                strategy="loose_escape",
            )
    if best_loose is not None:
        return best_loose

    normalized_target = _normalize_loose_text(needle)
    target_line_count = max(1, len(needle.splitlines()))
    window_sizes = sorted({max(1, target_line_count - 1), target_line_count, target_line_count + 1})
    start_line = _offset_to_line(text, scope[0])
    end_line = _offset_to_line(text, max(scope[0], scope[1] - 1))

    # Split once up front: re-splitting the whole text for every candidate
    # window made this scan quadratic in the size of the file.
    lines = text.splitlines(keepends=True)

    best_match: ClosestMatch | None = None
    for line in range(start_line, end_line + 1):
        for window_size in window_sizes:
            candidate_end = line + window_size - 1
            if candidate_end > end_line:
                # Window would run past the scope; skip it.
                continue
            candidate_text = "".join(lines[line - 1 : candidate_end])
            score = _similarity_score(normalized_target, _normalize_loose_text(candidate_text))
            if score < MIN_FUZZY_SCORE:
                continue
            if best_match is None or score > best_match.score:
                best_match = ClosestMatch(
                    text=candidate_text,
                    start_line=line,
                    end_line=candidate_end,
                    score=score,
                    strategy="fuzzy_window",
                )
    return best_match
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _correct_escaped_strings_with_llm(
    settings: Settings,
    *,
    snippet_text: str,
    old: str,
    new: str,
    matched_text: str,
) -> tuple[str, str] | None:
    """Ask the configured model to repair escaping in an edit's old/new strings.

    Returns ``(corrected_old, corrected_new)`` only when the model's answer
    parses, both corrected strings equal the originals after loose
    normalisation, and the two corrected strings differ from each other.
    Returns ``None`` in every other case: incomplete model configuration, an
    unusable answer, or any exception raised along the way (best effort).
    """
    model = settings.model
    if not (model.api_key and model.base_url and model.name):
        return None

    try:
        reply = _edit_correction_chat(settings, snippet_text, old, new, matched_text)
        parsed = _parse_corrected_edit_strings(reply)
        if parsed is None:
            return None
        fixed_old, fixed_new = parsed
        # The model may only change escaping, never the meaning.
        if _normalize_loose_text(fixed_old) != _normalize_loose_text(old):
            return None
        if _normalize_loose_text(fixed_new) != _normalize_loose_text(new):
            return None
        # An edit replacing text with itself is useless.
        return None if fixed_old == fixed_new else (fixed_old, fixed_new)
    except Exception:
        return None
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def _edit_correction_chat(
    settings: Settings,
    snippet_text: str,
    old: str,
    new: str,
    matched_text: str,
) -> str:
    """Send an escaping-correction request to the configured chat model.

    The request carries the snippet, the original old/new edit strings and the
    loosely matched text, each wrapped in a CDATA section, and asks for an XML
    response containing the corrected strings.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when that content is not a string.
    """
    # Imported here so the openai package is only loaded when this call is made.
    from openai import OpenAI

    client = OpenAI(api_key=settings.model.api_key, base_url=settings.model.base_url)
    response = client.chat.completions.create(
        model=settings.model.name,
        messages=[
            {
                "role": "system",
                "content": (
                    "You correct file-edit strings when the only problem is escaping. "
                    "Return XML only using <response><corrected_old_string>...</corrected_old_string>"
                    "<corrected_new_string>...</corrected_new_string></response>. "
                    "Do not change semantics; only fix quoting or escaping so corrected_old_string "
                    "matches the snippet exactly."
                ),
            },
            {
                "role": "user",
                "content": (
                    "<request>\n"
                    f" <snippet_text><![CDATA[{snippet_text}]]></snippet_text>\n"
                    f" <old_string><![CDATA[{old}]]></old_string>\n"
                    f" <new_string><![CDATA[{new}]]></new_string>\n"
                    f" <matched_text><![CDATA[{matched_text}]]></matched_text>\n"
                    "</request>\n"
                    "<output_format>\n"
                    " <response>\n"
                    " <corrected_old_string><![CDATA[...]]></corrected_old_string>\n"
                    " <corrected_new_string><![CDATA[...]]></corrected_new_string>\n"
                    " </response>\n"
                    "</output_format>"
                ),
            },
        ],
    )
    # Only the first choice is consulted.
    content = response.choices[0].message.content
    return content.strip() if isinstance(content, str) else ""
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def _parse_corrected_edit_strings(content: str) -> tuple[str, str] | None:
    """Extract the corrected old/new strings from a model's XML reply.

    Each of ``<corrected_old_string>`` and ``<corrected_new_string>`` may hold
    its value either in a CDATA section or as plain element text. Tag names
    are matched case-insensitively and an enclosing code fence is ignored.

    Returns:
        ``(corrected_old, corrected_new)`` when both elements are present,
        otherwise ``None``. An empty element is a valid (empty) string.
    """
    normalized = _strip_code_fence(content).strip()
    if not normalized:
        return None

    def extract(tag: str) -> str | None:
        found = re.search(
            rf"<{tag}>(?:<!\[CDATA\[([\s\S]*?)\]\]>|([\s\S]*?))</{tag}>",
            normalized,
            flags=re.IGNORECASE,
        )
        if found is None:
            return None
        cdata_value, plain_value = found.groups()
        # Exactly one group participates in a match. Choose by ``is not None``
        # rather than truthiness so an empty CDATA payload stays "" instead of
        # collapsing to None and discarding an otherwise valid reply.
        return cdata_value if cdata_value is not None else plain_value

    corrected_old = extract("corrected_old_string")
    corrected_new = extract("corrected_new_string")
    if corrected_old is None or corrected_new is None:
        return None
    return corrected_old, corrected_new
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def _slice_lines(text: str, start_line: int, end_line: int) -> str:
    """Return lines *start_line*..*end_line* (1-based, inclusive) of *text*.

    Line terminators are preserved, so the result is an exact substring.
    """
    first_index = start_line - 1
    selected = text.splitlines(keepends=True)[first_index:end_line]
    return "".join(selected)
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def _normalize_loose_text(value: str) -> str:
    """Normalise *value* for escaping- and spacing-insensitive comparison.

    Line endings become ``"\\n"``, backslashes that precede a quote, backtick
    or another backslash are removed, runs of spaces and tabs collapse to one
    space, and surrounding whitespace is stripped.
    """
    unix_text = value.replace("\r\n", "\n")
    unix_text = unix_text.replace("\r", "\n")
    unescaped = re.sub(r"\\+(?=[\"'`\\])", "", unix_text)
    return re.sub(r"[ \t]+", " ", unescaped).strip()
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
def _similarity_score(left: str, right: str) -> float:
    """Score how alike two strings are, from 0.0 to 1.0.

    Equal strings score 1.0; if exactly one is empty the score is 0.0.
    Otherwise the score is twice the number of shared bigrams (counted with
    multiplicity) divided by the total number of bigrams in both strings.
    """
    if left == right:
        return 1.0
    if not (left and right):
        return 0.0

    left_grams = _to_bigrams(left)
    right_grams = _to_bigrams(right)
    if not (left_grams and right_grams):
        return 1.0 if left == right else 0.0

    # Tally the right side, then consume entries as the left side matches them.
    available: dict[str, int] = {}
    for gram in right_grams:
        available[gram] = available.get(gram, 0) + 1

    shared = 0
    for gram in left_grams:
        remaining = available.get(gram, 0)
        if remaining > 0:
            available[gram] = remaining - 1
            shared += 1
    return (2 * shared) / (len(left_grams) + len(right_grams))
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def _to_bigrams(value: str) -> list[str]:
    """Return the overlapping two-character slices of *value*, in order.

    Strings shorter than two characters are returned whole as a single entry.
    """
    if len(value) < 2:
        return [value]
    return [first + second for first, second in zip(value, value[1:])]
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def _prepare_web_search_query(query: str) -> WebSearchPreparation:
    """Prepare *query* for searching without consulting a model.

    Whitespace is collapsed to single spaces. The language is ``"zh"`` when
    the query contains Chinese characters and ``"en"`` otherwise. The query is
    never translated.
    """
    collapsed = " ".join(query.split())
    if _contains_chinese_char(collapsed):
        language = "zh"
        reason = "The query contains Chinese characters."
    else:
        language = "en"
        reason = "The query does not contain Chinese characters."
    return WebSearchPreparation(
        original_query=query,
        resolved_query=collapsed,
        dominant_language=language,
        language_reason=reason,
    )
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def _prepare_web_search_query_with_llm(
    query: str,
    settings: Settings,
) -> tuple[WebSearchPreparation, str | None]:
    """Prepare *query* for searching, letting the model pick the language.

    The model decides whether English or Chinese suits the topic better. If
    the query is not already in that language it is translated; an empty
    translation leaves the query as it was.

    Returns:
        ``(preparation, error)``. ``error`` is ``None`` on success. If the
        model configuration is incomplete or any step raises, the model-free
        preparation is returned together with a message saying why.
    """
    collapsed = " ".join(query.split())
    model = settings.model
    if not (model.api_key and model.base_url and model.name):
        return (
            _prepare_web_search_query(query),
            "WebSearch default mode requires a valid LLM configuration.",
        )

    try:
        decision = _decide_search_language_with_llm(collapsed, settings)
        has_chinese = _contains_chinese_char(collapsed)
        language = decision["dominant_language"]

        # Translate only when the query's script disagrees with the chosen language.
        if language == "en" and has_chinese:
            target_language = "English"
        elif language == "zh" and not has_chinese:
            target_language = "Chinese"
        else:
            target_language = None

        resolved = collapsed
        was_translated = False
        if target_language is not None:
            translation = _translate_search_query_with_llm(collapsed, target_language, settings)
            if translation:
                resolved = translation
                was_translated = True

        return (
            WebSearchPreparation(
                original_query=query,
                resolved_query=resolved,
                dominant_language=language,
                language_reason=decision["reason"],
                translated=was_translated,
            ),
            None,
        )
    except Exception as exc:
        return _prepare_web_search_query(query), str(exc)
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
def _decide_search_language_with_llm(query: str, settings: Settings) -> dict[str, str]:
    """Ask the model whether *query* is better searched in English or Chinese.

    Returns:
        A dict with ``"dominant_language"`` (``"en"`` or ``"zh"``) and
        ``"reason"`` (an empty string when the model gave no string reason).

    Raises:
        ValueError: If the reply is not a JSON object or names any other language.
    """
    prompt = (
        "Decide whether the topic below has more useful online material in English or Chinese.\n\n"
        "Topic:\n"
        "```text\n"
        f"{query}\n"
        "```\n\n"
        "Return strict JSON:\n"
        '{"dominant_language":"en"|"zh","reason":"one short sentence"}\n'
        "Do not include markdown or any extra text."
    )
    reply = _web_search_chat(settings, prompt)
    answer = _parse_json_response(reply)

    language = answer.get("dominant_language")
    if language not in {"en", "zh"}:
        raise ValueError(f"Unexpected dominant language: {language}")

    stated_reason = answer.get("reason")
    if not isinstance(stated_reason, str):
        stated_reason = ""
    return {"dominant_language": language, "reason": stated_reason}
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
def _translate_search_query_with_llm(query: str, target_language: str, settings: Settings) -> str:
    """Translate *query* into *target_language* using the model.

    The reply is stripped of any enclosing code fence, surrounding whitespace
    and surrounding quote characters before being returned.
    """
    prompt = (
        f"Translate the query text below into {target_language}.\n\n"
        "Requirements:\n"
        "- Preserve product names, library names, API names, versions, and abbreviations when appropriate.\n"
        "- Return only the translated query, without quotes or explanation.\n\n"
        "Query:\n"
        "```text\n"
        f"{query}\n"
        "```"
    )
    reply = _web_search_chat(settings, prompt)
    unfenced = _strip_code_fence(reply).strip()
    return unfenced.strip("\"'")
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
def _web_search_chat(settings: Settings, prompt: str) -> str:
    """Send *prompt* as a single user message and return the reply text.

    String content is returned stripped. List content has the ``text`` of
    each part (dict key or attribute) joined with newlines. Any other content
    type yields an empty string. Only the first choice is read.
    """
    from openai import OpenAI

    client = OpenAI(api_key=settings.model.api_key, base_url=settings.model.base_url)
    response = client.chat.completions.create(
        model=settings.model.name,
        messages=[{"role": "user", "content": prompt}],
    )
    content = response.choices[0].message.content

    if isinstance(content, str):
        return content.strip()
    if not isinstance(content, list):
        return ""

    collected = []
    for part in content:
        if isinstance(part, dict):
            piece = part.get("text")
        else:
            piece = getattr(part, "text", "")
        if isinstance(piece, str):
            collected.append(piece)
    return "\n".join(collected).strip()
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def _parse_json_response(text: str) -> dict[str, object]:
    """Parse a model reply that should contain a JSON object.

    An enclosing code fence is removed first. If the whole reply is not valid
    JSON, the span from the first ``{`` to the last ``}`` is parsed instead.

    Raises:
        ValueError: If no brace-delimited span exists or the parsed value is
            not a JSON object.
    """
    cleaned = _strip_code_fence(text).strip()
    try:
        parsed = json_utils.loads(cleaned)
    except json_utils.JSONDecodeError:
        # Fall back to the outermost brace-delimited span, if there is one.
        opening = cleaned.find("{")
        closing = cleaned.rfind("}")
        if not (0 <= opening < closing):
            raise ValueError(f"Failed to parse JSON response: {cleaned or '<empty>'}")
        parsed = json_utils.loads(cleaned[opening : closing + 1])

    if isinstance(parsed, dict):
        return parsed
    raise ValueError("JSON response must be an object.")
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
def _strip_code_fence(text: str) -> str:
    """Return *text* without an enclosing Markdown code fence.

    If the trimmed text is entirely one fenced block (optionally with a
    language tag) its body is returned; otherwise the trimmed text is
    returned unchanged.
    """
    trimmed = text.strip()
    fenced = re.match(r"^```(?:[\w-]+)?\n([\s\S]*?)\n```$", trimmed)
    if fenced is None:
        return trimmed
    return fenced.group(1)
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
def _contains_chinese_char(text: str) -> bool:
    """Return True if *text* has any character in the range U+4E00..U+9FFF."""
    for char in text:
        if "\u4e00" <= char <= "\u9fff":
            return True
    return False
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
def _format_web_search_activity_label(query: str) -> str:
    """Build the ``WebSearch: ...`` activity label for *query*.

    Whitespace is collapsed to single spaces. Queries longer than 180
    characters are cut to 177 characters followed by ``...``.
    """
    compact = " ".join(query.split())
    shown = compact if len(compact) <= 180 else compact[:177] + "..."
    return f"WebSearch: {shown}"
|
|
692
|
+
|
|
693
|
+
|
|
694
|
+
class _SearchResultParser(HTMLParser):
    """Collect search results from a results page.

    An ``<a>`` with class ``result__a`` supplies a result's title and URL. Any
    element with class ``result__snippet`` supplies the summary for the most
    recently collected result. Parsed entries accumulate in ``results``.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.results: list[WebSearchResult] = []
        # Text pieces of the title being read; None while outside a title link.
        self._current_title: list[str] | None = None
        self._current_url: str = ""
        # Index into ``results`` of the entry whose snippet is being read.
        self._snippet_index: int | None = None
        self._snippet_chunks: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        attributes = {name: (value or "") for name, value in attrs}
        css_classes = set(attributes.get("class", "").split())
        if tag == "a" and "result__a" in css_classes:
            self._current_title = []
            self._current_url = _decode_search_result_url(attributes.get("href", ""))
        elif "result__snippet" in css_classes and self.results:
            # A snippet always belongs to the result collected just before it.
            self._snippet_index = len(self.results) - 1
            self._snippet_chunks = []

    def handle_data(self, data: str) -> None:
        # Title text takes precedence over snippet text.
        if self._current_title is not None:
            sink = self._current_title
        elif self._snippet_index is not None:
            sink = self._snippet_chunks
        else:
            return
        sink.append(data)

    def handle_endtag(self, tag: str) -> None:
        if tag == "a" and self._current_title is not None:
            self._finish_title()
        elif self._snippet_index is not None and tag in {"a", "div", "td"}:
            self._finish_snippet()

    def _finish_title(self) -> None:
        # Store the collected title as a new result, then reset title state.
        title = " ".join("".join(self._current_title).split())
        if title and self._current_url:
            self.results.append(WebSearchResult(title=title, url=self._current_url))
        self._current_title = None
        self._current_url = ""

    def _finish_snippet(self) -> None:
        # Attach the collected snippet to its result, then reset snippet state.
        snippet = " ".join("".join(self._snippet_chunks).split())
        if snippet:
            index = self._snippet_index
            existing = self.results[index]
            self.results[index] = WebSearchResult(
                title=existing.title,
                url=existing.url,
                snippet=snippet,
            )
        self._snippet_index = None
        self._snippet_chunks = []
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def _decode_search_result_url(href: str) -> str:
    """Turn a result link's ``href`` into the destination URL.

    Redirect links carry the real target in a ``uddg`` query parameter, which
    wins when present. Otherwise a link with both scheme and host is returned
    unchanged, and anything else is resolved against ``https://duckduckgo.com``.
    """
    parts = urllib.parse.urlparse(href)
    redirect_target = urllib.parse.parse_qs(parts.query).get("uddg", [""])[0]
    if redirect_target:
        return redirect_target
    if not (parts.scheme and parts.netloc):
        return urllib.parse.urljoin("https://duckduckgo.com", href)
    return href
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
def _parse_search_results(html: str) -> list[WebSearchResult]:
    """Parse a search results page and return its results without duplicates.

    Results are de-duplicated by URL; the first result seen for a URL is
    kept and the original ordering is preserved.
    """
    extractor = _SearchResultParser()
    extractor.feed(html)
    # dict preserves insertion order, so setdefault keeps the first hit per URL.
    first_by_url: dict[str, WebSearchResult] = {}
    for candidate in extractor.results:
        first_by_url.setdefault(candidate.url, candidate)
    return list(first_by_url.values())
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
def _format_search_results(query: str, results: list[WebSearchResult]) -> str:
    """Render search results as a numbered plain-text listing.

    At most ``DEFAULT_WEB_SEARCH_RESULTS`` results are shown. Each entry is
    a numbered title line, a URL line, an optional snippet line, and a
    blank separator line.
    """
    out = [f"Web search results for: {query}", ""]
    shown = results[:DEFAULT_WEB_SEARCH_RESULTS]
    for rank, hit in enumerate(shown, start=1):
        entry = [f"{rank}. {hit.title}", f" {hit.url}"]
        if hit.snippet:
            entry.append(f" {hit.snippet}")
        entry.append("")
        out.extend(entry)
    return "\n".join(out).strip()
|
|
774
|
+
|
|
775
|
+
|
|
776
|
+
class _ReadableHtmlParser(HTMLParser):
|
|
777
|
+
BLOCK_TAGS = {
|
|
778
|
+
"address",
|
|
779
|
+
"article",
|
|
780
|
+
"aside",
|
|
781
|
+
"blockquote",
|
|
782
|
+
"br",
|
|
783
|
+
"dd",
|
|
784
|
+
"div",
|
|
785
|
+
"dl",
|
|
786
|
+
"dt",
|
|
787
|
+
"figcaption",
|
|
788
|
+
"figure",
|
|
789
|
+
"footer",
|
|
790
|
+
"h1",
|
|
791
|
+
"h2",
|
|
792
|
+
"h3",
|
|
793
|
+
"h4",
|
|
794
|
+
"h5",
|
|
795
|
+
"h6",
|
|
796
|
+
"header",
|
|
797
|
+
"hr",
|
|
798
|
+
"li",
|
|
799
|
+
"main",
|
|
800
|
+
"nav",
|
|
801
|
+
"ol",
|
|
802
|
+
"p",
|
|
803
|
+
"pre",
|
|
804
|
+
"section",
|
|
805
|
+
"table",
|
|
806
|
+
"tbody",
|
|
807
|
+
"td",
|
|
808
|
+
"tfoot",
|
|
809
|
+
"th",
|
|
810
|
+
"thead",
|
|
811
|
+
"tr",
|
|
812
|
+
"ul",
|
|
813
|
+
}
|
|
814
|
+
SKIP_TAGS = {"script", "style", "noscript", "svg"}
|
|
815
|
+
|
|
816
|
+
def __init__(self) -> None:
|
|
817
|
+
super().__init__(convert_charrefs=True)
|
|
818
|
+
self.title_parts: list[str] = []
|
|
819
|
+
self.text_parts: list[str] = []
|
|
820
|
+
self._in_title = False
|
|
821
|
+
self._skip_depth = 0
|
|
822
|
+
|
|
823
|
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
824
|
+
del attrs
|
|
825
|
+
normalized = tag.lower()
|
|
826
|
+
if normalized in self.SKIP_TAGS:
|
|
827
|
+
self._skip_depth += 1
|
|
828
|
+
return
|
|
829
|
+
if normalized == "title":
|
|
830
|
+
self._in_title = True
|
|
831
|
+
return
|
|
832
|
+
if normalized in self.BLOCK_TAGS:
|
|
833
|
+
self._append_newline()
|
|
834
|
+
|
|
835
|
+
def handle_endtag(self, tag: str) -> None:
|
|
836
|
+
normalized = tag.lower()
|
|
837
|
+
if normalized in self.SKIP_TAGS and self._skip_depth > 0:
|
|
838
|
+
self._skip_depth -= 1
|
|
839
|
+
return
|
|
840
|
+
if normalized == "title":
|
|
841
|
+
self._in_title = False
|
|
842
|
+
return
|
|
843
|
+
if normalized in self.BLOCK_TAGS:
|
|
844
|
+
self._append_newline()
|
|
845
|
+
|
|
846
|
+
def handle_data(self, data: str) -> None:
|
|
847
|
+
text = " ".join(data.split())
|
|
848
|
+
if not text:
|
|
849
|
+
return
|
|
850
|
+
if self._in_title:
|
|
851
|
+
self.title_parts.append(text)
|
|
852
|
+
return
|
|
853
|
+
if self._skip_depth:
|
|
854
|
+
return
|
|
855
|
+
self.text_parts.append(text)
|
|
856
|
+
|
|
857
|
+
def _append_newline(self) -> None:
|
|
858
|
+
if self.text_parts and self.text_parts[-1] != "\n":
|
|
859
|
+
self.text_parts.append("\n")
|
|
860
|
+
|
|
861
|
+
@property
|
|
862
|
+
def title(self) -> str:
|
|
863
|
+
return " ".join(self.title_parts).strip()
|
|
864
|
+
|
|
865
|
+
@property
|
|
866
|
+
def readable_text(self) -> str:
|
|
867
|
+
raw = " ".join(self.text_parts)
|
|
868
|
+
raw = re.sub(r"[ \t]*\n[ \t]*", "\n", raw)
|
|
869
|
+
raw = re.sub(r"\n{3,}", "\n\n", raw)
|
|
870
|
+
return "\n".join(line.strip() for line in raw.splitlines()).strip()
|
|
871
|
+
|
|
872
|
+
|
|
873
|
+
def _validate_web_fetch_url(url: str) -> tuple[str | None, str | None]:
|
|
874
|
+
stripped = url.strip()
|
|
875
|
+
parsed = urllib.parse.urlparse(stripped)
|
|
876
|
+
if parsed.scheme not in {"http", "https"} or not parsed.netloc:
|
|
877
|
+
return None, "WebFetch requires a complete http or https URL."
|
|
878
|
+
return stripped, None
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
def _charset_from_content_type(content_type: str) -> str:
|
|
882
|
+
match = re.search(r"charset=([^\s;]+)", content_type, flags=re.IGNORECASE)
|
|
883
|
+
return match.group(1).strip("\"'") if match else "utf-8"
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
def _is_html_response(content_type: str, text: str) -> bool:
|
|
887
|
+
lowered = content_type.lower()
|
|
888
|
+
if "html" in lowered:
|
|
889
|
+
return True
|
|
890
|
+
prefix = text[:500].lower()
|
|
891
|
+
return "<html" in prefix or "<!doctype html" in prefix
|
|
892
|
+
|
|
893
|
+
|
|
894
|
+
def _extract_readable_html(html: str) -> tuple[str, str]:
    """Return ``(title, readable_text)`` extracted from an HTML document."""
    extractor = _ReadableHtmlParser()
    extractor.feed(html)
    # close() flushes any text the parser is still buffering.
    extractor.close()
    page_title = extractor.title
    body_text = extractor.readable_text
    return page_title, body_text
|
|
899
|
+
|
|
900
|
+
|
|
901
|
+
def _format_web_fetch_output(
|
|
902
|
+
*,
|
|
903
|
+
url: str,
|
|
904
|
+
final_url: str,
|
|
905
|
+
content_type: str,
|
|
906
|
+
title: str,
|
|
907
|
+
text: str,
|
|
908
|
+
bytes_truncated: bool,
|
|
909
|
+
) -> str:
|
|
910
|
+
lines = [
|
|
911
|
+
f"URL: {url}",
|
|
912
|
+
f"Final URL: {final_url}",
|
|
913
|
+
]
|
|
914
|
+
if title:
|
|
915
|
+
lines.append(f"Title: {title}")
|
|
916
|
+
if content_type:
|
|
917
|
+
lines.append(f"Content-Type: {content_type}")
|
|
918
|
+
if bytes_truncated:
|
|
919
|
+
lines.append(f"Note: response body was truncated at {MAX_WEB_FETCH_BYTES:,} bytes.")
|
|
920
|
+
lines.append("")
|
|
921
|
+
lines.append(text.strip() if text.strip() else "[No readable text extracted.]")
|
|
922
|
+
return "\n".join(lines).strip()
|
|
923
|
+
|
|
924
|
+
|
|
925
|
+
@dataclass
|
|
926
|
+
class ToolRuntime:
|
|
927
|
+
cwd: Path
|
|
928
|
+
settings: Settings
|
|
929
|
+
file_state: FileState = field(default_factory=FileState)
|
|
930
|
+
running_processes: dict[str, dict[str, str]] = field(default_factory=dict)
|
|
931
|
+
|
|
932
|
+
def read(
    self,
    path: str,
    start_line: int = 1,
    limit: int | None = None,
    pages: str | None = None,
) -> str:
    """Read a path and return the outcome as a JSON tool result.

    Directories produce an entry listing, ``.ipynb`` files a formatted
    notebook, ``.pdf`` files are delegated to ``_read_pdf`` (the only
    consumer of ``pages``), and images are returned through a follow-up
    message. Any other file is read as text and returned as numbered
    lines, starting at ``start_line`` (1-based, clamped to 1) and limited
    to ``limit`` lines (``DEFAULT_LINE_LIMIT`` when unset or non-positive).

    A text read that starts at line 1 and is not truncated marks the file
    as fully read; any other non-empty text read registers a snippet and
    marks the file as partially read.
    """
    name = "read"
    target, error = _resolve_read_target(self.cwd, path)
    if error is not None:
        return ToolResult.error_result(name, error).to_json()
    if target is None or not target.exists():
        return ToolResult.error_result(name, f"File does not exist: {path}").to_json()

    if target.is_dir():
        listing, shown, hidden = _format_directory_entries(target, self.cwd)
        directory_info = {
            "path": str(target),
            "kind": "directory",
            "entryCount": len(list(target.iterdir())),
            "visibleEntryCount": shown,
            "ignoredEntryCount": hidden,
        }
        return ToolResult.ok_result(name, listing, metadata=directory_info).to_json()

    suffix = target.suffix.lower()

    if suffix == ".ipynb":
        rendered, error = _format_notebook(target)
        if error is not None:
            return ToolResult.error_result(
                name, error, metadata={"path": str(target)}
            ).to_json()
        notebook_info = {
            "path": str(target),
            "kind": "notebook",
            "trackedForWrite": False,
        }
        return ToolResult.ok_result(name, rendered, metadata=notebook_info).to_json()

    if suffix == ".pdf":
        return _read_pdf(target, pages)

    mime = _image_mime_type(suffix)
    if mime is not None:
        data = target.read_bytes()
        return ToolResult(
            ok=True,
            name=name,
            output="File loaded.",
            metadata={"path": str(target), "mime": mime, "bytes": len(data)},
            followUpMessages=[_build_image_follow_up_message(target, mime, data)],
        ).to_json()

    # Plain text: select the requested window of lines.
    decoded = _read_text_metadata(target)
    all_lines = decoded.content.splitlines()
    offset = max(start_line, 1) - 1
    if limit and limit > 0:
        window = limit
    else:
        window = DEFAULT_LINE_LIMIT
    chosen = all_lines[offset : offset + window]
    display_lines = [_truncate_line(line) for line in chosen]
    more_lines_follow = offset + len(chosen) < len(all_lines)
    truncated = more_lines_follow or any(len(line) > MAX_LINE_LENGTH for line in chosen)
    whole_file = offset == 0 and not truncated
    numbered = "\n".join(
        f"{number}: {line}" for number, line in enumerate(display_lines, start=offset + 1)
    )

    snippet_info = None
    if whole_file:
        self.file_state.mark_read(target)
    elif chosen:
        snippet = self.file_state.create_snippet(
            target,
            start_line=offset + 1,
            end_line=offset + len(chosen),
            text="\n".join(chosen),
        )
        self.file_state.mark_read(target, full=False)
        snippet_info = _snippet_metadata(snippet)

    metadata = {
        "path": str(target),
        "kind": "file",
        "startLine": offset + 1,
        "lineCount": len(chosen),
        "lineLimit": window,
        "totalLines": len(all_lines),
        "truncated": truncated,
        "trackedForWrite": whole_file,
        "encoding": decoded.encoding,
    }
    if snippet_info is not None:
        metadata["snippet"] = snippet_info
    return ToolResult.ok_result(name, numbered, metadata=metadata).to_json()
|
|
1031
|
+
|
|
1032
|
+
def modify(
|
|
1033
|
+
self,
|
|
1034
|
+
path: str | None,
|
|
1035
|
+
*,
|
|
1036
|
+
content: object | None = None,
|
|
1037
|
+
old: str | None = None,
|
|
1038
|
+
new: str | None = None,
|
|
1039
|
+
replace_all: bool = False,
|
|
1040
|
+
snippet_id: str | None = None,
|
|
1041
|
+
) -> str:
|
|
1042
|
+
has_content = content is not None
|
|
1043
|
+
has_replacement = old is not None or new is not None
|
|
1044
|
+
if has_content and has_replacement:
|
|
1045
|
+
return ToolResult.error_result(
|
|
1046
|
+
"modify",
|
|
1047
|
+
"Use either content for a new file or old_string/new_string for an existing file, not both.",
|
|
1048
|
+
).to_json()
|
|
1049
|
+
if has_content:
|
|
1050
|
+
if not path:
|
|
1051
|
+
return ToolResult.error_result("modify", "file_path is required for new files.").to_json()
|
|
1052
|
+
target = _resolve_in_cwd(self.cwd, path)
|
|
1053
|
+
if target.exists():
|
|
1054
|
+
return ToolResult.error_result(
|
|
1055
|
+
"modify",
|
|
1056
|
+
"File already exists. Read it and use old_string/new_string with modify instead of content.",
|
|
1057
|
+
metadata={"path": str(target)},
|
|
1058
|
+
).to_json()
|
|
1059
|
+
return self.write(path, content)
|
|
1060
|
+
if old is None or new is None:
|
|
1061
|
+
return ToolResult.error_result(
|
|
1062
|
+
"modify",
|
|
1063
|
+
"Provide content for a new file, or both old_string and new_string for an existing file.",
|
|
1064
|
+
).to_json()
|
|
1065
|
+
return self.edit(path, old, new, replace_all=replace_all, snippet_id=snippet_id)
|
|
1066
|
+
|
|
1067
|
+
def write(self, path: str, content: object) -> str:
    """Write ``content`` to ``path`` and return a JSON tool result with a diff.

    The write is refused unless ``file_state.check_writable`` approves it.
    When the file already exists its encoding is reused and its content
    decides the line-ending style; otherwise ``utf8`` is used and the new
    content decides. Parent directories are created as needed.
    """
    name = "write"
    target = _resolve_in_cwd(self.cwd, path)

    ok, error = self.file_state.check_writable(target, require_read=True)
    if not ok:
        return ToolResult.error_result(name, error or "File is not writable.").to_json()

    text_content, repair_metadata, content_error = _coerce_write_content(target, content)
    if content_error is not None:
        return ToolResult.error_result(name, content_error).to_json()

    if target.exists():
        existing = _read_text_metadata(target)
        previous_text = existing.content
        encoding = existing.encoding
    else:
        previous_text = ""
        encoding = "utf8"

    # An empty existing file falls back to the new content for detection.
    line_endings = _detect_line_endings(previous_text or text_content)
    final_text = _normalize_line_endings(text_content, line_endings)

    target.parent.mkdir(parents=True, exist_ok=True)
    _write_text_with_encoding(target, final_text, encoding)
    self.file_state.mark_written(target)

    diff = _unified_diff(previous_text, final_text, path=str(target))
    details = {
        "path": str(target),
        "encoding": encoding,
        "line_endings": line_endings,
        **repair_metadata,
        "diff": diff,
        "diff_preview": diff,
    }
    return ToolResult.ok_result(name, f"Wrote {target}", metadata=details).to_json()
|
|
1097
|
+
|
|
1098
|
+
def edit(
    self,
    path: str | None,
    old: str,
    new: str,
    replace_all: bool = False,
    snippet_id: str | None = None,
) -> str:
    """Replace ``old`` with ``new`` in a file and return a JSON tool result.

    The target file comes from ``snippet_id`` when given (``path`` must
    then resolve to the same file if it is also supplied), otherwise from
    ``path``. Matching is attempted in stages: exact occurrences within
    the edit scope first, then a single loose-escape match, optionally
    refined by ``_correct_escaped_strings_with_llm``. A non-unique match
    without ``replace_all`` and a missing match both return error results
    carrying diagnostic metadata.
    """
    name = "edit"
    if not old:
        return ToolResult.error_result(name, "old text must not be empty.").to_json()
    snippet = None
    if snippet_id:
        snippet = self.file_state.get_snippet(snippet_id)
        if snippet is None:
            return ToolResult.error_result(name, f"Unknown snippet_id: {snippet_id}").to_json()
        # The snippet determines the file; an explicit path may only confirm it.
        target = snippet.path
        if path:
            requested_target = _resolve_in_cwd(self.cwd, path)
            if requested_target != target:
                return ToolResult.error_result(
                    name,
                    "snippet_id does not belong to the provided file path.",
                ).to_json()
    else:
        if not path:
            return ToolResult.error_result(
                name,
                "path is required unless snippet_id is provided.",
            ).to_json()
        target = _resolve_in_cwd(self.cwd, path)
    if not target.exists():
        return ToolResult.error_result(name, f"File does not exist: {target}").to_json()
    # A partial read is accepted only when the edit is anchored to a snippet.
    ok, error = self.file_state.check_writable(
        target,
        require_read=True,
        allow_partial=snippet is not None,
    )
    if not ok:
        return ToolResult.error_result(name, error or "File is not writable.").to_json()
    text_metadata = _read_text_metadata(target)
    text = text_metadata.content
    # scope is indexed as scope[0]:scope[1] into text below.
    scope = _edit_scope(text, snippet)
    matches = _find_occurrences(text, old, scope)
    matched_via = "exact"
    replacement_new = new
    if not matches:
        # No exact hit: fall back to loose-escape matching.
        loose_matches = _find_loose_escape_occurrences(text, old, scope)
        # NOTE(review): element [1] is compared with 1.0 and looks like a
        # match score, [0] is used as the match and [2] as the matched
        # text; confirm against _find_loose_escape_occurrences.
        if len(loose_matches) == 1 and loose_matches[0][1] == 1.0:
            corrected = _correct_escaped_strings_with_llm(
                self.settings,
                snippet_text=text[scope[0] : scope[1]],
                old=old,
                new=new,
                matched_text=loose_matches[0][2],
            )
            if corrected is not None:
                corrected_old, corrected_new = corrected
                corrected_matches = _find_occurrences(text, corrected_old, scope)
                if corrected_matches:
                    matches = corrected_matches
                    replacement_new = corrected_new
                    matched_via = "llm_escape_correction"
            if not matches:
                # The correction was unavailable or did not match: use the
                # loose match itself with the caller's original new text.
                matches = [loose_matches[0][0]]
                matched_via = "loose_escape"
    if not matches:
        closest_match = _find_closest_match(text, old, scope)
        metadata = {"scope": _format_scope_metadata(target, snippet, scope, text)}
        if closest_match is not None:
            metadata["closest_match"] = _build_closest_match_metadata(
                self.file_state,
                target,
                closest_match,
            )
        return ToolResult.error_result(
            name,
            "old_string not found in file.",
            metadata=metadata,
        ).to_json()
    occurrences = len(matches)
    if occurrences > 1 and not replace_all:
        return ToolResult.error_result(
            name,
            "old_string is not unique; use snippet_id, replace_all, or provide more context.",
            metadata={
                "occurrences": occurrences,
                "match_count": occurrences,
                "scope": _format_scope_metadata(target, snippet, scope, text),
                "candidates": _build_candidate_metadata(
                    self.file_state,
                    target,
                    text,
                    matches,
                ),
            },
        ).to_json()
    # Convert the replacement to the file's existing line-ending style.
    line_endings = text_metadata.line_endings
    normalized_new = _normalize_line_endings(replacement_new, line_endings)
    updated = _apply_replacements(text, matches, normalized_new, replace_all)
    _write_text_with_encoding(target, updated, text_metadata.encoding)
    self.file_state.mark_written(target)
    diff = _unified_diff(text, updated, path=str(target))
    metadata = {
        "path": str(target),
        "file_path": str(target),
        "occurrences": occurrences if replace_all else 1,
        "matched_via": matched_via,
        "encoding": text_metadata.encoding,
        "line_endings": line_endings,
        "read_scope_type": "snippet" if snippet is not None else "full",
        "diff": diff,
        "diff_preview": diff,
    }
    if snippet is not None:
        metadata["scope"] = _format_scope_metadata(target, snippet, scope, text)
    return ToolResult.ok_result(name, f"Edited {target}", metadata=metadata).to_json()
|
|
1215
|
+
|
|
1216
|
+
def bash(self, command: str, timeout_ms: int = 120_000) -> str:
    """Run ``command`` in a shell and return the outcome as a JSON tool result.

    Output is captured through temporary files. The process is terminated
    once ``timeout_ms`` milliseconds (at least 1) have elapsed. After a
    normal exit, ``_extract_bash_sentinel`` recovers the final working
    directory and exit code from stdout; ``self.cwd`` is updated when the
    reported directory exists.
    """
    name = "bash"
    timeout = max(timeout_ms, 1) / 1000
    # Unique marker handed to the shell wrapper and later used to find the
    # sentinel in stdout.
    marker = f"__DEEPY_CWD_{uuid.uuid4().hex}__"
    shell_path, shell_args = _build_shell_command(command, marker)
    process: subprocess.Popen[str] | None = None
    process_id: str | None = None
    try:
        with (
            tempfile.TemporaryFile(mode="w+", encoding="utf-8", errors="replace") as stdout_file,
            tempfile.TemporaryFile(mode="w+", encoding="utf-8", errors="replace") as stderr_file,
        ):
            process = subprocess.Popen(
                [shell_path, *shell_args],
                cwd=self.cwd,
                text=True,
                stdout=stdout_file,
                stderr=stderr_file,
                stdin=subprocess.DEVNULL,
                # New session everywhere except Windows; presumably so
                # _terminate_process can stop child processes too (confirm).
                start_new_session=os.name != "nt",
            )
            process_id = str(process.pid)
            # Register the command as running while it executes.
            self.running_processes[process_id] = {
                "startTime": _now_iso(),
                "command": command,
            }
            try:
                process.wait(timeout=timeout)
            except subprocess.TimeoutExpired:
                _terminate_process(process)
                process.wait()
                # Return whatever was captured before the timeout.
                stdout, stdout_capture_truncated = _read_captured_output(stdout_file)
                stderr, stderr_capture_truncated = _read_captured_output(stderr_file)
                output, output_truncated = _truncate_output((stdout or "") + (stderr or ""))
                return ToolResult.error_result(
                    name,
                    f"Command timed out after {timeout_ms}ms.",
                    output=output,
                    metadata={
                        "cwd": str(self.cwd),
                        "timeoutMs": timeout_ms,
                        "processId": process_id,
                        "shellPath": shell_path,
                        "interrupted": True,
                        "outputTruncated": output_truncated,
                        "captureTruncated": stdout_capture_truncated
                        or stderr_capture_truncated,
                    },
                ).to_json()
            # Read inside the with block, before the temp files are closed.
            stdout, stdout_capture_truncated = _read_captured_output(stdout_file)
            stderr, stderr_capture_truncated = _read_captured_output(stderr_file)
    finally:
        # Unregister the process on every exit path.
        if process_id is not None:
            self.running_processes.pop(process_id, None)

    stdout, final_cwd, exit_code = _extract_bash_sentinel(stdout or "", marker)
    if final_cwd is not None and final_cwd.is_dir():
        self.cwd = final_cwd
    # Prefer the sentinel's exit code; fall back to the process return code.
    returncode = exit_code if exit_code is not None else process.returncode
    output, output_truncated = _truncate_output(stdout + (stderr or ""))
    result = ToolResult.ok_result if returncode == 0 else ToolResult.error_result
    if returncode == 0:
        return result(
            name,
            output,
            metadata={
                "cwd": str(self.cwd),
                "exitCode": returncode,
                "processId": process_id,
                "shellPath": shell_path,
                "outputTruncated": output_truncated,
                "captureTruncated": stdout_capture_truncated or stderr_capture_truncated,
            },
        ).to_json()
    return result(
        name,
        f"Command exited with code {returncode}.",
        output=output,
        metadata={
            "cwd": str(self.cwd),
            "exitCode": returncode,
            "processId": process_id,
            "shellPath": shell_path,
            "outputTruncated": output_truncated,
            "captureTruncated": stdout_capture_truncated or stderr_capture_truncated,
        },
    ).to_json()
|
|
1303
|
+
|
|
1304
|
+
def ask_user_question(self, questions: object) -> str:
    """Validate ``questions`` and return a tool result that awaits the user.

    Invalid input produces an error result. Otherwise the result carries a
    summary of the questions, the parsed questions in its metadata, and
    ``awaitUserResponse`` set.
    """
    tool = "AskUserQuestion"
    parsed_questions, error = _parse_ask_user_questions(questions)
    if error is not None:
        return ToolResult.error_result(tool, error).to_json()

    summary = _build_question_summary(parsed_questions)
    pending = ToolResult(
        ok=True,
        name=tool,
        output=summary,
        metadata={"kind": "ask_user_question", "questions": parsed_questions},
        awaitUserResponse=True,
    )
    return pending.to_json()
|
|
1315
|
+
|
|
1316
|
+
def web_search(self, query: str) -> str:
    """Run a web search through the configured backend.

    Backends are tried in priority order: a configured shell command, then
    a configured API URL, then the built-in search. A blank query returns
    an error result.
    """
    if not query.strip():
        return ToolResult.error_result(
            "WebSearch", 'Missing required "query" string.'
        ).to_json()

    custom_command = self.settings.tools.web_search.command
    if custom_command:
        return self._web_search_command(query, custom_command)

    endpoint = self.settings.tools.web_search.api_url
    if endpoint:
        return self._web_search_api(query, endpoint)

    return self._web_search_builtin(query)
|
|
1327
|
+
|
|
1328
|
+
def web_fetch(self, url: str) -> str:
    """Fetch ``url`` over HTTP(S) and return its readable text as a JSON tool result.

    The URL is validated first. The response body is read up to
    ``MAX_WEB_FETCH_BYTES`` and decoded with the charset from the
    Content-Type header, falling back to UTF-8 when that charset is not
    one Python can decode. HTML responses are reduced to title plus
    readable text; other responses are returned as stripped text. Request
    failures are reported as error results instead of being raised.
    """
    name = "WebFetch"
    target_url, validation_error = _validate_web_fetch_url(url)
    if validation_error is not None or target_url is None:
        return ToolResult.error_result(
            name, validation_error or 'Missing required "url" string.'
        ).to_json()

    activity_label = f"WebFetch: {target_url}"
    activity_id = f"web-fetch-{uuid.uuid4().hex}"
    # Register the fetch as a running activity for its duration.
    self.running_processes[activity_id] = {
        "startTime": _now_iso(),
        "command": activity_label,
    }
    request = urllib.request.Request(
        target_url,
        headers={
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Deepy/0.1"
            ),
            "Accept": "text/html,application/xhtml+xml,text/plain;q=0.9,*/*;q=0.8",
        },
        method="GET",
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            final_url = response.geturl()
            content_type = response.headers.get("Content-Type", "")
            # Read one extra byte so truncation can be detected below.
            body = response.read(MAX_WEB_FETCH_BYTES + 1)
    except Exception as exc:
        return ToolResult.error_result(
            name,
            f"WebFetch request failed: {exc}",
            metadata={
                "url": target_url,
                "activityLabel": activity_label,
            },
        ).to_json()
    finally:
        self.running_processes.pop(activity_id, None)

    bytes_truncated = len(body) > MAX_WEB_FETCH_BYTES
    body = body[:MAX_WEB_FETCH_BYTES]
    charset = _charset_from_content_type(content_type)
    try:
        decoded = body.decode(charset, errors="replace")
    except (LookupError, ValueError):
        # The server advertised a charset Python cannot decode; fall back
        # to UTF-8 instead of letting the exception escape the tool.
        charset = "utf-8"
        decoded = body.decode(charset, errors="replace")
    if _is_html_response(content_type, decoded):
        title, readable_text = _extract_readable_html(decoded)
    else:
        title = ""
        readable_text = decoded.strip()
    output = _format_web_fetch_output(
        url=target_url,
        final_url=final_url,
        content_type=content_type,
        title=title,
        text=readable_text,
        bytes_truncated=bytes_truncated,
    )
    output, output_truncated = _truncate_output(output, MAX_WEB_FETCH_OUTPUT_CHARS)
    return ToolResult.ok_result(
        name,
        output,
        metadata={
            "url": target_url,
            "finalUrl": final_url,
            "contentType": content_type,
            # The charset actually used for decoding.
            "charset": charset,
            "byteCount": len(body),
            "bodyTruncated": bytes_truncated,
            "outputTruncated": output_truncated,
            "activityLabel": activity_label,
        },
    ).to_json()
|
|
1400
|
+
|
|
1401
|
+
def _web_search_command(self, query: str, command: str) -> str:
    """Run the configured search command and wrap its output in a JSON tool result.

    The shell-quoted prepared query is appended to ``command`` and the
    line is run through a shell with a 60 second timeout. ``/bin/zsh`` is
    used when it exists; otherwise the platform's default shell is used,
    so hosts without zsh no longer fail. A failure to launch the command,
    a timeout, and a non-zero exit are each reported as an error result.
    """
    name = "WebSearch"
    prepared = _prepare_web_search_query(query)
    activity_label = _format_web_search_activity_label(query)
    # executable=None lets subprocess choose the platform default shell.
    shell_executable = "/bin/zsh" if os.path.exists("/bin/zsh") else None
    process: subprocess.Popen[str] | None = None
    process_id: str | None = None
    stdout: str | None = None
    stderr: str | None = None
    try:
        process = subprocess.Popen(
            f"{command} {shlex.quote(prepared.resolved_query)}",
            shell=True,
            cwd=self.cwd,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.DEVNULL,
            executable=shell_executable,
        )
        process_id = str(process.pid)
        # Register the search as a running activity while it executes.
        self.running_processes[process_id] = {
            "startTime": _now_iso(),
            "command": activity_label,
        }
        stdout, stderr = process.communicate(timeout=60)
    except subprocess.TimeoutExpired:
        if process is not None:
            _terminate_process(process)
            # Collect whatever the command printed before it was stopped.
            stdout, stderr = process.communicate()
        output, output_truncated = _truncate_output((stdout or "") + (stderr or ""))
        return ToolResult.error_result(
            name,
            "WebSearch command timed out after 60000ms.",
            output=output,
            metadata={
                **prepared.metadata(),
                "activityLabel": activity_label,
                "outputTruncated": output_truncated,
                "interrupted": True,
            },
        ).to_json()
    except OSError as exc:
        # e.g. the shell or working directory is missing; report it as a
        # tool error like the other web tools do.
        return ToolResult.error_result(
            name,
            f"WebSearch command could not be run: {exc}",
            metadata={
                **prepared.metadata(),
                "activityLabel": activity_label,
            },
        ).to_json()
    finally:
        if process_id is not None:
            self.running_processes.pop(process_id, None)
    output = (stdout or "") + (stderr or "")
    output, output_truncated = _truncate_output(output)
    if process.returncode != 0:
        return ToolResult.error_result(
            name,
            f"WebSearch command exited with code {process.returncode}.",
            output=output,
            metadata={
                **prepared.metadata(),
                "exitCode": process.returncode,
                "activityLabel": activity_label,
                "outputTruncated": output_truncated,
            },
        ).to_json()
    return ToolResult.ok_result(
        name,
        output,
        metadata={
            **prepared.metadata(),
            "exitCode": process.returncode,
            "activityLabel": activity_label,
            "outputTruncated": output_truncated,
        },
    ).to_json()
|
|
1467
|
+
|
|
1468
|
+
def _web_search_builtin(self, query: str) -> str:
    """Search via ``DEFAULT_WEB_SEARCH_URL`` and return a JSON tool result.

    The query is first prepared with ``_prepare_web_search_query_with_llm``;
    a preparation error does not abort the search and is only reported in
    the metadata as ``queryPreparationWarning``. A failed request and a
    page with no parseable results both yield error results.
    """
    name = "WebSearch"
    prepared, prepare_error = _prepare_web_search_query_with_llm(query, self.settings)
    encoded_query = urllib.parse.urlencode({"q": prepared.resolved_query}, doseq=False)
    search_url = DEFAULT_WEB_SEARCH_URL + "?" + encoded_query
    activity_label = _format_web_search_activity_label(prepared.resolved_query)

    def describe(**extra: object) -> dict:
        # Metadata shared by every outcome; extras go before the warning.
        warning = {"queryPreparationWarning": prepare_error} if prepare_error else {}
        return {
            **prepared.metadata(),
            "backend": "duckduckgo_html",
            "searchUrl": search_url,
            "activityLabel": activity_label,
            **extra,
            **warning,
        }

    activity_id = f"web-search-{uuid.uuid4().hex}"
    self.running_processes[activity_id] = {
        "startTime": _now_iso(),
        "command": activity_label,
    }
    user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Deepy/0.1"
    )
    request = urllib.request.Request(
        search_url,
        headers={"User-Agent": user_agent},
        method="GET",
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            body = response.read().decode("utf-8", errors="replace")
    except Exception as exc:
        return ToolResult.error_result(
            name,
            f"WebSearch request failed: {exc}",
            metadata=describe(),
        ).to_json()
    finally:
        self.running_processes.pop(activity_id, None)

    results = _parse_search_results(body)
    if not results:
        return ToolResult.error_result(
            name,
            "WebSearch returned no parseable results.",
            metadata=describe(),
        ).to_json()

    listing = _format_search_results(prepared.resolved_query, results)
    shown_count = min(len(results), DEFAULT_WEB_SEARCH_RESULTS)
    return ToolResult.ok_result(
        name,
        listing,
        metadata=describe(resultCount=shown_count),
    ).to_json()
|
|
1535
|
+
|
|
1536
|
+
def _web_search_api(self, query: str, api_url: str) -> str:
    """Run a WebSearch through the configured custom HTTP API.

    The query is first passed through ``_prepare_web_search_query_with_llm``.
    The resolved query is then POSTed to ``api_url`` as a JSON body, with the
    configured ``machine_id`` sent in the ``Token`` header.

    Returns:
        The JSON form of a ``ToolResult``. It is an error result when query
        preparation fails, when no ``machine_id`` is configured, when the
        HTTP request raises, or when the response is empty. Otherwise it is
        an ok result carrying the response text.
    """
    tool_name = "WebSearch"
    prepared, prepare_error = _prepare_web_search_query_with_llm(query, self.settings)

    def failure(message, metadata):
        # Every failure path serialises an error result the same way.
        return ToolResult.error_result(tool_name, message, metadata=metadata).to_json()

    if prepare_error is not None:
        return failure(
            f"WebSearch custom API mode failed: {prepare_error}",
            {"query": query, "apiUrl": api_url},
        )

    machine_id = self.settings.tools.web_search.machine_id
    if not machine_id:
        return failure(
            "WebSearch custom API mode requires machine_id in the TOML tools.web_search config.",
            {**prepared.metadata(), "apiUrl": api_url},
        )

    request_payload = json_utils.dumps({"query": prepared.resolved_query}).encode("utf-8")
    request = urllib.request.Request(
        api_url,
        data=request_payload,
        headers={
            "Content-Type": "application/json",
            "Token": machine_id,
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            response_text = response.read().decode("utf-8", errors="replace")
    except Exception as exc:
        return failure(
            f"WebSearch API request failed: {exc}",
            {**prepared.metadata(), "apiUrl": api_url},
        )

    try:
        payload = json_utils.loads(response_text)
    except json_utils.JSONDecodeError:
        payload = None

    # Prefer a non-blank string "result" field of a JSON object response;
    # otherwise fall back to the raw response text.
    result_field = payload.get("result") if isinstance(payload, dict) else None
    if isinstance(result_field, str) and result_field.strip():
        output = result_field.strip()
    else:
        output = response_text.strip()

    if not output:
        return failure(
            "WebSearch custom API mode failed: The web search response was empty.",
            {**prepared.metadata(), "apiUrl": api_url},
        )
    return ToolResult.ok_result(
        tool_name,
        output,
        metadata={**prepared.metadata(), "apiUrl": api_url, "usedMachineId": bool(machine_id)},
    ).to_json()
|
|
1591
|
+
|
|
1592
|
+
|
|
1593
|
+
def _unified_diff(old: str, new: str, *, path: str) -> str:
|
|
1594
|
+
return "".join(
|
|
1595
|
+
unified_diff(
|
|
1596
|
+
old.splitlines(keepends=True),
|
|
1597
|
+
new.splitlines(keepends=True),
|
|
1598
|
+
fromfile=f"a/{path}",
|
|
1599
|
+
tofile=f"b/{path}",
|
|
1600
|
+
)
|
|
1601
|
+
)
|
|
1602
|
+
|
|
1603
|
+
|
|
1604
|
+
def _read_text_preserving_newlines(path: Path) -> str:
    """Return the decoded content of ``path`` as read by ``_read_text_metadata``."""
    metadata = _read_text_metadata(path)
    return metadata.content
|
|
1606
|
+
|
|
1607
|
+
|
|
1608
|
+
def _read_text_metadata(path: Path) -> TextFileMetadata:
    """Read ``path`` as bytes and decode it, recording encoding and line endings.

    The encoding label comes from ``_detect_text_encoding``. ``"utf16le"`` is
    decoded with Python's ``utf-16`` codec and anything else with ``utf-8``;
    undecodable bytes are replaced rather than raising.
    """
    raw_bytes = path.read_bytes()
    detected = _detect_text_encoding(raw_bytes)
    if detected == "utf16le":
        codec = "utf-16"
    else:
        codec = "utf-8"
    decoded = raw_bytes.decode(codec, errors="replace")
    endings = _detect_line_endings(decoded)
    return TextFileMetadata(content=decoded, encoding=detected, line_endings=endings)
|
|
1618
|
+
|
|
1619
|
+
|
|
1620
|
+
def _detect_text_encoding(data: bytes) -> str:
|
|
1621
|
+
if len(data) >= 2 and data[0] == 0xFF and data[1] == 0xFE:
|
|
1622
|
+
return "utf16le"
|
|
1623
|
+
return "utf8"
|
|
1624
|
+
|
|
1625
|
+
|
|
1626
|
+
def _write_text_with_encoding(path: Path, content: str, encoding: str) -> None:
|
|
1627
|
+
python_encoding = "utf-16" if encoding == "utf16le" else "utf-8"
|
|
1628
|
+
path.write_text(content, encoding=python_encoding)
|
|
1629
|
+
|
|
1630
|
+
|
|
1631
|
+
def _coerce_write_content(path: Path, content: object) -> tuple[str, dict[str, object], str | None]:
|
|
1632
|
+
if isinstance(content, str):
|
|
1633
|
+
return content, {}, None
|
|
1634
|
+
if path.suffix.lower() == ".json" and content is not None and not isinstance(content, bytes):
|
|
1635
|
+
try:
|
|
1636
|
+
return (
|
|
1637
|
+
json_utils.dumps_pretty(content),
|
|
1638
|
+
{"input_repaired": True, "repair_kind": "json-stringify-content"},
|
|
1639
|
+
None,
|
|
1640
|
+
)
|
|
1641
|
+
except TypeError as exc:
|
|
1642
|
+
return "", {}, f"JSON content is not serializable: {exc}"
|
|
1643
|
+
return "", {}, "content must be a string."
|
|
1644
|
+
|
|
1645
|
+
|
|
1646
|
+
def _format_notebook(path: Path) -> tuple[str, str | None]:
    """Render a notebook JSON file as numbered plain-text lines.

    Returns:
        ``(text, error)``. ``error`` is set only when the file is not valid
        JSON. An empty file, a non-object document, or a notebook that
        produces no lines yields a ``WARNING: ...`` text with no error.
        Otherwise ``text`` holds one ``# Cell N (type)`` header per cell
        followed by its source, then one ``# Output N (type)`` header per
        output followed by its rendering, with every line prefixed by its
        1-based line number.
    """
    raw = _read_text_preserving_newlines(path)
    if not raw:
        return "WARNING: File is empty.", None
    try:
        document = json_utils.loads(raw)
    except json_utils.JSONDecodeError as exc:
        return "", f"Failed to parse notebook JSON: {exc}"
    if not isinstance(document, dict):
        return "WARNING: Notebook has no cells.", None

    cells = document.get("cells")
    rendered: list[str] = []
    # Cells and outputs that are not dicts are skipped but still consume a
    # number, because numbering follows their position in the list.
    for cell_number, cell in enumerate(cells if isinstance(cells, list) else [], start=1):
        if not isinstance(cell, dict):
            continue
        kind = cell.get("cell_type")
        if not isinstance(kind, str):
            kind = "unknown"
        rendered.append(f"# Cell {cell_number} ({kind})")
        rendered.extend(_normalize_notebook_field(cell.get("source")))

        outputs = cell.get("outputs")
        for output_number, output in enumerate(outputs if isinstance(outputs, list) else [], start=1):
            if not isinstance(output, dict):
                continue
            output_kind = output.get("output_type")
            if not isinstance(output_kind, str):
                output_kind = "output"
            rendered.append(f"# Output {output_number} ({output_kind})")
            rendered.extend(_format_notebook_output(output))

    if not rendered:
        return "WARNING: Notebook has no cells.", None
    numbered = (f"{line_number}: {line}" for line_number, line in enumerate(rendered, start=1))
    return "\n".join(numbered), None
|
|
1684
|
+
|
|
1685
|
+
|
|
1686
|
+
def _normalize_notebook_field(value: object) -> list[str]:
|
|
1687
|
+
if isinstance(value, list):
|
|
1688
|
+
return [str(item).removesuffix("\n").removesuffix("\r") for item in value]
|
|
1689
|
+
if isinstance(value, str):
|
|
1690
|
+
return value.splitlines()
|
|
1691
|
+
return []
|
|
1692
|
+
|
|
1693
|
+
|
|
1694
|
+
def _format_notebook_output(output: dict[str, object]) -> list[str]:
    """Render one notebook output entry as text lines.

    Collects, in order: the ``text`` field, ``data["text/plain"]``, a size
    placeholder for ``data["image/png"]`` and ``data["image/jpeg"]`` when they
    are strings, and the ``traceback`` entries. Returns
    ``["[output omitted]"]`` when nothing was collected.
    """
    rendered = _normalize_notebook_field(output.get("text"))

    payload = output.get("data")
    if isinstance(payload, dict):
        rendered.extend(_normalize_notebook_field(payload.get("text/plain")))
        # Images are summarised by the length of their encoded string, not inlined.
        for mime in ("image/png", "image/jpeg"):
            encoded = payload.get(mime)
            if isinstance(encoded, str):
                rendered.append(f"[{mime} {len(encoded)} chars]")

    trace = output.get("traceback")
    if isinstance(trace, list):
        for entry in trace:
            rendered.append(str(entry).removesuffix("\n").removesuffix("\r"))

    if rendered:
        return rendered
    return ["[output omitted]"]
|
|
1709
|
+
|
|
1710
|
+
|
|
1711
|
+
@dataclass(frozen=True)
class PageRange:
    """An immutable pair of page numbers, ``start`` and ``end``."""

    start: int
    end: int

    @property
    def count(self) -> int:
        """Number of pages in the range, counting both endpoints."""
        return 1 + self.end - self.start

    def label(self) -> str:
        """Return the range formatted as ``"<start>-<end>"``."""
        return "{}-{}".format(self.start, self.end)
|
|
1722
|
+
|
|
1723
|
+
|
|
1724
|
+
def _read_pdf(path: Path, pages: str | None) -> str:
    """Read a PDF and return it as a base64 data URL inside a ``ToolResult`` JSON string.

    Validation, in order:
        1. ``pages`` must parse as a single page or range.
        2. With no range, a PDF whose counted pages exceed
           ``PDF_LARGE_PAGE_THRESHOLD`` is rejected.
        3. A range longer than ``PDF_MAX_PAGE_RANGE`` is rejected.
        4. A range ending past the counted page total is rejected.

    On success the whole file is base64-encoded into the result; the requested
    range is only recorded in the metadata under ``pages``.
    """
    raw = path.read_bytes()
    total_pages = _count_pdf_pages(raw)
    requested, range_error = _parse_page_range(pages)
    location = str(path)

    if range_error is not None:
        return ToolResult.error_result("read", range_error, metadata={"path": location}).to_json()

    def reject(message):
        # Rejections after range parsing also report the counted page total.
        return ToolResult.error_result(
            "read",
            message,
            metadata={"path": location, "pageCount": total_pages},
        ).to_json()

    if requested is None:
        if total_pages is not None and total_pages > PDF_LARGE_PAGE_THRESHOLD:
            return reject(f'PDF has {total_pages} pages; provide "pages" to read a range.')
        pages_label = None
    else:
        if requested.count > PDF_MAX_PAGE_RANGE:
            return reject(f"PDF page range exceeds {PDF_MAX_PAGE_RANGE} pages.")
        if total_pages is not None and requested.end > total_pages:
            return reject(f"PDF page range exceeds total page count ({total_pages}).")
        pages_label = requested.label()

    encoded = base64.b64encode(raw).decode("ascii")
    return ToolResult.ok_result(
        "read",
        f"data:application/pdf;base64,{encoded}",
        metadata={
            "path": location,
            "mime": "application/pdf",
            "encoding": "base64",
            "bytes": len(raw),
            "pageCount": total_pages,
            "pages": pages_label,
        },
    ).to_json()
|
|
1763
|
+
|
|
1764
|
+
|
|
1765
|
+
def _count_pdf_pages(data: bytes) -> int | None:
|
|
1766
|
+
try:
|
|
1767
|
+
text = data.decode("latin1", errors="ignore")
|
|
1768
|
+
except Exception:
|
|
1769
|
+
return None
|
|
1770
|
+
return len(re.findall(r"/Type\s*/Page\b(?!s)", text))
|
|
1771
|
+
|
|
1772
|
+
|
|
1773
|
+
def _parse_page_range(value: str | None) -> tuple[PageRange | None, str | None]:
    """Parse a ``pages`` argument such as ``"3"`` or ``"1-5"``.

    Returns:
        ``(range, error)``. Both are ``None`` when ``value`` is ``None`` or
        blank. ``error`` is set when the value contains a comma, has more than
        one ``-``, has a bound rejected by ``_parse_positive_int``, or has an
        end smaller than its start.
    """
    single_range_hint = 'pages must be a single range like "1-5" or "3".'

    trimmed = value.strip() if value is not None else ""
    if not trimmed:
        return None, None
    if "," in trimmed:
        return None, single_range_hint

    bounds = [piece.strip() for piece in trimmed.split("-")]
    if len(bounds) > 2:
        return None, single_range_hint

    first, first_error = _parse_positive_int(bounds[0], "pages")
    if first_error is not None:
        return None, first_error
    if len(bounds) == 1:
        # A lone number selects exactly that page.
        return PageRange(first, first), None

    last, last_error = _parse_positive_int(bounds[1], "pages")
    if last_error is not None:
        return None, last_error
    if last < first:
        return None, "pages range end must be >= start."
    return PageRange(first, last), None
|
|
1794
|
+
|
|
1795
|
+
|
|
1796
|
+
def _parse_positive_int(value: str, label: str) -> tuple[int, str | None]:
|
|
1797
|
+
try:
|
|
1798
|
+
numeric = float(value)
|
|
1799
|
+
except ValueError:
|
|
1800
|
+
return 0, f"{label} must be a number."
|
|
1801
|
+
if not math.isfinite(numeric):
|
|
1802
|
+
return 0, f"{label} must be a number."
|
|
1803
|
+
integer = int(numeric)
|
|
1804
|
+
if integer < 1:
|
|
1805
|
+
return 0, f"{label} must be >= 1."
|
|
1806
|
+
return integer, None
|
|
1807
|
+
|
|
1808
|
+
|
|
1809
|
+
IMAGE_MIME_TYPES = {
|
|
1810
|
+
".png": "image/png",
|
|
1811
|
+
".jpg": "image/jpeg",
|
|
1812
|
+
".jpeg": "image/jpeg",
|
|
1813
|
+
".gif": "image/gif",
|
|
1814
|
+
".webp": "image/webp",
|
|
1815
|
+
".bmp": "image/bmp",
|
|
1816
|
+
".tif": "image/tiff",
|
|
1817
|
+
".tiff": "image/tiff",
|
|
1818
|
+
".svg": "image/svg+xml",
|
|
1819
|
+
".ico": "image/x-icon",
|
|
1820
|
+
".avif": "image/avif",
|
|
1821
|
+
}
|
|
1822
|
+
|
|
1823
|
+
|
|
1824
|
+
def _image_mime_type(suffix: str) -> str | None:
|
|
1825
|
+
return IMAGE_MIME_TYPES.get(suffix)
|
|
1826
|
+
|
|
1827
|
+
|
|
1828
|
+
def _build_image_follow_up_message(path: Path, mime: str, data: bytes) -> dict[str, object]:
|
|
1829
|
+
encoded = base64.b64encode(data).decode("ascii")
|
|
1830
|
+
return {
|
|
1831
|
+
"role": "system",
|
|
1832
|
+
"content": [
|
|
1833
|
+
{
|
|
1834
|
+
"type": "input_text",
|
|
1835
|
+
"text": (
|
|
1836
|
+
f"The read tool has loaded `{path.name}`. "
|
|
1837
|
+
"Use the attached image content to answer the original request."
|
|
1838
|
+
),
|
|
1839
|
+
},
|
|
1840
|
+
{
|
|
1841
|
+
"type": "input_image",
|
|
1842
|
+
"image_url": f"data:{mime};base64,{encoded}",
|
|
1843
|
+
},
|
|
1844
|
+
],
|
|
1845
|
+
}
|
|
1846
|
+
|
|
1847
|
+
|
|
1848
|
+
def _detect_line_endings(text: str) -> str:
|
|
1849
|
+
return "CRLF" if "\r\n" in text else "LF"
|
|
1850
|
+
|
|
1851
|
+
|
|
1852
|
+
def _normalize_line_endings(text: str, line_endings: str) -> str:
|
|
1853
|
+
normalized = text.replace("\r\n", "\n").replace("\r", "\n")
|
|
1854
|
+
return normalized.replace("\n", "\r\n") if line_endings == "CRLF" else normalized
|
|
1855
|
+
|
|
1856
|
+
|
|
1857
|
+
def _truncate_line(line: str) -> str:
    """Cut ``line`` to ``MAX_LINE_LENGTH`` characters, appending a truncation marker when cut."""
    too_long = len(line) > MAX_LINE_LENGTH
    if too_long:
        return line[:MAX_LINE_LENGTH] + "... [truncated]"
    return line
|
|
1861
|
+
|
|
1862
|
+
|
|
1863
|
+
def _truncate_output(output: str, max_chars: int = MAX_BASH_OUTPUT_CHARS) -> tuple[str, bool]:
    """Limit ``output`` to ``max_chars`` characters.

    Returns:
        ``(text, truncated)``. When the output fits it is returned unchanged
        with ``False``. Otherwise the first ``max_chars`` characters are kept
        and a note stating how many characters were dropped is appended.
    """
    overflow = len(output) - max_chars
    if overflow <= 0:
        return output, False
    kept = output[:max_chars]
    return kept + f"\n... [truncated {overflow} chars]", True
|
|
1868
|
+
|
|
1869
|
+
|
|
1870
|
+
def _read_captured_output(stream) -> tuple[str, bool]:
    """Read back a capture stream from its start, bounded by ``MAX_BASH_CAPTURE_CHARS``.

    The stream is flushed and rewound first. One unit more than the limit is
    requested so that overflow can be detected without reading everything.

    Returns:
        ``(text, overflowed)`` where ``text`` holds at most
        ``MAX_BASH_CAPTURE_CHARS`` units.
    """
    stream.flush()
    stream.seek(0)
    limit = MAX_BASH_CAPTURE_CHARS
    captured = stream.read(limit + 1)
    overflowed = len(captured) > limit
    if overflowed:
        captured = captured[:limit]
    return captured, overflowed
|
|
1877
|
+
|
|
1878
|
+
|
|
1879
|
+
def _build_shell_command(command: str, marker: str) -> tuple[str, list[str]]:
    """Wrap ``command`` in a shell script that reports its final cwd and exit status.

    The script runs the optional shell init and extglob-disabling snippets,
    then the command, then prints two sentinel lines prefixed with ``marker``
    (``CWD=`` and ``EXIT=``) and exits with the command's status. The whole
    group reads stdin from ``/dev/null``.

    Returns:
        ``(shell_path, args)`` where ``args`` is ``["-c", script]``.
    """
    shell = _resolve_shell_path()
    rewritten = rewrite_windows_null_redirect(command)
    steps = (
        build_shell_init_command(shell),
        build_disable_extglob_command(shell),
        rewritten,
        # Capture the status right away, before printf overwrites $?.
        "__deepy_exit=$?",
        f"printf '\\n{marker}CWD=%s\\n{marker}EXIT=%s\\n' \"$PWD\" \"$__deepy_exit\"",
        "exit $__deepy_exit",
    )
    # Empty snippets are dropped so the joined script has no empty statements.
    script = "; ".join(filter(None, steps))
    return shell, ["-c", f"{{ {script}; }} < /dev/null"]
|
|
1895
|
+
|
|
1896
|
+
|
|
1897
|
+
def _resolve_shell_path() -> str:
|
|
1898
|
+
shell_path = os.environ.get("SHELL")
|
|
1899
|
+
if shell_path:
|
|
1900
|
+
return shell_path
|
|
1901
|
+
return "/bin/zsh" if Path("/bin/zsh").exists() else "/bin/sh"
|
|
1902
|
+
|
|
1903
|
+
|
|
1904
|
+
def _now_iso() -> str:
|
|
1905
|
+
return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
|
|
1906
|
+
|
|
1907
|
+
|
|
1908
|
+
def _terminate_process(process: subprocess.Popen[str]) -> None:
|
|
1909
|
+
try:
|
|
1910
|
+
if os.name != "nt":
|
|
1911
|
+
os.killpg(process.pid, signal.SIGKILL)
|
|
1912
|
+
else:
|
|
1913
|
+
process.kill()
|
|
1914
|
+
except OSError:
|
|
1915
|
+
return
|
|
1916
|
+
|
|
1917
|
+
|
|
1918
|
+
def _format_directory_entries(path: Path, project_root: Path) -> tuple[str, int, int]:
    """List the entries of directory ``path`` as tab-separated ``name<TAB>size`` lines.

    Entries are sorted directories first, then case-insensitively by name.
    Directory names get a trailing ``/``. Entries rejected by
    ``_is_ignored_entry`` are counted but not listed. An entry whose ``stat``
    fails is listed with size 0.

    Returns:
        ``(listing, listed_count, ignored_count)``.
    """
    matcher = _load_gitignore_matcher(project_root)

    def directories_first(item: Path):
        return (not item.is_dir(), item.name.lower())

    rows: list[str] = []
    skipped = 0
    for child in sorted(path.iterdir(), key=directories_first):
        if _is_ignored_entry(child, project_root, matcher):
            skipped += 1
            continue
        marker = "/" if child.is_dir() else ""
        try:
            byte_size = child.stat().st_size
        except OSError:
            byte_size = 0
        rows.append(f"{child.name}{marker}\t{byte_size}")
    return "\n".join(rows), len(rows), skipped
|
|
1933
|
+
|
|
1934
|
+
|
|
1935
|
+
def _normalize_relative_suffix(path: str) -> str:
|
|
1936
|
+
suffix = path.replace("\\", "/").strip("/")
|
|
1937
|
+
parts = [part for part in suffix.split("/") if part and part != "."]
|
|
1938
|
+
return "/".join(parts)
|
|
1939
|
+
|
|
1940
|
+
|
|
1941
|
+
def _find_suffix_matches(root: Path, suffix: str) -> list[Path]:
    """Find non-ignored files under ``root`` whose relative path ends with ``suffix``.

    The match is made on whole path components: ``"utils.py"`` matches
    ``pkg/utils.py`` but not ``pkg/myutils.py``. An empty suffix matches
    nothing; previously a plain string ``endswith`` matched every file for an
    empty suffix and matched in the middle of file names.

    Args:
        root: Directory to walk. Ignored directories are pruned from the walk.
        suffix: ``/``-separated trailing path fragment; leading slashes are
            ignored.

    Returns:
        Resolved paths of the matching files, in walk order.
    """
    needle = suffix.lstrip("/")
    if not needle:
        return []
    # A match is either the whole relative path or a tail that starts at a "/".
    boundary_needle = "/" + needle

    matches: list[Path] = []
    gitignore = _load_gitignore_matcher(root)
    for current, dirnames, filenames in os.walk(root):
        current_path = Path(current)
        # Prune in place so os.walk does not descend into ignored directories.
        dirnames[:] = [
            dirname
            for dirname in dirnames
            if not _is_ignored_entry(current_path / dirname, root, gitignore)
        ]
        for filename in filenames:
            full_path = current_path / filename
            if _is_ignored_entry(full_path, root, gitignore):
                continue
            try:
                relative = full_path.relative_to(root).as_posix()
            except ValueError:
                continue
            if relative == needle or relative.endswith(boundary_needle):
                matches.append(full_path.resolve())
    return matches
|
|
1962
|
+
|
|
1963
|
+
|
|
1964
|
+
def _is_ignored_entry(
    path: Path,
    project_root: Path,
    gitignore: "GitignoreMatcher",
) -> bool:
    """Decide whether ``path`` should be hidden from listings and searches.

    An entry is ignored when its name is in ``IGNORED_DIRECTORY_ENTRIES`` or
    when ``gitignore`` ignores its path relative to ``project_root``. A path
    outside ``project_root`` is only subject to the name check.
    """
    if path.name in IGNORED_DIRECTORY_ENTRIES:
        return True
    try:
        relative_path = path.relative_to(project_root)
    except ValueError:
        # Not under the project root, so the gitignore rules are not applied.
        return False
    return gitignore.ignores(relative_path.as_posix(), path.is_dir())
|
|
1976
|
+
|
|
1977
|
+
|
|
1978
|
+
@dataclass(frozen=True)
class GitignorePattern:
    """One parsed ignore rule: the pattern text and whether it is a negation."""

    # Pattern text as stored by the loader.
    pattern: str
    # True when a match should un-ignore the path instead of ignoring it.
    negated: bool = False
|
|
1982
|
+
|
|
1983
|
+
|
|
1984
|
+
@dataclass(frozen=True)
class GitignoreMatcher:
    """An ordered, immutable set of ignore rules where the last matching rule wins."""

    patterns: tuple[GitignorePattern, ...]

    def ignores(self, relative_path: str, is_dir: bool) -> bool:
        """Return whether ``relative_path`` is ignored by these rules.

        Leading and trailing slashes are stripped first; an empty path is
        never ignored. The verdict is that of the last rule that matches:
        ignored for a plain rule, not ignored for a negated one.
        """
        candidate = relative_path.strip("/")
        if candidate:
            # Scanning from the end finds the last matching rule first.
            for rule in reversed(self.patterns):
                if _gitignore_pattern_matches(rule.pattern, candidate, is_dir):
                    return not rule.negated
        return False
|
|
1997
|
+
|
|
1998
|
+
|
|
1999
|
+
def _load_gitignore_matcher(project_root: Path) -> GitignoreMatcher:
    """Build a ``GitignoreMatcher`` from ``<project_root>/.gitignore``.

    Returns a matcher with no rules when the file does not exist. Blank lines
    and lines starting with ``#`` are skipped. A leading ``!`` marks the rule
    as negated. Backslashes in the pattern are converted to forward slashes.
    """
    ignore_file = project_root / ".gitignore"
    if not ignore_file.is_file():
        return GitignoreMatcher(())

    rules: list[GitignorePattern] = []
    contents = ignore_file.read_text(encoding="utf-8", errors="replace")
    for raw_line in contents.splitlines():
        rule = raw_line.strip()
        if not rule or rule.startswith("#"):
            continue
        is_negation = rule.startswith("!")
        if is_negation:
            rule = rule[1:].strip()
            if not rule:
                # A bare "!" carries no pattern.
                continue
        rules.append(GitignorePattern(rule.replace("\\", "/"), is_negation))
    return GitignoreMatcher(tuple(rules))
|
|
2014
|
+
|
|
2015
|
+
|
|
2016
|
+
def _gitignore_pattern_matches(pattern: str, relative_path: str, is_dir: bool) -> bool:
|
|
2017
|
+
directory_only = pattern.endswith("/")
|
|
2018
|
+
normalized_pattern = pattern.strip("/")
|
|
2019
|
+
if not normalized_pattern:
|
|
2020
|
+
return False
|
|
2021
|
+
if directory_only and not is_dir:
|
|
2022
|
+
return relative_path.startswith(normalized_pattern + "/")
|
|
2023
|
+
if "/" in normalized_pattern:
|
|
2024
|
+
return fnmatch(relative_path, normalized_pattern) or relative_path.startswith(
|
|
2025
|
+
normalized_pattern + "/"
|
|
2026
|
+
)
|
|
2027
|
+
parts = relative_path.split("/")
|
|
2028
|
+
return any(fnmatch(part, normalized_pattern) for part in parts)
|
|
2029
|
+
|
|
2030
|
+
|
|
2031
|
+
def _parse_ask_user_questions(value: object) -> tuple[list[dict[str, object]], str | None]:
    """Validate and normalise the ``questions`` argument of the ask-user tool.

    Returns:
        ``(questions, error)``. On the first validation failure the list is
        empty and ``error`` describes the problem. On success each question
        is a dict with the trimmed ``question`` text, its ``options`` (each
        with a trimmed ``label`` and, when non-empty, a trimmed
        ``description``) and ``multiSelect`` when the input supplied a bool
        for it.
    """
    if not (isinstance(value, list) and value):
        return [], '"questions" must be a non-empty array.'

    parsed: list[dict[str, object]] = []
    for q_index, raw_question in enumerate(value):
        if not isinstance(raw_question, dict):
            return [], f"Question at index {q_index} must be an object."

        prompt = _trimmed_string(raw_question.get("question"))
        if not prompt:
            return [], f'Question at index {q_index} is missing a non-empty "question" string.'

        raw_options = raw_question.get("options")
        if not (isinstance(raw_options, list) and raw_options):
            return [], f'Question at index {q_index} must include a non-empty "options" array.'

        choices: list[dict[str, str]] = []
        for o_index, raw_option in enumerate(raw_options):
            if not isinstance(raw_option, dict):
                return [], f"Option {o_index} for question {q_index} must be an object."

            label = _trimmed_string(raw_option.get("label"))
            if not label:
                return (
                    [],
                    f'Option {o_index} for question {q_index} is missing a non-empty "label" string.',
                )

            choice = {"label": label}
            description = _trimmed_string(raw_option.get("description"))
            if description:
                choice["description"] = description
            choices.append(choice)

        entry: dict[str, object] = {"question": prompt, "options": choices}
        # Only a real bool is carried over; other values leave the key absent.
        multi_select = raw_question.get("multiSelect")
        if isinstance(multi_select, bool):
            entry["multiSelect"] = multi_select
        parsed.append(entry)

    return parsed, None
|
|
2076
|
+
|
|
2077
|
+
|
|
2078
|
+
def _build_question_summary(questions: list[dict[str, object]]) -> str:
|
|
2079
|
+
lines = ["Waiting for user input."]
|
|
2080
|
+
for index, item in enumerate(questions):
|
|
2081
|
+
lines.append("")
|
|
2082
|
+
lines.append(f"{index + 1}. {item['question']}")
|
|
2083
|
+
lines.append(f" Mode: {'multi-select' if item.get('multiSelect') else 'single-select'}")
|
|
2084
|
+
for option in item["options"]:
|
|
2085
|
+
if not isinstance(option, dict):
|
|
2086
|
+
continue
|
|
2087
|
+
lines.append(f" - {option['label']}")
|
|
2088
|
+
if option.get("description"):
|
|
2089
|
+
lines.append(f" {option['description']}")
|
|
2090
|
+
lines.append(" - Other")
|
|
2091
|
+
return "\n".join(lines)
|
|
2092
|
+
|
|
2093
|
+
|
|
2094
|
+
def _trimmed_string(value: object) -> str:
|
|
2095
|
+
return value.strip() if isinstance(value, str) else ""
|
|
2096
|
+
|
|
2097
|
+
|
|
2098
|
+
def _extract_bash_sentinel(stdout: str, marker: str) -> tuple[str, Path | None, int | None]:
|
|
2099
|
+
start = stdout.rfind(f"\n{marker}CWD=")
|
|
2100
|
+
if start == -1:
|
|
2101
|
+
return stdout, None, None
|
|
2102
|
+
visible = stdout[:start]
|
|
2103
|
+
tail = stdout[start + 1 :].splitlines()
|
|
2104
|
+
cwd: Path | None = None
|
|
2105
|
+
exit_code: int | None = None
|
|
2106
|
+
for line in tail:
|
|
2107
|
+
if line.startswith(f"{marker}CWD="):
|
|
2108
|
+
cwd = Path(line.removeprefix(f"{marker}CWD=")).resolve()
|
|
2109
|
+
elif line.startswith(f"{marker}EXIT="):
|
|
2110
|
+
raw = line.removeprefix(f"{marker}EXIT=")
|
|
2111
|
+
if raw.isdigit():
|
|
2112
|
+
exit_code = int(raw)
|
|
2113
|
+
return visible, cwd, exit_code
|