teddy-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- teddy_cli-0.1.0.dist-info/LICENSE +677 -0
- teddy_cli-0.1.0.dist-info/METADATA +33 -0
- teddy_cli-0.1.0.dist-info/RECORD +143 -0
- teddy_cli-0.1.0.dist-info/WHEEL +4 -0
- teddy_cli-0.1.0.dist-info/entry_points.txt +3 -0
- teddy_executor/__init__.py +1 -0
- teddy_executor/__main__.py +335 -0
- teddy_executor/adapters/__init__.py +0 -0
- teddy_executor/adapters/inbound/__init__.py +0 -0
- teddy_executor/adapters/inbound/cli_formatter.py +107 -0
- teddy_executor/adapters/inbound/cli_helpers.py +249 -0
- teddy_executor/adapters/inbound/console_plan_reviewer.py +69 -0
- teddy_executor/adapters/inbound/session_cli_handlers.py +366 -0
- teddy_executor/adapters/inbound/textual_plan_reviewer.py +78 -0
- teddy_executor/adapters/inbound/textual_plan_reviewer_app.py +367 -0
- teddy_executor/adapters/inbound/textual_plan_reviewer_editor.py +281 -0
- teddy_executor/adapters/inbound/textual_plan_reviewer_execution.py +213 -0
- teddy_executor/adapters/inbound/textual_plan_reviewer_helpers.py +308 -0
- teddy_executor/adapters/inbound/textual_plan_reviewer_logic.py +345 -0
- teddy_executor/adapters/inbound/textual_plan_reviewer_previews.py +227 -0
- teddy_executor/adapters/inbound/textual_plan_reviewer_widgets.py +246 -0
- teddy_executor/adapters/outbound/__init__.py +7 -0
- teddy_executor/adapters/outbound/console_interactor.py +212 -0
- teddy_executor/adapters/outbound/console_interactor_ask_loop.py +121 -0
- teddy_executor/adapters/outbound/console_interactor_helpers.py +95 -0
- teddy_executor/adapters/outbound/console_tooling.py +62 -0
- teddy_executor/adapters/outbound/filesystem_helpers.py +61 -0
- teddy_executor/adapters/outbound/litellm_adapter.py +462 -0
- teddy_executor/adapters/outbound/local_file_system_adapter.py +300 -0
- teddy_executor/adapters/outbound/local_repo_tree_generator.py +96 -0
- teddy_executor/adapters/outbound/openrouter_hydrator.py +89 -0
- teddy_executor/adapters/outbound/shell_adapter.py +344 -0
- teddy_executor/adapters/outbound/shell_command_builder.py +105 -0
- teddy_executor/adapters/outbound/system_environment_adapter.py +62 -0
- teddy_executor/adapters/outbound/system_environment_inspector.py +54 -0
- teddy_executor/adapters/outbound/system_time_adapter.py +22 -0
- teddy_executor/adapters/outbound/web_scraper_adapter.py +346 -0
- teddy_executor/adapters/outbound/web_searcher_adapter.py +122 -0
- teddy_executor/adapters/outbound/yaml_config_adapter.py +105 -0
- teddy_executor/container.py +333 -0
- teddy_executor/core/__init__.py +0 -0
- teddy_executor/core/domain/__init__.py +0 -0
- teddy_executor/core/domain/models/__init__.py +44 -0
- teddy_executor/core/domain/models/action_ports.py +28 -0
- teddy_executor/core/domain/models/change_set.py +10 -0
- teddy_executor/core/domain/models/exceptions.py +40 -0
- teddy_executor/core/domain/models/execution_report.py +65 -0
- teddy_executor/core/domain/models/orchestrator_ports.py +26 -0
- teddy_executor/core/domain/models/plan.py +85 -0
- teddy_executor/core/domain/models/planning_ports.py +43 -0
- teddy_executor/core/domain/models/project_context.py +56 -0
- teddy_executor/core/domain/models/report_assembly_data.py +18 -0
- teddy_executor/core/domain/models/session.py +17 -0
- teddy_executor/core/domain/models/shell_output.py +12 -0
- teddy_executor/core/domain/models/web_search_results.py +26 -0
- teddy_executor/core/ports/__init__.py +0 -0
- teddy_executor/core/ports/inbound/__init__.py +0 -0
- teddy_executor/core/ports/inbound/edit_simulator.py +33 -0
- teddy_executor/core/ports/inbound/get_context_use_case.py +32 -0
- teddy_executor/core/ports/inbound/init.py +15 -0
- teddy_executor/core/ports/inbound/plan_parser.py +52 -0
- teddy_executor/core/ports/inbound/plan_reviewer.py +44 -0
- teddy_executor/core/ports/inbound/plan_validator.py +26 -0
- teddy_executor/core/ports/inbound/planning_use_case.py +30 -0
- teddy_executor/core/ports/inbound/run_plan_use_case.py +60 -0
- teddy_executor/core/ports/outbound/__init__.py +34 -0
- teddy_executor/core/ports/outbound/config_service.py +29 -0
- teddy_executor/core/ports/outbound/environment_inspector.py +30 -0
- teddy_executor/core/ports/outbound/execution_report_assembler.py +19 -0
- teddy_executor/core/ports/outbound/file_system_manager.py +131 -0
- teddy_executor/core/ports/outbound/llm_client.py +90 -0
- teddy_executor/core/ports/outbound/markdown_report_formatter.py +26 -0
- teddy_executor/core/ports/outbound/prompt_manager.py +55 -0
- teddy_executor/core/ports/outbound/repo_tree_generator.py +17 -0
- teddy_executor/core/ports/outbound/session_loop_guard.py +16 -0
- teddy_executor/core/ports/outbound/session_manager.py +97 -0
- teddy_executor/core/ports/outbound/session_repository.py +65 -0
- teddy_executor/core/ports/outbound/shell_executor.py +24 -0
- teddy_executor/core/ports/outbound/system_environment.py +25 -0
- teddy_executor/core/ports/outbound/time_service.py +28 -0
- teddy_executor/core/ports/outbound/user_interactor.py +126 -0
- teddy_executor/core/ports/outbound/web_scraper.py +24 -0
- teddy_executor/core/ports/outbound/web_searcher.py +25 -0
- teddy_executor/core/services/__init__.py +0 -0
- teddy_executor/core/services/action_changeset_builder.py +90 -0
- teddy_executor/core/services/action_diff_manager.py +110 -0
- teddy_executor/core/services/action_dispatcher.py +142 -0
- teddy_executor/core/services/action_executor.py +209 -0
- teddy_executor/core/services/action_factory.py +197 -0
- teddy_executor/core/services/action_parser_complex.py +216 -0
- teddy_executor/core/services/action_parser_strategies.py +84 -0
- teddy_executor/core/services/context_service.py +437 -0
- teddy_executor/core/services/edit_simulator.py +128 -0
- teddy_executor/core/services/execution_orchestrator.py +295 -0
- teddy_executor/core/services/execution_report_assembler.py +62 -0
- teddy_executor/core/services/init_service.py +80 -0
- teddy_executor/core/services/markdown_plan_parser.py +309 -0
- teddy_executor/core/services/markdown_report_formatter.py +143 -0
- teddy_executor/core/services/parser_infrastructure.py +222 -0
- teddy_executor/core/services/parser_metadata.py +153 -0
- teddy_executor/core/services/parser_reporting.py +267 -0
- teddy_executor/core/services/plan_validator.py +82 -0
- teddy_executor/core/services/planning_service.py +242 -0
- teddy_executor/core/services/prompt_manager.py +146 -0
- teddy_executor/core/services/session_lifecycle_manager.py +228 -0
- teddy_executor/core/services/session_loop_guard.py +46 -0
- teddy_executor/core/services/session_orchestrator.py +538 -0
- teddy_executor/core/services/session_planner.py +43 -0
- teddy_executor/core/services/session_pruning_service.py +438 -0
- teddy_executor/core/services/session_replanner.py +105 -0
- teddy_executor/core/services/session_repository.py +194 -0
- teddy_executor/core/services/session_service.py +529 -0
- teddy_executor/core/services/templates/execution_report.md.j2 +290 -0
- teddy_executor/core/services/validation_rules/__init__.py +4 -0
- teddy_executor/core/services/validation_rules/edit.py +207 -0
- teddy_executor/core/services/validation_rules/edit_matcher.py +247 -0
- teddy_executor/core/services/validation_rules/edit_matcher_heuristics.py +84 -0
- teddy_executor/core/services/validation_rules/execute.py +37 -0
- teddy_executor/core/services/validation_rules/filesystem.py +73 -0
- teddy_executor/core/services/validation_rules/helpers.py +178 -0
- teddy_executor/core/services/validation_rules/message.py +29 -0
- teddy_executor/core/utils/__init__.py +1 -0
- teddy_executor/core/utils/diff.py +57 -0
- teddy_executor/core/utils/io.py +75 -0
- teddy_executor/core/utils/markdown.py +131 -0
- teddy_executor/core/utils/serialization.py +39 -0
- teddy_executor/core/utils/string.py +351 -0
- teddy_executor/prompts.py +45 -0
- teddy_executor/registries/__init__.py +1 -0
- teddy_executor/registries/infrastructure.py +147 -0
- teddy_executor/registries/reviewer.py +57 -0
- teddy_executor/registries/validators.py +47 -0
- teddy_executor/resources/__init__.py +1 -0
- teddy_executor/resources/config/.gitignore +2 -0
- teddy_executor/resources/config/__init__.py +1 -0
- teddy_executor/resources/config/config.yaml +49 -0
- teddy_executor/resources/config/init.context +5 -0
- teddy_executor/resources/config/prompts/architect.xml +462 -0
- teddy_executor/resources/config/prompts/assistant.xml +336 -0
- teddy_executor/resources/config/prompts/debugger.xml +456 -0
- teddy_executor/resources/config/prompts/developer.xml +481 -0
- teddy_executor/resources/config/prompts/pathfinder.xml +502 -0
- teddy_executor/resources/config/prompts/prototyper.xml +425 -0
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
from teddy_executor.core.ports.outbound.web_scraper import WebScraper
|
|
2
|
+
from teddy_executor.core.ports.outbound.config_service import IConfigService
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# Scraped GitHub ".markdown-body" blocks are kept only when their text is
# longer than this many characters (see _scrape_github_html).
MIN_GITHUB_CONTENT_LENGTH = 10
|
|
6
|
+
# HTTP status codes referenced by the retry / User-Agent rotation logic below.
HTTP_BAD_REQUEST = 400
|
|
7
|
+
HTTP_FORBIDDEN = 403
|
|
8
|
+
HTTP_NOT_ACCEPTABLE = 406
|
|
9
|
+
HTTP_TOO_MANY_REQUESTS = 429
|
|
10
|
+
HTTP_INTERNAL_SERVER_ERROR = 500
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class WebScraperAdapter(WebScraper):
    """
    An adapter that implements the WebScraper port using requests and trafilatura.

    Third-party modules (requests, trafilatura, bs4) are imported lazily inside
    the methods that use them so that importing this module stays cheap.
    """

    # Retry budget used when no config service or no valid setting is available.
    _DEFAULT_MAX_RETRIES = 3

    # GraphQL "__typename" values that are rendered as conversation comments.
    _GITHUB_COMMENT_TYPES = (
        "IssueComment",
        "PullRequestReview",
        "PullRequestReviewComment",
    )

    def __init__(self, config_service: IConfigService | None = None):
        """
        Args:
            config_service: Optional settings provider. When omitted, built-in
                defaults are used and output truncation is disabled.
        """
        self._config_service = config_service

    def _get_trafilatura(self):
        """Lazy-load trafilatura to keep CLI startup fast."""
        import trafilatura

        return trafilatura

    def _get_bs4(self):
        """Lazy-load BeautifulSoup to keep CLI startup fast."""
        from bs4 import BeautifulSoup

        return BeautifulSoup

    def _get_max_retries(self) -> int:
        """
        Reads 'research.max_scraper_retries' from the config service.

        Returns:
            The configured attempt count, never less than 1. Falls back to the
            built-in default when there is no config service, the setting is
            None, or the value cannot be converted to an int.
        """
        default = self._DEFAULT_MAX_RETRIES
        if not self._config_service:
            return default

        val = self._config_service.get_setting("research.max_scraper_retries", default)
        if val is None:
            return default
        try:
            retries = int(val)
        except (TypeError, ValueError):
            # A malformed setting must not crash the scrape; use the default.
            return default
        # The value is used as range(max_retries): anything below 1 would mean
        # that no request is ever attempted.
        return max(1, retries)

    @staticmethod
    def _supported_accept_encoding() -> str:
        """
        Builds an Accept-Encoding value limited to encodings that can be decoded.

        'br' is only advertised when a Brotli library is importable; otherwise a
        server could answer with Brotli-compressed bytes that requests cannot
        decode, leaving response.text unreadable.
        """
        from importlib import util as importlib_util

        encodings = "gzip, deflate"
        try:
            has_brotli = bool(
                importlib_util.find_spec("brotli")
                or importlib_util.find_spec("brotlicffi")
            )
        except (ImportError, ValueError):
            has_brotli = False
        if has_brotli:
            encodings += ", br"
        return encodings

    def _extract_github_conversation(self, html: str) -> str:
        """
        Extracts issue or pull request content and comments from GitHub HTML.
        Uses a hybrid strategy: JSON-embedded data (primary) and CSS selectors (fallback).

        Args:
            html: The raw HTML of the page.

        Returns:
            A markdown-style string with the title, description and comments.
        """
        import json

        soup = self._get_bs4()(html, "html.parser")

        # 1. Primary: Try high-fidelity extraction from embedded JSON data
        for script in soup.find_all("script", type="application/json"):
            if not script.string:
                continue
            try:
                data = json.loads(script.string)
                result = self._parse_github_json(data)
                if result:
                    return result
            except (json.JSONDecodeError, TypeError, AttributeError):
                # Unparseable or unexpectedly shaped payload: try the next script.
                continue

        # 2. Fallback: CSS-based scraping
        return self._scrape_github_html(soup)

    def _find_key_recursive(self, obj, target_key):
        """
        Recursively search for a key in a nested dictionary/list.

        A dict that directly contains target_key returns that value as-is.
        While descending into children, only truthy results are accepted, so
        the search continues past nested matches whose value is falsy.

        Returns:
            The first value found (depth-first), or None when nothing matched.
        """
        if isinstance(obj, dict):
            if target_key in obj:
                return obj[target_key]
            children = obj.values()
        elif isinstance(obj, list):
            children = obj
        else:
            return None

        for child in children:
            res = self._find_key_recursive(child, target_key)
            if res:
                return res
        return None

    def _gather_edges_recursive(self, obj, edges_out: list):
        """
        Recursively gather all 'edges' lists into the output list.

        Args:
            obj: Arbitrary nested dict/list structure to walk.
            edges_out: List that is extended in place with every item of every
                list stored under an "edges" key.
        """
        if isinstance(obj, dict):
            edges = obj.get("edges")
            if isinstance(edges, list):
                edges_out.extend(edges)
            for v in obj.values():
                self._gather_edges_recursive(v, edges_out)
        elif isinstance(obj, list):
            for i in obj:
                self._gather_edges_recursive(i, edges_out)

    def _parse_github_json(self, data: dict) -> str | None:
        """
        Helper to recursively find and parse issue/PR data from JSON.

        Args:
            data: Decoded JSON payload taken from a <script> tag.

        Returns:
            Markdown-style text with title, description and comments, or None
            when the payload has no dict under an "issue"/"pullRequest" key.
        """
        container = self._find_key_recursive(data, "issue") or self._find_key_recursive(
            data, "pullRequest"
        )
        if not container or not isinstance(container, dict):
            return None

        title = (
            container.get("title")
            or container.get("titleHtml")
            or container.get("titleText")
            or "Unknown Title"
        )
        body = container.get("body") or container.get("bodyHTML", "")

        all_edges: list[dict] = []
        self._gather_edges_recursive(data, all_edges)

        comments = []
        seen_ids = set()
        for edge in all_edges:
            node = edge.get("node") if isinstance(edge, dict) else None
            # A JSON null (or any non-object) node carries no comment; skip it
            # instead of raising and discarding the whole JSON extraction.
            if not isinstance(node, dict):
                continue

            node_id = node.get("id")
            if not node_id or node_id in seen_ids:
                continue

            typename = node.get("__typename")
            if typename not in self._GITHUB_COMMENT_TYPES:
                continue

            seen_ids.add(node_id)
            # "author" may be present but null; treat that like a missing author.
            author_info = node.get("author")
            author = "unknown"
            if isinstance(author_info, dict):
                author = author_info.get("login") or "unknown"
            c_body = node.get("body") or node.get("bodyHTML") or ""
            comments.append(f"### {typename} by {author}\n{c_body}\n\n")

        return f"# {title}\n\n## Description\n{body}\n\n" + "".join(comments)

    def _scrape_github_html(self, soup) -> str:
        """
        Helper for CSS-based fallback scraping.

        Args:
            soup: Parsed BeautifulSoup document.

        Returns:
            Markdown-style text: a title followed by one section per
            ".markdown-body" element whose text is long enough to keep.
        """
        title_elem = soup.select_one(".markdown-title") or soup.select_one(
            ".gh-header-title"
        )
        title = title_elem.get_text(strip=True) if title_elem else "GitHub Content"

        content_blocks = []
        for i, block in enumerate(soup.select(".markdown-body")):
            text = block.get_text(separator="\n", strip=True)
            if len(text) > MIN_GITHUB_CONTENT_LENGTH:
                # The first body is labelled as the description; later ones are
                # numbered by their position among all ".markdown-body" elements.
                label = "Description" if i == 0 else f"Comment {i}"
                content_blocks.append(f"## {label}\n{text}\n\n")

        return f"# {title}\n\n" + "".join(content_blocks)

    def _is_retryable_error(self, status_code: int | None) -> bool:
        """
        Determines if an HTTP error is transient and should be retried.

        Returns:
            True when there is no status code, for any status >= 500, and for 429.
        """
        if not status_code:
            return True
        # Retry on 5xx (server errors) or 429 (Too Many Requests)
        return (
            status_code >= HTTP_INTERNAL_SERVER_ERROR
            or status_code == HTTP_TOO_MANY_REQUESTS
        )

    def _fetch_with_ua(self, url: str, ua: str, max_retries: int) -> str | None:
        """
        Internal helper to attempt fetch with a specific User-Agent and retries.

        Args:
            url: The URL to fetch.
            ua: The User-Agent header value to send.
            max_retries: Maximum number of attempts for this User-Agent.

        Returns:
            The response text, or None when this User-Agent should be abandoned
            (403/406, exhausted connection/timeout retries, or an unexpected error).

        Raises:
            requests.exceptions.HTTPError: For HTTP errors other than 403/406
                that are not retryable or that persist on the last attempt.
        """
        import requests
        import time

        headers = {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": self._supported_accept_encoding(),
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        }

        for attempt in range(max_retries):
            is_last_attempt = attempt >= max_retries - 1
            try:
                response = requests.get(url, headers=headers, timeout=20)
                response.raise_for_status()
                return response.text
            except requests.exceptions.HTTPError as e:
                status_code = getattr(e.response, "status_code", None)

                # 403/406 signal a need for UA rotation, not same-UA retry
                if status_code in [HTTP_FORBIDDEN, HTTP_NOT_ACCEPTABLE]:
                    return None

                if self._is_retryable_error(status_code) and not is_last_attempt:
                    time.sleep(2**attempt)  # exponential backoff: 1s, 2s, 4s, ...
                    continue
                raise
            except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
                if not is_last_attempt:
                    time.sleep(2**attempt)
                    continue
                return None
            except Exception:
                # Deliberate best-effort: any other failure just means this
                # User-Agent yields nothing and the caller moves on.
                return None
        return None

    def _fetch_with_rotation(self, url: str) -> str | None:
        """
        Attempts to fetch HTML content using a rotating pool of User-Agents and retries.

        Returns:
            The first non-empty HTML obtained, otherwise whatever trafilatura's
            own fetcher returns.

        Raises:
            Exception: The last error raised by a User-Agent attempt, when the
                trafilatura fallback also produced no content.
        """
        user_agents = [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, Gecko) Chrome/124.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, Gecko) Chrome/121.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1",
        ]

        max_retries = self._get_max_retries()
        last_error: Exception | None = None

        for ua in user_agents:
            try:
                html_content = self._fetch_with_ua(url, ua, max_retries)
                if html_content:
                    return html_content
            except Exception as e:
                # Remember the failure but still give the remaining agents a try.
                last_error = e
                continue

        # Final Resort: Trafilatura's internal fetcher
        html_content = self._get_trafilatura().fetch_url(url)
        if not html_content and last_error:
            raise last_error

        return html_content

    def _handle_github_raw(self, url: str) -> str | None:
        """
        Handles specialized fetching for GitHub raw content with retries.

        Args:
            url: Candidate URL. Only "https://raw.githubusercontent.com/..." and
                "https://github.com/.../blob/..." URLs are handled.

        Returns:
            The raw file text, or None when the URL is not a GitHub raw/blob URL.

        Raises:
            requests.exceptions.RequestException: On a 4xx other than 403/429,
                or when the last attempt still fails.
        """
        import requests
        import time

        is_raw_github = url.startswith("https://raw.githubusercontent.com/")
        is_github_blob = url.startswith("https://github.com/") and "/blob/" in url

        if not (is_raw_github or is_github_blob):
            return None

        if is_github_blob:
            # Rewrite only the host and the first "/blob/" segment; a repo or
            # file path may legitimately contain either substring again.
            target_url = url.replace(
                "github.com", "raw.githubusercontent.com", 1
            ).replace("/blob/", "/", 1)
        else:
            target_url = url

        max_retries = self._get_max_retries()

        for attempt in range(max_retries):
            try:
                response = requests.get(
                    target_url,
                    headers={
                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, Gecko) Chrome/124.0.0.0 Safari/537.36"
                    },
                    timeout=30,
                )
                response.raise_for_status()
                return response.text
            except (
                requests.exceptions.HTTPError,
                requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
            ) as e:
                if attempt < max_retries - 1:
                    status_code = getattr(
                        getattr(e, "response", None), "status_code", None
                    )
                    # Don't retry on most 4xx
                    if (
                        status_code
                        and HTTP_BAD_REQUEST <= status_code < HTTP_INTERNAL_SERVER_ERROR
                        and status_code not in [HTTP_FORBIDDEN, HTTP_TOO_MANY_REQUESTS]
                    ):
                        raise
                    time.sleep(2**attempt)
                    continue
                raise
        return None

    def _truncate_content(self, markdown_content: str) -> str:
        """
        Truncates markdown content based on read.max_lines configuration.

        Returns:
            The first read.max_lines lines joined with newlines when the content
            is longer than that; otherwise the content unchanged (also when no
            config service is set or the setting is missing/not an integer).
        """
        if not self._config_service or not markdown_content:
            return markdown_content

        max_lines = self._config_service.get_setting("read.max_lines", None)
        if max_lines is None:
            return markdown_content

        try:
            max_lines_int = int(max_lines)
        except (ValueError, TypeError):
            return markdown_content

        lines = markdown_content.splitlines()
        if len(lines) > max_lines_int:
            return "\n".join(lines[:max_lines_int])
        return markdown_content

    def get_content(self, url: str, **_kwargs) -> str:
        """
        Fetches and extracts the content from the given URL.
        Employs a multi-stage stealth rotation to bypass automated blocking.

        Args:
            url: The URL to fetch content from.
            **_kwargs: Optional extraction hints.

        Returns:
            The extracted text content.
        """
        # 1. Specialized handling for GitHub raw content
        raw_github_content = self._handle_github_raw(url)
        if raw_github_content is not None:
            return raw_github_content

        # 2. Multi-stage Stealth Rotation for general URLs
        html_content = self._fetch_with_rotation(url)
        if not html_content:
            return ""

        # 3. Routing: Use specialized extractor for GitHub Issues and Pull Requests
        is_github_domain = (
            "github.com" in url or "localhost" in url or "127.0.0.1" in url
        )
        if is_github_domain and ("/issues/" in url or "/pull/" in url):
            github_content = self._extract_github_conversation(html_content)
            if github_content:
                return github_content

        trafilatura = self._get_trafilatura()
        markdown_content = trafilatura.extract(
            html_content,
            output_format="markdown",
            include_links=True,
            include_formatting=True,
            favor_recall=False,
            include_comments=False,
            include_tables=True,
        )

        if not markdown_content:
            return ""

        return self._truncate_content(markdown_content)
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any, Callable, List, Optional
|
|
3
|
+
from teddy_executor.core.domain.models import (
|
|
4
|
+
QueryResult,
|
|
5
|
+
SearchResult,
|
|
6
|
+
WebSearchError,
|
|
7
|
+
WebSearchResults,
|
|
8
|
+
)
|
|
9
|
+
from teddy_executor.core.ports.outbound import IConfigService, IWebSearcher
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class WebSearcherAdapter(IWebSearcher):
    """
    An adapter that uses the ddgs library to perform web searches.
    """

    def __init__(
        self,
        config_service: IConfigService,
        ddgs_factory: Optional[Callable[..., Any]] = None,
    ):
        """
        Args:
            config_service: Settings provider; 'research.max_results' is read
                from it for every query.
            ddgs_factory: Optional callable returning a context-manager client
                with a ``text(query, max_results=...)`` method. Defaults to
                ``ddgs.DDGS`` when omitted.
        """
        self._config_service = config_service
        self._ddgs_factory = ddgs_factory

    def _apply_ddgs_monkeypatch(self) -> None:
        """Applies a structural patch to DDGS to preserve word boundaries."""
        from ddgs.base import BaseSearchEngine

        def patched_extract_results(engine, html_text: str):
            # "engine" is the BaseSearchEngine instance the method is bound to.
            html_text = engine.pre_process_html(html_text)
            tree = engine.extract_tree(html_text)
            results = []
            for item in tree.xpath(engine.items_xpath):
                result = engine.result_type()
                for key, value in engine.elements_xpath.items():
                    parts = (x.strip() for x in item.xpath(value))
                    # JOIN WITH SPACE instead of empty string to preserve boundaries,
                    # then collapse any run of whitespace into a single space.
                    data = " ".join(" ".join(parts).split())
                    setattr(result, key, data)
                results.append(result)
            return results

        # Apply the structural patch to the base class
        BaseSearchEngine.extract_results = patched_extract_results  # type: ignore[method-assign]

    def _clean_snippet(self, text: str) -> str:
        """
        Fixes missing spaces after punctuation in raw text.

        Returns:
            The text with a space inserted wherever a period, comma or colon is
            immediately followed by an ASCII letter; "" for empty input.
        """
        import re

        if not text:
            return ""
        # Fix missing space after period, comma, or colon followed by a letter
        return re.sub(r"([.,:])([A-Za-z])", r"\1 \2", text)

    def _execute_single_query(
        self,
        ddgs_client: Any,
        query: str,
        total_queries: int,
        deferred_warnings: Optional[List[str]] = None,
    ) -> QueryResult:
        """
        Executes a single search query and maps results.

        Args:
            ddgs_client: Client exposing ``text(query, max_results=...)``.
            query: The search query.
            total_queries: Number of queries in the surrounding search call.
            deferred_warnings: When given, failure messages are appended to
                this list instead of being logged immediately, so a caller
                that has silenced logging can emit them afterwards.

        Returns:
            The query together with its mapped results; the result list is
            empty when the query failed and other queries exist.

        Raises:
            WebSearchError: When the query fails and it is the only query.
        """
        try:
            max_results = self._config_service.get_setting("research.max_results", 5)
            # DDGS.text returns a generator, so we convert it to a list
            results = list(ddgs_client.text(query, max_results=max_results))

            search_results_for_query: List[SearchResult] = []
            for res in results:
                # Map library 'body' to our 'description'
                item: SearchResult = {
                    "title": res.get("title", ""),
                    "href": res.get("href", ""),
                    "description": self._clean_snippet(res.get("body", "")),
                }
                search_results_for_query.append(item)

            return {
                "query": query,
                "results": search_results_for_query,
            }
        except Exception as e:
            # Log the individual query failure but continue with other queries.
            # This prevents one failing query from sabotaging the entire action.
            message = f"Search query '{query}' failed: {e}"
            if deferred_warnings is None:
                logger.warning("%s", message)
            else:
                deferred_warnings.append(message)

            # If this is the ONLY query, we still want to raise the error
            # to maintain failure transparency (Stop the Line).
            if total_queries == 1:
                raise WebSearchError(f"Failed to execute search: {e}") from e

            return {
                "query": query,
                "results": [],
            }

    def search(self, queries: List[str]) -> WebSearchResults:
        """
        Performs a web search for each query and maps the results.

        Args:
            queries: The search queries to run, in order.

        Returns:
            A mapping with one query result per input query under "query_results".

        Raises:
            WebSearchError: When exactly one query was given and it failed.
        """
        from ddgs import DDGS

        self._apply_ddgs_monkeypatch()
        all_query_results: List[QueryResult] = []
        factory = self._ddgs_factory or DDGS
        failure_messages: List[str] = []

        # Globally disable logging (CRITICAL and below) to silence noisy
        # third-party HTTP clients (urllib3, httpx, curl_cffi) used by DDGS.
        # Remember the level that was in effect so it can be restored exactly;
        # logging exposes no public getter, so read the manager attribute that
        # logging.disable() itself writes.
        previous_disable_level = logging.root.manager.disable
        logging.disable(logging.CRITICAL)
        try:
            with factory() as ddgs_client:
                for query in queries:
                    result = self._execute_single_query(
                        ddgs_client, query, len(queries), failure_messages
                    )
                    all_query_results.append(result)

            return {"query_results": all_query_results}
        finally:
            logging.disable(previous_disable_level)
            # Our own warnings would have been dropped while logging was
            # disabled, so emit them only now that it is restored.
            for message in failure_messages:
                logger.warning("%s", message)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from importlib import resources
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
import yaml
|
|
5
|
+
from teddy_executor.core.ports.outbound.config_service import IConfigService
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class YamlConfigAdapter(IConfigService):
    """
    IConfigService implementation backed by YAML.

    The effective configuration is the bundled baseline config with the
    user's config file (if any) merged on top of it.
    """

    def __init__(
        self, config_path: str = ".teddy/config.yaml", root_dir: Optional[str] = None
    ):
        """
        Args:
            config_path: Path of the user config file.
            root_dir: Optional directory that config_path is joined onto.
        """
        self._config_path = (
            os.path.join(root_dir, config_path) if root_dir else config_path
        )
        self._config: Dict[str, Any] = self._load_layered_config()

    def _load_layered_config(self) -> Dict[str, Any]:
        """Builds the effective config: baseline first, user overrides merged in."""
        layered = self._load_baseline()
        self._merge_dicts(layered, self._load_user_config())
        return layered

    def _load_baseline(self) -> Dict[str, Any]:
        """
        Reads 'config.yaml' from the 'teddy_executor.resources.config' package.

        Returns:
            The parsed mapping, or {} when the resource cannot be read or
            parsed, or its top level is not a mapping.
        """
        try:
            bundled = resources.files("teddy_executor.resources.config").joinpath(
                "config.yaml"
            )
            with bundled.open("r", encoding="utf-8") as handle:
                parsed = yaml.safe_load(handle)
        except (yaml.YAMLError, OSError, ImportError, AttributeError):
            return {}
        return parsed if isinstance(parsed, dict) else {}

    def _load_user_config(self) -> Dict[str, Any]:
        """
        Reads the user's YAML config file.

        Returns:
            The parsed mapping, or {} when the file does not exist, cannot be
            read or parsed, or its top level is not a mapping.
        """
        if not os.path.exists(self._config_path):
            return {}

        try:
            with open(self._config_path, "r", encoding="utf-8") as handle:
                parsed = yaml.safe_load(handle)
        except (yaml.YAMLError, OSError):
            return {}
        return parsed if isinstance(parsed, dict) else {}

    def _merge_dicts(self, base: Dict[str, Any], overrides: Dict[str, Any]) -> None:
        """
        Merges overrides into base in place, descending into nested mappings.

        An override of None removes the key from base. A mapping override
        replaces a non-mapping (or missing) base value with a fresh dict before
        merging into it. Any other override value replaces the base value.
        """
        for name, incoming in overrides.items():
            if incoming is None:
                base.pop(name, None)
                continue

            if not isinstance(incoming, dict):
                base[name] = incoming
                continue

            target = base.get(name)
            if not isinstance(target, dict):
                target = {}
                base[name] = target
            self._merge_dicts(target, incoming)

    def get_setting(self, key: str, default: Optional[Any] = None) -> Optional[Any]:
        """
        Looks up a configuration value.

        A literal top-level key wins; otherwise the key is split on '.' and
        resolved through nested mappings (e.g., 'outer.inner').

        Returns:
            The stored value, or default when the key is empty, missing, or
            resolves to None through nested lookup.
        """
        if not key:
            return default

        # Exact top-level match takes priority over dotted resolution.
        if key in self._config:
            return self._config[key]

        resolved = self._resolve_nested(key.split("."))
        return default if resolved is None else resolved

    def get_config_path(self) -> str:
        """Returns the user config file path this adapter was built with."""
        return self._config_path

    def _resolve_nested(self, parts: list[str]) -> Optional[Any]:
        """
        Walks the loaded config one key segment at a time.

        Returns:
            The value reached after the last segment, or None as soon as a
            segment is missing or the current value is not a mapping.
        """
        node: Any = self._config
        for segment in parts:
            if not (isinstance(node, dict) and segment in node):
                return None
            node = node[segment]
        return node
|