entari-plugin-hyw 0.3.5__py3-none-any.whl → 4.0.0rc14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic. Click here for more details.
- entari_plugin_hyw/Untitled-1 +1865 -0
- entari_plugin_hyw/__init__.py +979 -116
- entari_plugin_hyw/filters.py +83 -0
- entari_plugin_hyw/history.py +251 -0
- entari_plugin_hyw/misc.py +214 -0
- entari_plugin_hyw/search_cache.py +154 -0
- entari_plugin_hyw-4.0.0rc14.dist-info/METADATA +118 -0
- entari_plugin_hyw-4.0.0rc14.dist-info/RECORD +72 -0
- {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/WHEEL +1 -1
- {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/top_level.txt +1 -0
- hyw_core/__init__.py +94 -0
- hyw_core/agent.py +768 -0
- hyw_core/browser_control/__init__.py +63 -0
- hyw_core/browser_control/assets/card-dist/index.html +425 -0
- hyw_core/browser_control/assets/card-dist/logos/anthropic.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/cerebras.svg +9 -0
- hyw_core/browser_control/assets/card-dist/logos/deepseek.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/gemini.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/google.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/grok.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/huggingface.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/microsoft.svg +15 -0
- hyw_core/browser_control/assets/card-dist/logos/minimax.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/mistral.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/nvida.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/openai.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/openrouter.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/perplexity.svg +24 -0
- hyw_core/browser_control/assets/card-dist/logos/qwen.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/xai.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/xiaomi.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/zai.png +0 -0
- hyw_core/browser_control/assets/card-dist/vite.svg +1 -0
- hyw_core/browser_control/assets/index.html +5691 -0
- hyw_core/browser_control/assets/logos/anthropic.svg +1 -0
- hyw_core/browser_control/assets/logos/cerebras.svg +9 -0
- hyw_core/browser_control/assets/logos/deepseek.png +0 -0
- hyw_core/browser_control/assets/logos/gemini.svg +1 -0
- hyw_core/browser_control/assets/logos/google.svg +1 -0
- hyw_core/browser_control/assets/logos/grok.png +0 -0
- hyw_core/browser_control/assets/logos/huggingface.png +0 -0
- hyw_core/browser_control/assets/logos/microsoft.svg +15 -0
- hyw_core/browser_control/assets/logos/minimax.png +0 -0
- hyw_core/browser_control/assets/logos/mistral.png +0 -0
- hyw_core/browser_control/assets/logos/nvida.png +0 -0
- hyw_core/browser_control/assets/logos/openai.svg +1 -0
- hyw_core/browser_control/assets/logos/openrouter.png +0 -0
- hyw_core/browser_control/assets/logos/perplexity.svg +24 -0
- hyw_core/browser_control/assets/logos/qwen.png +0 -0
- hyw_core/browser_control/assets/logos/xai.png +0 -0
- hyw_core/browser_control/assets/logos/xiaomi.png +0 -0
- hyw_core/browser_control/assets/logos/zai.png +0 -0
- hyw_core/browser_control/engines/__init__.py +15 -0
- hyw_core/browser_control/engines/base.py +13 -0
- hyw_core/browser_control/engines/default.py +166 -0
- hyw_core/browser_control/engines/duckduckgo.py +171 -0
- hyw_core/browser_control/landing.html +172 -0
- hyw_core/browser_control/manager.py +173 -0
- hyw_core/browser_control/renderer.py +446 -0
- hyw_core/browser_control/service.py +940 -0
- hyw_core/config.py +154 -0
- hyw_core/core.py +462 -0
- hyw_core/crawling/__init__.py +18 -0
- hyw_core/crawling/completeness.py +437 -0
- hyw_core/crawling/models.py +88 -0
- hyw_core/definitions.py +104 -0
- hyw_core/image_cache.py +274 -0
- hyw_core/pipeline.py +502 -0
- hyw_core/search.py +171 -0
- hyw_core/stages/__init__.py +21 -0
- hyw_core/stages/base.py +95 -0
- hyw_core/stages/summary.py +191 -0
- entari_plugin_hyw/agent.py +0 -419
- entari_plugin_hyw/compressor.py +0 -59
- entari_plugin_hyw/tools.py +0 -236
- entari_plugin_hyw/vision.py +0 -35
- entari_plugin_hyw-0.3.5.dist-info/METADATA +0 -112
- entari_plugin_hyw-0.3.5.dist-info/RECORD +0 -9
hyw_core/pipeline.py
ADDED
|
@@ -0,0 +1,502 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Modular Pipeline Dispatcher
|
|
3
|
+
|
|
4
|
+
New pipeline architecture: Instruct Loop (x2) -> Summary.
|
|
5
|
+
Simpler flow with self-correction/feedback loop.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import time
|
|
10
|
+
import re
|
|
11
|
+
from typing import Any, Dict, List, Optional, Callable, Awaitable
|
|
12
|
+
|
|
13
|
+
from loguru import logger
|
|
14
|
+
from openai import AsyncOpenAI
|
|
15
|
+
|
|
16
|
+
from .stages.base import StageContext, StageResult
|
|
17
|
+
from .stages.base import StageContext, StageResult, BaseStage
|
|
18
|
+
from .stages.summary import SummaryStage
|
|
19
|
+
from .search import SearchService
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ModularPipeline:
    """
    Modular Pipeline.

    Flow:
    1. Input Analysis:
        - If Images -> Skip Search -> Summary
        - If Text -> Execute Search (or URL fetch) -> Summary
    2. Summary: Generate final response.
    """

    def __init__(self, config: Any, search_service: SearchService, send_func: Optional[Callable[[str], Awaitable[None]]] = None):
        """Wire up the pipeline.

        Args:
            config: Plugin configuration object; must expose ``base_url``,
                ``api_key``, ``model_name``, ``models`` and
                ``get_model_config`` (read later by ``execute``).
            search_service: Shared SearchService used for web search and
                direct URL fetching.
            send_func: Optional async callback for streaming progress
                messages to the caller; stored but not invoked here.
        """
        self.config = config
        self.send_func = send_func
        self.search_service = search_service
        # Single OpenAI-compatible async client shared with the stages.
        self.client = AsyncOpenAI(base_url=config.base_url, api_key=config.api_key)

        # Initialize stages
        self.summary_stage = SummaryStage(config, self.search_service, self.client)
|
|
41
|
+
|
|
42
|
+
@property
def _send_func(self) -> Optional[Callable[[str], Awaitable[None]]]:
    """Getter for _send_func (alias for send_func)."""
    return self.send_func

@_send_func.setter
def _send_func(self, value: Optional[Callable[[str], Awaitable[None]]]):
    """Setter alias: updates ``send_func`` only.

    NOTE(review): the previous docstring claimed this "propagates to
    stages", but no propagation happens in the body — stages are never
    notified of the new callback. Confirm whether stages should be
    updated here.
    """
    self.send_func = value
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
async def execute(
    self,
    user_input: str,
    conversation_history: List[Dict],
    model_name: str = None,
    images: List[str] = None,
) -> Dict[str, Any]:
    """Execute the modular pipeline for a single user turn.

    Flow: optional direct URL fetch / web search (both skipped when the
    user supplies images or a "long" query), then the Summary stage run
    concurrently with image prefetching, then citation re-indexing and
    result assembly.

    Args:
        user_input: Raw user query text.
        conversation_history: Mutable chat history; a user and an
            assistant turn are appended in place on success.
        model_name: Optional model override; falls back to
            ``config.model_name``, then the "instruct" model config.
        images: Optional user-supplied images; their presence disables
            search/fetch entirely.

    Returns:
        Dict with ``llm_response``, ``structured_response``, ``stats``,
        billing info, stage UI data and traces; on failure a reduced
        dict containing an ``error`` key.
    """
    start_time = time.time()
    stats = {"start_time": start_time}
    usage_totals = {"input_tokens": 0, "output_tokens": 0}
    active_model = model_name or self.config.model_name
    if not active_model:
        # Fallback to instruct model for logging/context
        active_model = self.config.get_model_config("instruct").model_name

    context = StageContext(
        user_input=user_input,
        images=images or [],
        conversation_history=conversation_history,
    )

    # Determine if model supports image input (looked up by name in config.models)
    model_cfg_dict = next((m for m in self.config.models if m.get("name") == active_model), None)
    if model_cfg_dict:
        context.image_input_supported = model_cfg_dict.get("image_input", True)
    else:
        context.image_input_supported = True  # Default to True if unknown

    logger.info(f"Pipeline Execution: Model '{active_model}' Image Input Supported: {context.image_input_supported}")

    trace: Dict[str, Any] = {
        "instruct_rounds": [],
        "summary": None,
    }

    try:
        logger.info(f"Pipeline: Processing '{user_input[:30]}...'")

        # === Image-First Logic ===
        # When user provides images, skip search and go directly to Instruct
        # Images will be passed through to both Instruct and Summary stages
        has_user_images = bool(images)
        if has_user_images:
            logger.info(f"Pipeline: {len(images)} user image(s) detected. Skipping search -> Instruct.")

        # === Search-First Logic (only when no images) ===
        # 1. URL Detection — pattern captures full URLs including query
        # strings and paths.
        url_pattern = re.compile(r'https?://(?:[-\w./?=&%#]+)')
        found_urls = url_pattern.findall(user_input)

        hit_content = False

        # Skip URL fetch and search if user provided images or long query.
        # NOTE(review): the 20-char threshold treats most non-trivial queries
        # as "long" and thus skips search — confirm this is intended.
        is_long_query = len(user_input) > 20
        if has_user_images:
            hit_content = False  # Force into Instruct path
        elif is_long_query:
            logger.info(f"Pipeline: Long query ({len(user_input)} chars). Skipping direct search/fetch -> Instruct.")
            hit_content = False
        elif found_urls:
            logger.info(f"Pipeline: Detected {len(found_urls)} URLs. Executing direct fetch...")
            # Fetch pages (borrowing logic from InstructStage's batch fetch would be ideal,
            # but we'll use search_service directly and simulate what Instruct did for context)

            # Fetch
            fetch_results = await self.search_service.fetch_pages_batch(found_urls)

            # Screenshot rendering is deliberately skipped here; the Summary
            # stage works from the extracted text only. The service's
            # fetch_page returns raw data; rendering screenshots would
            # require an extra renderer call.

            # Populate context.web_results. fetch_pages_batch preserves the
            # input order (asyncio.gather), so found_urls[i] matches
            # fetch_results[i].
            for i, page_data in enumerate(fetch_results):
                if page_data.get("content"):
                    hit_content = True
                    context.web_results.append({
                        "_id": context.next_id(),
                        "_type": "page",
                        "title": page_data.get("title", "Page"),
                        "url": page_data.get("url", found_urls[i]),
                        "content": page_data.get("content", ""),
                        "images": page_data.get("images", []),
                        # For now, no screenshot unless we call renderer.
                        # If critical, we can add it later.
                    })

        # 2. Search — only for short, text-only queries with no usable
        # fetched URL content ("judging result quantity > 0").
        if not hit_content and not has_user_images and not is_long_query and user_input.strip():
            logger.info("Pipeline: No URLs found or fetched. Executing direct search...")
            search_start = time.time()
            search_results = await self.search_service.search(user_input)
            context.search_time = time.time() - search_start

            # Filter out the raw debug page
            valid_results = [r for r in search_results if not r.get("_hidden")]

            if valid_results:
                logger.info(f"Pipeline: Search found {len(valid_results)} results in {context.search_time:.2f}s. Proceeding to Summary.")
                hit_content = True
                for item in search_results:  # Add all, including hidden debug ones if needed by history
                    item["_id"] = context.next_id()
                    if "_type" not in item: item["_type"] = "search"
                    item["query"] = user_input
                    context.web_results.append(item)
            else:
                logger.info("Pipeline: Search yielded 0 results.")

        # === Branching ===
        if hit_content and not has_user_images:
            # -> Summary Stage (search/URL results available)
            logger.info("Pipeline: Content found (URL/Search). Proceeding to Summary.")

            # If no content was found and no images, we still proceed to Summary but with empty context (Direct Chat)
            # If images, we proceed to Summary with images.

            # Refusal check from search results? (Unlikely, but good to keep in mind)
            pass

        # === Parallel Execution: Summary Generation + Image Prefetching ===
        # We run image prefetching concurrently with Summary generation to save time.

        # 1. Prepare candidates for prefetch (all http(s) image URLs in results)
        all_candidate_urls = set()
        for r in context.web_results:
            # Add images from search results/pages
            if r.get("images"):
                for img in r["images"]:
                    if img and isinstance(img, str) and img.startswith("http"):
                        all_candidate_urls.add(img)

        prefetch_list = list(all_candidate_urls)
        logger.info(f"Pipeline: Starting parallel execution (Summary + Prefetch {len(prefetch_list)} images)")

        # 2. Define parallel tasks with timing
        async def timed_summary():
            # Runs the Summary stage; returns (StageResult, wall seconds).
            t0 = time.time()
            # Collect page screenshots if image mode
            summary_input_images = list(images) if images else []
            if context.image_input_supported:
                # Collect pre-rendered screenshots from web_results
                for r in context.web_results:
                    if r.get("_type") == "page" and r.get("screenshot_b64"):
                        summary_input_images.append(r["screenshot_b64"])

            if context.should_refuse:
                return StageResult(success=True, data={"content": "Refused"}, usage={}, trace={}), 0.0

            res = await self.summary_stage.execute(
                context,
                images=summary_input_images if summary_input_images else None
            )
            duration = time.time() - t0
            return res, duration

        async def timed_prefetch():
            # Warms the image cache; returns ({url: data-uri}, wall seconds).
            # Best-effort: any failure yields an empty map.
            t0 = time.time()
            if not prefetch_list:
                return {}, 0.0
            try:
                from .image_cache import get_image_cache
                cache = get_image_cache()
                # Start prefetch (non-blocking kickoff)
                cache.start_prefetch(prefetch_list)
                # Wait for results (blocking until done)
                res = await cache.get_all_cached(prefetch_list)
                duration = time.time() - t0
                return res, duration
            except Exception as e:
                logger.warning(f"Pipeline: Prefetch failed: {e}")
                return {}, time.time() - t0

        # 3. Execute concurrently
        summary_task = asyncio.create_task(timed_summary())
        prefetch_task = asyncio.create_task(timed_prefetch())

        # Wait for both to complete
        await asyncio.wait([summary_task, prefetch_task])

        # 4. Process results and log timing (both tasks are finished, so
        # these awaits return immediately — or re-raise task exceptions)
        summary_result, summary_time = await summary_task
        cached_map, prefetch_time = await prefetch_task

        if context.should_refuse:
            # Double check if summary triggered refusal
            return self._build_refusal_response(context, conversation_history, active_model, stats)

        time_diff = abs(summary_time - prefetch_time)
        if summary_time > prefetch_time:
            logger.info(f"Pipeline: Image Prefetch finished first ({prefetch_time:.2f}s). Summary took {summary_time:.2f}s. (Waited {time_diff:.2f}s for Summary)")
        else:
            logger.info(f"Pipeline: Summary finished first ({summary_time:.2f}s). Image Prefetch took {prefetch_time:.2f}s. (Waited {time_diff:.2f}s for Prefetch)")

        trace["summary"] = summary_result.trace
        usage_totals["input_tokens"] += summary_result.usage.get("input_tokens", 0)
        usage_totals["output_tokens"] += summary_result.usage.get("output_tokens", 0)

        summary_content = summary_result.data.get("content", "")

        # === Result Assembly ===
        stats["total_time"] = time.time() - start_time
        structured = self._parse_response(summary_content, context)

        # === Apply Cached Images ===
        # Update structured response using the map from parallel prefetch
        if cached_map:
            try:
                total_replaced = 0
                for ref in structured.get("references", []):
                    if ref.get("images"):
                        new_images = []
                        for img in ref["images"]:
                            # 1. Already Base64 -> Keep it
                            if img.startswith("data:"):
                                new_images.append(img)
                                continue

                            # 2. Check cache
                            cached_val = cached_map.get(img)
                            if cached_val and cached_val.startswith("data:"):
                                new_images.append(cached_val)
                                total_replaced += 1
                            # 3. Else -> DROP IT (as per policy)
                        ref["images"] = new_images
                logger.debug(f"Pipeline: Replaced {total_replaced} images with cached versions")
            except Exception as e:
                logger.warning(f"Pipeline: Applying cached images failed: {e}")

        # Debug: Log image counts
        total_ref_images = sum(len(ref.get("images", []) or []) for ref in structured.get("references", []))
        logger.info(f"Pipeline: Final structured response has {len(structured.get('references', []))} refs with {total_ref_images} images total")

        stages_used = self._build_stages_ui(trace, context, images)

        conversation_history.append({"role": "user", "content": user_input})
        conversation_history.append({"role": "assistant", "content": summary_content})

        return {
            "llm_response": summary_content,
            "structured_response": structured,
            "stats": stats,
            "model_used": active_model,
            "conversation_history": conversation_history,
            "trace_markdown": self._render_trace_markdown(trace),
            "billing_info": {
                "input_tokens": usage_totals["input_tokens"],
                "output_tokens": usage_totals["output_tokens"],
                "total_cost": 0.0
            },
            "stages_used": stages_used,
            "web_results": context.web_results,
            "trace": trace,

            "instruct_traces": trace.get("instruct_rounds", []),
        }

    except Exception as e:
        logger.error(f"Pipeline: Critical Error - {e}")
        import traceback
        logger.error(traceback.format_exc())
        return {
            "llm_response": f"Error: {e}",
            "stats": stats,
            "error": str(e)
        }
|
|
326
|
+
|
|
327
|
+
def _build_refusal_response(self, context, history, model, stats):
|
|
328
|
+
return {
|
|
329
|
+
"llm_response": "Refused",
|
|
330
|
+
"structured_response": {},
|
|
331
|
+
"stats": stats,
|
|
332
|
+
"model_used": model,
|
|
333
|
+
"conversation_history": history,
|
|
334
|
+
"refuse_answer": True,
|
|
335
|
+
"refuse_reason": context.refuse_reason
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
def _parse_response(self, text: str, context: StageContext) -> Dict[str, Any]:
|
|
339
|
+
"""Parse response and extract citations, prioritizing fetched items."""
|
|
340
|
+
import re
|
|
341
|
+
parsed = {"response": "", "references": [], "page_references": [], "image_references": []}
|
|
342
|
+
if not text: return parsed
|
|
343
|
+
|
|
344
|
+
# Simple cleanup
|
|
345
|
+
ref_pattern = re.compile(r'(?:\n\s*|^)\s*(?:#{1,3}|\*\*)\s*(?:References|Citations|Sources|参考资料)[\s\S]*$', re.IGNORECASE | re.MULTILINE)
|
|
346
|
+
body_text = ref_pattern.sub('', text)
|
|
347
|
+
|
|
348
|
+
# 1. Identify all cited numeric IDs from [N]
|
|
349
|
+
cited_ids = []
|
|
350
|
+
for m in re.finditer(r'\[(\d+)\]', body_text):
|
|
351
|
+
try:
|
|
352
|
+
cid = int(m.group(1))
|
|
353
|
+
if cid not in cited_ids: cited_ids.append(cid)
|
|
354
|
+
except: pass
|
|
355
|
+
|
|
356
|
+
# 2. Collect cited items and determine "is_fetched" status
|
|
357
|
+
cited_items = []
|
|
358
|
+
for cid in cited_ids:
|
|
359
|
+
item = next((r for r in context.web_results if r.get("_id") == cid), None)
|
|
360
|
+
if not item: continue
|
|
361
|
+
|
|
362
|
+
# Check if this URL was fetched (appears as a "page" result)
|
|
363
|
+
is_fetched = any(r.get("_type") == "page" and r.get("url") == item.get("url") for r in context.web_results)
|
|
364
|
+
cited_items.append({
|
|
365
|
+
"original_id": cid,
|
|
366
|
+
"item": item,
|
|
367
|
+
"is_fetched": is_fetched
|
|
368
|
+
})
|
|
369
|
+
|
|
370
|
+
# 3. Sort: Fetched pages first, then regular search results
|
|
371
|
+
cited_items.sort(key=lambda x: x["is_fetched"], reverse=True)
|
|
372
|
+
|
|
373
|
+
# 4. Create Re-indexing Map
|
|
374
|
+
reindex_map = {}
|
|
375
|
+
for i, entry in enumerate(cited_items):
|
|
376
|
+
reindex_map[entry["original_id"]] = i + 1
|
|
377
|
+
|
|
378
|
+
# Populate result references in sorted order
|
|
379
|
+
item = entry["item"]
|
|
380
|
+
ref_entry = {
|
|
381
|
+
"title": item.get("title", ""),
|
|
382
|
+
"url": item.get("url", ""),
|
|
383
|
+
"domain": item.get("domain", ""),
|
|
384
|
+
"snippet": (item.get("content", "") or "")[:200] + "...", # More snippet
|
|
385
|
+
"is_fetched": entry["is_fetched"],
|
|
386
|
+
"type": item.get("_type", "search"),
|
|
387
|
+
"raw_screenshot_b64": item.get("raw_screenshot_b64"), # Real page screenshot for Sources
|
|
388
|
+
"images": item.get("images"),
|
|
389
|
+
}
|
|
390
|
+
# Add to unified list (frontend can handle splitting if needed, but we provide sorted order)
|
|
391
|
+
parsed["references"].append(ref_entry)
|
|
392
|
+
|
|
393
|
+
# 5. Replace [N] in text with new indices
|
|
394
|
+
def repl(m):
|
|
395
|
+
try:
|
|
396
|
+
oid = int(m.group(1))
|
|
397
|
+
return f"[{reindex_map[oid]}]" if oid in reindex_map else m.group(0)
|
|
398
|
+
except: return m.group(0)
|
|
399
|
+
|
|
400
|
+
parsed["response"] = re.sub(r'\[(\d+)\]', repl, body_text).strip()
|
|
401
|
+
return parsed
|
|
402
|
+
|
|
403
|
+
def _build_stages_ui(self, trace: Dict[str, Any], context: "StageContext", images: List[str]) -> List[Dict[str, Any]]:
    """Build the per-stage descriptor list consumed by the UI renderer.

    Args:
        trace: Pipeline trace with optional "instruct_rounds" and "summary".
        context: Stage context holding ``web_results`` and ``search_time``.
        images: User-supplied images (unused here; kept for interface
            stability).

    Returns:
        Ordered stage dicts: Search (if any results), each Instruct round,
        then Summary.
    """
    stages = []

    # 1. Search results: dedupe by URL and flag entries that were also
    # fetched as full pages.
    search_refs = []
    seen = set()
    for r in context.web_results:
        url = r.get("url")
        # Robustness fix: previously `seen.add(r["url"])` raised KeyError
        # for search items without a "url" key; such items are now skipped.
        if r.get("_type") == "search" and url and url not in seen:
            seen.add(url)
            is_fetched = any(p.get("url") == url for p in context.web_results if p.get("_type") == "page")
            search_refs.append({
                "title": r.get("title", ""),
                "url": url,
                "snippet": (r.get("content", "") or "")[:100] + "...",
                "is_fetched": is_fetched
            })

    # Sort: Fetched first
    search_refs.sort(key=lambda x: x["is_fetched"], reverse=True)

    logger.debug(f"_build_stages_ui: Found {len(search_refs)} search refs from {len(context.web_results)} web_results")

    if search_refs:
        stages.append({
            "name": "Search",
            "model": "Web Search",
            "icon_config": "openai",
            "provider": "Web",
            "references": search_refs,
            "description": f"Found {len(search_refs)} results.",
            "time": getattr(context, 'search_time', 0)
        })

    # 2. Instruct rounds recorded in the trace.
    for i, t in enumerate(trace.get("instruct_rounds", [])):
        stage_name = t.get("stage_name", f"Analysis {i+1}")
        tool_count = t.get("tool_calls", 0)
        desc = t.get("output", "")

        if tool_count > 0:
            # If tools were used, prefer showing tool info even if there's reasoning
            desc = f"Executed {tool_count} tool calls."
        elif not desc:
            desc = "Processing..."

        # Cost from per-million-token prices in the instruct model config.
        usage = t.get("usage", {})
        instruct_cfg = self.config.get_model_config("instruct")
        input_price = instruct_cfg.input_price or 0
        output_price = instruct_cfg.output_price or 0
        cost = (usage.get("input_tokens", 0) * input_price + usage.get("output_tokens", 0) * output_price) / 1_000_000

        stages.append({
            "name": stage_name,
            "model": t.get("model"),
            "icon_config": "google",
            "provider": "Instruct",
            "time": t.get("time", 0),
            "description": desc,
            "usage": usage,
            "cost": cost
        })

    # 3. Summary stage, priced with the "main" model config.
    if trace.get("summary"):
        s = trace["summary"]
        usage = s.get("usage", {})
        main_cfg = self.config.get_model_config("main")
        input_price = main_cfg.input_price or 0
        output_price = main_cfg.output_price or 0
        cost = (usage.get("input_tokens", 0) * input_price + usage.get("output_tokens", 0) * output_price) / 1_000_000

        stages.append({
            "name": "Summary",
            "model": s.get("model"),
            "icon_config": "google",
            "provider": "Summary",
            "time": s.get("time", 0),
            "description": "Generated final answer.",
            "usage": usage,
            "cost": cost
        })

    return stages
|
|
487
|
+
|
|
488
|
+
def _render_trace_markdown(self, trace: Dict[str, Any]) -> str:
|
|
489
|
+
parts = ["# Pipeline Trace\n"]
|
|
490
|
+
if trace.get("instruct_rounds"):
|
|
491
|
+
parts.append(f"## Instruct ({len(trace['instruct_rounds'])} rounds)\n")
|
|
492
|
+
for i, r in enumerate(trace["instruct_rounds"]):
|
|
493
|
+
name = r.get("stage_name", f"Round {i+1}")
|
|
494
|
+
parts.append(f"### {name}\n" + str(r))
|
|
495
|
+
if trace.get("summary"):
|
|
496
|
+
parts.append("## Summary\n" + str(trace["summary"]))
|
|
497
|
+
return "\n".join(parts)
|
|
498
|
+
|
|
499
|
+
async def close(self):
    """Shut down the underlying search service, swallowing shutdown errors.

    Fix: the previous bare ``except:`` also swallowed KeyboardInterrupt and
    SystemExit; it is narrowed to ``Exception`` so interrupts propagate.
    """
    try:
        await self.search_service.close()
    except Exception:
        # Best-effort cleanup: shutdown failures are non-fatal.
        pass
|
hyw_core/search.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import urllib.parse
|
|
3
|
+
import re
|
|
4
|
+
import time
|
|
5
|
+
from typing import List, Dict, Any, Optional
|
|
6
|
+
from loguru import logger
|
|
7
|
+
|
|
8
|
+
from .browser_control.service import get_screenshot_service
|
|
9
|
+
# Search engines from browser_control subpackage
|
|
10
|
+
from .browser_control.engines.duckduckgo import DuckDuckGoEngine
|
|
11
|
+
from .browser_control.engines.default import DefaultEngine
|
|
12
|
+
|
|
13
|
+
class SearchService:
    """Facade over the browser-based web search and page-fetch services."""

    def __init__(self, config: Any):
        """Read optional settings off *config* and select a search engine."""
        self.config = config
        self._headless = getattr(config, "headless", True)
        self._fetch_timeout = getattr(config, "fetch_timeout", 20.0)
        self._default_limit = getattr(config, "search_limit", 10)

        # Domains to exclude from every query via -site: operators.
        self._blocked_domains = getattr(config, "blocked_domains", []) or []

        # Engine selection: "default_address_bar" opts into the address-bar
        # engine; anything else (including unset) falls back to DuckDuckGo.
        requested = getattr(config, "search_engine", None)
        self._engine_name = requested.lower() if requested else requested

        if self._engine_name == "default_address_bar":
            self._engine = DefaultEngine()
        else:
            self._engine = DuckDuckGoEngine()
            self._engine_name = "duckduckgo"

        logger.info(f"SearchService initialized with engine: {self._engine_name}")
|
|
37
|
+
|
|
38
|
+
def _build_search_url(self, query: str) -> str:
|
|
39
|
+
return self._engine.build_url(query, self._default_limit)
|
|
40
|
+
|
|
41
|
+
async def search_batch(self, queries: List[str]) -> List[List[Dict[str, Any]]]:
    """Run one search() per query concurrently; results keep the input order."""
    logger.info(f"SearchService: Batch searching {len(queries)} queries in parallel...")
    pending = [self.search(query) for query in queries]
    return await asyncio.gather(*pending)
|
|
46
|
+
|
|
47
|
+
async def search(self, query: str) -> List[Dict[str, Any]]:
    """
    Main search entry point.
    Returns parsed search results, preceded by a hidden raw-page debug item.

    Args:
        query: User query text; empty queries short-circuit to [].

    Returns:
        List of result dicts. On failure a single hidden error item is
        returned (fix: the error item now uses "_type" like the success
        path; it previously used "type", so downstream "_type" checks
        missed it).
    """
    if not query:
        return []

    # Apply domain blocking via -site: exclusions, unless the caller
    # already included their own exclusions.
    final_query = query
    if self._blocked_domains and "-site:" not in query:
        exclusions = " ".join([f"-site:{d}" for d in self._blocked_domains])
        final_query = f"{query} {exclusions}"

    url = self._build_search_url(final_query)

    results = []
    try:
        # Check if this is an address bar search (DefaultEngine)
        if url.startswith("__ADDRESS_BAR_SEARCH__:"):
            # Extract query from marker
            search_query = url.replace("__ADDRESS_BAR_SEARCH__:", "")
            logger.info(f"Search: '{query}' -> [Address Bar Search]")

            # Use address bar input method
            service = get_screenshot_service(headless=self._headless)
            page_data = await service.search_via_address_bar(search_query)
        else:
            logger.info(f"Search: '{query}' -> {url}")
            # Standard URL navigation
            page_data = await self.fetch_page_raw(url, include_screenshot=False)

        content = page_data.get("html", "") or page_data.get("content", "")

        # Debug: Log content length
        logger.debug(f"Search: Raw content length = {len(content)} chars")
        if len(content) < 500:
            logger.warning(f"Search: Content too short, may be empty/blocked. First 500 chars: {content[:500]}")

        # Parse Results (skip raw page - only return parsed results)
        if content and not content.startswith("Error"):
            parsed = self._engine.parse(content)

            # Debug: Log parse result
            logger.info(f"Search: Engine {self._engine_name} parsed {len(parsed)} results from {len(content)} chars")

            # JAVASCRIPT IMAGE INJECTION
            # Inject base64 images from JS extraction if available; this
            # provides a robust fallback if HTTP image URLs fail to load.
            js_images = page_data.get("images", [])
            if js_images:
                logger.info(f"Search: Injecting {len(js_images)} base64 images into top results")
                for i, img_b64 in enumerate(js_images):
                    if i < len(parsed):
                        b64_src = f"data:image/jpeg;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
                        if "images" not in parsed[i]: parsed[i]["images"] = []
                        # Prepend to prioritize base64 (guaranteed render) over HTTP URLs
                        parsed[i]["images"].insert(0, b64_src)

            logger.info(f"Search parsed {len(parsed)} results for '{query}' using {self._engine_name}")

            # ALWAYS add raw search page as hidden item for debug saving
            # (even when 0 results, so we can debug the parser)
            results.append({
                "title": f"[DEBUG] Raw Search: {query}",
                "url": url,
                "content": content[:50000],  # Limit to 50KB
                "_type": "search_raw_page",
                "_hidden": True,  # Don't show to LLM
            })

            results.extend(parsed)
        else:
            logger.warning(f"Search failed/empty for '{query}': {content[:100]}")

        return results

    except Exception as e:
        logger.error(f"Search error for '{query}': {e}")
        # Ensure we return at least an error item
        return [{
            "title": f"Error Search: {query}",
            "url": url,
            "content": f"Error: {e}",
            "_type": "search_raw_page",  # FIX: was "type", inconsistent with success path
            "_hidden": True
        }]
|
|
134
|
+
|
|
135
|
+
async def fetch_pages_batch(self, urls: List[str], include_screenshot: bool = True) -> List[Dict[str, Any]]:
    """Fetch every URL concurrently; results come back in the order of *urls*."""
    return await asyncio.gather(
        *(self.fetch_page(u, include_screenshot=include_screenshot) for u in urls)
    )
|
|
139
|
+
|
|
140
|
+
async def fetch_page(self, url: str, timeout: Optional[float] = None, include_screenshot: bool = True) -> Dict[str, Any]:
    """
    Fetch a single page for reading/extracting content.

    Falls back to the service's configured fetch timeout when *timeout*
    is not given.
    """
    effective_timeout = self._fetch_timeout if timeout is None else timeout
    return await self.fetch_page_raw(url, effective_timeout, include_screenshot=include_screenshot)
|
|
147
|
+
|
|
148
|
+
async def fetch_page_raw(self, url: str, timeout: Optional[float] = None, include_screenshot: bool = True) -> Dict[str, Any]:
    """Internal: delegate raw page retrieval to the shared browser service."""
    effective_timeout = self._fetch_timeout if timeout is None else timeout
    browser = get_screenshot_service(headless=self._headless)
    return await browser.fetch_page(url, timeout=effective_timeout, include_screenshot=include_screenshot)
|
|
154
|
+
|
|
155
|
+
async def screenshot_url(self, url: str, full_page: bool = True) -> Optional[str]:
    """
    Capture a screenshot of a URL.
    Delegates to the shared screenshot service.
    """
    browser = get_screenshot_service(headless=self._headless)
    return await browser.screenshot_url(url, full_page=full_page)
|
|
162
|
+
|
|
163
|
+
async def screenshot_with_content(self, url: str, max_content_length: int = 8000) -> Dict[str, Any]:
    """
    Capture a screenshot and extract page content.

    Returns:
        Dict with screenshot_b64, content (truncated), title, url
    """
    browser = get_screenshot_service(headless=self._headless)
    return await browser.screenshot_with_content(url, max_content_length=max_content_length)
|