entari-plugin-hyw 0.3.5__py3-none-any.whl → 4.0.0rc14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic.

Files changed (78)
  1. entari_plugin_hyw/Untitled-1 +1865 -0
  2. entari_plugin_hyw/__init__.py +979 -116
  3. entari_plugin_hyw/filters.py +83 -0
  4. entari_plugin_hyw/history.py +251 -0
  5. entari_plugin_hyw/misc.py +214 -0
  6. entari_plugin_hyw/search_cache.py +154 -0
  7. entari_plugin_hyw-4.0.0rc14.dist-info/METADATA +118 -0
  8. entari_plugin_hyw-4.0.0rc14.dist-info/RECORD +72 -0
  9. {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/WHEEL +1 -1
  10. {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/top_level.txt +1 -0
  11. hyw_core/__init__.py +94 -0
  12. hyw_core/agent.py +768 -0
  13. hyw_core/browser_control/__init__.py +63 -0
  14. hyw_core/browser_control/assets/card-dist/index.html +425 -0
  15. hyw_core/browser_control/assets/card-dist/logos/anthropic.svg +1 -0
  16. hyw_core/browser_control/assets/card-dist/logos/cerebras.svg +9 -0
  17. hyw_core/browser_control/assets/card-dist/logos/deepseek.png +0 -0
  18. hyw_core/browser_control/assets/card-dist/logos/gemini.svg +1 -0
  19. hyw_core/browser_control/assets/card-dist/logos/google.svg +1 -0
  20. hyw_core/browser_control/assets/card-dist/logos/grok.png +0 -0
  21. hyw_core/browser_control/assets/card-dist/logos/huggingface.png +0 -0
  22. hyw_core/browser_control/assets/card-dist/logos/microsoft.svg +15 -0
  23. hyw_core/browser_control/assets/card-dist/logos/minimax.png +0 -0
  24. hyw_core/browser_control/assets/card-dist/logos/mistral.png +0 -0
  25. hyw_core/browser_control/assets/card-dist/logos/nvida.png +0 -0
  26. hyw_core/browser_control/assets/card-dist/logos/openai.svg +1 -0
  27. hyw_core/browser_control/assets/card-dist/logos/openrouter.png +0 -0
  28. hyw_core/browser_control/assets/card-dist/logos/perplexity.svg +24 -0
  29. hyw_core/browser_control/assets/card-dist/logos/qwen.png +0 -0
  30. hyw_core/browser_control/assets/card-dist/logos/xai.png +0 -0
  31. hyw_core/browser_control/assets/card-dist/logos/xiaomi.png +0 -0
  32. hyw_core/browser_control/assets/card-dist/logos/zai.png +0 -0
  33. hyw_core/browser_control/assets/card-dist/vite.svg +1 -0
  34. hyw_core/browser_control/assets/index.html +5691 -0
  35. hyw_core/browser_control/assets/logos/anthropic.svg +1 -0
  36. hyw_core/browser_control/assets/logos/cerebras.svg +9 -0
  37. hyw_core/browser_control/assets/logos/deepseek.png +0 -0
  38. hyw_core/browser_control/assets/logos/gemini.svg +1 -0
  39. hyw_core/browser_control/assets/logos/google.svg +1 -0
  40. hyw_core/browser_control/assets/logos/grok.png +0 -0
  41. hyw_core/browser_control/assets/logos/huggingface.png +0 -0
  42. hyw_core/browser_control/assets/logos/microsoft.svg +15 -0
  43. hyw_core/browser_control/assets/logos/minimax.png +0 -0
  44. hyw_core/browser_control/assets/logos/mistral.png +0 -0
  45. hyw_core/browser_control/assets/logos/nvida.png +0 -0
  46. hyw_core/browser_control/assets/logos/openai.svg +1 -0
  47. hyw_core/browser_control/assets/logos/openrouter.png +0 -0
  48. hyw_core/browser_control/assets/logos/perplexity.svg +24 -0
  49. hyw_core/browser_control/assets/logos/qwen.png +0 -0
  50. hyw_core/browser_control/assets/logos/xai.png +0 -0
  51. hyw_core/browser_control/assets/logos/xiaomi.png +0 -0
  52. hyw_core/browser_control/assets/logos/zai.png +0 -0
  53. hyw_core/browser_control/engines/__init__.py +15 -0
  54. hyw_core/browser_control/engines/base.py +13 -0
  55. hyw_core/browser_control/engines/default.py +166 -0
  56. hyw_core/browser_control/engines/duckduckgo.py +171 -0
  57. hyw_core/browser_control/landing.html +172 -0
  58. hyw_core/browser_control/manager.py +173 -0
  59. hyw_core/browser_control/renderer.py +446 -0
  60. hyw_core/browser_control/service.py +940 -0
  61. hyw_core/config.py +154 -0
  62. hyw_core/core.py +462 -0
  63. hyw_core/crawling/__init__.py +18 -0
  64. hyw_core/crawling/completeness.py +437 -0
  65. hyw_core/crawling/models.py +88 -0
  66. hyw_core/definitions.py +104 -0
  67. hyw_core/image_cache.py +274 -0
  68. hyw_core/pipeline.py +502 -0
  69. hyw_core/search.py +171 -0
  70. hyw_core/stages/__init__.py +21 -0
  71. hyw_core/stages/base.py +95 -0
  72. hyw_core/stages/summary.py +191 -0
  73. entari_plugin_hyw/agent.py +0 -419
  74. entari_plugin_hyw/compressor.py +0 -59
  75. entari_plugin_hyw/tools.py +0 -236
  76. entari_plugin_hyw/vision.py +0 -35
  77. entari_plugin_hyw-0.3.5.dist-info/METADATA +0 -112
  78. entari_plugin_hyw-0.3.5.dist-info/RECORD +0 -9
hyw_core/pipeline.py ADDED
@@ -0,0 +1,502 @@
"""
Modular Pipeline Dispatcher

New pipeline architecture: Instruct Loop (x2) -> Summary.
Simpler flow with self-correction/feedback loop.
"""

import asyncio
import time
import re
from typing import Any, Dict, List, Optional, Callable, Awaitable

from loguru import logger
from openai import AsyncOpenAI

from .stages.base import StageContext, StageResult, BaseStage
from .stages.summary import SummaryStage
from .search import SearchService


class ModularPipeline:
    """
    Modular Pipeline.

    Flow:
    1. Input Analysis:
       - If Images -> Skip Search -> Summary
       - If Text -> Execute Search (or URL fetch) -> Summary
    2. Summary: Generate final response.
    """

    def __init__(self, config: Any, search_service: SearchService, send_func: Optional[Callable[[str], Awaitable[None]]] = None):
        self.config = config
        self.send_func = send_func
        self.search_service = search_service
        self.client = AsyncOpenAI(base_url=config.base_url, api_key=config.api_key)

        # Initialize stages
        self.summary_stage = SummaryStage(config, self.search_service, self.client)

    @property
    def _send_func(self) -> Optional[Callable[[str], Awaitable[None]]]:
        """Getter for _send_func (alias for send_func)."""
        return self.send_func

    @_send_func.setter
    def _send_func(self, value: Optional[Callable[[str], Awaitable[None]]]):
        """Setter for _send_func - updates send_func."""
        self.send_func = value

    async def execute(
        self,
        user_input: str,
        conversation_history: List[Dict],
        model_name: Optional[str] = None,
        images: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Execute the modular pipeline."""
        start_time = time.time()
        stats = {"start_time": start_time}
        usage_totals = {"input_tokens": 0, "output_tokens": 0}
        active_model = model_name or self.config.model_name
        if not active_model:
            # Fallback to instruct model for logging/context
            active_model = self.config.get_model_config("instruct").model_name

        context = StageContext(
            user_input=user_input,
            images=images or [],
            conversation_history=conversation_history,
        )

        # Determine if model supports image input
        model_cfg_dict = next((m for m in self.config.models if m.get("name") == active_model), None)
        if model_cfg_dict:
            context.image_input_supported = model_cfg_dict.get("image_input", True)
        else:
            context.image_input_supported = True  # Default to True if unknown

        logger.info(f"Pipeline Execution: Model '{active_model}' Image Input Supported: {context.image_input_supported}")

        trace: Dict[str, Any] = {
            "instruct_rounds": [],
            "summary": None,
        }

        try:
            logger.info(f"Pipeline: Processing '{user_input[:30]}...'")

            # === Image-First Logic ===
            # When user provides images, skip search and go directly to Instruct
            # Images will be passed through to both Instruct and Summary stages
            has_user_images = bool(images)
            if has_user_images:
                logger.info(f"Pipeline: {len(images)} user image(s) detected. Skipping search -> Instruct.")

            # === Search-First Logic (only when no images) ===
            # 1. URL Detection
            # Updated to capture full URLs including queries and paths
            url_pattern = re.compile(r'https?://(?:[-\w./?=&%#]+)')
            found_urls = url_pattern.findall(user_input)

            hit_content = False

            # Skip URL fetch and search if user provided images or long query
            is_long_query = len(user_input) > 20
            if has_user_images:
                hit_content = False  # Force into Instruct path
            elif is_long_query:
                logger.info(f"Pipeline: Long query ({len(user_input)} chars). Skipping direct search/fetch -> Instruct.")
                hit_content = False
            elif found_urls:
                logger.info(f"Pipeline: Detected {len(found_urls)} URLs. Executing direct fetch...")
                # Fetch pages (borrowing logic from InstructStage's batch fetch would be ideal,
                # but we'll use search_service directly and simulate what Instruct did for context)

                # Fetch
                fetch_results = await self.search_service.fetch_pages_batch(found_urls)

                # Pre-render screenshots if needed (similar to InstructStage logic)
                # For brevity/cleanliness, assuming fetch_pages_batch returns what we need or we process it.
                # Ideally we want screenshots for the UI. The service.fetch_page usually returns raw data.
                # We need to render them if we want screenshots.
                # To keep it simple for this file, we'll skip complex screenshot rendering here OR
                # we rely on the summary stage to just use the text.
                # But the user logic implies "Search/Fetch Hit -> Summary".

                # Let's populate context.web_results
                for i, page_data in enumerate(fetch_results):
                    if page_data.get("content"):
                        hit_content = True
                        context.web_results.append({
                            "_id": context.next_id(),
                            "_type": "page",
                            "title": page_data.get("title", "Page"),
                            "url": page_data.get("url", found_urls[i]),
                            "content": page_data.get("content", ""),
                            "images": page_data.get("images", []),
                            # For now, no screenshot unless we call renderer.
                            # If critical, we can add it later.
                        })

            # 2. Search (if no URLs or just always try search if simple query?)
            # The prompt says: "judging result quantity > 0".
            if not hit_content and not has_user_images and not is_long_query and user_input.strip():
                logger.info("Pipeline: No URLs found or fetched. Executing direct search...")
                search_start = time.time()
                search_results = await self.search_service.search(user_input)
                context.search_time = time.time() - search_start

                # Filter out the raw debug page
                valid_results = [r for r in search_results if not r.get("_hidden")]

                if valid_results:
                    logger.info(f"Pipeline: Search found {len(valid_results)} results in {context.search_time:.2f}s. Proceeding to Summary.")
                    hit_content = True
                    for item in search_results:  # Add all, including hidden debug ones if needed by history
                        item["_id"] = context.next_id()
                        if "_type" not in item: item["_type"] = "search"
                        item["query"] = user_input
                        context.web_results.append(item)
                else:
                    logger.info("Pipeline: Search yielded 0 results.")

            # === Branching ===
            if hit_content and not has_user_images:
                # -> Summary Stage (search/URL results available)
                logger.info("Pipeline: Content found (URL/Search). Proceeding to Summary.")

            # If no content was found and no images, we still proceed to Summary but with empty context (Direct Chat)
            # If images, we proceed to Summary with images.

            # Refusal check from search results? (Unlikely, but good to keep in mind)
            pass

            # === Parallel Execution: Summary Generation + Image Prefetching ===
            # We run image prefetching concurrently with Summary generation to save time.

            # 1. Prepare candidates for prefetch (all images in search results)
            all_candidate_urls = set()
            for r in context.web_results:
                # Add images from search results/pages
                if r.get("images"):
                    for img in r["images"]:
                        if img and isinstance(img, str) and img.startswith("http"):
                            all_candidate_urls.add(img)

            prefetch_list = list(all_candidate_urls)
            logger.info(f"Pipeline: Starting parallel execution (Summary + Prefetch {len(prefetch_list)} images)")

            # 2. Define parallel tasks with timing
            async def timed_summary():
                t0 = time.time()
                # Collect page screenshots if image mode
                summary_input_images = list(images) if images else []
                if context.image_input_supported:
                    # Collect pre-rendered screenshots from web_results
                    for r in context.web_results:
                        if r.get("_type") == "page" and r.get("screenshot_b64"):
                            summary_input_images.append(r["screenshot_b64"])

                if context.should_refuse:
                    return StageResult(success=True, data={"content": "Refused"}, usage={}, trace={}), 0.0

                res = await self.summary_stage.execute(
                    context,
                    images=summary_input_images if summary_input_images else None
                )
                duration = time.time() - t0
                return res, duration

            async def timed_prefetch():
                t0 = time.time()
                if not prefetch_list:
                    return {}, 0.0
                try:
                    from .image_cache import get_image_cache
                    cache = get_image_cache()
                    # Start prefetch (non-blocking kickoff)
                    cache.start_prefetch(prefetch_list)
                    # Wait for results (blocking until done)
                    res = await cache.get_all_cached(prefetch_list)
                    duration = time.time() - t0
                    return res, duration
                except Exception as e:
                    logger.warning(f"Pipeline: Prefetch failed: {e}")
                    return {}, time.time() - t0

            # 3. Execute concurrently
            summary_task = asyncio.create_task(timed_summary())
            prefetch_task = asyncio.create_task(timed_prefetch())

            # Wait for both to complete
            await asyncio.wait([summary_task, prefetch_task])

            # 4. Process results and log timing
            summary_result, summary_time = await summary_task
            cached_map, prefetch_time = await prefetch_task

            if context.should_refuse:
                # Double check if summary triggered refusal
                return self._build_refusal_response(context, conversation_history, active_model, stats)

            time_diff = abs(summary_time - prefetch_time)
            if summary_time > prefetch_time:
                logger.info(f"Pipeline: Image Prefetch finished first ({prefetch_time:.2f}s). Summary took {summary_time:.2f}s. (Waited {time_diff:.2f}s for Summary)")
            else:
                logger.info(f"Pipeline: Summary finished first ({summary_time:.2f}s). Image Prefetch took {prefetch_time:.2f}s. (Waited {time_diff:.2f}s for Prefetch)")

            trace["summary"] = summary_result.trace
            usage_totals["input_tokens"] += summary_result.usage.get("input_tokens", 0)
            usage_totals["output_tokens"] += summary_result.usage.get("output_tokens", 0)

            summary_content = summary_result.data.get("content", "")

            # === Result Assembly ===
            stats["total_time"] = time.time() - start_time
            structured = self._parse_response(summary_content, context)

            # === Apply Cached Images ===
            # Update structured response using the map from parallel prefetch
            if cached_map:
                try:
                    total_replaced = 0
                    for ref in structured.get("references", []):
                        if ref.get("images"):
                            new_images = []
                            for img in ref["images"]:
                                # 1. Already Base64 -> Keep it
                                if img.startswith("data:"):
                                    new_images.append(img)
                                    continue

                                # 2. Check cache
                                cached_val = cached_map.get(img)
                                if cached_val and cached_val.startswith("data:"):
                                    new_images.append(cached_val)
                                    total_replaced += 1
                                # 3. Else -> DROP IT (as per policy)
                            ref["images"] = new_images
                    logger.debug(f"Pipeline: Replaced {total_replaced} images with cached versions")
                except Exception as e:
                    logger.warning(f"Pipeline: Applying cached images failed: {e}")

            # Debug: Log image counts
            total_ref_images = sum(len(ref.get("images", []) or []) for ref in structured.get("references", []))
            logger.info(f"Pipeline: Final structured response has {len(structured.get('references', []))} refs with {total_ref_images} images total")

            stages_used = self._build_stages_ui(trace, context, images)

            conversation_history.append({"role": "user", "content": user_input})
            conversation_history.append({"role": "assistant", "content": summary_content})

            return {
                "llm_response": summary_content,
                "structured_response": structured,
                "stats": stats,
                "model_used": active_model,
                "conversation_history": conversation_history,
                "trace_markdown": self._render_trace_markdown(trace),
                "billing_info": {
                    "input_tokens": usage_totals["input_tokens"],
                    "output_tokens": usage_totals["output_tokens"],
                    "total_cost": 0.0
                },
                "stages_used": stages_used,
                "web_results": context.web_results,
                "trace": trace,
                "instruct_traces": trace.get("instruct_rounds", []),
            }

        except Exception as e:
            logger.error(f"Pipeline: Critical Error - {e}")
            import traceback
            logger.error(traceback.format_exc())
            return {
                "llm_response": f"Error: {e}",
                "stats": stats,
                "error": str(e)
            }

    def _build_refusal_response(self, context, history, model, stats):
        return {
            "llm_response": "Refused",
            "structured_response": {},
            "stats": stats,
            "model_used": model,
            "conversation_history": history,
            "refuse_answer": True,
            "refuse_reason": context.refuse_reason
        }

    def _parse_response(self, text: str, context: StageContext) -> Dict[str, Any]:
        """Parse response and extract citations, prioritizing fetched items."""
        parsed = {"response": "", "references": [], "page_references": [], "image_references": []}
        if not text: return parsed

        # Simple cleanup
        ref_pattern = re.compile(r'(?:\n\s*|^)\s*(?:#{1,3}|\*\*)\s*(?:References|Citations|Sources|参考资料)[\s\S]*$', re.IGNORECASE | re.MULTILINE)
        body_text = ref_pattern.sub('', text)

        # 1. Identify all cited numeric IDs from [N]
        cited_ids = []
        for m in re.finditer(r'\[(\d+)\]', body_text):
            try:
                cid = int(m.group(1))
                if cid not in cited_ids: cited_ids.append(cid)
            except ValueError:
                pass

        # 2. Collect cited items and determine "is_fetched" status
        cited_items = []
        for cid in cited_ids:
            item = next((r for r in context.web_results if r.get("_id") == cid), None)
            if not item: continue

            # Check if this URL was fetched (appears as a "page" result)
            is_fetched = any(r.get("_type") == "page" and r.get("url") == item.get("url") for r in context.web_results)
            cited_items.append({
                "original_id": cid,
                "item": item,
                "is_fetched": is_fetched
            })

        # 3. Sort: Fetched pages first, then regular search results
        cited_items.sort(key=lambda x: x["is_fetched"], reverse=True)

        # 4. Create Re-indexing Map
        reindex_map = {}
        for i, entry in enumerate(cited_items):
            reindex_map[entry["original_id"]] = i + 1

            # Populate result references in sorted order
            item = entry["item"]
            ref_entry = {
                "title": item.get("title", ""),
                "url": item.get("url", ""),
                "domain": item.get("domain", ""),
                "snippet": (item.get("content", "") or "")[:200] + "...",  # More snippet
                "is_fetched": entry["is_fetched"],
                "type": item.get("_type", "search"),
                "raw_screenshot_b64": item.get("raw_screenshot_b64"),  # Real page screenshot for Sources
                "images": item.get("images"),
            }
            # Add to unified list (frontend can handle splitting if needed, but we provide sorted order)
            parsed["references"].append(ref_entry)

        # 5. Replace [N] in text with new indices
        def repl(m):
            try:
                oid = int(m.group(1))
                return f"[{reindex_map[oid]}]" if oid in reindex_map else m.group(0)
            except ValueError:
                return m.group(0)

        parsed["response"] = re.sub(r'\[(\d+)\]', repl, body_text).strip()
        return parsed

    def _build_stages_ui(self, trace: Dict[str, Any], context: StageContext, images: List[str]) -> List[Dict[str, Any]]:
        stages = []

        # 1. Search Results
        search_refs = []
        seen = set()
        for r in context.web_results:
            if r.get("_type") == "search" and r.get("url") not in seen:
                seen.add(r["url"])
                is_fetched = any(p.get("url") == r["url"] for p in context.web_results if p.get("_type") == "page")
                search_refs.append({
                    "title": r.get("title", ""),
                    "url": r["url"],
                    "snippet": (r.get("content", "") or "")[:100] + "...",
                    "is_fetched": is_fetched
                })

        # Sort: Fetched first
        search_refs.sort(key=lambda x: x["is_fetched"], reverse=True)

        logger.debug(f"_build_stages_ui: Found {len(search_refs)} search refs from {len(context.web_results)} web_results")

        if search_refs:
            stages.append({
                "name": "Search",
                "model": "Web Search",
                "icon_config": "openai",
                "provider": "Web",
                "references": search_refs,
                "description": f"Found {len(search_refs)} results.",
                "time": getattr(context, 'search_time', 0)
            })

        # 2. Instruct Rounds
        for i, t in enumerate(trace.get("instruct_rounds", [])):
            stage_name = t.get("stage_name", f"Analysis {i+1}")
            tool_count = t.get("tool_calls", 0)
            desc = t.get("output", "")

            if tool_count > 0:
                # If tools were used, prefer showing tool info even if there's reasoning
                desc = f"Executed {tool_count} tool calls."
            elif not desc:
                desc = "Processing..."

            # Calculate cost from config prices
            usage = t.get("usage", {})
            instruct_cfg = self.config.get_model_config("instruct")
            input_price = instruct_cfg.input_price or 0
            output_price = instruct_cfg.output_price or 0
            cost = (usage.get("input_tokens", 0) * input_price + usage.get("output_tokens", 0) * output_price) / 1_000_000

            stages.append({
                "name": stage_name,
                "model": t.get("model"),
                "icon_config": "google",
                "provider": "Instruct",
                "time": t.get("time", 0),
                "description": desc,
                "usage": usage,
                "cost": cost
            })

        # 3. Summary
        if trace.get("summary"):
            s = trace["summary"]
            usage = s.get("usage", {})
            main_cfg = self.config.get_model_config("main")
            input_price = main_cfg.input_price or 0
            output_price = main_cfg.output_price or 0
            cost = (usage.get("input_tokens", 0) * input_price + usage.get("output_tokens", 0) * output_price) / 1_000_000

            stages.append({
                "name": "Summary",
                "model": s.get("model"),
                "icon_config": "google",
                "provider": "Summary",
                "time": s.get("time", 0),
                "description": "Generated final answer.",
                "usage": usage,
                "cost": cost
            })

        return stages

    def _render_trace_markdown(self, trace: Dict[str, Any]) -> str:
        parts = ["# Pipeline Trace\n"]
        if trace.get("instruct_rounds"):
            parts.append(f"## Instruct ({len(trace['instruct_rounds'])} rounds)\n")
            for i, r in enumerate(trace["instruct_rounds"]):
                name = r.get("stage_name", f"Round {i+1}")
                parts.append(f"### {name}\n" + str(r))
        if trace.get("summary"):
            parts.append("## Summary\n" + str(trace["summary"]))
        return "\n".join(parts)

    async def close(self):
        try:
            await self.search_service.close()
        except: pass
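
For orientation, a minimal usage sketch of ModularPipeline as defined above. This is not the plugin's actual wiring (which lives in entari_plugin_hyw/__init__.py); the config object is a stand-in that only provides the attributes this file reads (base_url, api_key, model_name, models, get_model_config), and SummaryStage/StageContext from hyw_core.stages (not shown in this diff) may expect more. The model names and prices are placeholders.

# Hypothetical wiring; config field values below are assumptions, not the plugin's schema.
import asyncio
from types import SimpleNamespace

from hyw_core.pipeline import ModularPipeline
from hyw_core.search import SearchService

async def main():
    config = SimpleNamespace(
        base_url="https://api.openai.com/v1",   # assumed OpenAI-compatible endpoint
        api_key="sk-...",
        model_name="gpt-4o-mini",               # placeholder model name
        models=[{"name": "gpt-4o-mini", "image_input": True}],
        # used by execute() fallback and _build_stages_ui cost math
        get_model_config=lambda role: SimpleNamespace(
            model_name="gpt-4o-mini", input_price=0, output_price=0
        ),
    )
    search = SearchService(config)
    pipeline = ModularPipeline(config, search)
    try:
        result = await pipeline.execute("latest playwright release", conversation_history=[])
        print(result["llm_response"])
        print(result["stats"], result["billing_info"])
    finally:
        await pipeline.close()

asyncio.run(main())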
hyw_core/search.py ADDED
@@ -0,0 +1,171 @@
import asyncio
import urllib.parse
import re
import time
from typing import List, Dict, Any, Optional
from loguru import logger

from .browser_control.service import get_screenshot_service
# Search engines from browser_control subpackage
from .browser_control.engines.duckduckgo import DuckDuckGoEngine
from .browser_control.engines.default import DefaultEngine

class SearchService:
    def __init__(self, config: Any):
        self.config = config
        self._headless = getattr(config, "headless", True)
        self._fetch_timeout = getattr(config, "fetch_timeout", 20.0)
        self._default_limit = getattr(config, "search_limit", 10)

        # Domain blocking
        self._blocked_domains = getattr(config, "blocked_domains", []) or []

        # Select Engine - DuckDuckGo is the default
        self._engine_name = getattr(config, "search_engine", None)
        if self._engine_name:
            self._engine_name = self._engine_name.lower()

        if self._engine_name == "default_address_bar":
            # Explicitly requested address bar capability if needed
            self._engine = DefaultEngine()
        else:
            # Default: use DuckDuckGo
            self._engine = DuckDuckGoEngine()
            self._engine_name = "duckduckgo"

        logger.info(f"SearchService initialized with engine: {self._engine_name}")

    def _build_search_url(self, query: str) -> str:
        return self._engine.build_url(query, self._default_limit)

    async def search_batch(self, queries: List[str]) -> List[List[Dict[str, Any]]]:
        """Execute multiple searches concurrently using standard URL navigation."""
        logger.info(f"SearchService: Batch searching {len(queries)} queries in parallel...")
        tasks = [self.search(q) for q in queries]
        return await asyncio.gather(*tasks)

    async def search(self, query: str) -> List[Dict[str, Any]]:
        """
        Main search entry point.
        Returns parsed search results only.
        """
        if not query:
            return []

        # Apply blocking
        final_query = query
        if self._blocked_domains and "-site:" not in query:
            exclusions = " ".join([f"-site:{d}" for d in self._blocked_domains])
            final_query = f"{query} {exclusions}"

        url = self._build_search_url(final_query)

        results = []
        try:
            # Check if this is an address bar search (DefaultEngine)
            if url.startswith("__ADDRESS_BAR_SEARCH__:"):
                # Extract query from marker
                search_query = url.replace("__ADDRESS_BAR_SEARCH__:", "")
                logger.info(f"Search: '{query}' -> [Address Bar Search]")

                # Use address bar input method
                service = get_screenshot_service(headless=self._headless)
                page_data = await service.search_via_address_bar(search_query)
            else:
                logger.info(f"Search: '{query}' -> {url}")
                # Standard URL navigation
                page_data = await self.fetch_page_raw(url, include_screenshot=False)

            content = page_data.get("html", "") or page_data.get("content", "")

            # Debug: Log content length
            logger.debug(f"Search: Raw content length = {len(content)} chars")
            if len(content) < 500:
                logger.warning(f"Search: Content too short, may be empty/blocked. First 500 chars: {content[:500]}")

            # Parse Results (skip raw page - only return parsed results)
            if content and not content.startswith("Error"):
                parsed = self._engine.parse(content)

                # Debug: Log parse result
                logger.info(f"Search: Engine {self._engine_name} parsed {len(parsed)} results from {len(content)} chars")

                # JAVASCRIPT IMAGE INJECTION
                # Inject base64 images from JS extraction if available
                # This provides robust fallback if HTTP URLs fail to load
                js_images = page_data.get("images", [])
                if js_images:
                    logger.info(f"Search: Injecting {len(js_images)} base64 images into top results")
                    for i, img_b64 in enumerate(js_images):
                        if i < len(parsed):
                            b64_src = f"data:image/jpeg;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
                            if "images" not in parsed[i]: parsed[i]["images"] = []
                            # Prepend to prioritize base64 (guaranteed render) over HTTP URLs
                            parsed[i]["images"].insert(0, b64_src)

                logger.info(f"Search parsed {len(parsed)} results for '{query}' using {self._engine_name}")

                # ALWAYS add raw search page as hidden item for debug saving
                # (even when 0 results, so we can debug the parser)
                results.append({
                    "title": f"[DEBUG] Raw Search: {query}",
                    "url": url,
                    "content": content[:50000],  # Limit to 50KB
                    "_type": "search_raw_page",
                    "_hidden": True,  # Don't show to LLM
                })

                results.extend(parsed)
            else:
                logger.warning(f"Search failed/empty for '{query}': {content[:100]}")

            return results

        except Exception as e:
            logger.error(f"Search error for '{query}': {e}")
            # Ensure we return at least an error item
            return [{
                "title": f"Error Search: {query}",
                "url": url,
                "content": f"Error: {e}",
                "_type": "search_raw_page",
                "_hidden": True
            }]

    async def fetch_pages_batch(self, urls: List[str], include_screenshot: bool = True) -> List[Dict[str, Any]]:
        """Fetch multiple pages concurrently."""
        tasks = [self.fetch_page(u, include_screenshot=include_screenshot) for u in urls]
        return await asyncio.gather(*tasks)

    async def fetch_page(self, url: str, timeout: Optional[float] = None, include_screenshot: bool = True) -> Dict[str, Any]:
        """
        Fetch a single page for reading/extracting content.
        """
        if timeout is None:
            timeout = self._fetch_timeout
        return await self.fetch_page_raw(url, timeout, include_screenshot=include_screenshot)

    async def fetch_page_raw(self, url: str, timeout: Optional[float] = None, include_screenshot: bool = True) -> Dict[str, Any]:
        """Internal: Get raw data from browser service."""
        if timeout is None:
            timeout = self._fetch_timeout
        service = get_screenshot_service(headless=self._headless)
        return await service.fetch_page(url, timeout=timeout, include_screenshot=include_screenshot)

    async def screenshot_url(self, url: str, full_page: bool = True) -> Optional[str]:
        """
        Capture a screenshot of a URL.
        Delegates to screenshot service.
        """
        service = get_screenshot_service(headless=self._headless)
        return await service.screenshot_url(url, full_page=full_page)

    async def screenshot_with_content(self, url: str, max_content_length: int = 8000) -> Dict[str, Any]:
        """
        Capture screenshot and extract page content.

        Returns:
            Dict with screenshot_b64, content (truncated), title, url
        """
        service = get_screenshot_service(headless=self._headless)
        return await service.screenshot_with_content(url, max_content_length=max_content_length)
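
A similarly hedged sketch for SearchService on its own. Every setting is read with getattr and a default, so a plain namespace suffices; the sketch assumes the browser service behind get_screenshot_service can actually launch in this environment, and the blocked_domains value is only an example.

# Illustrative only; all config attributes are optional (getattr defaults apply when absent).
import asyncio
from types import SimpleNamespace

from hyw_core.search import SearchService

async def main():
    config = SimpleNamespace(
        headless=True,
        fetch_timeout=20.0,
        search_limit=10,
        blocked_domains=["example.com"],  # appended to the query as -site: exclusions
    )
    service = SearchService(config)       # falls back to the DuckDuckGo engine
    results = await service.search("playwright python tutorial")
    visible = [r for r in results if not r.get("_hidden")]  # skip the raw debug page entry
    for r in visible:
        print(r.get("_type", "search"), r.get("title"), r.get("url"))
    pages = await service.fetch_pages_batch([r["url"] for r in visible][:2])
    print(len(pages), "pages fetched")

asyncio.run(main())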