entari-plugin-hyw 4.0.0rc6-py3-none-any.whl → 4.0.0rc7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of entari-plugin-hyw might be problematic.

@@ -75,7 +75,7 @@ class HistoryManager:
             self._context_history[context_id] = []
         self._context_history[context_id].append(key)
 
-    def save_to_disk(self, key: str, save_root: str = "data/conversations", image_path: Optional[str] = None, web_results: Optional[List[Dict]] = None):
+    def save_to_disk(self, key: str, save_root: str = "data/conversations", image_path: Optional[str] = None, web_results: Optional[List[Dict]] = None, vision_trace: Optional[Dict] = None, instruct_traces: Optional[List[Dict]] = None):
         """Save conversation history to specific folder structure"""
         import os
         import time

@@ -198,51 +198,41 @@ class HistoryManager:
             except Exception as e:
                 print(f"Failed to copy output image: {e}")
 
-            # 4. Save Full Log (Readme style)
-            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
-            model_name = meta.get("model", "unknown")
-            code = self._key_to_code.get(key, "N/A")
-
-            md_content = f"# Conversation Log: {folder_name}\n\n"
-            md_content += f"- **Time**: {timestamp}\n"
-            md_content += f"- **Code**: {code}\n"
-            md_content += f"- **Model**: {model_name}\n\n"
-
-            md_content += "## History\n\n"
-
-            for msg in self._history[key]:
-                role = msg.get("role", "unknown").upper()
-                content = msg.get("content", "")
-
-                md_content += f"### {role}\n\n"
+            # 4. Save Vision Log (if vision stage was used)
+            if vision_trace and not vision_trace.get("skipped"):
+                vision_md = "# Vision Stage Log\n\n"
+                vision_md += f"- **Model**: {vision_trace.get('model', 'unknown')}\n"
+                vision_md += f"- **Time**: {vision_trace.get('time', 0):.2f}s\n"
+                vision_md += f"- **Images Count**: {vision_trace.get('images_count', 0)}\n"
+                vision_md += f"- **Input Tokens**: {vision_trace.get('usage', {}).get('input_tokens', 0)}\n"
+                vision_md += f"- **Output Tokens**: {vision_trace.get('usage', {}).get('output_tokens', 0)}\n\n"
+                vision_md += "## Vision Description Output\n\n"
+                vision_md += f"```\n{vision_trace.get('output', '')}\n```\n"
 
-                tool_calls = msg.get("tool_calls")
-                if tool_calls:
-                    try:
-                        tc_str = json.dumps(tool_calls, ensure_ascii=False, indent=2)
-                    except:
-                        tc_str = str(tool_calls)
-                    md_content += f"**Tool Calls**:\n```json\n{tc_str}\n```\n\n"
-
-                if role == "TOOL":
-                    try:
-                        # Try parsing as JSON first
-                        if isinstance(content, str):
-                            parsed = json.loads(content)
-                            pretty = json.dumps(parsed, ensure_ascii=False, indent=2)
-                            md_content += f"**Output**:\n```json\n{pretty}\n```\n\n"
-                        else:
-                            md_content += f"**Output**:\n```text\n{content}\n```\n\n"
-                    except:
-                        md_content += f"**Output**:\n```text\n{content}\n```\n\n"
-                else:
-                    if content:
-                        md_content += f"{content}\n\n"
-
-                md_content += "---\n\n"
+                with open(os.path.join(folder_path, "vision_log.md"), "w", encoding="utf-8") as f:
+                    f.write(vision_md)
 
-            with open(os.path.join(folder_path, "full_log.md"), "w", encoding="utf-8") as f:
-                f.write(md_content)
+            # 5. Save Instruct Log (all instruct rounds)
+            if instruct_traces:
+                instruct_md = "# Instruct Stage Log\n\n"
+                for i, trace in enumerate(instruct_traces):
+                    stage_name = trace.get("stage_name", f"Round {i+1}")
+                    instruct_md += f"## {stage_name}\n\n"
+                    instruct_md += f"- **Model**: {trace.get('model', 'unknown')}\n"
+                    instruct_md += f"- **Time**: {trace.get('time', 0):.2f}s\n"
+                    instruct_md += f"- **Tool Calls**: {trace.get('tool_calls', 0)}\n"
+                    instruct_md += f"- **Input Tokens**: {trace.get('usage', {}).get('input_tokens', 0)}\n"
+                    instruct_md += f"- **Output Tokens**: {trace.get('usage', {}).get('output_tokens', 0)}\n\n"
+
+                    output = trace.get("output", "")
+                    if output:
+                        instruct_md += "### Reasoning Output\n\n"
+                        instruct_md += f"```\n{output}\n```\n\n"
+
+                    instruct_md += "---\n\n"
+
+                with open(os.path.join(folder_path, "instruct_log.md"), "w", encoding="utf-8") as f:
+                    f.write(instruct_md)
 
         except Exception as e:
            print(f"Failed to save conversation: {e}")

@@ -16,6 +16,7 @@ from .stage_base import StageContext
 from .stage_instruct import InstructStage
 from .stage_instruct_deepsearch import InstructDeepsearchStage
 from .stage_summary import SummaryStage
+from .stage_vision import VisionStage
 from .search import SearchService
 
 

@@ -36,9 +37,15 @@ class ModularPipeline:
         self.client = AsyncOpenAI(base_url=config.base_url, api_key=config.api_key)
 
         # Initialize stages
-        self.instruct_stage = InstructStage(config, self.search_service, self.client)
+        self.instruct_stage = InstructStage(config, self.search_service, self.client, send_func=send_func)
         self.instruct_deepsearch_stage = InstructDeepsearchStage(config, self.search_service, self.client)
         self.summary_stage = SummaryStage(config, self.search_service, self.client)
+        self.vision_stage = VisionStage(config, self.search_service, self.client)
+
+    def _has_vision_model(self) -> bool:
+        """Check if a vision model is configured."""
+        vision_cfg = self.config.get_model_config("vision")
+        return bool(vision_cfg.get("model_name"))
 
     async def execute(
         self,

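`_has_vision_model` treats a missing or empty `model_name` as "vision disabled". Judging by the fields read elsewhere in this diff (`model_name`, `model_provider`, `api_key`, `base_url`, `extra_body`, `input_price`, `output_price`), `get_model_config("vision")` presumably resolves to a dict roughly like the following; every value here is a made-up placeholder, not a default from the package:

```python
# Hypothetical return of config.get_model_config("vision"). Field names are
# taken from this diff; the values are examples only.
vision_cfg = {
    "model_name": "your-vision-model",  # falsy -> VisionStage is skipped
    "model_provider": "YourProvider",
    "api_key": "sk-...",
    "base_url": "https://api.example.com/v1",
    "extra_body": None,   # forwarded verbatim to chat.completions.create
    "input_price": 0.5,   # per 1M tokens, used for the cost line in stage stats
    "output_price": 1.5,
}
```
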
@@ -54,6 +61,9 @@
         stats = {"start_time": start_time}
         usage_totals = {"input_tokens": 0, "output_tokens": 0}
         active_model = model_name or self.config.model_name
+        if not active_model:
+            # Fallback to instruct model for logging/context
+            active_model = self.config.get_model_config("instruct").get("model_name")
 
         context = StageContext(
             user_input=user_input,

@@ -79,6 +89,24 @@
         try:
             logger.info(f"Pipeline: Processing '{user_input[:30]}...'")
 
+            # === Stage 0: Vision (if images and vision model configured) ===
+            if images and self._has_vision_model():
+                logger.info("Pipeline: Stage 0 - Vision (generating image description)")
+                vision_result = await self.vision_stage.execute(context, images)
+
+                if vision_result.success and vision_result.data.get("description"):
+                    context.vision_description = vision_result.data["description"]
+                    logger.info(f"Pipeline: Vision description generated ({len(context.vision_description)} chars)")
+
+                    # Add vision trace
+                    trace["vision"] = vision_result.trace
+                    usage_totals["input_tokens"] += vision_result.usage.get("input_tokens", 0)
+                    usage_totals["output_tokens"] += vision_result.usage.get("output_tokens", 0)
+
+                    # Clear images since we have the description now
+                    # (don't pass raw images to later stages when using vision model)
+                    images = []
+
             # === Stage 1: Instruct (Initial Discovery) ===
             logger.info("Pipeline: Stage 1 - Instruct")
             instruct_result = await self.instruct_stage.execute(context)

@@ -115,20 +143,73 @@
             else:
                 logger.info("Pipeline: Mode is 'fast', skipping deepsearch stage")
 
-            # === Stage 3: Summary ===
-            # Collect page screenshots if image mode (already rendered in InstructStage)
-            all_images = list(images) if images else []
+            # === Parallel Execution: Summary Generation + Image Prefetching ===
+            # We run image prefetching concurrently with Summary generation to save time.
+
+            # 1. Prepare candidates for prefetch (all images in search results)
+            all_candidate_urls = set()
+            for r in context.web_results:
+                # Add images from search results/pages
+                if r.get("images"):
+                    for img in r["images"]:
+                        if img and isinstance(img, str) and img.startswith("http"):
+                            all_candidate_urls.add(img)
+
+            prefetch_list = list(all_candidate_urls)
+            logger.info(f"Pipeline: Starting parallel execution (Summary + Prefetch {len(prefetch_list)} images)")
+
+            # 2. Define parallel tasks with timing
+            async def timed_summary():
+                t0 = time.time()
+                # Collect page screenshots if image mode
+                summary_input_images = list(images) if images else []
+                if context.image_input_supported:
+                    # Collect pre-rendered screenshots from web_results
+                    for r in context.web_results:
+                        if r.get("_type") == "page" and r.get("screenshot_b64"):
+                            summary_input_images.append(r["screenshot_b64"])
+
+                res = await self.summary_stage.execute(
+                    context,
+                    images=summary_input_images if summary_input_images else None
+                )
+                duration = time.time() - t0
+                return res, duration
+
+            async def timed_prefetch():
+                t0 = time.time()
+                if not prefetch_list:
+                    return {}, 0.0
+                try:
+                    from .image_cache import get_image_cache
+                    cache = get_image_cache()
+                    # Start prefetch (non-blocking kickoff)
+                    cache.start_prefetch(prefetch_list)
+                    # Wait for results (blocking until done)
+                    res = await cache.get_all_cached(prefetch_list)
+                    duration = time.time() - t0
+                    return res, duration
+                except Exception as e:
+                    logger.warning(f"Pipeline: Prefetch failed: {e}")
+                    return {}, time.time() - t0
+
+            # 3. Execute concurrently
+            summary_task = asyncio.create_task(timed_summary())
+            prefetch_task = asyncio.create_task(timed_prefetch())
+
+            # Wait for both to complete
+            await asyncio.wait([summary_task, prefetch_task])
+
+            # 4. Process results and log timing
+            summary_result, summary_time = await summary_task
+            cached_map, prefetch_time = await prefetch_task
 
-            if context.image_input_supported:
-                # Collect pre-rendered screenshots from web_results
-                for r in context.web_results:
-                    if r.get("_type") == "page" and r.get("screenshot_b64"):
-                        all_images.append(r["screenshot_b64"])
+            time_diff = abs(summary_time - prefetch_time)
+            if summary_time > prefetch_time:
+                logger.info(f"Pipeline: Image Prefetch finished first ({prefetch_time:.2f}s). Summary took {summary_time:.2f}s. (Waited {time_diff:.2f}s for Summary)")
+            else:
+                logger.info(f"Pipeline: Summary finished first ({summary_time:.2f}s). Image Prefetch took {prefetch_time:.2f}s. (Waited {time_diff:.2f}s for Prefetch)")
 
-            summary_result = await self.summary_stage.execute(
-                context,
-                images=all_images if all_images else None
-            )
             trace["summary"] = summary_result.trace
             usage_totals["input_tokens"] += summary_result.usage.get("input_tokens", 0)
             usage_totals["output_tokens"] += summary_result.usage.get("output_tokens", 0)

@@ -139,40 +220,30 @@
             stats["total_time"] = time.time() - start_time
             structured = self._parse_response(summary_content, context)
 
-            # === Image Caching (Prefetch images for UI) ===
-            try:
-                from .image_cache import get_image_cache
-                cache = get_image_cache()
-
-                # 1. Collect all image URLs from structured response
-                all_image_urls = []
-                for ref in structured.get("references", []):
-                    if ref.get("images"):
-                        all_image_urls.extend([img for img in ref["images"] if img and img.startswith("http")])
-
-                if all_image_urls:
-                    # 2. Prefetch (wait for them as we are about to render)
-                    cached_map = await cache.get_all_cached(all_image_urls)
-
-                    # 3. Update structured response with cached (base64) URLs
+            # === Apply Cached Images ===
+            # Update structured response using the map from parallel prefetch
+            if cached_map:
+                try:
+                    total_replaced = 0
                     for ref in structured.get("references", []):
                         if ref.get("images"):
-                            # Keep cached images, but preserve original URLs as fallback
                             new_images = []
                             for img in ref["images"]:
-                                # 1. Already Base64 (from Search Injection) -> Keep it
+                                # 1. Already Base64 -> Keep it
                                 if img.startswith("data:"):
                                     new_images.append(img)
                                     continue
-
-                                # 2. Cached successfully -> Keep it
+
+                                # 2. Check cache
                                 cached_val = cached_map.get(img)
                                 if cached_val and cached_val.startswith("data:"):
                                     new_images.append(cached_val)
-                                # 3. Else -> DROP IT (User request: "Delete Fallback, must download in advance")
+                                    total_replaced += 1
+                                # 3. Else -> DROP IT (as per policy)
                             ref["images"] = new_images
-            except Exception as e:
-                logger.warning(f"Pipeline: Image caching failed: {e}")
+                    logger.debug(f"Pipeline: Replaced {total_replaced} images with cached versions")
+                except Exception as e:
+                    logger.warning(f"Pipeline: Applying cached images failed: {e}")
 
             # Debug: Log image counts
             total_ref_images = sum(len(ref.get("images", []) or []) for ref in structured.get("references", []))

@@ -197,6 +268,8 @@
                 },
                 "stages_used": stages_used,
                 "web_results": context.web_results,
+                "vision_trace": trace.get("vision"),
+                "instruct_traces": trace.get("instruct_rounds", []),
             }
 
         except Exception as e:

@@ -314,6 +387,27 @@
             "references": search_refs,
             "description": f"Found {len(search_refs)} results."
         })
+
+        # 2. Vision Stage (if used)
+        if trace.get("vision"):
+            v = trace["vision"]
+            if not v.get("skipped"):
+                usage = v.get("usage", {})
+                vision_cfg = self.config.get_model_config("vision")
+                input_price = vision_cfg.get("input_price") or 0
+                output_price = vision_cfg.get("output_price") or 0
+                cost = (usage.get("input_tokens", 0) * input_price + usage.get("output_tokens", 0) * output_price) / 1_000_000
+
+                stages.append({
+                    "name": "Vision",
+                    "model": v.get("model"),
+                    "icon_config": "google",
+                    "provider": "Vision",
+                    "time": v.get("time", 0),
+                    "description": f"Analyzed {v.get('images_count', 0)} image(s).",
+                    "usage": usage,
+                    "cost": cost
+                })
 
         # 2. Instruct Rounds
         for i, t in enumerate(trace.get("instruct_rounds", [])):

@@ -10,6 +10,7 @@ from .browser.service import get_screenshot_service
 from .browser.engines.bing import BingEngine
 from .browser.engines.duckduckgo import DuckDuckGoEngine
 from .browser.engines.google import GoogleEngine
+from .browser.engines.default import DefaultEngine
 
 class SearchService:
     def __init__(self, config: Any):

@@ -21,8 +22,11 @@
         # Domain blocking
         self._blocked_domains = getattr(config, "blocked_domains", []) or []
 
-        # Select Engine
-        self._engine_name = getattr(config, "search_engine", "bing").lower()
+        # Select Engine - DefaultEngine when not specified
+        self._engine_name = getattr(config, "search_engine", None)
+        if self._engine_name:
+            self._engine_name = self._engine_name.lower()
+
         if self._engine_name == "bing":
             self._engine = BingEngine()
         elif self._engine_name == "google":

@@ -30,8 +34,9 @@
         elif self._engine_name == "duckduckgo":
             self._engine = DuckDuckGoEngine()
         else:
-            # Default fallback
-            self._engine = BingEngine()
+            # Default: use browser address bar search (Google-based)
+            self._engine = DefaultEngine()
+            self._engine_name = "default"
 
         logger.info(f"SearchService initialized with engine: {self._engine_name}")
 

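`search_engine` is now effectively optional: `"bing"`, `"google"`, and `"duckduckgo"` still select their dedicated engines, while anything else, including leaving the option unset, falls through to `DefaultEngine`. A condensed, runnable restatement of the selection rule:

```python
from typing import Optional

def pick_engine(search_engine: Optional[str]) -> str:
    """Condensed restatement of the selection logic in this hunk."""
    name = search_engine.lower() if search_engine else None
    if name in ("bing", "google", "duckduckgo"):
        return name
    return "default"  # browser address-bar search, new in this release

assert pick_engine(None) == "default"  # rc6 fell back to BingEngine here
assert pick_engine("Bing") == "bing"
```
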
@@ -39,7 +44,8 @@
         return self._engine.build_url(query, self._default_limit)
 
     async def search_batch(self, queries: List[str]) -> List[List[Dict[str, Any]]]:
-        """Execute multiple searches concurrently."""
+        """Execute multiple searches concurrently using standard URL navigation."""
+        logger.info(f"SearchService: Batch searching {len(queries)} queries in parallel...")
         tasks = [self.search(q) for q in queries]
         return await asyncio.gather(*tasks)
 

@@ -58,17 +64,36 @@
             final_query = f"{query} {exclusions}"
 
         url = self._build_search_url(final_query)
-        logger.info(f"Search: '{query}' -> {url}")
-
+
         results = []
         try:
-            # Fetch - Search parsing doesn't need screenshot, only HTML
-            page_data = await self.fetch_page_raw(url, include_screenshot=False)
+            # Check if this is an address bar search (DefaultEngine)
+            if url.startswith("__ADDRESS_BAR_SEARCH__:"):
+                # Extract query from marker
+                search_query = url.replace("__ADDRESS_BAR_SEARCH__:", "")
+                logger.info(f"Search: '{query}' -> [Address Bar Search]")
+
+                # Use address bar input method
+                service = get_screenshot_service(headless=self._headless)
+                page_data = await service.search_via_address_bar(search_query)
+            else:
+                logger.info(f"Search: '{query}' -> {url}")
+                # Standard URL navigation
+                page_data = await self.fetch_page_raw(url, include_screenshot=False)
+
             content = page_data.get("html", "") or page_data.get("content", "")
+
+            # Debug: Log content length
+            logger.debug(f"Search: Raw content length = {len(content)} chars")
+            if len(content) < 500:
+                logger.warning(f"Search: Content too short, may be empty/blocked. First 500 chars: {content[:500]}")
 
             # Parse Results (skip raw page - only return parsed results)
             if content and not content.startswith("Error"):
                 parsed = self._engine.parse(content)
+
+                # Debug: Log parse result
+                logger.info(f"Search: Engine {self._engine_name} parsed {len(parsed)} results from {len(content)} chars")
 
                 # JAVASCRIPT IMAGE INJECTION
                 # Inject base64 images from JS extraction if available

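`DefaultEngine` itself is not part of this diff, but the `__ADDRESS_BAR_SEARCH__:` branch implies that its `build_url` returns a marker-prefixed query rather than a navigable URL, which `search` then routes to the screenshot service's `search_via_address_bar`. A speculative sketch of that contract; the real implementation lives in `browser/engines/default.py` and may differ, and `parse` is omitted because its HTML handling is not visible here:

```python
# Speculative sketch of the marker protocol only — not the actual engine.
class DefaultEngine:
    def build_url(self, query: str, limit: int) -> str:
        # Tell SearchService to type the query into the browser's address
        # bar instead of navigating to an engine-specific results URL.
        return f"__ADDRESS_BAR_SEARCH__:{query}"
```
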
@@ -84,6 +109,17 @@
                             parsed[i]["images"].insert(0, b64_src)
 
                 logger.info(f"Search parsed {len(parsed)} results for '{query}' using {self._engine_name}")
+
+                # ALWAYS add raw search page as hidden item for debug saving
+                # (even when 0 results, so we can debug the parser)
+                results.append({
+                    "title": f"[DEBUG] Raw Search: {query}",
+                    "url": url,
+                    "content": content[:50000],  # Limit to 50KB
+                    "_type": "search_raw_page",
+                    "_hidden": True,  # Don't show to LLM
+                })
+
                 results.extend(parsed)
             else:
                 logger.warning(f"Search failed/empty for '{query}': {content[:100]}")

@@ -39,6 +39,9 @@ class StageContext:
     # Model capabilities
     image_input_supported: bool = True
 
+    # Vision description (from VisionStage)
+    vision_description: str = ""
+
     def next_id(self) -> int:
         """Get next global ID."""
         self.global_id_counter += 1

@@ -8,7 +8,7 @@ Analyze user query and execute initial searches.
 import json
 import time
 import asyncio
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Callable, Awaitable
 from loguru import logger
 from openai import AsyncOpenAI
 

@@ -26,14 +26,15 @@ class InstructStage(BaseStage):
     def name(self) -> str:
         return "Instruct"
 
-    def __init__(self, config: Any, search_service: Any, client: AsyncOpenAI):
+    def __init__(self, config: Any, search_service: Any, client: AsyncOpenAI, send_func: Optional[Callable[[str], Awaitable[None]]] = None):
         super().__init__(config, search_service, client)
+        self.send_func = send_func
 
         self.refuse_answer_tool = get_refuse_answer_tool()
         self.web_search_tool = get_web_search_tool()
         self.crawl_page_tool = get_crawl_page_tool()
         self.set_mode_tool = get_set_mode_tool()
-
+
     async def execute(self, context: StageContext) -> StageResult:
         start_time = time.time()
 

@@ -113,6 +114,7 @@
         model = model_cfg.get("model_name") or self.config.model_name
 
         try:
+            logger.info(f"Instruct: Sending LLM request to {model}...")
             response = await client.chat.completions.create(
                 model=model,
                 messages=messages,

@@ -186,6 +188,14 @@
                 if mode in ("fast", "deepsearch"):
                     context.selected_mode = mode
                     logger.info(f"Instruct: Mode set to '{mode}'")
+
+                    # Notify immediately if deepsearch
+                    if mode == "deepsearch" and self.send_func:
+                        try:
+                            await self.send_func("🔍 正在进行深度研究,可能需要一些时间,请耐心等待...")
+                        except Exception as e:
+                            logger.warning(f"Instruct: Failed to send notification: {e}")
+
                 results_for_context.append({
                     "id": tc_id, "name": name, "content": f"Mode set to: {mode}"
                 })

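`send_func` is any `Callable[[str], Awaitable[None]]`, which keeps the stage transport-agnostic. A conforming callback might look like the following; the delivery mechanism is a stand-in, and only the signature comes from the diff:

```python
from typing import Awaitable, Callable

async def notify_user(text: str) -> None:
    # Stand-in transport: a real deployment would forward `text` to the
    # chat platform session the plugin is currently serving.
    print(text)

send_func: Callable[[str], Awaitable[None]] = notify_user
# config, search_service and client assumed bound as in ModularPipeline:
stage = InstructStage(config, search_service, client, send_func=notify_user)
```
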
@@ -47,6 +47,11 @@
         # Build Context Message
         context_message = f"## Web Search & Page Content\n\n```context\n{full_context}\n```"
 
+        # Add vision description if present (from VisionStage)
+        if context.vision_description:
+            vision_context = f"## 用户图片描述\n\n{context.vision_description}"
+            context_message = f"{vision_context}\n\n{context_message}"
+
         # Build user content
         user_text = context.user_input or "..."
         if images:

@@ -104,6 +109,7 @@
             "provider": model_cfg.get("model_provider") or "Unknown",
             "usage": usage,
             "system_prompt": system_prompt,
+            "context_message": context_message,  # Includes vision description + search results
             "output": content,
             "time": time.time() - start_time,
             "images_count": len(images) if images else 0,

@@ -0,0 +1,113 @@
+"""
+Vision Stage
+
+Generates image description using a vision-capable model.
+The description is then passed as context to subsequent stages.
+"""
+
+import time
+from typing import Any, Dict, List, Optional
+
+from loguru import logger
+from openai import AsyncOpenAI
+
+from .stage_base import BaseStage, StageContext, StageResult
+from .definitions import VISION_DESCRIPTION_SP
+
+
+class VisionStage(BaseStage):
+    """
+    Vision Stage: Generate image description.
+
+    Takes user images and text, calls a vision model to produce
+    a detailed description of the image content.
+    """
+
+    @property
+    def name(self) -> str:
+        return "Vision"
+
+    async def execute(
+        self,
+        context: StageContext,
+        images: List[str] = None
+    ) -> StageResult:
+        """Generate image description."""
+        start_time = time.time()
+
+        if not images:
+            return StageResult(
+                success=True,
+                data={"description": ""},
+                trace={"skipped": True, "reason": "No images provided"}
+            )
+
+        # Get model config for vision stage
+        model_cfg = self.config.get_model_config("vision")
+        model = model_cfg.get("model_name")
+
+        if not model:
+            logger.warning("VisionStage: No vision model configured, skipping")
+            return StageResult(
+                success=True,
+                data={"description": ""},
+                trace={"skipped": True, "reason": "No vision model configured"}
+            )
+
+        client = self._client_for(
+            api_key=model_cfg.get("api_key"),
+            base_url=model_cfg.get("base_url")
+        )
+
+        # Build user content with images
+        user_text = context.user_input or "请描述这张图片"
+        user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_text}]
+
+        for img_b64 in images:
+            url = f"data:image/jpeg;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
+            user_content.append({"type": "image_url", "image_url": {"url": url}})
+
+        messages = [
+            {"role": "system", "content": VISION_DESCRIPTION_SP},
+            {"role": "user", "content": user_content}
+        ]
+
+        try:
+            logger.info(f"VisionStage: Calling model '{model}' with {len(images)} image(s)")
+            response = await client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=0.3,  # Lower temperature for factual description
+                extra_body=model_cfg.get("extra_body"),
+            )
+        except Exception as e:
+            logger.error(f"VisionStage LLM error: {e}")
+            return StageResult(
+                success=False,
+                error=str(e),
+                data={"description": ""},
+                trace={"error": str(e)}
+            )
+
+        usage = {"input_tokens": 0, "output_tokens": 0}
+        if hasattr(response, "usage") and response.usage:
+            usage["input_tokens"] = getattr(response.usage, "prompt_tokens", 0) or 0
+            usage["output_tokens"] = getattr(response.usage, "completion_tokens", 0) or 0
+
+        description = (response.choices[0].message.content or "").strip()
+
+        logger.info(f"VisionStage: Generated description ({len(description)} chars)")
+
+        return StageResult(
+            success=True,
+            data={"description": description},
+            usage=usage,
+            trace={
+                "model": model,
+                "provider": model_cfg.get("model_provider") or "Unknown",
+                "usage": usage,
+                "output": description,
+                "time": time.time() - start_time,
+                "images_count": len(images),
+            }
+        )

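Driving the new stage standalone would look roughly like this; `config`, `search_service`, `client`, and `jpeg_b64` are assumed to be bound as in `ModularPipeline`, and the `StageContext` construction is abbreviated (a sketch, not a test from the package):

```python
import asyncio

async def main():
    stage = VisionStage(config, search_service, client)
    ctx = StageContext(user_input="What is in this picture?")
    result = await stage.execute(ctx, images=[jpeg_b64])  # raw base64 or data: URL

    if result.success and not result.trace.get("skipped"):
        print(result.data["description"])
        print(result.trace["usage"])  # {"input_tokens": ..., "output_tokens": ...}

asyncio.run(main())
```
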
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: entari_plugin_hyw
-Version: 4.0.0rc6
+Version: 4.0.0rc7
 Summary: Use large language models to interpret chat messages
 Author-email: kumoSleeping <zjr2992@outlook.com>
 License: MIT