entari-plugin-hyw 3.4.2__py3-none-any.whl → 3.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/__init__.py +78 -158
- entari_plugin_hyw/assets/card-dist/index.html +396 -0
- entari_plugin_hyw/assets/card-dist/logos/anthropic.svg +1 -0
- entari_plugin_hyw/assets/card-dist/logos/cerebras.svg +9 -0
- entari_plugin_hyw/assets/card-dist/logos/deepseek.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/gemini.svg +1 -0
- entari_plugin_hyw/assets/card-dist/logos/google.svg +1 -0
- entari_plugin_hyw/assets/card-dist/logos/grok.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/huggingface.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/microsoft.svg +15 -0
- entari_plugin_hyw/assets/card-dist/logos/minimax.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/mistral.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/nvida.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/openai.svg +1 -0
- entari_plugin_hyw/assets/card-dist/logos/openrouter.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/perplexity.svg +24 -0
- entari_plugin_hyw/assets/card-dist/logos/qwen.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/xai.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/xiaomi.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/zai.png +0 -0
- entari_plugin_hyw/assets/card-dist/vite.svg +1 -0
- entari_plugin_hyw/card-ui/.gitignore +24 -0
- entari_plugin_hyw/card-ui/README.md +5 -0
- entari_plugin_hyw/card-ui/index.html +16 -0
- entari_plugin_hyw/card-ui/package-lock.json +2342 -0
- entari_plugin_hyw/card-ui/package.json +31 -0
- entari_plugin_hyw/card-ui/public/logos/anthropic.svg +1 -0
- entari_plugin_hyw/card-ui/public/logos/cerebras.svg +9 -0
- entari_plugin_hyw/card-ui/public/logos/deepseek.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/gemini.svg +1 -0
- entari_plugin_hyw/card-ui/public/logos/google.svg +1 -0
- entari_plugin_hyw/card-ui/public/logos/grok.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/huggingface.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/microsoft.svg +15 -0
- entari_plugin_hyw/card-ui/public/logos/minimax.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/mistral.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/nvida.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/openai.svg +1 -0
- entari_plugin_hyw/card-ui/public/logos/openrouter.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/perplexity.svg +24 -0
- entari_plugin_hyw/card-ui/public/logos/qwen.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/xai.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/xiaomi.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/zai.png +0 -0
- entari_plugin_hyw/card-ui/public/vite.svg +1 -0
- entari_plugin_hyw/card-ui/src/App.vue +410 -0
- entari_plugin_hyw/card-ui/src/assets/vue.svg +1 -0
- entari_plugin_hyw/card-ui/src/components/HelloWorld.vue +41 -0
- entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +385 -0
- entari_plugin_hyw/card-ui/src/components/SectionCard.vue +41 -0
- entari_plugin_hyw/card-ui/src/components/StageCard.vue +183 -0
- entari_plugin_hyw/card-ui/src/main.ts +5 -0
- entari_plugin_hyw/card-ui/src/style.css +8 -0
- entari_plugin_hyw/card-ui/src/test_regex.js +103 -0
- entari_plugin_hyw/card-ui/src/types.ts +52 -0
- entari_plugin_hyw/card-ui/tsconfig.app.json +16 -0
- entari_plugin_hyw/card-ui/tsconfig.json +7 -0
- entari_plugin_hyw/card-ui/tsconfig.node.json +26 -0
- entari_plugin_hyw/card-ui/vite.config.ts +16 -0
- entari_plugin_hyw/{core/history.py → history.py} +25 -1
- entari_plugin_hyw/image_cache.py +283 -0
- entari_plugin_hyw/{utils/misc.py → misc.py} +0 -3
- entari_plugin_hyw/{core/pipeline.py → pipeline.py} +236 -86
- entari_plugin_hyw/{utils/prompts_cn.py → prompts.py} +10 -25
- entari_plugin_hyw/render_vue.py +314 -0
- entari_plugin_hyw/{utils/search.py → search.py} +227 -10
- {entari_plugin_hyw-3.4.2.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/METADATA +5 -2
- entari_plugin_hyw-3.5.0rc2.dist-info/RECORD +88 -0
- entari_plugin_hyw/assets/libs/highlight.css +0 -10
- entari_plugin_hyw/assets/libs/highlight.js +0 -1213
- entari_plugin_hyw/assets/libs/katex-auto-render.js +0 -1
- entari_plugin_hyw/assets/libs/katex.css +0 -1
- entari_plugin_hyw/assets/libs/katex.js +0 -1
- entari_plugin_hyw/assets/libs/tailwind.css +0 -1
- entari_plugin_hyw/assets/package-lock.json +0 -953
- entari_plugin_hyw/assets/package.json +0 -16
- entari_plugin_hyw/assets/tailwind.config.js +0 -12
- entari_plugin_hyw/assets/tailwind.input.css +0 -235
- entari_plugin_hyw/assets/template.html +0 -157
- entari_plugin_hyw/assets/template.html.bak +0 -157
- entari_plugin_hyw/assets/template.j2 +0 -400
- entari_plugin_hyw/core/__init__.py +0 -0
- entari_plugin_hyw/core/config.py +0 -38
- entari_plugin_hyw/core/hyw.py +0 -48
- entari_plugin_hyw/core/render.py +0 -630
- entari_plugin_hyw/utils/__init__.py +0 -2
- entari_plugin_hyw/utils/browser.py +0 -40
- entari_plugin_hyw/utils/playwright_tool.py +0 -36
- entari_plugin_hyw/utils/prompts.py +0 -119
- entari_plugin_hyw-3.4.2.dist-info/RECORD +0 -49
- {entari_plugin_hyw-3.4.2.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-3.4.2.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/{core/pipeline.py → pipeline.py}

@@ -1,6 +1,7 @@
 import asyncio
 import html
 import json
+import re
 import time
 from contextlib import asynccontextmanager
 from typing import Any, Dict, List, Optional, Tuple
@@ -8,16 +9,14 @@ from typing import Any, Dict, List, Optional, Tuple
 from loguru import logger
 from openai import AsyncOpenAI

-from .
-from
-from
+from .search import SearchService
+from .image_cache import get_cached_images
+from .prompts import (
     AGENT_SP,
     AGENT_SP_INSTRUCT_VISION_ADD,
     AGENT_SP_TOOLS_STANDARD_ADD,
     AGENT_SP_TOOLS_AGENT_ADD,
     AGENT_SP_SEARCH_ADD,
-    AGENT_SP_PAGE_ADD,
-    AGENT_SP_IMAGE_SEARCH_ADD,
     INSTRUCT_SP,
     INSTRUCT_SP_VISION_ADD,
     VISION_SP,
@@ -33,7 +32,7 @@ class ProcessingPipeline:
     Core pipeline (vision -> instruct/search -> agent).
     """

-    def __init__(self, config:
+    def __init__(self, config: Any):
         self.config = config
         self.search_service = SearchService(config)
         self.client = AsyncOpenAI(base_url=self.config.base_url, api_key=self.config.api_key)
@@ -120,11 +119,9 @@
         final_response_content = ""
         structured: Dict[str, Any] = {}

-        # Reset search cache and ID
+        # Reset search cache and ID counter for this execution
         self.all_web_results = []
-        self.
-        self.page_id_counter = 0
-        self.image_id_counter = 0
+        self.global_id_counter = 0

         try:
             logger.info(f"Pipeline: Starting workflow for '{user_input}' using {active_model}")
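
The 3.4.2 code kept separate page and image ID counters; 3.5.0rc2 collapses them into one `global_id_counter`, so search, page, and image results share a single `[N]` numbering space that the citation parser later resolves through the `_id`/`_type` fields on `all_web_results`. A minimal sketch of how such a counter could assign those fields (the `_register_result` helper name is hypothetical, not from the diff):

```python
from typing import Any, Dict, List

class _IdDemo:
    """Sketch: one counter numbering all result types sequentially."""

    def __init__(self) -> None:
        self.all_web_results: List[Dict[str, Any]] = []
        self.global_id_counter = 0  # replaces the separate page/image counters

    def _register_result(self, result: Dict[str, Any], result_type: str) -> int:
        # Hypothetical helper: assign the next global [N] id and tag the type.
        self.global_id_counter += 1
        result["_id"] = self.global_id_counter
        result["_type"] = result_type  # "search" | "page" | "image"
        self.all_web_results.append(result)
        return self.global_id_counter

demo = _IdDemo()
demo._register_result({"url": "https://example.com/a"}, "search")    # _id == 1
demo._register_result({"url": "https://example.com/b"}, "page")      # _id == 2
demo._register_result({"url": "https://example.com/c.png"}, "image") # _id == 3
```
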
@@ -188,7 +185,8 @@
                 vision_text=vision_text,
                 model=instruct_model,
             )
-
+            # Instruct time excludes search time (search_time is returned separately)
+            instruct_time = time.time() - instruct_start - search_time

             # Calculate Instruct Cost
             instruct_cost = 0.0
@@ -265,17 +263,18 @@
             if vision_text:
                 system_prompt += AGENT_SP_INSTRUCT_VISION_ADD.format(vision_msgs=vision_text)

-            # Append search results
-            if has_search_results and search_msgs_text:
-                system_prompt += AGENT_SP_SEARCH_ADD.format(search_msgs=search_msgs_text)
-
-            # Append crawled page content
+            # Append all search results (text, page, image) in one block
             page_msgs_text = self._format_page_msgs()
+            all_search_parts = []
+            if has_search_results and search_msgs_text:
+                all_search_parts.append(search_msgs_text)
             if page_msgs_text:
-
-
+                all_search_parts.append(page_msgs_text)
             if has_image_results and image_msgs_text:
-
+                all_search_parts.append(image_msgs_text)
+
+            if all_search_parts:
+                system_prompt += AGENT_SP_SEARCH_ADD.format(search_msgs="\n".join(all_search_parts))

             last_system_prompt = system_prompt

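
Search, crawled-page, and image-search messages are now joined into a single block and injected through one template; the dedicated `AGENT_SP_PAGE_ADD` and `AGENT_SP_IMAGE_SEARCH_ADD` templates are deleted from prompts.py (see the final hunk below). A sketch of the assembled prompt, using the new `AGENT_SP_SEARCH_ADD` body with placeholder result text:

```python
# Consolidated prompt assembly; result strings are placeholders.
AGENT_SP_SEARCH_ADD = """
## 联网信息
{search_msgs}
"""

all_search_parts = [
    "[1] Example search hit - https://example.com",    # search_msgs_text
    "[2] Crawled page summary - https://example.org",  # page_msgs_text
    "[3] Image result - https://example.net/x.png",    # image_msgs_text
]
system_prompt = "...base agent prompt..."
if all_search_parts:
    system_prompt += AGENT_SP_SEARCH_ADD.format(search_msgs="\n".join(all_search_parts))
print(system_prompt)
```
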
@@ -331,6 +330,7 @@
                         "tool_results": [],
                         "tool_time": tool_exec_time,
                         "llm_time": step_llm_time,
+                        "usage": step_usage,
                     }
                     for i, result in enumerate(results):
                         tc = tool_calls[i]
@@ -428,7 +428,7 @@
             stages_used.append({
                 "name": "Vision",
                 "model": v_model,
-                "icon_config":
+                "icon_config": infer_icon(v_model, v_base_url),
                 "provider": infer_provider(v_base_url),
                 "time": v.get("time", 0),
                 "cost": v.get("cost", 0.0)
@@ -441,20 +441,33 @@
             stages_used.append({
                 "name": "Instruct",
                 "model": i_model,
-                "icon_config":
+                "icon_config": infer_icon(i_model, i_base_url),
                 "provider": infer_provider(i_base_url),
                 "time": i.get("time", 0),
                 "cost": i.get("cost", 0.0)
             })

-        if
+        # Show Search stage if we have ANY search results (text OR image)
+        if (has_search_results or has_image_results) and search_payloads:
+            # Collect initial search results for the Search stage card
+            initial_refs = [
+                {"title": r.get("title", ""), "url": r.get("url", ""), "domain": r.get("domain", "")}
+                for r in self.all_web_results if r.get("_type") == "search"
+            ]
+            initial_images = [
+                {"title": r.get("title", ""), "url": r.get("url", ""), "thumbnail": r.get("thumbnail", "")}
+                for r in self.all_web_results if r.get("_type") == "image"
+            ]
+
             stages_used.append({
                 "name": "Search",
                 "model": getattr(self.config, "search_name", "DuckDuckGo"),
                 "icon_config": "search",
                 "provider": getattr(self.config, 'search_provider', 'Crawl4AI'),
                 "time": search_time,
-                "cost": 0.0
+                "cost": 0.0,
+                "references": initial_refs,
+                "image_references": initial_images
             })

         # Add Crawler stage if Instruct used crawl_page
@@ -496,18 +509,24 @@
             a_model = a.get("model", "") or active_model
             a_base_url = a.get("base_url", "") or self.config.base_url
             steps = a.get("steps", [])
-            agent_icon =
+            agent_icon = infer_icon(a_model, a_base_url)
             agent_provider = infer_provider(a_base_url)

             for s in steps:
                 if "tool_calls" in s:
                     # 1. Agent Thought Stage (with LLM time)
+                    # Calculate step cost
+                    step_usage = s.get("usage", {})
+                    step_cost = 0.0
+                    if a_in_price > 0 or a_out_price > 0:
+                        step_cost = (step_usage.get("input_tokens", 0) / 1_000_000 * a_in_price) + (step_usage.get("output_tokens", 0) / 1_000_000 * a_out_price)
+
                     stages_used.append({
                         "name": "Agent",
                         "model": a_model,
                         "icon_config": agent_icon,
                         "provider": agent_provider,
-                        "time": s.get("llm_time", 0), "cost":
+                        "time": s.get("llm_time", 0), "cost": step_cost
                     })

                     # 2. Grouped Tool Stages
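
Per-step Agent cost is straightforward per-million-token pricing on the step's recorded usage. A worked example with illustrative prices (the `a_in_price`/`a_out_price` values here are assumptions, not the plugin's defaults):

```python
# Worked example of the step-cost arithmetic above; prices are illustrative.
step_usage = {"input_tokens": 12_000, "output_tokens": 800}
a_in_price = 0.25   # USD per 1M input tokens (assumed)
a_out_price = 1.25  # USD per 1M output tokens (assumed)

step_cost = (step_usage.get("input_tokens", 0) / 1_000_000 * a_in_price) \
          + (step_usage.get("output_tokens", 0) / 1_000_000 * a_out_price)
print(f"{step_cost:.6f}")  # 0.003 + 0.001 = 0.004 USD for this step
```
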
@@ -587,11 +606,33 @@
                 "time": 0, "cost": 0
             })

-
-
-
-
-
+        # Assign total time/cost to last Agent stage
+        # Sum up total time/cost for UI/stats (implicit via loop above)
+        # No need to assign everything to last agent anymore as we distribute it.
+
+        # --- Final Filter: Only show cited items in workflow cards ---
+        cited_urls = {ref['url'] for ref in (structured.get("references", []) +
+                                             structured.get("page_references", []) +
+                                             structured.get("image_references", []))}
+
+        # Find images already rendered in markdown content (to avoid duplicate display)
+        markdown_image_urls = set()
+        md_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')
+        for match in md_img_pattern.finditer(final_content):
+            markdown_image_urls.add(match.group(1))
+
+        for s in stages_used:
+            if "references" in s and s["references"]:
+                s["references"] = [r for r in s["references"] if r.get("url") in cited_urls]
+            # Filter out images already shown in markdown content
+            # Check both url AND thumbnail since either might be used in markdown
+            if "image_references" in s and s["image_references"]:
+                s["image_references"] = [
+                    r for r in s["image_references"]
+                    if r.get("url") not in markdown_image_urls and (r.get("thumbnail") or "") not in markdown_image_urls
+                ]
+            if "crawled_pages" in s and s["crawled_pages"]:
+                s["crawled_pages"] = [r for r in s["crawled_pages"] if r.get("url") in cited_urls]

         # Clean up conversation history: Remove tool calls and results to save tokens and avoid ID conflicts
         # Keep only 'user' messages and 'assistant' messages without tool_calls (final answers)
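
The duplicate-image filter keys on `md_img_pattern`, which only captures absolute http(s) URLs inside markdown image syntax. A quick standalone check of what it does and does not match:

```python
import re

# Same pattern as the diff: absolute http(s) URLs inside image syntax only.
md_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')

sample = (
    "Intro ![cat](https://example.com/cat.png)\n"
    "Relative ![icon](./local.png) is ignored, and a plain link "
    "[site](https://example.com/page) is not an image."
)
print(md_img_pattern.findall(sample))
# ['https://example.com/cat.png'] -- an image_reference pointing at this URL
# would be filtered out of the stage card to avoid duplicate display.
```
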
@@ -606,6 +647,67 @@
         # Update the reference (since it might be used by caller)
         current_history[:] = cleaned_history

+        # --- Apply cached images to reduce render time ---
+        # Collect all image URLs that need caching (avoid duplicates when thumbnail == url)
+        all_image_urls = set()
+        for img_ref in structured.get("image_references", []):
+            if img_ref.get("thumbnail"):
+                all_image_urls.add(img_ref["thumbnail"])
+            if img_ref.get("url"):
+                all_image_urls.add(img_ref["url"])
+
+        for stage in stages_used:
+            for img_ref in stage.get("image_references", []):
+                if img_ref.get("thumbnail"):
+                    all_image_urls.add(img_ref["thumbnail"])
+                if img_ref.get("url"):
+                    all_image_urls.add(img_ref["url"])
+
+        # Also collect image URLs from markdown content
+        markdown_img_pattern = re.compile(r'!\[.*?\]\((https?://[^)]+)\)')
+        markdown_urls = markdown_img_pattern.findall(final_content)
+        all_image_urls.update(markdown_urls)
+
+        # Get cached versions (waits for pending downloads, with timeout)
+        if all_image_urls:
+            try:
+                cached_map = await get_cached_images(list(all_image_urls), wait_timeout=3.0)
+
+                # Apply cached URLs to structured response
+                for img_ref in structured.get("image_references", []):
+                    if img_ref.get("thumbnail") and img_ref["thumbnail"] in cached_map:
+                        img_ref["thumbnail"] = cached_map[img_ref["thumbnail"]]
+                    if img_ref.get("url") and img_ref["url"] in cached_map:
+                        img_ref["url"] = cached_map[img_ref["url"]]
+
+                # Apply cached URLs to stages
+                for stage in stages_used:
+                    for img_ref in stage.get("image_references", []):
+                        if img_ref.get("thumbnail") and img_ref["thumbnail"] in cached_map:
+                            img_ref["thumbnail"] = cached_map[img_ref["thumbnail"]]
+                        if img_ref.get("url") and img_ref["url"] in cached_map:
+                            img_ref["url"] = cached_map[img_ref["url"]]
+
+                # Replace image URLs in markdown content with cached versions
+                def replace_markdown_img(match):
+                    full_match = match.group(0)
+                    url = match.group(1)
+                    cached_url = cached_map.get(url)
+                    if cached_url and cached_url != url:
+                        return full_match.replace(url, cached_url)
+                    return full_match
+
+                final_content = markdown_img_pattern.sub(replace_markdown_img, final_content)
+                structured["response"] = markdown_img_pattern.sub(replace_markdown_img, structured.get("response", ""))
+
+                # Log cache stats
+                from .image_cache import get_image_cache
+                cache_stats = get_image_cache().get_stats()
+                logger.info(f"ImageCache stats: {cache_stats}")
+
+            except Exception as e:
+                logger.warning(f"Failed to apply image cache: {e}")
+
         return {
             "llm_response": final_content,
             "structured_response": structured,
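
Based on how `cached_map` is used above, `get_cached_images` (from the new `image_cache.py`) takes the collected URLs plus a `wait_timeout` and resolves to a mapping from original URL to a cached replacement. A minimal usage sketch under that inferred contract; falling back to the original URL on a cache miss is an assumption:

```python
import asyncio

from entari_plugin_hyw.image_cache import get_cached_images

async def demo() -> None:
    urls = ["https://example.com/a.png", "https://example.com/b.jpg"]
    # Waits up to 3s for any in-flight downloads, then returns url -> cached url.
    cached_map = await get_cached_images(urls, wait_timeout=3.0)
    for url in urls:
        # Assumed fallback: keep the original URL if it was not cached in time.
        print(url, "->", cached_map.get(url, url))

asyncio.run(demo())
```
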
@@ -627,11 +729,7 @@ class ProcessingPipeline:
         }

     def _parse_tagged_response(self, text: str) -> Dict[str, Any]:
-        """Parse response and auto-infer references from
-
-        New simplified format:
-        - Body text uses [1][2] format for citations
-        - No ref code block needed - we auto-infer from citations
+        """Parse response and auto-infer references from citations and markdown images.
         """
         parsed = {"response": "", "references": [], "page_references": [], "image_references": [], "flow_steps": []}
         if not text:
@@ -639,9 +737,14 @@

         import re

-
+        # 1. Strip trailing reference/source list
+        body_text = text
+        ref_list_pattern = re.compile(r'(?:\n\s*|^)\s*(?:#{1,3}|\*\*)\s*(?:References|Citations|Sources|参考资料|引用)[\s\S]*$', re.IGNORECASE | re.MULTILINE)
+        body_text = ref_list_pattern.sub('', body_text)
+
+        remaining_text = body_text.strip()

-        #
+        # 2. Unwrap JSON if necessary
         try:
             if remaining_text.strip().startswith("{") and "action" in remaining_text:
                 data = json.loads(remaining_text)
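
The new step 1 drops any trailing reference list the model emits, since the pipeline regenerates references itself: the pattern anchors on a heading or bold marker named References/Citations/Sources (or the Chinese equivalents) and consumes everything to the end of the string. A standalone check:

```python
import re

# Same pattern as the diff's step 1.
ref_list_pattern = re.compile(
    r'(?:\n\s*|^)\s*(?:#{1,3}|\*\*)\s*(?:References|Citations|Sources|参考资料|引用)[\s\S]*$',
    re.IGNORECASE | re.MULTILINE,
)

raw = "Body with a claim [1].\n\n## References\n[1] https://example.com"
print(ref_list_pattern.sub('', raw).strip())
# -> 'Body with a claim [1].'; the [1] left in the body is renumbered
#    by the citation-mapping steps that follow in this method.
```
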
@@ -650,67 +753,114 @@
         except Exception:
             pass

-        #
+        # 3. Identify all citations [N] and direct markdown images ![]()
+        cited_ids = []
         body_pattern = re.compile(r'\[(\d+)\]')
-        id_order = []  # Preserve citation order
-
         for match in body_pattern.finditer(remaining_text):
             try:
-
-
-
-
-
-
-
-
-
+                cited_ids.append(int(match.group(1)))
+            except ValueError: pass
+
+        # Also find direct URLs in ![]()
+        direct_image_urls = []
+        img_pattern = re.compile(r'!\[.*?\]\((.*?)\)')
+        for match in img_pattern.finditer(remaining_text):
+            url = match.group(1).strip()
+            if url and not url.startswith('['):  # Not a [N] citation
+                direct_image_urls.append(url)
+
+        # 4. Build Citation Maps and Reference Lists
+        unified_id_map = {}
+        # Keep track of what we've already added to avoid duplicates
+        seen_urls = set()

+        # id_order needs to be unique and preserve appearance order
+        id_order = []
+        for id_val in cited_ids:
+            if id_val not in id_order:
+                id_order.append(id_val)
+
+        # Process [N] citations first to determine numbering
         for old_id in id_order:
-            # Find in all_web_results by _id
             result_item = next((r for r in self.all_web_results if r.get("_id") == old_id), None)
+            if not result_item: continue

+            url = result_item.get("url", "")
+            item_type = result_item.get("_type", "")
+
+            entry = {
+                "title": result_item.get("title", ""),
+                "url": url,
+                "domain": result_item.get("domain", "")
+            }
+
+            if item_type == "search":
+                parsed["references"].append(entry)
+                unified_id_map[old_id] = len(parsed["references"]) + len(parsed["page_references"])
+                seen_urls.add(url)
+            elif item_type == "page":
+                parsed["page_references"].append(entry)
+                unified_id_map[old_id] = len(parsed["references"]) + len(parsed["page_references"])
+                seen_urls.add(url)
+            elif item_type == "image":
+                entry["thumbnail"] = result_item.get("thumbnail", "")
+                if url not in seen_urls:
+                    parsed["image_references"].append(entry)
+                    seen_urls.add(url)
+                # Note: Images cited as [N] might be used in text like ![desc]([N])
+                # We'll handle this in replacement
+
+        # Now handle direct image URLs from ![]() that weren't cited as [N]
+        for url in direct_image_urls:
+            if url in seen_urls: continue
+            # Find in all_web_results
+            result_item = next((r for r in self.all_web_results if (r.get("url") == url or r.get("image") == url) and r.get("_type") == "image"), None)
             if result_item:
                 entry = {
                     "title": result_item.get("title", ""),
-                    "url":
-                    "domain": result_item.get("domain", "")
+                    "url": url,
+                    "domain": result_item.get("domain", ""),
+                    "thumbnail": result_item.get("thumbnail", "")
                 }
-
-
-
-                # Auto-classify by type
-                if item_type == "search":
-                    parsed["references"].append(entry)
-                    old_to_new_map[old_id] = len(parsed["references"])
-                elif item_type == "page":
-                    parsed["page_references"].append(entry)
-                    old_to_new_map[old_id] = len(parsed["page_references"])
-                elif item_type == "image":
-                    # Collect image but don't add to map (will be stripped from text)
-                    entry["thumbnail"] = result_item.get("thumbnail", "")
-                    parsed["image_references"].append(entry)
-                    # Note: no old_to_new_map entry - image citations will be removed
-
-        # 4. Replace [old_id] with [new_id] in text, or remove if image
-        def replace_id(match):
-            try:
-                old_id = int(match.group(1))
-                new_id = old_to_new_map.get(old_id)
-                if new_id is not None:
-                    return f"[{new_id}]"
-                else:
-                    # Check if it's an image reference (not in map)
-                    item = next((r for r in self.all_web_results if r.get("_id") == old_id), None)
-                    if item and item.get("_type") == "image":
-                        return ""  # Remove image citations from text
-            except ValueError:
-                pass
-            return match.group(0)
+                parsed["image_references"].append(entry)
+                seen_urls.add(url)

-
+        # 5. Replacement Logic
+        # Define image replacement map separately to handle ![desc]([N])
+        image_url_map = {}  # old_id -> raw_url
+        for old_id in id_order:
+            item = next((r for r in self.all_web_results if r.get("_id") == old_id), None)
+            if item and item.get("_type") == "image":
+                image_url_map[old_id] = item.get("url", "")
+
+        def refined_replace(text):
+            # First, handle ![desc]([N]) specifically
+            # We want to replace the [N] with the actual URL so the markdown renders
+            def sub_img_ref(match):
+                alt = match.group(1)
+                ref = match.group(2)
+                inner_match = body_pattern.match(ref)
+                if inner_match:
+                    oid = int(inner_match.group(1))
+                    if oid in image_url_map:
+                        return f"![{alt}]({image_url_map[oid]})"
+                return match.group(0)
+
+            text = re.sub(r'!\[(.*?)\]\((.*?)\)', sub_img_ref, text)
+
+            # Then handle normal [N] replacements
+            def sub_norm_ref(match):
+                oid = int(match.group(1))
+                if oid in unified_id_map:
+                    return f"[{unified_id_map[oid]}]"
+                if oid in image_url_map:
+                    return ""  # Remove standalone image citations like [5] if they aren't in ![]()
+                return match.group(0)
+
+            return body_pattern.sub(sub_norm_ref, text)

-
+        final_text = refined_replace(remaining_text)
+        parsed["response"] = final_text.strip()
         return parsed

     async def _safe_route_tool(self, tool_call):
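
Taken together, the new steps renumber text/page citations into one unified sequence, rewrite `![alt]([N])` image citations into real markdown images, and drop stray image citations. A compressed sketch of the same two-pass substitution, with hard-coded stand-ins for `unified_id_map` and `image_url_map`:

```python
import re

body_pattern = re.compile(r'\[(\d+)\]')
unified_id_map = {3: 1, 7: 2}                       # search/page ids -> unified numbering
image_url_map = {5: "https://example.com/fig.png"}  # image ids -> raw URLs

def refined_replace(text):
    # Pass 1: turn ![alt]([5]) into a real markdown image.
    def sub_img_ref(m):
        inner = body_pattern.match(m.group(2))
        if inner and int(inner.group(1)) in image_url_map:
            return f"![{m.group(1)}]({image_url_map[int(inner.group(1))]})"
        return m.group(0)
    text = re.sub(r'!\[(.*?)\]\((.*?)\)', sub_img_ref, text)

    # Pass 2: renumber [3]/[7]; drop bare image citations like [5].
    def sub_norm_ref(m):
        oid = int(m.group(1))
        if oid in unified_id_map:
            return f"[{unified_id_map[oid]}]"
        return "" if oid in image_url_map else m.group(0)
    return body_pattern.sub(sub_norm_ref, text)

print(refined_replace("Fact [3], detail [7], see ![chart]([5]) and stray [5]."))
# -> 'Fact [1], detail [2], see ![chart](https://example.com/fig.png) and stray .'
```
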
@@ -1053,4 +1203,4 @@
         except Exception:
             pass
         # Do NOT close shared crawler here, as pipeline instances are now per-request.
-        # Shared crawler lifecycle is managed
+        # Shared crawler lifecycle is managed globally.
entari_plugin_hyw/{utils/prompts_cn.py → prompts.py}

@@ -34,7 +34,7 @@ INSTRUCT_SP = """# 你是一个专业的指导专家.
 {tools_desc}

 ## 你的回复
-调用工具后无需回复额外文本节省token.
+调用工具后无需回复额外文本节省 token.

 ## 用户消息
 ```
@@ -57,16 +57,19 @@ AGENT_SP = """# 你是一个 Agent 总控专家, 你需要理解用户意图,

 ## 过程要求
 当不调用工具发送文本, 即会变成最终回复, 请遵守:
-- 直接给出一篇报告, 无需回答用户消息
 - 语言: {language}, 百科式风格, 语言严谨不啰嗦.
 - 正文格式:
--
--
--
+- 先给出一个 `# `大标题约 8-10 个字, 不要有多余废话, 不要直接回答用户的提问.
+- 然后紧接着给出一个 <summary>...</summary>, 除了给出一个约 100 字的纯文本简介, 介绍本次输出的长文的清晰、重点概括.
+- 随后开始详细二级标题 + markdown 正文, 语言描绘格式丰富多样, 简洁准确可信.
+- 请不要给出过长的代码、表格列数等, 请控制字数在 600 字内, 只讲重点和准确的数据.
+- 不支持渲染: 链接, 图片链接, mermaid
+- 支持渲染: 公式, 代码高亮, 只在需要的时候给出.
+- 图片链接、链接框架会自动渲染出, 你无需显式给出.
 - 引用:
 > 重要: 所有正文内容必须基于实际信息, 保证百分百真实度
 - 信息来源已按获取顺序编号为 [1], [2], [3]...
-- 正文中直接使用 [1]
+- 正文中直接使用 [1] 格式引用, 只引用对回答有帮助的来源, 一次只能引用一个
 - 无需给出参考文献列表, 系统会自动生成

 ## 用户消息
@@ -96,24 +99,6 @@ AGENT_SP_INSTRUCT_VISION_ADD = """
 """

 AGENT_SP_SEARCH_ADD = """
-##
-```text
+## 联网信息
 {search_msgs}
-```
-"""
-
-AGENT_SP_PAGE_ADD = """
-## 页面内容专家消息
-```text
-{page_msgs}
-```
-- 引用页面内容时, 必须使用 `page:id` 格式
-"""
-
-AGENT_SP_IMAGE_SEARCH_ADD = """
-## 图像搜索专家消息
-```text
-{image_search_msgs}
-```
-- 每进行一次 internal_image_search, 挑选 1 张图像插入正文
 """