entari-plugin-hyw 3.3.4-py3-none-any.whl → 3.3.6-py3-none-any.whl
- entari_plugin_hyw/__init__.py +14 -351
- entari_plugin_hyw/assets/libs/tailwind.css +1 -1
- entari_plugin_hyw/assets/tailwind.input.css +1 -1
- entari_plugin_hyw/assets/template.j2 +113 -20
- entari_plugin_hyw/core/config.py +2 -0
- entari_plugin_hyw/core/pipeline.py +116 -112
- entari_plugin_hyw/core/render.py +39 -42
- entari_plugin_hyw/utils/prompts.py +26 -15
- entari_plugin_hyw/utils/search.py +234 -4
- {entari_plugin_hyw-3.3.4.dist-info → entari_plugin_hyw-3.3.6.dist-info}/METADATA +2 -1
- {entari_plugin_hyw-3.3.4.dist-info → entari_plugin_hyw-3.3.6.dist-info}/RECORD +13 -14
- entari_plugin_hyw/core/render.py.bak +0 -926
- {entari_plugin_hyw-3.3.4.dist-info → entari_plugin_hyw-3.3.6.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-3.3.4.dist-info → entari_plugin_hyw-3.3.6.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/assets/template.j2
CHANGED

@@ -16,6 +16,40 @@
     <script>{{ katex_auto_render_js | safe }}</script>
     <!-- @formatter:on -->

+    <style>
+        /* Fallback style for broken images in markdown content */
+        .img-error-fallback {
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            gap: 8px;
+            width: 100%;
+            aspect-ratio: 16 / 9;
+            margin-bottom: 8px;
+            background: linear-gradient(135deg, #d3e4fd 0%, #b7d3fe 50%, #8bb9fc 100%);
+            border-radius: 12px;
+            color: white;
+            font-size: 14px;
+            font-weight: 500;
+            box-shadow: 0 4px 12px rgba(59, 130, 246, 0.25);
+        }
+        .img-error-fallback i {
+            font-size: 20px;
+        }
+        /* Dynamic image sizing based on aspect ratio */
+        #markdown-content img {
+            border-radius: 8px;
+            margin-bottom: 8px;
+        }
+        #markdown-content img.img-horizontal {
+            width: 100%;
+            height: auto;
+        }
+        #markdown-content img.img-vertical {
+            width: 60%;
+            height: auto;
+        }
+    </style>
 </head>

 <body class="bg-[#f2f2f2] p-0 box-border m-0 font-sans text-gray-800">
@@ -135,7 +169,7 @@
     {{ list_card(stage.icon_html, title_html, subtitle_html=stats_html, is_compact=True, icon_box_class=icon_box_class) }}

     {# Nested Children (Indent & Connect) #}
-    {% if stage.references or stage.flow_steps or stage.crawled_pages %}
+    {% if stage.references or stage.image_references or stage.flow_steps or stage.crawled_pages %}
     <div class="ml-4 pl-4 border-l-2 border-gray-200 mt-2 flex flex-col gap-2">

     {# References #}
@@ -158,18 +192,23 @@
     {% endfor %}
     {% endif %}

-    {#
-    {% if stage.
-
-
-    {% set
+    {# Image References #}
+    {% if stage.image_references %}
+    <div class="text-[12px] uppercase font-bold text-blue-600 tracking-wider mb-1 mt-2">Images</div>
+    {% for img in stage.image_references %}
+    {% set favicon_url = "https://www.google.com/s2/favicons?domain=" + img.domain + "&sz=32" %}

-    {% set
-    {
-    {%
+    {% set img_icon %}
+    <img src="{{ favicon_url }}" class="w-3.5 h-3.5 rounded-sm opacity-80">
+    {% endset %}

-    {
-
+    {% set img_icon_box = "bg-white rounded border border-gray-100 w-6 h-6 shrink-0" %}
+
+    {% set title_html = '<div class="text-[13px] font-medium text-gray-900 truncate">' + img.title + '</div>' %}
+    {% set subtitle_html = '<div class="text-[12px] text-gray-500 truncate">' + img.domain + '</div>' %}
+
+    {{ list_card(img_icon, title_html, subtitle_html=subtitle_html, link_url=img.url, is_compact=True, icon_box_class=img_icon_box) }}
+    {% endfor %}
     {% endif %}

     {# Crawled Pages #}
@@ -272,23 +311,46 @@
     const fragment = document.createDocumentFragment();
     let lastIndex = 0;
     const text = textNode.nodeValue;
-
+    // Regex to capture:
+    // 1. Optional brackets/parens: [(
+    // 2. Type: search/page
+    // 3. IDs: 1 or 1,2,3
+    // 4. Closing: )]
+    const regex = /[\[\(]?(search|page):\s*([\d,\s]+)[\]\)]?/gi;
     let match;

     while ((match = regex.exec(text)) !== null) {
+        // Validate match: simple check to ensure it contains digits
+        if (!/\d/.test(match[2])) continue;
+
         fragment.appendChild(document.createTextNode(text.substring(lastIndex, match.index)));

+        const fullMatch = match[0];
         const type = match[1].toLowerCase();
-        const
+        const idString = match[2];

-
-        const
-        const colorClass = isPage
-            ? "text-orange-600 bg-orange-50 border-orange-200"
-            : "text-blue-600 bg-blue-50 border-blue-200";
+        // Parse IDs (split by comma or space)
+        const ids = idString.split(/[,\s]+/).filter(s => s.trim().length > 0);

-
-
+        // Check for standard format (allow plain or [brackets])
+        // Standard: search:1, [search:1], page:1, [page:1]
+        // Non-standard: (page:1), page:1,2, (page:1,2)
+        const isStandard = /^[\[]?(search|page):\d+[\]]?$/i.test(fullMatch);
+
+        if (!isStandard) {
+            console.warn(`[Template] Detected non-standard citation format: "${fullMatch}". Rendered as: ${type}:${ids.join(',')}`);
+        }
+
+        ids.forEach(id => {
+            const span = document.createElement("span");
+            const isPage = type === "page";
+            const colorClass = isPage
+                ? "text-orange-600 bg-orange-50 border-orange-200"
+                : "text-blue-600 bg-blue-50 border-blue-200";
+
+            span.innerHTML = `<span class="inline-flex items-center justify-center min-w-[14px] h-4 px-0.5 text-[9px] font-bold ${colorClass} border rounded align-top -top-0.5 relative mx-0.5 cursor-default" title="${type}:${id}">${id}</span>`;
+            fragment.appendChild(span.firstElementChild);
+        });

         lastIndex = regex.lastIndex;
     }
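The new citation pattern is permissive on input but normalizes on output, which is easier to see outside the template. A minimal sketch that ports the two regexes above to Python unchanged; the sample text and function name are illustrative only, not part of the package.

```python
import re

# Same patterns as the JS above: a permissive matcher and a strictness check.
CITATION = re.compile(r'[\[\(]?(search|page):\s*([\d,\s]+)[\]\)]?', re.IGNORECASE)
STANDARD = re.compile(r'^[\[]?(search|page):\d+[\]]?$', re.IGNORECASE)

def extract_citations(text: str):
    """Yield (type, ids, is_standard) for each citation tag found in text."""
    for m in CITATION.finditer(text):
        ids = [s for s in re.split(r'[,\s]+', m.group(2)) if s.strip()]
        if not ids:  # mirrors the /\d/ sanity check in the JS
            continue
        yield m.group(1).lower(), ids, bool(STANDARD.match(m.group(0)))

print(list(extract_citations("See [search:1] and (page:2,3).")))
# [('search', ['1'], True), ('page', ['2', '3'], False)]
```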
@@ -301,6 +363,37 @@
     }

     processCitations(contentDiv);
+
+    // Handle broken images in markdown content
+    const contentImages = contentDiv.querySelectorAll('img');
+    contentImages.forEach(img => {
+        // Apply sizing class based on aspect ratio
+        const applySizeClass = function() {
+            if (this.naturalWidth >= this.naturalHeight) {
+                this.classList.add('img-horizontal');
+            } else {
+                this.classList.add('img-vertical');
+            }
+        };
+
+        img.onerror = function() {
+            const fallback = document.createElement('span');
+            fallback.className = 'img-error-fallback';
+            fallback.innerHTML = `<span style="font-size: 18px;">(。•́︿•̀。)</span><span>渲染失败</span>`;
+            this.parentNode.replaceChild(fallback, this);
+        };
+
+        // Check if image already loaded
+        if (img.complete) {
+            if (img.naturalHeight === 0) {
+                img.onerror();
+            } else {
+                applySizeClass.call(img);
+            }
+        } else {
+            img.onload = applySizeClass;
+        }
+    });
 });
 </script>
 </body>
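The same landscape/portrait split the template computes from naturalWidth/naturalHeight can be reproduced offline. A minimal sketch, assuming Pillow is installed; nothing here is part of the package.

```python
from PIL import Image  # assumption: Pillow is available

def size_class(path: str) -> str:
    """Mirror the template's rule: landscape (or square) images get
    img-horizontal, portrait images get img-vertical."""
    with Image.open(path) as im:
        return "img-horizontal" if im.width >= im.height else "img-vertical"
```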
entari_plugin_hyw/core/config.py
CHANGED

@@ -18,6 +18,7 @@ class HYWConfig:
     search_base_url: str = "https://lite.duckduckgo.com/lite/?q={query}"
     image_search_base_url: str = "https://duckduckgo.com/?q={query}&iax=images&ia=images"
     search_params: Optional[str] = None # e.g. "&kl=cn-zh" for China region
+    search_limit: int = 8
     extra_body: Optional[Dict[str, Any]] = None
     temperature: float = 0.4
     max_turns: int = 10

@@ -34,3 +35,4 @@ class HYWConfig:
     vision_output_price: Optional[float] = None
     intruct_input_price: Optional[float] = None
     intruct_output_price: Optional[float] = None
+
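A minimal sketch of the new knob in context. The @dataclass decorator and the trimmed field list are assumptions for illustration; only the fields and defaults shown in the hunk above are taken from the diff.

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class HYWConfig:
    # Fields copied from the hunk above; the rest of the class is omitted.
    search_base_url: str = "https://lite.duckduckgo.com/lite/?q={query}"
    search_params: Optional[str] = None  # e.g. "&kl=cn-zh" for China region
    search_limit: int = 8  # new in 3.3.6; presumably caps results per search

cfg = HYWConfig(search_limit=5)
print(cfg.search_limit)  # 5
```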
entari_plugin_hyw/core/pipeline.py
CHANGED

@@ -39,6 +39,10 @@ class ProcessingPipeline:
         self.client = AsyncOpenAI(base_url=self.config.base_url, api_key=self.config.api_key)
         self.all_web_results = [] # Cache for search results
         self.current_mode = "standard" # standard | agent
+        # Independent ID counters for each type
+        self.search_id_counter = 0
+        self.page_id_counter = 0
+        self.image_id_counter = 0

         self.web_search_tool = {
             "type": "function",
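The point of three counters rather than one shared sequence: IDs are now namespaced by type, so [search:1] and [page:1] can refer to different cached items without colliding. A minimal standalone sketch (names illustrative):

```python
from itertools import count

# One independent counter per result type, as in __init__ above.
counters = {"search": count(1), "page": count(1), "image": count(1)}

def cache(item: dict, kind: str) -> dict:
    item["_id"] = next(counters[kind])
    item["_type"] = kind
    return item

print(cache({"title": "a"}, "search"))  # {'title': 'a', '_id': 1, '_type': 'search'}
print(cache({"title": "b"}, "page"))    # {'title': 'b', '_id': 1, '_type': 'page'}
```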
@@ -118,8 +122,11 @@ class ProcessingPipeline:
         final_response_content = ""
         structured: Dict[str, Any] = {}

-        # Reset search cache for this execution
+        # Reset search cache and ID counters for this execution
         self.all_web_results = []
+        self.search_id_counter = 0
+        self.page_id_counter = 0
+        self.image_id_counter = 0

         try:
             logger.info(f"Pipeline: Starting workflow for '{user_input}' using {active_model}")
@@ -244,8 +251,8 @@
         search_msgs_text = self._format_search_msgs()
         image_msgs_text = self._format_image_search_msgs()

-        has_search_results = any(
-        has_image_results = any(r.get("
+        has_search_results = any(r.get("_type") == "search" for r in self.all_web_results)
+        has_image_results = any(r.get("_type") == "image" for r in self.all_web_results)

         # Build agent system prompt
         agent_prompt_tpl = getattr(self.config, "agent_system_prompt", None) or AGENT_SP
@@ -462,7 +469,7 @@
         for tc in crawl_calls:
             url = tc.get("arguments", {}).get("url", "")
             # Try to find cached result
-            found = next((r for r in self.all_web_results if r.get("url") == url and r.get("
+            found = next((r for r in self.all_web_results if r.get("url") == url and r.get("_type") == "page"), None)
             if found:
                 try:
                     from urllib.parse import urlparse
@@ -588,6 +595,19 @@
             last_agent["time"] = a.get("time", 0)
             last_agent["cost"] = a.get("cost", 0.0)

+        # Clean up conversation history: Remove tool calls and results to save tokens and avoid ID conflicts
+        # Keep only 'user' messages and 'assistant' messages without tool_calls (final answers)
+        cleaned_history = []
+        for msg in current_history:
+            if msg.get("role") == "tool":
+                continue
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                continue
+            cleaned_history.append(msg)
+
+        # Update the reference (since it might be used by caller)
+        current_history[:] = cleaned_history
+
         return {
             "llm_response": final_content,
             "structured_response": structured,
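The `current_history[:] = cleaned_history` line matters: slice assignment mutates the caller's list object in place instead of rebinding a local name. A minimal sketch of the same filtering, with hypothetical messages:

```python
def clean(history: list) -> None:
    # Drop tool results and assistant tool-call turns, in place.
    history[:] = [
        m for m in history
        if m.get("role") != "tool"
        and not (m.get("role") == "assistant" and m.get("tool_calls"))
    ]

msgs = [
    {"role": "user", "content": "hi"},
    {"role": "assistant", "tool_calls": [{"id": "1"}]},
    {"role": "tool", "content": "result"},
    {"role": "assistant", "content": "answer"},
]
clean(msgs)
print([m["role"] for m in msgs])  # ['user', 'assistant'] — same list object, filtered
```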
@@ -609,8 +629,8 @@
         }

     def _parse_tagged_response(self, text: str) -> Dict[str, Any]:
-        """Parse response for references and page references."""
-        parsed = {"response": "", "references": [], "page_references": [], "flow_steps": []}
+        """Parse response for references and page references reordered by appearance."""
+        parsed = {"response": "", "references": [], "page_references": [], "image_references": [], "flow_steps": []}
         if not text:
             return parsed
@@ -620,7 +640,6 @@

         # 1. Try to unwrap JSON if the model acted like a ReAct agent
         try:
-            # Check if it looks like JSON first to avoid performance hit
             if remaining_text.strip().startswith("{") and "action" in remaining_text:
                 data = json.loads(remaining_text)
                 if isinstance(data, dict) and "action_input" in data:
@@ -628,86 +647,80 @@
         except Exception:
             pass

-
-        page_id_map = {} # Map original page ID (str) -> new index (int)
-
-        # Parse References Block (unified: contains both [search] and [page] entries)
+        # 2. Remove the original references block if present (we will rebuild it)
         ref_block_match = re.search(r'```references\s*(.*?)\s*```', remaining_text, re.DOTALL | re.IGNORECASE)
         if ref_block_match:
-            ref_content = ref_block_match.group(1).strip()
-            for line in ref_content.split("\n"):
-                line = line.strip()
-                if not line: continue
-
-                # Match [id] [type] [title](url)
-                # e.g. [1] [search] [文本描述](url) or [5] [page] [页面标题](url)
-                id_match = re.match(r"^\[(\d+)\]", line)
-                type_match = re.search(r"\[(search|page)\]", line, re.IGNORECASE)
-                link_match = re.search(r"\[([^\[\]]+)\]\(([^)]+)\)", line)
-
-                idx = None
-                if id_match:
-                    try:
-                        idx = int(id_match.group(1))
-                    except ValueError:
-                        pass
-
-                ref_type = "search" # default
-                if type_match:
-                    ref_type = type_match.group(1).lower()
-
-                entry = None
-                if idx is not None and self.all_web_results:
-                    # For page type, only match crawled items
-                    if ref_type == "page":
-                        found = next((r for r in self.all_web_results if r.get("_id") == idx and r.get("is_crawled")), None)
-                    else:
-                        found = next((r for r in self.all_web_results if r.get("_id") == idx and not r.get("is_crawled")), None)
-
-                    if found:
-                        entry = {
-                            "title": found.get("title"),
-                            "url": found.get("url"),
-                            "domain": found.get("domain", "")
-                        }
-
-                if not entry and link_match:
-                    entry = {"title": link_match.group(1), "url": link_match.group(2)}
-
-                if entry:
-                    if ref_type == "page":
-                        parsed["page_references"].append(entry)
-                        if idx is not None:
-                            page_id_map[str(idx)] = len(parsed["page_references"])
-                    else:
-                        parsed["references"].append(entry)
-                        if idx is not None:
-                            id_map[str(idx)] = len(parsed["references"])
-
             remaining_text = remaining_text.replace(ref_block_match.group(0), "").strip()

-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # 3. Scan text for [type:id] tags and rebuild references in order of appearance
+        # Pattern matches [search:123], [page:123], [image:123]
+        pattern = re.compile(r'\[(search|page|image):(\d+)\]', re.IGNORECASE)
+
+        matches = list(pattern.finditer(remaining_text))
+
+        search_map = {} # old_id_str -> new_id (int)
+        page_map = {}
+        image_map = {}
+
+        for m in matches:
+            tag_type = m.group(1).lower()
+            old_id_str = m.group(2)
+            try:
+                old_id = int(old_id_str)
+            except ValueError:
+                continue
+
+            # Check if we already processed this ID for this type
+            if tag_type == "search" and old_id_str in search_map: continue
+            if tag_type == "page" and old_id_str in page_map: continue
+            if tag_type == "image" and old_id_str in image_map: continue
+
+            # Find in all_web_results
+            result_item = next((r for r in self.all_web_results if r.get("_id") == old_id and r.get("_type") == tag_type), None)
+
+            if not result_item:
+                continue
+
+            entry = {
+                "title": result_item.get("title", ""),
+                "url": result_item.get("url", ""),
+                "domain": result_item.get("domain", "")
+            }
+            if tag_type == "image":
+                entry["thumbnail"] = result_item.get("thumbnail", "")
+
+            # Add to respective list and map
+            if tag_type == "search":
+                parsed["references"].append(entry)
+                search_map[old_id_str] = len(parsed["references"])
+            elif tag_type == "page":
+                parsed["page_references"].append(entry)
+                page_map[old_id_str] = len(parsed["page_references"])
+            elif tag_type == "image":
+                parsed["image_references"].append(entry)
+                image_map[old_id_str] = len(parsed["image_references"])
+
+        # 4. Replace tags in text with new sequential IDs
+        def replace_tag(match):
+            tag_type = match.group(1).lower()
+            old_id = match.group(2)
+
+            new_id = None
+            if tag_type == "search":
+                new_id = search_map.get(old_id)
+            elif tag_type == "page":
+                new_id = page_map.get(old_id)
+            elif tag_type == "image":
+                new_id = image_map.get(old_id)
+
+            if new_id is not None:
+                if tag_type == "image":
+                    return ""
+                return f"[{tag_type}:{new_id}]"
+
+            return match.group(0)
+
+        remaining_text = pattern.sub(replace_tag, remaining_text)

         parsed["response"] = remaining_text.strip()
         return parsed
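Steps 3 and 4 are the heart of the rewrite: citations are renumbered by first appearance, per type, and the text is rewritten to match. A minimal self-contained sketch of that pass; the cached results below are hypothetical and the image-stripping branch is omitted for brevity:

```python
import re

results = [  # hypothetical stand-ins for self.all_web_results
    {"_id": 7, "_type": "search", "title": "A", "url": "https://a.example"},
    {"_id": 2, "_type": "page", "title": "B", "url": "https://b.example"},
]
pattern = re.compile(r'\[(search|page|image):(\d+)\]', re.IGNORECASE)

def renumber(text: str):
    maps = {"search": {}, "page": {}, "image": {}}
    refs = {"search": [], "page": [], "image": []}
    for m in pattern.finditer(text):
        kind, old = m.group(1).lower(), m.group(2)
        if old in maps[kind]:
            continue  # first appearance wins
        item = next((r for r in results
                     if r["_id"] == int(old) and r["_type"] == kind), None)
        if item:
            refs[kind].append(item)
            maps[kind][old] = len(refs[kind])  # new sequential ID

    def sub(m):
        kind, old = m.group(1).lower(), m.group(2)
        new = maps[kind].get(old)
        return f"[{kind}:{new}]" if new is not None else m.group(0)

    return pattern.sub(sub, text), refs

text, refs = renumber("First [search:7], then [page:2], then [search:7] again.")
print(text)  # First [search:1], then [page:1], then [search:1] again.
```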
@@ -730,12 +743,11 @@ class ProcessingPipeline:
         query = args.get("query")
         web = await self.search_service.search(query)

-        # Cache results and assign IDs
-        current_max_id = max([item.get("_id", 0) for item in self.all_web_results], default=0)
-
+        # Cache results and assign search-specific IDs
         for item in web:
-
-            item["_id"] =
+            self.search_id_counter += 1
+            item["_id"] = self.search_id_counter
+            item["_type"] = "search"
             item["query"] = query
             self.all_web_results.append(item)
@@ -745,10 +757,11 @@ class ProcessingPipeline:
         query = args.get("query")
         images = await self.search_service.image_search(query)

-
+        # Cache results and assign image-specific IDs
         for item in images:
-
-            item["_id"] =
+            self.image_id_counter += 1
+            item["_id"] = self.image_id_counter
+            item["_type"] = "image"
             item["query"] = query
             item["is_image"] = True
             self.all_web_results.append(item)
@@ -761,15 +774,15 @@
         # Returns Dict: {content, title, url}
         result_dict = await self.search_service.fetch_page(url)

-        # Cache the crawled content
-
-        current_max_id += 1
+        # Cache the crawled content with page-specific ID
+        self.page_id_counter += 1

         cached_item = {
-            "_id":
+            "_id": self.page_id_counter,
+            "_type": "page",
             "title": result_dict.get("title", "Page"),
             "url": result_dict.get("url", url),
-            "content": result_dict.get("content", "")
+            "content": result_dict.get("content", ""),
             "domain": "",
             "is_crawled": True,
         }
@@ -940,18 +953,13 @@
         if not self.all_web_results:
             return ""

-        def clip(s: str, n: int) -> str:
-            s = (s or "").strip()
-            return s if len(s) <= n else s[: n - 1] + "…"
-
         lines = []
         for res in self.all_web_results:
-            if res.get("
-            if res.get("is_crawled"): continue # Skip crawled pages (handled separately)
+            if res.get("_type") != "search": continue # Only search results
             idx = res.get("_id")
-            title =
+            title = (res.get("title", "") or "").strip()
             url = res.get("url", "")
-            content =
+            content = (res.get("content", "") or "").strip()
             lines.append(f"[{idx}] Title: {title}\nURL: {url}\nSnippet: {content}\n")

         return "\n".join(lines)
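All three `_format_*` helpers now emit the same shape of numbered block for the agent prompt, differing only in the `_type` filter and the Snippet/Content label. A minimal sketch with one hypothetical cached result:

```python
results = [{"_id": 1, "_type": "search", "title": " Doc ",
            "url": "https://d.example", "content": " snippet "}]

lines = []
for res in results:
    if res.get("_type") != "search":
        continue  # only search results, as in _format_search_msgs
    idx = res.get("_id")
    title = (res.get("title", "") or "").strip()
    url = res.get("url", "")
    content = (res.get("content", "") or "").strip()
    lines.append(f"[{idx}] Title: {title}\nURL: {url}\nSnippet: {content}\n")

print("\n".join(lines))
# [1] Title: Doc
# URL: https://d.example
# Snippet: snippet
```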
@@ -961,17 +969,13 @@
         if not self.all_web_results:
             return ""

-        def clip(s: str, n: int) -> str:
-            s = (s or "").strip()
-            return s if len(s) <= n else s[: n - 1] + "…"
-
         lines = []
         for res in self.all_web_results:
-            if
+            if res.get("_type") != "page": continue # Only page results
             idx = res.get("_id")
-            title =
+            title = (res.get("title", "") or "").strip()
             url = res.get("url", "")
-            content =
+            content = (res.get("content", "") or "").strip()
             lines.append(f"[{idx}] Title: {title}\nURL: {url}\nContent: {content}\n")

         return "\n".join(lines)
@@ -982,7 +986,7 @@

         lines = []
         for res in self.all_web_results:
-            if
+            if res.get("_type") != "image": continue # Only image results
             idx = res.get("_id")
             title = res.get("title", "")
             url = res.get("image", "") or res.get("url", "")