entari-plugin-hyw 3.3.5__py3-none-any.whl → 3.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/__init__.py +14 -351
- entari_plugin_hyw/assets/libs/tailwind.css +1 -1
- entari_plugin_hyw/assets/tailwind.input.css +1 -1
- entari_plugin_hyw/assets/template.j2 +113 -20
- entari_plugin_hyw/core/config.py +1 -0
- entari_plugin_hyw/core/pipeline.py +131 -103
- entari_plugin_hyw/core/render.py +65 -41
- entari_plugin_hyw/utils/prompts.py +26 -16
- entari_plugin_hyw/utils/search.py +233 -3
- entari_plugin_hyw-3.3.7.dist-info/METADATA +142 -0
- {entari_plugin_hyw-3.3.5.dist-info → entari_plugin_hyw-3.3.7.dist-info}/RECORD +13 -14
- entari_plugin_hyw/core/render.py.bak +0 -926
- entari_plugin_hyw-3.3.5.dist-info/METADATA +0 -142
- {entari_plugin_hyw-3.3.5.dist-info → entari_plugin_hyw-3.3.7.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-3.3.5.dist-info → entari_plugin_hyw-3.3.7.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/assets/template.j2
CHANGED

@@ -16,6 +16,40 @@
     <script>{{ katex_auto_render_js | safe }}</script>
     <!-- @formatter:on -->
 
+    <style>
+        /* Fallback style for broken images in markdown content */
+        .img-error-fallback {
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            gap: 8px;
+            width: 100%;
+            aspect-ratio: 16 / 9;
+            margin-bottom: 8px;
+            background: linear-gradient(135deg, #d3e4fd 0%, #b7d3fe 50%, #8bb9fc 100%);
+            border-radius: 12px;
+            color: white;
+            font-size: 14px;
+            font-weight: 500;
+            box-shadow: 0 4px 12px rgba(59, 130, 246, 0.25);
+        }
+        .img-error-fallback i {
+            font-size: 20px;
+        }
+        /* Dynamic image sizing based on aspect ratio */
+        #markdown-content img {
+            border-radius: 8px;
+            margin-bottom: 8px;
+        }
+        #markdown-content img.img-horizontal {
+            width: 100%;
+            height: auto;
+        }
+        #markdown-content img.img-vertical {
+            width: 60%;
+            height: auto;
+        }
+    </style>
 </head>
 
 <body class="bg-[#f2f2f2] p-0 box-border m-0 font-sans text-gray-800">
@@ -135,7 +169,7 @@
     {{ list_card(stage.icon_html, title_html, subtitle_html=stats_html, is_compact=True, icon_box_class=icon_box_class) }}
 
     {# Nested Children (Indent & Connect) #}
-    {% if stage.references or stage.flow_steps or stage.crawled_pages %}
+    {% if stage.references or stage.image_references or stage.flow_steps or stage.crawled_pages %}
     <div class="ml-4 pl-4 border-l-2 border-gray-200 mt-2 flex flex-col gap-2">
 
         {# References #}
@@ -158,18 +192,23 @@
         {% endfor %}
         {% endif %}
 
-        {#
-        {% if stage.
-
-
-        {% set
+        {# Image References #}
+        {% if stage.image_references %}
+        <div class="text-[12px] uppercase font-bold text-blue-600 tracking-wider mb-1 mt-2">Images</div>
+        {% for img in stage.image_references %}
+        {% set favicon_url = "https://www.google.com/s2/favicons?domain=" + img.domain + "&sz=32" %}
 
-        {% set
-        {
-        {%
+        {% set img_icon %}
+        <img src="{{ favicon_url }}" class="w-3.5 h-3.5 rounded-sm opacity-80">
+        {% endset %}
 
-        {
-
+        {% set img_icon_box = "bg-white rounded border border-gray-100 w-6 h-6 shrink-0" %}
+
+        {% set title_html = '<div class="text-[13px] font-medium text-gray-900 truncate">' + img.title + '</div>' %}
+        {% set subtitle_html = '<div class="text-[12px] text-gray-500 truncate">' + img.domain + '</div>' %}
+
+        {{ list_card(img_icon, title_html, subtitle_html=subtitle_html, link_url=img.url, is_compact=True, icon_box_class=img_icon_box) }}
+        {% endfor %}
         {% endif %}
 
         {# Crawled Pages #}
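For reference, each entry the new Images block consumes is a small dict; the keys (title, url, domain, plus thumbnail for image results) match the entry dict that _parse_tagged_response assembles in the pipeline diff further down. A sketch with illustrative values only:

# Illustrative values only; keys mirror the entry dict built by
# _parse_tagged_response for stage.image_references.
image_reference = {
    "title": "Example diagram",                # card title text
    "url": "https://example.com/post",         # link target of the card
    "domain": "example.com",                   # feeds the favicon lookup
    "thumbnail": "https://example.com/t.png",  # only set for image entries
}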
@@ -272,23 +311,46 @@
                 const fragment = document.createDocumentFragment();
                 let lastIndex = 0;
                 const text = textNode.nodeValue;
-
+                // Regex to capture:
+                // 1. Optional brackets/parens: [(
+                // 2. Type: search/page
+                // 3. IDs: 1 or 1,2,3
+                // 4. Closing: )]
+                const regex = /[\[\(]?(search|page):\s*([\d,\s]+)[\]\)]?/gi;
                 let match;
 
                 while ((match = regex.exec(text)) !== null) {
+                    // Validate match: simple check to ensure it contains digits
+                    if (!/\d/.test(match[2])) continue;
+
                     fragment.appendChild(document.createTextNode(text.substring(lastIndex, match.index)));
 
+                    const fullMatch = match[0];
                     const type = match[1].toLowerCase();
-                    const
+                    const idString = match[2];
 
-
-                    const
-                    const colorClass = isPage
-                        ? "text-orange-600 bg-orange-50 border-orange-200"
-                        : "text-blue-600 bg-blue-50 border-blue-200";
+                    // Parse IDs (split by comma or space)
+                    const ids = idString.split(/[,\s]+/).filter(s => s.trim().length > 0);
 
-
-
+                    // Check for standard format (allow plain or [brackets])
+                    // Standard: search:1, [search:1], page:1, [page:1]
+                    // Non-standard: (page:1), page:1,2, (page:1,2)
+                    const isStandard = /^[\[]?(search|page):\d+[\]]?$/i.test(fullMatch);
+
+                    if (!isStandard) {
+                        console.warn(`[Template] Detected non-standard citation format: "${fullMatch}". Rendered as: ${type}:${ids.join(',')}`);
+                    }
+
+                    ids.forEach(id => {
+                        const span = document.createElement("span");
+                        const isPage = type === "page";
+                        const colorClass = isPage
+                            ? "text-orange-600 bg-orange-50 border-orange-200"
+                            : "text-blue-600 bg-blue-50 border-blue-200";
+
+                        span.innerHTML = `<span class="inline-flex items-center justify-center min-w-[14px] h-4 px-0.5 text-[9px] font-bold ${colorClass} border rounded align-top -top-0.5 relative mx-0.5 cursor-default" title="${type}:${id}">${id}</span>`;
+                        fragment.appendChild(span.firstElementChild);
+                    });
 
                     lastIndex = regex.lastIndex;
                 }
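The new regex is deliberately tolerant: it accepts an optional bracket or parenthesis, a search/page type, and one or more comma- or space-separated IDs, and a second strict pattern then decides whether to log a warning. A rough Python rehearsal of the same two patterns, for illustration only (the shipped logic runs in the template's JavaScript):

import re

# Tolerant matcher, mirroring the template's JS regex: optional bracket or
# paren, a type, then one or more comma/space-separated IDs.
CITATION = re.compile(r'[\[\(]?(search|page):\s*([\d,\s]+)[\]\)]?', re.IGNORECASE)
# Strict form: a single ID, optionally in square brackets.
STANDARD = re.compile(r'^\[?(search|page):\d+\]?$', re.IGNORECASE)

for sample in ["[search:1]", "(page:1,2)", "page: 3 4"]:
    m = CITATION.search(sample)
    ids = [s for s in re.split(r'[,\s]+', m.group(2)) if s]
    print(sample, "->", m.group(1).lower(), ids,
          "standard" if STANDARD.match(m.group(0)) else "non-standard")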
@@ -301,6 +363,37 @@
             }
 
             processCitations(contentDiv);
+
+            // Handle broken images in markdown content
+            const contentImages = contentDiv.querySelectorAll('img');
+            contentImages.forEach(img => {
+                // Apply sizing class based on aspect ratio
+                const applySizeClass = function() {
+                    if (this.naturalWidth >= this.naturalHeight) {
+                        this.classList.add('img-horizontal');
+                    } else {
+                        this.classList.add('img-vertical');
+                    }
+                };
+
+                img.onerror = function() {
+                    const fallback = document.createElement('span');
+                    fallback.className = 'img-error-fallback';
+                    fallback.innerHTML = `<span style="font-size: 18px;">(。•́︿•̀。)</span><span>渲染失败</span>`;
+                    this.parentNode.replaceChild(fallback, this);
+                };
+
+                // Check if image already loaded
+                if (img.complete) {
+                    if (img.naturalHeight === 0) {
+                        img.onerror();
+                    } else {
+                        applySizeClass.call(img);
+                    }
+                } else {
+                    img.onload = applySizeClass;
+                }
+            });
         });
     </script>
 </body>
entari_plugin_hyw/core/pipeline.py
CHANGED

@@ -39,6 +39,10 @@ class ProcessingPipeline:
         self.client = AsyncOpenAI(base_url=self.config.base_url, api_key=self.config.api_key)
         self.all_web_results = []  # Cache for search results
         self.current_mode = "standard"  # standard | agent
+        # Independent ID counters for each type
+        self.search_id_counter = 0
+        self.page_id_counter = 0
+        self.image_id_counter = 0
 
         self.web_search_tool = {
             "type": "function",
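Giving each result type its own counter means search, page, and image results are numbered independently, so search:1 and page:1 can coexist; the old scheme derived a single running maximum across the shared cache. A toy sketch of the numbering (names mirror the pipeline fields, values are illustrative):

# Toy illustration of the per-type ID scheme.
counters = {"search": 0, "page": 0, "image": 0}
all_web_results = []

def cache(kind, **fields):
    counters[kind] += 1
    all_web_results.append({"_id": counters[kind], "_type": kind, **fields})

cache("search", title="first hit")
cache("search", title="second hit")
cache("page", url="https://example.com")  # page IDs restart at 1

print([(r["_type"], r["_id"]) for r in all_web_results])
# [('search', 1), ('search', 2), ('page', 1)]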
@@ -118,8 +122,11 @@ class ProcessingPipeline:
         final_response_content = ""
         structured: Dict[str, Any] = {}
 
-        # Reset search cache for this execution
+        # Reset search cache and ID counters for this execution
         self.all_web_results = []
+        self.search_id_counter = 0
+        self.page_id_counter = 0
+        self.image_id_counter = 0
 
         try:
             logger.info(f"Pipeline: Starting workflow for '{user_input}' using {active_model}")
@@ -244,8 +251,8 @@ class ProcessingPipeline:
         search_msgs_text = self._format_search_msgs()
         image_msgs_text = self._format_image_search_msgs()
 
-        has_search_results = any(
-        has_image_results = any(r.get("
+        has_search_results = any(r.get("_type") == "search" for r in self.all_web_results)
+        has_image_results = any(r.get("_type") == "image" for r in self.all_web_results)
 
         # Build agent system prompt
         agent_prompt_tpl = getattr(self.config, "agent_system_prompt", None) or AGENT_SP
@@ -462,7 +469,7 @@ class ProcessingPipeline:
         for tc in crawl_calls:
             url = tc.get("arguments", {}).get("url", "")
             # Try to find cached result
-            found = next((r for r in self.all_web_results if r.get("url") == url and r.get("
+            found = next((r for r in self.all_web_results if r.get("url") == url and r.get("_type") == "page"), None)
             if found:
                 try:
                     from urllib.parse import urlparse
@@ -588,6 +595,19 @@ class ProcessingPipeline:
             last_agent["time"] = a.get("time", 0)
             last_agent["cost"] = a.get("cost", 0.0)
 
+        # Clean up conversation history: Remove tool calls and results to save tokens and avoid ID conflicts
+        # Keep only 'user' messages and 'assistant' messages without tool_calls (final answers)
+        cleaned_history = []
+        for msg in current_history:
+            if msg.get("role") == "tool":
+                continue
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                continue
+            cleaned_history.append(msg)
+
+        # Update the reference (since it might be used by caller)
+        current_history[:] = cleaned_history
+
         return {
             "llm_response": final_content,
             "structured_response": structured,
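The slice assignment is the load-bearing detail: current_history[:] = cleaned_history mutates the existing list object, so any caller holding a reference sees the pruned history, while plain rebinding would not. A small stand-alone demonstration with toy messages:

# Toy messages: tool results and tool-call turns are dropped, as in the diff.
history = [
    {"role": "user", "content": "hi"},
    {"role": "assistant", "tool_calls": [{"id": "t1"}]},
    {"role": "tool", "content": "result"},
    {"role": "assistant", "content": "final answer"},
]
caller_view = history  # caller keeps a reference to the same list

cleaned = [m for m in history
           if m.get("role") != "tool"
           and not (m.get("role") == "assistant" and m.get("tool_calls"))]
history[:] = cleaned   # in place: caller_view is pruned too

print(len(caller_view))  # 2 -> only the user turn and the final answer remain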
@@ -609,8 +629,8 @@ class ProcessingPipeline:
         }
 
     def _parse_tagged_response(self, text: str) -> Dict[str, Any]:
-        """Parse response for references and page references."""
-        parsed = {"response": "", "references": [], "page_references": [], "flow_steps": []}
+        """Parse response for references and page references reordered by appearance."""
+        parsed = {"response": "", "references": [], "page_references": [], "image_references": [], "flow_steps": []}
         if not text:
             return parsed
 
@@ -620,7 +640,6 @@ class ProcessingPipeline:
 
         # 1. Try to unwrap JSON if the model acted like a ReAct agent
         try:
-            # Check if it looks like JSON first to avoid performance hit
             if remaining_text.strip().startswith("{") and "action" in remaining_text:
                 data = json.loads(remaining_text)
                 if isinstance(data, dict) and "action_input" in data:
@@ -628,86 +647,104 @@ class ProcessingPipeline:
         except Exception:
             pass
 
-
-
+        # 2. Extract references from text first (Order by appearance)
+        # Pattern matches [search:123], [page:123], [image:123]
+        pattern = re.compile(r'\[(search|page|image):(\d+)\]', re.IGNORECASE)
+
+        matches = list(pattern.finditer(remaining_text))
+
+        search_map = {}  # old_id_str -> new_id (int)
+        page_map = {}
+        image_map = {}
+
+        def process_ref(tag_type, old_id):
+            # Find in all_web_results
+            result_item = next((r for r in self.all_web_results if r.get("_id") == old_id and r.get("_type") == tag_type), None)
+
+            if not result_item:
+                return
+
+            entry = {
+                "title": result_item.get("title", ""),
+                "url": result_item.get("url", ""),
+                "domain": result_item.get("domain", "")
+            }
+            if tag_type == "image":
+                entry["thumbnail"] = result_item.get("thumbnail", "")
+
+            # Add to respective list and map
+            # Check maps to avoid duplicates
+            if tag_type == "search":
+                if str(old_id) not in search_map:
+                    parsed["references"].append(entry)
+                    search_map[str(old_id)] = len(parsed["references"])
+            elif tag_type == "page":
+                if str(old_id) not in page_map:
+                    parsed["page_references"].append(entry)
+                    page_map[str(old_id)] = len(parsed["page_references"])
+            elif tag_type == "image":
+                if str(old_id) not in image_map:
+                    parsed["image_references"].append(entry)
+                    image_map[str(old_id)] = len(parsed["image_references"])
+
+        # Pass 1: Text Body
+        for m in matches:
+            try:
+                process_ref(m.group(1).lower(), int(m.group(2)))
+            except ValueError:
+                continue
 
-        #
+        # 3. Pass 2: References Block (Capture items missed in text)
         ref_block_match = re.search(r'```references\s*(.*?)\s*```', remaining_text, re.DOTALL | re.IGNORECASE)
         if ref_block_match:
             ref_content = ref_block_match.group(1).strip()
+            remaining_text = remaining_text.replace(ref_block_match.group(0), "").strip()
+
             for line in ref_content.split("\n"):
                 line = line.strip()
                 if not line: continue
+                # Match [id] [type]
+                # e.g. [1] [image] ... or [image:1] ...
 
-                #
-
-                id_match = re.match(r"^\[(\d+)\]", line)
-                type_match = re.search(r"\[(search|page)\]", line, re.IGNORECASE)
-                link_match = re.search(r"\[([^\[\]]+)\]\(([^)]+)\)", line)
-
-                idx = None
+                # Check for [id] [type] format
+                id_match = re.match(r"^\[(\d+)\]\s*\[(search|page|image)\]", line, re.IGNORECASE)
                 if id_match:
                     try:
-
+                        process_ref(id_match.group(2).lower(), int(id_match.group(1)))
                     except ValueError:
                         pass
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                if idx is not None:
-                    id_map[str(idx)] = len(parsed["references"])
-
-            remaining_text = remaining_text.replace(ref_block_match.group(0), "").strip()
+                else:
+                    # Check for [type:id] format in list
+                    alt_match = re.match(r"^\[(search|page|image):(\d+)\]", line, re.IGNORECASE)
+                    if alt_match:
+                        try:
+                            process_ref(alt_match.group(1).lower(), int(alt_match.group(2)))
+                        except ValueError:
+                            pass
+
+        # 4. Replace tags in text with new sequential IDs
+
+        # 4. Replace tags in text with new sequential IDs
+        def replace_tag(match):
+            tag_type = match.group(1).lower()
+            old_id = match.group(2)
+
+            new_id = None
+            if tag_type == "search":
+                new_id = search_map.get(old_id)
+            elif tag_type == "page":
+                new_id = page_map.get(old_id)
+            elif tag_type == "image":
+                new_id = image_map.get(old_id)
+
+            if new_id is not None:
+                if tag_type == "image":
+                    return ""
+                return f"[{tag_type}:{new_id}]"
+
+            return match.group(0)
 
-
-        if id_map:
-            def replace_search_citation(match):
-                old_id = match.group(1) or match.group(2)
-                if old_id in id_map:
-                    return f"`search:{id_map[old_id]}`"
-                return match.group(0)
-
-            remaining_text = re.sub(r'\[(\d+)\]', replace_search_citation, remaining_text)
-            remaining_text = re.sub(r'(?<!`)search:(\d+)(?!`)', replace_search_citation, remaining_text)
-            remaining_text = re.sub(r'`search:(\d+)`', replace_search_citation, remaining_text)
-
-            # Replace page:id citations
-            if page_id_map:
-                def replace_page_citation(match):
-                    old_id = match.group(1)
-                    if old_id in page_id_map:
-                        return f"`page:{page_id_map[old_id]}`"
-                    return match.group(0)
-
-                remaining_text = re.sub(r'(?<!`)page:(\d+)(?!`)', replace_page_citation, remaining_text)
-                remaining_text = re.sub(r'`page:(\d+)`', replace_page_citation, remaining_text)
+        remaining_text = pattern.sub(replace_tag, remaining_text)
 
         parsed["response"] = remaining_text.strip()
         return parsed
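Net effect of the rewrite: pass 1 collects [type:id] tags in order of appearance, pass 2 sweeps a trailing ```references fenced block for anything the body missed, and a final substitution renumbers tags to sequential per-type IDs (inline image tags are blanked, since images render as cards instead). A condensed sketch of just the renumbering idea, not the full method:

import re

# Condensed illustration: renumber [search:N]/[page:N] tags per type,
# in order of first appearance, as the reworked parser does.
pattern = re.compile(r'\[(search|page|image):(\d+)\]', re.IGNORECASE)
maps = {"search": {}, "page": {}, "image": {}}

text = "Intro [search:7] then [page:12] and again [search:7]."

for m in pattern.finditer(text):
    ids = maps[m.group(1).lower()]
    ids.setdefault(m.group(2), len(ids) + 1)  # first appearance wins

def renumber(m):
    kind = m.group(1).lower()
    new_id = maps[kind].get(m.group(2))
    if new_id is None:
        return m.group(0)          # unknown reference: leave untouched
    return "" if kind == "image" else f"[{kind}:{new_id}]"

print(pattern.sub(renumber, text))
# Intro [search:1] then [page:1] and again [search:1].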
@@ -730,12 +767,11 @@ class ProcessingPipeline:
             query = args.get("query")
             web = await self.search_service.search(query)
 
-            # Cache results and assign IDs
-            current_max_id = max([item.get("_id", 0) for item in self.all_web_results], default=0)
-
+            # Cache results and assign search-specific IDs
             for item in web:
-
-                item["_id"] =
+                self.search_id_counter += 1
+                item["_id"] = self.search_id_counter
+                item["_type"] = "search"
                 item["query"] = query
                 self.all_web_results.append(item)
 
@@ -745,10 +781,11 @@ class ProcessingPipeline:
             query = args.get("query")
             images = await self.search_service.image_search(query)
 
-
+            # Cache results and assign image-specific IDs
             for item in images:
-
-                item["_id"] =
+                self.image_id_counter += 1
+                item["_id"] = self.image_id_counter
+                item["_type"] = "image"
                 item["query"] = query
                 item["is_image"] = True
                 self.all_web_results.append(item)
@@ -761,15 +798,15 @@ class ProcessingPipeline:
             # Returns Dict: {content, title, url}
             result_dict = await self.search_service.fetch_page(url)
 
-            # Cache the crawled content
-
-            current_max_id += 1
+            # Cache the crawled content with page-specific ID
+            self.page_id_counter += 1
 
             cached_item = {
-                "_id":
+                "_id": self.page_id_counter,
+                "_type": "page",
                 "title": result_dict.get("title", "Page"),
                 "url": result_dict.get("url", url),
-                "content": result_dict.get("content", "")
+                "content": result_dict.get("content", ""),
                 "domain": "",
                 "is_crawled": True,
             }
@@ -940,18 +977,13 @@ class ProcessingPipeline:
         if not self.all_web_results:
             return ""
 
-        def clip(s: str, n: int) -> str:
-            s = (s or "").strip()
-            return s if len(s) <= n else s[: n - 1] + "…"
-
         lines = []
         for res in self.all_web_results:
-            if res.get("
-            if res.get("is_crawled"): continue  # Skip crawled pages (handled separately)
+            if res.get("_type") != "search": continue  # Only search results
             idx = res.get("_id")
-            title =
+            title = (res.get("title", "") or "").strip()
             url = res.get("url", "")
-            content =
+            content = (res.get("content", "") or "").strip()
             lines.append(f"[{idx}] Title: {title}\nURL: {url}\nSnippet: {content}\n")
 
         return "\n".join(lines)
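The [N] indices in this formatted context are the per-type _id values, which is what the model's [search:N] citations point back to. Roughly, for a toy cache:

# Toy cache entries shaped like the pipeline's all_web_results items.
all_web_results = [
    {"_id": 1, "_type": "search", "title": "Doc A", "url": "https://a.test", "content": "snippet A"},
    {"_id": 1, "_type": "page", "title": "Page B", "url": "https://b.test", "content": "full text"},
]

lines = []
for res in all_web_results:
    if res.get("_type") != "search":
        continue  # only search results, as in _format_search_msgs
    lines.append(f"[{res['_id']}] Title: {res['title']}\nURL: {res['url']}\nSnippet: {res['content']}\n")

print("\n".join(lines))
# [1] Title: Doc A
# URL: https://a.test
# Snippet: snippet A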
@@ -961,17 +993,13 @@ class ProcessingPipeline:
         if not self.all_web_results:
             return ""
 
-        def clip(s: str, n: int) -> str:
-            s = (s or "").strip()
-            return s if len(s) <= n else s[: n - 1] + "…"
-
         lines = []
         for res in self.all_web_results:
-            if
+            if res.get("_type") != "page": continue  # Only page results
             idx = res.get("_id")
-            title =
+            title = (res.get("title", "") or "").strip()
             url = res.get("url", "")
-            content =
+            content = (res.get("content", "") or "").strip()
             lines.append(f"[{idx}] Title: {title}\nURL: {url}\nContent: {content}\n")
 
         return "\n".join(lines)
@@ -982,7 +1010,7 @@ class ProcessingPipeline:
 
         lines = []
         for res in self.all_web_results:
-            if
+            if res.get("_type") != "image": continue  # Only image results
             idx = res.get("_id")
             title = res.get("title", "")
             url = res.get("image", "") or res.get("url", "")