entari-plugin-hyw 3.5.0rc1__py3-none-any.whl → 3.5.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic. Click here for more details.
- entari_plugin_hyw/__init__.py +77 -82
- entari_plugin_hyw/assets/card-dist/index.html +360 -99
- entari_plugin_hyw/card-ui/src/App.vue +246 -52
- entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +122 -67
- entari_plugin_hyw/card-ui/src/components/StageCard.vue +46 -26
- entari_plugin_hyw/card-ui/src/test_regex.js +103 -0
- entari_plugin_hyw/card-ui/src/types.ts +1 -0
- entari_plugin_hyw/{core/history.py → history.py} +25 -1
- entari_plugin_hyw/image_cache.py +283 -0
- entari_plugin_hyw/{core/pipeline.py → pipeline.py} +102 -27
- entari_plugin_hyw/{utils/prompts.py → prompts.py} +7 -24
- entari_plugin_hyw/render_vue.py +314 -0
- entari_plugin_hyw/{utils/search.py → search.py} +227 -10
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/METADATA +1 -1
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/RECORD +18 -29
- entari_plugin_hyw/core/__init__.py +0 -0
- entari_plugin_hyw/core/config.py +0 -35
- entari_plugin_hyw/core/hyw.py +0 -48
- entari_plugin_hyw/core/render_vue.py +0 -255
- entari_plugin_hyw/test_output/render_0.jpg +0 -0
- entari_plugin_hyw/test_output/render_1.jpg +0 -0
- entari_plugin_hyw/test_output/render_2.jpg +0 -0
- entari_plugin_hyw/test_output/render_3.jpg +0 -0
- entari_plugin_hyw/test_output/render_4.jpg +0 -0
- entari_plugin_hyw/tests/ui_test_output.jpg +0 -0
- entari_plugin_hyw/tests/verify_ui.py +0 -139
- entari_plugin_hyw/utils/__init__.py +0 -2
- entari_plugin_hyw/utils/browser.py +0 -40
- entari_plugin_hyw/utils/playwright_tool.py +0 -36
- /entari_plugin_hyw/{utils/misc.py → misc.py} +0 -0
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-3.5.0rc1.dist-info → entari_plugin_hyw-3.5.0rc2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Vue-based Card Renderer (Minimal Python)
|
|
3
|
+
|
|
4
|
+
Python only provides raw data. All frontend logic (markdown, syntax highlighting,
|
|
5
|
+
math rendering, citations) is handled by the Vue frontend.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import gc
|
|
10
|
+
import os
|
|
11
|
+
import threading
|
|
12
|
+
import asyncio
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import List, Dict, Any
|
|
15
|
+
from concurrent.futures import Future
|
|
16
|
+
|
|
17
|
+
from loguru import logger
|
|
18
|
+
from playwright.async_api import async_playwright
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ContentRenderer:
    """Minimal renderer with a background browser thread for instant startup.

    Python only supplies raw render data; all frontend logic (markdown,
    syntax highlighting, math rendering, citations) lives in the Vue app
    loaded from ``assets/card-dist/index.html``. A dedicated daemon thread
    runs its own asyncio event loop that owns the Playwright browser, so
    callers on any event loop can render via ``run_coroutine_threadsafe``.
    """

    def __init__(self, template_path: str = None, auto_start: bool = True):
        """Load the Vue template and optionally start the browser thread.

        Args:
            template_path: Path to the built Vue ``index.html``; defaults to
                the bundled ``assets/card-dist/index.html`` next to this file.
            auto_start: If True, start the background browser thread now.

        Raises:
            FileNotFoundError: If the template file does not exist.
        """
        if template_path is None:
            current_dir = Path(__file__).parent
            template_path = current_dir / "assets" / "card-dist" / "index.html"

        self.template_path = Path(template_path)
        if not self.template_path.exists():
            raise FileNotFoundError(f"Vue template not found: {self.template_path}")

        self.template_content = self.template_path.read_text(encoding="utf-8")
        logger.info(f"ContentRenderer: loaded Vue template ({len(self.template_content)} bytes)")

        # Browser state (managed by background thread)
        self._playwright = None
        self._browser = None
        self._context = None
        self._page = None
        self._render_count = 0
        # Restart the browser periodically to bound memory growth.
        self._max_renders_before_restart = 50

        # Background event loop for playwright
        self._loop: asyncio.AbstractEventLoop = None
        self._thread: threading.Thread = None
        self._ready = threading.Event()
        self._lock = threading.Lock()

        if auto_start:
            self._start_background_loop()

    def _start_background_loop(self):
        """Start a dedicated event loop in a daemon background thread."""
        def _run_loop():
            self._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self._loop)
            # Start browser immediately
            self._loop.run_until_complete(self._init_browser())
            self._ready.set()
            # Keep loop running for future tasks
            self._loop.run_forever()

        self._thread = threading.Thread(target=_run_loop, daemon=True, name="ContentRenderer-Browser")
        self._thread.start()
        logger.info("ContentRenderer: Background browser thread started")

    async def _init_browser(self, timeout: int = 6000):
        """Initialize browser and page with a warmup render (background loop only).

        Args:
            timeout: Navigation timeout for loading the template, in ms.

        Raises:
            Exception: Re-raises any Playwright startup failure after logging.
        """
        logger.info("ContentRenderer: Starting browser...")
        try:
            self._playwright = await async_playwright().start()
            self._browser = await self._playwright.chromium.launch(
                headless=True,
                args=['--no-sandbox', '--disable-setuid-sandbox']
            )
            self._context = await self._browser.new_context(
                viewport={"width": 540, "height": 1400},
                device_scale_factor=2.0,
            )
            self._page = await self._context.new_page()
            await self._page.goto(self.template_path.as_uri(), wait_until="domcontentloaded", timeout=timeout)

            # Pre-warm the page with initial data so Vue compiles and renders
            warmup_data = {
                "markdown": "# Ready",
                "total_time": 0,
                "stages": [],
                "references": [],
                "page_references": [],
                "image_references": [],
                "stats": {},
                "theme_color": "#ef4444",
            }
            await self._page.evaluate("(data) => window.updateRenderData(data)", warmup_data)
            await asyncio.sleep(0.1)  # Let Vue render
            logger.success("ContentRenderer: Browser + page ready!")
        except Exception as e:
            logger.error(f"ContentRenderer: Failed to start browser: {e}")
            raise

    def _run_in_background(self, coro) -> Future:
        """Schedule *coro* on the background loop and return a concurrent Future.

        Raises:
            RuntimeError: If the background loop is not running.
        """
        if not self._loop or not self._loop.is_running():
            raise RuntimeError("Background loop not running")
        return asyncio.run_coroutine_threadsafe(coro, self._loop)

    async def start(self, timeout: int = 6000):
        """Wait for the browser to be ready (compatibility shim).

        Args:
            timeout: Maximum wait, in ms.

        Raises:
            TimeoutError: If the browser is not ready within *timeout*.
        """
        ready = await asyncio.to_thread(self._ready.wait, timeout / 1000)
        if not ready:
            raise TimeoutError("Browser startup timeout")

    async def close(self):
        """Clean up browser resources and stop the background thread."""
        if self._loop and self._loop.is_running():
            future = self._run_in_background(self._close_internal())
            # Use asyncio.to_thread to wait without blocking the event loop
            await asyncio.to_thread(future.result, 10)
        if self._loop:
            self._loop.call_soon_threadsafe(self._loop.stop)
        if self._thread:
            # Use asyncio.to_thread to wait without blocking the event loop
            await asyncio.to_thread(self._thread.join, 5)
        logger.info("ContentRenderer: Browser closed.")

    async def _close_internal(self):
        """Internal close (runs in background loop): page, context, browser, playwright."""
        if self._page:
            await self._page.close()
            self._page = None
        if self._context:
            await self._context.close()
            self._context = None
        if self._browser:
            await self._browser.close()
            self._browser = None
        if self._playwright:
            await self._playwright.stop()
            self._playwright = None

    async def _ensure_page(self):
        """Ensure page is ready, restarting the browser periodically (background loop)."""
        if self._render_count >= self._max_renders_before_restart:
            logger.info(f"ContentRenderer: Restarting browser after {self._render_count} renders...")
            await self._close_internal()
            self._render_count = 0

        if not self._page:
            await self._init_browser()

    async def render(
        self,
        markdown_content: str,
        output_path: str,
        stats: Dict[str, Any] = None,
        references: List[Dict[str, Any]] = None,
        page_references: List[Dict[str, Any]] = None,
        image_references: List[Dict[str, Any]] = None,
        stages_used: List[Dict[str, Any]] = None,
        image_timeout: int = 3000,
        theme_color: str = "#ef4444",
        **kwargs
    ) -> bool:
        """Render markdown content to an image file via the Vue page.

        Args:
            markdown_content: Markdown body to render.
            output_path: Destination image path; parent dirs are created.
            stats: Stats dict (or a one-element list of dicts, which is unwrapped).
            references / page_references / image_references: Citation payloads.
            stages_used: Pipeline stage metadata dicts.
            image_timeout: Max wait for page images to load, in ms.
            theme_color: CSS accent color for the card.

        Returns:
            True on success, False on any failure (errors are logged, not raised).
        """
        # Wait for browser ready (non-blocking)
        ready = await asyncio.to_thread(self._ready.wait, 30)
        if not ready:
            logger.error("ContentRenderer: Browser not ready after 30s")
            return False

        # Prepare data
        resolved_output_path = Path(output_path).resolve()
        resolved_output_path.parent.mkdir(parents=True, exist_ok=True)

        # Some callers pass a one-element list of stats dicts; unwrap it.
        stats_dict = stats[0] if isinstance(stats, list) and stats else (stats or {})

        render_data = {
            "markdown": markdown_content,
            "total_time": stats_dict.get("total_time", 0) or 0,
            "stages": [
                {
                    "name": s.get("name", "Step"),
                    "model": s.get("model", ""),
                    "provider": s.get("provider", ""),
                    "time": s.get("time", 0),
                    "cost": s.get("cost", 0),
                    "references": s.get("references") or s.get("search_results"),
                    "image_references": s.get("image_references"),
                    "crawled_pages": s.get("crawled_pages"),
                }
                for s in (stages_used or [])
            ],
            "references": references or [],
            "page_references": page_references or [],
            "image_references": image_references or [],
            "stats": stats_dict,
            "theme_color": theme_color,
        }

        # Reorder images in stages
        self._reorder_images_in_stages(render_data["markdown"], render_data["stages"])

        # Run render in background loop (non-blocking wait for result)
        try:
            future = self._run_in_background(
                self._render_internal(render_data, str(resolved_output_path), image_timeout)
            )
            # Use asyncio.to_thread to wait for the future without blocking the event loop
            return await asyncio.to_thread(future.result, 60)
        except Exception as e:
            logger.error(f"ContentRenderer: render failed ({e})")
            return False

    async def _render_internal(self, render_data: dict, output_path: str, image_timeout: int) -> bool:
        """Internal render (runs in background loop): push data, wait, screenshot.

        Returns:
            True on success; False on failure (page is reset to force a
            browser restart on the next render).
        """
        import time
        start_time = time.time()

        try:
            await self._ensure_page()

            # Update data via JS
            await self._page.evaluate("(data) => window.updateRenderData(data)", render_data)

            # Wait for Vue to update DOM
            await asyncio.sleep(0.1)

            # Wait for images to load
            try:
                await self._page.wait_for_function(
                    "() => Array.from(document.images).every(img => img.complete)",
                    timeout=image_timeout
                )
            except Exception:
                logger.warning(f"ContentRenderer: Timeout waiting for images ({image_timeout}ms)")

            # Screenshot the card container if present, else the full page.
            element = await self._page.query_selector("#main-container")
            if element:
                await element.screenshot(path=output_path, type="jpeg", quality=88)
            else:
                await self._page.screenshot(path=output_path, full_page=True, type="jpeg", quality=88)

            self._render_count += 1
            duration = time.time() - start_time
            logger.success(f"ContentRenderer: Rendered in {duration:.3f}s (No.{self._render_count})")
            return True

        except Exception as exc:
            logger.error(f"ContentRenderer: render failed ({exc})")
            # Reset page to force restart next time
            self._page = None
            return False
        finally:
            gc.collect()

    async def render_models_list(
        self,
        models: List[Dict[str, Any]],
        output_path: str,
        default_base_url: str = "https://openrouter.ai/api/v1",
        **kwargs
    ) -> bool:
        """Render a numbered model list card.

        Args:
            models: Dicts with ``name``, optional ``base_url`` and ``provider``.
            output_path: Destination image path.
            default_base_url: Fallback base URL when a model has none.

        Returns:
            True on success, False on failure (delegates to :meth:`render`).
        """
        lines = ["# 模型列表"]
        for idx, model in enumerate(models or [], start=1):
            name = model.get("name", "unknown")
            base_url = model.get("base_url") or default_base_url
            provider = model.get("provider", "")
            lines.append(f"{idx}. **{name}** \n - base_url: {base_url} \n - provider: {provider}")

        markdown_content = "\n\n".join(lines) if len(lines) > 1 else "# 模型列表\n暂无模型"

        return await self.render(
            markdown_content=markdown_content,
            output_path=output_path,
            stats={},
            references=[],
            stages_used=[],
        )

    def _reorder_images_in_stages(self, markdown: str, stages: List[Dict[str, Any]]) -> None:
        """Reorder each stage's image references to match markdown appearance order.

        References whose URL does not appear in the markdown keep their
        original relative order after the matched ones. Mutates *stages*
        in place; returns None.
        """
        import re

        # Collect image URLs in first-appearance order from ![alt](url ...) syntax.
        img_urls = []
        for match in re.finditer(r'!\[.*?\]\((.*?)\)', markdown):
            # FIX: guard empty links like ![]() — split() yields [] and the
            # original unconditional [0] raised IndexError.
            parts = match.group(1).split()
            url_part = parts[0].strip() if parts else ""
            if url_part and url_part not in img_urls:
                img_urls.append(url_part)

        if not img_urls:
            return

        for stage in stages:
            refs = stage.get("image_references")
            if not refs:
                continue

            # Last-wins URL -> ref map; tolerate refs missing a "url" key
            # (the original dict comprehension raised KeyError on those).
            ref_map = {}
            for r in refs:
                u = r.get("url")
                if u:
                    ref_map[u] = r

            new_refs = []
            seen_urls = set()

            for url in img_urls:
                if url in ref_map:
                    new_refs.append(ref_map[url])
                    seen_urls.add(url)

            for r in refs:
                if r.get("url") not in seen_urls:
                    new_refs.append(r)

            stage["image_references"] = new_refs
|
@@ -22,6 +22,9 @@ except ImportError:
|
|
|
22
22
|
except ImportError:
|
|
23
23
|
DDGS = None
|
|
24
24
|
|
|
25
|
+
# Import image cache for prefetching
|
|
26
|
+
from .image_cache import prefetch_images
|
|
27
|
+
|
|
25
28
|
# Shared crawler instance to avoid repeated init
|
|
26
29
|
_shared_crawler: Optional[AsyncWebCrawler] = None
|
|
27
30
|
|
|
@@ -46,7 +49,8 @@ async def close_shared_crawler():
|
|
|
46
49
|
class SearchService:
|
|
47
50
|
"""
|
|
48
51
|
Multi-strategy search & fetch service.
|
|
49
|
-
|
|
52
|
+
Search providers: 'crawl4ai' (default), 'httpx', 'ddgs'.
|
|
53
|
+
Fetch providers: 'crawl4ai' (default), 'jinaai'.
|
|
50
54
|
"""
|
|
51
55
|
def __init__(self, config: Any):
|
|
52
56
|
self.config = config
|
|
@@ -56,8 +60,11 @@ class SearchService:
|
|
|
56
60
|
# Configuration for retries/timeouts
|
|
57
61
|
self._search_timeout = getattr(config, "search_timeout", 10.0)
|
|
58
62
|
self._search_retries = getattr(config, "search_retries", 2)
|
|
59
|
-
|
|
60
|
-
|
|
63
|
+
# Separate providers for search and page fetching
|
|
64
|
+
self._search_provider = getattr(config, "search_provider", "crawl4ai")
|
|
65
|
+
self._fetch_provider = getattr(config, "fetch_provider", "crawl4ai")
|
|
66
|
+
self._jina_api_key = getattr(config, "jina_api_key", None)
|
|
67
|
+
logger.info(f"SearchService initialized: search_provider='{self._search_provider}', fetch_provider='{self._fetch_provider}', limit={self._default_limit}, timeout={self._search_timeout}s")
|
|
61
68
|
|
|
62
69
|
def _build_search_url(self, query: str) -> str:
|
|
63
70
|
encoded_query = urllib.parse.quote(query)
|
|
@@ -82,7 +89,7 @@ class SearchService:
|
|
|
82
89
|
if not query:
|
|
83
90
|
return []
|
|
84
91
|
|
|
85
|
-
provider = self.
|
|
92
|
+
provider = self._search_provider.lower()
|
|
86
93
|
logger.info(f"SearchService: searching for '{query}' using provider='{provider}'")
|
|
87
94
|
|
|
88
95
|
if provider == "httpx":
|
|
@@ -355,11 +362,122 @@ class SearchService:
|
|
|
355
362
|
|
|
356
363
|
async def fetch_page(self, url: str) -> Dict[str, str]:
    """
    Fetch a single page and return cleaned markdown/text plus metadata.
    Dispatches to jinaai or Crawl4AI based on fetch_provider config.
    """
    # Guard clause: nothing to fetch without a URL.
    if not url:
        return {"content": "Error: missing url", "title": "Error", "url": ""}

    provider = self._fetch_provider.lower()
    logger.info(f"SearchService: fetching page '{url}' using fetch_provider='{provider}'")

    # Pick the concrete fetcher, then delegate; crawl4ai is the default.
    fetcher = self._fetch_page_jinaai if provider == "jinaai" else self._fetch_page_crawl4ai
    return await fetcher(url)
|
|
378
|
+
|
|
379
|
+
async def _fetch_page_jinaai(self, url: str) -> Dict[str, str]:
    """
    Fetch page via Jina AI Reader - returns clean markdown content.
    https://r.jina.ai/{url}
    """
    # Without httpx we cannot reach the reader endpoint; fall back.
    if not httpx:
        logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
        return await self._fetch_page_crawl4ai(url)

    jina_url = f"https://r.jina.ai/{url}"
    headers = {
        "X-Return-Format": "markdown",
    }
    # Add authorization header if API key is configured
    if self._jina_api_key:
        headers["Authorization"] = f"Bearer {self._jina_api_key}"

    try:
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            resp = await client.get(jina_url, headers=headers)
            resp.raise_for_status()
            content = resp.text

            # Jina AI returns markdown directly; derive a title from the
            # first heading, or failing that the first plain text line
            # (skipping images and links).
            title = "No Title"
            for raw_line in content.strip().split('\n'):
                candidate = raw_line.strip()
                if candidate.startswith('# '):
                    title = candidate[2:].strip()
                    break
                if candidate and not candidate.startswith(('!', '[')):
                    title = candidate[:100]
                    break

            logger.info(f"SearchService(jinaai): fetched page, title='{title}', content_len={len(content)}")
            return {
                "content": content[:8000],
                "title": title,
                "url": url,
            }

    except Exception as e:
        logger.error(f"SearchService(jinaai) fetch_page failed: {e}")
        return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
|
|
426
|
+
|
|
427
|
+
async def _fetch_page_httpx(self, url: str) -> Dict[str, str]:
    """
    Fetch page via httpx - fast, no browser overhead.

    Returns a dict with ``content`` (plain text, truncated to 8000 chars),
    ``title`` (from <title> or og:title), and ``url``. On any failure an
    Error-shaped dict is returned instead of raising.
    """
    if not httpx:
        logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
        return await self._fetch_page_crawl4ai(url)

    try:
        async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
            resp = await client.get(url, headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            })
            resp.raise_for_status()
            html_content = resp.text

            # Extract title from HTML
            title = "No Title"
            title_match = re.search(r'<title[^>]*>([^<]+)</title>', html_content, re.IGNORECASE)
            if title_match:
                title = html.unescape(title_match.group(1).strip())

            # Try og:title as fallback
            if title == "No Title":
                og_match = re.search(r'<meta[^>]+property=["\']og:title["\'][^>]+content=["\']([^"\']+)["\']', html_content, re.IGNORECASE)
                if og_match:
                    title = html.unescape(og_match.group(1).strip())

            # Simple HTML to text conversion
            # Remove script/style tags
            content = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', html_content, flags=re.IGNORECASE)
            # FIX: the style pass must operate on the script-stripped text;
            # the original applied it to html_content, discarding the
            # script removal so <script> bodies leaked into the output.
            content = re.sub(r'<style[^>]*>[\s\S]*?</style>', '', content, flags=re.IGNORECASE)
            # Remove HTML tags
            content = re.sub(r'<[^>]+>', ' ', content)
            # Decode entities
            content = html.unescape(content)
            # Normalize whitespace
            content = re.sub(r'\s+', ' ', content).strip()

            logger.info(f"SearchService(httpx): fetched page, title='{title}', content_len={len(content)}")
            return {
                "content": content[:8000],
                "title": title,
                "url": url
            }

    except Exception as e:
        logger.error(f"SearchService(httpx) fetch_page failed: {e}")
        return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
|
|
476
|
+
|
|
477
|
+
async def _fetch_page_crawl4ai(self, url: str) -> Dict[str, str]:
|
|
478
|
+
"""
|
|
479
|
+
Fetch page via Crawl4AI - full browser rendering.
|
|
480
|
+
"""
|
|
363
481
|
try:
|
|
364
482
|
crawler = await self._get_crawler()
|
|
365
483
|
result = await crawler.arun(
|
|
@@ -416,17 +534,116 @@ class SearchService:
|
|
|
416
534
|
pass
|
|
417
535
|
self._crawler = None
|
|
418
536
|
|
|
419
|
-
async def image_search(self, query: str) -> List[Dict[str, str]]:
|
|
537
|
+
async def image_search(self, query: str, prefetch: bool = True) -> List[Dict[str, str]]:
    """
    Image search - dispatches to httpx, ddgs, or Crawl4AI based on search_provider.

    Args:
        query: Search query
        prefetch: If True, automatically start prefetching images for caching

    Returns:
        A list of image dicts (``title``, ``url``, ``thumbnail``, ...);
        empty list for an empty query or when the provider yields nothing.
    """
    if not query:
        return []

    provider = self._search_provider.lower()
    logger.info(f"SearchService: image searching for '{query}' using provider='{provider}'")

    if provider == "ddgs":
        results = await self._search_ddgs_images(query)
    elif provider == "httpx":
        results = await self._image_search_httpx(query)
    else:
        results = await self._image_search_crawl4ai(query)

    # Prefetch images for faster rendering.
    # NOTE(review): this awaits prefetch_images before returning; confirm
    # that prefetch_images schedules downloads internally rather than
    # blocking, or this delays the search response.
    if prefetch and results:
        # FIX: deduplicate URLs across results (the original could queue the
        # same URL multiple times). Order preserved: thumbnail first per
        # result (smaller, used in UI), then the full URL if different.
        seen = set()
        urls_to_prefetch = []
        for img in results:
            for candidate in (img.get("thumbnail"), img.get("url")):
                if candidate and candidate not in seen:
                    seen.add(candidate)
                    urls_to_prefetch.append(candidate)

        if urls_to_prefetch:
            logger.info(f"SearchService: Starting prefetch for {len(urls_to_prefetch)} images")
            await prefetch_images(urls_to_prefetch)

    return results
|
|
575
|
+
|
|
576
|
+
async def _image_search_httpx(self, query: str) -> List[Dict[str, str]]:
    """
    Image search via httpx - parse img tags from HTML response.

    Returns up to ``self._default_limit`` image dicts with ``title``,
    ``url``, ``thumbnail`` (same as url), ``domain`` and ``content``;
    empty list on failure.
    """
    if not httpx:
        logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
        return await self._image_search_crawl4ai(query)

    url = self._build_image_url(query)
    logger.info(f"SearchService(httpx Image): fetching {url}")

    try:
        async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
            resp = await client.get(url, headers={
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            })
            resp.raise_for_status()
            html_content = resp.text

            # Parse img tags from HTML
            # Match: <img ... src="..." ... > or <img ... data-src="..." ...>
            img_regex = re.compile(r'<img[^>]+(?:src|data-src)=["\']([^"\']+)["\'][^>]*(?:alt=["\']([^"\']*)["\'])?[^>]*>', re.IGNORECASE)

            images = []
            seen = set()

            for match in img_regex.finditer(html_content):
                src = match.group(1) or ""
                alt = match.group(2) or ""

                if not src:
                    continue
                # Normalize protocol-relative URLs; drop non-http(s) sources.
                if src.startswith("//"):
                    src = "https:" + src
                if not src.startswith("http"):
                    continue
                # Skip tiny icons/placeholders (lowercase computed once,
                # instead of three times as in the original).
                src_lower = src.lower()
                if "favicon" in src_lower or "logo" in src_lower or "icon" in src_lower:
                    continue
                if src in seen:
                    continue
                seen.add(src)

                alt = html.unescape(alt.strip()) if alt else "Image"
                domain = urllib.parse.urlparse(src).hostname or ""

                images.append({
                    "title": alt,
                    "url": src,
                    "thumbnail": src,  # Use same URL as thumbnail
                    "domain": domain,
                    "content": alt,
                })

                if len(images) >= self._default_limit:
                    break

            if not images:
                # FIX: dropped pointless f-prefix (no placeholders); message unchanged.
                logger.warning("SearchService(httpx): no images parsed from HTML")
            else:
                logger.info(f"SearchService(httpx): parsed {len(images)} images")
            return images

    except Exception as e:
        logger.error(f"SearchService(httpx) image_search failed: {e}")
        return []
|
|
429
642
|
|
|
643
|
+
async def _image_search_crawl4ai(self, query: str) -> List[Dict[str, str]]:
|
|
644
|
+
"""
|
|
645
|
+
Image search via Crawl4AI media extraction.
|
|
646
|
+
"""
|
|
430
647
|
url = self._build_image_url(query)
|
|
431
648
|
logger.info(f"SearchService(Crawl4AI Image): fetching {url}")
|
|
432
649
|
|
|
@@ -1,5 +1,12 @@
|
|
|
1
|
-
entari_plugin_hyw/__init__.py,sha256=
|
|
2
|
-
entari_plugin_hyw/
|
|
1
|
+
entari_plugin_hyw/__init__.py,sha256=LHhCnOL5X6Il5OOrL4Q9HH1PHZtGfof9rTsRDQpVYpQ,15790
|
|
2
|
+
entari_plugin_hyw/history.py,sha256=zYtON0FgkA_AcXerLV335OzpIP30eAxDEp7NHCFFXis,7016
|
|
3
|
+
entari_plugin_hyw/image_cache.py,sha256=-GWNgzmIZv8OF2qEECqcQcuzGnhz5vmZ2-GWdPkEX4I,10373
|
|
4
|
+
entari_plugin_hyw/misc.py,sha256=pW-eSRKGJjJhVfz8Z-N0bTIHL57Jq3ynPzVFuy7YWnI,3479
|
|
5
|
+
entari_plugin_hyw/pipeline.py,sha256=Q4896twMwL_UbaabA7v-TFlLl1fduTrtPkFyiH3Qvyc,56173
|
|
6
|
+
entari_plugin_hyw/prompts.py,sha256=ZE0UN4GuUuG6HCzH4RbfQFHVCC9zT4_xByiG2L-U70I,3961
|
|
7
|
+
entari_plugin_hyw/render_vue.py,sha256=BkjfIB4JJ-CTvbreSHbczMu5NyDZtKjK-0HVQpYGacQ,12393
|
|
8
|
+
entari_plugin_hyw/search.py,sha256=yauAkNs3bM79dWoMSbn5ngRsxGLAHRbiAYSaGOxzVUQ,28316
|
|
9
|
+
entari_plugin_hyw/assets/card-dist/index.html,sha256=2DEOmrRDN6QwoqsywMfe8zdHvzHxrn5Rnhgg2bJA8UE,2016347
|
|
3
10
|
entari_plugin_hyw/assets/card-dist/vite.svg,sha256=SnSK_UQ5GLsWWRyDTEAdrjPoeGGrXbrQgRw6O0qSFPs,1497
|
|
4
11
|
entari_plugin_hyw/assets/card-dist/logos/anthropic.svg,sha256=ASsy1ypo3osNc3n-B0R81tk_dIFsVgg7qQORrd5T2kA,558
|
|
5
12
|
entari_plugin_hyw/assets/card-dist/logos/cerebras.svg,sha256=bpmiiYTODwc06knTmPj3GQ7NNtosMog5lkggvB_Z-7M,44166
|
|
@@ -65,35 +72,17 @@ entari_plugin_hyw/card-ui/public/logos/qwen.png,sha256=eqLbnIPbjh2_PsODU_mmqjeD8
|
|
|
65
72
|
entari_plugin_hyw/card-ui/public/logos/xai.png,sha256=uSulvvDVqoA4RUOW0ZAkdvBVM2rpyGJRZIbn5dEFspw,362
|
|
66
73
|
entari_plugin_hyw/card-ui/public/logos/xiaomi.png,sha256=WHxlDFGU5FCjb-ure3ngdGG18-efYZUUfqA3_lqCUN0,4084
|
|
67
74
|
entari_plugin_hyw/card-ui/public/logos/zai.png,sha256=K-gnabdsjMLInppHA1Op7Nyt33iegrx1x-yNlvCZ0Tc,2351
|
|
68
|
-
entari_plugin_hyw/card-ui/src/App.vue,sha256=
|
|
75
|
+
entari_plugin_hyw/card-ui/src/App.vue,sha256=DA5CLpWtl_bm3n3WBpEhdmun9IkLaRE1E_jWMeOmwwY,17656
|
|
69
76
|
entari_plugin_hyw/card-ui/src/main.ts,sha256=rm653lPnK5fuTIj-iNLpgr8GAmayuCoKop7IWfo0IBk,111
|
|
70
77
|
entari_plugin_hyw/card-ui/src/style.css,sha256=LnQEZyUqsj-IuESW1YBRqAz7T6LMayMDuiNKP5TAB1o,188
|
|
71
|
-
entari_plugin_hyw/card-ui/src/
|
|
78
|
+
entari_plugin_hyw/card-ui/src/test_regex.js,sha256=cWmclm6LRKYfjeN1RT5HECdltmo1HvS2BwGCYY_4l14,3040
|
|
79
|
+
entari_plugin_hyw/card-ui/src/types.ts,sha256=jzI8MXDKxz7WRZmGVb-bToO8BEFW2AOp8iek1n4VuQY,1246
|
|
72
80
|
entari_plugin_hyw/card-ui/src/assets/vue.svg,sha256=VTLbNPHFKEGIG6uK7KbD6NCSvSGmiaZfTY-M-AQe750,496
|
|
73
81
|
entari_plugin_hyw/card-ui/src/components/HelloWorld.vue,sha256=yvBIzJua9BfikUOR1I7GYytlnBbgB6xyposxslAoRLU,856
|
|
74
|
-
entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue,sha256=
|
|
82
|
+
entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue,sha256=w6XbsYa0UQVnb_E7Z6CR2HwKMXWEEzAgQxJ5DQbL9yg,13068
|
|
75
83
|
entari_plugin_hyw/card-ui/src/components/SectionCard.vue,sha256=owcDNx2JYVmF2J5SYCroR2gvg_cPApQsNunjK1WJpVI,1433
|
|
76
|
-
entari_plugin_hyw/card-ui/src/components/StageCard.vue,sha256=
|
|
77
|
-
entari_plugin_hyw/
|
|
78
|
-
entari_plugin_hyw/
|
|
79
|
-
entari_plugin_hyw/
|
|
80
|
-
entari_plugin_hyw/
|
|
81
|
-
entari_plugin_hyw/core/pipeline.py,sha256=15SyghuDkNX81kr3ESeLmCyy2eL2RZgt44-PQY1gdx8,51793
|
|
82
|
-
entari_plugin_hyw/core/render_vue.py,sha256=54XtNiacH_9eyBhcD0dZtL0_wOy05YaK2td13jyVXcs,9694
|
|
83
|
-
entari_plugin_hyw/test_output/render_0.jpg,sha256=uoODJLt-PPY7iCpmlo9ehPIlMMiN6dmmlkT9tpIHEn8,109963
|
|
84
|
-
entari_plugin_hyw/test_output/render_1.jpg,sha256=uoODJLt-PPY7iCpmlo9ehPIlMMiN6dmmlkT9tpIHEn8,109963
|
|
85
|
-
entari_plugin_hyw/test_output/render_2.jpg,sha256=uoODJLt-PPY7iCpmlo9ehPIlMMiN6dmmlkT9tpIHEn8,109963
|
|
86
|
-
entari_plugin_hyw/test_output/render_3.jpg,sha256=uoODJLt-PPY7iCpmlo9ehPIlMMiN6dmmlkT9tpIHEn8,109963
|
|
87
|
-
entari_plugin_hyw/test_output/render_4.jpg,sha256=uoODJLt-PPY7iCpmlo9ehPIlMMiN6dmmlkT9tpIHEn8,109963
|
|
88
|
-
entari_plugin_hyw/tests/ui_test_output.jpg,sha256=UwkpO8saGV0HubXq9Oo0WpdVON9t5_7FSmQrak_YfpE,1586227
|
|
89
|
-
entari_plugin_hyw/tests/verify_ui.py,sha256=Zln2mNCaJopvDBOFujux0K1UF5PfjpmscBq6q3-dKPI,6728
|
|
90
|
-
entari_plugin_hyw/utils/__init__.py,sha256=TnkxDqYr0zgRE7TC92tVbUaY8m1UyyoLg2zvzQ8nMVI,84
|
|
91
|
-
entari_plugin_hyw/utils/browser.py,sha256=LJlFh-oSqt9mQBpMALxbYGUG__t1YLUo7RxUAslsWUc,1416
|
|
92
|
-
entari_plugin_hyw/utils/misc.py,sha256=pW-eSRKGJjJhVfz8Z-N0bTIHL57Jq3ynPzVFuy7YWnI,3479
|
|
93
|
-
entari_plugin_hyw/utils/playwright_tool.py,sha256=ZZNkzFtUt_Gxny3Od4boBAgNF9J0N84uySatzn1Bwe4,1272
|
|
94
|
-
entari_plugin_hyw/utils/prompts.py,sha256=Zqgs9ywMri-LLyPjPYA1qcA9GVBfqjZ3qVHb7K7Fnx8,4284
|
|
95
|
-
entari_plugin_hyw/utils/search.py,sha256=Bvz2KFw3Gr2nuvmlo_8ExLHvO353NKX-YN35A2FCsBw,19047
|
|
96
|
-
entari_plugin_hyw-3.5.0rc1.dist-info/METADATA,sha256=RDxS81rieII4yZMeKuhLS-1sd365gGH6N5zjHa4kiu0,3740
|
|
97
|
-
entari_plugin_hyw-3.5.0rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
98
|
-
entari_plugin_hyw-3.5.0rc1.dist-info/top_level.txt,sha256=TIDsn6XPs6KA5e3ezsE65JoXsy03ejDdrB41I4SPjmo,18
|
|
99
|
-
entari_plugin_hyw-3.5.0rc1.dist-info/RECORD,,
|
|
84
|
+
entari_plugin_hyw/card-ui/src/components/StageCard.vue,sha256=4nIZXKDtvsAKwc3-lkeONrpdT2Fbw_fgn16SdJOOku4,9066
|
|
85
|
+
entari_plugin_hyw-3.5.0rc2.dist-info/METADATA,sha256=r67ZlZIVbMlpS99bqIHs14JS3xiL8v_jz0ziQR69LrQ,3740
|
|
86
|
+
entari_plugin_hyw-3.5.0rc2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
87
|
+
entari_plugin_hyw-3.5.0rc2.dist-info/top_level.txt,sha256=TIDsn6XPs6KA5e3ezsE65JoXsy03ejDdrB41I4SPjmo,18
|
|
88
|
+
entari_plugin_hyw-3.5.0rc2.dist-info/RECORD,,
|
|
File without changes
|