entari-plugin-hyw 2.2.5__py3-none-any.whl → 3.5.0rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. entari_plugin_hyw/__init__.py +371 -315
  2. entari_plugin_hyw/assets/card-dist/index.html +396 -0
  3. entari_plugin_hyw/assets/card-dist/logos/anthropic.svg +1 -0
  4. entari_plugin_hyw/assets/card-dist/logos/cerebras.svg +9 -0
  5. entari_plugin_hyw/assets/card-dist/logos/deepseek.png +0 -0
  6. entari_plugin_hyw/assets/card-dist/logos/gemini.svg +1 -0
  7. entari_plugin_hyw/assets/card-dist/logos/google.svg +1 -0
  8. entari_plugin_hyw/assets/card-dist/logos/grok.png +0 -0
  9. entari_plugin_hyw/assets/card-dist/logos/huggingface.png +0 -0
  10. entari_plugin_hyw/assets/card-dist/logos/microsoft.svg +15 -0
  11. entari_plugin_hyw/assets/card-dist/logos/minimax.png +0 -0
  12. entari_plugin_hyw/assets/card-dist/logos/mistral.png +0 -0
  13. entari_plugin_hyw/assets/card-dist/logos/nvida.png +0 -0
  14. entari_plugin_hyw/assets/card-dist/logos/openai.svg +1 -0
  15. entari_plugin_hyw/assets/card-dist/logos/openrouter.png +0 -0
  16. entari_plugin_hyw/assets/card-dist/logos/perplexity.svg +24 -0
  17. entari_plugin_hyw/assets/card-dist/logos/qwen.png +0 -0
  18. entari_plugin_hyw/assets/card-dist/logos/xai.png +0 -0
  19. entari_plugin_hyw/assets/card-dist/logos/xiaomi.png +0 -0
  20. entari_plugin_hyw/assets/card-dist/logos/zai.png +0 -0
  21. entari_plugin_hyw/assets/card-dist/vite.svg +1 -0
  22. entari_plugin_hyw/assets/icon/anthropic.svg +1 -0
  23. entari_plugin_hyw/assets/icon/cerebras.svg +9 -0
  24. entari_plugin_hyw/assets/icon/deepseek.png +0 -0
  25. entari_plugin_hyw/assets/icon/gemini.svg +1 -0
  26. entari_plugin_hyw/assets/icon/google.svg +1 -0
  27. entari_plugin_hyw/assets/icon/grok.png +0 -0
  28. entari_plugin_hyw/assets/icon/huggingface.png +0 -0
  29. entari_plugin_hyw/assets/icon/microsoft.svg +15 -0
  30. entari_plugin_hyw/assets/icon/minimax.png +0 -0
  31. entari_plugin_hyw/assets/icon/mistral.png +0 -0
  32. entari_plugin_hyw/assets/icon/nvida.png +0 -0
  33. entari_plugin_hyw/assets/icon/openai.svg +1 -0
  34. entari_plugin_hyw/assets/icon/openrouter.png +0 -0
  35. entari_plugin_hyw/assets/icon/perplexity.svg +24 -0
  36. entari_plugin_hyw/assets/icon/qwen.png +0 -0
  37. entari_plugin_hyw/assets/icon/xai.png +0 -0
  38. entari_plugin_hyw/assets/icon/xiaomi.png +0 -0
  39. entari_plugin_hyw/assets/icon/zai.png +0 -0
  40. entari_plugin_hyw/card-ui/.gitignore +24 -0
  41. entari_plugin_hyw/card-ui/README.md +5 -0
  42. entari_plugin_hyw/card-ui/index.html +16 -0
  43. entari_plugin_hyw/card-ui/package-lock.json +2342 -0
  44. entari_plugin_hyw/card-ui/package.json +31 -0
  45. entari_plugin_hyw/card-ui/public/logos/anthropic.svg +1 -0
  46. entari_plugin_hyw/card-ui/public/logos/cerebras.svg +9 -0
  47. entari_plugin_hyw/card-ui/public/logos/deepseek.png +0 -0
  48. entari_plugin_hyw/card-ui/public/logos/gemini.svg +1 -0
  49. entari_plugin_hyw/card-ui/public/logos/google.svg +1 -0
  50. entari_plugin_hyw/card-ui/public/logos/grok.png +0 -0
  51. entari_plugin_hyw/card-ui/public/logos/huggingface.png +0 -0
  52. entari_plugin_hyw/card-ui/public/logos/microsoft.svg +15 -0
  53. entari_plugin_hyw/card-ui/public/logos/minimax.png +0 -0
  54. entari_plugin_hyw/card-ui/public/logos/mistral.png +0 -0
  55. entari_plugin_hyw/card-ui/public/logos/nvida.png +0 -0
  56. entari_plugin_hyw/card-ui/public/logos/openai.svg +1 -0
  57. entari_plugin_hyw/card-ui/public/logos/openrouter.png +0 -0
  58. entari_plugin_hyw/card-ui/public/logos/perplexity.svg +24 -0
  59. entari_plugin_hyw/card-ui/public/logos/qwen.png +0 -0
  60. entari_plugin_hyw/card-ui/public/logos/xai.png +0 -0
  61. entari_plugin_hyw/card-ui/public/logos/xiaomi.png +0 -0
  62. entari_plugin_hyw/card-ui/public/logos/zai.png +0 -0
  63. entari_plugin_hyw/card-ui/public/vite.svg +1 -0
  64. entari_plugin_hyw/card-ui/src/App.vue +412 -0
  65. entari_plugin_hyw/card-ui/src/assets/vue.svg +1 -0
  66. entari_plugin_hyw/card-ui/src/components/HelloWorld.vue +41 -0
  67. entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +386 -0
  68. entari_plugin_hyw/card-ui/src/components/SectionCard.vue +41 -0
  69. entari_plugin_hyw/card-ui/src/components/StageCard.vue +237 -0
  70. entari_plugin_hyw/card-ui/src/main.ts +5 -0
  71. entari_plugin_hyw/card-ui/src/style.css +29 -0
  72. entari_plugin_hyw/card-ui/src/test_regex.js +103 -0
  73. entari_plugin_hyw/card-ui/src/types.ts +52 -0
  74. entari_plugin_hyw/card-ui/tsconfig.app.json +16 -0
  75. entari_plugin_hyw/card-ui/tsconfig.json +7 -0
  76. entari_plugin_hyw/card-ui/tsconfig.node.json +26 -0
  77. entari_plugin_hyw/card-ui/vite.config.ts +16 -0
  78. entari_plugin_hyw/history.py +170 -0
  79. entari_plugin_hyw/image_cache.py +274 -0
  80. entari_plugin_hyw/misc.py +128 -0
  81. entari_plugin_hyw/pipeline.py +1338 -0
  82. entari_plugin_hyw/prompts.py +108 -0
  83. entari_plugin_hyw/render_vue.py +314 -0
  84. entari_plugin_hyw/search.py +696 -0
  85. entari_plugin_hyw-3.5.0rc6.dist-info/METADATA +116 -0
  86. entari_plugin_hyw-3.5.0rc6.dist-info/RECORD +88 -0
  87. entari_plugin_hyw/hyw_core.py +0 -555
  88. entari_plugin_hyw-2.2.5.dist-info/METADATA +0 -135
  89. entari_plugin_hyw-2.2.5.dist-info/RECORD +0 -6
  90. {entari_plugin_hyw-2.2.5.dist-info → entari_plugin_hyw-3.5.0rc6.dist-info}/WHEEL +0 -0
  91. {entari_plugin_hyw-2.2.5.dist-info → entari_plugin_hyw-3.5.0rc6.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/search.py (new file)
@@ -0,0 +1,696 @@
+ import urllib.parse
+ import asyncio
+ import re
+ import html
+ from typing import List, Dict, Optional, Any
+ from loguru import logger
+ from crawl4ai import AsyncWebCrawler
+ from crawl4ai.async_configs import CrawlerRunConfig
+ from crawl4ai.cache_context import CacheMode
+
+ # Optional imports for new strategies
+ try:
+     import httpx
+ except ImportError:
+     httpx = None
+
+ try:
+     from ddgs import DDGS
+ except ImportError:
+     try:
+         from duckduckgo_search import DDGS
+     except ImportError:
+         DDGS = None
+
+ # Import image cache for prefetching
+ from .image_cache import prefetch_images
+
+ # Shared crawler instance to avoid repeated init
+ _shared_crawler: Optional[AsyncWebCrawler] = None
+
+
+ async def get_shared_crawler() -> AsyncWebCrawler:
+     global _shared_crawler
+     if _shared_crawler is None:
+         _shared_crawler = AsyncWebCrawler()
+         await _shared_crawler.start()
+     return _shared_crawler
+
+
+ async def close_shared_crawler():
+     global _shared_crawler
+     if _shared_crawler:
+         try:
+             await _shared_crawler.close()
+         except Exception:
+             pass
+         _shared_crawler = None
+
+ class SearchService:
+     """
+     Multi-strategy search & fetch service.
+     Search providers: 'crawl4ai' (default), 'httpx', 'ddgs'.
+     Fetch providers: 'crawl4ai' (default), 'jinaai'.
+     """
+     def __init__(self, config: Any):
+         self.config = config
+         self._default_limit = getattr(config, "search_limit", 8)
+         self._crawler: Optional[AsyncWebCrawler] = None
+
+         # Configuration for retries/timeouts
+         self._search_timeout = getattr(config, "search_timeout", 10.0)
+         self._search_retries = getattr(config, "search_retries", 2)
+         # Separate providers for search and page fetching
+         self._search_provider = getattr(config, "search_provider", "crawl4ai")
+         self._fetch_provider = getattr(config, "fetch_provider", "crawl4ai")
+         self._jina_api_key = getattr(config, "jina_api_key", None)
+         logger.info(f"SearchService initialized: search_provider='{self._search_provider}', fetch_provider='{self._fetch_provider}', limit={self._default_limit}, timeout={self._search_timeout}s")
+
+     def _build_search_url(self, query: str) -> str:
+         encoded_query = urllib.parse.quote(query)
+         base = getattr(self.config, "search_base_url", "https://lite.duckduckgo.com/lite/?q={query}")
+         if "{query}" in base:
+             return base.replace("{query}", encoded_query).replace("{limit}", str(self._default_limit))
+         sep = "&" if "?" in base else "?"
+         return f"{base}{sep}q={encoded_query}"
+
+     def _build_image_url(self, query: str) -> str:
+         encoded_query = urllib.parse.quote(query)
+         base = getattr(self.config, "image_search_base_url", "https://duckduckgo.com/?q={query}&iax=images&ia=images")
+         if "{query}" in base:
+             return base.replace("{query}", encoded_query).replace("{limit}", str(self._default_limit))
+         sep = "&" if "?" in base else "?"
+         return f"{base}{sep}q={encoded_query}&iax=images&ia=images"
+
+     async def search(self, query: str) -> List[Dict[str, str]]:
+         """
+         Dispatch search to the configured provider.
+         """
+         if not query:
+             return []
+
+         provider = self._search_provider.lower()
+         logger.info(f"SearchService: searching for '{query}' using provider='{provider}'")
+
+         if provider == "httpx":
+             return await self._search_httpx(query)
+         elif provider == "ddgs":
+             return await self._search_ddgs(query)
+         else:
+             # Default to crawl4ai for backward compatibility or explicit choice
+             return await self._search_crawl4ai(query)
+
+     async def _search_httpx(self, query: str) -> List[Dict[str, str]]:
+         """
+         Directly fetch https://lite.duckduckgo.com/lite/ via httpx and parse HTML.
+         Fast, no browser overhead.
+         """
+         if not httpx:
+             logger.error("SearchService: httpx not installed, fallback to crawl4ai")
+             return await self._search_crawl4ai(query)
+
+         url = self._build_search_url(query)
+
+         results: List[Dict[str, str]] = []
+         try:
+             async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
+                 resp = await client.get(url, headers={
+                     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
+                 })
+                 resp.raise_for_status()
+                 html_content = resp.text
+
+             # Regex parsing for DDG Lite
+             snippet_regex = re.compile(r'<td[^>]*>(.*?)</td>', re.DOTALL)
+             link_regex = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.DOTALL)
+
+             raw_links = link_regex.findall(html_content)
+
+             seen = set()
+             for href, text in raw_links:
+                 if len(results) >= self._default_limit:
+                     break
+
+                 # Clean href
+                 if "duckduckgo.com" in href:
+                     if "uddg=" in href:
+                         parsed = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
+                         href = parsed.get("uddg", [href])[0]
+                     else:
+                         continue
+
+                 if not href.startswith("http"):
+                     continue
+
+                 if href in seen:
+                     continue
+                 seen.add(href)
+
+                 # Title clean
+                 title = re.sub(r'<[^>]+>', '', text).strip()
+                 title = html.unescape(title)
+
+                 results.append({
+                     "title": title,
+                     "url": href,
+                     "domain": urllib.parse.urlparse(href).hostname or "",
+                     "content": title
+                 })
+
+             if not results:
+                 logger.warning("SearchService(httpx): No results parsed via regex.")
+
+             return results
+
+         except Exception as e:
+             logger.error(f"SearchService(httpx) failed: {e}")
+             return []
+
+     async def _search_ddgs(self, query: str) -> List[Dict[str, str]]:
+         """
+         Use duckduckgo_search library (Sync DDGS).
+         Executes in thread pool to allow async usage.
+         Supports retries and timeouts.
+         """
+         if not DDGS:
+             logger.error("SearchService: duckduckgo_search not installed, fallback to crawl4ai")
+             return await self._search_crawl4ai(query)
+
+         def _do_sync_search():
+             """Sync search function to run in thread"""
+             results: List[Dict[str, str]] = []
+             final_exc = None
+
+             for attempt in range(self._search_retries + 1):
+                 try:
+                     with DDGS(timeout=self._search_timeout) as ddgs:
+                         # Use positional argument for query to be safe across versions
+                         ddgs_gen = ddgs.text(
+                             query,
+                             region='cn-zh',
+                             safesearch='moderate',
+                             max_results=self._default_limit,
+                             backend="duckduckgo",
+                         )
+
+                         if ddgs_gen:
+                             for r in ddgs_gen:
+                                 results.append({
+                                     "title": r.get("title", ""),
+                                     "url": r.get("href", ""),
+                                     "domain": urllib.parse.urlparse(r.get("href", "")).hostname or "",
+                                     "content": r.get("body", "")
+                                 })
+                                 if len(results) >= self._default_limit:
+                                     break
+
+                     return results, None
+
+                 except Exception as e:
+                     final_exc = e
+                     if attempt < self._search_retries:
+                         import time
+                         time.sleep(1)
+
+             return [], final_exc
+
+         # Run sync search in executor
+         try:
+             results, err = await asyncio.to_thread(_do_sync_search)
+
+             if err:
+                 logger.warning(f"SearchService(ddgs) text search failed after retries: {err}")
+                 return []
+
+             logger.info(f"SearchService(ddgs): Got {len(results)} text results")
+             return results
+
+         except Exception as e:
+             logger.error(f"SearchService(ddgs) thread execution failed: {e}")
+             return []
+
+     async def _search_ddgs_images(self, query: str) -> List[Dict[str, str]]:
+         """
+         Use duckduckgo_search library for images.
+         """
+         if not DDGS:
+             return []
+
+         def _do_sync_image_search():
+             results: List[Dict[str, str]] = []
+             final_exc = None
+
+             for attempt in range(self._search_retries + 1):
+                 try:
+                     with DDGS(timeout=self._search_timeout) as ddgs:
+                         ddgs_gen = ddgs.images(
+                             query,
+                             region='cn-zh',
+                             safesearch='moderate',
+                             max_results=self._default_limit,
+                         )
+
+                         if ddgs_gen:
+                             for r in ddgs_gen:
+                                 # DDGS images returns: title, image, thumbnail, url, source, etc.
+                                 # API might differ, adapt to standard format
+                                 results.append({
+                                     "title": r.get("title", "Image"),
+                                     "url": r.get("image", "") or r.get("url", ""),  # Full image URL
+                                     "thumbnail": r.get("thumbnail", ""),
+                                     "domain": r.get("source", "") or urllib.parse.urlparse(r.get("url", "")).hostname or "",
+                                 })
+                                 if len(results) >= self._default_limit:
+                                     break
+
+                     return results, None
+                 except Exception as e:
+                     final_exc = e
+                     if attempt < self._search_retries:
+                         import time
+                         time.sleep(1)
+
+             return [], final_exc
+
+         try:
+             results, err = await asyncio.to_thread(_do_sync_image_search)
+             if err:
+                 logger.warning(f"SearchService(ddgs) image search failed: {err}")
+                 return []
+
+             logger.info(f"SearchService(ddgs): Got {len(results)} image results")
+             return results
+         except Exception as e:
+             logger.error(f"SearchService(ddgs) image thread failed: {e}")
+             return []
+
+     async def _search_crawl4ai(self, query: str) -> List[Dict[str, str]]:
+         """
+         Crawl the configured SERP using Crawl4AI and return parsed results.
+         Original implementation.
+         """
+         if not query:
+             return []
+
+         url = self._build_search_url(query)
+         logger.info(f"SearchService(Crawl4AI): fetching {url}")
+
+         try:
+             crawler = await self._get_crawler()
+             result = await crawler.arun(
+                 url=url,
+                 config=CrawlerRunConfig(
+                     wait_until="domcontentloaded",
+                     wait_for="article",
+                     cache_mode=CacheMode.BYPASS,
+                     word_count_threshold=1,
+                     screenshot=False,
+                     capture_console_messages=False,
+                     capture_network_requests=False,
+                 ),
+             )
+             return self._parse_markdown_result(result, limit=self._default_limit)
+         except Exception as e:
+             logger.error(f"Crawl4AI search failed: {e}")
+             return []
+
+     def _parse_markdown_result(self, result, limit: int = 8) -> List[Dict[str, str]]:
+         """Parse Crawl4AI result into search items without manual HTML parsing."""
+         md = (result.markdown or result.extracted_content or "").strip()
+         lines = [ln.strip() for ln in md.splitlines() if ln.strip()]
+         links = result.links.get("external", []) if getattr(result, "links", None) else []
+         seen = set()
+         results: List[Dict[str, str]] = []
+
+         def find_snippet(url: str, domain: str) -> str:
+             for i, ln in enumerate(lines):
+                 # Avoid matching DDG internal links (filter by site, etc) which contain the target URL in query params
+                 if "duckduckgo.com" in ln:
+                     continue
+
+                 if url in ln or (domain and domain in ln):
+                     # If this line looks like the title/link line (e.g. starts with [Title](url) or similar),
+                     # and the NEXT line exists, return the next line as snippet.
+                     if i + 1 < len(lines) and len(ln) < 300:  # heuristic: title lines are usually shorter
+                         return lines[i+1][:400]
+                     return ln[:400]
+             # fallback to first non-empty line
+             return lines[0][:400] if lines else ""
+
+         for link in links:
+             url = link.get("href") or ""
+             if not url or url in seen:
+                 continue
+             seen.add(url)
+             domain = urllib.parse.urlparse(url).hostname or ""
+             title = link.get("title") or link.get("text") or url
+             snippet = find_snippet(url, domain)
+             results.append({
+                 "title": title.strip(),
+                 "url": url,
+                 "domain": domain,
+                 "content": snippet or title,
+             })
+             if len(results) >= limit:
+                 break
+
+         if not results:
+             logger.warning(f"SearchService: no results parsed; md_length={len(md)}, links={len(links)}")
+         else:
+             logger.info(f"SearchService: parsed {len(results)} results via Crawl4AI links")
+         return results
+
+     async def fetch_page(self, url: str) -> Dict[str, str]:
+         """
+         Fetch a single page and return cleaned markdown/text plus metadata.
+         Dispatches to jinaai or Crawl4AI based on fetch_provider config.
+         """
+         if not url:
+             return {"content": "Error: missing url", "title": "Error", "url": ""}
+
+         provider = self._fetch_provider.lower()
+         logger.info(f"SearchService: fetching page '{url}' using fetch_provider='{provider}'")
+
+         if provider == "jinaai":
+             return await self._fetch_page_jinaai(url)
+         else:
+             return await self._fetch_page_crawl4ai(url)
+
+     async def _fetch_page_jinaai(self, url: str) -> Dict[str, str]:
+         """
+         Fetch page via Jina AI Reader - returns clean markdown content.
+         https://r.jina.ai/{url}
+         """
+         if not httpx:
+             logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
+             return await self._fetch_page_crawl4ai(url)
+
+         jina_url = f"https://r.jina.ai/{url}"
+         headers = {
+             "X-Return-Format": "markdown",
+         }
+         # Add authorization header if API key is configured
+         if self._jina_api_key:
+             headers["Authorization"] = f"Bearer {self._jina_api_key}"
+
+         try:
+             async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
+                 resp = await client.get(jina_url, headers=headers)
+                 resp.raise_for_status()
+                 content = resp.text
+
+             # Jina AI returns markdown content directly
+             # Try to extract title from first heading or first line
+             title = "No Title"
+             lines = content.strip().split('\n')
+             for line in lines:
+                 line = line.strip()
+                 if line.startswith('# '):
+                     title = line[2:].strip()
+                     break
+                 elif line and not line.startswith('!') and not line.startswith('['):
+                     # Use first non-empty, non-image, non-link line as title
+                     title = line[:100]
+                     break
+
+             logger.info(f"SearchService(jinaai): fetched page, title='{title}', content_len={len(content)}")
+             return {
+                 "content": content[:8000],
+                 "title": title,
+                 "url": url
+             }
+
+         except Exception as e:
+             logger.error(f"SearchService(jinaai) fetch_page failed: {e}")
+             return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
+
+     async def _fetch_page_httpx(self, url: str) -> Dict[str, str]:
+         """
+         Fetch page via httpx - fast, no browser overhead.
+         """
+         if not httpx:
+             logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
+             return await self._fetch_page_crawl4ai(url)
+
+         try:
+             async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
+                 resp = await client.get(url, headers={
+                     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+                 })
+                 resp.raise_for_status()
+                 html_content = resp.text
+
+             # Extract title from HTML
+             title = "No Title"
+             title_match = re.search(r'<title[^>]*>([^<]+)</title>', html_content, re.IGNORECASE)
+             if title_match:
+                 title = html.unescape(title_match.group(1).strip())
+
+             # Try og:title as fallback
+             if title == "No Title":
+                 og_match = re.search(r'<meta[^>]+property=["\']og:title["\'][^>]+content=["\']([^"\']+)["\']', html_content, re.IGNORECASE)
+                 if og_match:
+                     title = html.unescape(og_match.group(1).strip())
+
+             # Simple HTML to text conversion
+             # Remove script/style tags
+             content = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', html_content, flags=re.IGNORECASE)
+             content = re.sub(r'<style[^>]*>[\s\S]*?</style>', '', content, flags=re.IGNORECASE)
+             # Remove HTML tags
+             content = re.sub(r'<[^>]+>', ' ', content)
+             # Decode entities
+             content = html.unescape(content)
+             # Normalize whitespace
+             content = re.sub(r'\s+', ' ', content).strip()
+
+             logger.info(f"SearchService(httpx): fetched page, title='{title}', content_len={len(content)}")
+             return {
+                 "content": content[:8000],
+                 "title": title,
+                 "url": url
+             }
+
+         except Exception as e:
+             logger.error(f"SearchService(httpx) fetch_page failed: {e}")
+             return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
+
+     async def _fetch_page_crawl4ai(self, url: str) -> Dict[str, str]:
+         """
+         Fetch page via Crawl4AI - full browser rendering.
+         """
+         try:
+             crawler = await self._get_crawler()
+             result = await crawler.arun(
+                 url=url,
+                 config=CrawlerRunConfig(
+                     wait_until="networkidle",
+                     wait_for_images=False,  # Faster: skip image loading
+                     cache_mode=CacheMode.BYPASS,
+                     word_count_threshold=1,
+                     screenshot=False,
+                     capture_console_messages=False,
+                     capture_network_requests=False,
+                 ),
+             )
+             if not result.success:
+                 return {"content": f"Error: crawl failed ({result.error_message or 'unknown'})", "title": "Error", "url": url}
+
+             content = result.markdown or result.extracted_content or result.cleaned_html or result.html or ""
+             # Extract metadata if available, otherwise fallback
+             title = "No Title"
+             if result.metadata:
+                 title = result.metadata.get("title") or result.metadata.get("og:title") or title
+
+             # If metadata title is missing/generic, try to grab from links or url? No, metadata is best.
+             if title == "No Title" and result.links:
+                 # Minimal fallback not really possible without parsing HTML again or regex
+                 pass
+
+             return {
+                 "content": content[:8000],
+                 "title": title,
+                 "url": result.url or url
+             }
+         except Exception as e:
+             logger.error(f"Crawl4AI fetch failed: {e}")
+             return {"content": f"Error: crawl failed ({e})", "title": "Error", "url": url}
+
+     async def _get_crawler(self) -> AsyncWebCrawler:
+         # Prefer shared crawler to minimize INIT logs; fall back to local if needed
+         try:
+             return await get_shared_crawler()
+         except Exception as e:
+             logger.warning(f"Shared crawler unavailable, creating local: {e}")
+             if self._crawler is None:
+                 self._crawler = AsyncWebCrawler()
+                 await self._crawler.start()
+             return self._crawler
+
+     async def close(self):
+         if self._crawler:
+             try:
+                 await self._crawler.close()
+             except Exception:
+                 pass
+             self._crawler = None
+
+     async def image_search(self, query: str, prefetch: bool = True) -> List[Dict[str, str]]:
+         """
+         Image search - dispatches to httpx, ddgs, or Crawl4AI based on search_provider.
+
+         Args:
+             query: Search query
+             prefetch: If True, automatically start prefetching images for caching
+         """
+         if not query:
+             return []
+
+         provider = self._search_provider.lower()
+         logger.info(f"SearchService: image searching for '{query}' using provider='{provider}'")
+
+         if provider == "ddgs":
+             results = await self._search_ddgs_images(query)
+         elif provider == "httpx":
+             results = await self._image_search_httpx(query)
+         else:
+             results = await self._image_search_crawl4ai(query)
+
+         # Start prefetching images in background for faster rendering
+         if prefetch and results:
+             urls_to_prefetch = []
+             for img in results:
+                 # Prefer thumbnail for prefetch (smaller, used in UI)
+                 thumb = img.get("thumbnail")
+                 url = img.get("url")
+                 if thumb:
+                     urls_to_prefetch.append(thumb)
+                 if url and url != thumb:
+                     urls_to_prefetch.append(url)
+
+             if urls_to_prefetch:
+                 logger.info(f"SearchService: Starting prefetch for {len(urls_to_prefetch)} images")
+                 await prefetch_images(urls_to_prefetch)
+
+         return results
+
+     async def _image_search_httpx(self, query: str) -> List[Dict[str, str]]:
+         """
+         Image search via httpx - parse img tags from HTML response.
+         """
+         if not httpx:
+             logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
+             return await self._image_search_crawl4ai(query)
+
+         url = self._build_image_url(query)
+         logger.info(f"SearchService(httpx Image): fetching {url}")
+
+         try:
+             async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
+                 resp = await client.get(url, headers={
+                     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+                 })
+                 resp.raise_for_status()
+                 html_content = resp.text
+
+             # Parse img tags from HTML
+             # Match: <img ... src="..." ... > or <img ... data-src="..." ...>
+             img_regex = re.compile(r'<img[^>]+(?:src|data-src)=["\']([^"\']+)["\'][^>]*(?:alt=["\']([^"\']*)["\'])?[^>]*>', re.IGNORECASE)
+
+             images = []
+             seen = set()
+
+             for match in img_regex.finditer(html_content):
+                 src = match.group(1) or ""
+                 alt = match.group(2) or ""
+
+                 if not src:
+                     continue
+                 if src.startswith("//"):
+                     src = "https:" + src
+                 if not src.startswith("http"):
+                     continue
+                 # Skip tiny icons/placeholders
+                 if "favicon" in src.lower() or "logo" in src.lower() or "icon" in src.lower():
+                     continue
+                 if src in seen:
+                     continue
+                 seen.add(src)
+
+                 alt = html.unescape(alt.strip()) if alt else "Image"
+                 domain = urllib.parse.urlparse(src).hostname or ""
+
+                 images.append({
+                     "title": alt,
+                     "url": src,
+                     "thumbnail": src,  # Use same URL as thumbnail
+                     "domain": domain,
+                     "content": alt,
+                 })
+
+                 if len(images) >= self._default_limit:
+                     break
+
+             if not images:
+                 logger.warning(f"SearchService(httpx): no images parsed from HTML")
+             else:
+                 logger.info(f"SearchService(httpx): parsed {len(images)} images")
+             return images
+
+         except Exception as e:
+             logger.error(f"SearchService(httpx) image_search failed: {e}")
+             return []
+
+     async def _image_search_crawl4ai(self, query: str) -> List[Dict[str, str]]:
+         """
+         Image search via Crawl4AI media extraction.
+         """
+         url = self._build_image_url(query)
+         logger.info(f"SearchService(Crawl4AI Image): fetching {url}")
+
+         try:
+             # Use image crawler (text_mode=False) for image search
+             crawler = await self._get_crawler()
+             result = await crawler.arun(
+                 url=url,
+                 config=CrawlerRunConfig(
+                     wait_until="domcontentloaded",  # no need to wait for networkidle
+                     wait_for_images=False,  # only the img src is needed, not the image content
+                     wait_for="img",
+                     cache_mode=CacheMode.BYPASS,
+                     word_count_threshold=1,
+                     screenshot=False,
+                     capture_console_messages=False,
+                     capture_network_requests=False,
+                 ),
+             )
+             images = []
+             seen = set()
+             for img in result.media.get("images", []):
+                 src = img.get("src") or ""
+                 if not src:
+                     continue
+                 if src.startswith("//"):
+                     src = "https:" + src
+                 if not src.startswith("http"):
+                     continue
+                 if src in seen:
+                     continue
+                 seen.add(src)
+                 alt = (img.get("alt") or img.get("desc") or "").strip()
+                 domain = urllib.parse.urlparse(src).hostname or ""
+                 images.append({
+                     "title": alt or "Image",
+                     "url": src,
+                     "domain": domain,
+                     "content": alt or "Image",
+                 })
+                 if len(images) >= self._default_limit:
+                     break
+             if not images:
+                 logger.warning(f"SearchService: no images parsed; media_count={len(result.media.get('images', []))}")
+             else:
+                 logger.info(f"SearchService: parsed {len(images)} images via Crawl4AI media")
+             return images
+         except Exception as e:
+             logger.error(f"Crawl4AI image search failed: {e}")
+             return []
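
For orientation, here is a minimal usage sketch of the new SearchService module shown above. It is illustrative only: the SimpleNamespace config object, its example values, and the asyncio.run driver are assumptions made for this sketch (the plugin's real configuration wiring lives elsewhere in the package and is not part of this file); the attribute names (search_provider, fetch_provider, search_limit, search_timeout, search_retries, jina_api_key) and the methods called are taken directly from the code in this diff.

    # Illustrative sketch only; config values below are assumptions, not plugin defaults.
    import asyncio
    from types import SimpleNamespace

    from entari_plugin_hyw.search import SearchService, close_shared_crawler

    async def main():
        config = SimpleNamespace(
            search_provider="ddgs",    # 'crawl4ai' (default), 'httpx', or 'ddgs'
            fetch_provider="jinaai",   # 'crawl4ai' (default) or 'jinaai'
            search_limit=5,
            search_timeout=10.0,
            search_retries=2,
            jina_api_key=None,         # optional Bearer token for r.jina.ai
        )
        service = SearchService(config)
        try:
            # Text search returns dicts with title/url/domain/content
            results = await service.search("entari plugin")
            for item in results:
                print(item["title"], item["url"])
            # Fetch the first hit as cleaned markdown/text
            if results:
                page = await service.fetch_page(results[0]["url"])
                print(page["title"], len(page["content"]))
        finally:
            await service.close()
            await close_shared_crawler()

    asyncio.run(main())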