entari-plugin-hyw 4.0.0rc4__py3-none-any.whl → 4.0.0rc6__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of entari-plugin-hyw might be problematic.

Files changed (30)
  1. entari_plugin_hyw/__init__.py +216 -75
  2. entari_plugin_hyw/assets/card-dist/index.html +70 -79
  3. entari_plugin_hyw/browser/__init__.py +10 -0
  4. entari_plugin_hyw/browser/engines/base.py +13 -0
  5. entari_plugin_hyw/browser/engines/bing.py +95 -0
  6. entari_plugin_hyw/browser/engines/duckduckgo.py +137 -0
  7. entari_plugin_hyw/browser/engines/google.py +155 -0
  8. entari_plugin_hyw/browser/landing.html +172 -0
  9. entari_plugin_hyw/browser/manager.py +153 -0
  10. entari_plugin_hyw/browser/service.py +304 -0
  11. entari_plugin_hyw/card-ui/src/App.vue +526 -182
  12. entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +7 -11
  13. entari_plugin_hyw/card-ui/src/components/StageCard.vue +33 -30
  14. entari_plugin_hyw/card-ui/src/types.ts +9 -0
  15. entari_plugin_hyw/definitions.py +155 -0
  16. entari_plugin_hyw/history.py +111 -33
  17. entari_plugin_hyw/misc.py +34 -0
  18. entari_plugin_hyw/modular_pipeline.py +384 -0
  19. entari_plugin_hyw/render_vue.py +326 -239
  20. entari_plugin_hyw/search.py +95 -708
  21. entari_plugin_hyw/stage_base.py +92 -0
  22. entari_plugin_hyw/stage_instruct.py +345 -0
  23. entari_plugin_hyw/stage_instruct_deepsearch.py +104 -0
  24. entari_plugin_hyw/stage_summary.py +164 -0
  25. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/METADATA +4 -4
  26. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/RECORD +28 -16
  27. entari_plugin_hyw/pipeline.py +0 -1219
  28. entari_plugin_hyw/prompts.py +0 -47
  29. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/WHEEL +0 -0
  30. {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/top_level.txt +0 -0
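
Judging from how the rewritten SearchService calls its engine in the hunk below (self._engine.build_url(query, limit) and self._engine.parse(html)), the new entari_plugin_hyw/browser/engines modules appear to share a small common interface. The following is a hypothetical sketch of that contract, inferred only from the call sites in this diff, not from the actual contents of engines/base.py (+13 lines in this release):

# Hypothetical engine contract inferred from the SearchService call sites below;
# the real engines/base.py may differ.
from abc import ABC, abstractmethod
from typing import Any, Dict, List


class SearchEngine(ABC):
    @abstractmethod
    def build_url(self, query: str, limit: int) -> str:
        """Return the SERP URL for `query` (SearchService passes its search_limit)."""

    @abstractmethod
    def parse(self, html: str) -> List[Dict[str, Any]]:
        """Parse SERP HTML into result dicts (title/url/content-style keys)."""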
entari_plugin_hyw/search.py
@@ -1,735 +1,122 @@
- import urllib.parse
  import asyncio
+ import urllib.parse
  import re
- import html
- from typing import List, Dict, Optional, Any
+ import time
+ from typing import List, Dict, Any, Optional
  from loguru import logger
- from crawl4ai import AsyncWebCrawler
- from crawl4ai.async_configs import CrawlerRunConfig, DefaultMarkdownGenerator
- from crawl4ai.cache_context import CacheMode
-
- # Optional imports for new strategies
- try:
-     import httpx
- except ImportError:
-     httpx = None
-
- try:
-     from ddgs import DDGS
- except ImportError:
-     try:
-         from duckduckgo_search import DDGS
-     except ImportError:
-         DDGS = None
-
- # Import image cache for prefetching
- from .image_cache import prefetch_images
-
- # Shared crawler instance to avoid repeated init
- _shared_crawler: Optional[AsyncWebCrawler] = None
-
-
- async def get_shared_crawler() -> AsyncWebCrawler:
-     global _shared_crawler
-     if _shared_crawler is None:
-         _shared_crawler = AsyncWebCrawler()
-         await _shared_crawler.start()
-     return _shared_crawler

-
- async def close_shared_crawler():
-     global _shared_crawler
-     if _shared_crawler:
-         try:
-             await _shared_crawler.close()
-         except Exception:
-             pass
-     _shared_crawler = None
+ from .browser.service import get_screenshot_service
+ # New engines
+ from .browser.engines.bing import BingEngine
+ from .browser.engines.duckduckgo import DuckDuckGoEngine
+ from .browser.engines.google import GoogleEngine

  class SearchService:
-     """
-     Multi-strategy search & fetch service.
-     Search providers: 'crawl4ai' (default), 'httpx', 'ddgs'.
-     Fetch providers: 'crawl4ai' (default), 'jinaai'.
-     """
      def __init__(self, config: Any):
          self.config = config
-         self._default_limit = getattr(config, "search_limit", 8)
-         self._crawler: Optional[AsyncWebCrawler] = None
+         self._headless = getattr(config, "headless", True)
+         self._fetch_timeout = getattr(config, "fetch_timeout", 20.0)
+         self._default_limit = getattr(config, "search_limit", 10)

-         # Configuration for retries/timeouts
-         self._search_timeout = getattr(config, "search_timeout", 10.0)
-         self._search_retries = getattr(config, "search_retries", 2)
-         # Separate providers for search and page fetching
-         self._search_provider = getattr(config, "search_provider", "crawl4ai")
-         self._fetch_provider = getattr(config, "fetch_provider", "crawl4ai")
-         self._jina_api_key = getattr(config, "jina_api_key", None)
+         # Domain blocking
+         self._blocked_domains = getattr(config, "blocked_domains", []) or []

-         # Blocked domains for search filtering
-         self._blocked_domains = getattr(config, "fetch_blocked_domains", None)
-         if self._blocked_domains is None:
-             self._blocked_domains = ["wikipedia.org", "csdn.net", "sohu.com", "sogou.com"]
-         if isinstance(self._blocked_domains, str):
-             self._blocked_domains = [d.strip() for d in self._blocked_domains.split(",")]
-
-         logger.info(f"SearchService initialized: search_provider='{self._search_provider}', fetch_provider='{self._fetch_provider}', limit={self._default_limit}, timeout={self._search_timeout}s, blocked={self._blocked_domains}")
-
-     def _build_search_url(self, query: str) -> str:
-         # Note: query is already modified with -site:... in search() before calling this
-         encoded_query = urllib.parse.quote(query)
-         base = getattr(self.config, "search_base_url", "https://lite.duckduckgo.com/lite/?q={query}")
-         if "{query}" in base:
-             return base.replace("{query}", encoded_query).replace("{limit}", str(self._default_limit))
-         sep = "&" if "?" in base else "?"
-         return f"{base}{sep}q={encoded_query}"
-
-     def _build_image_url(self, query: str) -> str:
-         # Images usually don't need strict text site blocking, but we can apply it if desired.
-         # For now, we apply it to image search as well for consistency.
-         encoded_query = urllib.parse.quote(query)
-         base = getattr(self.config, "image_search_base_url", "https://duckduckgo.com/?q={query}&iax=images&ia=images")
-         if "{query}" in base:
-             return base.replace("{query}", encoded_query).replace("{limit}", str(self._default_limit))
-         sep = "&" if "?" in base else "?"
-         return f"{base}{sep}q={encoded_query}&iax=images&ia=images"
-
-     async def search(self, query: str) -> List[Dict[str, str]]:
-         """
-         Dispatch search to the configured provider.
-         """
-         if not query:
-             return []
-
-         # Apply blocked domains to query
-         if self._blocked_domains:
-             exclusions = " ".join([f"-site:{d}" for d in self._blocked_domains])
-             # Only append if not already present (simple check)
-             if "-site:" not in query:
-                 original_query = query
-                 query = f"{query} {exclusions}"
-                 logger.debug(f"SearchService: Modified query '{original_query}' -> '{query}'")
-
-         provider = self._search_provider.lower()
-         logger.info(f"SearchService: Query='{query}' | Provider='{provider}'")
-
-         if provider == "httpx":
-             return await self._search_httpx(query)
-         elif provider == "ddgs":
-             return await self._search_ddgs(query)
+         # Select Engine
+         self._engine_name = getattr(config, "search_engine", "bing").lower()
+         if self._engine_name == "bing":
+             self._engine = BingEngine()
+         elif self._engine_name == "google":
+             self._engine = GoogleEngine()
+         elif self._engine_name == "duckduckgo":
+             self._engine = DuckDuckGoEngine()
          else:
-             # Default to crawl4ai for backward compatibility or explicit choice
-             return await self._search_crawl4ai(query)
-
-     async def _search_httpx(self, query: str) -> List[Dict[str, str]]:
-         """
-         Directly fetch https://lite.duckduckgo.com/lite/ via httpx and parse HTML.
-         Fast, no browser overhead.
-         """
-         if not httpx:
-             logger.error("SearchService: httpx not installed, fallback to crawl4ai")
-             return await self._search_crawl4ai(query)
-
-         url = self._build_search_url(query)
+             # Default fallback
+             self._engine = BingEngine()

-         results: List[Dict[str, str]] = []
-         try:
-             async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
-                 resp = await client.get(url, headers={
-                     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
-                 })
-                 resp.raise_for_status()
-                 html_content = resp.text
-
-                 # Regex parsing for DDG Lite
-                 snippet_regex = re.compile(r'<td[^>]*>(.*?)</td>', re.DOTALL)
-                 link_regex = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.DOTALL)
-
-                 raw_links = link_regex.findall(html_content)
-
-                 seen = set()
-                 for href, text in raw_links:
-                     if len(results) >= self._default_limit:
-                         break
-
-                     # Clean href
-                     if "duckduckgo.com" in href:
-                         if "uddg=" in href:
-                             parsed = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
-                             href = parsed.get("uddg", [href])[0]
-                         else:
-                             continue
-
-                     if not href.startswith("http"):
-                         continue
-
-                     if href in seen:
-                         continue
-                     seen.add(href)
-
-                     # Title clean
-                     title = re.sub(r'<[^>]+>', '', text).strip()
-                     title = html.unescape(title)
-
-                     results.append({
-                         "title": title,
-                         "url": href,
-                         "domain": urllib.parse.urlparse(href).hostname or "",
-                         "content": title
-                     })
-
-                 if not results:
-                     logger.warning("SearchService(httpx): No results parsed via regex.")
-
-                 return results
-
-         except Exception as e:
-             logger.error(f"SearchService(httpx) failed: {e}")
-             return []
-
-     async def _search_ddgs(self, query: str) -> List[Dict[str, str]]:
-         """
-         Use duckduckgo_search library (Sync DDGS).
-         Executes in thread pool to allow async usage.
-         Supports retries and timeouts.
-         """
-         if not DDGS:
-             logger.error("SearchService: duckduckgo_search not installed, fallback to crawl4ai")
-             return await self._search_crawl4ai(query)
-
-         def _do_sync_search():
-             """Sync search function to run in thread"""
-             results: List[Dict[str, str]] = []
-             final_exc = None
-
-             for attempt in range(self._search_retries + 1):
-                 try:
-                     with DDGS(timeout=self._search_timeout) as ddgs:
-                         # Use positional argument for query to be safe across versions
-                         ddgs_gen = ddgs.text(
-                             query,
-                             region='cn-zh',
-                             safesearch='moderate',
-                             max_results=self._default_limit,
-                             backend="duckduckgo",
-                         )
-
-                         if ddgs_gen:
-                             for r in ddgs_gen:
-                                 results.append({
-                                     "title": r.get("title", ""),
-                                     "url": r.get("href", ""),
-                                     "domain": urllib.parse.urlparse(r.get("href", "")).hostname or "",
-                                     "content": r.get("body", "")
-                                 })
-                                 if len(results) >= self._default_limit:
-                                     break
-
-                     return results, None
-
-                 except Exception as e:
-                     final_exc = e
-                     if attempt < self._search_retries:
-                         import time
-                         time.sleep(1)
-
-             return [], final_exc
-
-         # Run sync search in executor
-         try:
-             results, err = await asyncio.to_thread(_do_sync_search)
-
-             if err:
-                 logger.warning(f"SearchService(ddgs) text search failed after retries: {err}")
-                 raise Exception(f"DuckDuckGo API Error: {err}")
-
-             logger.info(f"SearchService(ddgs): Got {len(results)} text results")
-             return results
-
-         except Exception as e:
-             logger.error(f"SearchService(ddgs) thread execution failed: {e}")
-             raise e
-
-     async def _search_ddgs_images(self, query: str) -> List[Dict[str, str]]:
-         """
-         Use duckduckgo_search library for images.
-         """
-         if not DDGS:
-             return []
-
-         def _do_sync_image_search():
-             results: List[Dict[str, str]] = []
-             final_exc = None
-
-             for attempt in range(self._search_retries + 1):
-                 try:
-                     with DDGS(timeout=self._search_timeout) as ddgs:
-                         ddgs_gen = ddgs.images(
-                             query,
-                             region='cn-zh',
-                             safesearch='moderate',
-                             max_results=self._default_limit,
-                         )
-
-                         if ddgs_gen:
-                             for r in ddgs_gen:
-                                 # DDGS images returns: title, image, thumbnail, url, source, etc.
-                                 # API might differ, adapt to standard format
-                                 results.append({
-                                     "title": r.get("title", "Image"),
-                                     "url": r.get("image", "") or r.get("url", ""),  # Full image URL
-                                     "thumbnail": r.get("thumbnail", ""),
-                                     "domain": r.get("source", "") or urllib.parse.urlparse(r.get("url", "")).hostname or "",
-                                 })
-                                 if len(results) >= self._default_limit:
-                                     break
-
-                     return results, None
-                 except Exception as e:
-                     final_exc = e
-                     if attempt < self._search_retries:
-                         import time
-                         time.sleep(1)
-
-             return [], final_exc
-
-         try:
-             results, err = await asyncio.to_thread(_do_sync_image_search)
-             if err:
-                 logger.warning(f"SearchService(ddgs) image search failed: {err}")
-                 return []
-
-             logger.info(f"SearchService(ddgs): Got {len(results)} image results")
-             return results
-         except Exception as e:
-             logger.error(f"SearchService(ddgs) image thread failed: {e}")
-             return []
-
-     async def _search_crawl4ai(self, query: str) -> List[Dict[str, str]]:
-         """
-         Crawl the configured SERP using Crawl4AI and return parsed results.
-         Original implementation.
-         """
-         if not query:
-             return []
-
-         url = self._build_search_url(query)
-         logger.info(f"SearchService(Crawl4AI): fetching {url}")
-
-         try:
-             crawler = await self._get_crawler()
-             result = await crawler.arun(
-                 url=url,
-                 config=CrawlerRunConfig(
-                     wait_until="domcontentloaded",
-                     wait_for="article",
-                     cache_mode=CacheMode.BYPASS,
-                     word_count_threshold=1,
-                     screenshot=False,
-                     capture_console_messages=False,
-                     capture_network_requests=False,
-                 ),
-             )
-             return self._parse_markdown_result(result, limit=self._default_limit)
-         except Exception as e:
-             logger.error(f"Crawl4AI search failed: {e}")
-             return []
+         logger.info(f"SearchService initialized with engine: {self._engine_name}")

-     def _parse_markdown_result(self, result, limit: int = 8) -> List[Dict[str, str]]:
-         """Parse Crawl4AI result into search items without manual HTML parsing."""
-         md = (result.markdown or result.extracted_content or "").strip()
-         lines = [ln.strip() for ln in md.splitlines() if ln.strip()]
-         links = result.links.get("external", []) if getattr(result, "links", None) else []
-         seen = set()
-         results: List[Dict[str, str]] = []
-
-         def find_snippet(url: str, domain: str) -> str:
-             for i, ln in enumerate(lines):
-                 # Avoid matching DDG internal links (filter by site, etc) which contain the target URL in query params
-                 if "duckduckgo.com" in ln:
-                     continue
-
-                 if url in ln or (domain and domain in ln):
-                     # If this line looks like the title/link line (e.g. starts with [Title](url) or similar),
-                     # and the NEXT line exists, return the next line as snippet.
-                     if i + 1 < len(lines) and len(ln) < 300:  # heuristic: title lines are usually shorter
-                         return lines[i+1][:400]
-                     return ln[:400]
-             # fallback to first non-empty line
-             return lines[0][:400] if lines else ""
-
-         for link in links:
-             url = link.get("href") or ""
-             if not url or url in seen:
-                 continue
-             seen.add(url)
-             domain = urllib.parse.urlparse(url).hostname or ""
-             title = link.get("title") or link.get("text") or url
-             snippet = find_snippet(url, domain)
-             results.append({
-                 "title": title.strip(),
-                 "url": url,
-                 "domain": domain,
-                 "content": snippet or title,
-             })
-             if len(results) >= limit:
-                 break
-
-         if not results:
-             logger.warning(f"SearchService: no results parsed; md_length={len(md)}, links={len(links)}")
-         else:
-             logger.info(f"SearchService: parsed {len(results)} results via Crawl4AI links")
-         return results
-
-     async def fetch_page(self, url: str) -> Dict[str, str]:
-         """
-         Fetch a single page and return cleaned markdown/text plus metadata.
-         Dispatches to jinaai or Crawl4AI based on fetch_provider config.
-         """
-         if not url:
-             return {"content": "Error: missing url", "title": "Error", "url": ""}
-
-         provider = self._fetch_provider.lower()
-         logger.info(f"SearchService: fetching page '{url}' using fetch_provider='{provider}'")
-
-         if provider == "jinaai":
-             return await self._fetch_page_jinaai(url)
-         else:
-             return await self._fetch_page_crawl4ai(url)
-
-     async def _fetch_page_jinaai(self, url: str) -> Dict[str, str]:
-         """
-         Fetch page via Jina AI Reader - returns clean markdown content.
-         https://r.jina.ai/{url}
-         """
-         if not httpx:
-             logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
-             return await self._fetch_page_crawl4ai(url)
-
-         jina_url = f"https://r.jina.ai/{url}"
-         headers = {
-             "X-Return-Format": "markdown",
-         }
-         # Add authorization header if API key is configured
-         if self._jina_api_key:
-             headers["Authorization"] = f"Bearer {self._jina_api_key}"
-
-         try:
-             async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
-                 resp = await client.get(jina_url, headers=headers)
-                 resp.raise_for_status()
-                 content = resp.text
-
-                 # Jina AI returns markdown content directly
-                 # Try to extract title from first heading or first line
-                 title = "No Title"
-                 lines = content.strip().split('\n')
-                 for line in lines:
-                     line = line.strip()
-                     if line.startswith('# '):
-                         title = line[2:].strip()
-                         break
-                     elif line and not line.startswith('!') and not line.startswith('['):
-                         # Use first non-empty, non-image, non-link line as title
-                         title = line[:100]
-                         break
-
-                 logger.info(f"SearchService(jinaai): fetched page, title='{title}', content_len={len(content)}")
-                 return {
-                     "content": content[:8000],
-                     "title": title,
-                     "url": url,
-                     "images": []
-                 }
-
-         except Exception as e:
-             logger.error(f"SearchService(jinaai) fetch_page failed: {e}")
-             return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
-
-     async def _fetch_page_httpx(self, url: str) -> Dict[str, str]:
-         """
-         Fetch page via httpx - fast, no browser overhead.
-         """
-         if not httpx:
-             logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
-             return await self._fetch_page_crawl4ai(url)
-
-         try:
-             async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
-                 resp = await client.get(url, headers={
-                     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-                 })
-                 resp.raise_for_status()
-                 html_content = resp.text
-
-                 # Extract title from HTML
-                 title = "No Title"
-                 title_match = re.search(r'<title[^>]*>([^<]+)</title>', html_content, re.IGNORECASE)
-                 if title_match:
-                     title = html.unescape(title_match.group(1).strip())
-
-                 # Try og:title as fallback
-                 if title == "No Title":
-                     og_match = re.search(r'<meta[^>]+property=["\']og:title["\'][^>]+content=["\']([^"\']+)["\']', html_content, re.IGNORECASE)
-                     if og_match:
-                         title = html.unescape(og_match.group(1).strip())
-
-                 # Simple HTML to text conversion
-                 # Remove script/style tags
-                 content = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', html_content, flags=re.IGNORECASE)
-                 content = re.sub(r'<style[^>]*>[\s\S]*?</style>', '', content, flags=re.IGNORECASE)
-                 # Remove HTML tags
-                 content = re.sub(r'<[^>]+>', ' ', content)
-                 # Decode entities
-                 content = html.unescape(content)
-                 # Normalize whitespace
-                 content = re.sub(r'\s+', ' ', content).strip()
-
-                 logger.info(f"SearchService(httpx): fetched page, title='{title}', content_len={len(content)}")
-                 return {
-                     "content": content[:8000],
-                     "title": title,
-                     "url": url,
-                     "images": []
-                 }
-
-         except Exception as e:
-             logger.error(f"SearchService(httpx) fetch_page failed: {e}")
-             return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
-
-     async def _fetch_page_crawl4ai(self, url: str) -> Dict[str, str]:
-         """
-         Fetch page via Crawl4AI - full browser rendering.
-         """
-         try:
-             crawler = await self._get_crawler()
-             result = await crawler.arun(
-                 url=url,
-                 config=CrawlerRunConfig(
-                     wait_until="networkidle",
-                     wait_for_images=False,  # Faster: skip image loading
-                     cache_mode=CacheMode.BYPASS,
-                     word_count_threshold=1,
-                     screenshot=False,
-                     # Markdown config from test.py
-                     markdown_generator=DefaultMarkdownGenerator(
-                         options={
-                             "ignore_links": True,
-                             "ignore_images": False,
-                             "skip_internal_links": True
-                         }
-                     ),
-                     capture_console_messages=False,
-                     capture_network_requests=False,
-                 ),
-             )
-             if not result.success:
-                 return {"content": f"Error: crawl failed ({result.error_message or 'unknown'})", "title": "Error", "url": url}
-
-             content = result.markdown or result.extracted_content or result.cleaned_html or result.html or ""
-             # Extract metadata if available, otherwise fallback
-             title = "No Title"
-             if result.metadata:
-                 title = result.metadata.get("title") or result.metadata.get("og:title") or title
-
-             # If metadata title is missing/generic, try to grab from links or url? No, metadata is best.
-             if title == "No Title" and result.links:
-                 # Minimal fallback not really possible without parsing HTML again or regex
-                 pass
-
-             # Extract images from media
-             images = []
-             if result.media and "images" in result.media:
-                 for img in result.media["images"]:
-                     src = img.get("src")
-                     if src and src.startswith("http"):
-                         images.append(src)
-
-             return {
-                 "content": content[:8000],
-                 "title": title,
-                 "url": result.url or url,
-                 "images": images
-             }
-         except Exception as e:
-             logger.error(f"Crawl4AI fetch failed: {e}")
-             return {"content": f"Error: crawl failed ({e})", "title": "Error", "url": url}
-
-     async def _get_crawler(self) -> AsyncWebCrawler:
-         # Prefer shared crawler to minimize INIT logs; fall back to local if needed
-         try:
-             return await get_shared_crawler()
-         except Exception as e:
-             logger.warning(f"Shared crawler unavailable, creating local: {e}")
-             if self._crawler is None:
-                 self._crawler = AsyncWebCrawler()
-                 await self._crawler.start()
-             return self._crawler
+     def _build_search_url(self, query: str) -> str:
+         return self._engine.build_url(query, self._default_limit)

-     async def close(self):
-         if self._crawler:
-             try:
-                 await self._crawler.close()
-             except Exception:
-                 pass
-         self._crawler = None
+     async def search_batch(self, queries: List[str]) -> List[List[Dict[str, Any]]]:
+         """Execute multiple searches concurrently."""
+         tasks = [self.search(q) for q in queries]
+         return await asyncio.gather(*tasks)

-     async def image_search(self, query: str, prefetch: bool = True) -> List[Dict[str, str]]:
+     async def search(self, query: str) -> List[Dict[str, Any]]:
          """
-         Image search - dispatches to httpx, ddgs, or Crawl4AI based on search_provider.
-
-         Args:
-             query: Search query
-             prefetch: If True, automatically start prefetching images for caching
+         Main search entry point.
+         Returns parsed search results only.
          """
          if not query:
              return []

-         provider = self._search_provider.lower()
-         logger.info(f"SearchService: image searching for '{query}' using provider='{provider}'")
+         # Apply blocking
+         final_query = query
+         if self._blocked_domains and "-site:" not in query:
+             exclusions = " ".join([f"-site:{d}" for d in self._blocked_domains])
+             final_query = f"{query} {exclusions}"

-         if provider == "ddgs":
-             results = await self._search_ddgs_images(query)
-         elif provider == "httpx":
-             results = await self._image_search_httpx(query)
-         else:
-             results = await self._image_search_crawl4ai(query)
-
-         # Start prefetching images in background for faster rendering
-         if prefetch and results:
-             urls_to_prefetch = []
-             for img in results:
-                 # Prefer thumbnail for prefetch (smaller, used in UI)
-                 thumb = img.get("thumbnail")
-                 url = img.get("url")
-                 if thumb:
-                     urls_to_prefetch.append(thumb)
-                 if url and url != thumb:
-                     urls_to_prefetch.append(url)
-
-             if urls_to_prefetch:
-                 logger.info(f"SearchService: Starting prefetch for {len(urls_to_prefetch)} images")
-                 await prefetch_images(urls_to_prefetch)
-
-         return results
-
-     async def _image_search_httpx(self, query: str) -> List[Dict[str, str]]:
-         """
-         Image search via httpx - parse img tags from HTML response.
-         """
-         if not httpx:
-             logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
-             return await self._image_search_crawl4ai(query)
-
-         url = self._build_image_url(query)
-         logger.info(f"SearchService(httpx Image): fetching {url}")
+         url = self._build_search_url(final_query)
+         logger.info(f"Search: '{query}' -> {url}")

+         results = []
          try:
-             async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
-                 resp = await client.get(url, headers={
-                     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-                 })
-                 resp.raise_for_status()
-                 html_content = resp.text
-
-                 # Parse img tags from HTML
-                 # Match: <img ... src="..." ... > or <img ... data-src="..." ...>
-                 img_regex = re.compile(r'<img[^>]+(?:src|data-src)=["\']([^"\']+)["\'][^>]*(?:alt=["\']([^"\']*)["\'])?[^>]*>', re.IGNORECASE)
-
-                 images = []
-                 seen = set()
-
-                 for match in img_regex.finditer(html_content):
-                     src = match.group(1) or ""
-                     alt = match.group(2) or ""
-
-                     if not src:
-                         continue
-                     if src.startswith("//"):
-                         src = "https:" + src
-                     if not src.startswith("http"):
-                         continue
-                     # Skip tiny icons/placeholders
-                     if "favicon" in src.lower() or "logo" in src.lower() or "icon" in src.lower():
-                         continue
-                     if src in seen:
-                         continue
-                     seen.add(src)
-
-                     alt = html.unescape(alt.strip()) if alt else "Image"
-                     domain = urllib.parse.urlparse(src).hostname or ""
-
-                     images.append({
-                         "title": alt,
-                         "url": src,
-                         "thumbnail": src,  # Use same URL as thumbnail
-                         "domain": domain,
-                         "content": alt,
-                     })
-
-                     if len(images) >= self._default_limit:
-                         break
-
-                 if not images:
-                     logger.warning(f"SearchService(httpx): no images parsed from HTML")
+             # Fetch - Search parsing doesn't need screenshot, only HTML
+             page_data = await self.fetch_page_raw(url, include_screenshot=False)
+             content = page_data.get("html", "") or page_data.get("content", "")
+
+             # Parse Results (skip raw page - only return parsed results)
+             if content and not content.startswith("Error"):
+                 parsed = self._engine.parse(content)
+
+                 # JAVASCRIPT IMAGE INJECTION
+                 # Inject base64 images from JS extraction if available
+                 # This provides robust fallback if HTTP URLs fail to load
+                 js_images = page_data.get("images", [])
+                 if js_images:
+                     logger.info(f"Search: Injecting {len(js_images)} base64 images into top results")
+                     for i, img_b64 in enumerate(js_images):
+                         if i < len(parsed):
+                             b64_src = f"data:image/jpeg;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
+                             if "images" not in parsed[i]: parsed[i]["images"] = []
+                             # Prepend to prioritize base64 (guaranteed render) over HTTP URLs
+                             parsed[i]["images"].insert(0, b64_src)
+
+                 logger.info(f"Search parsed {len(parsed)} results for '{query}' using {self._engine_name}")
+                 results.extend(parsed)
              else:
-                 logger.info(f"SearchService(httpx): parsed {len(images)} images")
-             return images
-
-         except Exception as e:
-             logger.error(f"SearchService(httpx) image_search failed: {e}")
-             return []
+                 logger.warning(f"Search failed/empty for '{query}': {content[:100]}")

-     async def _image_search_crawl4ai(self, query: str) -> List[Dict[str, str]]:
-         """
-         Image search via Crawl4AI media extraction.
-         """
-         url = self._build_image_url(query)
-         logger.info(f"SearchService(Crawl4AI Image): fetching {url}")
+             return results

-         try:
-             # Use image crawler (text_mode=False) for image search
-             crawler = await self._get_crawler()
-             result = await crawler.arun(
-                 url=url,
-                 config=CrawlerRunConfig(
- wait_until="domcontentloaded", # 不需要等 networkidle
696
- wait_for_images=False, # 只需要 img src,不需要图片内容
697
- wait_for="img",
698
- cache_mode=CacheMode.BYPASS,
699
- word_count_threshold=1,
700
- screenshot=False,
701
- capture_console_messages=False,
702
- capture_network_requests=False,
703
- ),
704
- )
705
- images = []
706
- seen = set()
707
- for img in result.media.get("images", []):
708
- src = img.get("src") or ""
709
- if not src:
710
- continue
711
- if src.startswith("//"):
712
- src = "https:" + src
713
- if not src.startswith("http"):
714
- continue
715
- if src in seen:
716
- continue
717
- seen.add(src)
718
- alt = (img.get("alt") or img.get("desc") or "").strip()
719
- domain = urllib.parse.urlparse(src).hostname or ""
720
- images.append({
721
- "title": alt or "Image",
722
- "url": src,
723
- "domain": domain,
724
- "content": alt or "Image",
725
- })
726
- if len(images) >= self._default_limit:
727
- break
728
- if not images:
729
- logger.warning(f"SearchService: no images parsed; media_count={len(result.media.get('images', []))}")
730
- else:
731
- logger.info(f"SearchService: parsed {len(images)} images via Crawl4AI media")
732
- return images
733
93
  except Exception as e:
734
- logger.error(f"Crawl4AI image search failed: {e}")
735
- return []
94
+ logger.error(f"Search error for '{query}': {e}")
95
+ # Ensure we return at least an error item
96
+ return [{
97
+ "title": f"Error Search: {query}",
98
+ "url": url,
99
+ "content": f"Error: {e}",
100
+ "type": "search_raw_page",
101
+ "_hidden": True
102
+ }]
103
+
104
+ async def fetch_pages_batch(self, urls: List[str], include_screenshot: bool = True) -> List[Dict[str, Any]]:
105
+ """Fetch multiple pages concurrently."""
106
+ tasks = [self.fetch_page(u, include_screenshot=include_screenshot) for u in urls]
107
+ return await asyncio.gather(*tasks)
108
+
109
+ async def fetch_page(self, url: str, timeout: Optional[float] = None, include_screenshot: bool = True) -> Dict[str, Any]:
110
+ """
111
+ Fetch a single page for reading/extracting content.
112
+ """
113
+ if timeout is None:
114
+ timeout = self._fetch_timeout
115
+ return await self.fetch_page_raw(url, timeout, include_screenshot=include_screenshot)
116
+
117
+ async def fetch_page_raw(self, url: str, timeout: Optional[float] = None, include_screenshot: bool = True) -> Dict[str, Any]:
118
+ """Internal: Get raw data from browser service."""
119
+ if timeout is None:
120
+ timeout = self._fetch_timeout
121
+ service = get_screenshot_service(headless=self._headless)
122
+ return await service.fetch_page(url, timeout=timeout, include_screenshot=include_screenshot)
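
Taken together, the rc6 hunk replaces the crawl4ai/httpx/ddgs providers with a single browser-backed service plus pluggable SERP engines. Below is a minimal usage sketch of the rewritten SearchService, assuming only a plain config object: the attribute names mirror the getattr calls in __init__ above, while SimpleNamespace and the direct asyncio.run call are illustrative; the real plugin drives SearchService from its pipeline stages rather than like this.

# Minimal usage sketch (assumption: a plain config object is enough; actual wiring
# happens inside the plugin's pipeline stages).
import asyncio
from types import SimpleNamespace

from entari_plugin_hyw.search import SearchService

config = SimpleNamespace(
    search_engine="bing",             # "bing" | "google" | "duckduckgo"; anything else falls back to BingEngine
    headless=True,                    # passed through to get_screenshot_service()
    fetch_timeout=20.0,               # per-page fetch timeout in seconds
    search_limit=10,                  # forwarded to the engine's build_url()
    blocked_domains=["example.com"],  # appended to the query as -site: exclusions
)

async def main() -> None:
    service = SearchService(config)

    # search_batch() fans SearchService.search() out over asyncio.gather()
    results_per_query = await service.search_batch(["entari plugin", "playwright screenshots"])
    for results in results_per_query:
        for item in results:
            # parsed results are plain dicts; title/url/content keys follow the error-item shape above
            print(item.get("title"), item.get("url"))

    # fetch_pages_batch() does the same for fetch_page(); screenshots are optional
    pages = await service.fetch_pages_batch(
        [r["url"] for r in results_per_query[0][:3] if r.get("url")],
        include_screenshot=False,
    )
    print(len(pages), "pages fetched")

asyncio.run(main())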