entari-plugin-hyw 4.0.0rc4__py3-none-any.whl → 4.0.0rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/__init__.py +216 -75
- entari_plugin_hyw/assets/card-dist/index.html +70 -79
- entari_plugin_hyw/browser/__init__.py +10 -0
- entari_plugin_hyw/browser/engines/base.py +13 -0
- entari_plugin_hyw/browser/engines/bing.py +95 -0
- entari_plugin_hyw/browser/engines/duckduckgo.py +137 -0
- entari_plugin_hyw/browser/engines/google.py +155 -0
- entari_plugin_hyw/browser/landing.html +172 -0
- entari_plugin_hyw/browser/manager.py +153 -0
- entari_plugin_hyw/browser/service.py +304 -0
- entari_plugin_hyw/card-ui/src/App.vue +526 -182
- entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +7 -11
- entari_plugin_hyw/card-ui/src/components/StageCard.vue +33 -30
- entari_plugin_hyw/card-ui/src/types.ts +9 -0
- entari_plugin_hyw/definitions.py +155 -0
- entari_plugin_hyw/history.py +111 -33
- entari_plugin_hyw/misc.py +34 -0
- entari_plugin_hyw/modular_pipeline.py +384 -0
- entari_plugin_hyw/render_vue.py +326 -239
- entari_plugin_hyw/search.py +95 -708
- entari_plugin_hyw/stage_base.py +92 -0
- entari_plugin_hyw/stage_instruct.py +345 -0
- entari_plugin_hyw/stage_instruct_deepsearch.py +104 -0
- entari_plugin_hyw/stage_summary.py +164 -0
- {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/METADATA +4 -4
- {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/RECORD +28 -16
- entari_plugin_hyw/pipeline.py +0 -1219
- entari_plugin_hyw/prompts.py +0 -47
- {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-4.0.0rc4.dist-info → entari_plugin_hyw-4.0.0rc6.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/search.py
CHANGED
@@ -1,735 +1,122 @@
-import urllib.parse
 import asyncio
+import urllib.parse
 import re
-import
-from typing import List, Dict,
+import time
+from typing import List, Dict, Any, Optional
 from loguru import logger
-from crawl4ai import AsyncWebCrawler
-from crawl4ai.async_configs import CrawlerRunConfig, DefaultMarkdownGenerator
-from crawl4ai.cache_context import CacheMode
-
-# Optional imports for new strategies
-try:
-    import httpx
-except ImportError:
-    httpx = None
-
-try:
-    from ddgs import DDGS
-except ImportError:
-    try:
-        from duckduckgo_search import DDGS
-    except ImportError:
-        DDGS = None
-
-# Import image cache for prefetching
-from .image_cache import prefetch_images
-
-# Shared crawler instance to avoid repeated init
-_shared_crawler: Optional[AsyncWebCrawler] = None
-
-
-async def get_shared_crawler() -> AsyncWebCrawler:
-    global _shared_crawler
-    if _shared_crawler is None:
-        _shared_crawler = AsyncWebCrawler()
-        await _shared_crawler.start()
-    return _shared_crawler

-
-
-
-
-
-            await _shared_crawler.close()
-        except Exception:
-            pass
-        _shared_crawler = None
+from .browser.service import get_screenshot_service
+# New engines
+from .browser.engines.bing import BingEngine
+from .browser.engines.duckduckgo import DuckDuckGoEngine
+from .browser.engines.google import GoogleEngine

 class SearchService:
-    """
-    Multi-strategy search & fetch service.
-    Search providers: 'crawl4ai' (default), 'httpx', 'ddgs'.
-    Fetch providers: 'crawl4ai' (default), 'jinaai'.
-    """
     def __init__(self, config: Any):
         self.config = config
-        self.
-        self.
+        self._headless = getattr(config, "headless", True)
+        self._fetch_timeout = getattr(config, "fetch_timeout", 20.0)
+        self._default_limit = getattr(config, "search_limit", 10)

-        #
-        self.
-        self._search_retries = getattr(config, "search_retries", 2)
-        # Separate providers for search and page fetching
-        self._search_provider = getattr(config, "search_provider", "crawl4ai")
-        self._fetch_provider = getattr(config, "fetch_provider", "crawl4ai")
-        self._jina_api_key = getattr(config, "jina_api_key", None)
+        # Domain blocking
+        self._blocked_domains = getattr(config, "blocked_domains", []) or []

-        #
-        self.
-        if self.
-            self.
-
-        self.
-
-
-
-    def _build_search_url(self, query: str) -> str:
-        # Note: query is already modified with -site:... in search() before calling this
-        encoded_query = urllib.parse.quote(query)
-        base = getattr(self.config, "search_base_url", "https://lite.duckduckgo.com/lite/?q={query}")
-        if "{query}" in base:
-            return base.replace("{query}", encoded_query).replace("{limit}", str(self._default_limit))
-        sep = "&" if "?" in base else "?"
-        return f"{base}{sep}q={encoded_query}"
-
-    def _build_image_url(self, query: str) -> str:
-        # Images usually don't need strict text site blocking, but we can apply it if desired.
-        # For now, we apply it to image search as well for consistency.
-        encoded_query = urllib.parse.quote(query)
-        base = getattr(self.config, "image_search_base_url", "https://duckduckgo.com/?q={query}&iax=images&ia=images")
-        if "{query}" in base:
-            return base.replace("{query}", encoded_query).replace("{limit}", str(self._default_limit))
-        sep = "&" if "?" in base else "?"
-        return f"{base}{sep}q={encoded_query}&iax=images&ia=images"
-
-    async def search(self, query: str) -> List[Dict[str, str]]:
-        """
-        Dispatch search to the configured provider.
-        """
-        if not query:
-            return []
-
-        # Apply blocked domains to query
-        if self._blocked_domains:
-            exclusions = " ".join([f"-site:{d}" for d in self._blocked_domains])
-            # Only append if not already present (simple check)
-            if "-site:" not in query:
-                original_query = query
-                query = f"{query} {exclusions}"
-                logger.debug(f"SearchService: Modified query '{original_query}' -> '{query}'")
-
-        provider = self._search_provider.lower()
-        logger.info(f"SearchService: Query='{query}' | Provider='{provider}'")
-
-        if provider == "httpx":
-            return await self._search_httpx(query)
-        elif provider == "ddgs":
-            return await self._search_ddgs(query)
+        # Select Engine
+        self._engine_name = getattr(config, "search_engine", "bing").lower()
+        if self._engine_name == "bing":
+            self._engine = BingEngine()
+        elif self._engine_name == "google":
+            self._engine = GoogleEngine()
+        elif self._engine_name == "duckduckgo":
+            self._engine = DuckDuckGoEngine()
         else:
-            # Default
-
-
-    async def _search_httpx(self, query: str) -> List[Dict[str, str]]:
-        """
-        Directly fetch https://lite.duckduckgo.com/lite/ via httpx and parse HTML.
-        Fast, no browser overhead.
-        """
-        if not httpx:
-            logger.error("SearchService: httpx not installed, fallback to crawl4ai")
-            return await self._search_crawl4ai(query)
-
-        url = self._build_search_url(query)
+            # Default fallback
+            self._engine = BingEngine()

-
-        try:
-            async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
-                resp = await client.get(url, headers={
-                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
-                })
-                resp.raise_for_status()
-                html_content = resp.text
-
-                # Regex parsing for DDG Lite
-                snippet_regex = re.compile(r'<td[^>]*>(.*?)</td>', re.DOTALL)
-                link_regex = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.DOTALL)
-
-                raw_links = link_regex.findall(html_content)
-
-                seen = set()
-                for href, text in raw_links:
-                    if len(results) >= self._default_limit:
-                        break
-
-                    # Clean href
-                    if "duckduckgo.com" in href:
-                        if "uddg=" in href:
-                            parsed = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
-                            href = parsed.get("uddg", [href])[0]
-                        else:
-                            continue
-
-                    if not href.startswith("http"):
-                        continue
-
-                    if href in seen:
-                        continue
-                    seen.add(href)
-
-                    # Title clean
-                    title = re.sub(r'<[^>]+>', '', text).strip()
-                    title = html.unescape(title)
-
-                    results.append({
-                        "title": title,
-                        "url": href,
-                        "domain": urllib.parse.urlparse(href).hostname or "",
-                        "content": title
-                    })
-
-                if not results:
-                    logger.warning("SearchService(httpx): No results parsed via regex.")
-
-                return results
-
-        except Exception as e:
-            logger.error(f"SearchService(httpx) failed: {e}")
-            return []
-
-    async def _search_ddgs(self, query: str) -> List[Dict[str, str]]:
-        """
-        Use duckduckgo_search library (Sync DDGS).
-        Executes in thread pool to allow async usage.
-        Supports retries and timeouts.
-        """
-        if not DDGS:
-            logger.error("SearchService: duckduckgo_search not installed, fallback to crawl4ai")
-            return await self._search_crawl4ai(query)
-
-        def _do_sync_search():
-            """Sync search function to run in thread"""
-            results: List[Dict[str, str]] = []
-            final_exc = None
-
-            for attempt in range(self._search_retries + 1):
-                try:
-                    with DDGS(timeout=self._search_timeout) as ddgs:
-                        # Use positional argument for query to be safe across versions
-                        ddgs_gen = ddgs.text(
-                            query,
-                            region='cn-zh',
-                            safesearch='moderate',
-                            max_results=self._default_limit,
-                            backend="duckduckgo",
-                        )
-
-                        if ddgs_gen:
-                            for r in ddgs_gen:
-                                results.append({
-                                    "title": r.get("title", ""),
-                                    "url": r.get("href", ""),
-                                    "domain": urllib.parse.urlparse(r.get("href", "")).hostname or "",
-                                    "content": r.get("body", "")
-                                })
-                                if len(results) >= self._default_limit:
-                                    break
-
-                    return results, None
-
-                except Exception as e:
-                    final_exc = e
-                    if attempt < self._search_retries:
-                        import time
-                        time.sleep(1)
-
-            return [], final_exc
-
-        # Run sync search in executor
-        try:
-            results, err = await asyncio.to_thread(_do_sync_search)
-
-            if err:
-                logger.warning(f"SearchService(ddgs) text search failed after retries: {err}")
-                raise Exception(f"DuckDuckGo API Error: {err}")
-
-            logger.info(f"SearchService(ddgs): Got {len(results)} text results")
-            return results
-
-        except Exception as e:
-            logger.error(f"SearchService(ddgs) thread execution failed: {e}")
-            raise e
-
-    async def _search_ddgs_images(self, query: str) -> List[Dict[str, str]]:
-        """
-        Use duckduckgo_search library for images.
-        """
-        if not DDGS:
-            return []
-
-        def _do_sync_image_search():
-            results: List[Dict[str, str]] = []
-            final_exc = None
-
-            for attempt in range(self._search_retries + 1):
-                try:
-                    with DDGS(timeout=self._search_timeout) as ddgs:
-                        ddgs_gen = ddgs.images(
-                            query,
-                            region='cn-zh',
-                            safesearch='moderate',
-                            max_results=self._default_limit,
-                        )
-
-                        if ddgs_gen:
-                            for r in ddgs_gen:
-                                # DDGS images returns: title, image, thumbnail, url, source, etc.
-                                # API might differ, adapt to standard format
-                                results.append({
-                                    "title": r.get("title", "Image"),
-                                    "url": r.get("image", "") or r.get("url", ""), # Full image URL
-                                    "thumbnail": r.get("thumbnail", ""),
-                                    "domain": r.get("source", "") or urllib.parse.urlparse(r.get("url", "")).hostname or "",
-                                })
-                                if len(results) >= self._default_limit:
-                                    break
-
-                    return results, None
-                except Exception as e:
-                    final_exc = e
-                    if attempt < self._search_retries:
-                        import time
-                        time.sleep(1)
-
-            return [], final_exc
-
-        try:
-            results, err = await asyncio.to_thread(_do_sync_image_search)
-            if err:
-                logger.warning(f"SearchService(ddgs) image search failed: {err}")
-                return []
-
-            logger.info(f"SearchService(ddgs): Got {len(results)} image results")
-            return results
-        except Exception as e:
-            logger.error(f"SearchService(ddgs) image thread failed: {e}")
-            return []
-
-    async def _search_crawl4ai(self, query: str) -> List[Dict[str, str]]:
-        """
-        Crawl the configured SERP using Crawl4AI and return parsed results.
-        Original implementation.
-        """
-        if not query:
-            return []
-
-        url = self._build_search_url(query)
-        logger.info(f"SearchService(Crawl4AI): fetching {url}")
-
-        try:
-            crawler = await self._get_crawler()
-            result = await crawler.arun(
-                url=url,
-                config=CrawlerRunConfig(
-                    wait_until="domcontentloaded",
-                    wait_for="article",
-                    cache_mode=CacheMode.BYPASS,
-                    word_count_threshold=1,
-                    screenshot=False,
-                    capture_console_messages=False,
-                    capture_network_requests=False,
-                ),
-            )
-            return self._parse_markdown_result(result, limit=self._default_limit)
-        except Exception as e:
-            logger.error(f"Crawl4AI search failed: {e}")
-            return []
+        logger.info(f"SearchService initialized with engine: {self._engine_name}")

-    def
-
-        md = (result.markdown or result.extracted_content or "").strip()
-        lines = [ln.strip() for ln in md.splitlines() if ln.strip()]
-        links = result.links.get("external", []) if getattr(result, "links", None) else []
-        seen = set()
-        results: List[Dict[str, str]] = []
-
-        def find_snippet(url: str, domain: str) -> str:
-            for i, ln in enumerate(lines):
-                # Avoid matching DDG internal links (filter by site, etc) which contain the target URL in query params
-                if "duckduckgo.com" in ln:
-                    continue
-
-                if url in ln or (domain and domain in ln):
-                    # If this line looks like the title/link line (e.g. starts with [Title](url) or similar),
-                    # and the NEXT line exists, return the next line as snippet.
-                    if i + 1 < len(lines) and len(ln) < 300: # heuristic: title lines are usually shorter
-                        return lines[i+1][:400]
-                    return ln[:400]
-            # fallback to first non-empty line
-            return lines[0][:400] if lines else ""
-
-        for link in links:
-            url = link.get("href") or ""
-            if not url or url in seen:
-                continue
-            seen.add(url)
-            domain = urllib.parse.urlparse(url).hostname or ""
-            title = link.get("title") or link.get("text") or url
-            snippet = find_snippet(url, domain)
-            results.append({
-                "title": title.strip(),
-                "url": url,
-                "domain": domain,
-                "content": snippet or title,
-            })
-            if len(results) >= limit:
-                break
-
-        if not results:
-            logger.warning(f"SearchService: no results parsed; md_length={len(md)}, links={len(links)}")
-        else:
-            logger.info(f"SearchService: parsed {len(results)} results via Crawl4AI links")
-        return results
-
-    async def fetch_page(self, url: str) -> Dict[str, str]:
-        """
-        Fetch a single page and return cleaned markdown/text plus metadata.
-        Dispatches to jinaai or Crawl4AI based on fetch_provider config.
-        """
-        if not url:
-            return {"content": "Error: missing url", "title": "Error", "url": ""}
-
-        provider = self._fetch_provider.lower()
-        logger.info(f"SearchService: fetching page '{url}' using fetch_provider='{provider}'")
-
-        if provider == "jinaai":
-            return await self._fetch_page_jinaai(url)
-        else:
-            return await self._fetch_page_crawl4ai(url)
-
-    async def _fetch_page_jinaai(self, url: str) -> Dict[str, str]:
-        """
-        Fetch page via Jina AI Reader - returns clean markdown content.
-        https://r.jina.ai/{url}
-        """
-        if not httpx:
-            logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
-            return await self._fetch_page_crawl4ai(url)
-
-        jina_url = f"https://r.jina.ai/{url}"
-        headers = {
-            "X-Return-Format": "markdown",
-        }
-        # Add authorization header if API key is configured
-        if self._jina_api_key:
-            headers["Authorization"] = f"Bearer {self._jina_api_key}"
-
-        try:
-            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
-                resp = await client.get(jina_url, headers=headers)
-                resp.raise_for_status()
-                content = resp.text
-
-                # Jina AI returns markdown content directly
-                # Try to extract title from first heading or first line
-                title = "No Title"
-                lines = content.strip().split('\n')
-                for line in lines:
-                    line = line.strip()
-                    if line.startswith('# '):
-                        title = line[2:].strip()
-                        break
-                    elif line and not line.startswith('!') and not line.startswith('['):
-                        # Use first non-empty, non-image, non-link line as title
-                        title = line[:100]
-                        break
-
-                logger.info(f"SearchService(jinaai): fetched page, title='{title}', content_len={len(content)}")
-                return {
-                    "content": content[:8000],
-                    "title": title,
-                    "url": url,
-                    "images": []
-                }
-
-        except Exception as e:
-            logger.error(f"SearchService(jinaai) fetch_page failed: {e}")
-            return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
-
-    async def _fetch_page_httpx(self, url: str) -> Dict[str, str]:
-        """
-        Fetch page via httpx - fast, no browser overhead.
-        """
-        if not httpx:
-            logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
-            return await self._fetch_page_crawl4ai(url)
-
-        try:
-            async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
-                resp = await client.get(url, headers={
-                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-                })
-                resp.raise_for_status()
-                html_content = resp.text
-
-                # Extract title from HTML
-                title = "No Title"
-                title_match = re.search(r'<title[^>]*>([^<]+)</title>', html_content, re.IGNORECASE)
-                if title_match:
-                    title = html.unescape(title_match.group(1).strip())
-
-                # Try og:title as fallback
-                if title == "No Title":
-                    og_match = re.search(r'<meta[^>]+property=["\']og:title["\'][^>]+content=["\']([^"\']+)["\']', html_content, re.IGNORECASE)
-                    if og_match:
-                        title = html.unescape(og_match.group(1).strip())
-
-                # Simple HTML to text conversion
-                # Remove script/style tags
-                content = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', html_content, flags=re.IGNORECASE)
-                content = re.sub(r'<style[^>]*>[\s\S]*?</style>', '', html_content, flags=re.IGNORECASE)
-                # Remove HTML tags
-                content = re.sub(r'<[^>]+>', ' ', content)
-                # Decode entities
-                content = html.unescape(content)
-                # Normalize whitespace
-                content = re.sub(r'\s+', ' ', content).strip()
-
-                logger.info(f"SearchService(httpx): fetched page, title='{title}', content_len={len(content)}")
-                return {
-                    "content": content[:8000],
-                    "title": title,
-                    "url": url,
-                    "images": []
-                }
-
-        except Exception as e:
-            logger.error(f"SearchService(httpx) fetch_page failed: {e}")
-            return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
-
-    async def _fetch_page_crawl4ai(self, url: str) -> Dict[str, str]:
-        """
-        Fetch page via Crawl4AI - full browser rendering.
-        """
-        try:
-            crawler = await self._get_crawler()
-            result = await crawler.arun(
-                url=url,
-                config=CrawlerRunConfig(
-                    wait_until="networkidle",
-                    wait_for_images=False, # Faster: skip image loading
-                    cache_mode=CacheMode.BYPASS,
-                    word_count_threshold=1,
-                    screenshot=False,
-                    # Markdown config from test.py
-                    markdown_generator=DefaultMarkdownGenerator(
-                        options={
-                            "ignore_links": True,
-                            "ignore_images": False,
-                            "skip_internal_links": True
-                        }
-                    ),
-                    capture_console_messages=False,
-                    capture_network_requests=False,
-                ),
-            )
-            if not result.success:
-                return {"content": f"Error: crawl failed ({result.error_message or 'unknown'})", "title": "Error", "url": url}
-
-            content = result.markdown or result.extracted_content or result.cleaned_html or result.html or ""
-            # Extract metadata if available, otherwise fallback
-            title = "No Title"
-            if result.metadata:
-                title = result.metadata.get("title") or result.metadata.get("og:title") or title
-
-            # If metadata title is missing/generic, try to grab from links or url? No, metadata is best.
-            if title == "No Title" and result.links:
-                # Minimal fallback not really possible without parsing HTML again or regex
-                pass
-
-            # Extract images from media
-            images = []
-            if result.media and "images" in result.media:
-                for img in result.media["images"]:
-                    src = img.get("src")
-                    if src and src.startswith("http"):
-                        images.append(src)
-
-            return {
-                "content": content[:8000],
-                "title": title,
-                "url": result.url or url,
-                "images": images
-            }
-        except Exception as e:
-            logger.error(f"Crawl4AI fetch failed: {e}")
-            return {"content": f"Error: crawl failed ({e})", "title": "Error", "url": url}
-
-    async def _get_crawler(self) -> AsyncWebCrawler:
-        # Prefer shared crawler to minimize INIT logs; fall back to local if needed
-        try:
-            return await get_shared_crawler()
-        except Exception as e:
-            logger.warning(f"Shared crawler unavailable, creating local: {e}")
-            if self._crawler is None:
-                self._crawler = AsyncWebCrawler()
-                await self._crawler.start()
-            return self._crawler
+    def _build_search_url(self, query: str) -> str:
+        return self._engine.build_url(query, self._default_limit)

-    async def
-
-
-
-        except Exception:
-            pass
-        self._crawler = None
+    async def search_batch(self, queries: List[str]) -> List[List[Dict[str, Any]]]:
+        """Execute multiple searches concurrently."""
+        tasks = [self.search(q) for q in queries]
+        return await asyncio.gather(*tasks)

-    async def
+    async def search(self, query: str) -> List[Dict[str, Any]]:
         """
-
-
-        Args:
-            query: Search query
-            prefetch: If True, automatically start prefetching images for caching
+        Main search entry point.
+        Returns parsed search results only.
         """
         if not query:
             return []

-
-
+        # Apply blocking
+        final_query = query
+        if self._blocked_domains and "-site:" not in query:
+            exclusions = " ".join([f"-site:{d}" for d in self._blocked_domains])
+            final_query = f"{query} {exclusions}"

-
-
-        elif provider == "httpx":
-            results = await self._image_search_httpx(query)
-        else:
-            results = await self._image_search_crawl4ai(query)
-
-        # Start prefetching images in background for faster rendering
-        if prefetch and results:
-            urls_to_prefetch = []
-            for img in results:
-                # Prefer thumbnail for prefetch (smaller, used in UI)
-                thumb = img.get("thumbnail")
-                url = img.get("url")
-                if thumb:
-                    urls_to_prefetch.append(thumb)
-                if url and url != thumb:
-                    urls_to_prefetch.append(url)
-
-            if urls_to_prefetch:
-                logger.info(f"SearchService: Starting prefetch for {len(urls_to_prefetch)} images")
-                await prefetch_images(urls_to_prefetch)
-
-        return results
-
-    async def _image_search_httpx(self, query: str) -> List[Dict[str, str]]:
-        """
-        Image search via httpx - parse img tags from HTML response.
-        """
-        if not httpx:
-            logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
-            return await self._image_search_crawl4ai(query)
-
-        url = self._build_image_url(query)
-        logger.info(f"SearchService(httpx Image): fetching {url}")
+        url = self._build_search_url(final_query)
+        logger.info(f"Search: '{query}' -> {url}")

+        results = []
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                continue
-            # Skip tiny icons/placeholders
-            if "favicon" in src.lower() or "logo" in src.lower() or "icon" in src.lower():
-                continue
-            if src in seen:
-                continue
-            seen.add(src)
-
-            alt = html.unescape(alt.strip()) if alt else "Image"
-            domain = urllib.parse.urlparse(src).hostname or ""
-
-            images.append({
-                "title": alt,
-                "url": src,
-                "thumbnail": src, # Use same URL as thumbnail
-                "domain": domain,
-                "content": alt,
-            })
-
-            if len(images) >= self._default_limit:
-                break
-
-        if not images:
-            logger.warning(f"SearchService(httpx): no images parsed from HTML")
+            # Fetch - Search parsing doesn't need screenshot, only HTML
+            page_data = await self.fetch_page_raw(url, include_screenshot=False)
+            content = page_data.get("html", "") or page_data.get("content", "")
+
+            # Parse Results (skip raw page - only return parsed results)
+            if content and not content.startswith("Error"):
+                parsed = self._engine.parse(content)
+
+                # JAVASCRIPT IMAGE INJECTION
+                # Inject base64 images from JS extraction if available
+                # This provides robust fallback if HTTP URLs fail to load
+                js_images = page_data.get("images", [])
+                if js_images:
+                    logger.info(f"Search: Injecting {len(js_images)} base64 images into top results")
+                    for i, img_b64 in enumerate(js_images):
+                        if i < len(parsed):
+                            b64_src = f"data:image/jpeg;base64,{img_b64}" if not img_b64.startswith("data:") else img_b64
+                            if "images" not in parsed[i]: parsed[i]["images"] = []
+                            # Prepend to prioritize base64 (guaranteed render) over HTTP URLs
+                            parsed[i]["images"].insert(0, b64_src)
+
+                logger.info(f"Search parsed {len(parsed)} results for '{query}' using {self._engine_name}")
+                results.extend(parsed)
             else:
-                logger.
-        return images
-
-        except Exception as e:
-            logger.error(f"SearchService(httpx) image_search failed: {e}")
-            return []
+                logger.warning(f"Search failed/empty for '{query}': {content[:100]}")

-
-        """
-        Image search via Crawl4AI media extraction.
-        """
-        url = self._build_image_url(query)
-        logger.info(f"SearchService(Crawl4AI Image): fetching {url}")
+            return results

-        try:
-            # Use image crawler (text_mode=False) for image search
-            crawler = await self._get_crawler()
-            result = await crawler.arun(
-                url=url,
-                config=CrawlerRunConfig(
-                    wait_until="domcontentloaded", # no need to wait for networkidle
-                    wait_for_images=False, # only need img src, not the image content
-                    wait_for="img",
-                    cache_mode=CacheMode.BYPASS,
-                    word_count_threshold=1,
-                    screenshot=False,
-                    capture_console_messages=False,
-                    capture_network_requests=False,
-                ),
-            )
-            images = []
-            seen = set()
-            for img in result.media.get("images", []):
-                src = img.get("src") or ""
-                if not src:
-                    continue
-                if src.startswith("//"):
-                    src = "https:" + src
-                if not src.startswith("http"):
-                    continue
-                if src in seen:
-                    continue
-                seen.add(src)
-                alt = (img.get("alt") or img.get("desc") or "").strip()
-                domain = urllib.parse.urlparse(src).hostname or ""
-                images.append({
-                    "title": alt or "Image",
-                    "url": src,
-                    "domain": domain,
-                    "content": alt or "Image",
-                })
-                if len(images) >= self._default_limit:
-                    break
-            if not images:
-                logger.warning(f"SearchService: no images parsed; media_count={len(result.media.get('images', []))}")
-            else:
-                logger.info(f"SearchService: parsed {len(images)} images via Crawl4AI media")
-            return images
         except Exception as e:
-            logger.error(f"
-            return
+            logger.error(f"Search error for '{query}': {e}")
+            # Ensure we return at least an error item
+            return [{
+                "title": f"Error Search: {query}",
+                "url": url,
+                "content": f"Error: {e}",
+                "type": "search_raw_page",
+                "_hidden": True
+            }]
+
+    async def fetch_pages_batch(self, urls: List[str], include_screenshot: bool = True) -> List[Dict[str, Any]]:
+        """Fetch multiple pages concurrently."""
+        tasks = [self.fetch_page(u, include_screenshot=include_screenshot) for u in urls]
+        return await asyncio.gather(*tasks)
+
+    async def fetch_page(self, url: str, timeout: Optional[float] = None, include_screenshot: bool = True) -> Dict[str, Any]:
+        """
+        Fetch a single page for reading/extracting content.
+        """
+        if timeout is None:
+            timeout = self._fetch_timeout
+        return await self.fetch_page_raw(url, timeout, include_screenshot=include_screenshot)
+
+    async def fetch_page_raw(self, url: str, timeout: Optional[float] = None, include_screenshot: bool = True) -> Dict[str, Any]:
+        """Internal: Get raw data from browser service."""
+        if timeout is None:
+            timeout = self._fetch_timeout
+        service = get_screenshot_service(headless=self._headless)
+        return await service.fetch_page(url, timeout=timeout, include_screenshot=include_screenshot)