entari-plugin-hyw 2.2.5__py3-none-any.whl → 3.5.0rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- entari_plugin_hyw/__init__.py +371 -315
- entari_plugin_hyw/assets/card-dist/index.html +396 -0
- entari_plugin_hyw/assets/card-dist/logos/anthropic.svg +1 -0
- entari_plugin_hyw/assets/card-dist/logos/cerebras.svg +9 -0
- entari_plugin_hyw/assets/card-dist/logos/deepseek.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/gemini.svg +1 -0
- entari_plugin_hyw/assets/card-dist/logos/google.svg +1 -0
- entari_plugin_hyw/assets/card-dist/logos/grok.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/huggingface.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/microsoft.svg +15 -0
- entari_plugin_hyw/assets/card-dist/logos/minimax.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/mistral.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/nvida.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/openai.svg +1 -0
- entari_plugin_hyw/assets/card-dist/logos/openrouter.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/perplexity.svg +24 -0
- entari_plugin_hyw/assets/card-dist/logos/qwen.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/xai.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/xiaomi.png +0 -0
- entari_plugin_hyw/assets/card-dist/logos/zai.png +0 -0
- entari_plugin_hyw/assets/card-dist/vite.svg +1 -0
- entari_plugin_hyw/assets/icon/anthropic.svg +1 -0
- entari_plugin_hyw/assets/icon/cerebras.svg +9 -0
- entari_plugin_hyw/assets/icon/deepseek.png +0 -0
- entari_plugin_hyw/assets/icon/gemini.svg +1 -0
- entari_plugin_hyw/assets/icon/google.svg +1 -0
- entari_plugin_hyw/assets/icon/grok.png +0 -0
- entari_plugin_hyw/assets/icon/huggingface.png +0 -0
- entari_plugin_hyw/assets/icon/microsoft.svg +15 -0
- entari_plugin_hyw/assets/icon/minimax.png +0 -0
- entari_plugin_hyw/assets/icon/mistral.png +0 -0
- entari_plugin_hyw/assets/icon/nvida.png +0 -0
- entari_plugin_hyw/assets/icon/openai.svg +1 -0
- entari_plugin_hyw/assets/icon/openrouter.png +0 -0
- entari_plugin_hyw/assets/icon/perplexity.svg +24 -0
- entari_plugin_hyw/assets/icon/qwen.png +0 -0
- entari_plugin_hyw/assets/icon/xai.png +0 -0
- entari_plugin_hyw/assets/icon/xiaomi.png +0 -0
- entari_plugin_hyw/assets/icon/zai.png +0 -0
- entari_plugin_hyw/card-ui/.gitignore +24 -0
- entari_plugin_hyw/card-ui/README.md +5 -0
- entari_plugin_hyw/card-ui/index.html +16 -0
- entari_plugin_hyw/card-ui/package-lock.json +2342 -0
- entari_plugin_hyw/card-ui/package.json +31 -0
- entari_plugin_hyw/card-ui/public/logos/anthropic.svg +1 -0
- entari_plugin_hyw/card-ui/public/logos/cerebras.svg +9 -0
- entari_plugin_hyw/card-ui/public/logos/deepseek.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/gemini.svg +1 -0
- entari_plugin_hyw/card-ui/public/logos/google.svg +1 -0
- entari_plugin_hyw/card-ui/public/logos/grok.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/huggingface.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/microsoft.svg +15 -0
- entari_plugin_hyw/card-ui/public/logos/minimax.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/mistral.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/nvida.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/openai.svg +1 -0
- entari_plugin_hyw/card-ui/public/logos/openrouter.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/perplexity.svg +24 -0
- entari_plugin_hyw/card-ui/public/logos/qwen.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/xai.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/xiaomi.png +0 -0
- entari_plugin_hyw/card-ui/public/logos/zai.png +0 -0
- entari_plugin_hyw/card-ui/public/vite.svg +1 -0
- entari_plugin_hyw/card-ui/src/App.vue +412 -0
- entari_plugin_hyw/card-ui/src/assets/vue.svg +1 -0
- entari_plugin_hyw/card-ui/src/components/HelloWorld.vue +41 -0
- entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +386 -0
- entari_plugin_hyw/card-ui/src/components/SectionCard.vue +41 -0
- entari_plugin_hyw/card-ui/src/components/StageCard.vue +237 -0
- entari_plugin_hyw/card-ui/src/main.ts +5 -0
- entari_plugin_hyw/card-ui/src/style.css +29 -0
- entari_plugin_hyw/card-ui/src/test_regex.js +103 -0
- entari_plugin_hyw/card-ui/src/types.ts +52 -0
- entari_plugin_hyw/card-ui/tsconfig.app.json +16 -0
- entari_plugin_hyw/card-ui/tsconfig.json +7 -0
- entari_plugin_hyw/card-ui/tsconfig.node.json +26 -0
- entari_plugin_hyw/card-ui/vite.config.ts +16 -0
- entari_plugin_hyw/history.py +170 -0
- entari_plugin_hyw/image_cache.py +274 -0
- entari_plugin_hyw/misc.py +128 -0
- entari_plugin_hyw/pipeline.py +1338 -0
- entari_plugin_hyw/prompts.py +108 -0
- entari_plugin_hyw/render_vue.py +314 -0
- entari_plugin_hyw/search.py +696 -0
- entari_plugin_hyw-3.5.0rc6.dist-info/METADATA +116 -0
- entari_plugin_hyw-3.5.0rc6.dist-info/RECORD +88 -0
- entari_plugin_hyw/hyw_core.py +0 -555
- entari_plugin_hyw-2.2.5.dist-info/METADATA +0 -135
- entari_plugin_hyw-2.2.5.dist-info/RECORD +0 -6
- {entari_plugin_hyw-2.2.5.dist-info → entari_plugin_hyw-3.5.0rc6.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-2.2.5.dist-info → entari_plugin_hyw-3.5.0rc6.dist-info}/top_level.txt +0 -0
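The single hunk reproduced below appears to be the new entari_plugin_hyw/search.py (the only added file with exactly 696 lines; the code also imports .image_cache from the same package). It reads every setting from the plugin config object via getattr with defaults, so a minimal config stand-in covering those keys might look like the sketch below. The SimpleNamespace wrapper and the sample values are illustrative assumptions; only the key names and defaults are taken from the code.

```python
from types import SimpleNamespace

# Hypothetical stand-in for the plugin config consumed by SearchService below.
# Key names and defaults mirror the getattr() calls in search.py; how the real
# Entari config object is constructed is not shown in this diff.
search_config = SimpleNamespace(
    search_provider="ddgs",      # 'crawl4ai' (default), 'httpx', or 'ddgs'
    fetch_provider="jinaai",     # 'crawl4ai' (default) or 'jinaai'
    search_limit=8,              # max results per query
    search_timeout=10.0,         # seconds, used by the httpx and DDGS paths
    search_retries=2,            # retry count for DDGS calls
    jina_api_key=None,           # optional Bearer token for r.jina.ai
    search_base_url="https://lite.duckduckgo.com/lite/?q={query}",
    image_search_base_url="https://duckduckgo.com/?q={query}&iax=images&ia=images",
)
```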
@@ -0,0 +1,696 @@
import urllib.parse
import asyncio
import re
import html
from typing import List, Dict, Optional, Any
from loguru import logger
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.cache_context import CacheMode

# Optional imports for new strategies
try:
    import httpx
except ImportError:
    httpx = None

try:
    from ddgs import DDGS
except ImportError:
    try:
        from duckduckgo_search import DDGS
    except ImportError:
        DDGS = None

# Import image cache for prefetching
from .image_cache import prefetch_images

# Shared crawler instance to avoid repeated init
_shared_crawler: Optional[AsyncWebCrawler] = None


async def get_shared_crawler() -> AsyncWebCrawler:
    global _shared_crawler
    if _shared_crawler is None:
        _shared_crawler = AsyncWebCrawler()
        await _shared_crawler.start()
    return _shared_crawler


async def close_shared_crawler():
    global _shared_crawler
    if _shared_crawler:
        try:
            await _shared_crawler.close()
        except Exception:
            pass
        _shared_crawler = None

class SearchService:
    """
    Multi-strategy search & fetch service.
    Search providers: 'crawl4ai' (default), 'httpx', 'ddgs'.
    Fetch providers: 'crawl4ai' (default), 'jinaai'.
    """
    def __init__(self, config: Any):
        self.config = config
        self._default_limit = getattr(config, "search_limit", 8)
        self._crawler: Optional[AsyncWebCrawler] = None

        # Configuration for retries/timeouts
        self._search_timeout = getattr(config, "search_timeout", 10.0)
        self._search_retries = getattr(config, "search_retries", 2)
        # Separate providers for search and page fetching
        self._search_provider = getattr(config, "search_provider", "crawl4ai")
        self._fetch_provider = getattr(config, "fetch_provider", "crawl4ai")
        self._jina_api_key = getattr(config, "jina_api_key", None)
        logger.info(f"SearchService initialized: search_provider='{self._search_provider}', fetch_provider='{self._fetch_provider}', limit={self._default_limit}, timeout={self._search_timeout}s")

    def _build_search_url(self, query: str) -> str:
        encoded_query = urllib.parse.quote(query)
        base = getattr(self.config, "search_base_url", "https://lite.duckduckgo.com/lite/?q={query}")
        if "{query}" in base:
            return base.replace("{query}", encoded_query).replace("{limit}", str(self._default_limit))
        sep = "&" if "?" in base else "?"
        return f"{base}{sep}q={encoded_query}"

    def _build_image_url(self, query: str) -> str:
        encoded_query = urllib.parse.quote(query)
        base = getattr(self.config, "image_search_base_url", "https://duckduckgo.com/?q={query}&iax=images&ia=images")
        if "{query}" in base:
            return base.replace("{query}", encoded_query).replace("{limit}", str(self._default_limit))
        sep = "&" if "?" in base else "?"
        return f"{base}{sep}q={encoded_query}&iax=images&ia=images"

    async def search(self, query: str) -> List[Dict[str, str]]:
        """
        Dispatch search to the configured provider.
        """
        if not query:
            return []

        provider = self._search_provider.lower()
        logger.info(f"SearchService: searching for '{query}' using provider='{provider}'")

        if provider == "httpx":
            return await self._search_httpx(query)
        elif provider == "ddgs":
            return await self._search_ddgs(query)
        else:
            # Default to crawl4ai for backward compatibility or explicit choice
            return await self._search_crawl4ai(query)

    async def _search_httpx(self, query: str) -> List[Dict[str, str]]:
        """
        Directly fetch https://lite.duckduckgo.com/lite/ via httpx and parse HTML.
        Fast, no browser overhead.
        """
        if not httpx:
            logger.error("SearchService: httpx not installed, fallback to crawl4ai")
            return await self._search_crawl4ai(query)

        url = self._build_search_url(query)

        results: List[Dict[str, str]] = []
        try:
            async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
                resp = await client.get(url, headers={
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
                })
                resp.raise_for_status()
                html_content = resp.text

            # Regex parsing for DDG Lite
            snippet_regex = re.compile(r'<td[^>]*>(.*?)</td>', re.DOTALL)
            link_regex = re.compile(r'<a[^>]+href="([^"]+)"[^>]*>(.*?)</a>', re.DOTALL)

            raw_links = link_regex.findall(html_content)

            seen = set()
            for href, text in raw_links:
                if len(results) >= self._default_limit:
                    break

                # Clean href
                if "duckduckgo.com" in href:
                    if "uddg=" in href:
                        parsed = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
                        href = parsed.get("uddg", [href])[0]
                    else:
                        continue

                if not href.startswith("http"):
                    continue

                if href in seen:
                    continue
                seen.add(href)

                # Title clean
                title = re.sub(r'<[^>]+>', '', text).strip()
                title = html.unescape(title)

                results.append({
                    "title": title,
                    "url": href,
                    "domain": urllib.parse.urlparse(href).hostname or "",
                    "content": title
                })

            if not results:
                logger.warning("SearchService(httpx): No results parsed via regex.")

            return results

        except Exception as e:
            logger.error(f"SearchService(httpx) failed: {e}")
            return []

    async def _search_ddgs(self, query: str) -> List[Dict[str, str]]:
        """
        Use duckduckgo_search library (Sync DDGS).
        Executes in thread pool to allow async usage.
        Supports retries and timeouts.
        """
        if not DDGS:
            logger.error("SearchService: duckduckgo_search not installed, fallback to crawl4ai")
            return await self._search_crawl4ai(query)

        def _do_sync_search():
            """Sync search function to run in thread"""
            results: List[Dict[str, str]] = []
            final_exc = None

            for attempt in range(self._search_retries + 1):
                try:
                    with DDGS(timeout=self._search_timeout) as ddgs:
                        # Use positional argument for query to be safe across versions
                        ddgs_gen = ddgs.text(
                            query,
                            region='cn-zh',
                            safesearch='moderate',
                            max_results=self._default_limit,
                            backend="duckduckgo",
                        )

                        if ddgs_gen:
                            for r in ddgs_gen:
                                results.append({
                                    "title": r.get("title", ""),
                                    "url": r.get("href", ""),
                                    "domain": urllib.parse.urlparse(r.get("href", "")).hostname or "",
                                    "content": r.get("body", "")
                                })
                                if len(results) >= self._default_limit:
                                    break

                    return results, None

                except Exception as e:
                    final_exc = e
                    if attempt < self._search_retries:
                        import time
                        time.sleep(1)

            return [], final_exc

        # Run sync search in executor
        try:
            results, err = await asyncio.to_thread(_do_sync_search)

            if err:
                logger.warning(f"SearchService(ddgs) text search failed after retries: {err}")
                return []

            logger.info(f"SearchService(ddgs): Got {len(results)} text results")
            return results

        except Exception as e:
            logger.error(f"SearchService(ddgs) thread execution failed: {e}")
            return []

    async def _search_ddgs_images(self, query: str) -> List[Dict[str, str]]:
        """
        Use duckduckgo_search library for images.
        """
        if not DDGS:
            return []

        def _do_sync_image_search():
            results: List[Dict[str, str]] = []
            final_exc = None

            for attempt in range(self._search_retries + 1):
                try:
                    with DDGS(timeout=self._search_timeout) as ddgs:
                        ddgs_gen = ddgs.images(
                            query,
                            region='cn-zh',
                            safesearch='moderate',
                            max_results=self._default_limit,
                        )

                        if ddgs_gen:
                            for r in ddgs_gen:
                                # DDGS images returns: title, image, thumbnail, url, source, etc.
                                # API might differ, adapt to standard format
                                results.append({
                                    "title": r.get("title", "Image"),
                                    "url": r.get("image", "") or r.get("url", ""),  # Full image URL
                                    "thumbnail": r.get("thumbnail", ""),
                                    "domain": r.get("source", "") or urllib.parse.urlparse(r.get("url", "")).hostname or "",
                                })
                                if len(results) >= self._default_limit:
                                    break

                    return results, None
                except Exception as e:
                    final_exc = e
                    if attempt < self._search_retries:
                        import time
                        time.sleep(1)

            return [], final_exc

        try:
            results, err = await asyncio.to_thread(_do_sync_image_search)
            if err:
                logger.warning(f"SearchService(ddgs) image search failed: {err}")
                return []

            logger.info(f"SearchService(ddgs): Got {len(results)} image results")
            return results
        except Exception as e:
            logger.error(f"SearchService(ddgs) image thread failed: {e}")
            return []

    async def _search_crawl4ai(self, query: str) -> List[Dict[str, str]]:
        """
        Crawl the configured SERP using Crawl4AI and return parsed results.
        Original implementation.
        """
        if not query:
            return []

        url = self._build_search_url(query)
        logger.info(f"SearchService(Crawl4AI): fetching {url}")

        try:
            crawler = await self._get_crawler()
            result = await crawler.arun(
                url=url,
                config=CrawlerRunConfig(
                    wait_until="domcontentloaded",
                    wait_for="article",
                    cache_mode=CacheMode.BYPASS,
                    word_count_threshold=1,
                    screenshot=False,
                    capture_console_messages=False,
                    capture_network_requests=False,
                ),
            )
            return self._parse_markdown_result(result, limit=self._default_limit)
        except Exception as e:
            logger.error(f"Crawl4AI search failed: {e}")
            return []

    def _parse_markdown_result(self, result, limit: int = 8) -> List[Dict[str, str]]:
        """Parse Crawl4AI result into search items without manual HTML parsing."""
        md = (result.markdown or result.extracted_content or "").strip()
        lines = [ln.strip() for ln in md.splitlines() if ln.strip()]
        links = result.links.get("external", []) if getattr(result, "links", None) else []
        seen = set()
        results: List[Dict[str, str]] = []

        def find_snippet(url: str, domain: str) -> str:
            for i, ln in enumerate(lines):
                # Avoid matching DDG internal links (filter by site, etc) which contain the target URL in query params
                if "duckduckgo.com" in ln:
                    continue

                if url in ln or (domain and domain in ln):
                    # If this line looks like the title/link line (e.g. starts with [Title](url) or similar),
                    # and the NEXT line exists, return the next line as snippet.
                    if i + 1 < len(lines) and len(ln) < 300:  # heuristic: title lines are usually shorter
                        return lines[i+1][:400]
                    return ln[:400]
            # fallback to first non-empty line
            return lines[0][:400] if lines else ""

        for link in links:
            url = link.get("href") or ""
            if not url or url in seen:
                continue
            seen.add(url)
            domain = urllib.parse.urlparse(url).hostname or ""
            title = link.get("title") or link.get("text") or url
            snippet = find_snippet(url, domain)
            results.append({
                "title": title.strip(),
                "url": url,
                "domain": domain,
                "content": snippet or title,
            })
            if len(results) >= limit:
                break

        if not results:
            logger.warning(f"SearchService: no results parsed; md_length={len(md)}, links={len(links)}")
        else:
            logger.info(f"SearchService: parsed {len(results)} results via Crawl4AI links")
        return results

    async def fetch_page(self, url: str) -> Dict[str, str]:
        """
        Fetch a single page and return cleaned markdown/text plus metadata.
        Dispatches to jinaai or Crawl4AI based on fetch_provider config.
        """
        if not url:
            return {"content": "Error: missing url", "title": "Error", "url": ""}

        provider = self._fetch_provider.lower()
        logger.info(f"SearchService: fetching page '{url}' using fetch_provider='{provider}'")

        if provider == "jinaai":
            return await self._fetch_page_jinaai(url)
        else:
            return await self._fetch_page_crawl4ai(url)

    async def _fetch_page_jinaai(self, url: str) -> Dict[str, str]:
        """
        Fetch page via Jina AI Reader - returns clean markdown content.
        https://r.jina.ai/{url}
        """
        if not httpx:
            logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
            return await self._fetch_page_crawl4ai(url)

        jina_url = f"https://r.jina.ai/{url}"
        headers = {
            "X-Return-Format": "markdown",
        }
        # Add authorization header if API key is configured
        if self._jina_api_key:
            headers["Authorization"] = f"Bearer {self._jina_api_key}"

        try:
            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
                resp = await client.get(jina_url, headers=headers)
                resp.raise_for_status()
                content = resp.text

            # Jina AI returns markdown content directly
            # Try to extract title from first heading or first line
            title = "No Title"
            lines = content.strip().split('\n')
            for line in lines:
                line = line.strip()
                if line.startswith('# '):
                    title = line[2:].strip()
                    break
                elif line and not line.startswith('!') and not line.startswith('['):
                    # Use first non-empty, non-image, non-link line as title
                    title = line[:100]
                    break

            logger.info(f"SearchService(jinaai): fetched page, title='{title}', content_len={len(content)}")
            return {
                "content": content[:8000],
                "title": title,
                "url": url
            }

        except Exception as e:
            logger.error(f"SearchService(jinaai) fetch_page failed: {e}")
            return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}

    async def _fetch_page_httpx(self, url: str) -> Dict[str, str]:
        """
        Fetch page via httpx - fast, no browser overhead.
        """
        if not httpx:
            logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
            return await self._fetch_page_crawl4ai(url)

        try:
            async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
                resp = await client.get(url, headers={
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                })
                resp.raise_for_status()
                html_content = resp.text

            # Extract title from HTML
            title = "No Title"
            title_match = re.search(r'<title[^>]*>([^<]+)</title>', html_content, re.IGNORECASE)
            if title_match:
                title = html.unescape(title_match.group(1).strip())

            # Try og:title as fallback
            if title == "No Title":
                og_match = re.search(r'<meta[^>]+property=["\']og:title["\'][^>]+content=["\']([^"\']+)["\']', html_content, re.IGNORECASE)
                if og_match:
                    title = html.unescape(og_match.group(1).strip())

            # Simple HTML to text conversion
            # Remove script/style tags
            content = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', html_content, flags=re.IGNORECASE)
            content = re.sub(r'<style[^>]*>[\s\S]*?</style>', '', content, flags=re.IGNORECASE)
            # Remove HTML tags
            content = re.sub(r'<[^>]+>', ' ', content)
            # Decode entities
            content = html.unescape(content)
            # Normalize whitespace
            content = re.sub(r'\s+', ' ', content).strip()

            logger.info(f"SearchService(httpx): fetched page, title='{title}', content_len={len(content)}")
            return {
                "content": content[:8000],
                "title": title,
                "url": url
            }

        except Exception as e:
            logger.error(f"SearchService(httpx) fetch_page failed: {e}")
            return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}

    async def _fetch_page_crawl4ai(self, url: str) -> Dict[str, str]:
        """
        Fetch page via Crawl4AI - full browser rendering.
        """
        try:
            crawler = await self._get_crawler()
            result = await crawler.arun(
                url=url,
                config=CrawlerRunConfig(
                    wait_until="networkidle",
                    wait_for_images=False,  # Faster: skip image loading
                    cache_mode=CacheMode.BYPASS,
                    word_count_threshold=1,
                    screenshot=False,
                    capture_console_messages=False,
                    capture_network_requests=False,
                ),
            )
            if not result.success:
                return {"content": f"Error: crawl failed ({result.error_message or 'unknown'})", "title": "Error", "url": url}

            content = result.markdown or result.extracted_content or result.cleaned_html or result.html or ""
            # Extract metadata if available, otherwise fallback
            title = "No Title"
            if result.metadata:
                title = result.metadata.get("title") or result.metadata.get("og:title") or title

            # If metadata title is missing/generic, try to grab from links or url? No, metadata is best.
            if title == "No Title" and result.links:
                # Minimal fallback not really possible without parsing HTML again or regex
                pass

            return {
                "content": content[:8000],
                "title": title,
                "url": result.url or url
            }
        except Exception as e:
            logger.error(f"Crawl4AI fetch failed: {e}")
            return {"content": f"Error: crawl failed ({e})", "title": "Error", "url": url}

    async def _get_crawler(self) -> AsyncWebCrawler:
        # Prefer shared crawler to minimize INIT logs; fall back to local if needed
        try:
            return await get_shared_crawler()
        except Exception as e:
            logger.warning(f"Shared crawler unavailable, creating local: {e}")
            if self._crawler is None:
                self._crawler = AsyncWebCrawler()
                await self._crawler.start()
            return self._crawler

    async def close(self):
        if self._crawler:
            try:
                await self._crawler.close()
            except Exception:
                pass
            self._crawler = None

    async def image_search(self, query: str, prefetch: bool = True) -> List[Dict[str, str]]:
        """
        Image search - dispatches to httpx, ddgs, or Crawl4AI based on search_provider.

        Args:
            query: Search query
            prefetch: If True, automatically start prefetching images for caching
        """
        if not query:
            return []

        provider = self._search_provider.lower()
        logger.info(f"SearchService: image searching for '{query}' using provider='{provider}'")

        if provider == "ddgs":
            results = await self._search_ddgs_images(query)
        elif provider == "httpx":
            results = await self._image_search_httpx(query)
        else:
            results = await self._image_search_crawl4ai(query)

        # Start prefetching images in background for faster rendering
        if prefetch and results:
            urls_to_prefetch = []
            for img in results:
                # Prefer thumbnail for prefetch (smaller, used in UI)
                thumb = img.get("thumbnail")
                url = img.get("url")
                if thumb:
                    urls_to_prefetch.append(thumb)
                if url and url != thumb:
                    urls_to_prefetch.append(url)

            if urls_to_prefetch:
                logger.info(f"SearchService: Starting prefetch for {len(urls_to_prefetch)} images")
                await prefetch_images(urls_to_prefetch)

        return results

    async def _image_search_httpx(self, query: str) -> List[Dict[str, str]]:
        """
        Image search via httpx - parse img tags from HTML response.
        """
        if not httpx:
            logger.warning("SearchService: httpx not installed, fallback to crawl4ai")
            return await self._image_search_crawl4ai(query)

        url = self._build_image_url(query)
        logger.info(f"SearchService(httpx Image): fetching {url}")

        try:
            async with httpx.AsyncClient(timeout=self._search_timeout, follow_redirects=True) as client:
                resp = await client.get(url, headers={
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                })
                resp.raise_for_status()
                html_content = resp.text

            # Parse img tags from HTML
            # Match: <img ... src="..." ... > or <img ... data-src="..." ...>
            img_regex = re.compile(r'<img[^>]+(?:src|data-src)=["\']([^"\']+)["\'][^>]*(?:alt=["\']([^"\']*)["\'])?[^>]*>', re.IGNORECASE)

            images = []
            seen = set()

            for match in img_regex.finditer(html_content):
                src = match.group(1) or ""
                alt = match.group(2) or ""

                if not src:
                    continue
                if src.startswith("//"):
                    src = "https:" + src
                if not src.startswith("http"):
                    continue
                # Skip tiny icons/placeholders
                if "favicon" in src.lower() or "logo" in src.lower() or "icon" in src.lower():
                    continue
                if src in seen:
                    continue
                seen.add(src)

                alt = html.unescape(alt.strip()) if alt else "Image"
                domain = urllib.parse.urlparse(src).hostname or ""

                images.append({
                    "title": alt,
                    "url": src,
                    "thumbnail": src,  # Use same URL as thumbnail
                    "domain": domain,
                    "content": alt,
                })

                if len(images) >= self._default_limit:
                    break

            if not images:
                logger.warning(f"SearchService(httpx): no images parsed from HTML")
            else:
                logger.info(f"SearchService(httpx): parsed {len(images)} images")
            return images

        except Exception as e:
            logger.error(f"SearchService(httpx) image_search failed: {e}")
            return []

    async def _image_search_crawl4ai(self, query: str) -> List[Dict[str, str]]:
        """
        Image search via Crawl4AI media extraction.
        """
        url = self._build_image_url(query)
        logger.info(f"SearchService(Crawl4AI Image): fetching {url}")

        try:
            # Use image crawler (text_mode=False) for image search
            crawler = await self._get_crawler()
            result = await crawler.arun(
                url=url,
                config=CrawlerRunConfig(
                    wait_until="domcontentloaded",  # no need to wait for networkidle
                    wait_for_images=False,  # only the img src is needed, not the image bytes
                    wait_for="img",
                    cache_mode=CacheMode.BYPASS,
                    word_count_threshold=1,
                    screenshot=False,
                    capture_console_messages=False,
                    capture_network_requests=False,
                ),
            )
            images = []
            seen = set()
            for img in result.media.get("images", []):
                src = img.get("src") or ""
                if not src:
                    continue
                if src.startswith("//"):
                    src = "https:" + src
                if not src.startswith("http"):
                    continue
                if src in seen:
                    continue
                seen.add(src)
                alt = (img.get("alt") or img.get("desc") or "").strip()
                domain = urllib.parse.urlparse(src).hostname or ""
                images.append({
                    "title": alt or "Image",
                    "url": src,
                    "domain": domain,
                    "content": alt or "Image",
                })
                if len(images) >= self._default_limit:
                    break
            if not images:
                logger.warning(f"SearchService: no images parsed; media_count={len(result.media.get('images', []))}")
            else:
                logger.info(f"SearchService: parsed {len(images)} images via Crawl4AI media")
            return images
        except Exception as e:
            logger.error(f"Crawl4AI image search failed: {e}")
            return []
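For orientation, a rough driver for the service defined above, using only the methods visible in this hunk; the plugin's real call sites live in pipeline.py and __init__.py, which are not reproduced here, so the orchestration below is an assumption.

```python
import asyncio

# Hypothetical driver for the SearchService shown above; the real wiring in
# pipeline.py is not part of this hunk, so treat this purely as an illustration.
async def demo(config) -> None:
    service = SearchService(config)
    try:
        hits = await service.search("entari plugin")         # [{title, url, domain, content}, ...]
        if hits:
            page = await service.fetch_page(hits[0]["url"])   # {"content", "title", "url"}
            print(page["title"], len(page["content"]))
        images = await service.image_search("entari logo", prefetch=False)
        print(f"{len(hits)} results, {len(images)} images")
    finally:
        await service.close()           # closes the local crawler, if one was created
        await close_shared_crawler()    # tears down the module-level shared crawler

# asyncio.run(demo(search_config))  # e.g. with the config stand-in sketched earlier
```

Note that fetch_page only dispatches to 'jinaai' or the Crawl4AI default; _fetch_page_httpx is defined in this version but is not reachable from the public fetch_page switch.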