hammad-python 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ham/__init__.py +200 -0
- {hammad_python-0.0.30.dist-info → hammad_python-0.0.32.dist-info}/METADATA +6 -32
- hammad_python-0.0.32.dist-info/RECORD +6 -0
- hammad/__init__.py +0 -84
- hammad/_internal.py +0 -256
- hammad/_main.py +0 -226
- hammad/cache/__init__.py +0 -40
- hammad/cache/base_cache.py +0 -181
- hammad/cache/cache.py +0 -169
- hammad/cache/decorators.py +0 -261
- hammad/cache/file_cache.py +0 -80
- hammad/cache/ttl_cache.py +0 -74
- hammad/cli/__init__.py +0 -33
- hammad/cli/animations.py +0 -573
- hammad/cli/plugins.py +0 -867
- hammad/cli/styles/__init__.py +0 -55
- hammad/cli/styles/settings.py +0 -139
- hammad/cli/styles/types.py +0 -358
- hammad/cli/styles/utils.py +0 -634
- hammad/data/__init__.py +0 -90
- hammad/data/collections/__init__.py +0 -49
- hammad/data/collections/collection.py +0 -326
- hammad/data/collections/indexes/__init__.py +0 -37
- hammad/data/collections/indexes/qdrant/__init__.py +0 -1
- hammad/data/collections/indexes/qdrant/index.py +0 -723
- hammad/data/collections/indexes/qdrant/settings.py +0 -94
- hammad/data/collections/indexes/qdrant/utils.py +0 -210
- hammad/data/collections/indexes/tantivy/__init__.py +0 -1
- hammad/data/collections/indexes/tantivy/index.py +0 -426
- hammad/data/collections/indexes/tantivy/settings.py +0 -40
- hammad/data/collections/indexes/tantivy/utils.py +0 -176
- hammad/data/configurations/__init__.py +0 -35
- hammad/data/configurations/configuration.py +0 -564
- hammad/data/models/__init__.py +0 -50
- hammad/data/models/extensions/__init__.py +0 -4
- hammad/data/models/extensions/pydantic/__init__.py +0 -42
- hammad/data/models/extensions/pydantic/converters.py +0 -759
- hammad/data/models/fields.py +0 -546
- hammad/data/models/model.py +0 -1078
- hammad/data/models/utils.py +0 -280
- hammad/data/sql/__init__.py +0 -24
- hammad/data/sql/database.py +0 -576
- hammad/data/sql/types.py +0 -127
- hammad/data/types/__init__.py +0 -75
- hammad/data/types/file.py +0 -431
- hammad/data/types/multimodal/__init__.py +0 -36
- hammad/data/types/multimodal/audio.py +0 -200
- hammad/data/types/multimodal/image.py +0 -182
- hammad/data/types/text.py +0 -1308
- hammad/formatting/__init__.py +0 -33
- hammad/formatting/json/__init__.py +0 -27
- hammad/formatting/json/converters.py +0 -158
- hammad/formatting/text/__init__.py +0 -63
- hammad/formatting/text/converters.py +0 -723
- hammad/formatting/text/markdown.py +0 -131
- hammad/formatting/yaml/__init__.py +0 -26
- hammad/formatting/yaml/converters.py +0 -5
- hammad/genai/__init__.py +0 -217
- hammad/genai/a2a/__init__.py +0 -32
- hammad/genai/a2a/workers.py +0 -552
- hammad/genai/agents/__init__.py +0 -59
- hammad/genai/agents/agent.py +0 -1973
- hammad/genai/agents/run.py +0 -1024
- hammad/genai/agents/types/__init__.py +0 -42
- hammad/genai/agents/types/agent_context.py +0 -13
- hammad/genai/agents/types/agent_event.py +0 -128
- hammad/genai/agents/types/agent_hooks.py +0 -220
- hammad/genai/agents/types/agent_messages.py +0 -31
- hammad/genai/agents/types/agent_response.py +0 -125
- hammad/genai/agents/types/agent_stream.py +0 -327
- hammad/genai/graphs/__init__.py +0 -125
- hammad/genai/graphs/_utils.py +0 -190
- hammad/genai/graphs/base.py +0 -1828
- hammad/genai/graphs/plugins.py +0 -316
- hammad/genai/graphs/types.py +0 -638
- hammad/genai/models/__init__.py +0 -1
- hammad/genai/models/embeddings/__init__.py +0 -43
- hammad/genai/models/embeddings/model.py +0 -226
- hammad/genai/models/embeddings/run.py +0 -163
- hammad/genai/models/embeddings/types/__init__.py +0 -37
- hammad/genai/models/embeddings/types/embedding_model_name.py +0 -75
- hammad/genai/models/embeddings/types/embedding_model_response.py +0 -76
- hammad/genai/models/embeddings/types/embedding_model_run_params.py +0 -66
- hammad/genai/models/embeddings/types/embedding_model_settings.py +0 -47
- hammad/genai/models/language/__init__.py +0 -57
- hammad/genai/models/language/model.py +0 -1098
- hammad/genai/models/language/run.py +0 -878
- hammad/genai/models/language/types/__init__.py +0 -40
- hammad/genai/models/language/types/language_model_instructor_mode.py +0 -47
- hammad/genai/models/language/types/language_model_messages.py +0 -28
- hammad/genai/models/language/types/language_model_name.py +0 -239
- hammad/genai/models/language/types/language_model_request.py +0 -127
- hammad/genai/models/language/types/language_model_response.py +0 -217
- hammad/genai/models/language/types/language_model_response_chunk.py +0 -56
- hammad/genai/models/language/types/language_model_settings.py +0 -89
- hammad/genai/models/language/types/language_model_stream.py +0 -600
- hammad/genai/models/language/utils/__init__.py +0 -28
- hammad/genai/models/language/utils/requests.py +0 -421
- hammad/genai/models/language/utils/structured_outputs.py +0 -135
- hammad/genai/models/model_provider.py +0 -4
- hammad/genai/models/multimodal.py +0 -47
- hammad/genai/models/reranking.py +0 -26
- hammad/genai/types/__init__.py +0 -1
- hammad/genai/types/base.py +0 -215
- hammad/genai/types/history.py +0 -290
- hammad/genai/types/tools.py +0 -507
- hammad/logging/__init__.py +0 -35
- hammad/logging/decorators.py +0 -834
- hammad/logging/logger.py +0 -1018
- hammad/mcp/__init__.py +0 -53
- hammad/mcp/client/__init__.py +0 -35
- hammad/mcp/client/client.py +0 -624
- hammad/mcp/client/client_service.py +0 -400
- hammad/mcp/client/settings.py +0 -178
- hammad/mcp/servers/__init__.py +0 -26
- hammad/mcp/servers/launcher.py +0 -1161
- hammad/runtime/__init__.py +0 -32
- hammad/runtime/decorators.py +0 -142
- hammad/runtime/run.py +0 -299
- hammad/service/__init__.py +0 -49
- hammad/service/create.py +0 -527
- hammad/service/decorators.py +0 -283
- hammad/types.py +0 -288
- hammad/typing/__init__.py +0 -435
- hammad/web/__init__.py +0 -43
- hammad/web/http/__init__.py +0 -1
- hammad/web/http/client.py +0 -944
- hammad/web/models.py +0 -275
- hammad/web/openapi/__init__.py +0 -1
- hammad/web/openapi/client.py +0 -740
- hammad/web/search/__init__.py +0 -1
- hammad/web/search/client.py +0 -1023
- hammad/web/utils.py +0 -472
- hammad_python-0.0.30.dist-info/RECORD +0 -135
- {hammad → ham}/py.typed +0 -0
- {hammad_python-0.0.30.dist-info → hammad_python-0.0.32.dist-info}/WHEEL +0 -0
- {hammad_python-0.0.30.dist-info → hammad_python-0.0.32.dist-info}/licenses/LICENSE +0 -0
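Taken together, the file list shows 0.0.32 replacing the entire hammad package tree with a single ham/__init__.py module (the {hammad → ham}/py.typed entry records the package rename). One of the removed modules, hammad/web/search/client.py, is expanded in full below.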
hammad/web/search/client.py
DELETED
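The removed module bundled DuckDuckGo-backed web and news search with selectolax-based page parsing: an AsyncSearchClient with lazy imports and tenacity retries, a synchronous SearchClient wrapper, and a create_search_client factory.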
@@ -1,1023 +0,0 @@
-"""hammad.web.search.client"""
-
-from __future__ import annotations
-
-import asyncio
-import logging
-from typing import Any, Dict, List, Literal, Optional, Union, overload
-from urllib.parse import urljoin, urlparse
-
-import httpx
-from tenacity import (
-    AsyncRetrying,
-    retry_if_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-    before_sleep_log,
-)
-
-from ..models import (
-    SearchResult,
-    NewsResult,
-    SearchResults,
-    NewsResults,
-    WebPageResult,
-    WebPageErrorResult,
-    WebPageResults,
-    ExtractedLinks,
-    ExtractedLink,
-    LinkInfo,
-    ImageInfo,
-    SelectedElement,
-)
-
-__all__ = ("AsyncSearchClient", "SearchClient", "create_search_client")
-
-
-class AsyncSearchClient:
-    """
-    Search client that provides web search and page parsing capabilities.
-
-    This client uses lazy loading for DuckDuckGo search and selectolax HTML parsing
-    to minimize import overhead and memory usage.
-    """
-
-    def __init__(
-        self,
-        *,
-        timeout: float = 30.0,
-        max_concurrent: int = 5,
-        user_agent: Optional[str] = None,
-        default_headers: Optional[Dict[str, str]] = None,
-        max_retries: int = 3,
-    ):
-        """
-        Initialize the SearchClient.
-
-        Args:
-            timeout: Default timeout for HTTP requests in seconds
-            max_concurrent: Maximum number of concurrent requests for batch operations
-            user_agent: User-Agent header for HTTP requests
-            default_headers: Default headers to include in HTTP requests
-            max_retries: Maximum number of retry attempts for failed requests
-        """
-        self.timeout = timeout
-        self.max_concurrent = max_concurrent
-        self.user_agent = user_agent or "Mozilla/5.0 (compatible; SearchClient/1.0)"
-        self.default_headers = default_headers or {}
-        self.max_retries = max_retries
-
-        # Lazy-loaded resources
-        self._ddgs_client = None
-        self._selectolax_parser_class = None
-
-    def _get_duckduckgo_client(self):
-        """Get a DuckDuckGo search client using lazy import and singleton pattern."""
-        if self._ddgs_client is None:
-            try:
-                from ddgs import DDGS
-
-                self._ddgs_client = DDGS
-            except ImportError as e:
-                raise ImportError(
-                    "duckduckgo_search is required for web search functionality. "
-                    "Install with: pip install duckduckgo-search"
-                ) from e
-        return self._ddgs_client
-
-    def _get_selectolax_parser(self):
-        """Get selectolax HTMLParser class using lazy import and singleton pattern."""
-        if self._selectolax_parser_class is None:
-            try:
-                from selectolax.parser import HTMLParser
-
-                self._selectolax_parser_class = HTMLParser
-            except ImportError as e:
-                raise ImportError(
-                    "selectolax is required for HTML parsing functionality. "
-                    "Install with: pip install selectolax"
-                ) from e
-        return self._selectolax_parser_class
-
-    def _get_default_headers(self) -> Dict[str, str]:
-        """Get default headers for HTTP requests."""
-        headers = {"User-Agent": self.user_agent}
-        headers.update(self.default_headers)
-        return headers
-
-    async def search(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
-        backend: Literal["auto", "html", "lite"] = "auto",
-        max_retries: Optional[int] = None,
-    ) -> SearchResults:
-        """
-        (deprecated in favor of `web_search`)
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month, y=year)
-            backend: Search backend to use (default: "auto")
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of search result dictionaries with 'title', 'href', and 'body' keys
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        from rich import print
-
-        print(
-            "[bold yellow]WARNING: [/bold yellow] [yellow]Using `AsyncSearchClient.[bold light_salmon3]search[/bold light_salmon3]` is now deprecated in favor of `AsyncSearchClient.[bold light_salmon3]web_search[/bold light_salmon3]`[/yellow]"
-        )
-        return await self.web_search(
-            query,
-            max_results=max_results,
-            region=region,
-            safesearch=safesearch,
-            timelimit=timelimit,
-        )
-
-    async def web_search(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
-        backend: Literal["auto", "html", "lite"] = "auto",
-        max_retries: Optional[int] = None,
-    ) -> SearchResults:
-        """
-        Search the web using DuckDuckGo search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month, y=year)
-            backend: Search backend to use (default: "auto")
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of search result dictionaries with 'title', 'href', and 'body' keys
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        if not query or not query.strip():
-            raise ValueError("Query cannot be empty")
-
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_search():
-            DDGS = self._get_duckduckgo_client()
-            with DDGS() as ddgs:
-                raw_results = list(
-                    ddgs.text(
-                        query.strip(),
-                        region=region,
-                        safesearch=safesearch,
-                        timelimit=timelimit,
-                        backend=backend,
-                        max_results=max_results,
-                    )
-                )
-
-                # Convert raw results to SearchResult models
-                search_results = [
-                    SearchResult(
-                        title=result.get("title", ""),
-                        href=result.get("href", ""),
-                        body=result.get("body", ""),
-                    )
-                    for result in raw_results
-                ]
-
-                return SearchResults(query=query.strip(), results=search_results)
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type(Exception),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_search()
-
-    async def search_news(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m"]] = None,
-        max_retries: Optional[int] = None,
-    ) -> NewsResults:
-        """
-        Search for news using DuckDuckGo news search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of news result dictionaries with date, title, body, url, image, and source
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        if not query or not query.strip():
-            raise ValueError("Query cannot be empty")
-
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_news_search():
-            DDGS = self._get_duckduckgo_client()
-            with DDGS() as ddgs:
-                raw_results = list(
-                    ddgs.news(
-                        query.strip(),
-                        region=region,
-                        safesearch=safesearch,
-                        timelimit=timelimit,
-                        max_results=max_results,
-                    )
-                )
-
-                # Convert raw results to NewsResult models
-                news_results = [
-                    NewsResult(
-                        date=result.get("date", ""),
-                        title=result.get("title", ""),
-                        body=result.get("body", ""),
-                        url=result.get("url", ""),
-                        image=result.get("image", ""),
-                        source=result.get("source", ""),
-                    )
-                    for result in raw_results
-                ]
-
-                return NewsResults(query=query.strip(), results=news_results)
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type(Exception),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_news_search()
-
-    async def read_web_page(
-        self,
-        url: str,
-        *,
-        timeout: Optional[float] = None,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-        max_retries: Optional[int] = None,
-    ) -> WebPageResult:
-        """
-        Read and parse a single web page using selectolax.
-
-        Args:
-            url: URL to fetch and parse
-            timeout: Request timeout in seconds (uses default if not provided)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            Dictionary containing parsed content and metadata
-
-        Raises:
-            httpx.HTTPError: If request fails after all retries
-            Exception: If parsing fails
-        """
-        effective_headers = self._get_default_headers()
-        if headers:
-            effective_headers.update(headers)
-
-        request_timeout = timeout or self.timeout
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_fetch_and_parse():
-            async with httpx.AsyncClient(
-                timeout=request_timeout, follow_redirects=True
-            ) as client:
-                response = await client.get(url, headers=effective_headers)
-                response.raise_for_status()
-
-                # Parse HTML content
-                HTMLParser = self._get_selectolax_parser()
-                parser = HTMLParser(response.text)
-
-                title = ""
-                text = ""
-                links = []
-                images = []
-                selected_elements = []
-
-                # Extract title
-                title_node = parser.css_first("title")
-                if title_node:
-                    title = title_node.text(strip=True)
-
-                # Extract text content
-                if extract_text:
-                    if css_selector:
-                        selected_nodes = parser.css(css_selector)
-                        text = " ".join(
-                            node.text(strip=True) for node in selected_nodes
-                        )
-                    else:
-                        text = parser.text(strip=True)
-
-                # Extract links
-                if extract_links:
-                    link_nodes = parser.css("a[href]")
-                    links = [
-                        LinkInfo(
-                            href=node.attrs.get("href", ""),
-                            text=node.text(strip=True),
-                        )
-                        for node in link_nodes
-                        if node.attrs.get("href")
-                    ]
-
-                # Extract images
-                if extract_images:
-                    img_nodes = parser.css("img[src]")
-                    images = [
-                        ImageInfo(
-                            src=node.attrs.get("src", ""),
-                            alt=node.attrs.get("alt", ""),
-                            title=node.attrs.get("title", ""),
-                        )
-                        for node in img_nodes
-                        if node.attrs.get("src")
-                    ]
-
-                # Extract selected elements
-                if css_selector:
-                    selected_nodes = parser.css(css_selector)
-                    selected_elements = [
-                        SelectedElement(
-                            tag=node.tag,
-                            text=node.text(strip=True),
-                            html=node.html,
-                            attributes=dict(node.attributes),
-                        )
-                        for node in selected_nodes
-                    ]
-
-                return WebPageResult(
-                    url=url,
-                    status_code=response.status_code,
-                    content_type=response.headers.get("content-type", ""),
-                    title=title,
-                    text=text,
-                    links=links,
-                    images=images,
-                    selected_elements=selected_elements,
-                )
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_fetch_and_parse()
-
-    async def read_web_pages(
-        self,
-        urls: List[str],
-        *,
-        timeout: Optional[float] = None,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-        max_concurrent: Optional[int] = None,
-        max_retries: Optional[int] = None,
-    ) -> WebPageResults:
-        """
-        Read and parse multiple web pages concurrently using selectolax.
-
-        Args:
-            urls: List of URLs to fetch and parse
-            timeout: Request timeout in seconds (uses default if not provided)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-            max_concurrent: Maximum number of concurrent requests (uses default if not provided)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of dictionaries containing parsed content and metadata
-
-        Raises:
-            Exception: If any critical error occurs
-        """
-        if not urls:
-            return []
-
-        # Remove duplicates while preserving order
-        unique_urls = []
-        seen = set()
-        for url in urls:
-            if url not in seen:
-                unique_urls.append(url)
-                seen.add(url)
-
-        # Create semaphore for concurrency control
-        concurrent_limit = max_concurrent or self.max_concurrent
-        semaphore = asyncio.Semaphore(concurrent_limit)
-
-        async def fetch_page(url: str) -> Dict[str, Any]:
-            async with semaphore:
-                try:
-                    return await self.read_web_page(
-                        url=url,
-                        timeout=timeout,
-                        headers=headers,
-                        extract_text=extract_text,
-                        extract_links=extract_links,
-                        extract_images=extract_images,
-                        css_selector=css_selector,
-                        max_retries=max_retries,
-                    )
-                except Exception as e:
-                    return WebPageErrorResult(
-                        url=url,
-                        error=str(e),
-                        status_code=None,
-                        content_type="",
-                        title="",
-                        text="",
-                        links=[],
-                        images=[],
-                        selected_elements=[],
-                    )
-
-        # Execute all requests concurrently
-        tasks = [fetch_page(url) for url in unique_urls]
-        results = await asyncio.gather(*tasks, return_exceptions=False)
-
-        return WebPageResults(urls=unique_urls, results=results)
-
-    async def extract_page_links(
-        self,
-        url: str,
-        *,
-        timeout: Optional[float] = None,
-        headers: Optional[Dict[str, str]] = None,
-        css_selector: str = "a[href]",
-        include_external: bool = True,
-        include_internal: bool = True,
-        base_url: Optional[str] = None,
-        max_retries: Optional[int] = None,
-    ) -> ExtractedLinks:
-        """
-        Extract links from a web page using selectolax.
-
-        Args:
-            url: URL to fetch and extract links from
-            timeout: Request timeout in seconds (uses default if not provided)
-            headers: Optional HTTP headers to send
-            css_selector: CSS selector for links (default: "a[href]")
-            include_external: Whether to include external links (default: True)
-            include_internal: Whether to include internal links (default: True)
-            base_url: Base URL for resolving relative links (uses page URL if not provided)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of link dictionaries with href, text, title, and type (internal/external)
-
-        Raises:
-            httpx.HTTPError: If request fails after all retries
-            Exception: If parsing fails
-        """
-        effective_headers = self._get_default_headers()
-        if headers:
-            effective_headers.update(headers)
-
-        request_timeout = timeout or self.timeout
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_extract_links():
-            async with httpx.AsyncClient(
-                timeout=request_timeout, follow_redirects=True
-            ) as client:
-                response = await client.get(url, headers=effective_headers)
-                response.raise_for_status()
-
-                # Parse HTML content
-                HTMLParser = self._get_selectolax_parser()
-                parser = HTMLParser(response.text)
-
-                # Use provided base_url or extract from the page
-                effective_base_url = base_url or url
-
-                # Get the domain for internal/external classification
-                parsed_base = urlparse(effective_base_url)
-                base_domain = parsed_base.netloc
-
-                # Extract links
-                link_nodes = parser.css(css_selector)
-                links = []
-
-                for node in link_nodes:
-                    href = node.attrs.get("href", "").strip()
-                    if not href:
-                        continue
-
-                    # Resolve relative URLs
-                    absolute_href = urljoin(effective_base_url, href)
-                    parsed_href = urlparse(absolute_href)
-
-                    # Determine if link is internal or external
-                    is_internal = (
-                        parsed_href.netloc == base_domain or not parsed_href.netloc
-                    )
-                    link_type = "internal" if is_internal else "external"
-
-                    # Filter based on include flags
-                    if (is_internal and not include_internal) or (
-                        not is_internal and not include_external
-                    ):
-                        continue
-
-                    link_info = ExtractedLink(
-                        href=absolute_href,
-                        original_href=href,
-                        text=node.text(strip=True),
-                        title=node.attrs.get("title", ""),
-                        type=link_type,
-                    )
-
-                    links.append(link_info)
-
-                return ExtractedLinks(url=url, results=links)
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_extract_links()
-
-
-class SearchClient:
-    """
-    Synchronous wrapper around AsyncSearchClient.
-
-    This class provides a synchronous interface to the search functionality
-    by running async operations in an event loop.
-    """
-
-    def __init__(
-        self,
-        *,
-        timeout: float = 30.0,
-        max_concurrent: int = 5,
-        user_agent: Optional[str] = None,
-        default_headers: Optional[Dict[str, str]] = None,
-        max_retries: int = 3,
-    ):
-        """
-        Initialize the SearchClient.
-
-        Args:
-            timeout: Default timeout for HTTP requests in seconds
-            max_concurrent: Maximum number of concurrent requests for batch operations
-            user_agent: User-Agent header for HTTP requests
-            default_headers: Default headers to include in HTTP requests
-            max_retries: Maximum number of retry attempts for failed requests
-        """
-        self._async_client = AsyncSearchClient(
-            timeout=timeout,
-            max_concurrent=max_concurrent,
-            user_agent=user_agent,
-            default_headers=default_headers,
-            max_retries=max_retries,
-        )
-
-    def _run_async(self, coro):
-        """Run an async coroutine in a new event loop."""
-        try:
-            # Try to get the current event loop
-            loop = asyncio.get_running_loop()
-            # If we're already in an event loop, we need to use a thread
-            import concurrent.futures
-
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                future = executor.submit(asyncio.run, coro)
-                return future.result()
-        except RuntimeError:
-            # No event loop running, we can create our own
-            return asyncio.run(coro)
-
-    def search(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: str = "moderate",
-        backend: str = "api",
-    ) -> SearchResults:
-        """
-        Synchronous web search using DuckDuckGo.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting ("on", "moderate", "off")
-            backend: Search backend ("api", "html", "lite")
-
-        Returns:
-            List of search result dictionaries with keys: title, href, body
-        """
-        return self._run_async(
-            self._async_client.search(
-                query,
-                max_results=max_results,
-                region=region,
-                safesearch=safesearch,
-                backend=backend,
-            )
-        )
-
-    def get_page_content(
-        self,
-        url: str,
-        *,
-        timeout: Optional[float] = None,
-        retries: int = 3,
-        encoding: Optional[str] = None,
-    ) -> str:
-        """
-        Synchronously fetch and return the text content of a web page.
-
-        Args:
-            url: URL of the web page to fetch
-            timeout: Request timeout in seconds (uses client default if not specified)
-            retries: Number of retry attempts for failed requests
-            encoding: Text encoding to use (auto-detected if not specified)
-
-        Returns:
-            Plain text content of the web page
-        """
-        return self._run_async(
-            self._async_client.get_page_content(
-                url, timeout=timeout, retries=retries, encoding=encoding
-            )
-        )
-
-    def extract_links(
-        self,
-        url: str,
-        *,
-        css_selector: str = "a[href]",
-        include_internal: bool = True,
-        include_external: bool = True,
-        timeout: Optional[float] = None,
-        retries: int = 3,
-    ) -> ExtractedLinks:
-        """
-        Synchronously extract links from a web page.
-
-        Args:
-            url: URL of the web page to parse
-            css_selector: CSS selector for link elements
-            include_internal: Whether to include internal links
-            include_external: Whether to include external links
-            timeout: Request timeout in seconds
-            retries: Number of retry attempts for failed requests
-
-        Returns:
-            List of link dictionaries with keys: href, original_href, text, title, type
-        """
-        return self._run_async(
-            self._async_client.extract_links(
-                url,
-                css_selector=css_selector,
-                include_internal=include_internal,
-                include_external=include_external,
-                timeout=timeout,
-                retries=retries,
-            )
-        )
-
-    def web_search(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
-        backend: Literal["auto", "html", "lite"] = "auto",
-        max_retries: Optional[int] = None,
-    ) -> SearchResults:
-        """
-        Synchronously search the web using DuckDuckGo search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month, y=year)
-            backend: Search backend to use (default: "auto")
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of search result dictionaries with 'title', 'href', and 'body' keys
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        return self._run_async(
-            self._async_client.web_search(
-                query=query,
-                max_results=max_results,
-                region=region,
-                safesearch=safesearch,
-                timelimit=timelimit,
-                backend=backend,
-                max_retries=max_retries,
-            )
-        )
-
-    def search_news(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m"]] = None,
-        max_retries: Optional[int] = None,
-    ) -> NewsResults:
-        """
-        Synchronously search for news using DuckDuckGo news search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of news result dictionaries with date, title, body, url, image, and source
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        return self._run_async(
-            self._async_client.search_news(
-                query=query,
-                max_results=max_results,
-                region=region,
-                safesearch=safesearch,
-                timelimit=timelimit,
-                max_retries=max_retries,
-            )
-        )
-
-    def read_web_page(
-        self,
-        url: str,
-        *,
-        timeout: float = 30.0,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-    ) -> WebPageResult:
-        """
-        Synchronously read and parse a single web page using selectolax.
-
-        Args:
-            url: URL to fetch and parse
-            timeout: Request timeout in seconds (default: 30.0)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-
-        Returns:
-            Dictionary containing parsed content and metadata
-
-        Raises:
-            httpx.HTTPError: If request fails
-            Exception: If parsing fails
-        """
-        return self._run_async(
-            self._async_client.read_web_page(
-                url=url,
-                timeout=timeout,
-                headers=headers,
-                extract_text=extract_text,
-                extract_links=extract_links,
-                extract_images=extract_images,
-                css_selector=css_selector,
-            )
-        )
-
-    def read_web_pages(
-        self,
-        urls: List[str],
-        *,
-        timeout: float = 30.0,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-        max_concurrent: Optional[int] = None,
-    ) -> WebPageResults:
-        """
-        Synchronously read and parse multiple web pages concurrently using selectolax.
-
-        Args:
-            urls: List of URLs to fetch and parse
-            timeout: Request timeout in seconds (default: 30.0)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-            max_concurrent: Maximum concurrent requests (uses client default if not specified)
-
-        Returns:
-            List of dictionaries containing parsed content and metadata for each URL
-
-        Raises:
-            httpx.HTTPError: If requests fail
-            Exception: If parsing fails
-        """
-        return self._run_async(
-            self._async_client.read_web_pages(
-                urls=urls,
-                timeout=timeout,
-                headers=headers,
-                extract_text=extract_text,
-                extract_links=extract_links,
-                extract_images=extract_images,
-                css_selector=css_selector,
-                max_concurrent=max_concurrent,
-            )
-        )
-
-    def extract_page_links(
-        self,
-        url: str,
-        *,
-        timeout: float = 30.0,
-        headers: Optional[Dict[str, str]] = None,
-        css_selector: str = "a[href]",
-        include_internal: bool = True,
-        include_external: bool = True,
-        base_url: Optional[str] = None,
-    ) -> ExtractedLinks:
-        """
-        Synchronously extract all links from a web page.
-
-        Args:
-            url: URL to fetch and extract links from
-            timeout: Request timeout in seconds (default: 30.0)
-            headers: Optional HTTP headers to send
-            css_selector: CSS selector for link elements (default: "a[href]")
-            include_internal: Whether to include internal links (default: True)
-            include_external: Whether to include external links (default: True)
-            base_url: Base URL for resolving relative links (uses page URL if not provided)
-
-        Returns:
-            List of link dictionaries with 'href', 'original_href', 'text', 'title', and 'type' keys
-
-        Raises:
-            httpx.HTTPError: If request fails
-            Exception: If parsing fails
-        """
-        return self._run_async(
-            self._async_client.extract_page_links(
-                url=url,
-                timeout=timeout,
-                headers=headers,
-                css_selector=css_selector,
-                include_internal=include_internal,
-                include_external=include_external,
-                base_url=base_url,
-            )
-        )
-
-    def close(self):
-        """Close the underlying async client."""
-        pass
-
-    def __enter__(self):
-        """Context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Context manager exit."""
-        self.close()
-
-
-@overload
-def create_search_client(
-    *,
-    timeout: float = 30.0,
-    max_concurrent: int = 5,
-    user_agent: Optional[str] = None,
-    default_headers: Optional[Dict[str, str]] = None,
-    max_retries: int = 3,
-    async_client: Literal[True],
-) -> AsyncSearchClient: ...
-
-
-@overload
-def create_search_client(
-    *,
-    timeout: float = 30.0,
-    max_concurrent: int = 5,
-    user_agent: Optional[str] = None,
-    default_headers: Optional[Dict[str, str]] = None,
-    max_retries: int = 3,
-    async_client: Literal[False] = ...,
-) -> SearchClient: ...
-
-
-def create_search_client(
-    *,
-    timeout: float = 30.0,
-    max_concurrent: int = 5,
-    user_agent: Optional[str] = None,
-    default_headers: Optional[Dict[str, str]] = None,
-    max_retries: int = 3,
-    async_client: bool = False,
-) -> Union[SearchClient, AsyncSearchClient]:
-    """
-    Create a new SearchClient instance.
-
-    Args:
-        timeout: Default timeout for HTTP requests in seconds
-        max_concurrent: Maximum number of concurrent requests for batch operations
-        user_agent: User-Agent header for HTTP requests
-        default_headers: Default headers to include in HTTP requests
-        max_retries: Maximum number of retry attempts for failed requests
-        async_client: Whether to return an async client instance
-
-    Returns:
-        SearchClient or AsyncSearchClient instance based on async_client parameter
-    """
-    params = locals()
-    del params["async_client"]
-
-    if async_client:
-        return AsyncSearchClient(**params)
-    else:
-        return SearchClient(**params)
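For context on what callers lose in 0.0.32, here is a minimal usage sketch of the removed API as it stood at 0.0.30. This is a hypothetical example reconstructed from the signatures above, not code shipped in the package; it assumes hammad-python==0.0.30 with the optional ddgs and selectolax dependencies installed, and that the result models (SearchResults, WebPageResult, ExtractedLinks) expose their constructor fields as attributes.

import asyncio

from hammad.web.search.client import create_search_client

# Synchronous wrapper: each call drives the async client through _run_async().
client = create_search_client()
results = client.web_search("selectolax html parsing", max_results=5)
for result in results.results:
    print(result.title, result.href)

news = client.search_news("python packaging", timelimit="w")
print(len(news.results))

# Async client: selected by the async_client=True overload.
async def main() -> None:
    aclient = create_search_client(async_client=True)
    page = await aclient.read_web_page("https://example.com", extract_links=True)
    print(page.status_code, page.title, len(page.links))

    links = await aclient.extract_page_links(
        "https://example.com", include_external=False
    )
    print([link.href for link in links.results])

asyncio.run(main())

The _run_async helper is what lets the synchronous wrapper work both inside and outside an event loop: with no loop running it simply calls asyncio.run(coro), and when invoked from a running loop it submits asyncio.run to a ThreadPoolExecutor so the coroutine executes on a fresh loop in a worker thread.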