hammad-python 0.0.14-py3-none-any.whl → 0.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. hammad_python-0.0.15.dist-info/METADATA +184 -0
  2. hammad_python-0.0.15.dist-info/RECORD +4 -0
  3. hammad/__init__.py +0 -1
  4. hammad/ai/__init__.py +0 -1
  5. hammad/ai/_utils.py +0 -142
  6. hammad/ai/completions/__init__.py +0 -45
  7. hammad/ai/completions/client.py +0 -684
  8. hammad/ai/completions/create.py +0 -710
  9. hammad/ai/completions/settings.py +0 -100
  10. hammad/ai/completions/types.py +0 -792
  11. hammad/ai/completions/utils.py +0 -486
  12. hammad/ai/embeddings/__init__.py +0 -35
  13. hammad/ai/embeddings/client/__init__.py +0 -1
  14. hammad/ai/embeddings/client/base_embeddings_client.py +0 -26
  15. hammad/ai/embeddings/client/fastembed_text_embeddings_client.py +0 -200
  16. hammad/ai/embeddings/client/litellm_embeddings_client.py +0 -288
  17. hammad/ai/embeddings/create.py +0 -159
  18. hammad/ai/embeddings/types.py +0 -69
  19. hammad/cache/__init__.py +0 -40
  20. hammad/cache/base_cache.py +0 -181
  21. hammad/cache/cache.py +0 -169
  22. hammad/cache/decorators.py +0 -261
  23. hammad/cache/file_cache.py +0 -80
  24. hammad/cache/ttl_cache.py +0 -74
  25. hammad/cli/__init__.py +0 -33
  26. hammad/cli/animations.py +0 -573
  27. hammad/cli/plugins.py +0 -781
  28. hammad/cli/styles/__init__.py +0 -55
  29. hammad/cli/styles/settings.py +0 -139
  30. hammad/cli/styles/types.py +0 -358
  31. hammad/cli/styles/utils.py +0 -480
  32. hammad/data/__init__.py +0 -56
  33. hammad/data/collections/__init__.py +0 -34
  34. hammad/data/collections/base_collection.py +0 -58
  35. hammad/data/collections/collection.py +0 -452
  36. hammad/data/collections/searchable_collection.py +0 -556
  37. hammad/data/collections/vector_collection.py +0 -596
  38. hammad/data/configurations/__init__.py +0 -35
  39. hammad/data/configurations/configuration.py +0 -564
  40. hammad/data/databases/__init__.py +0 -21
  41. hammad/data/databases/database.py +0 -902
  42. hammad/data/models/__init__.py +0 -44
  43. hammad/data/models/base/__init__.py +0 -35
  44. hammad/data/models/base/fields.py +0 -546
  45. hammad/data/models/base/model.py +0 -1078
  46. hammad/data/models/base/utils.py +0 -280
  47. hammad/data/models/pydantic/__init__.py +0 -55
  48. hammad/data/models/pydantic/converters.py +0 -632
  49. hammad/data/models/pydantic/models/__init__.py +0 -28
  50. hammad/data/models/pydantic/models/arbitrary_model.py +0 -46
  51. hammad/data/models/pydantic/models/cacheable_model.py +0 -79
  52. hammad/data/models/pydantic/models/fast_model.py +0 -318
  53. hammad/data/models/pydantic/models/function_model.py +0 -176
  54. hammad/data/models/pydantic/models/subscriptable_model.py +0 -63
  55. hammad/data/types/__init__.py +0 -41
  56. hammad/data/types/file.py +0 -358
  57. hammad/data/types/multimodal/__init__.py +0 -24
  58. hammad/data/types/multimodal/audio.py +0 -96
  59. hammad/data/types/multimodal/image.py +0 -80
  60. hammad/data/types/text.py +0 -1066
  61. hammad/formatting/__init__.py +0 -38
  62. hammad/formatting/json/__init__.py +0 -21
  63. hammad/formatting/json/converters.py +0 -152
  64. hammad/formatting/text/__init__.py +0 -63
  65. hammad/formatting/text/converters.py +0 -723
  66. hammad/formatting/text/markdown.py +0 -131
  67. hammad/formatting/yaml/__init__.py +0 -26
  68. hammad/formatting/yaml/converters.py +0 -5
  69. hammad/logging/__init__.py +0 -35
  70. hammad/logging/decorators.py +0 -834
  71. hammad/logging/logger.py +0 -954
  72. hammad/mcp/__init__.py +0 -50
  73. hammad/mcp/client/__init__.py +0 -1
  74. hammad/mcp/client/client.py +0 -523
  75. hammad/mcp/client/client_service.py +0 -393
  76. hammad/mcp/client/settings.py +0 -178
  77. hammad/mcp/servers/__init__.py +0 -1
  78. hammad/mcp/servers/launcher.py +0 -1161
  79. hammad/performance/__init__.py +0 -36
  80. hammad/performance/imports.py +0 -231
  81. hammad/performance/runtime/__init__.py +0 -32
  82. hammad/performance/runtime/decorators.py +0 -142
  83. hammad/performance/runtime/run.py +0 -299
  84. hammad/py.typed +0 -0
  85. hammad/service/__init__.py +0 -49
  86. hammad/service/create.py +0 -532
  87. hammad/service/decorators.py +0 -285
  88. hammad/typing/__init__.py +0 -407
  89. hammad/web/__init__.py +0 -43
  90. hammad/web/http/__init__.py +0 -1
  91. hammad/web/http/client.py +0 -944
  92. hammad/web/models.py +0 -245
  93. hammad/web/openapi/__init__.py +0 -1
  94. hammad/web/openapi/client.py +0 -740
  95. hammad/web/search/__init__.py +0 -1
  96. hammad/web/search/client.py +0 -988
  97. hammad/web/utils.py +0 -472
  98. hammad_python-0.0.14.dist-info/METADATA +0 -70
  99. hammad_python-0.0.14.dist-info/RECORD +0 -99
  100. {hammad_python-0.0.14.dist-info → hammad_python-0.0.15.dist-info}/WHEEL +0 -0
  101. {hammad_python-0.0.14.dist-info → hammad_python-0.0.15.dist-info}/licenses/LICENSE +0 -0
hammad/web/search/client.py
@@ -1,988 +0,0 @@
-"""hammad.web.search.client"""
-
-from __future__ import annotations
-
-import asyncio
-import logging
-from typing import Any, Dict, List, Literal, Optional, Union, overload
-from urllib.parse import urljoin, urlparse
-
-import httpx
-from tenacity import (
-    AsyncRetrying,
-    retry_if_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-    before_sleep_log,
-)
-
-from ..models import (
-    SearchResults,
-    NewsResults,
-    WebPageResult,
-    WebPageErrorResult,
-    WebPageResults,
-    ExtractedLinks,
-)
-
-__all__ = ("AsyncSearchClient", "SearchClient", "create_search_client")
-
-
-class AsyncSearchClient:
-    """
-    Search client that provides web search and page parsing capabilities.
-
-    This client uses lazy loading for DuckDuckGo search and selectolax HTML parsing
-    to minimize import overhead and memory usage.
-    """
-
-    def __init__(
-        self,
-        *,
-        timeout: float = 30.0,
-        max_concurrent: int = 5,
-        user_agent: Optional[str] = None,
-        default_headers: Optional[Dict[str, str]] = None,
-        max_retries: int = 3,
-    ):
-        """
-        Initialize the SearchClient.
-
-        Args:
-            timeout: Default timeout for HTTP requests in seconds
-            max_concurrent: Maximum number of concurrent requests for batch operations
-            user_agent: User-Agent header for HTTP requests
-            default_headers: Default headers to include in HTTP requests
-            max_retries: Maximum number of retry attempts for failed requests
-        """
-        self.timeout = timeout
-        self.max_concurrent = max_concurrent
-        self.user_agent = user_agent or "Mozilla/5.0 (compatible; SearchClient/1.0)"
-        self.default_headers = default_headers or {}
-        self.max_retries = max_retries
-
-        # Lazy-loaded resources
-        self._ddgs_client = None
-        self._selectolax_parser_class = None
-
-    def _get_duckduckgo_client(self):
-        """Get a DuckDuckGo search client using lazy import and singleton pattern."""
-        if self._ddgs_client is None:
-            try:
-                from duckduckgo_search import DDGS
-
-                self._ddgs_client = DDGS
-            except ImportError as e:
-                raise ImportError(
-                    "duckduckgo_search is required for web search functionality. "
-                    "Install with: pip install duckduckgo-search"
-                ) from e
-        return self._ddgs_client
-
-    def _get_selectolax_parser(self):
-        """Get selectolax HTMLParser class using lazy import and singleton pattern."""
-        if self._selectolax_parser_class is None:
-            try:
-                from selectolax.parser import HTMLParser
-
-                self._selectolax_parser_class = HTMLParser
-            except ImportError as e:
-                raise ImportError(
-                    "selectolax is required for HTML parsing functionality. "
-                    "Install with: pip install selectolax"
-                ) from e
-        return self._selectolax_parser_class
-
-    def _get_default_headers(self) -> Dict[str, str]:
-        """Get default headers for HTTP requests."""
-        headers = {"User-Agent": self.user_agent}
-        headers.update(self.default_headers)
-        return headers
-
-    async def search(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
-        backend: Literal["auto", "html", "lite"] = "auto",
-        max_retries: Optional[int] = None,
-    ) -> SearchResults:
-        """
-        (deprecated in favor of `search_web`)
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month, y=year)
-            backend: Search backend to use (default: "auto")
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of search result dictionaries with 'title', 'href', and 'body' keys
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        from rich import print
-
-        print(
-            "[bold yellow]WARNING: [/bold yellow] [yellow]Using `AsyncSearchClient.[bold light_salmon3]search[/bold light_salmon3]` is now deprecated in favor of `AsyncSearchClient.[bold light_salmon3]search_web[/bold light_salmon3]`[/yellow]"
-        )
-        return await self.search_web(
-            query,
-            max_results=max_results,
-            region=region,
-            safesearch=safesearch,
-            timelimit=timelimit,
-        )
-
-    async def search_web(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
-        backend: Literal["auto", "html", "lite"] = "auto",
-        max_retries: Optional[int] = None,
-    ) -> SearchResults:
-        """
-        Search the web using DuckDuckGo search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month, y=year)
-            backend: Search backend to use (default: "auto")
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of search result dictionaries with 'title', 'href', and 'body' keys
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        if not query or not query.strip():
-            raise ValueError("Query cannot be empty")
-
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_search():
-            DDGS = self._get_duckduckgo_client()
-            with DDGS() as ddgs:
-                results = list(
-                    ddgs.text(
-                        keywords=query.strip(),
-                        region=region,
-                        safesearch=safesearch,
-                        timelimit=timelimit,
-                        backend=backend,
-                        max_results=max_results,
-                    )
-                )
-                return results
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type(Exception),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_search()
-
-    async def search_news(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m"]] = None,
-        max_retries: Optional[int] = None,
-    ) -> NewsResults:
-        """
-        Search for news using DuckDuckGo news search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of news result dictionaries with date, title, body, url, image, and source
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        if not query or not query.strip():
-            raise ValueError("Query cannot be empty")
-
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_news_search():
-            DDGS = self._get_duckduckgo_client()
-            with DDGS() as ddgs:
-                results = list(
-                    ddgs.news(
-                        keywords=query.strip(),
-                        region=region,
-                        safesearch=safesearch,
-                        timelimit=timelimit,
-                        max_results=max_results,
-                    )
-                )
-                return results
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type(Exception),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_news_search()
-
-    async def read_web_page(
-        self,
-        url: str,
-        *,
-        timeout: Optional[float] = None,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-        max_retries: Optional[int] = None,
-    ) -> WebPageResult:
-        """
-        Read and parse a single web page using selectolax.
-
-        Args:
-            url: URL to fetch and parse
-            timeout: Request timeout in seconds (uses default if not provided)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            Dictionary containing parsed content and metadata
-
-        Raises:
-            httpx.HTTPError: If request fails after all retries
-            Exception: If parsing fails
-        """
-        effective_headers = self._get_default_headers()
-        if headers:
-            effective_headers.update(headers)
-
-        request_timeout = timeout or self.timeout
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_fetch_and_parse():
-            async with httpx.AsyncClient(
-                timeout=request_timeout, follow_redirects=True
-            ) as client:
-                response = await client.get(url, headers=effective_headers)
-                response.raise_for_status()
-
-                # Parse HTML content
-                HTMLParser = self._get_selectolax_parser()
-                parser = HTMLParser(response.text)
-
-                result = {
-                    "url": url,
-                    "status_code": response.status_code,
-                    "content_type": response.headers.get("content-type", ""),
-                    "title": "",
-                    "text": "",
-                    "links": [],
-                    "images": [],
-                    "selected_elements": [],
-                }
-
-                # Extract title
-                title_node = parser.css_first("title")
-                if title_node:
-                    result["title"] = title_node.text(strip=True)
-
-                # Extract text content
-                if extract_text:
-                    if css_selector:
-                        selected_nodes = parser.css(css_selector)
-                        result["text"] = " ".join(
-                            node.text(strip=True) for node in selected_nodes
-                        )
-                    else:
-                        result["text"] = parser.text(strip=True)
-
-                # Extract links
-                if extract_links:
-                    link_nodes = parser.css("a[href]")
-                    result["links"] = [
-                        {
-                            "href": node.attrs.get("href", ""),
-                            "text": node.text(strip=True),
-                        }
-                        for node in link_nodes
-                        if node.attrs.get("href")
-                    ]
-
-                # Extract images
-                if extract_images:
-                    img_nodes = parser.css("img[src]")
-                    result["images"] = [
-                        {
-                            "src": node.attrs.get("src", ""),
-                            "alt": node.attrs.get("alt", ""),
-                            "title": node.attrs.get("title", ""),
-                        }
-                        for node in img_nodes
-                        if node.attrs.get("src")
-                    ]
-
-                # Extract selected elements
-                if css_selector:
-                    selected_nodes = parser.css(css_selector)
-                    result["selected_elements"] = [
-                        {
-                            "tag": node.tag,
-                            "text": node.text(strip=True),
-                            "html": node.html,
-                            "attributes": dict(node.attributes),
-                        }
-                        for node in selected_nodes
-                    ]
-
-                return result
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_fetch_and_parse()
-
-    async def read_web_pages(
-        self,
-        urls: List[str],
-        *,
-        timeout: Optional[float] = None,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-        max_concurrent: Optional[int] = None,
-        max_retries: Optional[int] = None,
-    ) -> WebPageResults:
-        """
-        Read and parse multiple web pages concurrently using selectolax.
-
-        Args:
-            urls: List of URLs to fetch and parse
-            timeout: Request timeout in seconds (uses default if not provided)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-            max_concurrent: Maximum number of concurrent requests (uses default if not provided)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of dictionaries containing parsed content and metadata
-
-        Raises:
-            Exception: If any critical error occurs
-        """
-        if not urls:
-            return []
-
-        # Remove duplicates while preserving order
-        unique_urls = []
-        seen = set()
-        for url in urls:
-            if url not in seen:
-                unique_urls.append(url)
-                seen.add(url)
-
-        # Create semaphore for concurrency control
-        concurrent_limit = max_concurrent or self.max_concurrent
-        semaphore = asyncio.Semaphore(concurrent_limit)
-
-        async def fetch_page(url: str) -> Dict[str, Any]:
-            async with semaphore:
-                try:
-                    return await self.read_web_page(
-                        url=url,
-                        timeout=timeout,
-                        headers=headers,
-                        extract_text=extract_text,
-                        extract_links=extract_links,
-                        extract_images=extract_images,
-                        css_selector=css_selector,
-                        max_retries=max_retries,
-                    )
-                except Exception as e:
-                    return WebPageErrorResult(
-                        url=url,
-                        error=str(e),
-                        status_code=None,
-                        content_type="",
-                        title="",
-                        text="",
-                        links=[],
-                        images=[],
-                        selected_elements=[],
-                    )
-
-        # Execute all requests concurrently
-        tasks = [fetch_page(url) for url in unique_urls]
-        results = await asyncio.gather(*tasks, return_exceptions=False)
-
-        return results
-
-    async def extract_page_links(
-        self,
-        url: str,
-        *,
-        timeout: Optional[float] = None,
-        headers: Optional[Dict[str, str]] = None,
-        css_selector: str = "a[href]",
-        include_external: bool = True,
-        include_internal: bool = True,
-        base_url: Optional[str] = None,
-        max_retries: Optional[int] = None,
-    ) -> ExtractedLinks:
-        """
-        Extract links from a web page using selectolax.
-
-        Args:
-            url: URL to fetch and extract links from
-            timeout: Request timeout in seconds (uses default if not provided)
-            headers: Optional HTTP headers to send
-            css_selector: CSS selector for links (default: "a[href]")
-            include_external: Whether to include external links (default: True)
-            include_internal: Whether to include internal links (default: True)
-            base_url: Base URL for resolving relative links (uses page URL if not provided)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of link dictionaries with href, text, title, and type (internal/external)
-
-        Raises:
-            httpx.HTTPError: If request fails after all retries
-            Exception: If parsing fails
-        """
-        effective_headers = self._get_default_headers()
-        if headers:
-            effective_headers.update(headers)
-
-        request_timeout = timeout or self.timeout
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_extract_links():
-            async with httpx.AsyncClient(
-                timeout=request_timeout, follow_redirects=True
-            ) as client:
-                response = await client.get(url, headers=effective_headers)
-                response.raise_for_status()
-
-                # Parse HTML content
-                HTMLParser = self._get_selectolax_parser()
-                parser = HTMLParser(response.text)
-
-                # Use provided base_url or extract from the page
-                effective_base_url = base_url or url
-
-                # Get the domain for internal/external classification
-                parsed_base = urlparse(effective_base_url)
-                base_domain = parsed_base.netloc
-
-                # Extract links
-                link_nodes = parser.css(css_selector)
-                links = []
-
-                for node in link_nodes:
-                    href = node.attrs.get("href", "").strip()
-                    if not href:
-                        continue
-
-                    # Resolve relative URLs
-                    absolute_href = urljoin(effective_base_url, href)
-                    parsed_href = urlparse(absolute_href)
-
-                    # Determine if link is internal or external
-                    is_internal = (
-                        parsed_href.netloc == base_domain or not parsed_href.netloc
-                    )
-                    link_type = "internal" if is_internal else "external"
-
-                    # Filter based on include flags
-                    if (is_internal and not include_internal) or (
-                        not is_internal and not include_external
-                    ):
-                        continue
-
-                    link_info = {
-                        "href": absolute_href,
-                        "original_href": href,
-                        "text": node.text(strip=True),
-                        "title": node.attrs.get("title", ""),
-                        "type": link_type,
-                    }
-
-                    links.append(link_info)
-
-                return links
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_extract_links()
-
-
-class SearchClient:
-    """
-    Synchronous wrapper around AsyncSearchClient.
-
-    This class provides a synchronous interface to the search functionality
-    by running async operations in an event loop.
-    """
-
-    def __init__(
-        self,
-        *,
-        timeout: float = 30.0,
-        max_concurrent: int = 5,
-        user_agent: Optional[str] = None,
-        default_headers: Optional[Dict[str, str]] = None,
-        max_retries: int = 3,
-    ):
-        """
-        Initialize the SearchClient.
-
-        Args:
-            timeout: Default timeout for HTTP requests in seconds
-            max_concurrent: Maximum number of concurrent requests for batch operations
-            user_agent: User-Agent header for HTTP requests
-            default_headers: Default headers to include in HTTP requests
-            max_retries: Maximum number of retry attempts for failed requests
-        """
-        self._async_client = AsyncSearchClient(
-            timeout=timeout,
-            max_concurrent=max_concurrent,
-            user_agent=user_agent,
-            default_headers=default_headers,
-            max_retries=max_retries,
-        )
-
-    def _run_async(self, coro):
-        """Run an async coroutine in a new event loop."""
-        try:
-            # Try to get the current event loop
-            loop = asyncio.get_running_loop()
-            # If we're already in an event loop, we need to use a thread
-            import concurrent.futures
-
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                future = executor.submit(asyncio.run, coro)
-                return future.result()
-        except RuntimeError:
-            # No event loop running, we can create our own
-            return asyncio.run(coro)
-
-    def search(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: str = "moderate",
-        backend: str = "api",
-    ) -> SearchResults:
-        """
-        Synchronous web search using DuckDuckGo.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting ("on", "moderate", "off")
-            backend: Search backend ("api", "html", "lite")
-
-        Returns:
-            List of search result dictionaries with keys: title, href, body
-        """
-        return self._run_async(
-            self._async_client.search(
-                query,
-                max_results=max_results,
-                region=region,
-                safesearch=safesearch,
-                backend=backend,
-            )
-        )
-
-    def get_page_content(
-        self,
-        url: str,
-        *,
-        timeout: Optional[float] = None,
-        retries: int = 3,
-        encoding: Optional[str] = None,
-    ) -> str:
-        """
-        Synchronously fetch and return the text content of a web page.
-
-        Args:
-            url: URL of the web page to fetch
-            timeout: Request timeout in seconds (uses client default if not specified)
-            retries: Number of retry attempts for failed requests
-            encoding: Text encoding to use (auto-detected if not specified)
-
-        Returns:
-            Plain text content of the web page
-        """
-        return self._run_async(
-            self._async_client.get_page_content(
-                url, timeout=timeout, retries=retries, encoding=encoding
-            )
-        )
-
-    def extract_links(
-        self,
-        url: str,
-        *,
-        css_selector: str = "a[href]",
-        include_internal: bool = True,
-        include_external: bool = True,
-        timeout: Optional[float] = None,
-        retries: int = 3,
-    ) -> ExtractedLinks:
-        """
-        Synchronously extract links from a web page.
-
-        Args:
-            url: URL of the web page to parse
-            css_selector: CSS selector for link elements
-            include_internal: Whether to include internal links
-            include_external: Whether to include external links
-            timeout: Request timeout in seconds
-            retries: Number of retry attempts for failed requests
-
-        Returns:
-            List of link dictionaries with keys: href, original_href, text, title, type
-        """
-        return self._run_async(
-            self._async_client.extract_links(
-                url,
-                css_selector=css_selector,
-                include_internal=include_internal,
-                include_external=include_external,
-                timeout=timeout,
-                retries=retries,
-            )
-        )
-
-    def search_web(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
-        backend: Literal["auto", "html", "lite"] = "auto",
-        max_retries: Optional[int] = None,
-    ) -> SearchResults:
-        """
-        Synchronously search the web using DuckDuckGo search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month, y=year)
-            backend: Search backend to use (default: "auto")
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of search result dictionaries with 'title', 'href', and 'body' keys
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        return self._run_async(
-            self._async_client.search_web(
-                query=query,
-                max_results=max_results,
-                region=region,
-                safesearch=safesearch,
-                timelimit=timelimit,
-                backend=backend,
-                max_retries=max_retries,
-            )
-        )
-
-    def search_news(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m"]] = None,
-        max_retries: Optional[int] = None,
-    ) -> NewsResults:
-        """
-        Synchronously search for news using DuckDuckGo news search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of news result dictionaries with date, title, body, url, image, and source
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        return self._run_async(
-            self._async_client.search_news(
-                query=query,
-                max_results=max_results,
-                region=region,
-                safesearch=safesearch,
-                timelimit=timelimit,
-                max_retries=max_retries,
-            )
-        )
-
-    def read_web_page(
-        self,
-        url: str,
-        *,
-        timeout: float = 30.0,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-    ) -> WebPageResult:
-        """
-        Synchronously read and parse a single web page using selectolax.
-
-        Args:
-            url: URL to fetch and parse
-            timeout: Request timeout in seconds (default: 30.0)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-
-        Returns:
-            Dictionary containing parsed content and metadata
-
-        Raises:
-            httpx.HTTPError: If request fails
-            Exception: If parsing fails
-        """
-        return self._run_async(
-            self._async_client.read_web_page(
-                url=url,
-                timeout=timeout,
-                headers=headers,
-                extract_text=extract_text,
-                extract_links=extract_links,
-                extract_images=extract_images,
-                css_selector=css_selector,
-            )
-        )
-
-    def read_web_pages(
-        self,
-        urls: List[str],
-        *,
-        timeout: float = 30.0,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-        max_concurrent: Optional[int] = None,
-    ) -> WebPageResults:
-        """
-        Synchronously read and parse multiple web pages concurrently using selectolax.
-
-        Args:
-            urls: List of URLs to fetch and parse
-            timeout: Request timeout in seconds (default: 30.0)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-            max_concurrent: Maximum concurrent requests (uses client default if not specified)
-
-        Returns:
-            List of dictionaries containing parsed content and metadata for each URL
-
-        Raises:
-            httpx.HTTPError: If requests fail
-            Exception: If parsing fails
-        """
-        return self._run_async(
-            self._async_client.read_web_pages(
-                urls=urls,
-                timeout=timeout,
-                headers=headers,
-                extract_text=extract_text,
-                extract_links=extract_links,
-                extract_images=extract_images,
-                css_selector=css_selector,
-                max_concurrent=max_concurrent,
-            )
-        )
-
-    def extract_page_links(
-        self,
-        url: str,
-        *,
-        timeout: float = 30.0,
-        headers: Optional[Dict[str, str]] = None,
-        css_selector: str = "a[href]",
-        include_internal: bool = True,
-        include_external: bool = True,
-        base_url: Optional[str] = None,
-    ) -> ExtractedLinks:
-        """
-        Synchronously extract all links from a web page.
-
-        Args:
-            url: URL to fetch and extract links from
-            timeout: Request timeout in seconds (default: 30.0)
-            headers: Optional HTTP headers to send
-            css_selector: CSS selector for link elements (default: "a[href]")
-            include_internal: Whether to include internal links (default: True)
-            include_external: Whether to include external links (default: True)
-            base_url: Base URL for resolving relative links (uses page URL if not provided)
-
-        Returns:
-            List of link dictionaries with 'href', 'original_href', 'text', 'title', and 'type' keys
-
-        Raises:
-            httpx.HTTPError: If request fails
-            Exception: If parsing fails
-        """
-        return self._run_async(
-            self._async_client.extract_page_links(
-                url=url,
-                timeout=timeout,
-                headers=headers,
-                css_selector=css_selector,
-                include_internal=include_internal,
-                include_external=include_external,
-                base_url=base_url,
-            )
-        )
-
-    def close(self):
-        """Close the underlying async client."""
-        pass
-
-    def __enter__(self):
-        """Context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Context manager exit."""
-        self.close()
-
-
-@overload
-def create_search_client(
-    *,
-    timeout: float = 30.0,
-    max_concurrent: int = 5,
-    user_agent: Optional[str] = None,
-    default_headers: Optional[Dict[str, str]] = None,
-    max_retries: int = 3,
-    async_client: Literal[True],
-) -> AsyncSearchClient: ...
-
-
-@overload
-def create_search_client(
-    *,
-    timeout: float = 30.0,
-    max_concurrent: int = 5,
-    user_agent: Optional[str] = None,
-    default_headers: Optional[Dict[str, str]] = None,
-    max_retries: int = 3,
-    async_client: Literal[False] = ...,
-) -> SearchClient: ...
-
-
-def create_search_client(
-    *,
-    timeout: float = 30.0,
-    max_concurrent: int = 5,
-    user_agent: Optional[str] = None,
-    default_headers: Optional[Dict[str, str]] = None,
-    max_retries: int = 3,
-    async_client: bool = False,
-) -> Union[SearchClient, AsyncSearchClient]:
-    """
-    Create a new SearchClient instance.
-
-    Args:
-        timeout: Default timeout for HTTP requests in seconds
-        max_concurrent: Maximum number of concurrent requests for batch operations
-        user_agent: User-Agent header for HTTP requests
-        default_headers: Default headers to include in HTTP requests
-        max_retries: Maximum number of retry attempts for failed requests
-        async_client: Whether to return an async client instance
-
-    Returns:
-        SearchClient or AsyncSearchClient instance based on async_client parameter
-    """
-    params = locals()
-    del params["async_client"]
-
-    if async_client:
-        return AsyncSearchClient(**params)
-    else:
-        return SearchClient(**params)
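
For reference, the sketch below shows how the module removed above was typically driven, based solely on the signatures and docstrings visible in this diff. It assumes the 0.0.14 import path hammad.web.search.client and the optional duckduckgo-search and selectolax extras being installed; none of this applies to 0.0.15, where the file is deleted.

# Illustrative sketch only: exercises the removed 0.0.14 API as documented in the diff above.
import asyncio

from hammad.web.search.client import create_search_client

# Synchronous client (async_client defaults to False in create_search_client).
client = create_search_client(timeout=15.0, max_retries=2)

# DuckDuckGo web search; per the docstrings, SearchResults is a list of dicts
# with "title", "href", and "body" keys.
results = client.search_web("selectolax html parsing", max_results=5)

# Fetch and parse one page with selectolax, also collecting its links.
page = client.read_web_page("https://example.com", extract_links=True)
print(page["title"], len(page["links"]))


# Async variant: the same methods exist on AsyncSearchClient.
async def main() -> None:
    aclient = create_search_client(async_client=True)
    news = await aclient.search_news("python packaging", max_results=3)
    print(len(news))


asyncio.run(main())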