hammad-python 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. hammad/__init__.py +64 -10
  2. hammad/based/__init__.py +52 -0
  3. hammad/based/fields.py +546 -0
  4. hammad/based/model.py +968 -0
  5. hammad/based/utils.py +455 -0
  6. hammad/cache/__init__.py +30 -0
  7. hammad/{cache.py → cache/_cache.py} +83 -12
  8. hammad/cli/__init__.py +25 -0
  9. hammad/cli/plugins/__init__.py +786 -0
  10. hammad/cli/styles/__init__.py +5 -0
  11. hammad/cli/styles/animations.py +548 -0
  12. hammad/cli/styles/settings.py +135 -0
  13. hammad/cli/styles/types.py +358 -0
  14. hammad/cli/styles/utils.py +480 -0
  15. hammad/data/__init__.py +51 -0
  16. hammad/data/collections/__init__.py +32 -0
  17. hammad/data/collections/base_collection.py +58 -0
  18. hammad/data/collections/collection.py +227 -0
  19. hammad/data/collections/searchable_collection.py +556 -0
  20. hammad/data/collections/vector_collection.py +497 -0
  21. hammad/data/databases/__init__.py +21 -0
  22. hammad/data/databases/database.py +551 -0
  23. hammad/data/types/__init__.py +33 -0
  24. hammad/data/types/files/__init__.py +1 -0
  25. hammad/data/types/files/audio.py +81 -0
  26. hammad/data/types/files/configuration.py +475 -0
  27. hammad/data/types/files/document.py +195 -0
  28. hammad/data/types/files/file.py +358 -0
  29. hammad/data/types/files/image.py +80 -0
  30. hammad/json/__init__.py +21 -0
  31. hammad/{utils/json → json}/converters.py +4 -1
  32. hammad/logging/__init__.py +27 -0
  33. hammad/logging/decorators.py +432 -0
  34. hammad/logging/logger.py +534 -0
  35. hammad/pydantic/__init__.py +43 -0
  36. hammad/{utils/pydantic → pydantic}/converters.py +2 -1
  37. hammad/pydantic/models/__init__.py +28 -0
  38. hammad/pydantic/models/arbitrary_model.py +46 -0
  39. hammad/pydantic/models/cacheable_model.py +79 -0
  40. hammad/pydantic/models/fast_model.py +318 -0
  41. hammad/pydantic/models/function_model.py +176 -0
  42. hammad/pydantic/models/subscriptable_model.py +63 -0
  43. hammad/text/__init__.py +37 -0
  44. hammad/text/text.py +1068 -0
  45. hammad/text/utils/__init__.py +1 -0
  46. hammad/{utils/text → text/utils}/converters.py +2 -2
  47. hammad/text/utils/markdown/__init__.py +1 -0
  48. hammad/{utils → text/utils}/markdown/converters.py +3 -3
  49. hammad/{utils → text/utils}/markdown/formatting.py +1 -1
  50. hammad/{utils/typing/utils.py → typing/__init__.py} +75 -2
  51. hammad/web/__init__.py +42 -0
  52. hammad/web/http/__init__.py +1 -0
  53. hammad/web/http/client.py +944 -0
  54. hammad/web/openapi/client.py +740 -0
  55. hammad/web/search/__init__.py +1 -0
  56. hammad/web/search/client.py +936 -0
  57. hammad/web/utils.py +463 -0
  58. hammad/yaml/__init__.py +30 -0
  59. hammad/yaml/converters.py +19 -0
  60. {hammad_python-0.0.10.dist-info → hammad_python-0.0.11.dist-info}/METADATA +14 -8
  61. hammad_python-0.0.11.dist-info/RECORD +65 -0
  62. hammad/database.py +0 -447
  63. hammad/logger.py +0 -273
  64. hammad/types/color.py +0 -951
  65. hammad/utils/json/__init__.py +0 -0
  66. hammad/utils/markdown/__init__.py +0 -0
  67. hammad/utils/pydantic/__init__.py +0 -0
  68. hammad/utils/text/__init__.py +0 -0
  69. hammad/utils/typing/__init__.py +0 -0
  70. hammad_python-0.0.10.dist-info/RECORD +0 -22
  71. /hammad/{types/__init__.py → py.typed} +0 -0
  72. /hammad/{utils → web/openapi}/__init__.py +0 -0
  73. {hammad_python-0.0.10.dist-info → hammad_python-0.0.11.dist-info}/WHEEL +0 -0
  74. {hammad_python-0.0.10.dist-info → hammad_python-0.0.11.dist-info}/licenses/LICENSE +0 -0
hammad/web/search/client.py ADDED
@@ -0,0 +1,936 @@
+ """hammad.web.search.client"""
+
+ from __future__ import annotations
+
+ import asyncio
+ import logging
+ from typing import Any, Dict, List, Literal, Optional, Union, overload
+ from urllib.parse import urljoin, urlparse
+
+ import httpx
+ from tenacity import (
+     AsyncRetrying,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+     before_sleep_log,
+ )
+
+ __all__ = ("AsyncSearchClient", "SearchClient", "create_search_client")
+
+
+ class AsyncSearchClient:
+     """
+     Search client that provides web search and page parsing capabilities.
+
+     This client uses lazy loading for DuckDuckGo search and selectolax HTML parsing
+     to minimize import overhead and memory usage.
+     """
+
+     def __init__(
+         self,
+         *,
+         timeout: float = 30.0,
+         max_concurrent: int = 5,
+         user_agent: Optional[str] = None,
+         default_headers: Optional[Dict[str, str]] = None,
+         max_retries: int = 3,
+     ):
+         """
+         Initialize the AsyncSearchClient.
+
+         Args:
+             timeout: Default timeout for HTTP requests in seconds
+             max_concurrent: Maximum number of concurrent requests for batch operations
+             user_agent: User-Agent header for HTTP requests
+             default_headers: Default headers to include in HTTP requests
+             max_retries: Maximum number of retry attempts for failed requests
+         """
+         self.timeout = timeout
+         self.max_concurrent = max_concurrent
+         self.user_agent = user_agent or "Mozilla/5.0 (compatible; SearchClient/1.0)"
+         self.default_headers = default_headers or {}
+         self.max_retries = max_retries
+
+         # Lazy-loaded resources
+         self._ddgs_client = None
+         self._selectolax_parser_class = None
+
+     def _get_duckduckgo_client(self):
+         """Get a DuckDuckGo search client using lazy import and singleton pattern."""
+         if self._ddgs_client is None:
+             try:
+                 from duckduckgo_search import DDGS
+
+                 self._ddgs_client = DDGS
+             except ImportError as e:
+                 raise ImportError(
+                     "duckduckgo_search is required for web search functionality. "
+                     "Install with: pip install duckduckgo-search"
+                 ) from e
+         return self._ddgs_client
+
+     def _get_selectolax_parser(self):
+         """Get selectolax HTMLParser class using lazy import and singleton pattern."""
+         if self._selectolax_parser_class is None:
+             try:
+                 from selectolax.parser import HTMLParser
+
+                 self._selectolax_parser_class = HTMLParser
+             except ImportError as e:
+                 raise ImportError(
+                     "selectolax is required for HTML parsing functionality. "
+                     "Install with: pip install selectolax"
+                 ) from e
+         return self._selectolax_parser_class
+
+     def _get_default_headers(self) -> Dict[str, str]:
+         """Get default headers for HTTP requests."""
+         headers = {"User-Agent": self.user_agent}
+         headers.update(self.default_headers)
+         return headers
+
+     async def search_web(
+         self,
+         query: str,
+         *,
+         max_results: int = 10,
+         region: str = "wt-wt",
+         safesearch: Literal["on", "moderate", "off"] = "moderate",
+         timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
+         backend: Literal["auto", "html", "lite"] = "auto",
+         max_retries: Optional[int] = None,
+     ) -> List[Dict[str, str]]:
+         """
+         Search the web using DuckDuckGo search.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return (default: 10)
+             region: Search region (default: "wt-wt" for worldwide)
+             safesearch: Safe search setting (default: "moderate")
+             timelimit: Time limit for results (d=day, w=week, m=month, y=year)
+             backend: Search backend to use (default: "auto")
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             List of search result dictionaries with 'title', 'href', and 'body' keys
+
+         Raises:
+             ValueError: If query is empty
+             Exception: If search fails after all retries
+         """
+         if not query or not query.strip():
+             raise ValueError("Query cannot be empty")
+
+         retries = max_retries if max_retries is not None else self.max_retries
+
+         async def _do_search():
+             DDGS = self._get_duckduckgo_client()
+             with DDGS() as ddgs:
+                 results = list(
+                     ddgs.text(
+                         keywords=query.strip(),
+                         region=region,
+                         safesearch=safesearch,
+                         timelimit=timelimit,
+                         backend=backend,
+                         max_results=max_results,
+                     )
+                 )
+                 return results
+
+         async for attempt in AsyncRetrying(
+             stop=stop_after_attempt(retries + 1),
+             wait=wait_exponential(multiplier=1, min=1, max=10),
+             retry=retry_if_exception_type(Exception),
+             before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
+         ):
+             with attempt:
+                 return await _do_search()
+
+     async def search_news(
+         self,
+         query: str,
+         *,
+         max_results: int = 10,
+         region: str = "wt-wt",
+         safesearch: Literal["on", "moderate", "off"] = "moderate",
+         timelimit: Optional[Literal["d", "w", "m"]] = None,
+         max_retries: Optional[int] = None,
+     ) -> List[Dict[str, str]]:
+         """
+         Search for news using DuckDuckGo news search.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return (default: 10)
+             region: Search region (default: "wt-wt" for worldwide)
+             safesearch: Safe search setting (default: "moderate")
+             timelimit: Time limit for results (d=day, w=week, m=month)
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             List of news result dictionaries with date, title, body, url, image, and source
+
+         Raises:
+             ValueError: If query is empty
+             Exception: If search fails after all retries
+         """
+         if not query or not query.strip():
+             raise ValueError("Query cannot be empty")
+
+         retries = max_retries if max_retries is not None else self.max_retries
+
+         async def _do_news_search():
+             DDGS = self._get_duckduckgo_client()
+             with DDGS() as ddgs:
+                 results = list(
+                     ddgs.news(
+                         keywords=query.strip(),
+                         region=region,
+                         safesearch=safesearch,
+                         timelimit=timelimit,
+                         max_results=max_results,
+                     )
+                 )
+                 return results
+
+         async for attempt in AsyncRetrying(
+             stop=stop_after_attempt(retries + 1),
+             wait=wait_exponential(multiplier=1, min=1, max=10),
+             retry=retry_if_exception_type(Exception),
+             before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
+         ):
+             with attempt:
+                 return await _do_news_search()
+
+     async def read_web_page(
+         self,
+         url: str,
+         *,
+         timeout: Optional[float] = None,
+         headers: Optional[Dict[str, str]] = None,
+         extract_text: bool = True,
+         extract_links: bool = False,
+         extract_images: bool = False,
+         css_selector: Optional[str] = None,
+         max_retries: Optional[int] = None,
+     ) -> Dict[str, Any]:
+         """
+         Read and parse a single web page using selectolax.
+
+         Args:
+             url: URL to fetch and parse
+             timeout: Request timeout in seconds (uses default if not provided)
+             headers: Optional HTTP headers to send
+             extract_text: Whether to extract text content (default: True)
+             extract_links: Whether to extract links (default: False)
+             extract_images: Whether to extract images (default: False)
+             css_selector: Optional CSS selector to extract specific elements
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             Dictionary containing parsed content and metadata
+
+         Raises:
+             httpx.HTTPError: If request fails after all retries
+             Exception: If parsing fails
+         """
+         effective_headers = self._get_default_headers()
+         if headers:
+             effective_headers.update(headers)
+
+         request_timeout = timeout or self.timeout
+         retries = max_retries if max_retries is not None else self.max_retries
+
+         async def _do_fetch_and_parse():
+             async with httpx.AsyncClient(
+                 timeout=request_timeout, follow_redirects=True
+             ) as client:
+                 response = await client.get(url, headers=effective_headers)
+                 response.raise_for_status()
+
+                 # Parse HTML content
+                 HTMLParser = self._get_selectolax_parser()
+                 parser = HTMLParser(response.text)
+
+                 result = {
+                     "url": url,
+                     "status_code": response.status_code,
+                     "content_type": response.headers.get("content-type", ""),
+                     "title": "",
+                     "text": "",
+                     "links": [],
+                     "images": [],
+                     "selected_elements": [],
+                 }
+
+                 # Extract title
+                 title_node = parser.css_first("title")
+                 if title_node:
+                     result["title"] = title_node.text(strip=True)
+
+                 # Extract text content
+                 if extract_text:
+                     if css_selector:
+                         selected_nodes = parser.css(css_selector)
+                         result["text"] = " ".join(
+                             node.text(strip=True) for node in selected_nodes
+                         )
+                     else:
+                         result["text"] = parser.text(strip=True)
+
+                 # Extract links
+                 if extract_links:
+                     link_nodes = parser.css("a[href]")
+                     result["links"] = [
+                         {
+                             "href": node.attrs.get("href", ""),
+                             "text": node.text(strip=True),
+                         }
+                         for node in link_nodes
+                         if node.attrs.get("href")
+                     ]
+
+                 # Extract images
+                 if extract_images:
+                     img_nodes = parser.css("img[src]")
+                     result["images"] = [
+                         {
+                             "src": node.attrs.get("src", ""),
+                             "alt": node.attrs.get("alt", ""),
+                             "title": node.attrs.get("title", ""),
+                         }
+                         for node in img_nodes
+                         if node.attrs.get("src")
+                     ]
+
+                 # Extract selected elements
+                 if css_selector:
+                     selected_nodes = parser.css(css_selector)
+                     result["selected_elements"] = [
+                         {
+                             "tag": node.tag,
+                             "text": node.text(strip=True),
+                             "html": node.html,
+                             "attributes": dict(node.attributes),
+                         }
+                         for node in selected_nodes
+                     ]
+
+                 return result
+
+         async for attempt in AsyncRetrying(
+             stop=stop_after_attempt(retries + 1),
+             wait=wait_exponential(multiplier=1, min=1, max=10),
+             retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
+             before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
+         ):
+             with attempt:
+                 return await _do_fetch_and_parse()
+
+     async def read_web_pages(
+         self,
+         urls: List[str],
+         *,
+         timeout: Optional[float] = None,
+         headers: Optional[Dict[str, str]] = None,
+         extract_text: bool = True,
+         extract_links: bool = False,
+         extract_images: bool = False,
+         css_selector: Optional[str] = None,
+         max_concurrent: Optional[int] = None,
+         max_retries: Optional[int] = None,
+     ) -> List[Dict[str, Any]]:
+         """
+         Read and parse multiple web pages concurrently using selectolax.
+
+         Args:
+             urls: List of URLs to fetch and parse
+             timeout: Request timeout in seconds (uses default if not provided)
+             headers: Optional HTTP headers to send
+             extract_text: Whether to extract text content (default: True)
+             extract_links: Whether to extract links (default: False)
+             extract_images: Whether to extract images (default: False)
+             css_selector: Optional CSS selector to extract specific elements
+             max_concurrent: Maximum number of concurrent requests (uses default if not provided)
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             List of dictionaries containing parsed content and metadata
+
+         Raises:
+             Exception: If any critical error occurs
+         """
+         if not urls:
+             return []
+
+         # Remove duplicates while preserving order
+         unique_urls = []
+         seen = set()
+         for url in urls:
+             if url not in seen:
+                 unique_urls.append(url)
+                 seen.add(url)
+
+         # Create semaphore for concurrency control
+         concurrent_limit = max_concurrent or self.max_concurrent
+         semaphore = asyncio.Semaphore(concurrent_limit)
+
+         async def fetch_page(url: str) -> Dict[str, Any]:
+             async with semaphore:
+                 try:
+                     return await self.read_web_page(
+                         url=url,
+                         timeout=timeout,
+                         headers=headers,
+                         extract_text=extract_text,
+                         extract_links=extract_links,
+                         extract_images=extract_images,
+                         css_selector=css_selector,
+                         max_retries=max_retries,
+                     )
+                 except Exception as e:
+                     return {
+                         "url": url,
+                         "error": str(e),
+                         "status_code": None,
+                         "content_type": "",
+                         "title": "",
+                         "text": "",
+                         "links": [],
+                         "images": [],
+                         "selected_elements": [],
+                     }
+
+         # Execute all requests concurrently
+         tasks = [fetch_page(url) for url in unique_urls]
+         results = await asyncio.gather(*tasks, return_exceptions=False)
+
+         return results
+
+     async def extract_page_links(
+         self,
+         url: str,
+         *,
+         timeout: Optional[float] = None,
+         headers: Optional[Dict[str, str]] = None,
+         css_selector: str = "a[href]",
+         include_external: bool = True,
+         include_internal: bool = True,
+         base_url: Optional[str] = None,
+         max_retries: Optional[int] = None,
+     ) -> List[Dict[str, str]]:
+         """
+         Extract links from a web page using selectolax.
+
+         Args:
+             url: URL to fetch and extract links from
+             timeout: Request timeout in seconds (uses default if not provided)
+             headers: Optional HTTP headers to send
+             css_selector: CSS selector for links (default: "a[href]")
+             include_external: Whether to include external links (default: True)
+             include_internal: Whether to include internal links (default: True)
+             base_url: Base URL for resolving relative links (uses page URL if not provided)
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             List of link dictionaries with href, text, title, and type (internal/external)
+
+         Raises:
+             httpx.HTTPError: If request fails after all retries
+             Exception: If parsing fails
+         """
+         effective_headers = self._get_default_headers()
+         if headers:
+             effective_headers.update(headers)
+
+         request_timeout = timeout or self.timeout
+         retries = max_retries if max_retries is not None else self.max_retries
+
+         async def _do_extract_links():
+             async with httpx.AsyncClient(
+                 timeout=request_timeout, follow_redirects=True
+             ) as client:
+                 response = await client.get(url, headers=effective_headers)
+                 response.raise_for_status()
+
+                 # Parse HTML content
+                 HTMLParser = self._get_selectolax_parser()
+                 parser = HTMLParser(response.text)
+
+                 # Use provided base_url or extract from the page
+                 effective_base_url = base_url or url
+
+                 # Get the domain for internal/external classification
+                 parsed_base = urlparse(effective_base_url)
+                 base_domain = parsed_base.netloc
+
+                 # Extract links
+                 link_nodes = parser.css(css_selector)
+                 links = []
+
+                 for node in link_nodes:
+                     href = node.attrs.get("href", "").strip()
+                     if not href:
+                         continue
+
+                     # Resolve relative URLs
+                     absolute_href = urljoin(effective_base_url, href)
+                     parsed_href = urlparse(absolute_href)
+
+                     # Determine if link is internal or external
+                     is_internal = (
+                         parsed_href.netloc == base_domain or not parsed_href.netloc
+                     )
+                     link_type = "internal" if is_internal else "external"
+
+                     # Filter based on include flags
+                     if (is_internal and not include_internal) or (
+                         not is_internal and not include_external
+                     ):
+                         continue
+
+                     link_info = {
+                         "href": absolute_href,
+                         "original_href": href,
+                         "text": node.text(strip=True),
+                         "title": node.attrs.get("title", ""),
+                         "type": link_type,
+                     }
+
+                     links.append(link_info)
+
+                 return links
+
+         async for attempt in AsyncRetrying(
+             stop=stop_after_attempt(retries + 1),
+             wait=wait_exponential(multiplier=1, min=1, max=10),
+             retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
+             before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
+         ):
+             with attempt:
+                 return await _do_extract_links()
+
+
+ class SearchClient:
+     """
+     Synchronous wrapper around AsyncSearchClient.
+
+     This class provides a synchronous interface to the search functionality
+     by running async operations in an event loop.
+     """
+
+     def __init__(
+         self,
+         *,
+         timeout: float = 30.0,
+         max_concurrent: int = 5,
+         user_agent: Optional[str] = None,
+         default_headers: Optional[Dict[str, str]] = None,
+         max_retries: int = 3,
+     ):
+         """
+         Initialize the SearchClient.
+
+         Args:
+             timeout: Default timeout for HTTP requests in seconds
+             max_concurrent: Maximum number of concurrent requests for batch operations
+             user_agent: User-Agent header for HTTP requests
+             default_headers: Default headers to include in HTTP requests
+             max_retries: Maximum number of retry attempts for failed requests
+         """
+         self._async_client = AsyncSearchClient(
+             timeout=timeout,
+             max_concurrent=max_concurrent,
+             user_agent=user_agent,
+             default_headers=default_headers,
+             max_retries=max_retries,
+         )
+
+     def _run_async(self, coro):
+         """Run an async coroutine in a new event loop."""
+         try:
+             # Try to get the current event loop
+             loop = asyncio.get_running_loop()
+             # If we're already in an event loop, we need to use a thread
+             import concurrent.futures
+
+             with concurrent.futures.ThreadPoolExecutor() as executor:
+                 future = executor.submit(asyncio.run, coro)
+                 return future.result()
+         except RuntimeError:
+             # No event loop running, we can create our own
+             return asyncio.run(coro)
+
+     def search(
+         self,
+         query: str,
+         *,
+         max_results: int = 10,
+         region: str = "wt-wt",
+         safesearch: str = "moderate",
+         backend: str = "auto",
+     ) -> List[Dict[str, Any]]:
+         """
+         Synchronous web search using DuckDuckGo.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return
+             region: Search region (default: "wt-wt" for worldwide)
+             safesearch: Safe search setting ("on", "moderate", "off")
+             backend: Search backend ("auto", "html", "lite")
+
+         Returns:
+             List of search result dictionaries with keys: title, href, body
+         """
+         return self._run_async(
+             self._async_client.search_web(
+                 query,
+                 max_results=max_results,
+                 region=region,
+                 safesearch=safesearch,
+                 backend=backend,
+             )
+         )
+
+     def get_page_content(
+         self,
+         url: str,
+         *,
+         timeout: Optional[float] = None,
+         retries: int = 3,
+         encoding: Optional[str] = None,
+     ) -> str:
+         """
+         Synchronously fetch and return the text content of a web page.
+
+         Args:
+             url: URL of the web page to fetch
+             timeout: Request timeout in seconds (uses client default if not specified)
+             retries: Number of retry attempts for failed requests
+             encoding: Text encoding to use (currently unused; httpx auto-detects encoding)
+
+         Returns:
+             Plain text content of the web page
+         """
+         result = self._run_async(
+             self._async_client.read_web_page(
+                 url, timeout=timeout, extract_text=True, max_retries=retries
+             )
+         )
+         return result["text"]
+
+     def extract_links(
+         self,
+         url: str,
+         *,
+         css_selector: str = "a[href]",
+         include_internal: bool = True,
+         include_external: bool = True,
+         timeout: Optional[float] = None,
+         retries: int = 3,
+     ) -> List[Dict[str, Any]]:
+         """
+         Synchronously extract links from a web page.
+
+         Args:
+             url: URL of the web page to parse
+             css_selector: CSS selector for link elements
+             include_internal: Whether to include internal links
+             include_external: Whether to include external links
+             timeout: Request timeout in seconds
+             retries: Number of retry attempts for failed requests
+
+         Returns:
+             List of link dictionaries with keys: href, original_href, text, title, type
+         """
+         return self._run_async(
+             self._async_client.extract_page_links(
+                 url,
+                 css_selector=css_selector,
+                 include_internal=include_internal,
+                 include_external=include_external,
+                 timeout=timeout,
+                 max_retries=retries,
+             )
+         )
+
+     def search_web(
+         self,
+         query: str,
+         *,
+         max_results: int = 10,
+         region: str = "wt-wt",
+         safesearch: Literal["on", "moderate", "off"] = "moderate",
+         timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
+         backend: Literal["auto", "html", "lite"] = "auto",
+         max_retries: Optional[int] = None,
+     ) -> List[Dict[str, str]]:
+         """
+         Synchronously search the web using DuckDuckGo search.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return (default: 10)
+             region: Search region (default: "wt-wt" for worldwide)
+             safesearch: Safe search setting (default: "moderate")
+             timelimit: Time limit for results (d=day, w=week, m=month, y=year)
+             backend: Search backend to use (default: "auto")
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             List of search result dictionaries with 'title', 'href', and 'body' keys
+
+         Raises:
+             ValueError: If query is empty
+             Exception: If search fails after all retries
+         """
+         return self._run_async(
+             self._async_client.search_web(
+                 query=query,
+                 max_results=max_results,
+                 region=region,
+                 safesearch=safesearch,
+                 timelimit=timelimit,
+                 backend=backend,
+                 max_retries=max_retries,
+             )
+         )
+
+     def search_news(
+         self,
+         query: str,
+         *,
+         max_results: int = 10,
+         region: str = "wt-wt",
+         safesearch: Literal["on", "moderate", "off"] = "moderate",
+         timelimit: Optional[Literal["d", "w", "m"]] = None,
+         max_retries: Optional[int] = None,
+     ) -> List[Dict[str, str]]:
+         """
+         Synchronously search for news using DuckDuckGo news search.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return (default: 10)
+             region: Search region (default: "wt-wt" for worldwide)
+             safesearch: Safe search setting (default: "moderate")
+             timelimit: Time limit for results (d=day, w=week, m=month)
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             List of news result dictionaries with date, title, body, url, image, and source
+
+         Raises:
+             ValueError: If query is empty
+             Exception: If search fails after all retries
+         """
+         return self._run_async(
+             self._async_client.search_news(
+                 query=query,
+                 max_results=max_results,
+                 region=region,
+                 safesearch=safesearch,
+                 timelimit=timelimit,
+                 max_retries=max_retries,
+             )
+         )
+
+     def read_web_page(
+         self,
+         url: str,
+         *,
+         timeout: float = 30.0,
+         headers: Optional[Dict[str, str]] = None,
+         extract_text: bool = True,
+         extract_links: bool = False,
+         extract_images: bool = False,
+         css_selector: Optional[str] = None,
+     ) -> Dict[str, Any]:
+         """
+         Synchronously read and parse a single web page using selectolax.
+
+         Args:
+             url: URL to fetch and parse
+             timeout: Request timeout in seconds (default: 30.0)
+             headers: Optional HTTP headers to send
+             extract_text: Whether to extract text content (default: True)
+             extract_links: Whether to extract links (default: False)
+             extract_images: Whether to extract images (default: False)
+             css_selector: Optional CSS selector to extract specific elements
+
+         Returns:
+             Dictionary containing parsed content and metadata
+
+         Raises:
+             httpx.HTTPError: If request fails
+             Exception: If parsing fails
+         """
+         return self._run_async(
+             self._async_client.read_web_page(
+                 url=url,
+                 timeout=timeout,
+                 headers=headers,
+                 extract_text=extract_text,
+                 extract_links=extract_links,
+                 extract_images=extract_images,
+                 css_selector=css_selector,
+             )
+         )
+
+     def read_web_pages(
+         self,
+         urls: List[str],
+         *,
+         timeout: float = 30.0,
+         headers: Optional[Dict[str, str]] = None,
+         extract_text: bool = True,
+         extract_links: bool = False,
+         extract_images: bool = False,
+         css_selector: Optional[str] = None,
+         max_concurrent: Optional[int] = None,
+     ) -> List[Dict[str, Any]]:
+         """
+         Synchronously read and parse multiple web pages concurrently using selectolax.
+
+         Args:
+             urls: List of URLs to fetch and parse
+             timeout: Request timeout in seconds (default: 30.0)
+             headers: Optional HTTP headers to send
+             extract_text: Whether to extract text content (default: True)
+             extract_links: Whether to extract links (default: False)
+             extract_images: Whether to extract images (default: False)
+             css_selector: Optional CSS selector to extract specific elements
+             max_concurrent: Maximum concurrent requests (uses client default if not specified)
+
+         Returns:
+             List of dictionaries containing parsed content and metadata for each URL
+
+         Raises:
+             httpx.HTTPError: If requests fail
+             Exception: If parsing fails
+         """
+         return self._run_async(
+             self._async_client.read_web_pages(
+                 urls=urls,
+                 timeout=timeout,
+                 headers=headers,
+                 extract_text=extract_text,
+                 extract_links=extract_links,
+                 extract_images=extract_images,
+                 css_selector=css_selector,
+                 max_concurrent=max_concurrent,
+             )
+         )
+
+     def extract_page_links(
+         self,
+         url: str,
+         *,
+         timeout: float = 30.0,
+         headers: Optional[Dict[str, str]] = None,
+         css_selector: str = "a[href]",
+         include_internal: bool = True,
+         include_external: bool = True,
+         base_url: Optional[str] = None,
+     ) -> List[Dict[str, str]]:
+         """
+         Synchronously extract all links from a web page.
+
+         Args:
+             url: URL to fetch and extract links from
+             timeout: Request timeout in seconds (default: 30.0)
+             headers: Optional HTTP headers to send
+             css_selector: CSS selector for link elements (default: "a[href]")
+             include_internal: Whether to include internal links (default: True)
+             include_external: Whether to include external links (default: True)
+             base_url: Base URL for resolving relative links (uses page URL if not provided)
+
+         Returns:
+             List of link dictionaries with 'href', 'original_href', 'text', 'title', and 'type' keys
+
+         Raises:
+             httpx.HTTPError: If request fails
+             Exception: If parsing fails
+         """
+         return self._run_async(
+             self._async_client.extract_page_links(
+                 url=url,
+                 timeout=timeout,
+                 headers=headers,
+                 css_selector=css_selector,
+                 include_internal=include_internal,
+                 include_external=include_external,
+                 base_url=base_url,
+             )
+         )
+
+     def close(self):
+         """Close the underlying async client."""
+         pass
+
+     def __enter__(self):
+         """Context manager entry."""
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit."""
+         self.close()
+
+
+ @overload
+ def create_search_client(
+     *,
+     timeout: float = 30.0,
+     max_concurrent: int = 5,
+     user_agent: Optional[str] = None,
+     default_headers: Optional[Dict[str, str]] = None,
+     max_retries: int = 3,
+     async_client: Literal[True],
+ ) -> AsyncSearchClient: ...
+
+
+ @overload
+ def create_search_client(
+     *,
+     timeout: float = 30.0,
+     max_concurrent: int = 5,
+     user_agent: Optional[str] = None,
+     default_headers: Optional[Dict[str, str]] = None,
+     max_retries: int = 3,
+     async_client: Literal[False] = ...,
+ ) -> SearchClient: ...
+
+
+ def create_search_client(
+     *,
+     timeout: float = 30.0,
+     max_concurrent: int = 5,
+     user_agent: Optional[str] = None,
+     default_headers: Optional[Dict[str, str]] = None,
+     max_retries: int = 3,
+     async_client: bool = False,
+ ) -> Union[SearchClient, AsyncSearchClient]:
+     """
+     Create a new SearchClient instance.
+
+     Args:
+         timeout: Default timeout for HTTP requests in seconds
+         max_concurrent: Maximum number of concurrent requests for batch operations
+         user_agent: User-Agent header for HTTP requests
+         default_headers: Default headers to include in HTTP requests
+         max_retries: Maximum number of retry attempts for failed requests
+         async_client: Whether to return an async client instance
+
+     Returns:
+         SearchClient or AsyncSearchClient instance based on async_client parameter
+     """
+     params = locals()
+     del params["async_client"]
+
+     if async_client:
+         return AsyncSearchClient(**params)
+     else:
+         return SearchClient(**params)
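
For reference, a minimal usage sketch of the client introduced in this file (not part of the diff itself). It assumes version 0.0.11 exposes the module at hammad.web.search.client, and that the optional duckduckgo-search and selectolax dependencies are installed for the search and parsing calls respectively.

# Usage sketch, assuming hammad-python 0.0.11 with the optional
# duckduckgo-search and selectolax extras installed.
import asyncio

from hammad.web.search.client import create_search_client


def sync_example() -> None:
    # Synchronous client: each call runs the async implementation internally.
    client = create_search_client(max_retries=2)
    with client:
        results = client.search_web("python packaging", max_results=3)
        for result in results:
            print(result["title"], result["href"])


async def async_example() -> None:
    # Async client: same API surface, awaited directly.
    client = create_search_client(async_client=True)
    page = await client.read_web_page("https://example.com", extract_links=True)
    print(page["title"], len(page["links"]))


if __name__ == "__main__":
    sync_example()
    asyncio.run(async_example())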