hammad-python 0.0.15__py3-none-any.whl → 0.0.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. hammad/__init__.py +178 -0
  2. hammad/_internal.py +237 -0
  3. hammad/cache/__init__.py +40 -0
  4. hammad/cache/base_cache.py +181 -0
  5. hammad/cache/cache.py +169 -0
  6. hammad/cache/decorators.py +261 -0
  7. hammad/cache/file_cache.py +80 -0
  8. hammad/cache/ttl_cache.py +74 -0
  9. hammad/cli/__init__.py +35 -0
  10. hammad/cli/_runner.py +265 -0
  11. hammad/cli/animations.py +573 -0
  12. hammad/cli/plugins.py +836 -0
  13. hammad/cli/styles/__init__.py +55 -0
  14. hammad/cli/styles/settings.py +139 -0
  15. hammad/cli/styles/types.py +358 -0
  16. hammad/cli/styles/utils.py +626 -0
  17. hammad/data/__init__.py +83 -0
  18. hammad/data/collections/__init__.py +44 -0
  19. hammad/data/collections/collection.py +274 -0
  20. hammad/data/collections/indexes/__init__.py +37 -0
  21. hammad/data/collections/indexes/qdrant/__init__.py +1 -0
  22. hammad/data/collections/indexes/qdrant/index.py +735 -0
  23. hammad/data/collections/indexes/qdrant/settings.py +94 -0
  24. hammad/data/collections/indexes/qdrant/utils.py +220 -0
  25. hammad/data/collections/indexes/tantivy/__init__.py +1 -0
  26. hammad/data/collections/indexes/tantivy/index.py +428 -0
  27. hammad/data/collections/indexes/tantivy/settings.py +51 -0
  28. hammad/data/collections/indexes/tantivy/utils.py +200 -0
  29. hammad/data/configurations/__init__.py +35 -0
  30. hammad/data/configurations/configuration.py +564 -0
  31. hammad/data/models/__init__.py +55 -0
  32. hammad/data/models/extensions/__init__.py +4 -0
  33. hammad/data/models/extensions/pydantic/__init__.py +42 -0
  34. hammad/data/models/extensions/pydantic/converters.py +759 -0
  35. hammad/data/models/fields.py +546 -0
  36. hammad/data/models/model.py +1078 -0
  37. hammad/data/models/utils.py +280 -0
  38. hammad/data/sql/__init__.py +23 -0
  39. hammad/data/sql/database.py +578 -0
  40. hammad/data/sql/types.py +141 -0
  41. hammad/data/types/__init__.py +39 -0
  42. hammad/data/types/file.py +358 -0
  43. hammad/data/types/multimodal/__init__.py +24 -0
  44. hammad/data/types/multimodal/audio.py +96 -0
  45. hammad/data/types/multimodal/image.py +80 -0
  46. hammad/data/types/text.py +1066 -0
  47. hammad/formatting/__init__.py +20 -0
  48. hammad/formatting/json/__init__.py +27 -0
  49. hammad/formatting/json/converters.py +158 -0
  50. hammad/formatting/text/__init__.py +63 -0
  51. hammad/formatting/text/converters.py +723 -0
  52. hammad/formatting/text/markdown.py +131 -0
  53. hammad/formatting/yaml/__init__.py +26 -0
  54. hammad/formatting/yaml/converters.py +5 -0
  55. hammad/genai/__init__.py +78 -0
  56. hammad/genai/agents/__init__.py +1 -0
  57. hammad/genai/agents/types/__init__.py +35 -0
  58. hammad/genai/agents/types/history.py +277 -0
  59. hammad/genai/agents/types/tool.py +490 -0
  60. hammad/genai/embedding_models/__init__.py +41 -0
  61. hammad/genai/embedding_models/embedding_model.py +193 -0
  62. hammad/genai/embedding_models/embedding_model_name.py +77 -0
  63. hammad/genai/embedding_models/embedding_model_request.py +65 -0
  64. hammad/genai/embedding_models/embedding_model_response.py +69 -0
  65. hammad/genai/embedding_models/run.py +161 -0
  66. hammad/genai/language_models/__init__.py +35 -0
  67. hammad/genai/language_models/_streaming.py +622 -0
  68. hammad/genai/language_models/_types.py +276 -0
  69. hammad/genai/language_models/_utils/__init__.py +31 -0
  70. hammad/genai/language_models/_utils/_completions.py +131 -0
  71. hammad/genai/language_models/_utils/_messages.py +89 -0
  72. hammad/genai/language_models/_utils/_requests.py +202 -0
  73. hammad/genai/language_models/_utils/_structured_outputs.py +124 -0
  74. hammad/genai/language_models/language_model.py +734 -0
  75. hammad/genai/language_models/language_model_request.py +135 -0
  76. hammad/genai/language_models/language_model_response.py +219 -0
  77. hammad/genai/language_models/language_model_response_chunk.py +53 -0
  78. hammad/genai/language_models/run.py +530 -0
  79. hammad/genai/multimodal_models.py +48 -0
  80. hammad/genai/rerank_models.py +26 -0
  81. hammad/logging/__init__.py +35 -0
  82. hammad/logging/decorators.py +834 -0
  83. hammad/logging/logger.py +954 -0
  84. hammad/mcp/__init__.py +50 -0
  85. hammad/mcp/client/__init__.py +36 -0
  86. hammad/mcp/client/client.py +624 -0
  87. hammad/mcp/client/client_service.py +400 -0
  88. hammad/mcp/client/settings.py +178 -0
  89. hammad/mcp/servers/__init__.py +25 -0
  90. hammad/mcp/servers/launcher.py +1161 -0
  91. hammad/runtime/__init__.py +32 -0
  92. hammad/runtime/decorators.py +142 -0
  93. hammad/runtime/run.py +299 -0
  94. hammad/service/__init__.py +49 -0
  95. hammad/service/create.py +527 -0
  96. hammad/service/decorators.py +285 -0
  97. hammad/typing/__init__.py +435 -0
  98. hammad/web/__init__.py +43 -0
  99. hammad/web/http/__init__.py +1 -0
  100. hammad/web/http/client.py +944 -0
  101. hammad/web/models.py +277 -0
  102. hammad/web/openapi/__init__.py +1 -0
  103. hammad/web/openapi/client.py +740 -0
  104. hammad/web/search/__init__.py +1 -0
  105. hammad/web/search/client.py +1035 -0
  106. hammad/web/utils.py +472 -0
  107. {hammad_python-0.0.15.dist-info → hammad_python-0.0.17.dist-info}/METADATA +8 -1
  108. hammad_python-0.0.17.dist-info/RECORD +110 -0
  109. hammad_python-0.0.15.dist-info/RECORD +0 -4
  110. {hammad_python-0.0.15.dist-info → hammad_python-0.0.17.dist-info}/WHEEL +0 -0
  111. {hammad_python-0.0.15.dist-info → hammad_python-0.0.17.dist-info}/licenses/LICENSE +0 -0
hammad/web/search/client.py
@@ -0,0 +1,1035 @@
+ """hammad.web.search.client"""
+
+ from __future__ import annotations
+
+ import asyncio
+ import logging
+ from typing import Any, Dict, List, Literal, Optional, Union, overload
+ from urllib.parse import urljoin, urlparse
+
+ import httpx
+ from tenacity import (
+     AsyncRetrying,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+     before_sleep_log,
+ )
+
+ from ..models import (
+     SearchResult,
+     NewsResult,
+     SearchResults,
+     NewsResults,
+     WebPageResult,
+     WebPageErrorResult,
+     WebPageResults,
+     ExtractedLinks,
+     ExtractedLink,
+     LinkInfo,
+     ImageInfo,
+     SelectedElement,
+ )
+
+ __all__ = ("AsyncSearchClient", "SearchClient", "create_search_client")
+
+
+ class AsyncSearchClient:
+     """
+     Search client that provides web search and page parsing capabilities.
+
+     This client uses lazy loading for DuckDuckGo search and selectolax HTML parsing
+     to minimize import overhead and memory usage.
+     """
+
+     def __init__(
+         self,
+         *,
+         timeout: float = 30.0,
+         max_concurrent: int = 5,
+         user_agent: Optional[str] = None,
+         default_headers: Optional[Dict[str, str]] = None,
+         max_retries: int = 3,
+     ):
+         """
+         Initialize the AsyncSearchClient.
+
+         Args:
+             timeout: Default timeout for HTTP requests in seconds
+             max_concurrent: Maximum number of concurrent requests for batch operations
+             user_agent: User-Agent header for HTTP requests
+             default_headers: Default headers to include in HTTP requests
+             max_retries: Maximum number of retry attempts for failed requests
+         """
+         self.timeout = timeout
+         self.max_concurrent = max_concurrent
+         self.user_agent = user_agent or "Mozilla/5.0 (compatible; SearchClient/1.0)"
+         self.default_headers = default_headers or {}
+         self.max_retries = max_retries
+
+         # Lazy-loaded resources
+         self._ddgs_client = None
+         self._selectolax_parser_class = None
+
+     def _get_duckduckgo_client(self):
+         """Get a DuckDuckGo search client using lazy import and singleton pattern."""
+         if self._ddgs_client is None:
+             try:
+                 from ddgs import DDGS
+
+                 self._ddgs_client = DDGS
+             except ImportError as e:
+                 raise ImportError(
+                     "ddgs is required for web search functionality. "
+                     "Install with: pip install ddgs"
+                 ) from e
+         return self._ddgs_client
+
+     def _get_selectolax_parser(self):
+         """Get selectolax HTMLParser class using lazy import and singleton pattern."""
+         if self._selectolax_parser_class is None:
+             try:
+                 from selectolax.parser import HTMLParser
+
+                 self._selectolax_parser_class = HTMLParser
+             except ImportError as e:
+                 raise ImportError(
+                     "selectolax is required for HTML parsing functionality. "
+                     "Install with: pip install selectolax"
+                 ) from e
+         return self._selectolax_parser_class
+
+     def _get_default_headers(self) -> Dict[str, str]:
+         """Get default headers for HTTP requests."""
+         headers = {"User-Agent": self.user_agent}
+         headers.update(self.default_headers)
+         return headers
+
+     async def search(
+         self,
+         query: str,
+         *,
+         max_results: int = 10,
+         region: str = "wt-wt",
+         safesearch: Literal["on", "moderate", "off"] = "moderate",
+         timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
+         backend: Literal["auto", "html", "lite"] = "auto",
+         max_retries: Optional[int] = None,
+     ) -> SearchResults:
+         """
+         Search the web using DuckDuckGo search. Deprecated in favor of `web_search`.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return (default: 10)
+             region: Search region (default: "wt-wt" for worldwide)
+             safesearch: Safe search setting (default: "moderate")
+             timelimit: Time limit for results (d=day, w=week, m=month, y=year)
+             backend: Search backend to use (default: "auto")
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             SearchResults model containing SearchResult items with title, href, and body fields
+
+         Raises:
+             ValueError: If query is empty
+             Exception: If search fails after all retries
+         """
+         from rich import print
+
+         print(
+             "[bold yellow]WARNING: [/bold yellow] [yellow]Using `AsyncSearchClient.[bold light_salmon3]search[/bold light_salmon3]` is now deprecated in favor of `AsyncSearchClient.[bold light_salmon3]web_search[/bold light_salmon3]`[/yellow]"
+         )
+         return await self.web_search(
+             query,
+             max_results=max_results,
+             region=region,
+             safesearch=safesearch,
+             timelimit=timelimit,
+             backend=backend,
+             max_retries=max_retries,
+         )
+
+     async def web_search(
+         self,
+         query: str,
+         *,
+         max_results: int = 10,
+         region: str = "wt-wt",
+         safesearch: Literal["on", "moderate", "off"] = "moderate",
+         timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
+         backend: Literal["auto", "html", "lite"] = "auto",
+         max_retries: Optional[int] = None,
+     ) -> SearchResults:
+         """
+         Search the web using DuckDuckGo search.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return (default: 10)
+             region: Search region (default: "wt-wt" for worldwide)
+             safesearch: Safe search setting (default: "moderate")
+             timelimit: Time limit for results (d=day, w=week, m=month, y=year)
+             backend: Search backend to use (default: "auto")
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             SearchResults model containing SearchResult items with title, href, and body fields
+
+         Raises:
+             ValueError: If query is empty
+             Exception: If search fails after all retries
+         """
+         if not query or not query.strip():
+             raise ValueError("Query cannot be empty")
+
+         retries = max_retries if max_retries is not None else self.max_retries
+
+         async def _do_search():
+             DDGS = self._get_duckduckgo_client()
+             with DDGS() as ddgs:
+                 raw_results = list(
+                     ddgs.text(
+                         keywords=query.strip(),
+                         region=region,
+                         safesearch=safesearch,
+                         timelimit=timelimit,
+                         backend=backend,
+                         max_results=max_results,
+                     )
+                 )
+
+             # Convert raw results to SearchResult models
+             search_results = [
+                 SearchResult(
+                     title=result.get("title", ""),
+                     href=result.get("href", ""),
+                     body=result.get("body", "")
+                 )
+                 for result in raw_results
+             ]
+
+             return SearchResults(
+                 query=query.strip(),
+                 results=search_results
+             )
+
+         async for attempt in AsyncRetrying(
+             stop=stop_after_attempt(retries + 1),
+             wait=wait_exponential(multiplier=1, min=1, max=10),
+             retry=retry_if_exception_type(Exception),
+             before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
+         ):
+             with attempt:
+                 return await _do_search()
+
+     async def search_news(
+         self,
+         query: str,
+         *,
+         max_results: int = 10,
+         region: str = "wt-wt",
+         safesearch: Literal["on", "moderate", "off"] = "moderate",
+         timelimit: Optional[Literal["d", "w", "m"]] = None,
+         max_retries: Optional[int] = None,
+     ) -> NewsResults:
+         """
+         Search for news using DuckDuckGo news search.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return (default: 10)
+             region: Search region (default: "wt-wt" for worldwide)
+             safesearch: Safe search setting (default: "moderate")
+             timelimit: Time limit for results (d=day, w=week, m=month)
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             NewsResults model containing NewsResult items with date, title, body, url, image, and source fields
+
+         Raises:
+             ValueError: If query is empty
+             Exception: If search fails after all retries
+         """
+         if not query or not query.strip():
+             raise ValueError("Query cannot be empty")
+
+         retries = max_retries if max_retries is not None else self.max_retries
+
+         async def _do_news_search():
+             DDGS = self._get_duckduckgo_client()
+             with DDGS() as ddgs:
+                 raw_results = list(
+                     ddgs.news(
+                         keywords=query.strip(),
+                         region=region,
+                         safesearch=safesearch,
+                         timelimit=timelimit,
+                         max_results=max_results,
+                     )
+                 )
+
+             # Convert raw results to NewsResult models
+             news_results = [
+                 NewsResult(
+                     date=result.get("date", ""),
+                     title=result.get("title", ""),
+                     body=result.get("body", ""),
+                     url=result.get("url", ""),
+                     image=result.get("image", ""),
+                     source=result.get("source", "")
+                 )
+                 for result in raw_results
+             ]
+
+             return NewsResults(
+                 query=query.strip(),
+                 results=news_results
+             )
+
+         async for attempt in AsyncRetrying(
+             stop=stop_after_attempt(retries + 1),
+             wait=wait_exponential(multiplier=1, min=1, max=10),
+             retry=retry_if_exception_type(Exception),
+             before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
+         ):
+             with attempt:
+                 return await _do_news_search()
+
+     async def read_web_page(
+         self,
+         url: str,
+         *,
+         timeout: Optional[float] = None,
+         headers: Optional[Dict[str, str]] = None,
+         extract_text: bool = True,
+         extract_links: bool = False,
+         extract_images: bool = False,
+         css_selector: Optional[str] = None,
+         max_retries: Optional[int] = None,
+     ) -> WebPageResult:
+         """
+         Read and parse a single web page using selectolax.
+
+         Args:
+             url: URL to fetch and parse
+             timeout: Request timeout in seconds (uses default if not provided)
+             headers: Optional HTTP headers to send
+             extract_text: Whether to extract text content (default: True)
+             extract_links: Whether to extract links (default: False)
+             extract_images: Whether to extract images (default: False)
+             css_selector: Optional CSS selector to extract specific elements
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             WebPageResult model containing the parsed content and metadata
+
+         Raises:
+             httpx.HTTPError: If request fails after all retries
+             Exception: If parsing fails
+         """
+         effective_headers = self._get_default_headers()
+         if headers:
+             effective_headers.update(headers)
+
+         request_timeout = timeout or self.timeout
+         retries = max_retries if max_retries is not None else self.max_retries
+
+         async def _do_fetch_and_parse():
+             async with httpx.AsyncClient(
+                 timeout=request_timeout, follow_redirects=True
+             ) as client:
+                 response = await client.get(url, headers=effective_headers)
+                 response.raise_for_status()
+
+                 # Parse HTML content
+                 HTMLParser = self._get_selectolax_parser()
+                 parser = HTMLParser(response.text)
+
+                 title = ""
+                 text = ""
+                 links = []
+                 images = []
+                 selected_elements = []
+
+                 # Extract title
+                 title_node = parser.css_first("title")
+                 if title_node:
+                     title = title_node.text(strip=True)
+
+                 # Extract text content
+                 if extract_text:
+                     if css_selector:
+                         selected_nodes = parser.css(css_selector)
+                         text = " ".join(
+                             node.text(strip=True) for node in selected_nodes
+                         )
+                     else:
+                         text = parser.text(strip=True)
+
+                 # Extract links
+                 if extract_links:
+                     link_nodes = parser.css("a[href]")
+                     links = [
+                         LinkInfo(
+                             href=node.attrs.get("href", ""),
+                             text=node.text(strip=True),
+                         )
+                         for node in link_nodes
+                         if node.attrs.get("href")
+                     ]
+
+                 # Extract images
+                 if extract_images:
+                     img_nodes = parser.css("img[src]")
+                     images = [
+                         ImageInfo(
+                             src=node.attrs.get("src", ""),
+                             alt=node.attrs.get("alt", ""),
+                             title=node.attrs.get("title", ""),
+                         )
+                         for node in img_nodes
+                         if node.attrs.get("src")
+                     ]
+
+                 # Extract selected elements
+                 if css_selector:
+                     selected_nodes = parser.css(css_selector)
+                     selected_elements = [
+                         SelectedElement(
+                             tag=node.tag,
+                             text=node.text(strip=True),
+                             html=node.html,
+                             attributes=dict(node.attributes),
+                         )
+                         for node in selected_nodes
+                     ]
+
+                 return WebPageResult(
+                     url=url,
+                     status_code=response.status_code,
+                     content_type=response.headers.get("content-type", ""),
+                     title=title,
+                     text=text,
+                     links=links,
+                     images=images,
+                     selected_elements=selected_elements,
+                 )
+
+         async for attempt in AsyncRetrying(
+             stop=stop_after_attempt(retries + 1),
+             wait=wait_exponential(multiplier=1, min=1, max=10),
+             retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
+             before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
+         ):
+             with attempt:
+                 return await _do_fetch_and_parse()
+
+     async def read_web_pages(
+         self,
+         urls: List[str],
+         *,
+         timeout: Optional[float] = None,
+         headers: Optional[Dict[str, str]] = None,
+         extract_text: bool = True,
+         extract_links: bool = False,
+         extract_images: bool = False,
+         css_selector: Optional[str] = None,
+         max_concurrent: Optional[int] = None,
+         max_retries: Optional[int] = None,
+     ) -> WebPageResults:
+         """
+         Read and parse multiple web pages concurrently using selectolax.
+
+         Args:
+             urls: List of URLs to fetch and parse
+             timeout: Request timeout in seconds (uses default if not provided)
+             headers: Optional HTTP headers to send
+             extract_text: Whether to extract text content (default: True)
+             extract_links: Whether to extract links (default: False)
+             extract_images: Whether to extract images (default: False)
+             css_selector: Optional CSS selector to extract specific elements
+             max_concurrent: Maximum number of concurrent requests (uses default if not provided)
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             WebPageResults model containing a WebPageResult or WebPageErrorResult per URL
+
+         Raises:
+             Exception: If any critical error occurs
+         """
+         if not urls:
+             return WebPageResults(urls=[], results=[])
+
+         # Remove duplicates while preserving order
+         unique_urls = []
+         seen = set()
+         for url in urls:
+             if url not in seen:
+                 unique_urls.append(url)
+                 seen.add(url)
+
+         # Create semaphore for concurrency control
+         concurrent_limit = max_concurrent or self.max_concurrent
+         semaphore = asyncio.Semaphore(concurrent_limit)
+
+         async def fetch_page(url: str) -> Union[WebPageResult, WebPageErrorResult]:
+             async with semaphore:
+                 try:
+                     return await self.read_web_page(
+                         url=url,
+                         timeout=timeout,
+                         headers=headers,
+                         extract_text=extract_text,
+                         extract_links=extract_links,
+                         extract_images=extract_images,
+                         css_selector=css_selector,
+                         max_retries=max_retries,
+                     )
+                 except Exception as e:
+                     return WebPageErrorResult(
+                         url=url,
+                         error=str(e),
+                         status_code=None,
+                         content_type="",
+                         title="",
+                         text="",
+                         links=[],
+                         images=[],
+                         selected_elements=[],
+                     )
+
+         # Execute all requests concurrently
+         tasks = [fetch_page(url) for url in unique_urls]
+         results = await asyncio.gather(*tasks, return_exceptions=False)
+
+         return WebPageResults(
+             urls=unique_urls,
+             results=results
+         )
+
+     async def extract_page_links(
+         self,
+         url: str,
+         *,
+         timeout: Optional[float] = None,
+         headers: Optional[Dict[str, str]] = None,
+         css_selector: str = "a[href]",
+         include_external: bool = True,
+         include_internal: bool = True,
+         base_url: Optional[str] = None,
+         max_retries: Optional[int] = None,
+     ) -> ExtractedLinks:
+         """
+         Extract links from a web page using selectolax.
+
+         Args:
+             url: URL to fetch and extract links from
+             timeout: Request timeout in seconds (uses default if not provided)
+             headers: Optional HTTP headers to send
+             css_selector: CSS selector for links (default: "a[href]")
+             include_external: Whether to include external links (default: True)
+             include_internal: Whether to include internal links (default: True)
+             base_url: Base URL for resolving relative links (uses page URL if not provided)
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             ExtractedLinks model with ExtractedLink items carrying href, text, title, and type (internal/external)
+
+         Raises:
+             httpx.HTTPError: If request fails after all retries
+             Exception: If parsing fails
+         """
+         effective_headers = self._get_default_headers()
+         if headers:
+             effective_headers.update(headers)
+
+         request_timeout = timeout or self.timeout
+         retries = max_retries if max_retries is not None else self.max_retries
+
+         async def _do_extract_links():
+             async with httpx.AsyncClient(
+                 timeout=request_timeout, follow_redirects=True
+             ) as client:
+                 response = await client.get(url, headers=effective_headers)
+                 response.raise_for_status()
+
+                 # Parse HTML content
+                 HTMLParser = self._get_selectolax_parser()
+                 parser = HTMLParser(response.text)
+
+                 # Use provided base_url or extract from the page
+                 effective_base_url = base_url or url
+
+                 # Get the domain for internal/external classification
+                 parsed_base = urlparse(effective_base_url)
+                 base_domain = parsed_base.netloc
+
+                 # Extract links
+                 link_nodes = parser.css(css_selector)
+                 links = []
+
+                 for node in link_nodes:
+                     href = node.attrs.get("href", "").strip()
+                     if not href:
+                         continue
+
+                     # Resolve relative URLs
+                     absolute_href = urljoin(effective_base_url, href)
+                     parsed_href = urlparse(absolute_href)
+
+                     # Determine if link is internal or external
+                     is_internal = (
+                         parsed_href.netloc == base_domain or not parsed_href.netloc
+                     )
+                     link_type = "internal" if is_internal else "external"
+
+                     # Filter based on include flags
+                     if (is_internal and not include_internal) or (
+                         not is_internal and not include_external
+                     ):
+                         continue
+
+                     link_info = ExtractedLink(
+                         href=absolute_href,
+                         original_href=href,
+                         text=node.text(strip=True),
+                         title=node.attrs.get("title", ""),
+                         type=link_type,
+                     )
+
+                     links.append(link_info)
+
+                 return ExtractedLinks(
+                     url=url,
+                     results=links
+                 )
+
+         async for attempt in AsyncRetrying(
+             stop=stop_after_attempt(retries + 1),
+             wait=wait_exponential(multiplier=1, min=1, max=10),
+             retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
+             before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
+         ):
+             with attempt:
+                 return await _do_extract_links()
+
+
+ class SearchClient:
+     """
+     Synchronous wrapper around AsyncSearchClient.
+
+     This class provides a synchronous interface to the search functionality
+     by running async operations in an event loop.
+     """
+
+     def __init__(
+         self,
+         *,
+         timeout: float = 30.0,
+         max_concurrent: int = 5,
+         user_agent: Optional[str] = None,
+         default_headers: Optional[Dict[str, str]] = None,
+         max_retries: int = 3,
+     ):
+         """
+         Initialize the SearchClient.
+
+         Args:
+             timeout: Default timeout for HTTP requests in seconds
+             max_concurrent: Maximum number of concurrent requests for batch operations
+             user_agent: User-Agent header for HTTP requests
+             default_headers: Default headers to include in HTTP requests
+             max_retries: Maximum number of retry attempts for failed requests
+         """
+         self._async_client = AsyncSearchClient(
+             timeout=timeout,
+             max_concurrent=max_concurrent,
+             user_agent=user_agent,
+             default_headers=default_headers,
+             max_retries=max_retries,
+         )
+
+     def _run_async(self, coro):
+         """Run an async coroutine in a new event loop."""
+         try:
+             # Try to get the current event loop
+             loop = asyncio.get_running_loop()
+             # If we're already in an event loop, we need to use a thread
+             import concurrent.futures
+
+             with concurrent.futures.ThreadPoolExecutor() as executor:
+                 future = executor.submit(asyncio.run, coro)
+                 return future.result()
+         except RuntimeError:
+             # No event loop running, we can create our own
+             return asyncio.run(coro)
+
+     def search(
+         self,
+         query: str,
+         *,
+         max_results: int = 10,
+         region: str = "wt-wt",
+         safesearch: Literal["on", "moderate", "off"] = "moderate",
+         backend: Literal["auto", "html", "lite"] = "auto",
+     ) -> SearchResults:
+         """
+         Synchronous web search using DuckDuckGo.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return
+             region: Search region (default: "wt-wt" for worldwide)
+             safesearch: Safe search setting ("on", "moderate", "off")
+             backend: Search backend ("auto", "html", "lite")
+
+         Returns:
+             SearchResults model containing SearchResult items with title, href, and body fields
+         """
+         return self._run_async(
+             self._async_client.search(
+                 query,
+                 max_results=max_results,
+                 region=region,
+                 safesearch=safesearch,
+                 backend=backend,
+             )
+         )
+
+     def get_page_content(
+         self,
+         url: str,
+         *,
+         timeout: Optional[float] = None,
+         retries: int = 3,
+         encoding: Optional[str] = None,
+     ) -> str:
+         """
+         Synchronously fetch and return the text content of a web page.
+
+         Args:
+             url: URL of the web page to fetch
+             timeout: Request timeout in seconds (uses client default if not specified)
+             retries: Number of retry attempts for failed requests
+             encoding: Accepted for backward compatibility; the encoding detected by httpx is used
+
+         Returns:
+             Plain text content of the web page
+         """
+         # Delegate to the async client's read_web_page and return its text content.
+         # Note: `encoding` is not forwarded; httpx's detected encoding is used.
+         result = self._run_async(
+             self._async_client.read_web_page(
+                 url, timeout=timeout, max_retries=retries
+             )
+         )
+         return result.text
+
+     def extract_links(
+         self,
+         url: str,
+         *,
+         css_selector: str = "a[href]",
+         include_internal: bool = True,
+         include_external: bool = True,
+         timeout: Optional[float] = None,
+         retries: int = 3,
+     ) -> ExtractedLinks:
+         """
+         Synchronously extract links from a web page.
+
+         Args:
+             url: URL of the web page to parse
+             css_selector: CSS selector for link elements
+             include_internal: Whether to include internal links
+             include_external: Whether to include external links
+             timeout: Request timeout in seconds
+             retries: Number of retry attempts for failed requests
+
+         Returns:
+             ExtractedLinks model with ExtractedLink items carrying href, original_href, text, title, and type fields
+         """
+         return self._run_async(
+             self._async_client.extract_page_links(
+                 url,
+                 css_selector=css_selector,
+                 include_internal=include_internal,
+                 include_external=include_external,
+                 timeout=timeout,
+                 max_retries=retries,
+             )
+         )
+
+     def web_search(
+         self,
+         query: str,
+         *,
+         max_results: int = 10,
+         region: str = "wt-wt",
+         safesearch: Literal["on", "moderate", "off"] = "moderate",
+         timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
+         backend: Literal["auto", "html", "lite"] = "auto",
+         max_retries: Optional[int] = None,
+     ) -> SearchResults:
+         """
+         Synchronously search the web using DuckDuckGo search.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return (default: 10)
+             region: Search region (default: "wt-wt" for worldwide)
+             safesearch: Safe search setting (default: "moderate")
+             timelimit: Time limit for results (d=day, w=week, m=month, y=year)
+             backend: Search backend to use (default: "auto")
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             SearchResults model containing SearchResult items with title, href, and body fields
+
+         Raises:
+             ValueError: If query is empty
+             Exception: If search fails after all retries
+         """
+         return self._run_async(
+             self._async_client.web_search(
+                 query=query,
+                 max_results=max_results,
+                 region=region,
+                 safesearch=safesearch,
+                 timelimit=timelimit,
+                 backend=backend,
+                 max_retries=max_retries,
+             )
+         )
+
+     def search_news(
+         self,
+         query: str,
+         *,
+         max_results: int = 10,
+         region: str = "wt-wt",
+         safesearch: Literal["on", "moderate", "off"] = "moderate",
+         timelimit: Optional[Literal["d", "w", "m"]] = None,
+         max_retries: Optional[int] = None,
+     ) -> NewsResults:
+         """
+         Synchronously search for news using DuckDuckGo news search.
+
+         Args:
+             query: Search query string
+             max_results: Maximum number of results to return (default: 10)
+             region: Search region (default: "wt-wt" for worldwide)
+             safesearch: Safe search setting (default: "moderate")
+             timelimit: Time limit for results (d=day, w=week, m=month)
+             max_retries: Maximum number of retry attempts (uses instance default if not provided)
+
+         Returns:
+             NewsResults model containing NewsResult items with date, title, body, url, image, and source fields
+
+         Raises:
+             ValueError: If query is empty
+             Exception: If search fails after all retries
+         """
+         return self._run_async(
+             self._async_client.search_news(
+                 query=query,
+                 max_results=max_results,
+                 region=region,
+                 safesearch=safesearch,
+                 timelimit=timelimit,
+                 max_retries=max_retries,
+             )
+         )
+
+     def read_web_page(
+         self,
+         url: str,
+         *,
+         timeout: float = 30.0,
+         headers: Optional[Dict[str, str]] = None,
+         extract_text: bool = True,
+         extract_links: bool = False,
+         extract_images: bool = False,
+         css_selector: Optional[str] = None,
+     ) -> WebPageResult:
+         """
+         Synchronously read and parse a single web page using selectolax.
+
+         Args:
+             url: URL to fetch and parse
+             timeout: Request timeout in seconds (default: 30.0)
+             headers: Optional HTTP headers to send
+             extract_text: Whether to extract text content (default: True)
+             extract_links: Whether to extract links (default: False)
+             extract_images: Whether to extract images (default: False)
+             css_selector: Optional CSS selector to extract specific elements
+
+         Returns:
+             WebPageResult model containing the parsed content and metadata
+
+         Raises:
+             httpx.HTTPError: If request fails
+             Exception: If parsing fails
+         """
+         return self._run_async(
+             self._async_client.read_web_page(
+                 url=url,
+                 timeout=timeout,
+                 headers=headers,
+                 extract_text=extract_text,
+                 extract_links=extract_links,
+                 extract_images=extract_images,
+                 css_selector=css_selector,
+             )
+         )
+
+     def read_web_pages(
+         self,
+         urls: List[str],
+         *,
+         timeout: float = 30.0,
+         headers: Optional[Dict[str, str]] = None,
+         extract_text: bool = True,
+         extract_links: bool = False,
+         extract_images: bool = False,
+         css_selector: Optional[str] = None,
+         max_concurrent: Optional[int] = None,
+     ) -> WebPageResults:
+         """
+         Synchronously read and parse multiple web pages concurrently using selectolax.
+
+         Args:
+             urls: List of URLs to fetch and parse
+             timeout: Request timeout in seconds (default: 30.0)
+             headers: Optional HTTP headers to send
+             extract_text: Whether to extract text content (default: True)
+             extract_links: Whether to extract links (default: False)
+             extract_images: Whether to extract images (default: False)
+             css_selector: Optional CSS selector to extract specific elements
+             max_concurrent: Maximum concurrent requests (uses client default if not specified)
+
+         Returns:
+             WebPageResults model containing the parsed content and metadata for each URL
+
+         Raises:
+             httpx.HTTPError: If requests fail
+             Exception: If parsing fails
+         """
+         return self._run_async(
+             self._async_client.read_web_pages(
+                 urls=urls,
+                 timeout=timeout,
+                 headers=headers,
+                 extract_text=extract_text,
+                 extract_links=extract_links,
+                 extract_images=extract_images,
+                 css_selector=css_selector,
+                 max_concurrent=max_concurrent,
+             )
+         )
+
+     def extract_page_links(
+         self,
+         url: str,
+         *,
+         timeout: float = 30.0,
+         headers: Optional[Dict[str, str]] = None,
+         css_selector: str = "a[href]",
+         include_internal: bool = True,
+         include_external: bool = True,
+         base_url: Optional[str] = None,
+     ) -> ExtractedLinks:
+         """
+         Synchronously extract all links from a web page.
+
+         Args:
+             url: URL to fetch and extract links from
+             timeout: Request timeout in seconds (default: 30.0)
+             headers: Optional HTTP headers to send
+             css_selector: CSS selector for link elements (default: "a[href]")
+             include_internal: Whether to include internal links (default: True)
+             include_external: Whether to include external links (default: True)
+             base_url: Base URL for resolving relative links (uses page URL if not provided)
+
+         Returns:
+             ExtractedLinks model with ExtractedLink items carrying href, original_href, text, title, and type fields
+
+         Raises:
+             httpx.HTTPError: If request fails
+             Exception: If parsing fails
+         """
+         return self._run_async(
+             self._async_client.extract_page_links(
+                 url=url,
+                 timeout=timeout,
+                 headers=headers,
+                 css_selector=css_selector,
+                 include_internal=include_internal,
+                 include_external=include_external,
+                 base_url=base_url,
+             )
+         )
+
+     def close(self):
+         """Close the client. No persistent resources are held, so this is a no-op."""
+         pass
+
+     def __enter__(self):
+         """Context manager entry."""
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit."""
+         self.close()
+
+
+ @overload
+ def create_search_client(
+     *,
+     timeout: float = 30.0,
+     max_concurrent: int = 5,
+     user_agent: Optional[str] = None,
+     default_headers: Optional[Dict[str, str]] = None,
+     max_retries: int = 3,
+     async_client: Literal[True],
+ ) -> AsyncSearchClient: ...
+
+
+ @overload
+ def create_search_client(
+     *,
+     timeout: float = 30.0,
+     max_concurrent: int = 5,
+     user_agent: Optional[str] = None,
+     default_headers: Optional[Dict[str, str]] = None,
+     max_retries: int = 3,
+     async_client: Literal[False] = ...,
+ ) -> SearchClient: ...
+
+
+ def create_search_client(
+     *,
+     timeout: float = 30.0,
+     max_concurrent: int = 5,
+     user_agent: Optional[str] = None,
+     default_headers: Optional[Dict[str, str]] = None,
+     max_retries: int = 3,
+     async_client: bool = False,
+ ) -> Union[SearchClient, AsyncSearchClient]:
+     """
+     Create a new SearchClient instance.
+
+     Args:
+         timeout: Default timeout for HTTP requests in seconds
+         max_concurrent: Maximum number of concurrent requests for batch operations
+         user_agent: User-Agent header for HTTP requests
+         default_headers: Default headers to include in HTTP requests
+         max_retries: Maximum number of retry attempts for failed requests
+         async_client: Whether to return an async client instance
+
+     Returns:
+         SearchClient or AsyncSearchClient instance based on async_client parameter
+     """
+     params = locals()
+     del params["async_client"]
+
+     if async_client:
+         return AsyncSearchClient(**params)
+     else:
+         return SearchClient(**params)
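
A minimal usage sketch based only on the signatures shown in the hunk above. It assumes the module is importable as hammad.web.search.client (per its docstring; the package may also re-export these names elsewhere) and that the result models expose the fields they are constructed with (results, title, href, links, text, date).

# Sketch, not part of the released file: exercises create_search_client,
# web_search, read_web_page, and search_news as declared in this diff.
import asyncio

from hammad.web.search.client import create_search_client


def sync_example() -> None:
    # Synchronous client; async_client defaults to False.
    client = create_search_client(timeout=15.0, max_retries=2)
    results = client.web_search("python packaging", max_results=5)
    for item in results.results:
        print(item.title, item.href)

    page = client.read_web_page("https://example.com", extract_links=True)
    print(page.title, len(page.links))


async def async_example() -> None:
    # Async client via the Literal[True] overload of create_search_client.
    client = create_search_client(async_client=True)
    news = await client.search_news("open source", max_results=3)
    for item in news.results:
        print(item.date, item.title)


if __name__ == "__main__":
    sync_example()
    asyncio.run(async_example())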