hammad-python 0.0.29__py3-none-any.whl → 0.0.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. ham/__init__.py +10 -0
  2. {hammad_python-0.0.29.dist-info → hammad_python-0.0.31.dist-info}/METADATA +6 -32
  3. hammad_python-0.0.31.dist-info/RECORD +6 -0
  4. hammad/__init__.py +0 -84
  5. hammad/_internal.py +0 -256
  6. hammad/_main.py +0 -226
  7. hammad/cache/__init__.py +0 -40
  8. hammad/cache/base_cache.py +0 -181
  9. hammad/cache/cache.py +0 -169
  10. hammad/cache/decorators.py +0 -261
  11. hammad/cache/file_cache.py +0 -80
  12. hammad/cache/ttl_cache.py +0 -74
  13. hammad/cli/__init__.py +0 -33
  14. hammad/cli/animations.py +0 -573
  15. hammad/cli/plugins.py +0 -867
  16. hammad/cli/styles/__init__.py +0 -55
  17. hammad/cli/styles/settings.py +0 -139
  18. hammad/cli/styles/types.py +0 -358
  19. hammad/cli/styles/utils.py +0 -634
  20. hammad/data/__init__.py +0 -90
  21. hammad/data/collections/__init__.py +0 -49
  22. hammad/data/collections/collection.py +0 -326
  23. hammad/data/collections/indexes/__init__.py +0 -37
  24. hammad/data/collections/indexes/qdrant/__init__.py +0 -1
  25. hammad/data/collections/indexes/qdrant/index.py +0 -723
  26. hammad/data/collections/indexes/qdrant/settings.py +0 -94
  27. hammad/data/collections/indexes/qdrant/utils.py +0 -210
  28. hammad/data/collections/indexes/tantivy/__init__.py +0 -1
  29. hammad/data/collections/indexes/tantivy/index.py +0 -426
  30. hammad/data/collections/indexes/tantivy/settings.py +0 -40
  31. hammad/data/collections/indexes/tantivy/utils.py +0 -176
  32. hammad/data/configurations/__init__.py +0 -35
  33. hammad/data/configurations/configuration.py +0 -564
  34. hammad/data/models/__init__.py +0 -50
  35. hammad/data/models/extensions/__init__.py +0 -4
  36. hammad/data/models/extensions/pydantic/__init__.py +0 -42
  37. hammad/data/models/extensions/pydantic/converters.py +0 -759
  38. hammad/data/models/fields.py +0 -546
  39. hammad/data/models/model.py +0 -1078
  40. hammad/data/models/utils.py +0 -280
  41. hammad/data/sql/__init__.py +0 -24
  42. hammad/data/sql/database.py +0 -576
  43. hammad/data/sql/types.py +0 -127
  44. hammad/data/types/__init__.py +0 -75
  45. hammad/data/types/file.py +0 -431
  46. hammad/data/types/multimodal/__init__.py +0 -36
  47. hammad/data/types/multimodal/audio.py +0 -200
  48. hammad/data/types/multimodal/image.py +0 -182
  49. hammad/data/types/text.py +0 -1308
  50. hammad/formatting/__init__.py +0 -33
  51. hammad/formatting/json/__init__.py +0 -27
  52. hammad/formatting/json/converters.py +0 -158
  53. hammad/formatting/text/__init__.py +0 -63
  54. hammad/formatting/text/converters.py +0 -723
  55. hammad/formatting/text/markdown.py +0 -131
  56. hammad/formatting/yaml/__init__.py +0 -26
  57. hammad/formatting/yaml/converters.py +0 -5
  58. hammad/genai/__init__.py +0 -217
  59. hammad/genai/a2a/__init__.py +0 -32
  60. hammad/genai/a2a/workers.py +0 -552
  61. hammad/genai/agents/__init__.py +0 -59
  62. hammad/genai/agents/agent.py +0 -1973
  63. hammad/genai/agents/run.py +0 -1024
  64. hammad/genai/agents/types/__init__.py +0 -42
  65. hammad/genai/agents/types/agent_context.py +0 -13
  66. hammad/genai/agents/types/agent_event.py +0 -128
  67. hammad/genai/agents/types/agent_hooks.py +0 -220
  68. hammad/genai/agents/types/agent_messages.py +0 -31
  69. hammad/genai/agents/types/agent_response.py +0 -125
  70. hammad/genai/agents/types/agent_stream.py +0 -327
  71. hammad/genai/graphs/__init__.py +0 -125
  72. hammad/genai/graphs/_utils.py +0 -190
  73. hammad/genai/graphs/base.py +0 -1828
  74. hammad/genai/graphs/plugins.py +0 -316
  75. hammad/genai/graphs/types.py +0 -638
  76. hammad/genai/models/__init__.py +0 -1
  77. hammad/genai/models/embeddings/__init__.py +0 -43
  78. hammad/genai/models/embeddings/model.py +0 -226
  79. hammad/genai/models/embeddings/run.py +0 -163
  80. hammad/genai/models/embeddings/types/__init__.py +0 -37
  81. hammad/genai/models/embeddings/types/embedding_model_name.py +0 -75
  82. hammad/genai/models/embeddings/types/embedding_model_response.py +0 -76
  83. hammad/genai/models/embeddings/types/embedding_model_run_params.py +0 -66
  84. hammad/genai/models/embeddings/types/embedding_model_settings.py +0 -47
  85. hammad/genai/models/language/__init__.py +0 -57
  86. hammad/genai/models/language/model.py +0 -1098
  87. hammad/genai/models/language/run.py +0 -878
  88. hammad/genai/models/language/types/__init__.py +0 -40
  89. hammad/genai/models/language/types/language_model_instructor_mode.py +0 -47
  90. hammad/genai/models/language/types/language_model_messages.py +0 -28
  91. hammad/genai/models/language/types/language_model_name.py +0 -239
  92. hammad/genai/models/language/types/language_model_request.py +0 -127
  93. hammad/genai/models/language/types/language_model_response.py +0 -217
  94. hammad/genai/models/language/types/language_model_response_chunk.py +0 -56
  95. hammad/genai/models/language/types/language_model_settings.py +0 -89
  96. hammad/genai/models/language/types/language_model_stream.py +0 -600
  97. hammad/genai/models/language/utils/__init__.py +0 -28
  98. hammad/genai/models/language/utils/requests.py +0 -421
  99. hammad/genai/models/language/utils/structured_outputs.py +0 -135
  100. hammad/genai/models/model_provider.py +0 -4
  101. hammad/genai/models/multimodal.py +0 -47
  102. hammad/genai/models/reranking.py +0 -26
  103. hammad/genai/types/__init__.py +0 -1
  104. hammad/genai/types/base.py +0 -215
  105. hammad/genai/types/history.py +0 -290
  106. hammad/genai/types/tools.py +0 -507
  107. hammad/logging/__init__.py +0 -35
  108. hammad/logging/decorators.py +0 -834
  109. hammad/logging/logger.py +0 -1018
  110. hammad/mcp/__init__.py +0 -53
  111. hammad/mcp/client/__init__.py +0 -35
  112. hammad/mcp/client/client.py +0 -624
  113. hammad/mcp/client/client_service.py +0 -400
  114. hammad/mcp/client/settings.py +0 -178
  115. hammad/mcp/servers/__init__.py +0 -26
  116. hammad/mcp/servers/launcher.py +0 -1161
  117. hammad/runtime/__init__.py +0 -32
  118. hammad/runtime/decorators.py +0 -142
  119. hammad/runtime/run.py +0 -299
  120. hammad/service/__init__.py +0 -49
  121. hammad/service/create.py +0 -527
  122. hammad/service/decorators.py +0 -283
  123. hammad/types.py +0 -288
  124. hammad/typing/__init__.py +0 -435
  125. hammad/web/__init__.py +0 -43
  126. hammad/web/http/__init__.py +0 -1
  127. hammad/web/http/client.py +0 -944
  128. hammad/web/models.py +0 -275
  129. hammad/web/openapi/__init__.py +0 -1
  130. hammad/web/openapi/client.py +0 -740
  131. hammad/web/search/__init__.py +0 -1
  132. hammad/web/search/client.py +0 -1023
  133. hammad/web/utils.py +0 -472
  134. hammad_python-0.0.29.dist-info/RECORD +0 -135
  135. {hammad → ham}/py.typed +0 -0
  136. {hammad_python-0.0.29.dist-info → hammad_python-0.0.31.dist-info}/WHEEL +0 -0
  137. {hammad_python-0.0.29.dist-info → hammad_python-0.0.31.dist-info}/licenses/LICENSE +0 -0
hammad/web/search/client.py
@@ -1,1023 +0,0 @@
-"""hammad.web.search.client"""
-
-from __future__ import annotations
-
-import asyncio
-import logging
-from typing import Any, Dict, List, Literal, Optional, Union, overload
-from urllib.parse import urljoin, urlparse
-
-import httpx
-from tenacity import (
-    AsyncRetrying,
-    retry_if_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-    before_sleep_log,
-)
-
-from ..models import (
-    SearchResult,
-    NewsResult,
-    SearchResults,
-    NewsResults,
-    WebPageResult,
-    WebPageErrorResult,
-    WebPageResults,
-    ExtractedLinks,
-    ExtractedLink,
-    LinkInfo,
-    ImageInfo,
-    SelectedElement,
-)
-
-__all__ = ("AsyncSearchClient", "SearchClient", "create_search_client")
-
-
-class AsyncSearchClient:
-    """
-    Search client that provides web search and page parsing capabilities.
-
-    This client uses lazy loading for DuckDuckGo search and selectolax HTML parsing
-    to minimize import overhead and memory usage.
-    """
-
-    def __init__(
-        self,
-        *,
-        timeout: float = 30.0,
-        max_concurrent: int = 5,
-        user_agent: Optional[str] = None,
-        default_headers: Optional[Dict[str, str]] = None,
-        max_retries: int = 3,
-    ):
-        """
-        Initialize the SearchClient.
-
-        Args:
-            timeout: Default timeout for HTTP requests in seconds
-            max_concurrent: Maximum number of concurrent requests for batch operations
-            user_agent: User-Agent header for HTTP requests
-            default_headers: Default headers to include in HTTP requests
-            max_retries: Maximum number of retry attempts for failed requests
-        """
-        self.timeout = timeout
-        self.max_concurrent = max_concurrent
-        self.user_agent = user_agent or "Mozilla/5.0 (compatible; SearchClient/1.0)"
-        self.default_headers = default_headers or {}
-        self.max_retries = max_retries
-
-        # Lazy-loaded resources
-        self._ddgs_client = None
-        self._selectolax_parser_class = None
-
-    def _get_duckduckgo_client(self):
-        """Get a DuckDuckGo search client using lazy import and singleton pattern."""
-        if self._ddgs_client is None:
-            try:
-                from ddgs import DDGS
-
-                self._ddgs_client = DDGS
-            except ImportError as e:
-                raise ImportError(
-                    "duckduckgo_search is required for web search functionality. "
-                    "Install with: pip install duckduckgo-search"
-                ) from e
-        return self._ddgs_client
-
-    def _get_selectolax_parser(self):
-        """Get selectolax HTMLParser class using lazy import and singleton pattern."""
-        if self._selectolax_parser_class is None:
-            try:
-                from selectolax.parser import HTMLParser
-
-                self._selectolax_parser_class = HTMLParser
-            except ImportError as e:
-                raise ImportError(
-                    "selectolax is required for HTML parsing functionality. "
-                    "Install with: pip install selectolax"
-                ) from e
-        return self._selectolax_parser_class
-
-    def _get_default_headers(self) -> Dict[str, str]:
-        """Get default headers for HTTP requests."""
-        headers = {"User-Agent": self.user_agent}
-        headers.update(self.default_headers)
-        return headers
-
-    async def search(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
-        backend: Literal["auto", "html", "lite"] = "auto",
-        max_retries: Optional[int] = None,
-    ) -> SearchResults:
-        """
-        (deprecated in favor of `web_search`)
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month, y=year)
-            backend: Search backend to use (default: "auto")
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of search result dictionaries with 'title', 'href', and 'body' keys
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        from rich import print
-
-        print(
-            "[bold yellow]WARNING: [/bold yellow] [yellow]Using `AsyncSearchClient.[bold light_salmon3]search[/bold light_salmon3]` is now deprecated in favor of `AsyncSearchClient.[bold light_salmon3]web_search[/bold light_salmon3]`[/yellow]"
-        )
-        return await self.web_search(
-            query,
-            max_results=max_results,
-            region=region,
-            safesearch=safesearch,
-            timelimit=timelimit,
-        )
-
-    async def web_search(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
-        backend: Literal["auto", "html", "lite"] = "auto",
-        max_retries: Optional[int] = None,
-    ) -> SearchResults:
-        """
-        Search the web using DuckDuckGo search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month, y=year)
-            backend: Search backend to use (default: "auto")
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of search result dictionaries with 'title', 'href', and 'body' keys
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        if not query or not query.strip():
-            raise ValueError("Query cannot be empty")
-
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_search():
-            DDGS = self._get_duckduckgo_client()
-            with DDGS() as ddgs:
-                raw_results = list(
-                    ddgs.text(
-                        query.strip(),
-                        region=region,
-                        safesearch=safesearch,
-                        timelimit=timelimit,
-                        backend=backend,
-                        max_results=max_results,
-                    )
-                )
-
-                # Convert raw results to SearchResult models
-                search_results = [
-                    SearchResult(
-                        title=result.get("title", ""),
-                        href=result.get("href", ""),
-                        body=result.get("body", ""),
-                    )
-                    for result in raw_results
-                ]
-
-                return SearchResults(query=query.strip(), results=search_results)
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type(Exception),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_search()
-
-    async def search_news(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m"]] = None,
-        max_retries: Optional[int] = None,
-    ) -> NewsResults:
-        """
-        Search for news using DuckDuckGo news search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of news result dictionaries with date, title, body, url, image, and source
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        if not query or not query.strip():
-            raise ValueError("Query cannot be empty")
-
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_news_search():
-            DDGS = self._get_duckduckgo_client()
-            with DDGS() as ddgs:
-                raw_results = list(
-                    ddgs.news(
-                        query.strip(),
-                        region=region,
-                        safesearch=safesearch,
-                        timelimit=timelimit,
-                        max_results=max_results,
-                    )
-                )
-
-                # Convert raw results to NewsResult models
-                news_results = [
-                    NewsResult(
-                        date=result.get("date", ""),
-                        title=result.get("title", ""),
-                        body=result.get("body", ""),
-                        url=result.get("url", ""),
-                        image=result.get("image", ""),
-                        source=result.get("source", ""),
-                    )
-                    for result in raw_results
-                ]
-
-                return NewsResults(query=query.strip(), results=news_results)
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type(Exception),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_news_search()
-
-    async def read_web_page(
-        self,
-        url: str,
-        *,
-        timeout: Optional[float] = None,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-        max_retries: Optional[int] = None,
-    ) -> WebPageResult:
-        """
-        Read and parse a single web page using selectolax.
-
-        Args:
-            url: URL to fetch and parse
-            timeout: Request timeout in seconds (uses default if not provided)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            Dictionary containing parsed content and metadata
-
-        Raises:
-            httpx.HTTPError: If request fails after all retries
-            Exception: If parsing fails
-        """
-        effective_headers = self._get_default_headers()
-        if headers:
-            effective_headers.update(headers)
-
-        request_timeout = timeout or self.timeout
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_fetch_and_parse():
-            async with httpx.AsyncClient(
-                timeout=request_timeout, follow_redirects=True
-            ) as client:
-                response = await client.get(url, headers=effective_headers)
-                response.raise_for_status()
-
-                # Parse HTML content
-                HTMLParser = self._get_selectolax_parser()
-                parser = HTMLParser(response.text)
-
-                title = ""
-                text = ""
-                links = []
-                images = []
-                selected_elements = []
-
-                # Extract title
-                title_node = parser.css_first("title")
-                if title_node:
-                    title = title_node.text(strip=True)
-
-                # Extract text content
-                if extract_text:
-                    if css_selector:
-                        selected_nodes = parser.css(css_selector)
-                        text = " ".join(
-                            node.text(strip=True) for node in selected_nodes
-                        )
-                    else:
-                        text = parser.text(strip=True)
-
-                # Extract links
-                if extract_links:
-                    link_nodes = parser.css("a[href]")
-                    links = [
-                        LinkInfo(
-                            href=node.attrs.get("href", ""),
-                            text=node.text(strip=True),
-                        )
-                        for node in link_nodes
-                        if node.attrs.get("href")
-                    ]
-
-                # Extract images
-                if extract_images:
-                    img_nodes = parser.css("img[src]")
-                    images = [
-                        ImageInfo(
-                            src=node.attrs.get("src", ""),
-                            alt=node.attrs.get("alt", ""),
-                            title=node.attrs.get("title", ""),
-                        )
-                        for node in img_nodes
-                        if node.attrs.get("src")
-                    ]
-
-                # Extract selected elements
-                if css_selector:
-                    selected_nodes = parser.css(css_selector)
-                    selected_elements = [
-                        SelectedElement(
-                            tag=node.tag,
-                            text=node.text(strip=True),
-                            html=node.html,
-                            attributes=dict(node.attributes),
-                        )
-                        for node in selected_nodes
-                    ]
-
-                return WebPageResult(
-                    url=url,
-                    status_code=response.status_code,
-                    content_type=response.headers.get("content-type", ""),
-                    title=title,
-                    text=text,
-                    links=links,
-                    images=images,
-                    selected_elements=selected_elements,
-                )
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_fetch_and_parse()
-
-    async def read_web_pages(
-        self,
-        urls: List[str],
-        *,
-        timeout: Optional[float] = None,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-        max_concurrent: Optional[int] = None,
-        max_retries: Optional[int] = None,
-    ) -> WebPageResults:
-        """
-        Read and parse multiple web pages concurrently using selectolax.
-
-        Args:
-            urls: List of URLs to fetch and parse
-            timeout: Request timeout in seconds (uses default if not provided)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-            max_concurrent: Maximum number of concurrent requests (uses default if not provided)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of dictionaries containing parsed content and metadata
-
-        Raises:
-            Exception: If any critical error occurs
-        """
-        if not urls:
-            return []
-
-        # Remove duplicates while preserving order
-        unique_urls = []
-        seen = set()
-        for url in urls:
-            if url not in seen:
-                unique_urls.append(url)
-                seen.add(url)
-
-        # Create semaphore for concurrency control
-        concurrent_limit = max_concurrent or self.max_concurrent
-        semaphore = asyncio.Semaphore(concurrent_limit)
-
-        async def fetch_page(url: str) -> Dict[str, Any]:
-            async with semaphore:
-                try:
-                    return await self.read_web_page(
-                        url=url,
-                        timeout=timeout,
-                        headers=headers,
-                        extract_text=extract_text,
-                        extract_links=extract_links,
-                        extract_images=extract_images,
-                        css_selector=css_selector,
-                        max_retries=max_retries,
-                    )
-                except Exception as e:
-                    return WebPageErrorResult(
-                        url=url,
-                        error=str(e),
-                        status_code=None,
-                        content_type="",
-                        title="",
-                        text="",
-                        links=[],
-                        images=[],
-                        selected_elements=[],
-                    )
-
-        # Execute all requests concurrently
-        tasks = [fetch_page(url) for url in unique_urls]
-        results = await asyncio.gather(*tasks, return_exceptions=False)
-
-        return WebPageResults(urls=unique_urls, results=results)
-
-    async def extract_page_links(
-        self,
-        url: str,
-        *,
-        timeout: Optional[float] = None,
-        headers: Optional[Dict[str, str]] = None,
-        css_selector: str = "a[href]",
-        include_external: bool = True,
-        include_internal: bool = True,
-        base_url: Optional[str] = None,
-        max_retries: Optional[int] = None,
-    ) -> ExtractedLinks:
-        """
-        Extract links from a web page using selectolax.
-
-        Args:
-            url: URL to fetch and extract links from
-            timeout: Request timeout in seconds (uses default if not provided)
-            headers: Optional HTTP headers to send
-            css_selector: CSS selector for links (default: "a[href]")
-            include_external: Whether to include external links (default: True)
-            include_internal: Whether to include internal links (default: True)
-            base_url: Base URL for resolving relative links (uses page URL if not provided)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of link dictionaries with href, text, title, and type (internal/external)
-
-        Raises:
-            httpx.HTTPError: If request fails after all retries
-            Exception: If parsing fails
-        """
-        effective_headers = self._get_default_headers()
-        if headers:
-            effective_headers.update(headers)
-
-        request_timeout = timeout or self.timeout
-        retries = max_retries if max_retries is not None else self.max_retries
-
-        async def _do_extract_links():
-            async with httpx.AsyncClient(
-                timeout=request_timeout, follow_redirects=True
-            ) as client:
-                response = await client.get(url, headers=effective_headers)
-                response.raise_for_status()
-
-                # Parse HTML content
-                HTMLParser = self._get_selectolax_parser()
-                parser = HTMLParser(response.text)
-
-                # Use provided base_url or extract from the page
-                effective_base_url = base_url or url
-
-                # Get the domain for internal/external classification
-                parsed_base = urlparse(effective_base_url)
-                base_domain = parsed_base.netloc
-
-                # Extract links
-                link_nodes = parser.css(css_selector)
-                links = []
-
-                for node in link_nodes:
-                    href = node.attrs.get("href", "").strip()
-                    if not href:
-                        continue
-
-                    # Resolve relative URLs
-                    absolute_href = urljoin(effective_base_url, href)
-                    parsed_href = urlparse(absolute_href)
-
-                    # Determine if link is internal or external
-                    is_internal = (
-                        parsed_href.netloc == base_domain or not parsed_href.netloc
-                    )
-                    link_type = "internal" if is_internal else "external"
-
-                    # Filter based on include flags
-                    if (is_internal and not include_internal) or (
-                        not is_internal and not include_external
-                    ):
-                        continue
-
-                    link_info = ExtractedLink(
-                        href=absolute_href,
-                        original_href=href,
-                        text=node.text(strip=True),
-                        title=node.attrs.get("title", ""),
-                        type=link_type,
-                    )
-
-                    links.append(link_info)
-
-                return ExtractedLinks(url=url, results=links)
-
-        async for attempt in AsyncRetrying(
-            stop=stop_after_attempt(retries + 1),
-            wait=wait_exponential(multiplier=1, min=1, max=10),
-            retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
-            before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
-        ):
-            with attempt:
-                return await _do_extract_links()
-
-
-class SearchClient:
-    """
-    Synchronous wrapper around AsyncSearchClient.
-
-    This class provides a synchronous interface to the search functionality
-    by running async operations in an event loop.
-    """
-
-    def __init__(
-        self,
-        *,
-        timeout: float = 30.0,
-        max_concurrent: int = 5,
-        user_agent: Optional[str] = None,
-        default_headers: Optional[Dict[str, str]] = None,
-        max_retries: int = 3,
-    ):
-        """
-        Initialize the SearchClient.
-
-        Args:
-            timeout: Default timeout for HTTP requests in seconds
-            max_concurrent: Maximum number of concurrent requests for batch operations
-            user_agent: User-Agent header for HTTP requests
-            default_headers: Default headers to include in HTTP requests
-            max_retries: Maximum number of retry attempts for failed requests
-        """
-        self._async_client = AsyncSearchClient(
-            timeout=timeout,
-            max_concurrent=max_concurrent,
-            user_agent=user_agent,
-            default_headers=default_headers,
-            max_retries=max_retries,
-        )
-
-    def _run_async(self, coro):
-        """Run an async coroutine in a new event loop."""
-        try:
-            # Try to get the current event loop
-            loop = asyncio.get_running_loop()
-            # If we're already in an event loop, we need to use a thread
-            import concurrent.futures
-
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                future = executor.submit(asyncio.run, coro)
-                return future.result()
-        except RuntimeError:
-            # No event loop running, we can create our own
-            return asyncio.run(coro)
-
-    def search(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: str = "moderate",
-        backend: str = "api",
-    ) -> SearchResults:
-        """
-        Synchronous web search using DuckDuckGo.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting ("on", "moderate", "off")
-            backend: Search backend ("api", "html", "lite")
-
-        Returns:
-            List of search result dictionaries with keys: title, href, body
-        """
-        return self._run_async(
-            self._async_client.search(
-                query,
-                max_results=max_results,
-                region=region,
-                safesearch=safesearch,
-                backend=backend,
-            )
-        )
-
-    def get_page_content(
-        self,
-        url: str,
-        *,
-        timeout: Optional[float] = None,
-        retries: int = 3,
-        encoding: Optional[str] = None,
-    ) -> str:
-        """
-        Synchronously fetch and return the text content of a web page.
-
-        Args:
-            url: URL of the web page to fetch
-            timeout: Request timeout in seconds (uses client default if not specified)
-            retries: Number of retry attempts for failed requests
-            encoding: Text encoding to use (auto-detected if not specified)
-
-        Returns:
-            Plain text content of the web page
-        """
-        return self._run_async(
-            self._async_client.get_page_content(
-                url, timeout=timeout, retries=retries, encoding=encoding
-            )
-        )
-
-    def extract_links(
-        self,
-        url: str,
-        *,
-        css_selector: str = "a[href]",
-        include_internal: bool = True,
-        include_external: bool = True,
-        timeout: Optional[float] = None,
-        retries: int = 3,
-    ) -> ExtractedLinks:
-        """
-        Synchronously extract links from a web page.
-
-        Args:
-            url: URL of the web page to parse
-            css_selector: CSS selector for link elements
-            include_internal: Whether to include internal links
-            include_external: Whether to include external links
-            timeout: Request timeout in seconds
-            retries: Number of retry attempts for failed requests
-
-        Returns:
-            List of link dictionaries with keys: href, original_href, text, title, type
-        """
-        return self._run_async(
-            self._async_client.extract_links(
-                url,
-                css_selector=css_selector,
-                include_internal=include_internal,
-                include_external=include_external,
-                timeout=timeout,
-                retries=retries,
-            )
-        )
-
-    def web_search(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m", "y"]] = None,
-        backend: Literal["auto", "html", "lite"] = "auto",
-        max_retries: Optional[int] = None,
-    ) -> SearchResults:
-        """
-        Synchronously search the web using DuckDuckGo search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month, y=year)
-            backend: Search backend to use (default: "auto")
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of search result dictionaries with 'title', 'href', and 'body' keys
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        return self._run_async(
-            self._async_client.web_search(
-                query=query,
-                max_results=max_results,
-                region=region,
-                safesearch=safesearch,
-                timelimit=timelimit,
-                backend=backend,
-                max_retries=max_retries,
-            )
-        )
-
-    def search_news(
-        self,
-        query: str,
-        *,
-        max_results: int = 10,
-        region: str = "wt-wt",
-        safesearch: Literal["on", "moderate", "off"] = "moderate",
-        timelimit: Optional[Literal["d", "w", "m"]] = None,
-        max_retries: Optional[int] = None,
-    ) -> NewsResults:
-        """
-        Synchronously search for news using DuckDuckGo news search.
-
-        Args:
-            query: Search query string
-            max_results: Maximum number of results to return (default: 10)
-            region: Search region (default: "wt-wt" for worldwide)
-            safesearch: Safe search setting (default: "moderate")
-            timelimit: Time limit for results (d=day, w=week, m=month)
-            max_retries: Maximum number of retry attempts (uses instance default if not provided)
-
-        Returns:
-            List of news result dictionaries with date, title, body, url, image, and source
-
-        Raises:
-            ValueError: If query is empty
-            Exception: If search fails after all retries
-        """
-        return self._run_async(
-            self._async_client.search_news(
-                query=query,
-                max_results=max_results,
-                region=region,
-                safesearch=safesearch,
-                timelimit=timelimit,
-                max_retries=max_retries,
-            )
-        )
-
-    def read_web_page(
-        self,
-        url: str,
-        *,
-        timeout: float = 30.0,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-    ) -> WebPageResult:
-        """
-        Synchronously read and parse a single web page using selectolax.
-
-        Args:
-            url: URL to fetch and parse
-            timeout: Request timeout in seconds (default: 30.0)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-
-        Returns:
-            Dictionary containing parsed content and metadata
-
-        Raises:
-            httpx.HTTPError: If request fails
-            Exception: If parsing fails
-        """
-        return self._run_async(
-            self._async_client.read_web_page(
-                url=url,
-                timeout=timeout,
-                headers=headers,
-                extract_text=extract_text,
-                extract_links=extract_links,
-                extract_images=extract_images,
-                css_selector=css_selector,
-            )
-        )
-
-    def read_web_pages(
-        self,
-        urls: List[str],
-        *,
-        timeout: float = 30.0,
-        headers: Optional[Dict[str, str]] = None,
-        extract_text: bool = True,
-        extract_links: bool = False,
-        extract_images: bool = False,
-        css_selector: Optional[str] = None,
-        max_concurrent: Optional[int] = None,
-    ) -> WebPageResults:
-        """
-        Synchronously read and parse multiple web pages concurrently using selectolax.
-
-        Args:
-            urls: List of URLs to fetch and parse
-            timeout: Request timeout in seconds (default: 30.0)
-            headers: Optional HTTP headers to send
-            extract_text: Whether to extract text content (default: True)
-            extract_links: Whether to extract links (default: False)
-            extract_images: Whether to extract images (default: False)
-            css_selector: Optional CSS selector to extract specific elements
-            max_concurrent: Maximum concurrent requests (uses client default if not specified)
-
-        Returns:
-            List of dictionaries containing parsed content and metadata for each URL
-
-        Raises:
-            httpx.HTTPError: If requests fail
-            Exception: If parsing fails
-        """
-        return self._run_async(
-            self._async_client.read_web_pages(
-                urls=urls,
-                timeout=timeout,
-                headers=headers,
-                extract_text=extract_text,
-                extract_links=extract_links,
-                extract_images=extract_images,
-                css_selector=css_selector,
-                max_concurrent=max_concurrent,
-            )
-        )
-
-    def extract_page_links(
-        self,
-        url: str,
-        *,
-        timeout: float = 30.0,
-        headers: Optional[Dict[str, str]] = None,
-        css_selector: str = "a[href]",
-        include_internal: bool = True,
-        include_external: bool = True,
-        base_url: Optional[str] = None,
-    ) -> ExtractedLinks:
-        """
-        Synchronously extract all links from a web page.
-
-        Args:
-            url: URL to fetch and extract links from
-            timeout: Request timeout in seconds (default: 30.0)
-            headers: Optional HTTP headers to send
-            css_selector: CSS selector for link elements (default: "a[href]")
-            include_internal: Whether to include internal links (default: True)
-            include_external: Whether to include external links (default: True)
-            base_url: Base URL for resolving relative links (uses page URL if not provided)
-
-        Returns:
-            List of link dictionaries with 'href', 'original_href', 'text', 'title', and 'type' keys
-
-        Raises:
-            httpx.HTTPError: If request fails
-            Exception: If parsing fails
-        """
-        return self._run_async(
-            self._async_client.extract_page_links(
-                url=url,
-                timeout=timeout,
-                headers=headers,
-                css_selector=css_selector,
-                include_internal=include_internal,
-                include_external=include_external,
-                base_url=base_url,
-            )
-        )
-
-    def close(self):
-        """Close the underlying async client."""
-        pass
-
-    def __enter__(self):
-        """Context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Context manager exit."""
-        self.close()
-
-
-@overload
-def create_search_client(
-    *,
-    timeout: float = 30.0,
-    max_concurrent: int = 5,
-    user_agent: Optional[str] = None,
-    default_headers: Optional[Dict[str, str]] = None,
-    max_retries: int = 3,
-    async_client: Literal[True],
-) -> AsyncSearchClient: ...
-
-
-@overload
-def create_search_client(
-    *,
-    timeout: float = 30.0,
-    max_concurrent: int = 5,
-    user_agent: Optional[str] = None,
-    default_headers: Optional[Dict[str, str]] = None,
-    max_retries: int = 3,
-    async_client: Literal[False] = ...,
-) -> SearchClient: ...
-
-
-def create_search_client(
-    *,
-    timeout: float = 30.0,
-    max_concurrent: int = 5,
-    user_agent: Optional[str] = None,
-    default_headers: Optional[Dict[str, str]] = None,
-    max_retries: int = 3,
-    async_client: bool = False,
-) -> Union[SearchClient, AsyncSearchClient]:
-    """
-    Create a new SearchClient instance.
-
-    Args:
-        timeout: Default timeout for HTTP requests in seconds
-        max_concurrent: Maximum number of concurrent requests for batch operations
-        user_agent: User-Agent header for HTTP requests
-        default_headers: Default headers to include in HTTP requests
-        max_retries: Maximum number of retry attempts for failed requests
-        async_client: Whether to return an async client instance
-
-    Returns:
-        SearchClient or AsyncSearchClient instance based on async_client parameter
-    """
-    params = locals()
-    del params["async_client"]
-
-    if async_client:
-        return AsyncSearchClient(**params)
-    else:
-        return SearchClient(**params)
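
For reference, a minimal usage sketch of the removed hammad.web.search.client module as it existed in 0.0.29, based only on the signatures visible in the diff above. It assumes the optional ddgs, selectolax, and httpx dependencies are installed and that the result models (SearchResults, WebPageResult) expose the fields their constructors receive; none of this applies to the 0.0.31 wheel, which no longer ships the module.

# Hypothetical 0.0.29-era usage of the deleted module; not valid against 0.0.31.
from hammad.web.search.client import create_search_client

# The factory returns the synchronous wrapper by default (async_client=False).
client = create_search_client(timeout=15.0, max_retries=2)

# DuckDuckGo text search through the sync wrapper.
results = client.web_search("python packaging", max_results=5)
for item in results.results:  # assumes SearchResults keeps the `results` list it was built with
    print(item.title, item.href)

# Fetch and parse a page, extracting links as well as text.
page = client.read_web_page("https://example.com", extract_links=True)
print(page.title, len(page.links))

client.close()

# The async variant is selected with async_client=True and exposes the same
# methods as coroutines, e.g. (inside an async function):
#   async_client = create_search_client(async_client=True)
#   results = await async_client.web_search("python packaging")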