crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

This version of crawlee might be problematic.
Files changed (116)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_autoscaling/snapshotter.py +1 -1
  3. crawlee/_browserforge_workaround.py +7 -3
  4. crawlee/_request.py +64 -43
  5. crawlee/_service_locator.py +44 -24
  6. crawlee/_types.py +128 -36
  7. crawlee/_utils/context.py +3 -3
  8. crawlee/_utils/file.py +8 -1
  9. crawlee/_utils/globs.py +4 -4
  10. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  11. crawlee/_utils/recoverable_state.py +32 -8
  12. crawlee/_utils/recurring_task.py +27 -3
  13. crawlee/_utils/requests.py +0 -26
  14. crawlee/_utils/robots.py +17 -5
  15. crawlee/_utils/sitemap.py +16 -7
  16. crawlee/_utils/system.py +30 -14
  17. crawlee/_utils/time.py +120 -0
  18. crawlee/_utils/urls.py +9 -2
  19. crawlee/browsers/_browser_pool.py +5 -2
  20. crawlee/browsers/_playwright_browser.py +2 -1
  21. crawlee/browsers/_playwright_browser_controller.py +21 -15
  22. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  23. crawlee/browsers/_types.py +1 -1
  24. crawlee/configuration.py +2 -0
  25. crawlee/crawlers/__init__.py +5 -1
  26. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  27. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
  28. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  29. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  30. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  31. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
  32. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  33. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  34. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  35. crawlee/crawlers/_basic/_basic_crawler.py +254 -148
  36. crawlee/crawlers/_basic/_context_utils.py +24 -0
  37. crawlee/crawlers/_basic/_logging_utils.py +27 -4
  38. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  39. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  40. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  41. crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
  42. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  43. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  44. crawlee/crawlers/_playwright/_types.py +12 -2
  45. crawlee/errors.py +4 -0
  46. crawlee/events/_event_manager.py +12 -6
  47. crawlee/events/_types.py +6 -6
  48. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  49. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  50. crawlee/fingerprint_suite/_header_generator.py +2 -2
  51. crawlee/fingerprint_suite/_types.py +2 -2
  52. crawlee/http_clients/_base.py +4 -0
  53. crawlee/http_clients/_curl_impersonate.py +68 -14
  54. crawlee/http_clients/_httpx.py +16 -6
  55. crawlee/http_clients/_impit.py +32 -11
  56. crawlee/otel/crawler_instrumentor.py +4 -6
  57. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  58. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  59. crawlee/request_loaders/_request_list.py +3 -3
  60. crawlee/request_loaders/_request_loader.py +5 -1
  61. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  62. crawlee/router.py +13 -3
  63. crawlee/sessions/_cookies.py +13 -8
  64. crawlee/sessions/_models.py +5 -5
  65. crawlee/sessions/_session_pool.py +1 -1
  66. crawlee/statistics/_error_snapshotter.py +1 -1
  67. crawlee/statistics/_models.py +62 -12
  68. crawlee/statistics/_statistics.py +24 -33
  69. crawlee/storage_clients/__init__.py +16 -0
  70. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  71. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  72. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  73. crawlee/storage_clients/_base/_storage_client.py +13 -0
  74. crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
  75. crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
  76. crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
  77. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  78. crawlee/storage_clients/_file_system/_utils.py +0 -0
  79. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  80. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  81. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  82. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  83. crawlee/storage_clients/_redis/__init__.py +6 -0
  84. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  85. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  86. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  87. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  88. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  89. crawlee/storage_clients/_redis/_utils.py +23 -0
  90. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  91. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  92. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  93. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  94. crawlee/storage_clients/_redis/py.typed +0 -0
  95. crawlee/storage_clients/_sql/__init__.py +6 -0
  96. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  97. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  98. crawlee/storage_clients/_sql/_db_models.py +268 -0
  99. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  100. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  101. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  102. crawlee/storage_clients/_sql/py.typed +0 -0
  103. crawlee/storage_clients/models.py +21 -14
  104. crawlee/storages/_base.py +5 -1
  105. crawlee/storages/_dataset.py +12 -2
  106. crawlee/storages/_key_value_store.py +17 -4
  107. crawlee/storages/_request_queue.py +13 -5
  108. crawlee/storages/_storage_instance_manager.py +196 -75
  109. crawlee/storages/_utils.py +11 -0
  110. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
  111. crawlee-1.3.1b3.dist-info/RECORD +207 -0
  112. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  113. crawlee/_utils/measure_time.py +0 -31
  114. crawlee-0.6.13b15.dist-info/RECORD +0 -183
  115. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  116. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
--- a/crawlee/http_clients/_base.py
+++ b/crawlee/http_clients/_base.py
@@ -104,6 +104,7 @@ class HttpClient(ABC):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         """Perform the crawling for a given request.

@@ -114,6 +115,7 @@ class HttpClient(ABC):
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
             statistics: The statistics object to register status codes.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
@@ -132,6 +134,7 @@ class HttpClient(ABC):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         """Send an HTTP request via the client.

@@ -144,6 +147,7 @@ class HttpClient(ABC):
             payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
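
Note on the hunks above: `HttpClient.crawl` and `HttpClient.send_request` gain an optional `timeout` parameter, and each concrete client below maps its library-specific timeout exception to `asyncio.TimeoutError`, so callers see a uniform error regardless of the backend. A minimal usage sketch, assuming the public `send_request` signature shown in this diff (the example URL is illustrative):

import asyncio
from datetime import timedelta

from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    client = ImpitHttpClient()
    try:
        # Bound a single request to two seconds; the built-in clients raise
        # asyncio.TimeoutError when the limit is exceeded.
        response = await client.send_request(
            'https://crawlee.dev',
            method='GET',
            timeout=timedelta(seconds=2),
        )
        print(response.status_code)
    except asyncio.TimeoutError:
        print('Request timed out')


asyncio.run(main())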
--- a/crawlee/http_clients/_curl_impersonate.py
+++ b/crawlee/http_clients/_curl_impersonate.py
@@ -1,7 +1,9 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
-from typing import TYPE_CHECKING, Any
+from http.cookiejar import Cookie
+from typing import TYPE_CHECKING, Any, cast

 from curl_cffi import CurlInfo
 from curl_cffi.const import CurlHttpVersion
@@ -10,10 +12,11 @@ from curl_cffi.requests.cookies import Cookies as CurlCookies
 from curl_cffi.requests.cookies import CurlMorsel
 from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
 from curl_cffi.requests.exceptions import RequestException as CurlRequestError
+from curl_cffi.requests.exceptions import Timeout
 from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
 from typing_extensions import override

-from crawlee._types import HttpHeaders, HttpPayload
+from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
 from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
 from crawlee._utils.docs import docs_group
 from crawlee.errors import ProxyError
@@ -22,11 +25,11 @@ from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
     from datetime import timedelta
-    from http.cookiejar import Cookie

     from curl_cffi import Curl
     from curl_cffi.requests import Request as CurlRequest
     from curl_cffi.requests import Response
+    from curl_cffi.requests.session import HttpMethod as CurlHttpMethod

     from crawlee import Request
     from crawlee._types import HttpMethod
@@ -88,15 +91,17 @@ class _CurlImpersonateResponse:
     async def read(self) -> bytes:
         if self._response.astream_task:
             raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method')
+
         return self._response.content

     async def read_stream(self) -> AsyncGenerator[bytes, None]:
-        if not self._response.astream_task or self._response.astream_task.done():  # type: ignore[attr-defined]
-            raise RuntimeError(
-                'Cannot read stream: either already consumed or Response not obtained from `stream` method'
-            )
+        if not self._response.astream_task:
+            raise RuntimeError('Cannot read stream, Response not obtained from `stream` method.')

-        async for chunk in self._response.aiter_content():  # type: ignore[no-untyped-call]
+        if isinstance(self._response.astream_task, asyncio.Future) and self._response.astream_task.done():
+            raise RuntimeError('Cannot read stream, it was already consumed.')
+
+        async for chunk in self._response.aiter_content():
             yield chunk


@@ -147,17 +152,21 @@ class CurlImpersonateHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)

         try:
             response = await client.request(
                 url=request.url,
-                method=request.method.upper(),  # type: ignore[arg-type] # curl-cffi requires uppercase method
+                method=self._convert_method(request.method),
                 headers=request.headers,
                 data=request.payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -186,6 +195,7 @@ class CurlImpersonateHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -196,11 +206,14 @@ class CurlImpersonateHttpClient(HttpClient):
         try:
             response = await client.request(
                 url=url,
-                method=method.upper(),  # type: ignore[arg-type] # curl-cffi requires uppercase method
+                method=self._convert_method(method),
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -234,13 +247,15 @@ class CurlImpersonateHttpClient(HttpClient):
         try:
             response = await client.request(
                 url=url,
-                method=method.upper(),  # type: ignore[arg-type] # curl-cffi requires uppercase method
+                method=self._convert_method(method),
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
                 stream=True,
                 timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -279,6 +294,40 @@ class CurlImpersonateHttpClient(HttpClient):

         return self._client_by_proxy_url[proxy_url]

+    def _convert_method(self, method: HttpMethod) -> CurlHttpMethod:
+        """Convert from Crawlee HTTP method to curl-cffi HTTP method.
+
+        Args:
+            method: Crawlee HTTP method.
+
+        Returns:
+            Corresponding curl-cffi HTTP method.
+
+        Raises:
+            ValueError: If the provided HTTP method is not supported.
+        """
+        method_upper = method.upper()  # curl-cffi requires uppercase methods
+
+        match method_upper:
+            case 'GET':
+                return 'GET'
+            case 'POST':
+                return 'POST'
+            case 'PUT':
+                return 'PUT'
+            case 'DELETE':
+                return 'DELETE'
+            case 'OPTIONS':
+                return 'OPTIONS'
+            case 'HEAD':
+                return 'HEAD'
+            case 'TRACE':
+                return 'TRACE'
+            case 'PATCH':
+                return 'PATCH'
+            case _:
+                raise ValueError(f'HTTP method {method} is not supported in {self.__class__.__name__}.')
+
     @staticmethod
     def _is_proxy_error(error: CurlRequestError) -> bool:
         """Determine whether the given error is related to a proxy issue.
@@ -296,11 +345,16 @@ class CurlImpersonateHttpClient(HttpClient):

     @staticmethod
     def _get_cookies(curl: Curl) -> list[Cookie]:
-        cookies: list[Cookie] = []
-        for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST):  # type: ignore[union-attr]
-            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)  # type: ignore[arg-type]
+        cookies = list[Cookie]()
+
+        # Implementation of getinfo always returns list[bytes] for CurlInfo.COOKIELIST.
+        cookie_list = cast('list[bytes]', curl.getinfo(CurlInfo.COOKIELIST))
+
+        for curl_cookie in cookie_list:
+            curl_morsel = CurlMorsel.from_curl_format(curl_cookie)
             cookie = curl_morsel.to_cookiejar_cookie()
             cookies.append(cookie)
+
         return cookies

     async def cleanup(self) -> None:
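
The `_convert_method` helper added above exists mostly to satisfy the type checker: `str.upper()` returns a plain `str`, which does not match curl-cffi's literal-typed `HttpMethod`, while returning string literals from a `match` statement does, letting the previous `type: ignore` comments go away. A standalone sketch of the same pattern (the names below are illustrative, not crawlee APIs):

from typing import Literal

UppercaseMethod = Literal['GET', 'POST']


def to_uppercase_method(method: str) -> UppercaseMethod:
    # `method.upper()` is typed as `str`, so returning it directly would fail
    # against the Literal return type; returning literal values narrows it.
    match method.upper():
        case 'GET':
            return 'GET'
        case 'POST':
            return 'POST'
        case _:
            raise ValueError(f'Unsupported HTTP method: {method}')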
--- a/crawlee/http_clients/_httpx.py
+++ b/crawlee/http_clients/_httpx.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, cast
@@ -146,6 +147,7 @@ class HttpxHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)
         headers = self._combine_headers(request.headers)
@@ -157,10 +159,13 @@
             content=request.payload,
             cookies=session.cookies.jar if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
+            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -185,6 +190,7 @@
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         client = self._get_client(proxy_info.url if proxy_info else None)

@@ -195,10 +201,13 @@
             headers=headers,
             payload=payload,
             session=session,
+            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -228,10 +237,13 @@
             headers=headers,
             payload=payload,
             session=session,
-            timeout=timeout,
+            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
         )

-        response = await client.send(http_request, stream=True)
+        try:
+            response = await client.send(http_request, stream=True)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc

         try:
             yield _HttpxResponse(response)
@@ -246,7 +258,7 @@
         headers: HttpHeaders | dict[str, str] | None,
         payload: HttpPayload | None,
         session: Session | None = None,
-        timeout: timedelta | None = None,
+        timeout: httpx.Timeout | None = None,
     ) -> httpx.Request:
         """Build an `httpx.Request` using the provided parameters."""
         if isinstance(headers, dict) or headers is None:
@@ -254,15 +266,13 @@

         headers = self._combine_headers(headers)

-        httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None
-
         return client.build_request(
             url=url,
             method=method,
             headers=dict(headers) if headers else None,
             content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-            timeout=httpx_timeout,
+            timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
         )

     def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
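
The httpx changes use two different `httpx.Timeout` shapes: a total timeout for regular requests and a connect-only timeout for streamed requests, with `httpx.USE_CLIENT_DEFAULT` as the fallback when no timeout is supplied. The distinction (standard httpx semantics, not crawlee code) is roughly:

import httpx

# Caps every phase (connect, read, write, pool) at five seconds.
total_timeout = httpx.Timeout(5.0)

# Caps only the connection phase; reads stay unbounded, which suits streamed
# responses whose bodies may take arbitrarily long to arrive.
connect_only_timeout = httpx.Timeout(None, connect=5.0)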
--- a/crawlee/http_clients/_impit.py
+++ b/crawlee/http_clients/_impit.py
@@ -1,11 +1,12 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, TypedDict

 from cachetools import LRUCache
-from impit import AsyncClient, Browser, HTTPError, Response, TransportError
+from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
 from impit import ProxyError as ImpitProxyError
 from typing_extensions import override

@@ -124,6 +125,7 @@ class ImpitHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

@@ -133,7 +135,10 @@
                 method=request.method,
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -156,6 +161,7 @@
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -164,8 +170,14 @@

         try:
             response = await client.request(
-                method=method, url=url, content=payload, headers=dict(headers) if headers else None
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -188,18 +200,27 @@
     ) -> AsyncGenerator[HttpResponse]:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

-        response = await client.request(
-            method=method,
-            url=url,
-            content=payload,
-            headers=dict(headers) if headers else None,
-            timeout=timeout.total_seconds() if timeout else None,
-            stream=True,
-        )
+        try:
+            response = await client.request(
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
+                stream=True,
+            )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
+
         try:
             yield _ImpitResponse(response)
         finally:
-            await response.aclose()
+            # TODO: https://github.com/apify/impit/issues/242
+            # Quickly closing Response while reading the response body causes an error in the Rust generator in `impit`.
+            # With a short sleep and sync closing, the error does not occur.
+            # Replace with `response.aclose` when this is resolved in impit.
+            await asyncio.sleep(0.01)
+            response.close()

     def _get_client(self, proxy_url: str | None, cookie_jar: CookieJar | None) -> AsyncClient:
         """Retrieve or create an HTTP client for the given proxy URL.
--- a/crawlee/otel/crawler_instrumentor.py
+++ b/crawlee/otel/crawler_instrumentor.py
@@ -3,9 +3,7 @@ from __future__ import annotations
 import inspect
 from typing import TYPE_CHECKING, Any

-from opentelemetry.instrumentation.instrumentor import (  # type:ignore[attr-defined] # Mypy has troubles with OTEL
-    BaseInstrumentor,
-)
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
 from opentelemetry.instrumentation.utils import unwrap
 from opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME
 from opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD
@@ -69,7 +67,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

         if request_handling_instrumentation:

-            async def middlware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined] # valid in our context
                     attributes={
@@ -111,8 +109,8 @@
         # Handpicked interesting methods to instrument
         self._instrumented.extend(
             [
-                (_Middleware, 'action', middlware_wrapper),
-                (_Middleware, 'cleanup', middlware_wrapper),
+                (_Middleware, 'action', middleware_wrapper),
+                (_Middleware, 'cleanup', middleware_wrapper),
                 (ContextPipeline, '__call__', context_pipeline_wrapper),
                 (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                 (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
--- a/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml
+++ b/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml
@@ -5,8 +5,8 @@
 # % endif
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
-# % elif cookiecutter.http_client == 'impit'
-# % do extras.append('impit')
+# % elif cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
 # % endif

 [project]
--- a/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt
+++ b/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt
@@ -10,4 +10,7 @@ apify
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
 # % endif
+# % if cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
+# % endif
 crawlee[{{ extras | join(',') }}]
--- a/crawlee/request_loaders/_request_list.py
+++ b/crawlee/request_loaders/_request_list.py
@@ -17,7 +17,7 @@ logger = getLogger(__name__)


 class RequestListState(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     next_index: Annotated[int, Field(alias='nextIndex')] = 0
     next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None
@@ -166,7 +166,7 @@ class RequestList(RequestLoader):
             return None

         state = await self._get_state()
-        state.in_progress.add(self._next[0].id)
+        state.in_progress.add(self._next[0].unique_key)
         self._assumed_total_count += 1

         next_request = self._next[0]
@@ -183,7 +183,7 @@
     async def mark_request_as_handled(self, request: Request) -> None:
         self._handled_count += 1
         state = await self._get_state()
-        state.in_progress.remove(request.id)
+        state.in_progress.remove(request.unique_key)

     async def _ensure_next_request(self) -> None:
         await self._get_state()
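
Two behavioural notes on this file: the persisted in-progress set is now keyed by `unique_key` instead of the request `id`, and the state model switches from `populate_by_name` to Pydantic's newer `validate_by_name`/`validate_by_alias` pair, so input is accepted under both the snake_case field names and the camelCase aliases. A small standalone sketch of that config (requires Pydantic 2.11+; the model below mirrors `RequestListState` but is illustrative):

from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class State(BaseModel):
    # Accept input by field name and by alias alike.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    next_index: Annotated[int, Field(alias='nextIndex')] = 0


print(State.model_validate({'nextIndex': 3}).next_index)   # 3 (via alias)
print(State.model_validate({'next_index': 5}).next_index)  # 5 (via field name)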
--- a/crawlee/request_loaders/_request_loader.py
+++ b/crawlee/request_loaders/_request_loader.py
@@ -43,7 +43,11 @@ class RequestLoader(ABC):

     @abstractmethod
     async def fetch_next_request(self) -> Request | None:
-        """Return the next request to be processed, or `null` if there are no more pending requests."""
+        """Return the next request to be processed, or `None` if there are no more pending requests.
+
+        The method should return `None` if and only if `is_finished` would return `True`. In other cases, the method
+        should wait until a request appears.
+        """

     @abstractmethod
     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
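
The reworded docstring pins down a contract for custom loaders: `fetch_next_request` may return `None` only once `is_finished` would report `True`; while more requests may still arrive, it should wait rather than return early. A simplified sketch of a loader honouring that contract (not a full `RequestLoader` subclass; the other abstract methods are omitted):

import asyncio
from collections import deque


class InMemoryLoader:
    def __init__(self) -> None:
        self._pending: deque[str] = deque()
        self._closed = False

    async def is_finished(self) -> bool:
        return self._closed and not self._pending

    async def fetch_next_request(self) -> str | None:
        # Return None only when no request can ever appear again; otherwise
        # poll until one shows up, as the docstring above requires.
        while not self._pending:
            if await self.is_finished():
                return None
            await asyncio.sleep(0.1)
        return self._pending.popleft()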