crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
Files changed (82)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_service_locator.py +4 -4
  5. crawlee/_types.py +44 -5
  6. crawlee/_utils/context.py +3 -3
  7. crawlee/_utils/file.py +8 -1
  8. crawlee/_utils/globs.py +4 -4
  9. crawlee/_utils/recoverable_state.py +32 -8
  10. crawlee/_utils/recurring_task.py +27 -3
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +13 -6
  13. crawlee/_utils/system.py +27 -11
  14. crawlee/_utils/time.py +41 -1
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +5 -2
  17. crawlee/browsers/_playwright_browser.py +2 -1
  18. crawlee/browsers/_playwright_browser_controller.py +1 -1
  19. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  20. crawlee/browsers/_types.py +1 -1
  21. crawlee/configuration.py +3 -1
  22. crawlee/crawlers/__init__.py +5 -1
  23. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  24. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
  25. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  28. crawlee/crawlers/_basic/_basic_crawler.py +156 -131
  29. crawlee/crawlers/_basic/_context_utils.py +24 -0
  30. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  31. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  32. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/errors.py +4 -0
  39. crawlee/events/_event_manager.py +12 -6
  40. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/http_clients/_base.py +4 -0
  43. crawlee/http_clients/_curl_impersonate.py +68 -14
  44. crawlee/http_clients/_httpx.py +16 -6
  45. crawlee/http_clients/_impit.py +25 -10
  46. crawlee/otel/crawler_instrumentor.py +4 -6
  47. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  48. crawlee/router.py +13 -3
  49. crawlee/sessions/_cookies.py +13 -8
  50. crawlee/sessions/_models.py +3 -3
  51. crawlee/sessions/_session_pool.py +1 -1
  52. crawlee/statistics/_error_snapshotter.py +1 -1
  53. crawlee/statistics/_models.py +51 -9
  54. crawlee/statistics/_statistics.py +24 -33
  55. crawlee/storage_clients/__init__.py +4 -0
  56. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  57. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  58. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  59. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  60. crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
  61. crawlee/storage_clients/_redis/__init__.py +6 -0
  62. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  63. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  64. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  65. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  66. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  67. crawlee/storage_clients/_redis/_utils.py +23 -0
  68. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  69. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  70. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  71. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  72. crawlee/storage_clients/_redis/py.typed +0 -0
  73. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  74. crawlee/storage_clients/_sql/_db_models.py +1 -2
  75. crawlee/storage_clients/models.py +8 -3
  76. crawlee/storages/_key_value_store.py +5 -2
  77. crawlee/storages/_storage_instance_manager.py +103 -44
  78. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
  79. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
  80. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  81. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  82. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/http_clients/_httpx.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, cast
@@ -146,6 +147,7 @@ class HttpxHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)
         headers = self._combine_headers(request.headers)
@@ -157,10 +159,13 @@ class HttpxHttpClient(HttpClient):
             content=request.payload,
             cookies=session.cookies.jar if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
+            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
         )
 
         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -185,6 +190,7 @@ class HttpxHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         client = self._get_client(proxy_info.url if proxy_info else None)
 
@@ -195,10 +201,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
+            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
         )
 
         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -228,10 +237,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
-            timeout=timeout,
+            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
         )
 
-        response = await client.send(http_request, stream=True)
+        try:
+            response = await client.send(http_request, stream=True)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
 
         try:
             yield _HttpxResponse(response)
@@ -246,7 +258,7 @@ class HttpxHttpClient(HttpClient):
         headers: HttpHeaders | dict[str, str] | None,
         payload: HttpPayload | None,
         session: Session | None = None,
-        timeout: timedelta | None = None,
+        timeout: httpx.Timeout | None = None,
     ) -> httpx.Request:
         """Build an `httpx.Request` using the provided parameters."""
         if isinstance(headers, dict) or headers is None:
@@ -254,15 +266,13 @@ class HttpxHttpClient(HttpClient):
 
         headers = self._combine_headers(headers)
 
-        httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None
-
         return client.build_request(
             url=url,
             method=method,
             headers=dict(headers) if headers else None,
             content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-            timeout=httpx_timeout,
+            timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
         )
 
     def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
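
The practical effect of these hunks is that `HttpxHttpClient` now accepts a per-call `timeout` and surfaces httpx timeouts as the standard `asyncio.TimeoutError`. A minimal sketch of how calling code could rely on that; the URL, timeout value, and use of `response.status_code` are illustrative assumptions, not taken from the diff:

```python
# Sketch only: exercises the new `timeout` parameter and the asyncio.TimeoutError
# mapping shown in the hunks above.
import asyncio
from datetime import timedelta

from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    client = HttpxHttpClient()
    try:
        response = await client.send_request(
            'https://crawlee.dev',
            timeout=timedelta(seconds=5),  # forwarded to httpx as httpx.Timeout(5.0)
        )
        print(response.status_code)
    except asyncio.TimeoutError:
        # httpx.TimeoutException is re-raised as asyncio.TimeoutError by the client.
        print('request timed out')


if __name__ == '__main__':
    asyncio.run(main())
```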
crawlee/http_clients/_impit.py CHANGED
@@ -6,7 +6,7 @@ from logging import getLogger
 from typing import TYPE_CHECKING, Any, TypedDict
 
 from cachetools import LRUCache
-from impit import AsyncClient, Browser, HTTPError, Response, TransportError
+from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
 from impit import ProxyError as ImpitProxyError
 from typing_extensions import override
 
@@ -125,6 +125,7 @@ class ImpitHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)
 
@@ -134,7 +135,10 @@
                 method=request.method,
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -157,6 +161,7 @@
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -165,8 +170,14 @@
 
         try:
             response = await client.request(
-                method=method, url=url, content=payload, headers=dict(headers) if headers else None
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -189,14 +200,18 @@
     ) -> AsyncGenerator[HttpResponse]:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)
 
-        response = await client.request(
-            method=method,
-            url=url,
-            content=payload,
-            headers=dict(headers) if headers else None,
-            timeout=timeout.total_seconds() if timeout else None,
-            stream=True,
-        )
+        try:
+            response = await client.request(
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
+                stream=True,
+            )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
+
         try:
             yield _ImpitResponse(response)
         finally:
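
The same mapping is applied for impit, so code written against the abstract `HttpClient` can handle timeouts uniformly regardless of which backend is plugged in. A short client-agnostic sketch; the helper name and URL are illustrative assumptions:

```python
# Sketch: timeout handling does not depend on the concrete client, since both
# httpx.TimeoutException and impit.TimeoutException are translated to asyncio.TimeoutError.
import asyncio
from datetime import timedelta

from crawlee.http_clients import HttpClient, HttpxHttpClient, ImpitHttpClient


async def status_or_none(client: HttpClient, url: str) -> int | None:
    try:
        response = await client.send_request(url, timeout=timedelta(seconds=5))
    except asyncio.TimeoutError:
        return None
    return response.status_code


async def main() -> None:
    for client in (HttpxHttpClient(), ImpitHttpClient()):
        print(await status_or_none(client, 'https://crawlee.dev'))


if __name__ == '__main__':
    asyncio.run(main())
```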
crawlee/otel/crawler_instrumentor.py CHANGED
@@ -3,9 +3,7 @@ from __future__ import annotations
 import inspect
 from typing import TYPE_CHECKING, Any
 
-from opentelemetry.instrumentation.instrumentor import (  # type:ignore[attr-defined] # Mypy has troubles with OTEL
-    BaseInstrumentor,
-)
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
 from opentelemetry.instrumentation.utils import unwrap
 from opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME
 from opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD
@@ -69,7 +67,7 @@ class CrawlerInstrumentor(BaseInstrumentor):
 
         if request_handling_instrumentation:
 
-            async def middlware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined] # valid in our context
                     attributes={
@@ -111,8 +109,8 @@
             # Handpicked interesting methods to instrument
             self._instrumented.extend(
                 [
-                    (_Middleware, 'action', middlware_wrapper),
-                    (_Middleware, 'cleanup', middlware_wrapper),
+                    (_Middleware, 'action', middleware_wrapper),
+                    (_Middleware, 'cleanup', middleware_wrapper),
                     (ContextPipeline, '__call__', context_pipeline_wrapper),
                     (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                     (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
crawlee/request_loaders/_sitemap_request_loader.py CHANGED
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override
 
-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader
 
 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType
 
+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -90,6 +91,11 @@ class SitemapRequestLoaderState(BaseModel):
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).
 
+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
 
@@ -107,6 +113,7 @@
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.
 
@@ -120,6 +127,9 @@
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
        """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
@@ -127,6 +137,7 @@
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function
 
         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -219,7 +230,7 @@
                     continue
                 state.in_progress_sitemap_url = sitemap_url
 
-            parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True)
+            parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)
 
             async for item in parse_sitemap(
                 [SitemapSource(type='url', url=sitemap_url)],
@@ -308,8 +319,15 @@
 
             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-                request = Request.from_url(url)
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
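
The new `transform_request_function` hook lets callers adjust or drop URLs coming out of the sitemap before they become `Request` objects, mirroring the hook available in `enqueue_links`. A hedged usage sketch; the sitemap URL, the label, and the `async with` usage are assumptions built around the names visible in this diff:

```python
# Sketch: filter and label sitemap URLs via transform_request_function.
import asyncio

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Returning 'skip' drops the URL (and decrements the loader's total count, per the hunk above);
    # returning modified options changes the request built via Request.from_url(**options).
    if '/blog/' not in options['url']:
        return 'skip'
    options['label'] = 'BLOG'
    return options


async def main() -> None:
    async with SitemapRequestLoader(
        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
        http_client=ImpitHttpClient(),
        transform_request_function=transform,
    ) as loader:
        while request := await loader.fetch_next_request():
            print(request.url, request.label)
            await loader.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())
```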
crawlee/router.py CHANGED
@@ -1,13 +1,17 @@
 from __future__ import annotations
 
+import asyncio
 from collections.abc import Awaitable, Callable
 from typing import Generic, TypeVar
 
+from crawlee._request import RequestState
 from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group
 
 __all__ = ['Router']
 
+from crawlee.errors import UserHandlerTimeoutError
+
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
 RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]
 
@@ -89,13 +93,19 @@
 
     async def __call__(self, context: TCrawlingContext) -> None:
         """Invoke a request handler that matches the request label (or the default)."""
+        context.request.state = RequestState.REQUEST_HANDLER
         if context.request.label is None or context.request.label not in self._handlers_by_label:
             if self._default_handler is None:
                 raise RuntimeError(
                     f'No handler matches label `{context.request.label}` and no default handler is configured'
                 )
 
-            return await self._default_handler(context)
+            user_defined_handler = self._default_handler
+        else:
+            user_defined_handler = self._handlers_by_label[context.request.label]
 
-        handler = self._handlers_by_label[context.request.label]
-        return await handler(context)
+        try:
+            return await user_defined_handler(context)
+        except asyncio.TimeoutError as e:
+            # Timeout in handler, but not timeout of handler.
+            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
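
For context, the dispatch being reworked above is the label-based routing used as a crawler's request handler. A minimal sketch follows (crawler type, label, and URL are illustrative); per this change, an `asyncio.TimeoutError` raised inside a handler is re-raised as `UserHandlerTimeoutError`, so it is not mistaken for the handler itself exceeding its time budget:

```python
# Sketch: label-based routing dispatched through Router.__call__.
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.router import Router

router = Router[HttpCrawlingContext]()


@router.default_handler
async def default_handler(context: HttpCrawlingContext) -> None:
    # Requests without a matching label end up here.
    await context.enqueue_links(label='DETAIL')


@router.handler('DETAIL')
async def detail_handler(context: HttpCrawlingContext) -> None:
    # Requests labelled 'DETAIL' are dispatched to this handler.
    await context.push_data({'url': context.request.url})


async def main() -> None:
    crawler = HttpCrawler(request_handler=router)
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```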
crawlee/sessions/_cookies.py CHANGED
@@ -10,6 +10,7 @@ from crawlee._utils.docs import docs_group
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from typing import TypeGuard
 
 
 @docs_group('Session management')
@@ -66,17 +67,18 @@ class SessionCookies:
 
         self._jar = CookieJar()
 
-        if isinstance(cookies, dict):
-            for key, value in cookies.items():
-                self.set(key, value)
-
-        elif isinstance(cookies, list):
+        if isinstance(cookies, list):
             for item in cookies:
                 self.set(**item)
 
         elif isinstance(cookies, SessionCookies):
             for cookie in cookies.jar:
-                self.jar.set_cookie(cookie)
+                self._jar.set_cookie(cookie)
+
+        elif isinstance(cookies, dict):
+            cookies_dict: dict[str, str] = cookies
+            for key, value in cookies_dict.items():
+                self.set(key, value)
 
     @property
     def jar(self) -> CookieJar:
@@ -151,8 +153,8 @@
         if cookie.expires:
             cookie_dict['expires'] = cookie.expires
 
-        if (same_site := cookie.get_nonstandard_attr('SameSite')) and same_site in {'Lax', 'None', 'Strict'}:
-            cookie_dict['same_site'] = same_site  # type: ignore[typeddict-item]
+        if (same_site := cookie.get_nonstandard_attr('SameSite')) and self._is_valid_same_site(same_site):
+            cookie_dict['same_site'] = same_site
 
         return cookie_dict
 
@@ -273,3 +275,6 @@
         """Return hash based on the cookies key attributes."""
         cookie_tuples = frozenset((cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in self._jar)
         return hash(cookie_tuples)
+
+    def _is_valid_same_site(self, value: str | None) -> TypeGuard[Literal['Lax', 'None', 'Strict']]:
+        return value in {'Lax', 'None', 'Strict'}
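
The `_is_valid_same_site` helper replaces the previous inline membership test plus `type: ignore`: a `TypeGuard` performs the same runtime check while also narrowing the type for the checker. A standalone illustration of the pattern (not crawlee code):

```python
# Illustration of the TypeGuard pattern introduced above.
from typing import Literal, TypeGuard

SameSite = Literal['Lax', 'None', 'Strict']


def is_valid_same_site(value: str | None) -> TypeGuard[SameSite]:
    # Returning True narrows `value` to SameSite for the type checker.
    return value in {'Lax', 'None', 'Strict'}


raw: str | None = 'Lax'
if is_valid_same_site(raw):
    same_site: SameSite = raw  # no cast or type: ignore needed
```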
crawlee/sessions/_models.py CHANGED
@@ -63,19 +63,19 @@
         ),
     ]
 
-    @computed_field(alias='sessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='sessionCount')
     @property
     def session_count(self) -> int:
         """Get the total number of sessions currently maintained in the pool."""
         return len(self.sessions)
 
-    @computed_field(alias='usableSessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='usableSessionCount')
     @property
     def usable_session_count(self) -> int:
         """Get the number of sessions that are currently usable."""
         return len([session for _, session in self.sessions.items() if session.is_usable])
 
-    @computed_field(alias='retiredSessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='retiredSessionCount')
     @property
     def retired_session_count(self) -> int:
         """Get the number of sessions that are no longer usable."""
crawlee/sessions/_session_pool.py CHANGED
@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.
 
-        This is intened only for the cases when you want to add a session that was created outside of the pool.
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.
 
         Args:
crawlee/statistics/_error_snapshotter.py CHANGED
@@ -32,7 +32,7 @@ class ErrorSnapshotter:
         """Capture error snapshot and save it to key value store.
 
         It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler`
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
         returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
         an exception.
 
crawlee/statistics/_models.py CHANGED
@@ -1,9 +1,10 @@
 from __future__ import annotations
 
 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
-from typing import Annotated, Any
+from typing import TYPE_CHECKING, Annotated, Any
 
 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field
 from typing_extensions import override
@@ -76,10 +77,20 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
-    errors: dict[str, Any] = Field(default_factory=dict)
-    retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
-    requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
+
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        errors: dict[str, Any] = {}
+        retry_errors: dict[str, Any] = {}
+        requests_with_status_code: dict[str, int] = {}
+    else:
+        errors: Annotated[dict[str, Any], Field(default_factory=dict)]
+        retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)]
+        requests_with_status_code: Annotated[
+            dict[str, int],
+            Field(alias='requestsWithStatusCode', default_factory=dict),
+        ]
+
     stats_persisted_at: Annotated[
         datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc))
     ] = None
@@ -93,22 +104,53 @@
         ),
     ] = {}
 
-    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)
     @property
     def request_total_duration(self) -> timedelta:
         return self.request_total_finished_duration + self.request_total_failed_duration
 
-    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)  # type: ignore[prop-decorator]
+    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_failed_duration(self) -> timedelta | None:
         return (self.request_total_failed_duration / self.requests_failed) if self.requests_failed else None
 
-    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)  # type: ignore[prop-decorator]
+    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
    @property
    def request_avg_finished_duration(self) -> timedelta | None:
        return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None
 
-    @computed_field(alias='requestsTotal')  # type: ignore[prop-decorator]
+    @computed_field(alias='requestsTotal')
     @property
     def requests_total(self) -> int:
         return self.requests_failed + self.requests_finished
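
The net effect of this change is that `crawler_runtime` is no longer a persisted field that must be kept up to date; it is derived from a persisted offset (runtime accumulated in previous runs) plus the elapsed time of the current run. A small worked example with assumed timestamps:

```python
# Worked example of the runtime computation above (values are made up):
# runtime = offset persisted from earlier runs + (finished_at or now) - last start time.
from datetime import datetime, timedelta, timezone

runtime_offset = timedelta(minutes=10)  # accumulated in previous runs
crawler_last_started_at = datetime(2024, 1, 1, 12, 0, tzinfo=timezone.utc)
crawler_finished_at = datetime(2024, 1, 1, 12, 5, tzinfo=timezone.utc)  # None while still running

crawler_runtime = runtime_offset + (crawler_finished_at - crawler_last_started_at)
assert crawler_runtime == timedelta(minutes=15)
```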