crawlee 1.0.5b18-py3-none-any.whl → 1.2.2b24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_types.py +44 -5
  5. crawlee/_utils/context.py +3 -3
  6. crawlee/_utils/file.py +8 -1
  7. crawlee/_utils/globs.py +4 -4
  8. crawlee/_utils/recurring_task.py +12 -3
  9. crawlee/_utils/sitemap.py +12 -5
  10. crawlee/_utils/system.py +27 -11
  11. crawlee/_utils/time.py +41 -1
  12. crawlee/browsers/_browser_pool.py +1 -1
  13. crawlee/browsers/_playwright_browser.py +2 -1
  14. crawlee/crawlers/__init__.py +5 -1
  15. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  16. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +53 -17
  17. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  18. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +20 -49
  19. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  20. crawlee/crawlers/_basic/_basic_crawler.py +138 -124
  21. crawlee/crawlers/_basic/_context_utils.py +24 -0
  22. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  23. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  24. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  25. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  26. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -22
  27. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  28. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  29. crawlee/crawlers/_playwright/_types.py +12 -2
  30. crawlee/errors.py +4 -0
  31. crawlee/events/_event_manager.py +12 -6
  32. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  33. crawlee/http_clients/_base.py +4 -0
  34. crawlee/http_clients/_curl_impersonate.py +68 -14
  35. crawlee/http_clients/_httpx.py +16 -6
  36. crawlee/http_clients/_impit.py +25 -10
  37. crawlee/otel/crawler_instrumentor.py +1 -3
  38. crawlee/request_loaders/_sitemap_request_loader.py +18 -5
  39. crawlee/router.py +13 -3
  40. crawlee/sessions/_cookies.py +13 -8
  41. crawlee/sessions/_models.py +3 -3
  42. crawlee/statistics/_models.py +51 -9
  43. crawlee/statistics/_statistics.py +2 -21
  44. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  45. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  46. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  47. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  48. crawlee/storage_clients/_file_system/_request_queue_client.py +5 -4
  49. crawlee/storage_clients/_redis/_client_mixin.py +1 -4
  50. crawlee/storage_clients/_redis/_dataset_client.py +6 -2
  51. crawlee/storage_clients/_redis/_key_value_store_client.py +3 -5
  52. crawlee/storage_clients/_redis/_request_queue_client.py +5 -8
  53. crawlee/storage_clients/_redis/_storage_client.py +12 -9
  54. crawlee/storage_clients/_redis/_utils.py +1 -1
  55. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  56. crawlee/storage_clients/_sql/_storage_client.py +0 -9
  57. crawlee/storage_clients/models.py +8 -3
  58. crawlee/storages/_storage_instance_manager.py +103 -44
  59. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +10 -16
  60. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +63 -62
  61. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  62. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  63. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/http_clients/_impit.py CHANGED
@@ -6,7 +6,7 @@ from logging import getLogger
 from typing import TYPE_CHECKING, Any, TypedDict
 
 from cachetools import LRUCache
-from impit import AsyncClient, Browser, HTTPError, Response, TransportError
+from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
 from impit import ProxyError as ImpitProxyError
 from typing_extensions import override
 
@@ -125,6 +125,7 @@ class ImpitHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)
 
@@ -134,7 +135,10 @@ class ImpitHttpClient(HttpClient):
                 method=request.method,
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -157,6 +161,7 @@ class ImpitHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -165,8 +170,14 @@ class ImpitHttpClient(HttpClient):
 
         try:
             response = await client.request(
-                method=method, url=url, content=payload, headers=dict(headers) if headers else None
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -189,14 +200,18 @@ class ImpitHttpClient(HttpClient):
     ) -> AsyncGenerator[HttpResponse]:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)
 
-        response = await client.request(
-            method=method,
-            url=url,
-            content=payload,
-            headers=dict(headers) if headers else None,
-            timeout=timeout.total_seconds() if timeout else None,
-            stream=True,
-        )
+        try:
+            response = await client.request(
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
+                stream=True,
+            )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
+
         try:
            yield _ImpitResponse(response)
        finally:
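
The `timeout` parameter added above is threaded through `crawl`, `send_request`, and the streaming path, and impit's `TimeoutException` is re-raised as `asyncio.TimeoutError`. A minimal usage sketch, assuming a direct call to the client; the URL and the five-second limit are illustrative only:

import asyncio
from datetime import timedelta

from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    client = ImpitHttpClient()
    try:
        # Per-call timeout introduced in this release; impit's TimeoutException
        # surfaces as asyncio.TimeoutError, so one except clause covers both.
        response = await client.send_request('https://example.com', timeout=timedelta(seconds=5))
        print(response.status_code)
    except asyncio.TimeoutError:
        print('Request timed out')


asyncio.run(main())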
crawlee/otel/crawler_instrumentor.py CHANGED
@@ -3,9 +3,7 @@ from __future__ import annotations
 import inspect
 from typing import TYPE_CHECKING, Any
 
-from opentelemetry.instrumentation.instrumentor import (  # type:ignore[attr-defined] # Mypy has troubles with OTEL
-    BaseInstrumentor,
-)
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
 from opentelemetry.instrumentation.utils import unwrap
 from opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME
 from opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD
crawlee/request_loaders/_sitemap_request_loader.py CHANGED
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override
 
-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader
 
 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType
 
+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -112,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.
 
@@ -125,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
@@ -132,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function
 
         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -224,7 +230,7 @@ class SitemapRequestLoader(RequestLoader):
                 continue
             state.in_progress_sitemap_url = sitemap_url
 
-            parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True)
+            parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)
 
             async for item in parse_sitemap(
                 [SitemapSource(type='url', url=sitemap_url)],
@@ -313,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):
 
             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-                request = Request.from_url(url)
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
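
The new `transform_request_function` follows the same `RequestOptions` / `RequestTransformAction` contract used by `enqueue_links`: return modified options, `'skip'` to drop the URL, or `'unchanged'` to keep it as is. A rough sketch of wiring it up; the sitemap URL and the labelling rule are invented for illustration:

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def label_requests(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Hypothetical rule: drop category pages, label everything else for the router.
    if '/category/' in options['url']:
        return 'skip'
    options['label'] = 'DETAIL'
    return options


loader = SitemapRequestLoader(
    sitemap_urls=['https://example.com/sitemap.xml'],
    http_client=ImpitHttpClient(),
    transform_request_function=label_requests,
)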
crawlee/router.py CHANGED
@@ -1,13 +1,17 @@
 from __future__ import annotations
 
+import asyncio
 from collections.abc import Awaitable, Callable
 from typing import Generic, TypeVar
 
+from crawlee._request import RequestState
 from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group
 
 __all__ = ['Router']
 
+from crawlee.errors import UserHandlerTimeoutError
+
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
 RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]
 
@@ -89,13 +93,19 @@ class Router(Generic[TCrawlingContext]):
 
     async def __call__(self, context: TCrawlingContext) -> None:
         """Invoke a request handler that matches the request label (or the default)."""
+        context.request.state = RequestState.REQUEST_HANDLER
         if context.request.label is None or context.request.label not in self._handlers_by_label:
             if self._default_handler is None:
                 raise RuntimeError(
                     f'No handler matches label `{context.request.label}` and no default handler is configured'
                 )
 
-            return await self._default_handler(context)
+            user_defined_handler = self._default_handler
+        else:
+            user_defined_handler = self._handlers_by_label[context.request.label]
 
-        handler = self._handlers_by_label[context.request.label]
-        return await handler(context)
+        try:
+            return await user_defined_handler(context)
+        except asyncio.TimeoutError as e:
+            # Timeout in handler, but not timeout of handler.
+            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
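
The router now stamps the request with `RequestState.REQUEST_HANDLER` before dispatching and converts an `asyncio.TimeoutError` escaping user code into `UserHandlerTimeoutError` (added to `crawlee/errors.py` in this release), so a timeout raised inside a handler is distinguishable from the handler itself being timed out. A hedged sketch of what that means for handler authors; the handler body is invented and the context type is imported from the same module the router itself uses:

import asyncio

from crawlee._types import BasicCrawlingContext
from crawlee.router import Router

router = Router[BasicCrawlingContext]()


@router.default_handler
async def default_handler(context: BasicCrawlingContext) -> None:
    # If this inner wait times out, the error no longer propagates as a plain
    # asyncio.TimeoutError: Router.__call__ re-raises it as UserHandlerTimeoutError.
    await asyncio.wait_for(asyncio.sleep(10), timeout=0.1)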
crawlee/sessions/_cookies.py CHANGED
@@ -10,6 +10,7 @@ from crawlee._utils.docs import docs_group
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from typing import TypeGuard
 
 
 @docs_group('Session management')
@@ -66,17 +67,18 @@ class SessionCookies:
 
         self._jar = CookieJar()
 
-        if isinstance(cookies, dict):
-            for key, value in cookies.items():
-                self.set(key, value)
-
-        elif isinstance(cookies, list):
+        if isinstance(cookies, list):
             for item in cookies:
                 self.set(**item)
 
         elif isinstance(cookies, SessionCookies):
             for cookie in cookies.jar:
-                self.jar.set_cookie(cookie)
+                self._jar.set_cookie(cookie)
+
+        elif isinstance(cookies, dict):
+            cookies_dict: dict[str, str] = cookies
+            for key, value in cookies_dict.items():
+                self.set(key, value)
 
     @property
     def jar(self) -> CookieJar:
@@ -151,8 +153,8 @@ class SessionCookies:
         if cookie.expires:
             cookie_dict['expires'] = cookie.expires
 
-        if (same_site := cookie.get_nonstandard_attr('SameSite')) and same_site in {'Lax', 'None', 'Strict'}:
-            cookie_dict['same_site'] = same_site  # type: ignore[typeddict-item]
+        if (same_site := cookie.get_nonstandard_attr('SameSite')) and self._is_valid_same_site(same_site):
+            cookie_dict['same_site'] = same_site
 
         return cookie_dict
 
@@ -273,3 +275,6 @@ class SessionCookies:
         """Return hash based on the cookies key attributes."""
         cookie_tuples = frozenset((cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in self._jar)
         return hash(cookie_tuples)
+
+    def _is_valid_same_site(self, value: str | None) -> TypeGuard[Literal['Lax', 'None', 'Strict']]:
+        return value in {'Lax', 'None', 'Strict'}
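
The inline membership check is replaced by a `TypeGuard` helper, which lets the type checker narrow `same_site` to `Literal['Lax', 'None', 'Strict']` and removes the previous `typeddict-item` ignore. The general pattern, reduced to a self-contained sketch outside the crawlee code base:

from typing import Literal, TypeGuard

SameSite = Literal['Lax', 'None', 'Strict']


def is_valid_same_site(value: str | None) -> TypeGuard[SameSite]:
    # Returning True tells the type checker that `value` is one of the three literals.
    return value in {'Lax', 'None', 'Strict'}


def normalize(value: str | None) -> SameSite:
    if is_valid_same_site(value):
        return value  # narrowed to SameSite here, no ignore needed
    return 'Lax'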
crawlee/sessions/_models.py CHANGED
@@ -63,19 +63,19 @@ class SessionPoolModel(BaseModel):
         ),
     ]
 
-    @computed_field(alias='sessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='sessionCount')
     @property
     def session_count(self) -> int:
         """Get the total number of sessions currently maintained in the pool."""
         return len(self.sessions)
 
-    @computed_field(alias='usableSessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='usableSessionCount')
    @property
    def usable_session_count(self) -> int:
        """Get the number of sessions that are currently usable."""
        return len([session for _, session in self.sessions.items() if session.is_usable])
 
-    @computed_field(alias='retiredSessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='retiredSessionCount')
     @property
     def retired_session_count(self) -> int:
         """Get the number of sessions that are no longer usable."""
crawlee/statistics/_models.py CHANGED
@@ -1,9 +1,10 @@
 from __future__ import annotations
 
 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
-from typing import Annotated, Any
+from typing import TYPE_CHECKING, Annotated, Any
 
 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field
 from typing_extensions import override
@@ -76,10 +77,20 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
-    errors: dict[str, Any] = Field(default_factory=dict)
-    retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
-    requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
+
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        errors: dict[str, Any] = {}
+        retry_errors: dict[str, Any] = {}
+        requests_with_status_code: dict[str, int] = {}
+    else:
+        errors: Annotated[dict[str, Any], Field(default_factory=dict)]
+        retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)]
+        requests_with_status_code: Annotated[
+            dict[str, int],
+            Field(alias='requestsWithStatusCode', default_factory=dict),
+        ]
+
     stats_persisted_at: Annotated[
         datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc))
     ] = None
@@ -93,22 +104,53 @@ class StatisticsState(BaseModel):
         ),
     ] = {}
 
-    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)
     @property
     def request_total_duration(self) -> timedelta:
         return self.request_total_finished_duration + self.request_total_failed_duration
 
-    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)  # type: ignore[prop-decorator]
+    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_failed_duration(self) -> timedelta | None:
         return (self.request_total_failed_duration / self.requests_failed) if self.requests_failed else None
 
-    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)  # type: ignore[prop-decorator]
+    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_finished_duration(self) -> timedelta | None:
         return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None
 
-    @computed_field(alias='requestsTotal')  # type: ignore[prop-decorator]
+    @computed_field(alias='requestsTotal')
     @property
     def requests_total(self) -> int:
         return self.requests_failed + self.requests_finished
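
`crawler_runtime` is no longer a stored field: it is derived on demand as the persisted offset plus the time since `crawler_last_started_at`, capped at `crawler_finished_at` once the run ends, and assigning to it only emits a `DeprecationWarning`. A small worked example of the arithmetic with invented timestamps:

from datetime import datetime, timedelta, timezone

# Hypothetical values: 90 s of runtime persisted from earlier runs,
# current run started 30 s ago and has not finished yet.
runtime_offset = timedelta(seconds=90)
crawler_last_started_at = datetime.now(timezone.utc) - timedelta(seconds=30)
crawler_finished_at = None  # still running

finished_at = crawler_finished_at or datetime.now(timezone.utc)
crawler_runtime = runtime_offset + finished_at - crawler_last_started_at
print(crawler_runtime)  # roughly 0:02:00 (previous runs plus the live run)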
crawlee/statistics/_statistics.py CHANGED
@@ -110,9 +110,6 @@ class Statistics(Generic[TStatisticsState]):
         # Flag to indicate the context state.
         self._active = False
 
-        # Pre-existing runtime offset, that can be non-zero when restoring serialized state from KVS.
-        self._runtime_offset = timedelta(seconds=0)
-
     def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
@@ -168,8 +165,8 @@ class Statistics(Generic[TStatisticsState]):
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')
 
         await self._state.initialize()
-
-        self._runtime_offset = self.state.crawler_runtime
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None
 
         # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
@@ -200,10 +197,6 @@ class Statistics(Generic[TStatisticsState]):
         # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
         self.state.crawler_finished_at = datetime.now(timezone.utc)
-        self.state.crawler_runtime = (
-            self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
-        )
-
         self._active = False
         await self._state.teardown()
 
@@ -262,20 +255,8 @@ class Statistics(Generic[TStatisticsState]):
 
         del self._requests_in_progress[request_id_or_key]
 
-    def _update_crawler_runtime(self) -> None:
-        current_run_duration = (
-            (datetime.now(timezone.utc) - self.state.crawler_last_started_at)
-            if self.state.crawler_last_started_at
-            else timedelta()
-        )
-        self.state.crawler_runtime = current_run_duration + self._runtime_offset
-
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self._active:
-            # Only update state when active. If not, just report the last known runtime.
-            self._update_crawler_runtime()
-
         total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)
crawlee/storage_clients/_base/_dataset_client.py CHANGED
@@ -87,8 +87,8 @@ class DatasetClient(ABC):
 
         The backend method for the `Dataset.iterate_items` call.
         """
-        # This syntax is to make mypy properly work with abstract AsyncIterator.
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:  # type: ignore[unreachable]
+        if False:
             yield 0
crawlee/storage_clients/_base/_key_value_store_client.py CHANGED
@@ -72,10 +72,10 @@ class KeyValueStoreClient(ABC):
 
         The backend method for the `KeyValueStore.iterate_keys` call.
         """
-        # This syntax is to make mypy properly work with abstract AsyncIterator.
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:  # type: ignore[unreachable]
+        if False:
             yield 0
 
     @abstractmethod
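
Both abstract clients keep the `raise NotImplementedError` followed by an unreachable `yield` so the abstract method is still typed as an async generator; only the comment wording changes and the `unreachable` ignore is dropped. The idiom in isolation, as a sketch with a hypothetical base class:

from abc import ABC, abstractmethod
from collections.abc import AsyncIterator


class ItemsClient(ABC):
    @abstractmethod
    async def iterate_items(self) -> AsyncIterator[int]:
        """Iterate over items (hypothetical minimal example)."""
        # The unreachable `yield` turns this into an async generator function,
        # so concrete overrides that use `yield` match the declared return type.
        raise NotImplementedError
        if False:
            yield 0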
crawlee/storage_clients/_file_system/_dataset_client.py CHANGED
@@ -134,7 +134,7 @@ class FileSystemDatasetClient(DatasetClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = DatasetMetadata(**file_content)
@@ -163,7 +163,7 @@ class FileSystemDatasetClient(DatasetClient):
 
         # If the dataset directory exists, reconstruct the client from the metadata file.
         if path_to_dataset.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -473,9 +473,10 @@ class FileSystemDatasetClient(DatasetClient):
         """
         # Retrieve and sort all JSON files in the dataset directory numerically.
         files = await asyncio.to_thread(
-            sorted,
-            self.path_to_dataset.glob('*.json'),
-            key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
+            lambda: sorted(
+                self.path_to_dataset.glob('*.json'),
+                key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
+            )
         )
 
         # Remove the metadata file from the list if present.
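
Wrapping the whole `glob` plus `sorted` pipeline in a zero-argument lambda means both the directory scan and the numeric sort run inside the worker thread, rather than passing the generator and `key=` through `asyncio.to_thread` itself. The same pattern as a standalone sketch; the directory path is illustrative:

import asyncio
from pathlib import Path


async def list_dataset_files(dataset_dir: Path) -> list[Path]:
    # Everything inside the lambda (globbing and sorting) executes off the event loop.
    return await asyncio.to_thread(
        lambda: sorted(
            dataset_dir.glob('*.json'),
            key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
        )
    )

# e.g. asyncio.run(list_dataset_files(Path('./storage/datasets/default')))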
crawlee/storage_clients/_file_system/_key_value_store_client.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import json
 import shutil
 import urllib.parse
@@ -133,7 +134,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = KeyValueStoreMetadata(**file_content)
@@ -162,7 +163,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         # If the key-value store directory exists, reconstruct the client from the metadata file.
         if path_to_kvs.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -239,7 +240,9 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         # Read the metadata file
         async with self._lock:
             try:
-                file = await asyncio.to_thread(open, record_metadata_filepath)
+                file = await asyncio.to_thread(
+                    functools.partial(record_metadata_filepath.open, mode='r', encoding='utf-8'),
+                )
             except FileNotFoundError:
                 logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                 return None
@@ -373,7 +376,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         # List and sort all files *inside* a brief lock, then release it immediately:
         async with self._lock:
-            files = sorted(await asyncio.to_thread(list, self.path_to_kvs.glob('*')))
+            files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*'))))
 
         count = 0
 
crawlee/storage_clients/_file_system/_request_queue_client.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import json
 import shutil
 from collections import deque
@@ -197,7 +198,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
@@ -232,7 +233,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
 
         # If the RQ directory exists, reconstruct the client from the metadata file.
         if path_to_rq.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(path_to_metadata.open, encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -756,7 +757,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True)
 
         # List all the json files.
-        files = await asyncio.to_thread(list, path_to_rq.glob('*.json'))
+        files = await asyncio.to_thread(lambda: list(path_to_rq.glob('*.json')))
 
         # Filter out metadata file and non-file entries.
         filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
@@ -775,7 +776,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open, file_path)
+            file = await asyncio.to_thread(functools.partial(file_path.open, mode='r', encoding='utf-8'))
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
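
Across the three file-system clients, bare `open(path)` calls are replaced with `Path.open` plus an explicit mode and `encoding='utf-8'`, bound either by forwarding keyword arguments through `asyncio.to_thread` or by pre-binding them with `functools.partial` where a single callable is preferred. A small sketch of the latter form, with a made-up helper name:

import asyncio
import functools
import json
from pathlib import Path
from typing import Any


async def load_metadata(path: Path) -> Any:
    # functools.partial pre-binds the keyword arguments into a single callable;
    # asyncio.to_thread(path.open, mode='r', encoding='utf-8') is the equivalent
    # keyword-forwarding form used at most call sites.
    file = await asyncio.to_thread(functools.partial(path.open, mode='r', encoding='utf-8'))
    try:
        return json.load(file)
    finally:
        file.close()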
crawlee/storage_clients/_redis/_client_mixin.py CHANGED
@@ -179,7 +179,7 @@ class RedisClientMixin:
         """Create a new Redis pipeline."""
         async with self._redis.pipeline() as pipe:
             try:
-                pipe.multi()  # type: ignore[no-untyped-call]
+                pipe.multi()
                 yield pipe
             finally:
                 if with_execute:
@@ -187,7 +187,6 @@ class RedisClientMixin:
 
     async def _create_storage(self, pipeline: Pipeline) -> None:
         """Create the actual storage structure in Redis."""
-        _ = pipeline  # To avoid unused variable mypy error
 
     async def _create_script(self, script_name: str) -> AsyncScript:
         """Load a Lua script from a file and return a Script object."""
@@ -262,8 +261,6 @@ class RedisClientMixin:
             pipeline: The Redis pipeline to use for the update.
             **kwargs: Storage-specific update parameters.
         """
-        _ = pipeline  # To avoid unused variable mypy error
-        _ = kwargs
 
     async def _update_metadata(
         self,
crawlee/storage_clients/_redis/_dataset_client.py CHANGED
@@ -179,13 +179,15 @@ class RedisDatasetClient(DatasetClient, RedisClientMixin):
             case (True, int(), None):
                 json_path += f'[:-{offset}]'
             case (True, int(), int()):
-                json_path += f'[-{offset + limit}:-{offset}]'
+                # ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887.
+                json_path += f'[-{offset + limit}:-{offset}]'  # ty: ignore[unsupported-operator]
             case (False, 0, int()):
                 json_path += f'[:{limit}]'
             case (False, int(), None):
                 json_path += f'[{offset}:]'
             case (False, int(), int()):
-                json_path += f'[{offset}:{offset + limit}]'
+                # ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887.
+                json_path += f'[{offset}:{offset + limit}]'  # ty: ignore[unsupported-operator]
 
         if json_path == '$':
             json_path = '$[*]'
@@ -195,6 +197,8 @@ class RedisDatasetClient(DatasetClient, RedisClientMixin):
         if data is None:
             data = []
 
+        data = [item for item in data if isinstance(item, dict)]
+
         if skip_empty:
             data = [item for item in data if item]
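
The `match` on `(desc, offset, limit)` turns pagination into a JSONPath slice, and the new comprehension drops any non-object entries before the existing empty-item filter. A short worked example with invented values, following the same branches as above:

desc, offset, limit = True, 2, 3  # invented pagination values
json_path = '$'
match (desc, offset, limit):
    case (True, int(), int()):
        json_path += f'[-{offset + limit}:-{offset}]'
    case (False, int(), int()):
        json_path += f'[{offset}:{offset + limit}]'
print(json_path)  # '$[-5:-2]': skip the two newest items, take the next three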
 
crawlee/storage_clients/_redis/_key_value_store_client.py CHANGED
@@ -144,7 +144,7 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):
 
         async with self._get_pipeline() as pipe:
             # redis-py typing issue
-            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  # type: ignore[arg-type]
+            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  # ty: ignore[invalid-argument-type]
 
             await await_redis_response(
                 pipe.hset(
@@ -174,9 +174,7 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):
 
         # Query the record by key
         # redis-py typing issue
-        value_bytes: bytes | None = await await_redis_response(
-            self._redis.hget(self._items_key, key)  # type: ignore[arg-type]
-        )
+        value_bytes: bytes | None = await await_redis_response(self._redis.hget(self._items_key, key))  # ty: ignore[invalid-assignment]
 
         if value_bytes is None:
             logger.warning(f'Value for key "{key}" is missing.')
@@ -225,7 +223,7 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):
             raise TypeError('The items data was received in an incorrect format.')
 
         # Get all keys, sorted alphabetically
-        keys = sorted(items_data.keys())
+        keys = sorted(items_data.keys())  # ty: ignore[invalid-argument-type]
 
         # Apply exclusive_start_key filter if provided
         if exclusive_start_key is not None: