crawlee 1.0.1b9__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +62 -32
  4. crawlee/_service_locator.py +4 -4
  5. crawlee/_types.py +52 -19
  6. crawlee/_utils/context.py +3 -3
  7. crawlee/_utils/file.py +8 -1
  8. crawlee/_utils/globs.py +4 -4
  9. crawlee/_utils/recoverable_state.py +32 -8
  10. crawlee/_utils/recurring_task.py +27 -3
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +13 -6
  13. crawlee/_utils/system.py +27 -11
  14. crawlee/_utils/time.py +41 -1
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +5 -2
  17. crawlee/browsers/_playwright_browser.py +2 -1
  18. crawlee/browsers/_playwright_browser_controller.py +1 -1
  19. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  20. crawlee/browsers/_types.py +1 -1
  21. crawlee/configuration.py +3 -1
  22. crawlee/crawlers/__init__.py +5 -1
  23. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  24. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
  25. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  28. crawlee/crawlers/_basic/_basic_crawler.py +160 -134
  29. crawlee/crawlers/_basic/_context_utils.py +24 -0
  30. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  31. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  32. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/errors.py +4 -0
  39. crawlee/events/_event_manager.py +12 -6
  40. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/http_clients/_base.py +4 -0
  43. crawlee/http_clients/_curl_impersonate.py +68 -14
  44. crawlee/http_clients/_httpx.py +16 -6
  45. crawlee/http_clients/_impit.py +25 -10
  46. crawlee/otel/crawler_instrumentor.py +4 -6
  47. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  48. crawlee/router.py +13 -3
  49. crawlee/sessions/_cookies.py +13 -8
  50. crawlee/sessions/_models.py +3 -3
  51. crawlee/sessions/_session_pool.py +1 -1
  52. crawlee/statistics/_error_snapshotter.py +1 -1
  53. crawlee/statistics/_models.py +51 -9
  54. crawlee/statistics/_statistics.py +24 -33
  55. crawlee/storage_clients/__init__.py +4 -0
  56. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  57. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  58. crawlee/storage_clients/_file_system/_dataset_client.py +8 -7
  59. crawlee/storage_clients/_file_system/_key_value_store_client.py +9 -6
  60. crawlee/storage_clients/_file_system/_request_queue_client.py +31 -12
  61. crawlee/storage_clients/_memory/_dataset_client.py +2 -2
  62. crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
  63. crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
  64. crawlee/storage_clients/_redis/__init__.py +6 -0
  65. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  66. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  67. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  68. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  69. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  70. crawlee/storage_clients/_redis/_utils.py +23 -0
  71. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  72. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  73. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  74. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  75. crawlee/storage_clients/_redis/py.typed +0 -0
  76. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  77. crawlee/storage_clients/_sql/_dataset_client.py +2 -2
  78. crawlee/storage_clients/_sql/_db_models.py +1 -2
  79. crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
  80. crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
  81. crawlee/storage_clients/_sql/_storage_client.py +1 -1
  82. crawlee/storage_clients/models.py +8 -3
  83. crawlee/storages/_base.py +3 -1
  84. crawlee/storages/_dataset.py +3 -0
  85. crawlee/storages/_key_value_store.py +8 -2
  86. crawlee/storages/_request_queue.py +3 -0
  87. crawlee/storages/_storage_instance_manager.py +109 -42
  88. crawlee/storages/_utils.py +11 -0
  89. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +14 -16
  90. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/RECORD +93 -79
  91. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  92. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  93. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from importlib import metadata
 
- from ._request import Request, RequestOptions
+ from ._request import Request, RequestOptions, RequestState
  from ._service_locator import service_locator
  from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
  from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
  'HttpHeaders',
  'Request',
  'RequestOptions',
+ 'RequestState',
  'RequestTransformAction',
  'SkippedReason',
  'service_locator',
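
`RequestState` is now exported from the package root, so lifecycle checks no longer need to import from the private `crawlee._request` module. A minimal sketch of the new import path (the `UNPROCESSED` default comes from the `CrawleeRequestData` change shown further below):

    from crawlee import Request, RequestState

    request = Request.from_url('https://crawlee.dev')
    # With the new non-optional default, a freshly created request should report UNPROCESSED.
    assert request.state is RequestState.UNPROCESSED
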
crawlee/_browserforge_workaround.py CHANGED
@@ -1,4 +1,8 @@
  # ruff: noqa: N802, PLC0415
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+ from collections.abc import Callable
 
 
  def patch_browserforge() -> None:
@@ -12,7 +16,7 @@ def patch_browserforge() -> None:
  import apify_fingerprint_datapoints
  from browserforge import download
 
- download.DATA_DIRS: dict[str, Path] = { # type:ignore[misc]
+ download.DATA_DIRS = {
  'headers': apify_fingerprint_datapoints.get_header_network().parent,
  'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent,
  }
@@ -20,7 +24,7 @@ def patch_browserforge() -> None:
  def DownloadIfNotExists(**flags: bool) -> None:
  pass
 
- download.DownloadIfNotExists = DownloadIfNotExists
+ download.DownloadIfNotExists: Callable[..., None] = DownloadIfNotExists
 
  import browserforge.bayesian_network
 
@@ -33,7 +37,7 @@ def patch_browserforge() -> None:
  path = download.DATA_DIRS['fingerprints'] / download.DATA_FILES['fingerprints'][path.name]
  super().__init__(path)
 
- browserforge.bayesian_network.BayesianNetwork = BayesianNetwork # type:ignore[misc]
+ browserforge.bayesian_network.BayesianNetwork: BayesianNetwork = BayesianNetwork
  import browserforge.headers.generator
 
  browserforge.headers.generator.DATA_DIR = download.DATA_DIRS['headers']
crawlee/_request.py CHANGED
@@ -34,14 +34,14 @@ class RequestState(IntEnum):
  class CrawleeRequestData(BaseModel):
  """Crawlee-specific configuration stored in the `user_data`."""
 
- max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+ max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
  """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
  `BasicCrawler`."""
 
  enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
  """The strategy that was used for enqueuing the request."""
 
- state: RequestState | None = None
+ state: RequestState = RequestState.UNPROCESSED
  """Describes the request's current lifecycle state."""
 
  session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -93,7 +93,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
  def __delitem__(self, key: str) -> None:
  del self.__pydantic_extra__[key]
 
- def __iter__(self) -> Iterator[str]: # type: ignore[override]
+ def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override]
  yield from self.__pydantic_extra__
 
  def __len__(self) -> int:
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
  always_enqueue: NotRequired[bool]
  user_data: NotRequired[dict[str, JsonSerializable]]
  no_retry: NotRequired[bool]
+ enqueue_strategy: NotRequired[EnqueueStrategy]
+ max_retries: NotRequired[int | None]
 
 
  @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):
 
  model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
- unique_key: Annotated[str, Field(alias='uniqueKey')]
+ unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
  """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
  to the same URL.
 
@@ -178,40 +180,52 @@ class Request(BaseModel):
  and specify which URLs shall be considered equal.
  """
 
- url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+ url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
  """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
  and fragments."""
 
- method: HttpMethod = 'GET'
+ method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
  """HTTP request method."""
 
- headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
- """HTTP request headers."""
-
  payload: Annotated[
  HttpPayload | None,
  BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
  PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+ Field(frozen=True),
  ] = None
  """HTTP request payload."""
 
- user_data: Annotated[
- dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience
- Field(alias='userData', default_factory=lambda: UserData()),
- PlainValidator(user_data_adapter.validate_python),
- PlainSerializer(
- lambda instance: user_data_adapter.dump_python(
- instance,
- by_alias=True,
- exclude_none=True,
- exclude_unset=True,
- exclude_defaults=True,
- )
- ),
- ] = {}
- """Custom user data assigned to the request. Use this to save any request related data to the
- request's scope, keeping them accessible on retries, failures etc.
- """
+ # Workaround for Pydantic and type checkers when using Annotated with default_factory
+ if TYPE_CHECKING:
+ headers: HttpHeaders = HttpHeaders()
+ """HTTP request headers."""
+
+ user_data: dict[str, JsonSerializable] = {}
+ """Custom user data assigned to the request. Use this to save any request related data to the
+ request's scope, keeping them accessible on retries, failures etc.
+ """
+
+ else:
+ headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
+ """HTTP request headers."""
+
+ user_data: Annotated[
+ dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience
+ Field(alias='userData', default_factory=lambda: UserData()),
+ PlainValidator(user_data_adapter.validate_python),
+ PlainSerializer(
+ lambda instance: user_data_adapter.dump_python(
+ instance,
+ by_alias=True,
+ exclude_none=True,
+ exclude_unset=True,
+ exclude_defaults=True,
+ )
+ ),
+ ]
+ """Custom user data assigned to the request. Use this to save any request related data to the
+ request's scope, keeping them accessible on retries, failures etc.
+ """
 
  retry_count: Annotated[int, Field(alias='retryCount')] = 0
  """Number of times the request has been retried."""
@@ -239,6 +253,8 @@ class Request(BaseModel):
  keep_url_fragment: bool = False,
  use_extended_unique_key: bool = False,
  always_enqueue: bool = False,
+ enqueue_strategy: EnqueueStrategy | None = None,
+ max_retries: int | None = None,
  **kwargs: Any,
  ) -> Self:
  """Create a new `Request` instance from a URL.
@@ -266,6 +282,9 @@
  `unique_key` computation. This is only relevant when `unique_key` is not provided.
  always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
  Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+ enqueue_strategy: The strategy that will be used for enqueuing the request.
+ max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+ option of `BasicCrawler`.
  **kwargs: Additional request properties.
  """
  if unique_key is not None and always_enqueue:
@@ -288,7 +307,21 @@
  )
 
  if always_enqueue:
- unique_key = f'{unique_key}_{crypto_random_object_id()}'
+ unique_key = f'{crypto_random_object_id()}|{unique_key}'
+
+ user_data_dict = kwargs.pop('user_data', {}) or {}
+ crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+ if max_retries is not None:
+ crawlee_data_dict['maxRetries'] = max_retries
+
+ if enqueue_strategy is not None:
+ crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+ crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+ if crawlee_data:
+ user_data_dict['__crawlee'] = crawlee_data
 
  request = cls(
  url=url,
@@ -296,6 +329,7 @@
  method=method,
  headers=headers,
  payload=payload,
+ user_data=user_data_dict,
  **kwargs,
  )
 
@@ -341,7 +375,7 @@ class Request(BaseModel):
  self.crawlee_data.crawl_depth = new_value
 
  @property
- def state(self) -> RequestState | None:
+ def state(self) -> RequestState:
  """Crawlee-specific request handling state."""
  return self.crawlee_data.state
 
@@ -354,10 +388,6 @@ class Request(BaseModel):
  """Crawlee-specific limit on the number of retries of the request."""
  return self.crawlee_data.max_retries
 
- @max_retries.setter
- def max_retries(self, new_max_retries: int) -> None:
- self.crawlee_data.max_retries = new_max_retries
-
  @property
  def session_rotation_count(self) -> int | None:
  """Crawlee-specific number of finished session rotations for the request."""
crawlee/_service_locator.py CHANGED
@@ -38,7 +38,7 @@ class ServiceLocator:
  def get_configuration(self) -> Configuration:
  """Get the configuration."""
  if self._configuration is None:
- logger.warning('No configuration set, implicitly creating and using default Configuration.')
+ logger.debug('No configuration set, implicitly creating and using default Configuration.')
  self._configuration = Configuration()
 
  return self._configuration
@@ -63,9 +63,9 @@
  def get_event_manager(self) -> EventManager:
  """Get the event manager."""
  if self._event_manager is None:
- logger.warning('No event manager set, implicitly creating and using default LocalEventManager.')
+ logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
  if self._configuration is None:
- logger.warning(
+ logger.debug(
  'Implicit creation of event manager will implicitly set configuration as side effect. '
  'It is advised to explicitly first set the configuration instead.'
  )
@@ -93,7 +93,7 @@
  def get_storage_client(self) -> StorageClient:
  """Get the storage client."""
  if self._storage_client is None:
- logger.warning('No storage client set, implicitly creating and using default FileSystemStorageClient.')
+ logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
  if self._configuration is None:
  logger.warning(
  'Implicit creation of storage client will implicitly set configuration as side effect. '
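
The demoted messages still advise configuring services explicitly before anything is created implicitly. A sketch of that pattern, assuming the `set_*` counterparts of the getters shown above are available on `service_locator` (as in current crawlee releases):

    from crawlee import service_locator
    from crawlee.configuration import Configuration

    # Setting the configuration up front avoids the implicit-creation side effects
    # that the messages above describe.
    service_locator.set_configuration(Configuration())
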
crawlee/_types.py CHANGED
@@ -2,18 +2,9 @@ from __future__ import annotations
 
  import dataclasses
  from collections.abc import Callable, Iterator, Mapping
+ from copy import deepcopy
  from dataclasses import dataclass
- from typing import (
- TYPE_CHECKING,
- Annotated,
- Any,
- Literal,
- Protocol,
- TypedDict,
- TypeVar,
- cast,
- overload,
- )
+ from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
 
  from pydantic import ConfigDict, Field, PlainValidator, RootModel
 
@@ -25,7 +16,7 @@ if TYPE_CHECKING:
  import re
  from collections.abc import Callable, Coroutine, Sequence
 
- from typing_extensions import NotRequired, Required, Unpack
+ from typing_extensions import NotRequired, Required, Self, Unpack
 
  from crawlee import Glob, Request
  from crawlee._request import RequestOptions
@@ -71,11 +62,15 @@ class HttpHeaders(RootModel, Mapping[str, str]):
 
  model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
- root: Annotated[
- dict[str, str],
- PlainValidator(lambda value: _normalize_headers(value)),
- Field(default_factory=dict),
- ] = {}
+ # Workaround for Pydantic and type checkers when using Annotated with default_factory
+ if TYPE_CHECKING:
+ root: dict[str, str] = {}
+ else:
+ root: Annotated[
+ dict[str, str],
+ PlainValidator(lambda value: _normalize_headers(value)),
+ Field(default_factory=lambda: dict[str, str]()),
+ ]
 
  def __getitem__(self, key: str) -> str:
  return self.root[key.lower()]
@@ -96,7 +91,7 @@ class HttpHeaders(RootModel, Mapping[str, str]):
  combined_headers = {**other, **self.root}
  return HttpHeaders(combined_headers)
 
- def __iter__(self) -> Iterator[str]: # type: ignore[override]
+ def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override]
  yield from self.root
 
  def __len__(self) -> int:
@@ -266,12 +261,24 @@
  class RequestHandlerRunResult:
  """Record of calls to storage-related context helpers."""
 
- def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None:
+ def __init__(
+ self,
+ *,
+ key_value_store_getter: GetKeyValueStoreFunction,
+ request: Request,
+ ) -> None:
  self._key_value_store_getter = key_value_store_getter
  self.add_requests_calls = list[AddRequestsKwargs]()
  self.push_data_calls = list[PushDataFunctionCall]()
  self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()
 
+ # Isolated copies for handler execution
+ self._request = deepcopy(request)
+
+ @property
+ def request(self) -> Request:
+ return self._request
+
  async def add_requests(
  self,
  requests: Sequence[str | Request],
@@ -321,6 +328,14 @@
 
  return self.key_value_store_changes[id, name, alias]
 
+ def apply_request_changes(self, target: Request) -> None:
+ """Apply tracked changes from handler copy to original request."""
+ if self.request.user_data != target.user_data:
+ target.user_data = self.request.user_data
+
+ if self.request.headers != target.headers:
+ target.headers = self.request.headers
+
 
  @docs_group('Functions')
  class AddRequestsFunction(Protocol):
@@ -649,6 +664,24 @@ class BasicCrawlingContext:
  """Return hash of the context. Each context is considered unique."""
  return id(self)
 
+ def create_modified_copy(
+ self,
+ push_data: PushDataFunction | None = None,
+ add_requests: AddRequestsFunction | None = None,
+ get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+ ) -> Self:
+ """Create a modified copy of the crawling context with specified changes."""
+ modifications = dict[str, Any]()
+
+ if push_data is not None:
+ modifications['push_data'] = push_data
+ if add_requests is not None:
+ modifications['add_requests'] = add_requests
+ if get_key_value_store is not None:
+ modifications['get_key_value_store'] = get_key_value_store
+
+ return dataclasses.replace(self, **modifications)
+
 
  class GetDataKwargs(TypedDict):
  """Keyword arguments for dataset's `get_data` method."""
crawlee/_utils/context.py CHANGED
@@ -1,9 +1,9 @@
  from __future__ import annotations
 
- import asyncio
+ import inspect
  from collections.abc import Callable
  from functools import wraps
- from typing import Any, TypeVar
+ from typing import Any, TypeVar, cast
 
  T = TypeVar('T', bound=Callable[..., Any])
 
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:
 
  return await method(self, *args, **kwargs)
 
- return async_wrapper if asyncio.iscoroutinefunction(method) else sync_wrapper # type: ignore[return-value]
+ return cast('T', async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper)
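
The switch from `asyncio.iscoroutinefunction` to `inspect.iscoroutinefunction` avoids the asyncio variant, which is deprecated as of Python 3.14; the sync/async dispatch itself is unchanged. A generic sketch of the same pattern (the `require_active` decorator is hypothetical, not crawlee API):

    import inspect
    from collections.abc import Callable
    from functools import wraps
    from typing import Any, TypeVar, cast

    T = TypeVar('T', bound=Callable[..., Any])

    def require_active(method: T) -> T:
        @wraps(method)
        def sync_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
            if not getattr(self, 'active', False):
                raise RuntimeError('The context is not active.')
            return method(self, *args, **kwargs)

        @wraps(method)
        async def async_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
            if not getattr(self, 'active', False):
                raise RuntimeError('The context is not active.')
            return await method(self, *args, **kwargs)

        # inspect.iscoroutinefunction is the non-deprecated check; cast() keeps
        # the decorated callable's original type for the caller.
        return cast('T', async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper)
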
crawlee/_utils/file.py CHANGED
@@ -163,7 +163,14 @@ async def export_csv_to_stream(
  dst: TextIO,
  **kwargs: Unpack[ExportDataCsvKwargs],
  ) -> None:
- writer = csv.writer(dst, **kwargs) # type: ignore[arg-type]
+ # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+ # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+ # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+ # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+ if 'lineterminator' not in kwargs:
+ kwargs['lineterminator'] = '\n'
+
+ writer = csv.writer(dst, **kwargs)
  write_header = True
 
  # Iterate over the dataset and write to CSV.
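
The Windows line-ending problem the new comment describes can be reproduced with the standard library alone; a minimal sketch that simulates text-mode newline translation with `io.StringIO`:

    import csv
    import io

    # newline='\r\n' makes every written '\n' come out as '\r\n', like Windows text mode.
    dst = io.StringIO(newline='\r\n')
    writer = csv.writer(dst, lineterminator='\n')
    writer.writerow(['url', 'title'])

    # With the default lineterminator ('\r\n') the row would end in '\r\r\n' here;
    # with '\n' the platform translation yields a single '\r\n'.
    assert dst.getvalue().endswith('\r\n')
    assert not dst.getvalue().endswith('\r\r\n')
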
crawlee/_utils/globs.py CHANGED
@@ -33,12 +33,12 @@ def _translate(
 
  HACK: This function is copied from CPython stdlib source. It will be released in Python 3.13 as `glob.translate`
  """
- if not seps:
- seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep
+ _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps
 
- escaped_seps = ''.join(map(re.escape, seps))
- any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
+ escaped_seps = ''.join(map(re.escape, _seps))
+ any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps
  not_sep = f'[^{escaped_seps}]'
+
  if include_hidden:
  one_last_segment = f'{not_sep}+'
  one_segment = f'{one_last_segment}{any_sep}'
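
As the HACK note says, this helper mirrors `glob.translate`, which is available in the standard library from Python 3.13. On such versions the equivalent call looks roughly like this (a sketch, not crawlee code):

    import glob  # Python 3.13+
    import re

    pattern = glob.translate('docs/**/*.md', recursive=True, include_hidden=True)
    assert re.match(pattern, 'docs/guide/setup.md')
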
crawlee/_utils/recoverable_state.py CHANGED
@@ -4,12 +4,14 @@ from typing import TYPE_CHECKING, Generic, Literal, TypeVar
 
  from pydantic import BaseModel
 
+ from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
  from crawlee.events._types import Event, EventPersistStateData
 
  if TYPE_CHECKING:
  import logging
+ from collections.abc import Callable, Coroutine
 
- from crawlee.storages._key_value_store import KeyValueStore
+ from crawlee.storages import KeyValueStore
 
  TStateModel = TypeVar('TStateModel', bound=BaseModel)
 
@@ -37,6 +39,7 @@ class RecoverableState(Generic[TStateModel]):
  persistence_enabled: Literal[True, False, 'explicit_only'] = False,
  persist_state_kvs_name: str | None = None,
  persist_state_kvs_id: str | None = None,
+ persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
  logger: logging.Logger,
  ) -> None:
  """Initialize a new recoverable state object.
@@ -51,16 +54,40 @@
  If neither a name nor and id are supplied, the default store will be used.
  persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.
  If neither a name nor and id are supplied, the default store will be used.
+ persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If
+ not provided, a system-wide KeyValueStore will be used, based on service locator configuration.
  logger: A logger instance for logging operations related to state persistence
  """
+ raise_if_too_many_kwargs(
+ persist_state_kvs_name=persist_state_kvs_name,
+ persist_state_kvs_id=persist_state_kvs_id,
+ persist_state_kvs_factory=persist_state_kvs_factory,
+ )
+ if not persist_state_kvs_factory:
+ logger.debug(
+ 'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '
+ 'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '
+ 'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid '
+ 'global side effects.'
+ )
+
  self._default_state = default_state
  self._state_type: type[TStateModel] = self._default_state.__class__
  self._state: TStateModel | None = None
  self._persistence_enabled = persistence_enabled
  self._persist_state_key = persist_state_key
- self._persist_state_kvs_name = persist_state_kvs_name
- self._persist_state_kvs_id = persist_state_kvs_id
- self._key_value_store: 'KeyValueStore | None' = None # noqa: UP037
+ if persist_state_kvs_factory is None:
+
+ async def kvs_factory() -> KeyValueStore:
+ from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import
+
+ return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)
+
+ self._persist_state_kvs_factory = kvs_factory
+ else:
+ self._persist_state_kvs_factory = persist_state_kvs_factory
+
+ self._key_value_store: KeyValueStore | None = None
  self._log = logger
 
  async def initialize(self) -> TStateModel:
@@ -77,11 +104,8 @@
  return self.current_value
 
  # Import here to avoid circular imports.
- from crawlee.storages._key_value_store import KeyValueStore # noqa: PLC0415
 
- self._key_value_store = await KeyValueStore.open(
- name=self._persist_state_kvs_name, id=self._persist_state_kvs_id
- )
+ self._key_value_store = await self._persist_state_kvs_factory()
 
  await self._load_saved_state()
 
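
The new `persist_state_kvs_factory` parameter lets a caller hand over the exact store rather than relying on the service-locator-backed default. A sketch, assuming the remaining constructor parameters (`default_state`, `persist_state_key`, `persistence_enabled`, `logger`) behave as shown in the surrounding context:

    import logging

    from pydantic import BaseModel

    from crawlee._utils.recoverable_state import RecoverableState
    from crawlee.storages import KeyValueStore

    class CrawlProgress(BaseModel):
        pages_done: int = 0

    async def open_state_store() -> KeyValueStore:
        # An explicit factory avoids the implicit system-wide KeyValueStore and the
        # service_locator side effects mentioned in the debug message above.
        return await KeyValueStore.open(name='crawl-progress')

    state = RecoverableState(
        default_state=CrawlProgress(),
        persist_state_key='CRAWL_PROGRESS',
        persistence_enabled=True,
        persist_state_kvs_factory=open_state_store,
        logger=logging.getLogger(__name__),
    )
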
crawlee/_utils/recurring_task.py CHANGED
@@ -1,12 +1,16 @@
  from __future__ import annotations
 
  import asyncio
+ import inspect
  from logging import getLogger
  from typing import TYPE_CHECKING
 
  if TYPE_CHECKING:
  from collections.abc import Callable
  from datetime import timedelta
+ from types import TracebackType
+
+ from typing_extensions import Self
 
  logger = getLogger(__name__)
 
@@ -21,11 +25,27 @@ class RecurringTask:
  """
 
  def __init__(self, func: Callable, delay: timedelta) -> None:
- logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...')
+ logger.debug(
+ 'Calling RecurringTask.__init__(func={%s}, delay={%s})...',
+ func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
+ delay,
+ )
  self.func = func
  self.delay = delay
  self.task: asyncio.Task | None = None
 
+ async def __aenter__(self) -> Self:
+ self.start()
+ return self
+
+ async def __aexit__(
+ self,
+ exc_type: type[BaseException] | None,
+ exc_value: BaseException | None,
+ exc_traceback: TracebackType | None,
+ ) -> None:
+ await self.stop()
+
  async def _wrapper(self) -> None:
  """Continuously execute the provided function with the specified delay.
 
@@ -34,12 +54,16 @@
  """
  sleep_time_secs = self.delay.total_seconds()
  while True:
- await self.func() if asyncio.iscoroutinefunction(self.func) else self.func()
+ await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
  await asyncio.sleep(sleep_time_secs)
 
  def start(self) -> None:
  """Start the recurring task execution."""
- self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}')
+ name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
+ self.task = asyncio.create_task(
+ self._wrapper(),
+ name=f'Task-recurring-{name}',
+ )
 
  async def stop(self) -> None:
  """Stop the recurring task execution."""
crawlee/_utils/robots.py CHANGED
@@ -1,5 +1,6 @@
  from __future__ import annotations
 
+ from logging import getLogger
  from typing import TYPE_CHECKING
 
  from protego import Protego
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
  from crawlee.proxy_configuration import ProxyInfo
 
 
+ logger = getLogger(__name__)
+
+
  class RobotsTxtFile:
  def __init__(
  self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ class RobotsTxtFile:
  http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
  proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
  """
- response = await http_client.send_request(url, proxy_info=proxy_info)
- body = (
- b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
- )
+ try:
+ response = await http_client.send_request(url, proxy_info=proxy_info)
+
+ body = (
+ b'User-agent: *\nAllow: /'
+ if is_status_code_client_error(response.status_code)
+ else await response.read()
+ )
+ robots = Protego.parse(body.decode('utf-8'))
+
+ except Exception as e:
+ logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
 
- robots = Protego.parse(body.decode('utf-8'))
+ robots = Protego.parse('User-agent: *\nAllow: /')
 
  return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
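
The net effect of the new `try`/`except` is that any failure to fetch robots.txt now degrades to an allow-all policy instead of propagating. Using Protego directly, the fallback document behaves like this (a sketch of the third-party library, not crawlee API):

    from protego import Protego

    # The same allow-all body used above for client errors and fetch failures.
    robots = Protego.parse('User-agent: *\nAllow: /')
    assert robots.can_fetch('https://example.com/any/path', '*')
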