crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_service_locator.py +4 -4
  5. crawlee/_types.py +44 -5
  6. crawlee/_utils/context.py +3 -3
  7. crawlee/_utils/file.py +8 -1
  8. crawlee/_utils/globs.py +4 -4
  9. crawlee/_utils/recoverable_state.py +32 -8
  10. crawlee/_utils/recurring_task.py +27 -3
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +13 -6
  13. crawlee/_utils/system.py +27 -11
  14. crawlee/_utils/time.py +41 -1
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +5 -2
  17. crawlee/browsers/_playwright_browser.py +2 -1
  18. crawlee/browsers/_playwright_browser_controller.py +1 -1
  19. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  20. crawlee/browsers/_types.py +1 -1
  21. crawlee/configuration.py +3 -1
  22. crawlee/crawlers/__init__.py +5 -1
  23. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  24. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
  25. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  28. crawlee/crawlers/_basic/_basic_crawler.py +156 -131
  29. crawlee/crawlers/_basic/_context_utils.py +24 -0
  30. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  31. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  32. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/errors.py +4 -0
  39. crawlee/events/_event_manager.py +12 -6
  40. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/http_clients/_base.py +4 -0
  43. crawlee/http_clients/_curl_impersonate.py +68 -14
  44. crawlee/http_clients/_httpx.py +16 -6
  45. crawlee/http_clients/_impit.py +25 -10
  46. crawlee/otel/crawler_instrumentor.py +4 -6
  47. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  48. crawlee/router.py +13 -3
  49. crawlee/sessions/_cookies.py +13 -8
  50. crawlee/sessions/_models.py +3 -3
  51. crawlee/sessions/_session_pool.py +1 -1
  52. crawlee/statistics/_error_snapshotter.py +1 -1
  53. crawlee/statistics/_models.py +51 -9
  54. crawlee/statistics/_statistics.py +24 -33
  55. crawlee/storage_clients/__init__.py +4 -0
  56. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  57. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  58. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  59. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  60. crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
  61. crawlee/storage_clients/_redis/__init__.py +6 -0
  62. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  63. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  64. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  65. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  66. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  67. crawlee/storage_clients/_redis/_utils.py +23 -0
  68. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  69. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  70. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  71. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  72. crawlee/storage_clients/_redis/py.typed +0 -0
  73. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  74. crawlee/storage_clients/_sql/_db_models.py +1 -2
  75. crawlee/storage_clients/models.py +8 -3
  76. crawlee/storages/_key_value_store.py +5 -2
  77. crawlee/storages/_storage_instance_manager.py +103 -44
  78. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
  79. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
  80. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  81. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  82. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from importlib import metadata

- from ._request import Request, RequestOptions
+ from ._request import Request, RequestOptions, RequestState
  from ._service_locator import service_locator
  from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
  from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
  'HttpHeaders',
  'Request',
  'RequestOptions',
+ 'RequestState',
  'RequestTransformAction',
  'SkippedReason',
  'service_locator',
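Both the new top-level `RequestState` export and the new `Request.from_url` keyword arguments from the `_request.py` section below can be used straight from the package root. A minimal sketch (the URL and retry count are illustrative):

from crawlee import Request, RequestState

# max_retries overrides the crawler-wide `max_request_retries` for this one request and is
# stored under user_data['__crawlee']; new requests now default to the UNPROCESSED state.
request = Request.from_url('https://example.com/flaky-endpoint', max_retries=1)

assert request.state is RequestState.UNPROCESSED
assert request.max_retries == 1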
crawlee/_browserforge_workaround.py CHANGED
@@ -1,4 +1,8 @@
  # ruff: noqa: N802, PLC0415
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+ from collections.abc import Callable


  def patch_browserforge() -> None:
@@ -12,7 +16,7 @@ def patch_browserforge() -> None:
  import apify_fingerprint_datapoints
  from browserforge import download

- download.DATA_DIRS: dict[str, Path] = { # type:ignore[misc]
+ download.DATA_DIRS = {
  'headers': apify_fingerprint_datapoints.get_header_network().parent,
  'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent,
  }
@@ -20,7 +24,7 @@ def patch_browserforge() -> None:
  def DownloadIfNotExists(**flags: bool) -> None:
  pass

- download.DownloadIfNotExists = DownloadIfNotExists
+ download.DownloadIfNotExists: Callable[..., None] = DownloadIfNotExists

  import browserforge.bayesian_network

@@ -33,7 +37,7 @@ def patch_browserforge() -> None:
  path = download.DATA_DIRS['fingerprints'] / download.DATA_FILES['fingerprints'][path.name]
  super().__init__(path)

- browserforge.bayesian_network.BayesianNetwork = BayesianNetwork # type:ignore[misc]
+ browserforge.bayesian_network.BayesianNetwork: BayesianNetwork = BayesianNetwork
  import browserforge.headers.generator

  browserforge.headers.generator.DATA_DIR = download.DATA_DIRS['headers']
crawlee/_request.py CHANGED
@@ -34,14 +34,14 @@ class RequestState(IntEnum):
  class CrawleeRequestData(BaseModel):
  """Crawlee-specific configuration stored in the `user_data`."""

- max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+ max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
  """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
  `BasicCrawler`."""

  enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
  """The strategy that was used for enqueuing the request."""

- state: RequestState | None = None
+ state: RequestState = RequestState.UNPROCESSED
  """Describes the request's current lifecycle state."""

  session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -93,7 +93,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
  def __delitem__(self, key: str) -> None:
  del self.__pydantic_extra__[key]

- def __iter__(self) -> Iterator[str]: # type: ignore[override]
+ def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override]
  yield from self.__pydantic_extra__

  def __len__(self) -> int:
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
  always_enqueue: NotRequired[bool]
  user_data: NotRequired[dict[str, JsonSerializable]]
  no_retry: NotRequired[bool]
+ enqueue_strategy: NotRequired[EnqueueStrategy]
+ max_retries: NotRequired[int | None]


  @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):

  model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

- unique_key: Annotated[str, Field(alias='uniqueKey')]
+ unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
  """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
  to the same URL.

@@ -178,21 +180,22 @@ class Request(BaseModel):
  and specify which URLs shall be considered equal.
  """

- url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+ url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
  """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
  and fragments."""

- method: HttpMethod = 'GET'
+ method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
  """HTTP request method."""

  payload: Annotated[
  HttpPayload | None,
  BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
  PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+ Field(frozen=True),
  ] = None
  """HTTP request payload."""

- # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+ # Workaround for Pydantic and type checkers when using Annotated with default_factory
  if TYPE_CHECKING:
  headers: HttpHeaders = HttpHeaders()
  """HTTP request headers."""
@@ -250,6 +253,8 @@ class Request(BaseModel):
  keep_url_fragment: bool = False,
  use_extended_unique_key: bool = False,
  always_enqueue: bool = False,
+ enqueue_strategy: EnqueueStrategy | None = None,
+ max_retries: int | None = None,
  **kwargs: Any,
  ) -> Self:
  """Create a new `Request` instance from a URL.
@@ -277,6 +282,9 @@ class Request(BaseModel):
  `unique_key` computation. This is only relevant when `unique_key` is not provided.
  always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
  Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+ enqueue_strategy: The strategy that will be used for enqueuing the request.
+ max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+ option of `BasicCrawler`.
  **kwargs: Additional request properties.
  """
  if unique_key is not None and always_enqueue:
@@ -299,7 +307,21 @@ class Request(BaseModel):
  )

  if always_enqueue:
- unique_key = f'{unique_key}_{crypto_random_object_id()}'
+ unique_key = f'{crypto_random_object_id()}|{unique_key}'
+
+ user_data_dict = kwargs.pop('user_data', {}) or {}
+ crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+ if max_retries is not None:
+ crawlee_data_dict['maxRetries'] = max_retries
+
+ if enqueue_strategy is not None:
+ crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+ crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+ if crawlee_data:
+ user_data_dict['__crawlee'] = crawlee_data

  request = cls(
  url=url,
@@ -307,6 +329,7 @@ class Request(BaseModel):
  method=method,
  headers=headers,
  payload=payload,
+ user_data=user_data_dict,
  **kwargs,
  )

@@ -352,7 +375,7 @@ class Request(BaseModel):
  self.crawlee_data.crawl_depth = new_value

  @property
- def state(self) -> RequestState | None:
+ def state(self) -> RequestState:
  """Crawlee-specific request handling state."""
  return self.crawlee_data.state

@@ -365,10 +388,6 @@ class Request(BaseModel):
  """Crawlee-specific limit on the number of retries of the request."""
  return self.crawlee_data.max_retries

- @max_retries.setter
- def max_retries(self, new_max_retries: int) -> None:
- self.crawlee_data.max_retries = new_max_retries
-
  @property
  def session_rotation_count(self) -> int | None:
  """Crawlee-specific number of finished session rotations for the request."""
crawlee/_service_locator.py CHANGED
@@ -38,7 +38,7 @@ class ServiceLocator:
  def get_configuration(self) -> Configuration:
  """Get the configuration."""
  if self._configuration is None:
- logger.warning('No configuration set, implicitly creating and using default Configuration.')
+ logger.debug('No configuration set, implicitly creating and using default Configuration.')
  self._configuration = Configuration()

  return self._configuration
@@ -63,9 +63,9 @@ class ServiceLocator:
  def get_event_manager(self) -> EventManager:
  """Get the event manager."""
  if self._event_manager is None:
- logger.warning('No event manager set, implicitly creating and using default LocalEventManager.')
+ logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
  if self._configuration is None:
- logger.warning(
+ logger.debug(
  'Implicit creation of event manager will implicitly set configuration as side effect. '
  'It is advised to explicitly first set the configuration instead.'
  )
@@ -93,7 +93,7 @@ class ServiceLocator:
  def get_storage_client(self) -> StorageClient:
  """Get the storage client."""
  if self._storage_client is None:
- logger.warning('No storage client set, implicitly creating and using default FileSystemStorageClient.')
+ logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
  if self._configuration is None:
  logger.warning(
  'Implicit creation of storage client will implicitly set configuration as side effect. '
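The downgraded messages only affect the implicit fallback path; configuring the services explicitly avoids it entirely. A short sketch, assuming the existing `service_locator.set_configuration` setter:

from crawlee import service_locator
from crawlee.configuration import Configuration

# With the configuration provided up front, get_configuration() never takes the
# implicit-default branch that is now logged at DEBUG instead of WARNING.
service_locator.set_configuration(Configuration())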
crawlee/_types.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations

  import dataclasses
  from collections.abc import Callable, Iterator, Mapping
+ from copy import deepcopy
  from dataclasses import dataclass
  from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload

@@ -15,7 +16,7 @@ if TYPE_CHECKING:
  import re
  from collections.abc import Callable, Coroutine, Sequence

- from typing_extensions import NotRequired, Required, Unpack
+ from typing_extensions import NotRequired, Required, Self, Unpack

  from crawlee import Glob, Request
  from crawlee._request import RequestOptions
@@ -61,14 +62,14 @@ class HttpHeaders(RootModel, Mapping[str, str]):

  model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

- # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+ # Workaround for Pydantic and type checkers when using Annotated with default_factory
  if TYPE_CHECKING:
  root: dict[str, str] = {}
  else:
  root: Annotated[
  dict[str, str],
  PlainValidator(lambda value: _normalize_headers(value)),
- Field(default_factory=dict),
+ Field(default_factory=lambda: dict[str, str]()),
  ]

  def __getitem__(self, key: str) -> str:
@@ -90,7 +91,7 @@ class HttpHeaders(RootModel, Mapping[str, str]):
  combined_headers = {**other, **self.root}
  return HttpHeaders(combined_headers)

- def __iter__(self) -> Iterator[str]: # type: ignore[override]
+ def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override]
  yield from self.root

  def __len__(self) -> int:
@@ -260,12 +261,24 @@ class KeyValueStoreChangeRecords:
  class RequestHandlerRunResult:
  """Record of calls to storage-related context helpers."""

- def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None:
+ def __init__(
+ self,
+ *,
+ key_value_store_getter: GetKeyValueStoreFunction,
+ request: Request,
+ ) -> None:
  self._key_value_store_getter = key_value_store_getter
  self.add_requests_calls = list[AddRequestsKwargs]()
  self.push_data_calls = list[PushDataFunctionCall]()
  self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()

+ # Isolated copies for handler execution
+ self._request = deepcopy(request)
+
+ @property
+ def request(self) -> Request:
+ return self._request
+
  async def add_requests(
  self,
  requests: Sequence[str | Request],
@@ -315,6 +328,14 @@ class RequestHandlerRunResult:

  return self.key_value_store_changes[id, name, alias]

+ def apply_request_changes(self, target: Request) -> None:
+ """Apply tracked changes from handler copy to original request."""
+ if self.request.user_data != target.user_data:
+ target.user_data = self.request.user_data
+
+ if self.request.headers != target.headers:
+ target.headers = self.request.headers
+

  @docs_group('Functions')
  class AddRequestsFunction(Protocol):
@@ -643,6 +664,24 @@ class BasicCrawlingContext:
  """Return hash of the context. Each context is considered unique."""
  return id(self)

+ def create_modified_copy(
+ self,
+ push_data: PushDataFunction | None = None,
+ add_requests: AddRequestsFunction | None = None,
+ get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+ ) -> Self:
+ """Create a modified copy of the crawling context with specified changes."""
+ modifications = dict[str, Any]()
+
+ if push_data is not None:
+ modifications['push_data'] = push_data
+ if add_requests is not None:
+ modifications['add_requests'] = add_requests
+ if get_key_value_store is not None:
+ modifications['get_key_value_store'] = get_key_value_store
+
+ return dataclasses.replace(self, **modifications)
+

  class GetDataKwargs(TypedDict):
  """Keyword arguments for dataset's `get_data` method."""
crawlee/_utils/context.py CHANGED
@@ -1,9 +1,9 @@
  from __future__ import annotations

- import asyncio
+ import inspect
  from collections.abc import Callable
  from functools import wraps
- from typing import Any, TypeVar
+ from typing import Any, TypeVar, cast

  T = TypeVar('T', bound=Callable[..., Any])

@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:

  return await method(self, *args, **kwargs)

- return async_wrapper if asyncio.iscoroutinefunction(method) else sync_wrapper # type: ignore[return-value]
+ return cast('T', async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper)
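The decorator now dispatches with `inspect.iscoroutinefunction`, since the `asyncio` variant is deprecated in recent Python releases, and silences the type checker with `cast` instead of a `type: ignore`. A minimal sketch of the same sync/async dispatch pattern outside crawlee (the `log_calls` decorator and `add` function are illustrative):

import inspect
from collections.abc import Callable
from functools import wraps
from typing import Any, TypeVar, cast

T = TypeVar('T', bound=Callable[..., Any])


def log_calls(func: T) -> T:
    @wraps(func)
    def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
        print(f'calling {func.__name__}')
        return func(*args, **kwargs)

    @wraps(func)
    async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
        print(f'calling {func.__name__}')
        return await func(*args, **kwargs)

    # Pick the wrapper that matches the wrapped callable; cast keeps the decorator's
    # declared return type without a type: ignore comment.
    return cast('T', async_wrapper if inspect.iscoroutinefunction(func) else sync_wrapper)


@log_calls
def add(a: int, b: int) -> int:
    return a + b


print(add(1, 2))  # prints 'calling add' then 3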
crawlee/_utils/file.py CHANGED
@@ -163,7 +163,14 @@ async def export_csv_to_stream(
  dst: TextIO,
  **kwargs: Unpack[ExportDataCsvKwargs],
  ) -> None:
- writer = csv.writer(dst, **kwargs) # type: ignore[arg-type]
+ # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+ # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+ # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+ # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+ if 'lineterminator' not in kwargs:
+ kwargs['lineterminator'] = '\n'
+
+ writer = csv.writer(dst, **kwargs)
  write_header = True

  # Iterate over the dataset and write to CSV.
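A standalone illustration of the line-ending behaviour the new comment describes; the file name and rows are illustrative:

import csv

# In text mode the platform translates '\n' on write, so forcing lineterminator='\n'
# avoids csv's default '\r\n' being expanded to '\r\r\n' on Windows.
with open('rows.csv', 'w', encoding='utf-8') as dst:
    writer = csv.writer(dst, lineterminator='\n')
    writer.writerow(['url', 'status'])
    writer.writerow(['https://example.com', 200])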
crawlee/_utils/globs.py CHANGED
@@ -33,12 +33,12 @@ def _translate(

  HACK: This function is copied from CPython stdlib source. It will be released in Python 3.13 as `glob.translate`
  """
- if not seps:
- seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep
+ _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps

- escaped_seps = ''.join(map(re.escape, seps))
- any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
+ escaped_seps = ''.join(map(re.escape, _seps))
+ any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps
  not_sep = f'[^{escaped_seps}]'
+
  if include_hidden:
  one_last_segment = f'{not_sep}+'
  one_segment = f'{one_last_segment}{any_sep}'
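The vendored `_translate` helper mirrors `glob.translate`, which, as the docstring notes, ships with Python 3.13. A small sketch of the stdlib counterpart on 3.13+ (the glob pattern and path are illustrative, and the keyword arguments are assumed from the stdlib signature):

import glob
import re

# Translate a recursive glob into a regular expression and match a path against it.
pattern = glob.translate('docs/**/*.md', recursive=True, include_hidden=False)
print(bool(re.match(pattern, 'docs/guide/intro.md')))  # True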
crawlee/_utils/recoverable_state.py CHANGED
@@ -4,12 +4,14 @@ from typing import TYPE_CHECKING, Generic, Literal, TypeVar

  from pydantic import BaseModel

+ from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
  from crawlee.events._types import Event, EventPersistStateData

  if TYPE_CHECKING:
  import logging
+ from collections.abc import Callable, Coroutine

- from crawlee.storages._key_value_store import KeyValueStore
+ from crawlee.storages import KeyValueStore

  TStateModel = TypeVar('TStateModel', bound=BaseModel)

@@ -37,6 +39,7 @@ class RecoverableState(Generic[TStateModel]):
  persistence_enabled: Literal[True, False, 'explicit_only'] = False,
  persist_state_kvs_name: str | None = None,
  persist_state_kvs_id: str | None = None,
+ persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
  logger: logging.Logger,
  ) -> None:
  """Initialize a new recoverable state object.
@@ -51,16 +54,40 @@ class RecoverableState(Generic[TStateModel]):
  If neither a name nor and id are supplied, the default store will be used.
  persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.
  If neither a name nor and id are supplied, the default store will be used.
+ persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If
+ not provided, a system-wide KeyValueStore will be used, based on service locator configuration.
  logger: A logger instance for logging operations related to state persistence
  """
+ raise_if_too_many_kwargs(
+ persist_state_kvs_name=persist_state_kvs_name,
+ persist_state_kvs_id=persist_state_kvs_id,
+ persist_state_kvs_factory=persist_state_kvs_factory,
+ )
+ if not persist_state_kvs_factory:
+ logger.debug(
+ 'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '
+ 'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '
+ 'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid '
+ 'global side effects.'
+ )
+
  self._default_state = default_state
  self._state_type: type[TStateModel] = self._default_state.__class__
  self._state: TStateModel | None = None
  self._persistence_enabled = persistence_enabled
  self._persist_state_key = persist_state_key
- self._persist_state_kvs_name = persist_state_kvs_name
- self._persist_state_kvs_id = persist_state_kvs_id
- self._key_value_store: 'KeyValueStore | None' = None # noqa: UP037
+ if persist_state_kvs_factory is None:
+
+ async def kvs_factory() -> KeyValueStore:
+ from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import
+
+ return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)
+
+ self._persist_state_kvs_factory = kvs_factory
+ else:
+ self._persist_state_kvs_factory = persist_state_kvs_factory
+
+ self._key_value_store: KeyValueStore | None = None
  self._log = logger

  async def initialize(self) -> TStateModel:
@@ -77,11 +104,8 @@ class RecoverableState(Generic[TStateModel]):
  return self.current_value

  # Import here to avoid circular imports.
- from crawlee.storages._key_value_store import KeyValueStore # noqa: PLC0415

- self._key_value_store = await KeyValueStore.open(
- name=self._persist_state_kvs_name, id=self._persist_state_kvs_id
- )
+ self._key_value_store = await self._persist_state_kvs_factory()

  await self._load_saved_state()

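The new `persist_state_kvs_factory` parameter lets callers hand `RecoverableState` an explicit key-value store instead of relying on the service locator; the new `raise_if_too_many_kwargs` guard suggests only one of the name, id, or factory arguments should be supplied. A sketch based on the constructor shown above, where `CrawlState`, `open_state_store`, and the store name are illustrative and `RecoverableState` remains an internal utility:

import logging

from pydantic import BaseModel

from crawlee._utils.recoverable_state import RecoverableState
from crawlee.storages import KeyValueStore


class CrawlState(BaseModel):
    processed_count: int = 0


async def open_state_store() -> KeyValueStore:
    # Awaited lazily during RecoverableState.initialize(); bypasses the implicit
    # service-locator-backed default store.
    return await KeyValueStore.open(name='crawler-state')


state = RecoverableState(
    default_state=CrawlState(),
    persist_state_key='crawl-state',
    persistence_enabled=True,
    persist_state_kvs_factory=open_state_store,
    logger=logging.getLogger(__name__),
)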
crawlee/_utils/recurring_task.py CHANGED
@@ -1,12 +1,16 @@
  from __future__ import annotations

  import asyncio
+ import inspect
  from logging import getLogger
  from typing import TYPE_CHECKING

  if TYPE_CHECKING:
  from collections.abc import Callable
  from datetime import timedelta
+ from types import TracebackType
+
+ from typing_extensions import Self

  logger = getLogger(__name__)

@@ -21,11 +25,27 @@ class RecurringTask:
  """

  def __init__(self, func: Callable, delay: timedelta) -> None:
- logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...')
+ logger.debug(
+ 'Calling RecurringTask.__init__(func={%s}, delay={%s})...',
+ func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
+ delay,
+ )
  self.func = func
  self.delay = delay
  self.task: asyncio.Task | None = None

+ async def __aenter__(self) -> Self:
+ self.start()
+ return self
+
+ async def __aexit__(
+ self,
+ exc_type: type[BaseException] | None,
+ exc_value: BaseException | None,
+ exc_traceback: TracebackType | None,
+ ) -> None:
+ await self.stop()
+
  async def _wrapper(self) -> None:
  """Continuously execute the provided function with the specified delay.

@@ -34,12 +54,16 @@ class RecurringTask:
  """
  sleep_time_secs = self.delay.total_seconds()
  while True:
- await self.func() if asyncio.iscoroutinefunction(self.func) else self.func()
+ await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
  await asyncio.sleep(sleep_time_secs)

  def start(self) -> None:
  """Start the recurring task execution."""
- self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}')
+ name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
+ self.task = asyncio.create_task(
+ self._wrapper(),
+ name=f'Task-recurring-{name}',
+ )

  async def stop(self) -> None:
  """Stop the recurring task execution."""
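`RecurringTask` now works as an async context manager: entering it calls `start()` and leaving it awaits `stop()`. A short sketch of the new usage (the heartbeat callable and interval are illustrative):

import asyncio
from datetime import timedelta

from crawlee._utils.recurring_task import RecurringTask


def heartbeat() -> None:
    print('still alive')


async def main() -> None:
    # __aenter__ starts the background task; __aexit__ awaits stop(), even on errors.
    async with RecurringTask(heartbeat, delay=timedelta(seconds=1)):
        await asyncio.sleep(3)


asyncio.run(main())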
crawlee/_utils/robots.py CHANGED
@@ -1,5 +1,6 @@
  from __future__ import annotations

+ from logging import getLogger
  from typing import TYPE_CHECKING

  from protego import Protego
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
  from crawlee.proxy_configuration import ProxyInfo


+ logger = getLogger(__name__)
+
+
  class RobotsTxtFile:
  def __init__(
  self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ class RobotsTxtFile:
  http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
  proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
  """
- response = await http_client.send_request(url, proxy_info=proxy_info)
- body = (
- b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
- )
+ try:
+ response = await http_client.send_request(url, proxy_info=proxy_info)
+
+ body = (
+ b'User-agent: *\nAllow: /'
+ if is_status_code_client_error(response.status_code)
+ else await response.read()
+ )
+ robots = Protego.parse(body.decode('utf-8'))
+
+ except Exception as e:
+ logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')

- robots = Protego.parse(body.decode('utf-8'))
+ robots = Protego.parse('User-agent: *\nAllow: /')

  return cls(url, robots, http_client=http_client, proxy_info=proxy_info)

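Fetching robots.txt is now wrapped in a `try`/`except`, so any failure degrades to the same allow-all policy already used for 4xx responses. A standalone illustration of that fallback using `Protego` directly (the URL is illustrative):

from protego import Protego

# The allow-all document the diff falls back to when robots.txt cannot be fetched.
robots = Protego.parse('User-agent: *\nAllow: /')

print(robots.can_fetch('https://example.com/any/path', '*'))  # True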
crawlee/_utils/sitemap.py CHANGED
@@ -335,7 +335,7 @@ async def _fetch_and_process_sitemap(
  # Check if the first chunk is a valid gzip header
  if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
  decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
- first_chunk = False
+ first_chunk = False

  chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
  text_chunk = decoder.decode(chunk)
@@ -430,10 +430,17 @@ async def parse_sitemap(
  up to the specified maximum depth.
  """
  # Set default options
- options = options or {}
- emit_nested_sitemaps = options.get('emit_nested_sitemaps', False)
- max_depth = options.get('max_depth', float('inf'))
- sitemap_retries = options.get('sitemap_retries', 3)
+ default_timeout = timedelta(seconds=30)
+ if options:
+ emit_nested_sitemaps = options['emit_nested_sitemaps']
+ max_depth = options['max_depth']
+ sitemap_retries = options['sitemap_retries']
+ timeout = options.get('timeout', default_timeout)
+ else:
+ emit_nested_sitemaps = False
+ max_depth = float('inf')
+ sitemap_retries = 3
+ timeout = default_timeout

  # Setup working state
  sources = list(initial_sources)
@@ -472,7 +479,7 @@ async def parse_sitemap(
  sitemap_retries,
  emit_nested_sitemaps=emit_nested_sitemaps,
  proxy_info=proxy_info,
- timeout=options.get('timeout', timedelta(seconds=30)),
+ timeout=timeout,
  ):
  yield result
  else:
crawlee/_utils/system.py CHANGED
@@ -5,7 +5,7 @@ import sys
  from contextlib import suppress
  from datetime import datetime, timezone
  from logging import getLogger
- from typing import Annotated
+ from typing import TYPE_CHECKING, Annotated

  import psutil
  from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
@@ -41,11 +41,19 @@ class CpuInfo(BaseModel):
  used_ratio: Annotated[float, Field(alias='usedRatio')]
  """The ratio of CPU currently in use, represented as a float between 0 and 1."""

- created_at: datetime = Field(
- alias='createdAt',
- default_factory=lambda: datetime.now(timezone.utc),
- )
- """The time at which the measurement was taken."""
+ # Workaround for Pydantic and type checkers when using Annotated with default_factory
+ if TYPE_CHECKING:
+ created_at: datetime = datetime.now(timezone.utc)
+ """The time at which the measurement was taken."""
+ else:
+ created_at: Annotated[
+ datetime,
+ Field(
+ alias='createdAt',
+ default_factory=lambda: datetime.now(timezone.utc),
+ ),
+ ]
+ """The time at which the measurement was taken."""


  class MemoryUsageInfo(BaseModel):
@@ -61,11 +69,19 @@ class MemoryUsageInfo(BaseModel):
  ]
  """Memory usage of the current Python process and its children."""

- created_at: datetime = Field(
- alias='createdAt',
- default_factory=lambda: datetime.now(timezone.utc),
- )
- """The time at which the measurement was taken."""
+ # Workaround for Pydantic and type checkers when using Annotated with default_factory
+ if TYPE_CHECKING:
+ created_at: datetime = datetime.now(timezone.utc)
+ """The time at which the measurement was taken."""
+ else:
+ created_at: Annotated[
+ datetime,
+ Field(
+ alias='createdAt',
+ default_factory=lambda: datetime.now(timezone.utc),
+ ),
+ ]
+ """The time at which the measurement was taken."""


  class MemoryInfo(MemoryUsageInfo):
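The `if TYPE_CHECKING:` split applied to `created_at` here (and to `headers` and `root` earlier in this diff) is a general workaround for type checkers that trip over `Annotated` fields whose default comes from `default_factory`. A minimal standalone sketch of the pattern; the `Measurement` model is illustrative, not part of crawlee:

from __future__ import annotations

from datetime import datetime, timezone
from typing import TYPE_CHECKING, Annotated

from pydantic import BaseModel, Field


class Measurement(BaseModel):
    # Static checkers see a plain attribute with a default; at runtime pydantic sees
    # the Annotated metadata with the alias and the default_factory.
    if TYPE_CHECKING:
        created_at: datetime = datetime.now(timezone.utc)
    else:
        created_at: Annotated[
            datetime,
            Field(alias='createdAt', default_factory=lambda: datetime.now(timezone.utc)),
        ]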