crawlee 0.6.13b43__py3-none-any.whl → 1.1.2b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (80)
  1. crawlee/_request.py +32 -21
  2. crawlee/_service_locator.py +4 -4
  3. crawlee/_types.py +87 -25
  4. crawlee/_utils/file.py +7 -0
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +15 -0
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/time.py +41 -1
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +21 -15
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +3 -1
  17. crawlee/crawlers/__init__.py +2 -1
  18. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  19. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
  21. crawlee/crawlers/_basic/_basic_crawler.py +139 -96
  22. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  23. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  24. crawlee/crawlers/_playwright/_playwright_crawler.py +52 -10
  25. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  26. crawlee/events/_event_manager.py +3 -1
  27. crawlee/fingerprint_suite/_header_generator.py +2 -2
  28. crawlee/http_clients/_base.py +4 -0
  29. crawlee/http_clients/_curl_impersonate.py +12 -0
  30. crawlee/http_clients/_httpx.py +16 -6
  31. crawlee/http_clients/_impit.py +25 -10
  32. crawlee/otel/crawler_instrumentor.py +3 -3
  33. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  34. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  35. crawlee/request_loaders/_sitemap_request_loader.py +22 -4
  36. crawlee/sessions/_session_pool.py +1 -1
  37. crawlee/statistics/_error_snapshotter.py +1 -1
  38. crawlee/statistics/_models.py +32 -1
  39. crawlee/statistics/_statistics.py +24 -33
  40. crawlee/storage_clients/__init__.py +16 -0
  41. crawlee/storage_clients/_base/_storage_client.py +5 -4
  42. crawlee/storage_clients/_file_system/_dataset_client.py +6 -7
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -8
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +31 -15
  45. crawlee/storage_clients/_file_system/_storage_client.py +2 -2
  46. crawlee/storage_clients/_memory/_dataset_client.py +4 -5
  47. crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
  48. crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
  49. crawlee/storage_clients/_redis/__init__.py +6 -0
  50. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  51. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  52. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  53. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  54. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  55. crawlee/storage_clients/_redis/_utils.py +23 -0
  56. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  57. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  58. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  59. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  60. crawlee/storage_clients/_redis/py.typed +0 -0
  61. crawlee/storage_clients/_sql/__init__.py +6 -0
  62. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  63. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  64. crawlee/storage_clients/_sql/_db_models.py +268 -0
  65. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  66. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  67. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  68. crawlee/storage_clients/_sql/py.typed +0 -0
  69. crawlee/storage_clients/models.py +10 -10
  70. crawlee/storages/_base.py +3 -1
  71. crawlee/storages/_dataset.py +5 -3
  72. crawlee/storages/_key_value_store.py +11 -6
  73. crawlee/storages/_request_queue.py +5 -3
  74. crawlee/storages/_storage_instance_manager.py +54 -68
  75. crawlee/storages/_utils.py +11 -0
  76. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/METADATA +17 -5
  77. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/RECORD +80 -58
  78. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/WHEEL +1 -1
  79. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/entry_points.txt +0 -0
  80. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/licenses/LICENSE +0 -0
crawlee/_request.py CHANGED
@@ -185,9 +185,6 @@ class Request(BaseModel):
  method: HttpMethod = 'GET'
  """HTTP request method."""

- headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
- """HTTP request headers."""
-
  payload: Annotated[
  HttpPayload | None,
  BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
@@ -195,23 +192,37 @@ class Request(BaseModel):
  ] = None
  """HTTP request payload."""

- user_data: Annotated[
- dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience
- Field(alias='userData', default_factory=lambda: UserData()),
- PlainValidator(user_data_adapter.validate_python),
- PlainSerializer(
- lambda instance: user_data_adapter.dump_python(
- instance,
- by_alias=True,
- exclude_none=True,
- exclude_unset=True,
- exclude_defaults=True,
- )
- ),
- ] = {}
- """Custom user data assigned to the request. Use this to save any request related data to the
- request's scope, keeping them accessible on retries, failures etc.
- """
+ # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+ if TYPE_CHECKING:
+ headers: HttpHeaders = HttpHeaders()
+ """HTTP request headers."""
+
+ user_data: dict[str, JsonSerializable] = {}
+ """Custom user data assigned to the request. Use this to save any request related data to the
+ request's scope, keeping them accessible on retries, failures etc.
+ """
+
+ else:
+ headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
+ """HTTP request headers."""
+
+ user_data: Annotated[
+ dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience
+ Field(alias='userData', default_factory=lambda: UserData()),
+ PlainValidator(user_data_adapter.validate_python),
+ PlainSerializer(
+ lambda instance: user_data_adapter.dump_python(
+ instance,
+ by_alias=True,
+ exclude_none=True,
+ exclude_unset=True,
+ exclude_defaults=True,
+ )
+ ),
+ ]
+ """Custom user data assigned to the request. Use this to save any request related data to the
+ request's scope, keeping them accessible on retries, failures etc.
+ """

  retry_count: Annotated[int, Field(alias='retryCount')] = 0
  """Number of times the request has been retried."""
@@ -288,7 +299,7 @@ class Request(BaseModel):
  )

  if always_enqueue:
- unique_key = f'{unique_key}_{crypto_random_object_id()}'
+ unique_key = f'{crypto_random_object_id()}|{unique_key}'

  request = cls(
  url=url,
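
The `always_enqueue` change above moves the random component from a suffix to a prefix of the unique key. A minimal sketch of the observable behaviour, assuming the `Request.from_url(..., always_enqueue=True)` constructor from earlier releases is otherwise unchanged:

    from crawlee import Request

    # With always_enqueue=True the unique key is prefixed with a random id,
    # so two requests for the same URL are never deduplicated against each other.
    first = Request.from_url('https://example.com', always_enqueue=True)
    second = Request.from_url('https://example.com', always_enqueue=True)
    assert first.unique_key != second.unique_key
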

crawlee/_service_locator.py CHANGED
@@ -38,7 +38,7 @@ class ServiceLocator:
  def get_configuration(self) -> Configuration:
  """Get the configuration."""
  if self._configuration is None:
- logger.warning('No configuration set, implicitly creating and using default Configuration.')
+ logger.debug('No configuration set, implicitly creating and using default Configuration.')
  self._configuration = Configuration()

  return self._configuration
@@ -63,9 +63,9 @@ class ServiceLocator:
  def get_event_manager(self) -> EventManager:
  """Get the event manager."""
  if self._event_manager is None:
- logger.warning('No event manager set, implicitly creating and using default LocalEventManager.')
+ logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
  if self._configuration is None:
- logger.warning(
+ logger.debug(
  'Implicit creation of event manager will implicitly set configuration as side effect. '
  'It is advised to explicitly first set the configuration instead.'
  )
@@ -93,7 +93,7 @@ class ServiceLocator:
  def get_storage_client(self) -> StorageClient:
  """Get the storage client."""
  if self._storage_client is None:
- logger.warning('No storage client set, implicitly creating and using default FileSystemStorageClient.')
+ logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
  if self._configuration is None:
  logger.warning(
  'Implicit creation of storage client will implicitly set configuration as side effect. '
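
These messages are now logged at DEBUG instead of WARNING, so implicit fallback to the defaults is quieter. A short sketch of configuring the services explicitly instead, assuming the module-level `service_locator` singleton and its setters keep their existing names:

    from crawlee import service_locator
    from crawlee.configuration import Configuration
    from crawlee.storage_clients import MemoryStorageClient

    # Registering services up front avoids the implicit defaults (and the
    # side-effect configuration) mentioned in the log messages above.
    service_locator.set_configuration(Configuration())
    service_locator.set_storage_client(MemoryStorageClient())
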
crawlee/_types.py CHANGED
@@ -3,17 +3,7 @@ from __future__ import annotations
  import dataclasses
  from collections.abc import Callable, Iterator, Mapping
  from dataclasses import dataclass
- from typing import (
- TYPE_CHECKING,
- Annotated,
- Any,
- Literal,
- Protocol,
- TypedDict,
- TypeVar,
- cast,
- overload,
- )
+ from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload

  from pydantic import ConfigDict, Field, PlainValidator, RootModel

@@ -25,7 +15,7 @@ if TYPE_CHECKING:
  import re
  from collections.abc import Callable, Coroutine, Sequence

- from typing_extensions import NotRequired, Required, Unpack
+ from typing_extensions import NotRequired, Required, Self, Unpack

  from crawlee import Glob, Request
  from crawlee._request import RequestOptions
@@ -71,11 +61,15 @@ class HttpHeaders(RootModel, Mapping[str, str]):

  model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

- root: Annotated[
- dict[str, str],
- PlainValidator(lambda value: _normalize_headers(value)),
- Field(default_factory=dict),
- ] = {}
+ # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+ if TYPE_CHECKING:
+ root: dict[str, str] = {}
+ else:
+ root: Annotated[
+ dict[str, str],
+ PlainValidator(lambda value: _normalize_headers(value)),
+ Field(default_factory=dict),
+ ]

  def __getitem__(self, key: str) -> str:
  return self.root[key.lower()]
@@ -110,9 +104,9 @@ class ConcurrencySettings:
  def __init__(
  self,
  min_concurrency: int = 1,
- max_concurrency: int = 200,
+ max_concurrency: int = 100,
  max_tasks_per_minute: float = float('inf'),
- desired_concurrency: int | None = None,
+ desired_concurrency: int = 10,
  ) -> None:
  """Initialize a new instance.

@@ -125,21 +119,24 @@ class ConcurrencySettings:
  desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
  if there is a large enough supply of them. By default, it is `min_concurrency`.
  """
- if desired_concurrency is not None and desired_concurrency < 1:
- raise ValueError('desired_concurrency must be 1 or larger')
-
  if min_concurrency < 1:
  raise ValueError('min_concurrency must be 1 or larger')

  if max_concurrency < min_concurrency:
  raise ValueError('max_concurrency cannot be less than min_concurrency')

+ if desired_concurrency < min_concurrency:
+ raise ValueError('desired_concurrency cannot be less than min_concurrency')
+
+ if desired_concurrency > max_concurrency:
+ raise ValueError('desired_concurrency cannot be greater than max_concurrency')
+
  if max_tasks_per_minute <= 0:
  raise ValueError('max_tasks_per_minute must be positive')

  self.min_concurrency = min_concurrency
  self.max_concurrency = max_concurrency
- self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
+ self.desired_concurrency = desired_concurrency
  self.max_tasks_per_minute = max_tasks_per_minute
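
`desired_concurrency` now defaults to 10 and is validated against the other bounds, and the default `max_concurrency` drops from 200 to 100. A small sketch of the new validation, assuming `ConcurrencySettings` stays importable from the package root:

    from crawlee import ConcurrencySettings

    # Valid: min_concurrency <= desired_concurrency <= max_concurrency.
    settings = ConcurrencySettings(min_concurrency=2, desired_concurrency=8, max_concurrency=32)

    # Now rejected: desired_concurrency below min_concurrency raises ValueError.
    try:
        ConcurrencySettings(min_concurrency=4, desired_concurrency=2)
    except ValueError as error:
        print(error)
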

@@ -180,6 +177,17 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
  requests: Sequence[str | Request]
  """Requests to be added to the `RequestManager`."""

+ rq_id: str | None
+ """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""
+
+ rq_name: str | None
+ """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+ """
+
+ rq_alias: str | None
+ """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+ """
+

  class PushDataKwargs(TypedDict):
  """Keyword arguments for dataset's `push_data` method."""
@@ -261,10 +269,18 @@ class RequestHandlerRunResult:
  async def add_requests(
  self,
  requests: Sequence[str | Request],
+ rq_id: str | None = None,
+ rq_name: str | None = None,
+ rq_alias: str | None = None,
  **kwargs: Unpack[EnqueueLinksKwargs],
  ) -> None:
  """Track a call to the `add_requests` context helper."""
- self.add_requests_calls.append(AddRequestsKwargs(requests=requests, **kwargs))
+ specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+ if specified_params > 1:
+ raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')
+ self.add_requests_calls.append(
+ AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
+ )

  async def push_data(
  self,
@@ -311,12 +327,21 @@ class AddRequestsFunction(Protocol):
  def __call__(
  self,
  requests: Sequence[str | Request],
+ rq_id: str | None = None,
+ rq_name: str | None = None,
+ rq_alias: str | None = None,
  **kwargs: Unpack[EnqueueLinksKwargs],
  ) -> Coroutine[None, None, None]:
  """Call dunder method.

  Args:
  requests: Requests to be added to the `RequestManager`.
+ rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+ provided.
+ rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+ can be provided.
+ rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+ can be provided.
  **kwargs: Additional keyword arguments.
  """

@@ -344,12 +369,21 @@ class EnqueueLinksFunction(Protocol):
  label: str | None = None,
  user_data: dict[str, Any] | None = None,
  transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
+ rq_id: str | None = None,
+ rq_name: str | None = None,
+ rq_alias: str | None = None,
  **kwargs: Unpack[EnqueueLinksKwargs],
  ) -> Coroutine[None, None, None]: ...

  @overload
  def __call__(
- self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs]
+ self,
+ *,
+ requests: Sequence[str | Request] | None = None,
+ rq_id: str | None = None,
+ rq_name: str | None = None,
+ rq_alias: str | None = None,
+ **kwargs: Unpack[EnqueueLinksKwargs],
  ) -> Coroutine[None, None, None]: ...

  def __call__(
@@ -360,6 +394,9 @@ class EnqueueLinksFunction(Protocol):
  user_data: dict[str, Any] | None = None,
  transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
  requests: Sequence[str | Request] | None = None,
+ rq_id: str | None = None,
+ rq_name: str | None = None,
+ rq_alias: str | None = None,
  **kwargs: Unpack[EnqueueLinksKwargs],
  ) -> Coroutine[None, None, None]:
  """Call enqueue links function.
@@ -377,6 +414,12 @@ class EnqueueLinksFunction(Protocol):
  - `'skip'` to exclude the request from being enqueued,
  - `'unchanged'` to use the original request options without modification.
  requests: Requests to be added to the `RequestManager`.
+ rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+ provided.
+ rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+ can be provided.
+ rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+ can be provided.
  **kwargs: Additional keyword arguments.
  """

@@ -600,6 +643,25 @@ class BasicCrawlingContext:
  """Return hash of the context. Each context is considered unique."""
  return id(self)

+ def create_modified_copy(
+ self,
+ push_data: PushDataFunction | None = None,
+ add_requests: AddRequestsFunction | None = None,
+ get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+ ) -> Self:
+ """Create a modified copy of the crawling context with specified changes."""
+ original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+ modified_fields = {
+ key: value
+ for key, value in {
+ 'push_data': push_data,
+ 'add_requests': add_requests,
+ 'get_key_value_store': get_key_value_store,
+ }.items()
+ if value
+ }
+ return self.__class__(**{**original_fields, **modified_fields})
+

  class GetDataKwargs(TypedDict):
  """Keyword arguments for dataset's `get_data` method."""
crawlee/_utils/file.py CHANGED
@@ -163,6 +163,13 @@ async def export_csv_to_stream(
  dst: TextIO,
  **kwargs: Unpack[ExportDataCsvKwargs],
  ) -> None:
+ # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+ # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+ # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+ # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+ if 'lineterminator' not in kwargs:
+ kwargs['lineterminator'] = '\n'
+
  writer = csv.writer(dst, **kwargs) # type: ignore[arg-type]
  write_header = True
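
The fix only applies when the caller did not set `lineterminator` themselves. The underlying behaviour is standard-library `csv`; for illustration:

    import csv
    import io

    # csv.writer defaults to lineterminator='\r\n'. Written through a text-mode
    # file on Windows, '\n' is translated to '\r\n', so rows would end in '\r\r\n'.
    # Forcing '\n' leaves the newline translation entirely to the stream.
    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator='\n')
    writer.writerow(['id', 'url'])
    writer.writerow([1, 'https://example.com'])
    print(repr(buffer.getvalue()))  # 'id,url\n1,https://example.com\n'
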

crawlee/_utils/raise_if_too_many_kwargs.py ADDED
@@ -0,0 +1,12 @@
+ from typing import Any
+
+
+ def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
+ """Raise ValueError if there are more non-None kwargs then max_kwargs."""
+ none_kwargs_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]
+ if len(none_kwargs_names) > max_kwargs:
+ all_kwargs_names = [f'"{kwarg_name}"' for kwarg_name in kwargs]
+ raise ValueError(
+ f'Only one of {", ".join(all_kwargs_names)} can be specified, but following arguments were '
+ f'specified: {", ".join(none_kwargs_names)}.'
+ )
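
Usage of this new internal helper is straightforward: it raises once more than `max_kwargs` of the passed keyword arguments are not `None`:

    from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs

    raise_if_too_many_kwargs(rq_id=None, rq_name='my-queue', rq_alias=None)  # fine, one non-None value

    try:
        raise_if_too_many_kwargs(rq_id='abc123', rq_name='my-queue', rq_alias=None)
    except ValueError as error:
        print(error)  # Only one of "rq_id", "rq_name", "rq_alias" can be specified, ...
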

crawlee/_utils/recoverable_state.py CHANGED
@@ -4,12 +4,14 @@ from typing import TYPE_CHECKING, Generic, Literal, TypeVar

  from pydantic import BaseModel

+ from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
  from crawlee.events._types import Event, EventPersistStateData

  if TYPE_CHECKING:
  import logging
+ from collections.abc import Callable, Coroutine

- from crawlee.storages._key_value_store import KeyValueStore
+ from crawlee.storages import KeyValueStore

  TStateModel = TypeVar('TStateModel', bound=BaseModel)

@@ -37,6 +39,7 @@ class RecoverableState(Generic[TStateModel]):
  persistence_enabled: Literal[True, False, 'explicit_only'] = False,
  persist_state_kvs_name: str | None = None,
  persist_state_kvs_id: str | None = None,
+ persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
  logger: logging.Logger,
  ) -> None:
  """Initialize a new recoverable state object.
@@ -51,16 +54,40 @@ class RecoverableState(Generic[TStateModel]):
  If neither a name nor and id are supplied, the default store will be used.
  persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.
  If neither a name nor and id are supplied, the default store will be used.
+ persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If
+ not provided, a system-wide KeyValueStore will be used, based on service locator configuration.
  logger: A logger instance for logging operations related to state persistence
  """
+ raise_if_too_many_kwargs(
+ persist_state_kvs_name=persist_state_kvs_name,
+ persist_state_kvs_id=persist_state_kvs_id,
+ persist_state_kvs_factory=persist_state_kvs_factory,
+ )
+ if not persist_state_kvs_factory:
+ logger.debug(
+ 'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '
+ 'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '
+ 'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid '
+ 'global side effects.'
+ )
+
  self._default_state = default_state
  self._state_type: type[TStateModel] = self._default_state.__class__
  self._state: TStateModel | None = None
  self._persistence_enabled = persistence_enabled
  self._persist_state_key = persist_state_key
- self._persist_state_kvs_name = persist_state_kvs_name
- self._persist_state_kvs_id = persist_state_kvs_id
- self._key_value_store: 'KeyValueStore | None' = None # noqa: UP037
+ if persist_state_kvs_factory is None:
+
+ async def kvs_factory() -> KeyValueStore:
+ from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import
+
+ return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)
+
+ self._persist_state_kvs_factory = kvs_factory
+ else:
+ self._persist_state_kvs_factory = persist_state_kvs_factory
+
+ self._key_value_store: KeyValueStore | None = None
  self._log = logger

  async def initialize(self) -> TStateModel:
@@ -77,11 +104,8 @@ class RecoverableState(Generic[TStateModel]):
  return self.current_value

  # Import here to avoid circular imports.
- from crawlee.storages._key_value_store import KeyValueStore # noqa: PLC0415

- self._key_value_store = await KeyValueStore.open(
- name=self._persist_state_kvs_name, id=self._persist_state_kvs_id
- )
+ self._key_value_store = await self._persist_state_kvs_factory()

  await self._load_saved_state()
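
A sketch of supplying the new `persist_state_kvs_factory`, assuming the remaining constructor parameters keep the signature visible in the hunk above and `KeyValueStore.open` stays an async classmethod:

    import logging

    from pydantic import BaseModel

    from crawlee._utils.recoverable_state import RecoverableState
    from crawlee.storages import KeyValueStore

    class CrawlProgress(BaseModel):
        pages_done: int = 0

    async def open_state() -> RecoverableState[CrawlProgress]:
        # An explicit factory avoids falling back to the system-wide KeyValueStore;
        # only one of kvs name, id or factory may be supplied.
        state = RecoverableState(
            default_state=CrawlProgress(),
            persist_state_key='crawl-progress',
            persistence_enabled=True,
            persist_state_kvs_factory=lambda: KeyValueStore.open(name='state-store'),
            logger=logging.getLogger(__name__),
        )
        await state.initialize()
        return state
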

crawlee/_utils/recurring_task.py CHANGED
@@ -7,6 +7,9 @@ from typing import TYPE_CHECKING
  if TYPE_CHECKING:
  from collections.abc import Callable
  from datetime import timedelta
+ from types import TracebackType
+
+ from typing_extensions import Self

  logger = getLogger(__name__)

@@ -26,6 +29,18 @@ class RecurringTask:
  self.delay = delay
  self.task: asyncio.Task | None = None

+ async def __aenter__(self) -> Self:
+ self.start()
+ return self
+
+ async def __aexit__(
+ self,
+ exc_type: type[BaseException] | None,
+ exc_value: BaseException | None,
+ exc_traceback: TracebackType | None,
+ ) -> None:
+ await self.stop()
+
  async def _wrapper(self) -> None:
  """Continuously execute the provided function with the specified delay.

crawlee/_utils/robots.py CHANGED
@@ -1,5 +1,6 @@
  from __future__ import annotations

+ from logging import getLogger
  from typing import TYPE_CHECKING

  from protego import Protego
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
  from crawlee.proxy_configuration import ProxyInfo


+ logger = getLogger(__name__)
+
+
  class RobotsTxtFile:
  def __init__(
  self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ class RobotsTxtFile:
  http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
  proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
  """
- response = await http_client.send_request(url, proxy_info=proxy_info)
- body = (
- b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
- )
+ try:
+ response = await http_client.send_request(url, proxy_info=proxy_info)
+
+ body = (
+ b'User-agent: *\nAllow: /'
+ if is_status_code_client_error(response.status_code)
+ else await response.read()
+ )
+ robots = Protego.parse(body.decode('utf-8'))
+
+ except Exception as e:
+ logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')

- robots = Protego.parse(body.decode('utf-8'))
+ robots = Protego.parse('User-agent: *\nAllow: /')

  return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
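
On any fetch error the loader now degrades to permissive rules instead of propagating the exception. The fallback document is the same allow-all policy used for 4xx responses; with Protego directly, for illustration:

    from protego import Protego

    # The fallback rules substituted when robots.txt cannot be retrieved.
    robots = Protego.parse('User-agent: *\nAllow: /')
    print(robots.can_fetch('https://example.com/any/page', '*'))  # True
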

crawlee/_utils/sitemap.py CHANGED
@@ -335,7 +335,7 @@ async def _fetch_and_process_sitemap(
  # Check if the first chunk is a valid gzip header
  if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
  decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
- first_chunk = False
+ first_chunk = False

  chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
  text_chunk = decoder.decode(chunk)
crawlee/_utils/time.py CHANGED
@@ -3,11 +3,14 @@ from __future__ import annotations
  import time
  from contextlib import contextmanager
  from dataclasses import dataclass
+ from datetime import timedelta
  from typing import TYPE_CHECKING

+ from async_timeout import Timeout, timeout
+
  if TYPE_CHECKING:
  from collections.abc import Iterator
- from datetime import timedelta
+ from types import TracebackType

  _SECONDS_PER_MINUTE = 60
  _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
  result.cpu = after_cpu - before_cpu


+ class SharedTimeout:
+ """Keeps track of a time budget shared by multiple independent async operations.
+
+ Provides a reusable, non-reentrant context manager interface.
+ """
+
+ def __init__(self, timeout: timedelta) -> None:
+ self._remaining_timeout = timeout
+ self._active_timeout: Timeout | None = None
+ self._activation_timestamp: float | None = None
+
+ async def __aenter__(self) -> timedelta:
+ if self._active_timeout is not None or self._activation_timestamp is not None:
+ raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+ self._activation_timestamp = time.monotonic()
+ self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+ await new_timeout.__aenter__()
+ return self._remaining_timeout
+
+ async def __aexit__(
+ self,
+ exc_type: type[BaseException] | None,
+ exc_value: BaseException | None,
+ exc_traceback: TracebackType | None,
+ ) -> None:
+ if self._active_timeout is None or self._activation_timestamp is None:
+ raise RuntimeError('Logic error')
+
+ await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+ elapsed = time.monotonic() - self._activation_timestamp
+ self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+ self._active_timeout = None
+ self._activation_timestamp = None
+
+
  def format_duration(duration: timedelta | None) -> str:
  """Format a timedelta into a human-readable string with appropriate units."""
  if duration is None:
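
`SharedTimeout` lets several sequential `async with` blocks draw from one time budget: whatever the first block consumes is subtracted from what later blocks may use, and entering an already active instance raises. For illustration:

    import asyncio
    from datetime import timedelta

    from crawlee._utils.time import SharedTimeout

    async def main() -> None:
        budget = SharedTimeout(timedelta(seconds=5))

        async with budget:  # consumes about 1 s of the shared budget
            await asyncio.sleep(1)

        async with budget as remaining:  # roughly 4 s left for this block
            print(remaining)
            await asyncio.sleep(0.5)

    asyncio.run(main())
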
crawlee/_utils/urls.py CHANGED
@@ -7,6 +7,7 @@ from yarl import URL

  if TYPE_CHECKING:
  from collections.abc import Iterator
+ from logging import Logger


  def is_url_absolute(url: str) -> bool:
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
  return str(URL(base_url).join(URL(relative_url)))


- def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+ def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
  """Convert an iterator of relative URLs to absolute URLs using a base URL."""
  for url in urls:
  if is_url_absolute(url):
  yield url
  else:
- yield convert_to_absolute_url(base_url, url)
+ converted_url = convert_to_absolute_url(base_url, url)
+ # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+ if not is_url_absolute(converted_url):
+ if logger:
+ logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+ continue
+ yield converted_url


  _http_url_adapter = TypeAdapter(AnyHttpUrl)
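
With the optional logger, links that cannot be resolved into absolute URLs (the `mailto:` case mentioned in the comment above) are reported at DEBUG and dropped rather than yielded. A short sketch:

    import logging

    from crawlee._utils.urls import to_absolute_url_iterator

    links = iter(['/about', 'mailto:team@example.com', 'https://example.com/docs'])
    resolved = list(to_absolute_url_iterator('https://example.com', links, logger=logging.getLogger(__name__)))
    # Expected: ['https://example.com/about', 'https://example.com/docs']
    print(resolved)
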

crawlee/browsers/_browser_pool.py CHANGED
@@ -118,7 +118,10 @@ class BrowserPool:
  """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.

  Args:
- browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+ browser_type: The type of browser to launch:
+ - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+ - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+ the system.
  user_data_dir: Path to a user data directory, which stores browser session data like cookies
  and local storage.
  browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided