crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crawlee has been flagged as potentially problematic on the registry page.

Files changed (116)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_autoscaling/snapshotter.py +1 -1
  3. crawlee/_browserforge_workaround.py +7 -3
  4. crawlee/_request.py +64 -43
  5. crawlee/_service_locator.py +44 -24
  6. crawlee/_types.py +128 -36
  7. crawlee/_utils/context.py +3 -3
  8. crawlee/_utils/file.py +8 -1
  9. crawlee/_utils/globs.py +4 -4
  10. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  11. crawlee/_utils/recoverable_state.py +32 -8
  12. crawlee/_utils/recurring_task.py +27 -3
  13. crawlee/_utils/requests.py +0 -26
  14. crawlee/_utils/robots.py +17 -5
  15. crawlee/_utils/sitemap.py +16 -7
  16. crawlee/_utils/system.py +30 -14
  17. crawlee/_utils/time.py +120 -0
  18. crawlee/_utils/urls.py +9 -2
  19. crawlee/browsers/_browser_pool.py +5 -2
  20. crawlee/browsers/_playwright_browser.py +2 -1
  21. crawlee/browsers/_playwright_browser_controller.py +21 -15
  22. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  23. crawlee/browsers/_types.py +1 -1
  24. crawlee/configuration.py +2 -0
  25. crawlee/crawlers/__init__.py +5 -1
  26. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  27. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
  28. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  29. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  30. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  31. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
  32. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  33. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  34. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  35. crawlee/crawlers/_basic/_basic_crawler.py +254 -148
  36. crawlee/crawlers/_basic/_context_utils.py +24 -0
  37. crawlee/crawlers/_basic/_logging_utils.py +27 -4
  38. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  39. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  40. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  41. crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
  42. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  43. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  44. crawlee/crawlers/_playwright/_types.py +12 -2
  45. crawlee/errors.py +4 -0
  46. crawlee/events/_event_manager.py +12 -6
  47. crawlee/events/_types.py +6 -6
  48. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  49. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  50. crawlee/fingerprint_suite/_header_generator.py +2 -2
  51. crawlee/fingerprint_suite/_types.py +2 -2
  52. crawlee/http_clients/_base.py +4 -0
  53. crawlee/http_clients/_curl_impersonate.py +68 -14
  54. crawlee/http_clients/_httpx.py +16 -6
  55. crawlee/http_clients/_impit.py +32 -11
  56. crawlee/otel/crawler_instrumentor.py +4 -6
  57. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  58. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  59. crawlee/request_loaders/_request_list.py +3 -3
  60. crawlee/request_loaders/_request_loader.py +5 -1
  61. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  62. crawlee/router.py +13 -3
  63. crawlee/sessions/_cookies.py +13 -8
  64. crawlee/sessions/_models.py +5 -5
  65. crawlee/sessions/_session_pool.py +1 -1
  66. crawlee/statistics/_error_snapshotter.py +1 -1
  67. crawlee/statistics/_models.py +62 -12
  68. crawlee/statistics/_statistics.py +24 -33
  69. crawlee/storage_clients/__init__.py +16 -0
  70. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  71. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  72. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  73. crawlee/storage_clients/_base/_storage_client.py +13 -0
  74. crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
  75. crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
  76. crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
  77. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  78. crawlee/storage_clients/_file_system/_utils.py +0 -0
  79. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  80. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  81. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  82. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  83. crawlee/storage_clients/_redis/__init__.py +6 -0
  84. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  85. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  86. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  87. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  88. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  89. crawlee/storage_clients/_redis/_utils.py +23 -0
  90. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  91. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  92. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  93. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  94. crawlee/storage_clients/_redis/py.typed +0 -0
  95. crawlee/storage_clients/_sql/__init__.py +6 -0
  96. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  97. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  98. crawlee/storage_clients/_sql/_db_models.py +268 -0
  99. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  100. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  101. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  102. crawlee/storage_clients/_sql/py.typed +0 -0
  103. crawlee/storage_clients/models.py +21 -14
  104. crawlee/storages/_base.py +5 -1
  105. crawlee/storages/_dataset.py +12 -2
  106. crawlee/storages/_key_value_store.py +17 -4
  107. crawlee/storages/_request_queue.py +13 -5
  108. crawlee/storages/_storage_instance_manager.py +196 -75
  109. crawlee/storages/_utils.py +11 -0
  110. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
  111. crawlee-1.3.1b3.dist-info/RECORD +207 -0
  112. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  113. crawlee/_utils/measure_time.py +0 -31
  114. crawlee-0.6.13b15.dist-info/RECORD +0 -183
  115. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  116. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/_types.py CHANGED
@@ -2,18 +2,9 @@ from __future__ import annotations
 
 import dataclasses
 from collections.abc import Callable, Iterator, Mapping
+from copy import deepcopy
 from dataclasses import dataclass
-from typing import (
-    TYPE_CHECKING,
-    Annotated,
-    Any,
-    Literal,
-    Protocol,
-    TypedDict,
-    TypeVar,
-    cast,
-    overload,
-)
+from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
 
 from pydantic import ConfigDict, Field, PlainValidator, RootModel
 
@@ -25,7 +16,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence
 
-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack
 
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -69,13 +60,17 @@ def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
 class HttpHeaders(RootModel, Mapping[str, str]):
     """A dictionary-like object representing HTTP headers."""
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-    root: Annotated[
-        dict[str, str],
-        PlainValidator(lambda value: _normalize_headers(value)),
-        Field(default_factory=dict),
-    ] = {}
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        root: dict[str, str] = {}
+    else:
+        root: Annotated[
+            dict[str, str],
+            PlainValidator(lambda value: _normalize_headers(value)),
+            Field(default_factory=lambda: dict[str, str]()),
+        ]
 
     def __getitem__(self, key: str) -> str:
         return self.root[key.lower()]
@@ -96,7 +91,7 @@ class HttpHeaders(RootModel, Mapping[str, str]):
         combined_headers = {**other, **self.root}
         return HttpHeaders(combined_headers)
 
-    def __iter__(self) -> Iterator[str]:  # type: ignore[override]
+    def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]
         yield from self.root
 
     def __len__(self) -> int:
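
For orientation, a brief sketch of how the normalized `root` behaves after this change (not part of the diff; it assumes `HttpHeaders` is re-exported from the package root, and the case-insensitive lookup follows from `_normalize_headers` and `__getitem__` above):

```python
from crawlee import HttpHeaders  # assumed public re-export

# Keys are normalized by the PlainValidator, so lookups are case-insensitive.
headers = HttpHeaders({'Content-Type': 'text/html'})
assert headers['content-type'] == 'text/html'

# With the default_factory workaround, a bare HttpHeaders() gets a fresh empty dict.
empty = HttpHeaders()
assert len(empty) == 0
```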
@@ -110,9 +105,9 @@ class ConcurrencySettings:
     def __init__(
         self,
         min_concurrency: int = 1,
-        max_concurrency: int = 200,
+        max_concurrency: int = 100,
         max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int | None = None,
+        desired_concurrency: int = 10,
     ) -> None:
         """Initialize a new instance.
 
@@ -125,21 +120,24 @@ class ConcurrencySettings:
             desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
                 if there is a large enough supply of them. By default, it is `min_concurrency`.
         """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
         if min_concurrency < 1:
             raise ValueError('min_concurrency must be 1 or larger')
 
         if max_concurrency < min_concurrency:
             raise ValueError('max_concurrency cannot be less than min_concurrency')
 
+        if desired_concurrency < min_concurrency:
+            raise ValueError('desired_concurrency cannot be less than min_concurrency')
+
+        if desired_concurrency > max_concurrency:
+            raise ValueError('desired_concurrency cannot be greater than max_concurrency')
+
         if max_tasks_per_minute <= 0:
             raise ValueError('max_tasks_per_minute must be positive')
 
         self.min_concurrency = min_concurrency
         self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
+        self.desired_concurrency = desired_concurrency
         self.max_tasks_per_minute = max_tasks_per_minute
 
 
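A short sketch of what the tightened validation means in practice, assuming the usual top-level re-export: `desired_concurrency` now defaults to 10 and must lie between `min_concurrency` and `max_concurrency`.

```python
from crawlee import ConcurrencySettings  # assumed top-level re-export

# Valid: min (2) <= desired (defaults to 10) <= max (50).
settings = ConcurrencySettings(min_concurrency=2, max_concurrency=50)

# Now rejected: the default desired_concurrency of 10 exceeds max_concurrency.
try:
    ConcurrencySettings(max_concurrency=5)
except ValueError as err:
    print(err)  # desired_concurrency cannot be greater than max_concurrency
```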
@@ -180,6 +178,17 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
     requests: Sequence[str | Request]
     """Requests to be added to the `RequestManager`."""
 
+    rq_id: str | None
+    """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""
+
+    rq_name: str | None
+    """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+
+    rq_alias: str | None
+    """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+
 
 class PushDataKwargs(TypedDict):
     """Keyword arguments for dataset's `push_data` method."""
@@ -189,6 +198,7 @@ class PushDataFunctionCall(PushDataKwargs):
     data: list[dict[str, Any]] | dict[str, Any]
     dataset_id: str | None
     dataset_name: str | None
+    dataset_alias: str | None
 
 
 class KeyValueStoreInterface(Protocol):
@@ -251,25 +261,46 @@ class KeyValueStoreChangeRecords:
 class RequestHandlerRunResult:
     """Record of calls to storage-related context helpers."""
 
-    def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None:
+    def __init__(
+        self,
+        *,
+        key_value_store_getter: GetKeyValueStoreFunction,
+        request: Request,
+    ) -> None:
         self._key_value_store_getter = key_value_store_getter
         self.add_requests_calls = list[AddRequestsKwargs]()
         self.push_data_calls = list[PushDataFunctionCall]()
-        self.key_value_store_changes = dict[tuple[str | None, str | None], KeyValueStoreChangeRecords]()
+        self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()
+
+        # Isolated copies for handler execution
+        self._request = deepcopy(request)
+
+    @property
+    def request(self) -> Request:
+        return self._request
 
     async def add_requests(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         """Track a call to the `add_requests` context helper."""
-        self.add_requests_calls.append(AddRequestsKwargs(requests=requests, **kwargs))
+        specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+        if specified_params > 1:
+            raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')
+        self.add_requests_calls.append(
+            AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
+        )
 
     async def push_data(
         self,
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Track a call to the `push_data` context helper."""
@@ -278,6 +309,7 @@ class RequestHandlerRunResult:
                 data=data,
                 dataset_id=dataset_id,
                 dataset_name=dataset_name,
+                dataset_alias=dataset_alias,
                 **kwargs,
             )
         )
@@ -287,13 +319,22 @@ class RequestHandlerRunResult:
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> KeyValueStoreInterface:
-        if (id, name) not in self.key_value_store_changes:
-            self.key_value_store_changes[id, name] = KeyValueStoreChangeRecords(
-                await self._key_value_store_getter(id=id, name=name)
+        if (id, name, alias) not in self.key_value_store_changes:
+            self.key_value_store_changes[id, name, alias] = KeyValueStoreChangeRecords(
+                await self._key_value_store_getter(id=id, name=name, alias=alias)
             )
 
-        return self.key_value_store_changes[id, name]
+        return self.key_value_store_changes[id, name, alias]
+
+    def apply_request_changes(self, target: Request) -> None:
+        """Apply tracked changes from handler copy to original request."""
+        if self.request.user_data != target.user_data:
+            target.user_data = self.request.user_data
+
+        if self.request.headers != target.headers:
+            target.headers = self.request.headers
 
 
 @docs_group('Functions')
@@ -307,12 +348,21 @@ class AddRequestsFunction(Protocol):
     def __call__(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.
 
         Args:
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """
 
@@ -340,12 +390,21 @@ class EnqueueLinksFunction(Protocol):
         label: str | None = None,
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...
 
     @overload
     def __call__(
-        self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs]
+        self,
+        *,
+        requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...
 
     def __call__(
@@ -356,6 +415,9 @@ class EnqueueLinksFunction(Protocol):
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call enqueue links function.
@@ -373,6 +435,12 @@ class EnqueueLinksFunction(Protocol):
                 - `'skip'` to exclude the request from being enqueued,
                 - `'unchanged'` to use the original request options without modification.
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """
 
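To illustrate the new queue-routing parameters from handler code, a sketch assuming a typical `ParselCrawler` setup; only the `rq_id`/`rq_name`/`rq_alias` keywords come from this diff, the rest is standard Crawlee usage:

```python
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

crawler = ParselCrawler()


@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
    # Send discovered links to an alias-scoped queue instead of the default RequestManager.
    await context.enqueue_links(rq_alias='pagination')

    # Explicit requests can be routed the same way.
    await context.add_requests(['https://crawlee.dev/docs'], rq_alias='details')
```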
@@ -424,12 +492,14 @@ class GetKeyValueStoreFunction(Protocol):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Coroutine[None, None, KeyValueStore]:
         """Call dunder method.
 
         Args:
             id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
         """
 
 
@@ -444,12 +514,14 @@ class GetKeyValueStoreFromRequestHandlerFunction(Protocol):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Coroutine[None, None, KeyValueStoreInterface]:
         """Call dunder method.
 
         Args:
             id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
         """
 
 
@@ -466,6 +538,7 @@ class PushDataFunction(Protocol):
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.
@@ -473,7 +546,8 @@ class PushDataFunction(Protocol):
         Args:
             data: The data to push to the `Dataset`.
             dataset_id: The ID of the `Dataset` to push the data to.
-            dataset_name: The name of the `Dataset` to push the data to.
+            dataset_name: The name of the `Dataset` to push the data to (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` to push the data to (run scope, unnamed storage).
             **kwargs: Additional keyword arguments.
         """
 
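Continuing the hypothetical handler above, a sketch of the alias-scoped storage helpers: `alias` selects an unnamed, run-scoped storage, while `name` selects a globally named one.

```python
async def handler(context: ParselCrawlingContext) -> None:
    # Push into a run-scoped dataset identified only by its alias.
    await context.push_data({'url': context.request.url}, dataset_alias='raw-pages')

    # Open a run-scoped key-value store and record a checkpoint.
    kvs = await context.get_key_value_store(alias='checkpoints')
    await kvs.set_value('last-url', context.request.url)
```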
@@ -590,6 +664,24 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
 
+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        modifications = dict[str, Any]()
+
+        if push_data is not None:
+            modifications['push_data'] = push_data
+        if add_requests is not None:
+            modifications['add_requests'] = add_requests
+        if get_key_value_store is not None:
+            modifications['get_key_value_store'] = get_key_value_store
+
+        return dataclasses.replace(self, **modifications)
+
 
 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""
crawlee/_utils/context.py CHANGED
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
-import asyncio
+import inspect
 from collections.abc import Callable
 from functools import wraps
-from typing import Any, TypeVar
+from typing import Any, TypeVar, cast
 
 T = TypeVar('T', bound=Callable[..., Any])
 
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:
 
         return await method(self, *args, **kwargs)
 
-    return async_wrapper if asyncio.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
+    return cast('T', async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper)
crawlee/_utils/file.py CHANGED
@@ -163,7 +163,14 @@ async def export_csv_to_stream(
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
-    writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+
+    writer = csv.writer(dst, **kwargs)
     write_header = True
 
     # Iterate over the dataset and write to CSV.
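The Windows behavior described in the added comment can be reproduced with the standard library alone; a minimal sketch with an illustrative file name:

```python
import csv

# On Windows, text-mode writes translate '\n' to '\r\n'. With csv.writer's default
# lineterminator ('\r\n') that ends up as '\r\r\n', i.e. blank lines between rows.
# Forcing '\n' leaves the newline translation to the platform, as the comment above describes.
with open('items.csv', 'w', encoding='utf-8') as dst:
    writer = csv.writer(dst, lineterminator='\n')
    writer.writerow(['url', 'title'])
    writer.writerow(['https://example.com', 'Example'])
```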
crawlee/_utils/globs.py CHANGED
@@ -33,12 +33,12 @@ def _translate(
 
     HACK: This function is copied from CPython stdlib source. It will be released in Python 3.13 as `glob.translate`
     """
-    if not seps:
-        seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep
+    _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps
 
-    escaped_seps = ''.join(map(re.escape, seps))
-    any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
+    escaped_seps = ''.join(map(re.escape, _seps))
+    any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps
     not_sep = f'[^{escaped_seps}]'
+
     if include_hidden:
         one_last_segment = f'{not_sep}+'
         one_segment = f'{one_last_segment}{any_sep}'
crawlee/_utils/raise_if_too_many_kwargs.py ADDED
@@ -0,0 +1,12 @@
+from typing import Any
+
+
+def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
+    """Raise ValueError if there are more non-None kwargs then max_kwargs."""
+    none_kwargs_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]
+    if len(none_kwargs_names) > max_kwargs:
+        all_kwargs_names = [f'"{kwarg_name}"' for kwarg_name in kwargs]
+        raise ValueError(
+            f'Only one of {", ".join(all_kwargs_names)} can be specified, but following arguments were '
+            f'specified: {", ".join(none_kwargs_names)}.'
+        )
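
A short usage sketch of the new helper; the `open_queue` wrapper is hypothetical, only `raise_if_too_many_kwargs` itself comes from this release:

```python
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs


def open_queue(rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None) -> None:
    # Raises ValueError when more than one selector is not None.
    raise_if_too_many_kwargs(rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias)


open_queue(rq_name='products')  # OK: a single selector

try:
    open_queue(rq_id='abc', rq_alias='tmp')  # two selectors specified
except ValueError as err:
    print(err)
```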
crawlee/_utils/recoverable_state.py CHANGED
@@ -4,12 +4,14 @@ from typing import TYPE_CHECKING, Generic, Literal, TypeVar
 
 from pydantic import BaseModel
 
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.events._types import Event, EventPersistStateData
 
 if TYPE_CHECKING:
     import logging
+    from collections.abc import Callable, Coroutine
 
-    from crawlee.storages._key_value_store import KeyValueStore
+    from crawlee.storages import KeyValueStore
 
 TStateModel = TypeVar('TStateModel', bound=BaseModel)
 
@@ -37,6 +39,7 @@ class RecoverableState(Generic[TStateModel]):
         persistence_enabled: Literal[True, False, 'explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_kvs_id: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         logger: logging.Logger,
     ) -> None:
         """Initialize a new recoverable state object.
@@ -51,16 +54,40 @@ class RecoverableState(Generic[TStateModel]):
                 If neither a name nor and id are supplied, the default store will be used.
             persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.
                 If neither a name nor and id are supplied, the default store will be used.
+            persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If
+                not provided, a system-wide KeyValueStore will be used, based on service locator configuration.
             logger: A logger instance for logging operations related to state persistence
         """
+        raise_if_too_many_kwargs(
+            persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_id=persist_state_kvs_id,
+            persist_state_kvs_factory=persist_state_kvs_factory,
+        )
+        if not persist_state_kvs_factory:
+            logger.debug(
+                'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '
+                'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '
+                'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid '
+                'global side effects.'
+            )
+
         self._default_state = default_state
         self._state_type: type[TStateModel] = self._default_state.__class__
         self._state: TStateModel | None = None
         self._persistence_enabled = persistence_enabled
         self._persist_state_key = persist_state_key
-        self._persist_state_kvs_name = persist_state_kvs_name
-        self._persist_state_kvs_id = persist_state_kvs_id
-        self._key_value_store: 'KeyValueStore | None' = None  # noqa: UP037
+        if persist_state_kvs_factory is None:
+
+            async def kvs_factory() -> KeyValueStore:
+                from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+                return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)
+
+            self._persist_state_kvs_factory = kvs_factory
+        else:
+            self._persist_state_kvs_factory = persist_state_kvs_factory
+
+        self._key_value_store: KeyValueStore | None = None
         self._log = logger
 
     async def initialize(self) -> TStateModel:
@@ -77,11 +104,8 @@
             return self.current_value
 
         # Import here to avoid circular imports.
-        from crawlee.storages._key_value_store import KeyValueStore  # noqa: PLC0415
 
-        self._key_value_store = await KeyValueStore.open(
-            name=self._persist_state_kvs_name, id=self._persist_state_kvs_id
-        )
+        self._key_value_store = await self._persist_state_kvs_factory()
 
         await self._load_saved_state()
 
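A hedged sketch of supplying the new `persist_state_kvs_factory`, so recovery uses an explicitly opened store instead of the service-locator fallback; the state model and key names are illustrative:

```python
import logging

from pydantic import BaseModel

from crawlee._utils.recoverable_state import RecoverableState
from crawlee.storages import KeyValueStore


class CrawlProgress(BaseModel):
    pages_done: int = 0


async def open_state_store() -> KeyValueStore:
    # Explicitly opened store; avoids the service_locator side effects noted in the debug message.
    return await KeyValueStore.open(name='crawl-progress')


state = RecoverableState(
    default_state=CrawlProgress(),
    persist_state_key='CRAWL_PROGRESS',
    persistence_enabled=True,
    persist_state_kvs_factory=open_state_store,
    logger=logging.getLogger(__name__),
)
# Later, inside a running event loop: await state.initialize()
```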
crawlee/_utils/recurring_task.py CHANGED
@@ -1,12 +1,16 @@
 from __future__ import annotations
 
 import asyncio
+import inspect
 from logging import getLogger
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from collections.abc import Callable
     from datetime import timedelta
+    from types import TracebackType
+
+    from typing_extensions import Self
 
 logger = getLogger(__name__)
 
@@ -21,11 +25,27 @@ class RecurringTask:
     """
 
     def __init__(self, func: Callable, delay: timedelta) -> None:
-        logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...')
+        logger.debug(
+            'Calling RecurringTask.__init__(func={%s}, delay={%s})...',
+            func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
+            delay,
+        )
         self.func = func
         self.delay = delay
         self.task: asyncio.Task | None = None
 
+    async def __aenter__(self) -> Self:
+        self.start()
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        await self.stop()
+
     async def _wrapper(self) -> None:
         """Continuously execute the provided function with the specified delay.
 
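The `__aenter__`/`__aexit__` pair added above lets callers scope a task with `async with`; a small sketch with an illustrative periodic job:

```python
import asyncio
from datetime import timedelta

from crawlee._utils.recurring_task import RecurringTask


async def persist_state() -> None:
    """Illustrative periodic job; any sync or async callable works."""


async def main() -> None:
    # __aenter__ starts the task, __aexit__ awaits stop() even if the body raises.
    async with RecurringTask(persist_state, delay=timedelta(seconds=30)):
        await asyncio.sleep(120)  # the actual crawling work would happen here


asyncio.run(main())
```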
@@ -34,12 +54,16 @@
         """
         sleep_time_secs = self.delay.total_seconds()
         while True:
-            await self.func() if asyncio.iscoroutinefunction(self.func) else self.func()
+            await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
             await asyncio.sleep(sleep_time_secs)
 
     def start(self) -> None:
         """Start the recurring task execution."""
-        self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}')
+        name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
+        self.task = asyncio.create_task(
+            self._wrapper(),
+            name=f'Task-recurring-{name}',
+        )
 
     async def stop(self) -> None:
         """Stop the recurring task execution."""
crawlee/_utils/requests.py CHANGED
@@ -1,8 +1,5 @@
 from __future__ import annotations
 
-import re
-from base64 import b64encode
-from hashlib import sha256
 from logging import getLogger
 from typing import TYPE_CHECKING
 
@@ -16,29 +13,6 @@ if TYPE_CHECKING:
 logger = getLogger(__name__)
 
 
-def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> str:
-    """Generate a deterministic request ID based on a unique key.
-
-    Args:
-        unique_key: The unique key to convert into a request ID.
-        request_id_length: The length of the request ID.
-
-    Returns:
-        A URL-safe, truncated request ID based on the unique key.
-    """
-    # Encode the unique key and compute its SHA-256 hash
-    hashed_key = sha256(unique_key.encode('utf-8')).digest()
-
-    # Encode the hash in base64 and decode it to get a string
-    base64_encoded = b64encode(hashed_key).decode('utf-8')
-
-    # Remove characters that are not URL-safe ('+', '/', or '=')
-    url_safe_key = re.sub(r'(\+|\/|=)', '', base64_encoded)
-
-    # Truncate the key to the desired length
-    return url_safe_key[:request_id_length]
-
-
 def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
     """Normalize a URL.
 
crawlee/_utils/robots.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from logging import getLogger
 from typing import TYPE_CHECKING
 
 from protego import Protego
@@ -15,6 +16,9 @@
     from crawlee.proxy_configuration import ProxyInfo
 
 
+logger = getLogger(__name__)
+
+
 class RobotsTxtFile:
     def __init__(
         self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@
             http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
-        response = await http_client.send_request(url, proxy_info=proxy_info)
-        body = (
-            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
-        )
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)
+
+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))
+
+        except Exception as e:
+            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
 
-        robots = Protego.parse(body.decode('utf-8'))
+            robots = Protego.parse('User-agent: *\nAllow: /')
 
         return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
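
The permissive fallback parsed in the `except` branch allows everything; a quick check of that behavior with Protego directly (the user agent string is arbitrary):

```python
from protego import Protego

# The same rules the fallback uses when robots.txt cannot be fetched.
fallback = Protego.parse('User-agent: *\nAllow: /')
assert fallback.can_fetch('https://example.com/any/path', 'my-crawler')
```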