crawlee-1.0.5b18-py3-none-any.whl → crawlee-1.2.2b24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_types.py +44 -5
  5. crawlee/_utils/context.py +3 -3
  6. crawlee/_utils/file.py +8 -1
  7. crawlee/_utils/globs.py +4 -4
  8. crawlee/_utils/recurring_task.py +12 -3
  9. crawlee/_utils/sitemap.py +12 -5
  10. crawlee/_utils/system.py +27 -11
  11. crawlee/_utils/time.py +41 -1
  12. crawlee/browsers/_browser_pool.py +1 -1
  13. crawlee/browsers/_playwright_browser.py +2 -1
  14. crawlee/crawlers/__init__.py +5 -1
  15. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  16. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +53 -17
  17. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  18. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +20 -49
  19. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  20. crawlee/crawlers/_basic/_basic_crawler.py +138 -124
  21. crawlee/crawlers/_basic/_context_utils.py +24 -0
  22. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  23. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  24. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  25. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  26. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -22
  27. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  28. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  29. crawlee/crawlers/_playwright/_types.py +12 -2
  30. crawlee/errors.py +4 -0
  31. crawlee/events/_event_manager.py +12 -6
  32. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  33. crawlee/http_clients/_base.py +4 -0
  34. crawlee/http_clients/_curl_impersonate.py +68 -14
  35. crawlee/http_clients/_httpx.py +16 -6
  36. crawlee/http_clients/_impit.py +25 -10
  37. crawlee/otel/crawler_instrumentor.py +1 -3
  38. crawlee/request_loaders/_sitemap_request_loader.py +18 -5
  39. crawlee/router.py +13 -3
  40. crawlee/sessions/_cookies.py +13 -8
  41. crawlee/sessions/_models.py +3 -3
  42. crawlee/statistics/_models.py +51 -9
  43. crawlee/statistics/_statistics.py +2 -21
  44. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  45. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  46. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  47. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  48. crawlee/storage_clients/_file_system/_request_queue_client.py +5 -4
  49. crawlee/storage_clients/_redis/_client_mixin.py +1 -4
  50. crawlee/storage_clients/_redis/_dataset_client.py +6 -2
  51. crawlee/storage_clients/_redis/_key_value_store_client.py +3 -5
  52. crawlee/storage_clients/_redis/_request_queue_client.py +5 -8
  53. crawlee/storage_clients/_redis/_storage_client.py +12 -9
  54. crawlee/storage_clients/_redis/_utils.py +1 -1
  55. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  56. crawlee/storage_clients/_sql/_storage_client.py +0 -9
  57. crawlee/storage_clients/models.py +8 -3
  58. crawlee/storages/_storage_instance_manager.py +103 -44
  59. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +10 -16
  60. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +63 -62
  61. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  62. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  63. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from importlib import metadata
 
- from ._request import Request, RequestOptions
+ from ._request import Request, RequestOptions, RequestState
  from ._service_locator import service_locator
  from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
  from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
      'HttpHeaders',
      'Request',
      'RequestOptions',
+     'RequestState',
      'RequestTransformAction',
      'SkippedReason',
      'service_locator',
crawlee/_browserforge_workaround.py CHANGED
@@ -1,4 +1,8 @@
  # ruff: noqa: N802, PLC0415
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from collections.abc import Callable
 
 
  def patch_browserforge() -> None:
@@ -12,7 +16,7 @@ def patch_browserforge() -> None:
      import apify_fingerprint_datapoints
      from browserforge import download
 
-     download.DATA_DIRS: dict[str, Path] = {  # type:ignore[misc]
+     download.DATA_DIRS = {
          'headers': apify_fingerprint_datapoints.get_header_network().parent,
         'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent,
      }
@@ -20,7 +24,7 @@ def patch_browserforge() -> None:
      def DownloadIfNotExists(**flags: bool) -> None:
          pass
 
-     download.DownloadIfNotExists = DownloadIfNotExists
+     download.DownloadIfNotExists: Callable[..., None] = DownloadIfNotExists
 
      import browserforge.bayesian_network
 
@@ -33,7 +37,7 @@ def patch_browserforge() -> None:
              path = download.DATA_DIRS['fingerprints'] / download.DATA_FILES['fingerprints'][path.name]
          super().__init__(path)
 
-     browserforge.bayesian_network.BayesianNetwork = BayesianNetwork  # type:ignore[misc]
+     browserforge.bayesian_network.BayesianNetwork: BayesianNetwork = BayesianNetwork
      import browserforge.headers.generator
 
      browserforge.headers.generator.DATA_DIR = download.DATA_DIRS['headers']
crawlee/_request.py CHANGED
@@ -34,14 +34,14 @@ class RequestState(IntEnum):
  class CrawleeRequestData(BaseModel):
      """Crawlee-specific configuration stored in the `user_data`."""
 
-     max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+     max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
      """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
      `BasicCrawler`."""
 
      enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
      """The strategy that was used for enqueuing the request."""
 
-     state: RequestState | None = None
+     state: RequestState = RequestState.UNPROCESSED
      """Describes the request's current lifecycle state."""
 
      session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -93,7 +93,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
      def __delitem__(self, key: str) -> None:
          del self.__pydantic_extra__[key]
 
-     def __iter__(self) -> Iterator[str]:  # type: ignore[override]
+     def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]
          yield from self.__pydantic_extra__
 
      def __len__(self) -> int:
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
      always_enqueue: NotRequired[bool]
      user_data: NotRequired[dict[str, JsonSerializable]]
      no_retry: NotRequired[bool]
+     enqueue_strategy: NotRequired[EnqueueStrategy]
+     max_retries: NotRequired[int | None]
 
 
  @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):
 
      model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-     unique_key: Annotated[str, Field(alias='uniqueKey')]
+     unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
      """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
      to the same URL.
 
@@ -178,21 +180,22 @@
      and specify which URLs shall be considered equal.
      """
 
-     url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+     url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
      """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
      and fragments."""
 
-     method: HttpMethod = 'GET'
+     method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
      """HTTP request method."""
 
      payload: Annotated[
          HttpPayload | None,
          BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
          PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+         Field(frozen=True),
      ] = None
      """HTTP request payload."""
 
-     # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+     # Workaround for Pydantic and type checkers when using Annotated with default_factory
      if TYPE_CHECKING:
          headers: HttpHeaders = HttpHeaders()
          """HTTP request headers."""
@@ -250,6 +253,8 @@
          keep_url_fragment: bool = False,
          use_extended_unique_key: bool = False,
          always_enqueue: bool = False,
+         enqueue_strategy: EnqueueStrategy | None = None,
+         max_retries: int | None = None,
          **kwargs: Any,
      ) -> Self:
          """Create a new `Request` instance from a URL.
@@ -277,6 +282,9 @@
                  `unique_key` computation. This is only relevant when `unique_key` is not provided.
              always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                  Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+             enqueue_strategy: The strategy that will be used for enqueuing the request.
+             max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+                 option of `BasicCrawler`.
              **kwargs: Additional request properties.
          """
          if unique_key is not None and always_enqueue:
@@ -299,7 +307,21 @@
          )
 
          if always_enqueue:
-             unique_key = f'{unique_key}_{crypto_random_object_id()}'
+             unique_key = f'{crypto_random_object_id()}|{unique_key}'
+
+         user_data_dict = kwargs.pop('user_data', {}) or {}
+         crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+         if max_retries is not None:
+             crawlee_data_dict['maxRetries'] = max_retries
+
+         if enqueue_strategy is not None:
+             crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+         crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+         if crawlee_data:
+             user_data_dict['__crawlee'] = crawlee_data
 
          request = cls(
              url=url,
@@ -307,6 +329,7 @@
              method=method,
              headers=headers,
              payload=payload,
+             user_data=user_data_dict,
              **kwargs,
          )
 
@@ -352,7 +375,7 @@
          self.crawlee_data.crawl_depth = new_value
 
      @property
-     def state(self) -> RequestState | None:
+     def state(self) -> RequestState:
          """Crawlee-specific request handling state."""
          return self.crawlee_data.state
 
@@ -365,10 +388,6 @@
          """Crawlee-specific limit on the number of retries of the request."""
          return self.crawlee_data.max_retries
 
-     @max_retries.setter
-     def max_retries(self, new_max_retries: int) -> None:
-         self.crawlee_data.max_retries = new_max_retries
-
      @property
      def session_rotation_count(self) -> int | None:
          """Crawlee-specific number of finished session rotations for the request."""
crawlee/_types.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
  import dataclasses
  from collections.abc import Callable, Iterator, Mapping
+ from copy import deepcopy
  from dataclasses import dataclass
  from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
 
@@ -15,7 +16,7 @@ if TYPE_CHECKING:
      import re
      from collections.abc import Callable, Coroutine, Sequence
 
-     from typing_extensions import NotRequired, Required, Unpack
+     from typing_extensions import NotRequired, Required, Self, Unpack
 
      from crawlee import Glob, Request
      from crawlee._request import RequestOptions
@@ -61,14 +62,14 @@ class HttpHeaders(RootModel, Mapping[str, str]):
 
      model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-     # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+     # Workaround for Pydantic and type checkers when using Annotated with default_factory
      if TYPE_CHECKING:
          root: dict[str, str] = {}
      else:
          root: Annotated[
              dict[str, str],
              PlainValidator(lambda value: _normalize_headers(value)),
-             Field(default_factory=dict),
+             Field(default_factory=lambda: dict[str, str]()),
          ]
 
      def __getitem__(self, key: str) -> str:
@@ -90,7 +91,7 @@ class HttpHeaders(RootModel, Mapping[str, str]):
          combined_headers = {**other, **self.root}
          return HttpHeaders(combined_headers)
 
-     def __iter__(self) -> Iterator[str]:  # type: ignore[override]
+     def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]
          yield from self.root
 
      def __len__(self) -> int:
@@ -260,12 +261,24 @@ class KeyValueStoreChangeRecords:
  class RequestHandlerRunResult:
      """Record of calls to storage-related context helpers."""
 
-     def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None:
+     def __init__(
+         self,
+         *,
+         key_value_store_getter: GetKeyValueStoreFunction,
+         request: Request,
+     ) -> None:
          self._key_value_store_getter = key_value_store_getter
          self.add_requests_calls = list[AddRequestsKwargs]()
          self.push_data_calls = list[PushDataFunctionCall]()
          self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()
 
+         # Isolated copies for handler execution
+         self._request = deepcopy(request)
+
+     @property
+     def request(self) -> Request:
+         return self._request
+
      async def add_requests(
          self,
          requests: Sequence[str | Request],
@@ -315,6 +328,14 @@
 
          return self.key_value_store_changes[id, name, alias]
 
+     def apply_request_changes(self, target: Request) -> None:
+         """Apply tracked changes from handler copy to original request."""
+         if self.request.user_data != target.user_data:
+             target.user_data = self.request.user_data
+
+         if self.request.headers != target.headers:
+             target.headers = self.request.headers
+
 
  @docs_group('Functions')
  class AddRequestsFunction(Protocol):
@@ -643,6 +664,24 @@ class BasicCrawlingContext:
          """Return hash of the context. Each context is considered unique."""
          return id(self)
 
+     def create_modified_copy(
+         self,
+         push_data: PushDataFunction | None = None,
+         add_requests: AddRequestsFunction | None = None,
+         get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+     ) -> Self:
+         """Create a modified copy of the crawling context with specified changes."""
+         modifications = dict[str, Any]()
+
+         if push_data is not None:
+             modifications['push_data'] = push_data
+         if add_requests is not None:
+             modifications['add_requests'] = add_requests
+         if get_key_value_store is not None:
+             modifications['get_key_value_store'] = get_key_value_store
+
+         return dataclasses.replace(self, **modifications)
+
 
  class GetDataKwargs(TypedDict):
      """Keyword arguments for dataset's `get_data` method."""
crawlee/_utils/context.py CHANGED
@@ -1,9 +1,9 @@
  from __future__ import annotations
 
- import asyncio
+ import inspect
  from collections.abc import Callable
  from functools import wraps
- from typing import Any, TypeVar
+ from typing import Any, TypeVar, cast
 
  T = TypeVar('T', bound=Callable[..., Any])
 
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:
 
          return await method(self, *args, **kwargs)
 
-     return async_wrapper if asyncio.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
+     return cast('T', async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper)
crawlee/_utils/file.py CHANGED
@@ -163,7 +163,14 @@ async def export_csv_to_stream(
      dst: TextIO,
      **kwargs: Unpack[ExportDataCsvKwargs],
  ) -> None:
-     writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
+     # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+     # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+     # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+     # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+     if 'lineterminator' not in kwargs:
+         kwargs['lineterminator'] = '\n'
+
+     writer = csv.writer(dst, **kwargs)
      write_header = True
 
      # Iterate over the dataset and write to CSV.
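
The comment above reflects standard `csv` module behavior: the writer's default `lineterminator` is `'\r\n'`, and a text-mode file handle on Windows translates the trailing `'\n'` again, producing `'\r\r\n'`. A standalone illustration (not crawlee code):

    import csv
    import io

    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator='\n')  # the default the patched helper now applies
    writer.writerow(['id', 'url'])
    writer.writerow([1, 'https://example.com'])

    # Rows end with a bare '\n'; any platform newline translation is applied exactly once on write.
    print(repr(buffer.getvalue()))  # 'id,url\n1,https://example.com\n'
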
crawlee/_utils/globs.py CHANGED
@@ -33,12 +33,12 @@ def _translate(
 
      HACK: This function is copied from CPython stdlib source. It will be released in Python 3.13 as `glob.translate`
      """
-     if not seps:
-         seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep
+     _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps
 
-     escaped_seps = ''.join(map(re.escape, seps))
-     any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
+     escaped_seps = ''.join(map(re.escape, _seps))
+     any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps
      not_sep = f'[^{escaped_seps}]'
+
      if include_hidden:
          one_last_segment = f'{not_sep}+'
          one_segment = f'{one_last_segment}{any_sep}'
crawlee/_utils/recurring_task.py CHANGED
@@ -1,6 +1,7 @@
  from __future__ import annotations
 
  import asyncio
+ import inspect
  from logging import getLogger
  from typing import TYPE_CHECKING
 
@@ -24,7 +25,11 @@ class RecurringTask:
      """
 
      def __init__(self, func: Callable, delay: timedelta) -> None:
-         logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...')
+         logger.debug(
+             'Calling RecurringTask.__init__(func={%s}, delay={%s})...',
+             func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
+             delay,
+         )
          self.func = func
          self.delay = delay
          self.task: asyncio.Task | None = None
@@ -49,12 +54,16 @@
          """
          sleep_time_secs = self.delay.total_seconds()
          while True:
-             await self.func() if asyncio.iscoroutinefunction(self.func) else self.func()
+             await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
              await asyncio.sleep(sleep_time_secs)
 
      def start(self) -> None:
          """Start the recurring task execution."""
-         self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}')
+         name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
+         self.task = asyncio.create_task(
+             self._wrapper(),
+             name=f'Task-recurring-{name}',
+         )
 
      async def stop(self) -> None:
          """Stop the recurring task execution."""
crawlee/_utils/sitemap.py CHANGED
@@ -430,10 +430,17 @@ async def parse_sitemap(
      up to the specified maximum depth.
      """
      # Set default options
-     options = options or {}
-     emit_nested_sitemaps = options.get('emit_nested_sitemaps', False)
-     max_depth = options.get('max_depth', float('inf'))
-     sitemap_retries = options.get('sitemap_retries', 3)
+     default_timeout = timedelta(seconds=30)
+     if options:
+         emit_nested_sitemaps = options['emit_nested_sitemaps']
+         max_depth = options['max_depth']
+         sitemap_retries = options['sitemap_retries']
+         timeout = options.get('timeout', default_timeout)
+     else:
+         emit_nested_sitemaps = False
+         max_depth = float('inf')
+         sitemap_retries = 3
+         timeout = default_timeout
 
      # Setup working state
      sources = list(initial_sources)
@@ -472,7 +479,7 @@
                  sitemap_retries,
                  emit_nested_sitemaps=emit_nested_sitemaps,
                  proxy_info=proxy_info,
-                 timeout=options.get('timeout', timedelta(seconds=30)),
+                 timeout=timeout,
              ):
                  yield result
          else:
crawlee/_utils/system.py CHANGED
@@ -5,7 +5,7 @@ import sys
  from contextlib import suppress
  from datetime import datetime, timezone
  from logging import getLogger
- from typing import Annotated
+ from typing import TYPE_CHECKING, Annotated
 
  import psutil
  from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
@@ -41,11 +41,19 @@ class CpuInfo(BaseModel):
      used_ratio: Annotated[float, Field(alias='usedRatio')]
      """The ratio of CPU currently in use, represented as a float between 0 and 1."""
 
-     created_at: datetime = Field(
-         alias='createdAt',
-         default_factory=lambda: datetime.now(timezone.utc),
-     )
-     """The time at which the measurement was taken."""
+     # Workaround for Pydantic and type checkers when using Annotated with default_factory
+     if TYPE_CHECKING:
+         created_at: datetime = datetime.now(timezone.utc)
+         """The time at which the measurement was taken."""
+     else:
+         created_at: Annotated[
+             datetime,
+             Field(
+                 alias='createdAt',
+                 default_factory=lambda: datetime.now(timezone.utc),
+             ),
+         ]
+         """The time at which the measurement was taken."""
 
 
  class MemoryUsageInfo(BaseModel):
@@ -61,11 +69,19 @@ class MemoryUsageInfo(BaseModel):
      ]
      """Memory usage of the current Python process and its children."""
 
-     created_at: datetime = Field(
-         alias='createdAt',
-         default_factory=lambda: datetime.now(timezone.utc),
-     )
-     """The time at which the measurement was taken."""
+     # Workaround for Pydantic and type checkers when using Annotated with default_factory
+     if TYPE_CHECKING:
+         created_at: datetime = datetime.now(timezone.utc)
+         """The time at which the measurement was taken."""
+     else:
+         created_at: Annotated[
+             datetime,
+             Field(
+                 alias='createdAt',
+                 default_factory=lambda: datetime.now(timezone.utc),
+             ),
+         ]
+         """The time at which the measurement was taken."""
 
 
  class MemoryInfo(MemoryUsageInfo):
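
The `created_at` fields adopt the same `if TYPE_CHECKING:` split already used for `Request.headers` and `HttpHeaders.root`: type checkers see a plain attribute with a dummy default, while Pydantic evaluates the `Annotated` form with `default_factory` at runtime. A generic sketch of the pattern; the `Measurement` model is hypothetical and only Pydantic itself is assumed:

    from datetime import datetime, timezone
    from typing import TYPE_CHECKING, Annotated

    from pydantic import BaseModel, Field

    class Measurement(BaseModel):  # hypothetical model, for illustration only
        if TYPE_CHECKING:
            # What the type checker sees: a plain attribute with a default value.
            created_at: datetime = datetime.now(timezone.utc)
        else:
            # What Pydantic sees at runtime: a fresh timestamp per instance via default_factory.
            created_at: Annotated[datetime, Field(default_factory=lambda: datetime.now(timezone.utc))]
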
crawlee/_utils/time.py CHANGED
@@ -3,11 +3,14 @@ from __future__ import annotations
  import time
  from contextlib import contextmanager
  from dataclasses import dataclass
+ from datetime import timedelta
  from typing import TYPE_CHECKING
 
+ from async_timeout import Timeout, timeout
+
  if TYPE_CHECKING:
      from collections.abc import Iterator
-     from datetime import timedelta
+     from types import TracebackType
 
  _SECONDS_PER_MINUTE = 60
  _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
          result.cpu = after_cpu - before_cpu
 
 
+ class SharedTimeout:
+     """Keeps track of a time budget shared by multiple independent async operations.
+
+     Provides a reusable, non-reentrant context manager interface.
+     """
+
+     def __init__(self, timeout: timedelta) -> None:
+         self._remaining_timeout = timeout
+         self._active_timeout: Timeout | None = None
+         self._activation_timestamp: float | None = None
+
+     async def __aenter__(self) -> timedelta:
+         if self._active_timeout is not None or self._activation_timestamp is not None:
+             raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+         self._activation_timestamp = time.monotonic()
+         self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+         await new_timeout.__aenter__()
+         return self._remaining_timeout
+
+     async def __aexit__(
+         self,
+         exc_type: type[BaseException] | None,
+         exc_value: BaseException | None,
+         exc_traceback: TracebackType | None,
+     ) -> None:
+         if self._active_timeout is None or self._activation_timestamp is None:
+             raise RuntimeError('Logic error')
+
+         await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+         elapsed = time.monotonic() - self._activation_timestamp
+         self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+         self._active_timeout = None
+         self._activation_timestamp = None
+
+
  def format_duration(duration: timedelta | None) -> str:
      """Format a timedelta into a human-readable string with appropriate units."""
      if duration is None:
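
`SharedTimeout` lets several independently awaited sections draw from a single budget: each `async with` re-arms `async_timeout.timeout()` with whatever time is left and subtracts the elapsed time on exit. A hedged usage sketch; the two placeholder coroutines are made up, and only the constructor and context-manager protocol shown above are assumed:

    import asyncio
    from datetime import timedelta

    from crawlee._utils.time import SharedTimeout  # private module, defined in the diff above

    async def fetch_page() -> None:   # placeholder work
        await asyncio.sleep(0.1)

    async def parse_page() -> None:   # placeholder work
        await asyncio.sleep(0.1)

    async def run_with_budget() -> None:
        shared = SharedTimeout(timedelta(seconds=30))  # total budget for both phases

        async with shared:        # consumes part of the 30-second budget
            await fetch_page()

        async with shared:        # only the remaining budget applies here
            await parse_page()

    asyncio.run(run_with_budget())

Because the context manager is non-reentrant, the two blocks must run one after the other, never nested.
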
crawlee/browsers/_browser_pool.py CHANGED
@@ -138,7 +138,7 @@ class BrowserPool:
              kwargs: Additional arguments for default constructor.
          """
          plugin_options: dict = defaultdict(dict)
-         plugin_options['browser_launch_options'] = browser_launch_options or {}
+         plugin_options['browser_launch_options'] = dict(browser_launch_options) if browser_launch_options else {}
          plugin_options['browser_new_context_options'] = browser_new_context_options or {}
 
          if headless is not None:
crawlee/browsers/_playwright_browser.py CHANGED
@@ -78,7 +78,8 @@ class PlaywrightPersistentBrowser(Browser):
 
      async def _delete_temp_dir(self, _: BrowserContext | None) -> None:
          if self._temp_dir and self._temp_dir.exists():
-             await asyncio.to_thread(shutil.rmtree, self._temp_dir, ignore_errors=True)
+             temp_dir = self._temp_dir
+             await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)
 
      @override
      async def close(self, **kwargs: Any) -> None:
crawlee/crawlers/__init__.py CHANGED
@@ -1,7 +1,7 @@
  from crawlee._utils.try_import import install_import_hook as _install_import_hook
  from crawlee._utils.try_import import try_import as _try_import
 
- from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+ from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
  from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
  from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult
 
@@ -23,12 +23,14 @@ with _try_import(
      'AdaptivePlaywrightCrawler',
      'AdaptivePlaywrightCrawlingContext',
      'AdaptivePlaywrightPreNavCrawlingContext',
+     'AdaptivePlaywrightCrawlerStatisticState',
      'RenderingType',
      'RenderingTypePrediction',
      'RenderingTypePredictor',
  ):
      from ._adaptive_playwright import (
          AdaptivePlaywrightCrawler,
+         AdaptivePlaywrightCrawlerStatisticState,
          AdaptivePlaywrightCrawlingContext,
          AdaptivePlaywrightPreNavCrawlingContext,
          RenderingType,
@@ -41,6 +43,7 @@ __all__ = [
      'AbstractHttpCrawler',
      'AbstractHttpParser',
      'AdaptivePlaywrightCrawler',
+     'AdaptivePlaywrightCrawlerStatisticState',
      'AdaptivePlaywrightCrawlingContext',
      'AdaptivePlaywrightPreNavCrawlingContext',
      'BasicCrawler',
@@ -51,6 +54,7 @@ __all__ = [
      'BeautifulSoupParserType',
      'ContextPipeline',
      'HttpCrawler',
+     'HttpCrawlerOptions',
      'HttpCrawlingContext',
      'HttpCrawlingResult',
      'ParsedHttpCrawlingContext',
crawlee/crawlers/_abstract_http/__init__.py CHANGED
@@ -1,9 +1,10 @@
- from ._abstract_http_crawler import AbstractHttpCrawler
+ from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
  from ._abstract_http_parser import AbstractHttpParser
  from ._http_crawling_context import ParsedHttpCrawlingContext
 
  __all__ = [
      'AbstractHttpCrawler',
      'AbstractHttpParser',
+     'HttpCrawlerOptions',
      'ParsedHttpCrawlingContext',
  ]