crawlee 1.1.1b1__py3-none-any.whl → 1.2.1b7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of crawlee might be problematic.
Files changed (37)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_request.py +29 -10
  3. crawlee/_types.py +42 -2
  4. crawlee/_utils/context.py +2 -2
  5. crawlee/_utils/file.py +7 -0
  6. crawlee/_utils/recurring_task.py +2 -1
  7. crawlee/_utils/time.py +41 -1
  8. crawlee/crawlers/__init__.py +2 -1
  9. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  10. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +52 -14
  11. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +10 -33
  12. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  13. crawlee/crawlers/_basic/_basic_crawler.py +135 -118
  14. crawlee/crawlers/_basic/_context_utils.py +24 -0
  15. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  16. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  17. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  18. crawlee/crawlers/_playwright/_playwright_crawler.py +58 -17
  19. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  20. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  21. crawlee/crawlers/_playwright/_types.py +12 -2
  22. crawlee/errors.py +4 -0
  23. crawlee/events/_event_manager.py +1 -3
  24. crawlee/http_clients/_base.py +4 -0
  25. crawlee/http_clients/_curl_impersonate.py +12 -0
  26. crawlee/http_clients/_httpx.py +16 -6
  27. crawlee/http_clients/_impit.py +25 -10
  28. crawlee/router.py +13 -3
  29. crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
  30. crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
  31. crawlee/storage_clients/_file_system/_request_queue_client.py +3 -3
  32. crawlee/storage_clients/_sql/_storage_client.py +0 -9
  33. {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/METADATA +10 -16
  34. {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/RECORD +37 -36
  35. {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/WHEEL +1 -1
  36. {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/entry_points.txt +0 -0
  37. {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/licenses/LICENSE +0 -0
crawlee/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from importlib import metadata

- from ._request import Request, RequestOptions
+ from ._request import Request, RequestOptions, RequestState
  from ._service_locator import service_locator
  from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
  from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
  'HttpHeaders',
  'Request',
  'RequestOptions',
+ 'RequestState',
  'RequestTransformAction',
  'SkippedReason',
  'service_locator',
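
With `RequestState` now re-exported from the package root, the lifecycle enum can be imported directly. A minimal sketch (the `UNPROCESSED` default comes from the `_request.py` hunk below):

    from crawlee import Request, RequestState

    request = Request.from_url('https://crawlee.dev')
    # New requests are expected to start in the UNPROCESSED state, per the new
    # default on CrawleeRequestData.state shown in the next file.
    assert request.state is RequestState.UNPROCESSED
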
crawlee/_request.py CHANGED
@@ -34,14 +34,14 @@ class RequestState(IntEnum):
  class CrawleeRequestData(BaseModel):
  """Crawlee-specific configuration stored in the `user_data`."""

- max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+ max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
  """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
  `BasicCrawler`."""

  enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
  """The strategy that was used for enqueuing the request."""

- state: RequestState | None = None
+ state: RequestState = RequestState.UNPROCESSED
  """Describes the request's current lifecycle state."""

  session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
  always_enqueue: NotRequired[bool]
  user_data: NotRequired[dict[str, JsonSerializable]]
  no_retry: NotRequired[bool]
+ enqueue_strategy: NotRequired[EnqueueStrategy]
+ max_retries: NotRequired[int | None]


  @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):

  model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

- unique_key: Annotated[str, Field(alias='uniqueKey')]
+ unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
  """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
  to the same URL.

@@ -178,17 +180,18 @@ class Request(BaseModel):
  and specify which URLs shall be considered equal.
  """

- url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+ url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
  """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
  and fragments."""

- method: HttpMethod = 'GET'
+ method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
  """HTTP request method."""

  payload: Annotated[
  HttpPayload | None,
  BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
  PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+ Field(frozen=True),
  ] = None
  """HTTP request payload."""

@@ -250,6 +253,8 @@ class Request(BaseModel):
  keep_url_fragment: bool = False,
  use_extended_unique_key: bool = False,
  always_enqueue: bool = False,
+ enqueue_strategy: EnqueueStrategy | None = None,
+ max_retries: int | None = None,
  **kwargs: Any,
  ) -> Self:
  """Create a new `Request` instance from a URL.
@@ -277,6 +282,9 @@ class Request(BaseModel):
  `unique_key` computation. This is only relevant when `unique_key` is not provided.
  always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
  Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+ enqueue_strategy: The strategy that will be used for enqueuing the request.
+ max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+ option of `BasicCrawler`.
  **kwargs: Additional request properties.
  """
  if unique_key is not None and always_enqueue:
@@ -301,12 +309,27 @@ class Request(BaseModel):
  if always_enqueue:
  unique_key = f'{crypto_random_object_id()}|{unique_key}'

+ user_data_dict = kwargs.pop('user_data', {}) or {}
+ crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+ if max_retries is not None:
+ crawlee_data_dict['maxRetries'] = max_retries
+
+ if enqueue_strategy is not None:
+ crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+ crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+ if crawlee_data:
+ user_data_dict['__crawlee'] = crawlee_data
+
  request = cls(
  url=url,
  unique_key=unique_key,
  method=method,
  headers=headers,
  payload=payload,
+ user_data=user_data_dict,
  **kwargs,
  )

@@ -352,7 +375,7 @@ class Request(BaseModel):
  self.crawlee_data.crawl_depth = new_value

  @property
- def state(self) -> RequestState | None:
+ def state(self) -> RequestState:
  """Crawlee-specific request handling state."""
  return self.crawlee_data.state

@@ -365,10 +388,6 @@ class Request(BaseModel):
  """Crawlee-specific limit on the number of retries of the request."""
  return self.crawlee_data.max_retries

- @max_retries.setter
- def max_retries(self, new_max_retries: int) -> None:
- self.crawlee_data.max_retries = new_max_retries
-
  @property
  def session_rotation_count(self) -> int | None:
  """Crawlee-specific number of finished session rotations for the request."""
crawlee/_types.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations

  import dataclasses
  from collections.abc import Callable, Iterator, Mapping
+ from copy import deepcopy
  from dataclasses import dataclass
  from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload

@@ -15,7 +16,7 @@ if TYPE_CHECKING:
  import re
  from collections.abc import Callable, Coroutine, Sequence

- from typing_extensions import NotRequired, Required, Unpack
+ from typing_extensions import NotRequired, Required, Self, Unpack

  from crawlee import Glob, Request
  from crawlee._request import RequestOptions
@@ -260,12 +261,24 @@ class KeyValueStoreChangeRecords:
  class RequestHandlerRunResult:
  """Record of calls to storage-related context helpers."""

- def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None:
+ def __init__(
+ self,
+ *,
+ key_value_store_getter: GetKeyValueStoreFunction,
+ request: Request,
+ ) -> None:
  self._key_value_store_getter = key_value_store_getter
  self.add_requests_calls = list[AddRequestsKwargs]()
  self.push_data_calls = list[PushDataFunctionCall]()
  self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()

+ # Isolated copies for handler execution
+ self._request = deepcopy(request)
+
+ @property
+ def request(self) -> Request:
+ return self._request
+
  async def add_requests(
  self,
  requests: Sequence[str | Request],
@@ -315,6 +328,14 @@ class RequestHandlerRunResult:

  return self.key_value_store_changes[id, name, alias]

+ def apply_request_changes(self, target: Request) -> None:
+ """Apply tracked changes from handler copy to original request."""
+ if self.request.user_data != target.user_data:
+ target.user_data = self.request.user_data
+
+ if self.request.headers != target.headers:
+ target.headers = self.request.headers
+

  @docs_group('Functions')
  class AddRequestsFunction(Protocol):
@@ -643,6 +664,25 @@ class BasicCrawlingContext:
  """Return hash of the context. Each context is considered unique."""
  return id(self)

+ def create_modified_copy(
+ self,
+ push_data: PushDataFunction | None = None,
+ add_requests: AddRequestsFunction | None = None,
+ get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+ ) -> Self:
+ """Create a modified copy of the crawling context with specified changes."""
+ original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+ modified_fields = {
+ key: value
+ for key, value in {
+ 'push_data': push_data,
+ 'add_requests': add_requests,
+ 'get_key_value_store': get_key_value_store,
+ }.items()
+ if value
+ }
+ return self.__class__(**{**original_fields, **modified_fields})
+

  class GetDataKwargs(TypedDict):
  """Keyword arguments for dataset's `get_data` method."""
crawlee/_utils/context.py CHANGED
@@ -1,6 +1,6 @@
  from __future__ import annotations

- import asyncio
+ import inspect
  from collections.abc import Callable
  from functools import wraps
  from typing import Any, TypeVar
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:

  return await method(self, *args, **kwargs)

- return async_wrapper if asyncio.iscoroutinefunction(method) else sync_wrapper # type: ignore[return-value]
+ return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper # type: ignore[return-value]
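
Both this hunk and the `recurring_task.py` one below replace `asyncio.iscoroutinefunction` with `inspect.iscoroutinefunction`; the asyncio variant is deprecated as of Python 3.14. A minimal sketch of the same dispatch pattern (not the crawlee implementation):

    import inspect
    from functools import wraps

    def log_calls(func):
        # Return the async wrapper only for genuine coroutine functions.
        if inspect.iscoroutinefunction(func):
            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                print(f'calling {func.__name__}')
                return await func(*args, **kwargs)
            return async_wrapper

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            print(f'calling {func.__name__}')
            return func(*args, **kwargs)
        return sync_wrapper
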
crawlee/_utils/file.py CHANGED
@@ -163,6 +163,13 @@ async def export_csv_to_stream(
  dst: TextIO,
  **kwargs: Unpack[ExportDataCsvKwargs],
  ) -> None:
+ # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+ # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+ # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+ # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+ if 'lineterminator' not in kwargs:
+ kwargs['lineterminator'] = '\n'
+
  writer = csv.writer(dst, **kwargs) # type: ignore[arg-type]
  write_header = True
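
A small illustration of the newline behavior the comment above describes (hypothetical file name): the writer emits a bare '\n' and the text-mode stream performs the platform-specific translation, so Windows ends up with '\r\n' instead of '\r\r\n':

    import csv

    with open('items.csv', 'w', encoding='utf-8') as dst:
        writer = csv.writer(dst, lineterminator='\n')
        writer.writerow(['url', 'title'])
        writer.writerow(['https://crawlee.dev', 'Crawlee'])
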
crawlee/_utils/recurring_task.py CHANGED
@@ -1,6 +1,7 @@
  from __future__ import annotations

  import asyncio
+ import inspect
  from logging import getLogger
  from typing import TYPE_CHECKING

@@ -49,7 +50,7 @@ class RecurringTask:
  """
  sleep_time_secs = self.delay.total_seconds()
  while True:
- await self.func() if asyncio.iscoroutinefunction(self.func) else self.func()
+ await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
  await asyncio.sleep(sleep_time_secs)

  def start(self) -> None:
crawlee/_utils/time.py CHANGED
@@ -3,11 +3,14 @@ from __future__ import annotations
  import time
  from contextlib import contextmanager
  from dataclasses import dataclass
+ from datetime import timedelta
  from typing import TYPE_CHECKING

+ from async_timeout import Timeout, timeout
+
  if TYPE_CHECKING:
  from collections.abc import Iterator
- from datetime import timedelta
+ from types import TracebackType

  _SECONDS_PER_MINUTE = 60
  _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
  result.cpu = after_cpu - before_cpu


+ class SharedTimeout:
+ """Keeps track of a time budget shared by multiple independent async operations.
+
+ Provides a reusable, non-reentrant context manager interface.
+ """
+
+ def __init__(self, timeout: timedelta) -> None:
+ self._remaining_timeout = timeout
+ self._active_timeout: Timeout | None = None
+ self._activation_timestamp: float | None = None
+
+ async def __aenter__(self) -> timedelta:
+ if self._active_timeout is not None or self._activation_timestamp is not None:
+ raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+ self._activation_timestamp = time.monotonic()
+ self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+ await new_timeout.__aenter__()
+ return self._remaining_timeout
+
+ async def __aexit__(
+ self,
+ exc_type: type[BaseException] | None,
+ exc_value: BaseException | None,
+ exc_traceback: TracebackType | None,
+ ) -> None:
+ if self._active_timeout is None or self._activation_timestamp is None:
+ raise RuntimeError('Logic error')
+
+ await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+ elapsed = time.monotonic() - self._activation_timestamp
+ self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+ self._active_timeout = None
+ self._activation_timestamp = None
+
+
  def format_duration(duration: timedelta | None) -> str:
  """Format a timedelta into a human-readable string with appropriate units."""
  if duration is None:
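
A usage sketch of the new `SharedTimeout` helper: successive `async with` blocks draw on a single budget, so each block only gets what the previous ones left over. The import path follows the hunk above; this is an internal utility, not public API:

    import asyncio
    from datetime import timedelta

    from crawlee._utils.time import SharedTimeout

    async def main() -> None:
        budget = SharedTimeout(timedelta(seconds=10))

        async with budget:
            # Roughly 10 seconds available here; exceeding it raises a timeout error.
            await asyncio.sleep(1)

        async with budget as remaining:
            # Roughly 9 seconds left for this block.
            print(f'{remaining.total_seconds():.0f} seconds remaining')

    asyncio.run(main())
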
crawlee/crawlers/__init__.py CHANGED
@@ -1,7 +1,7 @@
  from crawlee._utils.try_import import install_import_hook as _install_import_hook
  from crawlee._utils.try_import import try_import as _try_import

- from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+ from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
  from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
  from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult

@@ -51,6 +51,7 @@ __all__ = [
  'BeautifulSoupParserType',
  'ContextPipeline',
  'HttpCrawler',
+ 'HttpCrawlerOptions',
  'HttpCrawlingContext',
  'HttpCrawlingResult',
  'ParsedHttpCrawlingContext',
crawlee/crawlers/_abstract_http/__init__.py CHANGED
@@ -1,9 +1,10 @@
- from ._abstract_http_crawler import AbstractHttpCrawler
+ from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
  from ._abstract_http_parser import AbstractHttpParser
  from ._http_crawling_context import ParsedHttpCrawlingContext

  __all__ = [
  'AbstractHttpCrawler',
  'AbstractHttpParser',
+ 'HttpCrawlerOptions',
  'ParsedHttpCrawlingContext',
  ]
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py CHANGED
@@ -3,14 +3,16 @@ from __future__ import annotations
  import asyncio
  import logging
  from abc import ABC
+ from datetime import timedelta
  from typing import TYPE_CHECKING, Any, Generic

  from more_itertools import partition
  from pydantic import ValidationError
- from typing_extensions import TypeVar
+ from typing_extensions import NotRequired, TypeVar

- from crawlee._request import Request, RequestOptions
+ from crawlee._request import Request, RequestOptions, RequestState
  from crawlee._utils.docs import docs_group
+ from crawlee._utils.time import SharedTimeout
  from crawlee._utils.urls import to_absolute_url_iterator
  from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
  from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
  TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


+ class HttpCrawlerOptions(
+ BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+ Generic[TCrawlingContext, TStatisticsState],
+ ):
+ """Arguments for the `AbstractHttpCrawler` constructor.
+
+ It is intended for typing forwarded `__init__` arguments in the subclasses.
+ """
+
+ navigation_timeout: NotRequired[timedelta | None]
+ """Timeout for the HTTP request."""
+
+
  @docs_group('Crawlers')
  class AbstractHttpCrawler(
  BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
  self,
  *,
  parser: AbstractHttpParser[TParseResult, TSelectResult],
+ navigation_timeout: timedelta | None = None,
  **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
  ) -> None:
  self._parser = parser
+ self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
  self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+ self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

  if '_context_pipeline' not in kwargs:
  raise ValueError(
@@ -112,9 +130,17 @@ class AbstractHttpCrawler(
  async def _execute_pre_navigation_hooks(
  self, context: BasicCrawlingContext
  ) -> AsyncGenerator[BasicCrawlingContext, None]:
- for hook in self._pre_navigation_hooks:
- await hook(context)
- yield context
+ context_id = id(context)
+ self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+ try:
+ for hook in self._pre_navigation_hooks:
+ async with self._shared_navigation_timeouts[context_id]:
+ await hook(context)
+
+ yield context
+ finally:
+ self._shared_navigation_timeouts.pop(context_id, None)

  async def _parse_http_response(
  self, context: HttpCrawlingContext
@@ -165,11 +191,18 @@ class AbstractHttpCrawler(
  robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

  kwargs.setdefault('strategy', 'same-hostname')
+ strategy = kwargs.get('strategy', 'same-hostname')

  links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
- links_iterator = to_absolute_url_iterator(
- context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+
+ # Get base URL from <base> tag if present
+ extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+ base_url: str = (
+ str(extracted_base_urls[0])
+ if extracted_base_urls
+ else context.request.loaded_url or context.request.url
  )
+ links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

  if robots_txt_file:
  skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -177,7 +210,9 @@ class AbstractHttpCrawler(
  skipped = iter([])

  for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
- request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
+ request_options = RequestOptions(
+ url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+ )

  if transform_request_function:
  transform_request_options = transform_request_function(request_options)
@@ -216,13 +251,16 @@ class AbstractHttpCrawler(
  Yields:
  The original crawling context enhanced by HTTP response.
  """
- result = await self._http_client.crawl(
- request=context.request,
- session=context.session,
- proxy_info=context.proxy_info,
- statistics=self._statistics,
- )
+ async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+ result = await self._http_client.crawl(
+ request=context.request,
+ session=context.session,
+ proxy_info=context.proxy_info,
+ statistics=self._statistics,
+ timeout=remaining_timeout,
+ )

+ context.request.state = RequestState.AFTER_NAV
  yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

  async def _handle_status_code_response(
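
Here `navigation_timeout` (defaulting to one minute) is the budget that `SharedTimeout` splits between the pre-navigation hooks and the HTTP fetch itself. A sketch, assuming the concrete HTTP crawlers forward `HttpCrawlerOptions` (including the new option) to this constructor:

    from datetime import timedelta

    from crawlee.crawlers import ParselCrawler

    # Pre-navigation hooks and the request share this 30-second budget instead of
    # the one-minute default set in AbstractHttpCrawler.__init__.
    crawler = ParselCrawler(navigation_timeout=timedelta(seconds=30))
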
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py CHANGED
@@ -290,11 +290,14 @@ class AdaptivePlaywrightCrawler(
  use_state_function = context.use_state

  # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
- result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+ result = RequestHandlerRunResult(
+ key_value_store_getter=self.get_key_value_store,
+ request=context.request,
+ )
  context_linked_to_result = BasicCrawlingContext(
- request=deepcopy(context.request),
- session=deepcopy(context.session),
- proxy_info=deepcopy(context.proxy_info),
+ request=result.request,
+ session=context.session,
+ proxy_info=context.proxy_info,
  send_request=context.send_request,
  add_requests=result.add_requests,
  push_data=result.push_data,
@@ -314,7 +317,7 @@ class AdaptivePlaywrightCrawler(
  ),
  logger=self._logger,
  )
- return SubCrawlerRun(result=result, run_context=context_linked_to_result)
+ return SubCrawlerRun(result=result)
  except Exception as e:
  return SubCrawlerRun(exception=e)

@@ -370,8 +373,7 @@ class AdaptivePlaywrightCrawler(
  self.track_http_only_request_handler_runs()

  static_run = await self._crawl_one(rendering_type='static', context=context)
- if static_run.result and static_run.run_context and self.result_checker(static_run.result):
- self._update_context_from_copy(context, static_run.run_context)
+ if static_run.result and self.result_checker(static_run.result):
  self._context_result_map[context] = static_run.result
  return
  if static_run.exception:
@@ -402,7 +404,7 @@ class AdaptivePlaywrightCrawler(
  if pw_run.exception is not None:
  raise pw_run.exception

- if pw_run.result and pw_run.run_context:
+ if pw_run.result:
  if should_detect_rendering_type:
  detection_result: RenderingType
  static_run = await self._crawl_one('static', context=context, state=old_state_copy)
@@ -414,7 +416,6 @@ class AdaptivePlaywrightCrawler(
  context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
  self.rendering_type_predictor.store_result(context.request, detection_result)

- self._update_context_from_copy(context, pw_run.run_context)
  self._context_result_map[context] = pw_run.result

  def pre_navigation_hook(
@@ -451,32 +452,8 @@ class AdaptivePlaywrightCrawler(
  def track_rendering_type_mispredictions(self) -> None:
  self.statistics.state.rendering_type_mispredictions += 1

- def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
- """Update mutable fields of `context` from `context_copy`.
-
- Uses object.__setattr__ to bypass frozen dataclass restrictions,
- allowing state synchronization after isolated crawler execution.
- """
- updating_attributes = {
- 'request': ('headers', 'user_data'),
- 'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
- }
-
- for attr, sub_attrs in updating_attributes.items():
- original_sub_obj = getattr(context, attr)
- copy_sub_obj = getattr(context_copy, attr)
-
- # Check that both sub objects are not None
- if original_sub_obj is None or copy_sub_obj is None:
- continue
-
- for sub_attr in sub_attrs:
- new_value = getattr(copy_sub_obj, sub_attr)
- object.__setattr__(original_sub_obj, sub_attr, new_value)
-

  @dataclass(frozen=True)
  class SubCrawlerRun:
  result: RequestHandlerRunResult | None = None
  exception: Exception | None = None
- run_context: BasicCrawlingContext | None = None
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py CHANGED
@@ -17,7 +17,7 @@ if TYPE_CHECKING:
  from playwright.async_api import Page, Response
  from typing_extensions import Self

- from crawlee.crawlers._playwright._types import BlockRequestsFunction
+ from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


  TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
  http_response = await PlaywrightHttpResponse.from_playwright_response(
  response=context.response, protocol=protocol_guess or ''
  )
- # block_requests is useful only on pre-navigation contexts. It is useless here.
+ # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
  context_kwargs.pop('block_requests')
+ context_kwargs.pop('goto_options')
  return cls(
  parsed_content=await parser.parse(http_response),
  http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
  block_requests: BlockRequestsFunction | None = None
  """Blocks network requests matching specified URL patterns."""

+ goto_options: GotoOptions | None = None
+ """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
  @property
  def page(self) -> Page:
  """The Playwright `Page` object for the current page.