crawlee 1.1.1b1__py3-none-any.whl → 1.2.1b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/__init__.py +2 -1
- crawlee/_request.py +29 -10
- crawlee/_types.py +42 -2
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/recurring_task.py +2 -1
- crawlee/_utils/time.py +41 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +52 -14
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +10 -33
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +135 -118
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +58 -17
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +1 -3
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/router.py +13 -3
- crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
- crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
- crawlee/storage_clients/_file_system/_request_queue_client.py +3 -3
- crawlee/storage_clients/_sql/_storage_client.py +0 -9
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/METADATA +10 -16
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/RECORD +37 -36
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/WHEEL +1 -1
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/entry_points.txt +0 -0
- {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/licenses/LICENSE +0 -0
crawlee/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from importlib import metadata
 
-from ._request import Request, RequestOptions
+from ._request import Request, RequestOptions, RequestState
 from ._service_locator import service_locator
 from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
 from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
     'HttpHeaders',
     'Request',
     'RequestOptions',
+    'RequestState',
     'RequestTransformAction',
     'SkippedReason',
     'service_locator',
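Usage note: `RequestState` is now re-exported from the package root, and new requests default to `RequestState.UNPROCESSED` (see the `_request.py` diff below). A minimal sketch, assuming crawlee >= 1.2 is installed:

from crawlee import Request, RequestState

request = Request.from_url('https://example.com')
print(request.state is RequestState.UNPROCESSED)   # True - the new default per the diff below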
crawlee/_request.py
CHANGED
@@ -34,14 +34,14 @@ class RequestState(IntEnum):
 class CrawleeRequestData(BaseModel):
     """Crawlee-specific configuration stored in the `user_data`."""
 
-    max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+    max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
     """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
     `BasicCrawler`."""
 
     enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
     """The strategy that was used for enqueuing the request."""
 
-    state: RequestState
+    state: RequestState = RequestState.UNPROCESSED
     """Describes the request's current lifecycle state."""
 
     session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
     always_enqueue: NotRequired[bool]
     user_data: NotRequired[dict[str, JsonSerializable]]
     no_retry: NotRequired[bool]
+    enqueue_strategy: NotRequired[EnqueueStrategy]
+    max_retries: NotRequired[int | None]
 
 
 @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):
 
     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-    unique_key: Annotated[str, Field(alias='uniqueKey')]
+    unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
     to the same URL.
 
@@ -178,17 +180,18 @@ class Request(BaseModel):
     and specify which URLs shall be considered equal.
     """
 
-    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+    url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
     """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
     and fragments."""
 
-    method: HttpMethod = 'GET'
+    method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
    """HTTP request method."""
 
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
         PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+        Field(frozen=True),
     ] = None
     """HTTP request payload."""
 
@@ -250,6 +253,8 @@ class Request(BaseModel):
         keep_url_fragment: bool = False,
         use_extended_unique_key: bool = False,
         always_enqueue: bool = False,
+        enqueue_strategy: EnqueueStrategy | None = None,
+        max_retries: int | None = None,
         **kwargs: Any,
     ) -> Self:
         """Create a new `Request` instance from a URL.
@@ -277,6 +282,9 @@ class Request(BaseModel):
                 `unique_key` computation. This is only relevant when `unique_key` is not provided.
             always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                 Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+            enqueue_strategy: The strategy that will be used for enqueuing the request.
+            max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+                option of `BasicCrawler`.
             **kwargs: Additional request properties.
         """
         if unique_key is not None and always_enqueue:
@@ -301,12 +309,27 @@ class Request(BaseModel):
         if always_enqueue:
             unique_key = f'{crypto_random_object_id()}|{unique_key}'
 
+        user_data_dict = kwargs.pop('user_data', {}) or {}
+        crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+        if max_retries is not None:
+            crawlee_data_dict['maxRetries'] = max_retries
+
+        if enqueue_strategy is not None:
+            crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+        crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+        if crawlee_data:
+            user_data_dict['__crawlee'] = crawlee_data
+
         request = cls(
             url=url,
             unique_key=unique_key,
             method=method,
             headers=headers,
             payload=payload,
+            user_data=user_data_dict,
             **kwargs,
         )
 
@@ -352,7 +375,7 @@ class Request(BaseModel):
         self.crawlee_data.crawl_depth = new_value
 
     @property
-    def state(self) -> RequestState
+    def state(self) -> RequestState:
         """Crawlee-specific request handling state."""
         return self.crawlee_data.state
 
@@ -365,10 +388,6 @@ class Request(BaseModel):
         """Crawlee-specific limit on the number of retries of the request."""
         return self.crawlee_data.max_retries
 
-    @max_retries.setter
-    def max_retries(self, new_max_retries: int) -> None:
-        self.crawlee_data.max_retries = new_max_retries
-
     @property
     def session_rotation_count(self) -> int | None:
         """Crawlee-specific number of finished session rotations for the request."""
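Taken together, the changes above mean that `Request.from_url` can now set a per-request retry limit and enqueue strategy up front (stored under `user_data['__crawlee']`), while `max_retries` becomes read-only afterwards because the field is frozen and its setter was removed. A minimal sketch, assuming crawlee >= 1.2:

from crawlee import Request

request = Request.from_url(
    'https://example.com/start',
    max_retries=2,                    # per-request override of the crawler's max_request_retries
    enqueue_strategy='same-hostname',
)
print(request.max_retries)            # 2 - now a read-only property; the setter is gone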
crawlee/_types.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import dataclasses
 from collections.abc import Callable, Iterator, Mapping
+from copy import deepcopy
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
 
@@ -15,7 +16,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence
 
-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack
 
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -260,12 +261,24 @@ class KeyValueStoreChangeRecords:
 class RequestHandlerRunResult:
     """Record of calls to storage-related context helpers."""
 
-    def __init__(
+    def __init__(
+        self,
+        *,
+        key_value_store_getter: GetKeyValueStoreFunction,
+        request: Request,
+    ) -> None:
         self._key_value_store_getter = key_value_store_getter
         self.add_requests_calls = list[AddRequestsKwargs]()
         self.push_data_calls = list[PushDataFunctionCall]()
         self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()
 
+        # Isolated copies for handler execution
+        self._request = deepcopy(request)
+
+    @property
+    def request(self) -> Request:
+        return self._request
+
     async def add_requests(
         self,
         requests: Sequence[str | Request],
@@ -315,6 +328,14 @@ class RequestHandlerRunResult:
 
         return self.key_value_store_changes[id, name, alias]
 
+    def apply_request_changes(self, target: Request) -> None:
+        """Apply tracked changes from handler copy to original request."""
+        if self.request.user_data != target.user_data:
+            target.user_data = self.request.user_data
+
+        if self.request.headers != target.headers:
+            target.headers = self.request.headers
+
 
 @docs_group('Functions')
 class AddRequestsFunction(Protocol):
@@ -643,6 +664,25 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
 
+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+        modified_fields = {
+            key: value
+            for key, value in {
+                'push_data': push_data,
+                'add_requests': add_requests,
+                'get_key_value_store': get_key_value_store,
+            }.items()
+            if value
+        }
+        return self.__class__(**{**original_fields, **modified_fields})
+
 
 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""
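The result object now works on a deep copy of the request, and `apply_request_changes` copies `user_data` and `headers` back only after the handler run is accepted. A self-contained sketch of that copy-then-commit pattern (the `FakeRequest` class and helper below are illustrative stand-ins, not crawlee APIs):

from copy import deepcopy
from dataclasses import dataclass, field

@dataclass
class FakeRequest:  # illustrative stand-in for crawlee.Request
    url: str
    user_data: dict = field(default_factory=dict)
    headers: dict = field(default_factory=dict)

def run_handler_isolated(handler, request: FakeRequest) -> None:
    working_copy = deepcopy(request)   # the handler mutates a copy, never the original
    handler(working_copy)
    # Commit only the tracked fields back, mirroring apply_request_changes above.
    if working_copy.user_data != request.user_data:
        request.user_data = working_copy.user_data
    if working_copy.headers != request.headers:
        request.headers = working_copy.headers

request = FakeRequest('https://example.com')
run_handler_isolated(lambda r: r.user_data.update(label='DETAIL'), request)
print(request.user_data)   # {'label': 'DETAIL'}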
crawlee/_utils/context.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import
+import inspect
 from collections.abc import Callable
 from functools import wraps
 from typing import Any, TypeVar
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:
 
         return await method(self, *args, **kwargs)
 
-    return async_wrapper if
+    return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
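The decorator now picks its wrapper with `inspect.iscoroutinefunction`. A self-contained sketch of the same dispatch pattern (illustrative, not the library code):

import asyncio
import inspect
from functools import wraps

def log_calls(func):
    """Wrap either a sync or an async callable, choosing the wrapper at decoration time."""
    @wraps(func)
    def sync_wrapper(*args, **kwargs):
        print(f'calling {func.__name__}')
        return func(*args, **kwargs)

    @wraps(func)
    async def async_wrapper(*args, **kwargs):
        print(f'calling {func.__name__}')
        return await func(*args, **kwargs)

    return async_wrapper if inspect.iscoroutinefunction(func) else sync_wrapper

@log_calls
async def fetch():
    return 42

print(asyncio.run(fetch()))   # prints "calling fetch" then 42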
crawlee/_utils/file.py
CHANGED
@@ -163,6 +163,13 @@ async def export_csv_to_stream(
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+
     writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
     write_header = True
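For illustration, the difference the new default makes (standard-library behaviour; the file names are arbitrary):

import csv

# csv.writer's default lineterminator is '\r\n'. When the destination is opened in text
# mode on Windows, every '\n' is additionally translated to '\r\n', producing '\r\r\n'.
with open('rows_default.csv', 'w') as dst:
    csv.writer(dst).writerow(['a', 'b'])                        # Windows text mode: 'a,b\r\r\n'

# Passing lineterminator='\n' leaves the translation to the platform:
# '\n' stays '\n' on Unix and becomes '\r\n' on Windows.
with open('rows_fixed.csv', 'w') as dst:
    csv.writer(dst, lineterminator='\n').writerow(['a', 'b'])   # Windows text mode: 'a,b\r\n'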
crawlee/_utils/recurring_task.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import inspect
 from logging import getLogger
 from typing import TYPE_CHECKING
 
@@ -49,7 +50,7 @@ class RecurringTask:
         """
         sleep_time_secs = self.delay.total_seconds()
         while True:
-            await self.func() if
+            await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
             await asyncio.sleep(sleep_time_secs)
 
     def start(self) -> None:
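The loop awaits `self.func()` only when it is a coroutine function, so both sync and async callables can be scheduled. A self-contained sketch of the same pattern (illustrative names, not the library class):

import asyncio
import inspect
from datetime import timedelta

async def run_recurring(func, delay: timedelta, repetitions: int) -> None:
    """Call `func` every `delay`, awaiting it only when it is an async callable."""
    for _ in range(repetitions):
        await func() if inspect.iscoroutinefunction(func) else func()
        await asyncio.sleep(delay.total_seconds())

async def tick_async() -> None:
    print('async tick')

asyncio.run(run_recurring(tick_async, delay=timedelta(seconds=0.1), repetitions=3))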
crawlee/_utils/time.py
CHANGED
@@ -3,11 +3,14 @@ from __future__ import annotations
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import timedelta
 from typing import TYPE_CHECKING
 
+from async_timeout import Timeout, timeout
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from
+    from types import TracebackType
 
 _SECONDS_PER_MINUTE = 60
 _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
         result.cpu = after_cpu - before_cpu
 
 
+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations.
+
+    Provides a reusable, non-reentrant context manager interface.
+    """
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
 def format_duration(duration: timedelta | None) -> str:
     """Format a timedelta into a human-readable string with appropriate units."""
     if duration is None:
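The intended usage pattern is that several sequential `async with` blocks draw from one shared budget, each enter receiving whatever time is left. A rough sketch of that pattern (imports an internal helper added in this release; requires the `async-timeout` package):

import asyncio
from datetime import timedelta

from crawlee._utils.time import SharedTimeout  # internal helper introduced in this release

async def main() -> None:
    budget = SharedTimeout(timedelta(seconds=2))

    async with budget as remaining:
        print(f'first phase, remaining budget: {remaining}')
        await asyncio.sleep(0.5)

    async with budget as remaining:   # roughly 1.5 s of the original 2 s is left
        print(f'second phase, remaining budget: {remaining}')
        await asyncio.sleep(0.5)      # sleeping past the leftover budget raises a timeout error

asyncio.run(main())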
crawlee/crawlers/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import
 
-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult
 
@@ -51,6 +51,7 @@ __all__ = [
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',
crawlee/crawlers/_abstract_http/__init__.py
CHANGED
@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext
 
 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED
@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar
 
-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -112,9 +130,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-
-
-
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     async def _parse_http_response(
         self, context: HttpCrawlingContext
@@ -165,11 +191,18 @@ class AbstractHttpCrawler(
         robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
         kwargs.setdefault('strategy', 'same-hostname')
+        strategy = kwargs.get('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-
-
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
         )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -177,7 +210,9 @@ class AbstractHttpCrawler(
             skipped = iter([])
 
         for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-            request_options = RequestOptions(
+            request_options = RequestOptions(
+                url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+            )
 
             if transform_request_function:
                 transform_request_options = transform_request_function(request_options)
@@ -216,13 +251,16 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-
-
-
-
-
-
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )
 
+        context.request.state = RequestState.AFTER_NAV
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
 
     async def _handle_status_code_response(
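The new `navigation_timeout` option (defaulting to one minute) is shared via `SharedTimeout` between the pre-navigation hooks and the HTTP request itself, and `HttpCrawlerOptions` exposes it for subclass constructors. A hedged sketch of passing it to a concrete crawler, assuming the concrete HTTP crawlers (e.g. `ParselCrawler`, per the `_parsel_crawler.py` entry in the file list) forward these options:

import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

async def main() -> None:
    crawler = ParselCrawler(
        navigation_timeout=timedelta(seconds=30),  # assumed new option forwarded via HttpCrawlerOptions
        max_request_retries=1,
    )

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        title = context.selector.css('title::text').get()
        await context.push_data({'url': context.request.url, 'title': title})

    await crawler.run(['https://crawlee.dev'])

asyncio.run(main())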
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
CHANGED
@@ -290,11 +290,14 @@ class AdaptivePlaywrightCrawler(
         use_state_function = context.use_state
 
         # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
-        result = RequestHandlerRunResult(
+        result = RequestHandlerRunResult(
+            key_value_store_getter=self.get_key_value_store,
+            request=context.request,
+        )
         context_linked_to_result = BasicCrawlingContext(
-            request=
-            session=
-            proxy_info=
+            request=result.request,
+            session=context.session,
+            proxy_info=context.proxy_info,
             send_request=context.send_request,
             add_requests=result.add_requests,
             push_data=result.push_data,
@@ -314,7 +317,7 @@ class AdaptivePlaywrightCrawler(
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result
+            return SubCrawlerRun(result=result)
         except Exception as e:
             return SubCrawlerRun(exception=e)
 
@@ -370,8 +373,7 @@ class AdaptivePlaywrightCrawler(
             self.track_http_only_request_handler_runs()
 
             static_run = await self._crawl_one(rendering_type='static', context=context)
-            if static_run.result and
-                self._update_context_from_copy(context, static_run.run_context)
+            if static_run.result and self.result_checker(static_run.result):
                 self._context_result_map[context] = static_run.result
                 return
             if static_run.exception:
@@ -402,7 +404,7 @@ class AdaptivePlaywrightCrawler(
         if pw_run.exception is not None:
             raise pw_run.exception
 
-        if pw_run.result
+        if pw_run.result:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
@@ -414,7 +416,6 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)
 
-            self._update_context_from_copy(context, pw_run.run_context)
             self._context_result_map[context] = pw_run.result
 
     def pre_navigation_hook(
@@ -451,32 +452,8 @@ class AdaptivePlaywrightCrawler(
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1
 
-    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
-        """Update mutable fields of `context` from `context_copy`.
-
-        Uses object.__setattr__ to bypass frozen dataclass restrictions,
-        allowing state synchronization after isolated crawler execution.
-        """
-        updating_attributes = {
-            'request': ('headers', 'user_data'),
-            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
-        }
-
-        for attr, sub_attrs in updating_attributes.items():
-            original_sub_obj = getattr(context, attr)
-            copy_sub_obj = getattr(context_copy, attr)
-
-            # Check that both sub objects are not None
-            if original_sub_obj is None or copy_sub_obj is None:
-                continue
-
-            for sub_attr in sub_attrs:
-                new_value = getattr(copy_sub_obj, sub_attr)
-                object.__setattr__(original_sub_obj, sub_attr, new_value)
-
 
 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
-    run_context: BasicCrawlingContext | None = None
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
CHANGED
@@ -17,7 +17,7 @@
     from playwright.async_api import Page, Response
     from typing_extensions import Self
 
-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions
 
 
 TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
|