crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +156 -131
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_key_value_store.py +5 -2
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/_utils/time.py
CHANGED
@@ -3,11 +3,14 @@ from __future__ import annotations
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import timedelta
 from typing import TYPE_CHECKING

+from async_timeout import Timeout, timeout
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from …
+    from types import TracebackType

 _SECONDS_PER_MINUTE = 60
 _SECONDS_PER_HOUR = 3600

@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
         result.cpu = after_cpu - before_cpu


+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations.
+
+    Provides a reusable, non-reentrant context manager interface.
+    """
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
 def format_duration(duration: timedelta | None) -> str:
     """Format a timedelta into a human-readable string with appropriate units."""
     if duration is None:
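The new `SharedTimeout` keeps one time budget across several sequential `async with` blocks: each block deducts its elapsed wall-clock time from the remainder, and the underlying `async_timeout` cancels the block once the budget runs out. A minimal usage sketch of this internal helper (the `asyncio.sleep` calls merely stand in for real work):

```python
import asyncio
from datetime import timedelta

from crawlee._utils.time import SharedTimeout  # internal helper added in this release


async def main() -> None:
    budget = SharedTimeout(timedelta(seconds=1))  # one second shared by all blocks below

    async with budget:
        await asyncio.sleep(0.4)  # consumes ~0.4 s of the budget

    async with budget:
        await asyncio.sleep(0.4)  # only ~0.6 s remain; sleeping past that raises TimeoutError


asyncio.run(main())
```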
crawlee/_utils/urls.py
CHANGED
@@ -7,6 +7,7 @@ from yarl import URL

 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger


 def is_url_absolute(url: str) -> bool:

@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))


-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
     """Convert an iterator of relative URLs to absolute URLs using a base URL."""
     for url in urls:
         if is_url_absolute(url):
             yield url
         else:
-            …
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url


 _http_url_adapter = TypeAdapter(AnyHttpUrl)
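With the new optional `logger` parameter, links that cannot be converted to an absolute URL (such as `mailto:` links, per the comment above) are logged at DEBUG level and skipped rather than yielded. A small illustrative sketch (the example URLs are made up):

```python
import logging

from crawlee._utils.urls import to_absolute_url_iterator

logger = logging.getLogger(__name__)

links = iter(['/about', 'https://example.com/docs', 'mailto:team@example.com'])
absolute = list(to_absolute_url_iterator('https://example.com', links, logger=logger))
# absolute == ['https://example.com/about', 'https://example.com/docs']
# the 'mailto:' entry is reported via logger.debug() and dropped
```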
crawlee/browsers/_browser_pool.py
CHANGED

@@ -118,7 +118,10 @@ class BrowserPool:
         """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.

         Args:
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided

@@ -135,7 +138,7 @@ class BrowserPool:
             kwargs: Additional arguments for default constructor.
         """
         plugin_options: dict = defaultdict(dict)
-        plugin_options['browser_launch_options'] = browser_launch_options
+        plugin_options['browser_launch_options'] = dict(browser_launch_options) if browser_launch_options else {}
         plugin_options['browser_new_context_options'] = browser_new_context_options or {}

         if headless is not None:
crawlee/browsers/_playwright_browser.py
CHANGED

@@ -78,7 +78,8 @@ class PlaywrightPersistentBrowser(Browser):

     async def _delete_temp_dir(self, _: BrowserContext | None) -> None:
         if self._temp_dir and self._temp_dir.exists():
-            …
+            temp_dir = self._temp_dir
+            await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)

     @override
     async def close(self, **kwargs: Any) -> None:
crawlee/browsers/_playwright_browser_controller.py
CHANGED

@@ -216,7 +216,7 @@ class PlaywrightBrowserController(BrowserController):
         browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
         if proxy_info:
             if browser_new_context_options.get('proxy'):
-                logger.warning("browser_new_context_options['proxy'] …
+                logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

             browser_new_context_options['proxy'] = ProxySettings(
                 server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
crawlee/browsers/_playwright_browser_plugin.py
CHANGED

@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):

     It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
     for creating new browser instances and provides a unified interface for interacting with different browser types
-    (chromium, firefox, and …
-    executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
+    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
+    mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
     browser instance, ensuring that resource limits are respected.
     """

@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
         """Initialize a new instance.

         Args:
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                 storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided

@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
             'chromium_sandbox': not config.disable_browser_sandbox,
         }

+        if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
+            raise ValueError(
+                'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
+            )
+
+        # Map 'chrome' to 'chromium' with the 'chrome' channel.
+        if browser_type == 'chrome':
+            browser_type = 'chromium'
+            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+            default_launch_browser_options['channel'] = 'chrome'
+
         self._browser_type: BrowserType = browser_type
         self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
         self._browser_new_context_options = browser_new_context_options or {}
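Both the plugin and `BrowserPool` now accept `browser_type='chrome'`, which is internally rewritten to Chromium launched with `channel='chrome'`, so Playwright drives the locally installed Google Chrome. A hedged sketch of opting into it from `PlaywrightCrawler`, which forwards `browser_type` to the plugin (requires Chrome to be installed locally):

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # 'chrome' uses the locally installed Google Chrome via Chromium's channel='chrome';
    # it cannot be combined with a custom executable_path / default_browser_path.
    crawler = PlaywrightCrawler(browser_type='chrome', headless=True)

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}: {await context.page.title()}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```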
crawlee/browsers/_types.py
CHANGED
crawlee/configuration.py
CHANGED
@@ -28,7 +28,9 @@ class Configuration(BaseSettings):
     Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
     """

-    …
+    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
+    model_config = SettingsConfigDict(populate_by_name=True)

     internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
     """Timeout for the internal asynchronous operations."""
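With `populate_by_name=True` in `model_config`, a setting can be supplied either by its field name or by its `crawlee_`-prefixed alias (which is what the `CRAWLEE_*` environment variables map to). A brief illustration, not taken from the package docs:

```python
from datetime import timedelta

from crawlee.configuration import Configuration

# By field name (allowed thanks to populate_by_name=True)...
config = Configuration(internal_timeout=timedelta(minutes=2))

# ...or by alias, mirroring the CRAWLEE_INTERNAL_TIMEOUT environment variable.
config = Configuration(crawlee_internal_timeout=timedelta(minutes=2))
```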
crawlee/crawlers/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import

-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult

@@ -23,12 +23,14 @@ with _try_import(
     'AdaptivePlaywrightCrawler',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'RenderingType',
     'RenderingTypePrediction',
     'RenderingTypePredictor',
 ):
     from ._adaptive_playwright import (
         AdaptivePlaywrightCrawler,
+        AdaptivePlaywrightCrawlerStatisticState,
         AdaptivePlaywrightCrawlingContext,
         AdaptivePlaywrightPreNavCrawlingContext,
         RenderingType,

@@ -41,6 +43,7 @@ __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
     'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
     'BasicCrawler',

@@ -51,6 +54,7 @@ __all__ = [
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',
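`HttpCrawlerOptions` and `AdaptivePlaywrightCrawlerStatisticState` are now importable straight from `crawlee.crawlers`. `HttpCrawlerOptions` is intended for typing constructor kwargs that a custom crawler forwards to the HTTP crawler base class; a hedged sketch (the wrapper class is hypothetical, and it assumes `ParselCrawler` accepts these forwarded options):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

from crawlee.crawlers import HttpCrawlerOptions, ParselCrawler, ParselCrawlingContext

if TYPE_CHECKING:
    from typing_extensions import Unpack


class MyParselCrawler(ParselCrawler):
    """Hypothetical wrapper that forwards typed constructor options to the base crawler."""

    def __init__(self, **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]]) -> None:
        super().__init__(**kwargs)
```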
crawlee/crawlers/_abstract_http/__init__.py
CHANGED

@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext

 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED

@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic

 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar

-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError

@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],

@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

         if '_context_pipeline' not in kwargs:
             raise ValueError(

@@ -82,9 +100,7 @@ class AbstractHttpCrawler(
         this method simplifies cases where `TParseResult` is used for both generic parameters.
         """

-        class _ParsedHttpCrawler(
-            AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]
-        ):
+        class _ParsedHttpCrawler(AbstractHttpCrawler):
             def __init__(
                 self,
                 parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,

@@ -112,9 +128,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        …
-        …
-        …
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     async def _parse_http_response(
         self, context: HttpCrawlingContext

@@ -165,9 +189,18 @@ class AbstractHttpCrawler(
         robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

         kwargs.setdefault('strategy', 'same-hostname')
+        strategy = kwargs.get('strategy', 'same-hostname')

         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        …
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
+        )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)

@@ -175,7 +208,9 @@ class AbstractHttpCrawler(
             skipped = iter([])

         for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-            request_options = RequestOptions(…
+            request_options = RequestOptions(
+                url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+            )

             if transform_request_function:
                 transform_request_options = transform_request_function(request_options)

@@ -214,13 +249,16 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        …
-        …
-        …
-        …
-        …
-        …
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )

+        context.request.state = RequestState.AFTER_NAV
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

     async def _handle_status_code_response(
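The new `navigation_timeout` (one minute by default) is the budget that `SharedTimeout` spreads across the pre-navigation hooks and the HTTP request of a single page. A hedged sketch, assuming the concrete crawlers such as `ParselCrawler` forward the option to `AbstractHttpCrawler`:

```python
import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # Pre-navigation hooks plus the HTTP request must finish within 10 seconds per page.
    crawler = ParselCrawler(navigation_timeout=timedelta(seconds=10))

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.push_data({
            'url': context.request.url,
            'title': context.selector.css('title::text').get(),
        })

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```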
crawlee/crawlers/_adaptive_playwright/__init__.py
CHANGED

@@ -11,13 +11,16 @@ _install_import_hook(__name__)

 # The following imports are wrapped in try_import to handle optional dependencies,
 # ensuring the module can still function even if these dependencies are missing.
-with _try_import(__name__, '…
+with _try_import(__name__, 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor'):
     from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor
-with _try_import(__name__, '…
+with _try_import(__name__, 'AdaptivePlaywrightCrawler'):
     from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
+with _try_import(__name__, 'AdaptivePlaywrightCrawlerStatisticState'):
+    from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawlerStatisticState

 __all__ = [
     'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
     'RenderingType',
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
CHANGED

@@ -27,23 +27,16 @@ from crawlee.crawlers import (
 )
 from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
 from crawlee.crawlers._parsel._parsel_parser import ParselParser
+from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
 from crawlee.statistics import Statistics, StatisticsState

-from ._adaptive_playwright_crawler_statistics import (
-    AdaptivePlaywrightCrawlerStatisticState,
-)
+from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
 from ._adaptive_playwright_crawling_context import (
     AdaptivePlaywrightCrawlingContext,
     AdaptivePlaywrightPreNavCrawlingContext,
 )
-from ._rendering_type_predictor import (
-    DefaultRenderingTypePredictor,
-    RenderingType,
-    RenderingTypePredictor,
-)
-from ._result_comparator import (
-    create_default_comparator,
-)
+from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
+from ._result_comparator import create_default_comparator

 if TYPE_CHECKING:
     from types import TracebackType

@@ -51,7 +44,6 @@ if TYPE_CHECKING:
     from typing_extensions import Unpack

     from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions
-    from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions


 TStaticParseResult = TypeVar('TStaticParseResult')

@@ -71,7 +63,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self

     async def __aexit__(

@@ -149,10 +140,6 @@ class AdaptivePlaywrightCrawler(
                 non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)

@@ -162,19 +149,21 @@ class AdaptivePlaywrightCrawler(
         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

-        …
+        adaptive_statistics = statistics or Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
+
+        super().__init__(statistics=adaptive_statistics, **kwargs)

         # Sub crawlers related.
-        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or …
+        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()

         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler…
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler…
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)

@@ -295,11 +284,14 @@ class AdaptivePlaywrightCrawler(
         use_state_function = context.use_state

         # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
-        result = RequestHandlerRunResult(…
+        result = RequestHandlerRunResult(
+            key_value_store_getter=self.get_key_value_store,
+            request=context.request,
+        )
         context_linked_to_result = BasicCrawlingContext(
-            request=…
-            session=…
-            proxy_info=…
+            request=result.request,
+            session=context.session,
+            proxy_info=context.proxy_info,
             send_request=context.send_request,
             add_requests=result.add_requests,
             push_data=result.push_data,

@@ -337,7 +329,7 @@ class AdaptivePlaywrightCrawler(
                 )
                 await self.router(adaptive_crawling_context)

-            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)
+            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

         if rendering_type == 'client only':

@@ -347,7 +339,7 @@ class AdaptivePlaywrightCrawler(
                 )
                 await self.router(adaptive_crawling_context)

-            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)
+            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]

         raise RuntimeError(
             f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}'

@@ -407,12 +399,9 @@ class AdaptivePlaywrightCrawler(
             raise pw_run.exception

         if pw_run.result:
-            self._context_result_map[context] = pw_run.result
-
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                 if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                     detection_result = 'static'
                 else:

@@ -421,6 +410,8 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)

+            self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,

crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
CHANGED

@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from playwright.async_api import Page, Response
     from typing_extensions import Self

-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


 TStaticParseResult = TypeVar('TStaticParseResult')

@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests …
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,

@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""

+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
|