crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee has been flagged as potentially problematic by the registry.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_context_utils.py
ADDED
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from crawlee._request import Request
+
+    from ._basic_crawling_context import BasicCrawlingContext
+
+
+@contextmanager
+def swapped_context(
+    context: BasicCrawlingContext,
+    request: Request,
+) -> Iterator[None]:
+    """Replace context's isolated copies with originals after handler execution."""
+    try:
+        yield
+    finally:
+        # Restore original context state to avoid side effects between different handlers.
+        object.__setattr__(context, 'request', request)
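The new `swapped_context` helper restores the original request onto a frozen context once a handler finishes. The restore-in-`finally` pattern is easiest to see in isolation; the sketch below uses a made-up `FakeContext` dataclass in place of Crawlee's real `BasicCrawlingContext`:

from contextlib import contextmanager
from dataclasses import dataclass
from collections.abc import Iterator


@dataclass(frozen=True)
class FakeContext:
    request: str


@contextmanager
def swapped(context: FakeContext, original_request: str) -> Iterator[None]:
    try:
        yield
    finally:
        # Frozen dataclasses reject normal attribute assignment, so the original
        # value is restored through object.__setattr__, as in the diff above.
        object.__setattr__(context, 'request', original_request)


ctx = FakeContext(request='original')
with swapped(ctx, ctx.request):
    object.__setattr__(ctx, 'request', 'isolated copy')  # the handler sees a copy
assert ctx.request == 'original'  # the original is back after the block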
crawlee/crawlers/_basic/_logging_utils.py
CHANGED
@@ -2,9 +2,21 @@ import asyncio
 import re
 import traceback
 
+import crawlee.errors
+
 
 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,24 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )
 
 
 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-
+        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+        # code and third line the topmost user error
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
+    elif 'playwright._impl._errors.Error' in str(error.__class__):
+        # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
+        # point to deep internals.
+        return ''
     else:
         traceback_parts = _get_traceback_parts_for_innermost_exception(error)
         # Commonly last traceback part is type of the error, and the second last part is the relevant file.
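These summary helpers rely on walking an exception chain down to its root. A self-contained illustration of the `__cause__`/`__context__` traversal — a simplified version of `_get_only_innermost_exception` without the `UserHandlerTimeoutError` special case:

def innermost(error: BaseException) -> BaseException:
    # Explicit chaining (`raise ... from ...`) sets __cause__; implicit chaining
    # inside an except block sets __context__.
    if error.__cause__:
        return innermost(error.__cause__)
    if error.__context__:
        return innermost(error.__context__)
    return error


try:
    try:
        raise ValueError('root cause')
    except ValueError as inner:
        raise RuntimeError('wrapper') from inner
except RuntimeError as outer:
    assert type(innermost(outer)) is ValueError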
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py
CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
crawlee/crawlers/_parsel/_parsel_crawler.py
CHANGED
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler,
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto
 
     def __init__(
         self,
-        **kwargs: Unpack[
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
    ) -> None:
        """Initialize a new instance.
 
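Both crawler constructors above now type their keyword arguments as `Unpack[HttpCrawlerOptions[...]]`. The sketch below shows the general `Unpack` + `TypedDict` idiom with a made-up `GreeterOptions`; it is not Crawlee code, only the typing pattern:

from typing_extensions import NotRequired, TypedDict, Unpack


class GreeterOptions(TypedDict):
    greeting: NotRequired[str]
    punctuation: NotRequired[str]


def greet(name: str, **kwargs: Unpack[GreeterOptions]) -> str:
    # Type checkers validate the keyword arguments against GreeterOptions,
    # instead of accepting arbitrary **kwargs.
    return f'{kwargs.get("greeting", "Hello")}, {name}{kwargs.get("punctuation", "!")}'


print(greet('world', greeting='Hi'))  # -> Hi, world!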
crawlee/crawlers/_parsel/_parsel_parser.py
CHANGED
@@ -22,7 +22,7 @@ class ParselParser(AbstractHttpParser[Selector, Selector]):
     @override
     async def parse(self, response: HttpResponse) -> Selector:
         response_body = await response.read()
-        return await asyncio.to_thread(
+        return await asyncio.to_thread(Selector, body=response_body)
 
     @override
     async def parse_text(self, text: str) -> Selector:
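`ParselParser.parse` now constructs the `Selector` inside `asyncio.to_thread`, keeping CPU-bound parsing off the event loop. A minimal stand-alone version of the same pattern (requires `parsel` to be installed):

import asyncio

from parsel import Selector


async def parse_off_loop(body: bytes) -> Selector:
    # Build the Selector in a worker thread; the event loop keeps running.
    return await asyncio.to_thread(Selector, body=body)


selector = asyncio.run(parse_off_loop(b'<html><body><p>hi</p></body></html>'))
print(selector.css('p::text').get())  # -> hi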
crawlee/crawlers/_playwright/_playwright_crawler.py
CHANGED
@@ -3,18 +3,22 @@ from __future__ import annotations
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal
 
+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar
 
 from crawlee import service_locator
-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -28,6 +32,7 @@ from crawlee.statistics import StatisticsState
 from ._playwright_crawling_context import PlaywrightCrawlingContext
 from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
+from ._types import GotoOptions
 from ._utils import block_requests, infinite_scroll
 
 TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
@@ -43,7 +48,6 @@ if TYPE_CHECKING:
 
     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,
@@ -102,9 +106,11 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         user_data_dir: str | Path | None = None,
         browser_launch_options: Mapping[str, Any] | None = None,
         browser_new_context_options: Mapping[str, Any] | None = None,
+        goto_options: GotoOptions | None = None,
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.
@@ -113,7 +119,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
                 This option should not be used if `browser_pool` is provided.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
                 directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -130,12 +139,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
+            goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
+                not supported, use `navigation_timeout` instead.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
         if configuration is not None:
             service_locator.set_configuration(configuration)
 
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(
@@ -152,17 +167,16 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             ):
                 raise ValueError(
                     'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir`
+                    '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                     '`fingerprint_generator` arguments when `browser_pool` is provided.'
                 )
 
         # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
         else:
             if fingerprint_generator == 'default':
-
-
-
-                generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]
+                generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = (
+                    [fingerprint_browser_type_from_playwright_browser_type(browser_type)] if browser_type else None
+                )
 
                 fingerprint_generator = DefaultFingerprintGenerator(
                     header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
@@ -194,6 +208,13 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
 
         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']
 
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+        self._goto_options = goto_options or GotoOptions()
+
         super().__init__(**kwargs)
 
     async def _open_page(
@@ -218,12 +239,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             log=context.log,
             page=crawlee_page.page,
             block_requests=partial(block_requests, page=crawlee_page.page),
+            goto_options=GotoOptions(**self._goto_options),
         )
 
-
-
-
-
+        context_id = id(pre_navigation_context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            async with browser_page_context(crawlee_page.page):
+                for hook in self._pre_navigation_hooks:
+                    async with self._shared_navigation_timeouts[context_id]:
+                        await hook(pre_navigation_context)
+
+            yield pre_navigation_context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     def _prepare_request_interceptor(
         self,
@@ -258,6 +288,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.
 
         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -289,7 +320,14 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         # Set route_handler only for current request
         await context.page.route(context.request.url, route_handler)
 
-
+        try:
+            async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                response = await context.page.goto(
+                    context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
+                )
+                context.request.state = RequestState.AFTER_NAV
+        except playwright.async_api.TimeoutError as exc:
+            raise asyncio.TimeoutError from exc
 
         if response is None:
             raise SessionError(f'Failed to load the URL: {context.request.url}')
@@ -316,6 +354,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             extract_links=extract_links,
             enqueue_links=self._create_enqueue_links_function(context, extract_links),
             block_requests=partial(block_requests, page=context.page),
+            goto_options=context.goto_options,
         )
 
         if context.session:
@@ -356,12 +395,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
         kwargs.setdefault('strategy', 'same-hostname')
+        strategy = kwargs.get('strategy', 'same-hostname')
 
         elements = await context.page.query_selector_all(selector)
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-
+
+        # Get base URL from <base> tag if present
+        extracted_base_url = await context.page.evaluate('document.baseURI')
+        base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -369,17 +414,19 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             skipped = iter([])
 
         for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-
+            request_options = RequestOptions(
+                url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+            )
 
             if transform_request_function:
-
-                if
+                transform_request_options = transform_request_function(request_options)
+                if transform_request_options == 'skip':
                     continue
-                if
-
+                if transform_request_options != 'unchanged':
+                    request_options = transform_request_options
 
             try:
-                request = Request.from_url(**
+                request = Request.from_url(**request_options)
             except ValidationError as exc:
                 context.log.debug(
                     f'Skipping URL "{url}" due to invalid format: {exc}. '
@@ -465,7 +512,8 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
 
     async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None:
         """Update the cookies in the page context."""
-
+        # False positive ty error, see https://github.com/astral-sh/ty/issues/1493.
+        await page.context.add_cookies([{**cookie} for cookie in cookies])  # ty: ignore[invalid-argument-type]
 
     async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
         """Find the robots.txt file for a given URL.
@@ -489,7 +537,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""
 
     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
    This option should not be used if `browser_pool` is provided."""
 
     browser_launch_options: NotRequired[Mapping[str, Any]]
@@ -509,9 +559,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
 
 
 class PlaywrightCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _PlaywrightCrawlerAdditionalOptions,
     BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `AbstractHttpCrawler` constructor.
 
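Based only on the constructor signature shown in this diff, the new navigation options would be used roughly as follows; treat this as a sketch rather than verified release behavior:

from datetime import timedelta

from crawlee.crawlers import PlaywrightCrawler

crawler = PlaywrightCrawler(
    # Shared budget for opening the page, running pre-navigation hooks and
    # Page.goto(); defaults to one minute when not given.
    navigation_timeout=timedelta(seconds=30),
    # Forwarded to Page.goto(); `timeout` is deliberately not accepted here.
    goto_options={'wait_until': 'domcontentloaded'},
)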
crawlee/crawlers/_playwright/_playwright_http_client.py
CHANGED
@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')
 
@@ -72,6 +73,7 @@ class PlaywrightHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@ class PlaywrightHttpClient(HttpClient):
 
         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url,
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )
 
         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py
CHANGED
@@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group
 if TYPE_CHECKING:
     from playwright.async_api import Page
 
-    from ._types import BlockRequestsFunction
+    from ._types import BlockRequestsFunction, GotoOptions
 
 
 @dataclass(frozen=True)
@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     async def get_snapshot(self) -> PageSnapshot:
         """Get snapshot of crawled page."""
         html = None
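Since each pre-navigation context receives its own copy of the crawler-level `GotoOptions` (see `GotoOptions(**self._goto_options)` in the crawler diff above), a hook can adjust them per request. A hedged sketch using Crawlee's `pre_navigation_hook` decorator; the host name is purely illustrative:

from crawlee.crawlers import PlaywrightCrawler, PlaywrightPreNavCrawlingContext

crawler = PlaywrightCrawler()


@crawler.pre_navigation_hook
async def relax_wait_for_slow_hosts(context: PlaywrightPreNavCrawlingContext) -> None:
    # `goto_options` is a per-request copy, so mutating it here affects only this navigation.
    if 'slow.example.com' in context.request.url:
        context.goto_options['wait_until'] = 'domcontentloaded'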
crawlee/crawlers/_playwright/_types.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Protocol
+from typing import TYPE_CHECKING, Literal, Protocol, TypedDict
 
 from crawlee import HttpHeaders
 from crawlee._utils.docs import docs_group
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
 
     from playwright.async_api import APIResponse, Response
-    from typing_extensions import Self
+    from typing_extensions import NotRequired, Self
 
 
 @docs_group('Functions')
@@ -58,3 +58,13 @@ class PlaywrightHttpResponse:
         _content = await response.body()
 
         return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
+
+
+class GotoOptions(TypedDict):
+    """Keyword arguments for Playwright's `Page.goto()` method."""
+
+    wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
+    """When to consider operation succeeded, defaults to 'load' event."""
+
+    referer: NotRequired[str]
+    """Referer header value."""
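`GotoOptions` is a plain `TypedDict`, so constructor-style calls and dict literals are interchangeable. The import path below is the private module shown in this diff and may not be a stable public API:

from crawlee.crawlers._playwright._types import GotoOptions

opts = GotoOptions(wait_until='networkidle', referer='https://example.com/')
assert opts == {'wait_until': 'networkidle', 'referer': 'https://example.com/'}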
crawlee/errors.py
CHANGED
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
     """Wraps an exception thrown from an user-defined error handler."""
 
 
+class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
+    """Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out."""
+
+
 @docs_group('Errors')
 class SessionError(Exception):
     """Errors of `SessionError` type will trigger a session rotation.
crawlee/events/_event_manager.py
CHANGED
@@ -130,11 +130,13 @@ class EventManager:
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
         self._active = False
 
     @overload
@@ -172,13 +174,12 @@ class EventManager:
         # to avoid blocking the event loop
         coro = (
             listener(*bound_args.args, **bound_args.kwargs)
-            if
+            if inspect.iscoroutinefunction(listener)
             else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
         )
-        # Note: use `asyncio.iscoroutinefunction` rather then `inspect.iscoroutinefunction` since it works with
-        # unittests.mock.AsyncMock. See https://github.com/python/cpython/issues/84753.
 
-
+        listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
+        listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}')
         self._listener_tasks.add(listener_task)
 
         try:
@@ -189,7 +190,12 @@ class EventManager:
             # We need to swallow the exception and just log it here, otherwise it could break the event emitter
             logger.exception(
                 'Exception in the event listener',
-                extra={
+                extra={
+                    'event_name': event.value,
+                    'listener_name': listener.__name__
+                    if hasattr(listener, '__name__')
+                    else listener.__class__.__name__,
+                },
             )
         finally:
             logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...')
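The listener wrapper now derives a readable name before scheduling the task. The naming pattern on its own, which makes listener tasks identifiable in `asyncio` debug output:

import asyncio


async def persist_state_listener() -> None:
    await asyncio.sleep(0)


async def main() -> None:
    listener = persist_state_listener
    # Callables without __name__ (e.g. class instances) fall back to their class name.
    listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__
    task = asyncio.create_task(listener(), name=f'Task-persistState-{listener_name}')
    print(task.get_name())  # -> Task-persistState-persist_state_listener
    await task


asyncio.run(main())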
crawlee/events/_types.py
CHANGED
@@ -40,7 +40,7 @@ class Event(str, Enum):
 class EventPersistStateData(BaseModel):
     """Data for the persist state event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     is_migrating: Annotated[bool, Field(alias='isMigrating')]
 
@@ -49,7 +49,7 @@ class EventPersistStateData(BaseModel):
 class EventSystemInfoData(BaseModel):
     """Data for the system info event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]
     memory_info: Annotated[
@@ -62,7 +62,7 @@ class EventSystemInfoData(BaseModel):
 class EventMigratingData(BaseModel):
     """Data for the migrating event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     # The remaining time in seconds before the migration is forced and the process is killed
     # Optional because it's not present when the event handler is called manually
@@ -73,21 +73,21 @@ class EventMigratingData(BaseModel):
 class EventAbortingData(BaseModel):
     """Data for the aborting event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
 
 @docs_group('Event data')
 class EventExitData(BaseModel):
     """Data for the exit event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
 
 @docs_group('Event data')
 class EventCrawlerStatusData(BaseModel):
     """Data for the crawler status event."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     message: str
     """A message describing the current status of the crawler."""
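All event-data models now opt into `validate_by_name=True, validate_by_alias=True`. Assuming Pydantic v2.11+ (where these `ConfigDict` keys were introduced), that lets aliased fields be populated either way:

from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class PersistState(BaseModel):
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    is_migrating: Annotated[bool, Field(alias='isMigrating')]


assert PersistState(isMigrating=True).is_migrating   # populated by alias
assert PersistState(is_migrating=True).is_migrating  # populated by field name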
crawlee/fingerprint_suite/_browserforge_adapter.py
CHANGED
@@ -154,7 +154,7 @@ class PatchedHeaderGenerator(bf_HeaderGenerator):
 class PatchedFingerprintGenerator(bf_FingerprintGenerator):
     """Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo."""
 
-    def __init__(
+    def __init__(
         self,
         *,
         screen: Screen | None = None,
crawlee/fingerprint_suite/_fingerprint_generator.py
CHANGED
@@ -3,10 +3,13 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING
 
+from crawlee._utils.docs import docs_group
+
 if TYPE_CHECKING:
     from browserforge.fingerprints import Fingerprint
 
 
+@docs_group('Other')
 class FingerprintGenerator(ABC):
     """A class for creating browser fingerprints that mimic browser fingerprints of real users."""
 
crawlee/fingerprint_suite/_header_generator.py
CHANGED
@@ -11,9 +11,9 @@ if TYPE_CHECKING:
 
 
 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
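The mapping now folds both Chromium-engine identifiers onto the single 'chrome' fingerprint profile. A stand-alone sketch of the same logic; the 'webkit' → 'safari' fall-through is an assumption extrapolated from `SupportedBrowserType` and is not shown in this hunk:

def to_fingerprint_type(playwright_type: str) -> str:
    # 'chromium' (Playwright-managed) and 'chrome' (locally installed) share one profile.
    if playwright_type in {'chromium', 'chrome'}:
        return 'chrome'
    if playwright_type == 'firefox':
        return 'firefox'
    return 'safari'  # assumed fall-through for 'webkit'


assert to_fingerprint_type('chrome') == to_fingerprint_type('chromium') == 'chrome'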
crawlee/fingerprint_suite/_types.py
CHANGED
@@ -11,7 +11,7 @@ SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']
 
 
 class ScreenOptions(BaseModel):
-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)
 
     """Defines the screen constrains for the fingerprint generator."""
 
@@ -31,7 +31,7 @@ class ScreenOptions(BaseModel):
 class HeaderGeneratorOptions(BaseModel):
     """Collection of header related attributes that can be used by the fingerprint generator."""
 
-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)
 
     browsers: list[SupportedBrowserType] | None = None
     """List of BrowserSpecifications to generate the headers for."""