crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_service_locator.py +4 -4
  5. crawlee/_types.py +44 -5
  6. crawlee/_utils/context.py +3 -3
  7. crawlee/_utils/file.py +8 -1
  8. crawlee/_utils/globs.py +4 -4
  9. crawlee/_utils/recoverable_state.py +32 -8
  10. crawlee/_utils/recurring_task.py +27 -3
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +13 -6
  13. crawlee/_utils/system.py +27 -11
  14. crawlee/_utils/time.py +41 -1
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +5 -2
  17. crawlee/browsers/_playwright_browser.py +2 -1
  18. crawlee/browsers/_playwright_browser_controller.py +1 -1
  19. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  20. crawlee/browsers/_types.py +1 -1
  21. crawlee/configuration.py +3 -1
  22. crawlee/crawlers/__init__.py +5 -1
  23. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  24. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
  25. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  28. crawlee/crawlers/_basic/_basic_crawler.py +156 -131
  29. crawlee/crawlers/_basic/_context_utils.py +24 -0
  30. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  31. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  32. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/errors.py +4 -0
  39. crawlee/events/_event_manager.py +12 -6
  40. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/http_clients/_base.py +4 -0
  43. crawlee/http_clients/_curl_impersonate.py +68 -14
  44. crawlee/http_clients/_httpx.py +16 -6
  45. crawlee/http_clients/_impit.py +25 -10
  46. crawlee/otel/crawler_instrumentor.py +4 -6
  47. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  48. crawlee/router.py +13 -3
  49. crawlee/sessions/_cookies.py +13 -8
  50. crawlee/sessions/_models.py +3 -3
  51. crawlee/sessions/_session_pool.py +1 -1
  52. crawlee/statistics/_error_snapshotter.py +1 -1
  53. crawlee/statistics/_models.py +51 -9
  54. crawlee/statistics/_statistics.py +24 -33
  55. crawlee/storage_clients/__init__.py +4 -0
  56. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  57. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  58. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  59. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  60. crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
  61. crawlee/storage_clients/_redis/__init__.py +6 -0
  62. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  63. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  64. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  65. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  66. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  67. crawlee/storage_clients/_redis/_utils.py +23 -0
  68. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  69. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  70. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  71. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  72. crawlee/storage_clients/_redis/py.typed +0 -0
  73. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  74. crawlee/storage_clients/_sql/_db_models.py +1 -2
  75. crawlee/storage_clients/models.py +8 -3
  76. crawlee/storages/_key_value_store.py +5 -2
  77. crawlee/storages/_storage_instance_manager.py +103 -44
  78. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
  79. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
  80. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  81. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  82. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -55,6 +59,7 @@ from crawlee.errors import (
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
@@ -64,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
 from ._context_pipeline import ContextPipeline
+from ._context_utils import swapped_context
 from ._logging_utils import (
     get_one_line_error_summary_if_possible,
     reduce_asyncio_timeout_error_to_relevant_traceback_parts,
@@ -96,6 +102,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -401,7 +410,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()
 
         # Context pipeline
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]
 
         # Crawl settings
         self._max_request_retries = max_request_retries
@@ -437,14 +446,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._statistics_log_format = statistics_log_format
 
         # Statistics
-        self._statistics = statistics or cast(
-            'Statistics[TStatisticsState]',
-            Statistics.with_default_state(
-                periodic_message_logger=self._logger,
-                statistics_log_format=self._statistics_log_format,
-                log_message='Current request statistics:',
-            ),
-        )
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )
 
         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
@@ -511,6 +529,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -609,7 +645,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -619,7 +655,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
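
The two hunks above route user-registered error and failed-request handlers through _wrap_handler_with_error_context, so the context helpers (push_data, get_key_value_store, add_requests) passed to those handlers write directly to storage instead of going through the RequestHandlerRunResult that is discarded when a request fails. A minimal sketch of a handler relying on this behaviour; the simulated failure and the recorded fields are illustrative assumptions, not taken from this diff:

    import asyncio

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


    async def main() -> None:
        crawler = BasicCrawler(max_request_retries=1)

        @crawler.router.default_handler
        async def default_handler(context: BasicCrawlingContext) -> None:
            raise RuntimeError('simulated handler failure')  # drive the request into the failed path

        @crawler.failed_request_handler
        async def on_failed(context: BasicCrawlingContext, error: Exception) -> None:
            # With the wrapped context, this write is persisted even though the request itself failed.
            await context.push_data({'url': context.request.url, 'error': str(error)})

        await crawler.run(['https://example.com'])


    asyncio.run(main())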
@@ -689,7 +725,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except CancelledError:
             pass
         finally:
-            await self._crawler_state_rec_task.stop()
             if threading.current_thread() is threading.main_thread():
                 with suppress(NotImplementedError):
                     asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -721,8 +756,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def _run_crawler(self) -> None:
         event_manager = self._service_locator.get_event_manager()
 
-        self._crawler_state_rec_task.start()
-
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [
@@ -733,6 +766,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False
@@ -740,7 +774,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         async with AsyncExitStack() as exit_stack:
             for context in contexts_to_enter:
-                await exit_stack.enter_async_context(context)  # type: ignore[arg-type]
+                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]
 
             await self._autoscaled_pool.run()
 
@@ -839,6 +873,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
 
@@ -851,6 +886,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -860,13 +896,18 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             configuration=self._service_locator.get_configuration(),
         )
 
-        path = path if isinstance(path, Path) else Path(path)
-        dst = path.open('w', newline='')
+        path = Path(path)
 
         if path.suffix == '.csv':
-            await export_csv_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-            await export_json_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')
 
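
For context, export_data now renders the whole export into an in-memory StringIO buffer, writes it with atomic_write, and forwards any extra keyword arguments to the CSV or JSON serializer via ExportDataCsvKwargs / ExportDataJsonKwargs. A hedged usage sketch; the specific keyword names used below (delimiter, indent) are assumed to mirror the standard csv/json module options and are not confirmed by this diff:

    from crawlee.crawlers import BasicCrawler


    async def export_results(crawler: BasicCrawler) -> None:
        # The file suffix selects the exporter; extra kwargs go to the matching serializer.
        await crawler.export_data('results.csv', delimiter=';')
        await crawler.export_data('results.json', indent=2)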
@@ -972,6 +1013,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 label=label,
                 user_data=user_data,
                 transform_request_function=transform_request_function,
+                **kwargs,
             ),
             rq_id=rq_id,
             rq_name=rq_name,
@@ -997,7 +1039,12 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             warning_flag = True
 
             for request in request_iterator:
-                target_url = request.url if isinstance(request, Request) else request
+                if isinstance(request, Request):
+                    if request.enqueue_strategy != strategy:
+                        request.enqueue_strategy = strategy
+                    target_url = request.url
+                else:
+                    target_url = request
                 parsed_target_url = urlparse(target_url)
 
                 if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
@@ -1035,8 +1082,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             return target_url.hostname == origin_url.hostname
 
         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain
 
         if strategy == 'same-origin':
@@ -1094,7 +1141,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             request.retry_count += 1
             reduced_error = str(error).split('\n')[0]
             self.log.warning(
-                f'Retrying request to {context.request.url} due to: {reduced_error}'
+                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                 f'{get_one_line_error_summary_if_possible(error)}'
             )
             await self._statistics.error_tracker.add(error=error, context=context)
@@ -1105,19 +1152,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 except Exception as e:
                     raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                 else:
-                    if new_request is not None:
-                        request = new_request
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return
 
             await request_manager.reclaim_request(request)
         else:
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            request.state = RequestState.ERROR
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)
 
@@ -1132,8 +1175,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 f'{self._internal_timeout.total_seconds()} seconds',
                 logger=self._logger,
             )
-
-            context.request.state = RequestState.DONE
         except UserDefinedErrorHandlerError:
             context.request.state = RequestState.ERROR
             raise
@@ -1166,17 +1207,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-            request_manager = await self.get_request_manager()
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
             request.state = RequestState.SKIPPED
+            await self._mark_request_as_handled(request)
 
         url = request.url if isinstance(request, Request) else request
 
@@ -1196,10 +1228,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         if (
             isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
             and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
             used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.append(traceback_parts[-1])
+            used_traceback_parts.extend(traceback_parts[-1:])
 
         return ''.join(used_traceback_parts).strip('\n')
 
@@ -1248,58 +1281,54 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         else:
             yield Request.from_url(url)
 
-    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
-        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
-        result = self._context_result_map[context]
-
-        base_request_manager = await self.get_request_manager()
-
-        origin = context.request.loaded_url or context.request.url
-
-        for add_requests_call in result.add_requests_calls:
-            rq_id = add_requests_call.get('rq_id')
-            rq_name = add_requests_call.get('rq_name')
-            rq_alias = add_requests_call.get('rq_alias')
-            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
-            if specified_params > 1:
-                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
-            if rq_id or rq_name or rq_alias:
-                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
-                    id=rq_id,
-                    name=rq_name,
-                    alias=rq_alias,
-                    storage_client=self._service_locator.get_storage_client(),
-                    configuration=self._service_locator.get_configuration(),
-                )
-            else:
-                request_manager = base_request_manager
-
-            requests = list[Request]()
-
-            base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-            requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-            enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'}  # type: ignore[assignment]
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-            filter_requests_iterator = self._enqueue_links_filter_iterator(
-                requests_iterator, context.request.url, **enqueue_links_kwargs
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-            for dst_request in filter_requests_iterator:
-                # Update the crawl depth of the request.
-                dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-                if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
-                    requests.append(dst_request)
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-            await request_manager.add_requests(requests)
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
 
         await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)
 
+        result.apply_request_changes(target=context.request)
+
     @staticmethod
     async def _commit_key_value_store_changes(
         result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
@@ -1365,10 +1394,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         else:
             session = await self._get_session()
             proxy_info = await self._get_proxy_info(request, session)
-        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)
 
         context = BasicCrawlingContext(
-            request=request,
+            request=result.request,
             session=session,
             proxy_info=proxy_info,
             send_request=self._prepare_send_request_function(session, proxy_info),
@@ -1385,32 +1414,26 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         try:
             request.state = RequestState.REQUEST_HANDLER
 
-            self._check_request_collision(context.request, context.session)
-
             try:
-                await self._run_request_handler(context=context)
+                with swapped_context(context, request):
+                    self._check_request_collision(request, session)
+                    await self._run_request_handler(context=context)
             except asyncio.TimeoutError as e:
                 raise RequestHandlerError(e, context) from e
 
             await self._commit_request_handler_result(context)
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
 
             request.state = RequestState.DONE
 
-            if context.session and context.session.is_usable:
-                context.session.mark_good()
+            await self._mark_request_as_handled(request)
+
+            if session and session.is_usable:
+                session.mark_good()
 
             self._statistics.record_request_processing_finish(request.unique_key)
 
         except RequestCollisionError as request_error:
-            context.request.no_retry = True
+            request.no_retry = True
            await self._handle_request_error(context, request_error)
 
         except RequestHandlerError as primary_error:
@@ -1425,7 +1448,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)
 
         except SessionError as session_error:
-            if not context.session:
+            if not session:
                 raise RuntimeError('SessionError raised in a crawling context without a session') from session_error
 
             if self._error_handler:
@@ -1435,22 +1458,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
             self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)
 
-            context.session.retire()
+            if session:
+                session.retire()
 
             # Increment session rotation count.
-            context.request.session_rotation_count = (context.request.session_rotation_count or 0) + 1
+            request.session_rotation_count = (request.session_rotation_count or 0) + 1
 
             await request_manager.reclaim_request(request)
             await self._statistics.error_tracker_retry.add(error=session_error, context=context)
         else:
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
             await self._handle_failed_request(context, session_error)
             self._statistics.record_request_processing_failure(request.unique_key)
@@ -1458,14 +1475,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
 
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1483,12 +1493,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise
 
     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        await wait_for(
-            lambda: self._context_pipeline(context, self.router),
-            timeout=self._request_handler_timeout,
-            timeout_message=f'{self._request_handler_timeout_text}'
-            f' {self._request_handler_timeout.total_seconds()} seconds',
-            logger=self._logger,
+        context.request.state = RequestState.BEFORE_NAV
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )
 
     def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1636,3 +1650,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         )
 
         self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )

crawlee/crawlers/_basic/_context_utils.py (new file)

@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from crawlee._request import Request
+
+    from ._basic_crawling_context import BasicCrawlingContext
+
+
+@contextmanager
+def swapped_context(
+    context: BasicCrawlingContext,
+    request: Request,
+) -> Iterator[None]:
+    """Replace context's isolated copies with originals after handler execution."""
+    try:
+        yield
+    finally:
+        # Restore original context state to avoid side effects between different handlers.
+        object.__setattr__(context, 'request', request)

crawlee/crawlers/_basic/_logging_utils.py

@@ -2,9 +2,21 @@ import asyncio
 import re
 import traceback
 
+import crawlee.errors
+
 
 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception."""
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )
 
 
 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-        most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+        # code and third line the topmost user error
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
     elif 'playwright._impl._errors.Error' in str(error.__class__):
         # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
         # point to deep internals.

crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
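
The last hunk types BeautifulSoupCrawler's **kwargs as HttpCrawlerOptions instead of BasicCrawlerOptions, so HTTP-layer options are visible to type checkers alongside the generic crawler options. A small sketch, assuming the option names used below are unchanged in this release:

    from crawlee.crawlers import BeautifulSoupCrawler

    crawler = BeautifulSoupCrawler(
        parser='lxml',                              # parser selection, unchanged by this diff
        max_request_retries=2,                      # generic BasicCrawler option
        additional_http_error_status_codes=[403],   # HTTP-layer option covered by HttpCrawlerOptions
    )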