crawlee 1.0.1b9__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +62 -32
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +52 -19
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +160 -134
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +8 -7
- crawlee/storage_clients/_file_system/_key_value_store_client.py +9 -6
- crawlee/storage_clients/_file_system/_request_queue_client.py +31 -12
- crawlee/storage_clients/_memory/_dataset_client.py +2 -2
- crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
- crawlee/storage_clients/_redis/__init__.py +6 -0 (new Redis storage backend; usage sketch after this list)
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_dataset_client.py +2 -2
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
- crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
- crawlee/storage_clients/_sql/_storage_client.py +1 -1
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +3 -0
- crawlee/storages/_key_value_store.py +8 -2
- crawlee/storages/_request_queue.py +3 -0
- crawlee/storages/_storage_instance_manager.py +109 -42
- crawlee/storages/_utils.py +11 -0
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +14 -16
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/RECORD +93 -79
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
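
The largest structural addition in the list above is a new Redis storage backend under crawlee/storage_clients/_redis/ (a client mixin, dataset, key-value store and request queue clients, plus Lua scripts for atomic queue operations). A minimal usage sketch follows; the class name RedisStorageClient and its connection_string parameter are assumptions inferred from the module names and are not shown in this diff, while RequestQueue.open(..., storage_client=...) matches the call that appears later in the _basic_crawler.py hunks.

    # Hypothetical sketch: plugging the new Redis backend into a request queue.
    # `RedisStorageClient` and `connection_string` are assumptions, not confirmed by this diff.
    from crawlee.storage_clients import RedisStorageClient
    from crawlee.storages import RequestQueue


    async def open_redis_queue() -> RequestQueue:
        storage_client = RedisStorageClient(connection_string='redis://localhost:6379/0')
        # The alias/storage_client keyword arguments mirror the RequestQueue.open()
        # call used in the `_add_requests` hunk further down.
        return await RequestQueue.open(alias='redis-queue', storage_client=storage_client)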
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from playwright.async_api import Page, Response
     from typing_extensions import Self

-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


 TStaticParseResult = TypeVar('TStaticParseResult')

@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,

@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""

+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
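The new goto_options field is the user-facing piece of this change: a pre-navigation hook can use it to tune Playwright's Page.goto() call. A hedged sketch follows; it assumes both classes are exported from crawlee.crawlers, that the adaptive crawler exposes a pre_navigation_hook decorator, and that GotoOptions mirrors Page.goto() keyword arguments such as wait_until. Whether a hook mutates a pre-populated dict or assigns a fresh one is also an assumption, since only the field declaration appears in the diff.

    # Hedged sketch -- exported names, the hook decorator, and the GotoOptions keys
    # are assumptions; only the `goto_options` field itself appears in the diff.
    from crawlee.crawlers import (
        AdaptivePlaywrightCrawler,
        AdaptivePlaywrightPreNavCrawlingContext,
    )

    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()


    @crawler.pre_navigation_hook
    async def tune_navigation(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
        # Forwarded to Page.goto(); per the docstring above, `timeout` is not supported.
        if context.goto_options is not None:
            context.goto_options['wait_until'] = 'networkidle'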
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
 from __future__ import annotations

 import asyncio
+import functools
 import logging
 import signal
 import sys

@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary


@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,

@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute

@@ -55,6 +59,7 @@ from crawlee.errors import (
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient

@@ -64,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue

 from ._context_pipeline import ContextPipeline
+from ._context_utils import swapped_context
 from ._logging_utils import (
     get_one_line_error_summary_if_possible,
     reduce_asyncio_timeout_error_to_relevant_traceback_parts,

@@ -96,6 +102,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]

@@ -401,7 +410,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()

         # Context pipeline
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]

         # Crawl settings
         self._max_request_retries = max_request_retries

@@ -437,14 +446,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._statistics_log_format = statistics_log_format

         # Statistics
-        [8 removed lines not shown in this diff view]
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )

         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
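When no statistics instance is supplied, the crawler now builds one itself via Statistics.with_default_state(), wiring persistence to its own key-value store. The sketch below supplies a custom instance instead, using only keyword arguments that appear in the hunk above and assuming the omitted ones have defaults.

    import logging

    from crawlee.crawlers import BasicCrawler
    from crawlee.statistics import Statistics

    # Only kwargs visible in the hunk are used; the rest are assumed to have defaults.
    statistics = Statistics.with_default_state(
        periodic_message_logger=logging.getLogger('my_crawler'),
        log_message='Current request statistics:',
    )
    crawler = BasicCrawler(statistics=statistics)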
@@ -511,6 +529,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True

+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:

@@ -609,7 +645,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler

     def failed_request_handler(

@@ -619,7 +655,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler

     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
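Both decorators now pass the user handler through _wrap_handler_with_error_context, so context helpers called from an error or failed-request handler write straight to the storages instead of into a RequestHandlerRunResult that would be discarded for a failed request. A sketch follows; the import paths for the context class are as commonly used in crawlee's docs and should be treated as an assumption.

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()


    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
        # push_data now talks to the dataset directly, even though the request failed.
        await context.push_data({'url': context.request.url, 'error': repr(error)})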
@@ -689,7 +725,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except CancelledError:
             pass
         finally:
-            await self._crawler_state_rec_task.stop()
             if threading.current_thread() is threading.main_thread():
                 with suppress(NotImplementedError):
                     asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)

@@ -721,8 +756,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def _run_crawler(self) -> None:
         event_manager = self._service_locator.get_event_manager()

-        self._crawler_state_rec_task.start()
-
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [

@@ -733,6 +766,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False

@@ -740,7 +774,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

         async with AsyncExitStack() as exit_stack:
             for context in contexts_to_enter:
-                await exit_stack.enter_async_context(context)  #
+                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]

             await self._autoscaled_pool.run()


@@ -839,6 +873,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.


@@ -851,6 +886,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,

@@ -860,13 +896,18 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             configuration=self._service_locator.get_configuration(),
         )

-        path =
-        dst = path.open('w', newline='')
+        path = Path(path)

         if path.suffix == '.csv':
-            [removed line not shown in this diff view]
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-            [removed line not shown in this diff view]
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')

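export_data now renders into a StringIO buffer, forwards the format-specific keyword arguments, and writes the result with atomic_write. A sketch of the call site follows; the concrete keys of ExportDataCsvKwargs and ExportDataJsonKwargs are not shown in this diff, so delimiter and indent below are assumptions based on csv.writer and json.dump.

    from pathlib import Path

    from crawlee.crawlers import BasicCrawler


    async def export_results(crawler: BasicCrawler) -> None:
        # `delimiter` and `indent` are assumed exporter kwargs (csv.writer / json.dump style).
        await crawler.export_data(Path('results.csv'), delimiter=';')
        await crawler.export_data(Path('results.json'), indent=2)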
@@ -972,6 +1013,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                     label=label,
                     user_data=user_data,
                     transform_request_function=transform_request_function,
+                    **kwargs,
                 ),
                 rq_id=rq_id,
                 rq_name=rq_name,

@@ -997,7 +1039,12 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        warning_flag = True

        for request in request_iterator:
-            [removed line not shown in this diff view]
+            if isinstance(request, Request):
+                if request.enqueue_strategy != strategy:
+                    request.enqueue_strategy = strategy
+                target_url = request.url
+            else:
+                target_url = request
            parsed_target_url = urlparse(target_url)

            if warning_flag and strategy != 'all' and not parsed_target_url.hostname:

@@ -1009,9 +1056,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            ) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):
                yield request

-                [3 removed lines not shown in this diff view]
+                if limit is not None:
+                    limit -= 1
+                    if limit <= 0:
+                        break

    def _check_enqueue_strategy(
        self,
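The iterator now also pins each Request object's enqueue_strategy to the strategy passed in and stops yielding once the optional limit is exhausted. From a handler this surfaces through enqueue_links; a sketch follows, assuming strategy and limit are accepted keyword arguments of the context helper, as the kwargs handling above suggests.

    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

    crawler = ParselCrawler()


    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # Follow at most 20 links that stay on the same registrable domain.
        await context.enqueue_links(strategy='same-domain', limit=20)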
@@ -1035,8 +1083,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            return target_url.hostname == origin_url.hostname

        if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
            return origin_domain == target_domain

        if strategy == 'same-origin':
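The same-domain comparison now reads tldextract's top_domain_under_public_suffix (the attribute name the removed lines are truncated before in this view). A small illustration of what gets compared, assuming current tldextract behaviour where this attribute equals the older registered_domain:

    import tldextract

    # Both subdomains share the registrable domain 'example.co.uk', so the
    # 'same-domain' strategy treats them as the same site.
    a = tldextract.extract('https://blog.example.co.uk/post')
    b = tldextract.extract('https://shop.example.co.uk/')
    assert a.top_domain_under_public_suffix == b.top_domain_under_public_suffix == 'example.co.uk'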
@@ -1094,7 +1142,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            request.retry_count += 1
            reduced_error = str(error).split('\n')[0]
            self.log.warning(
-                f'Retrying request to {context.request.url} due to: {reduced_error}'
+                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                f'{get_one_line_error_summary_if_possible(error)}'
            )
            await self._statistics.error_tracker.add(error=error, context=context)

@@ -1105,19 +1153,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                except Exception as e:
                    raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                else:
-                    if new_request is not None:
-                        [removed line not shown in this diff view]
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return

            await request_manager.reclaim_request(request)
        else:
-            [2 removed lines not shown in this diff view]
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            request.state = RequestState.ERROR
+            await self._mark_request_as_handled(request)
            await self._handle_failed_request(context, error)
            self._statistics.record_request_processing_failure(request.unique_key)

@@ -1132,8 +1176,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                f'{self._internal_timeout.total_seconds()} seconds',
                logger=self._logger,
            )
-
-            context.request.state = RequestState.DONE
        except UserDefinedErrorHandlerError:
            context.request.state = RequestState.ERROR
            raise

@@ -1166,17 +1208,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
    ) -> None:
        if need_mark and isinstance(request, Request):
-            request_manager = await self.get_request_manager()
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
            request.state = RequestState.SKIPPED
+            await self._mark_request_as_handled(request)

        url = request.url if isinstance(request, Request) else request

@@ -1196,10 +1229,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

        if (
            isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
            and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
            used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.
+            used_traceback_parts.extend(traceback_parts[-1:])

        return ''.join(used_traceback_parts).strip('\n')

@@ -1248,58 +1282,54 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            else:
                yield Request.from_url(url)

-    async def
-        [19 removed lines not shown in this diff view]
-                storage_client=self._service_locator.get_storage_client(),
-                configuration=self._service_locator.get_configuration(),
-            )
-        else:
-            request_manager = base_request_manager
-
-        requests = list[Request]()
-
-        base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-        requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()

-        [removed line not shown in this diff view]
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1

-        [2 removed lines not shown in this diff view]
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)

-        [removed line not shown in this diff view]
-            # Update the crawl depth of the request.
-            dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)

-        [2 removed lines not shown in this diff view]
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]

-        [removed line not shown in this diff view]
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)

        for push_data_call in result.push_data_calls:
            await self._push_data(**push_data_call)

        await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)

+        result.apply_request_changes(target=context.request)
+
    @staticmethod
    async def _commit_key_value_store_changes(
        result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
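The new _add_requests helper resolves an explicit target queue (by rq_id, rq_name, or rq_alias) and applies the same link filtering and crawl-depth bookkeeping as enqueue_links. From a request handler the same options are reachable through the add_requests context helper; a sketch follows, assuming the helper forwards rq_alias as the signature above suggests.

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()


    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Route follow-up requests into a separate, run-scoped queue.
        await context.add_requests(
            ['https://example.com/next'],
            rq_alias='secondary',
        )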
@@ -1365,10 +1395,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        else:
            session = await self._get_session()
            proxy_info = await self._get_proxy_info(request, session)
-        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)

        context = BasicCrawlingContext(
-            request=request,
+            request=result.request,
            session=session,
            proxy_info=proxy_info,
            send_request=self._prepare_send_request_function(session, proxy_info),

@@ -1385,32 +1415,26 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        try:
            request.state = RequestState.REQUEST_HANDLER

-            self._check_request_collision(context.request, context.session)
-
            try:
-                [removed line not shown in this diff view]
+                with swapped_context(context, request):
+                    self._check_request_collision(request, session)
+                    await self._run_request_handler(context=context)
            except asyncio.TimeoutError as e:
                raise RequestHandlerError(e, context) from e

            await self._commit_request_handler_result(context)
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )

            request.state = RequestState.DONE

-            [2 removed lines not shown in this diff view]
+            await self._mark_request_as_handled(request)
+
+            if session and session.is_usable:
+                session.mark_good()

            self._statistics.record_request_processing_finish(request.unique_key)

        except RequestCollisionError as request_error:
-            [removed line not shown in this diff view]
+            request.no_retry = True
            await self._handle_request_error(context, request_error)

        except RequestHandlerError as primary_error:

@@ -1425,7 +1449,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)

        except SessionError as session_error:
-            if not
+            if not session:
                raise RuntimeError('SessionError raised in a crawling context without a session') from session_error

            if self._error_handler:

@@ -1435,22 +1459,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
            self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)

-            [removed line not shown in this diff view]
+            if session:
+                session.retire()

            # Increment session rotation count.
-            [removed line not shown in this diff view]
+            request.session_rotation_count = (request.session_rotation_count or 0) + 1

            await request_manager.reclaim_request(request)
            await self._statistics.error_tracker_retry.add(error=session_error, context=context)
        else:
-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)

            await self._handle_failed_request(context, session_error)
            self._statistics.record_request_processing_failure(request.unique_key)

@@ -1458,14 +1476,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        except ContextPipelineInterruptedError as interrupted_error:
            self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)

-            await
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)

        except ContextPipelineInitializationError as initialization_error:
            self._logger.debug(

@@ -1483,12 +1494,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            raise

    async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        [6 removed lines not shown in this diff view]
+        context.request.state = RequestState.BEFORE_NAV
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
        )

    def _raise_for_error_status_code(self, status_code: int) -> None:
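The request handler is now awaited through wait_for with self._request_handler_timeout, so exceeding it surfaces as the timeout path handled earlier (and, together with UserHandlerTimeoutError, gets the reduced traceback). A sketch of configuring the timeout, assuming it maps to a request_handler_timeout constructor option backing the attribute used above:

    from datetime import timedelta

    from crawlee.crawlers import BasicCrawler

    # Assumed constructor option backing `self._request_handler_timeout` above.
    crawler = BasicCrawler(request_handler_timeout=timedelta(minutes=2))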
@@ -1636,3 +1651,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            )

        self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )
crawlee/crawlers/_basic/_context_utils.py (new file)

@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from crawlee._request import Request
+
+    from ._basic_crawling_context import BasicCrawlingContext
+
+
+@contextmanager
+def swapped_context(
+    context: BasicCrawlingContext,
+    request: Request,
+) -> Iterator[None]:
+    """Replace context's isolated copies with originals after handler execution."""
+    try:
+        yield
+    finally:
+        # Restore original context state to avoid side effects between different handlers.
+        object.__setattr__(context, 'request', request)