crawlee 1.0.5b18__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_types.py +44 -5
  5. crawlee/_utils/context.py +3 -3
  6. crawlee/_utils/file.py +8 -1
  7. crawlee/_utils/globs.py +4 -4
  8. crawlee/_utils/recurring_task.py +12 -3
  9. crawlee/_utils/sitemap.py +12 -5
  10. crawlee/_utils/system.py +27 -11
  11. crawlee/_utils/time.py +41 -1
  12. crawlee/browsers/_browser_pool.py +1 -1
  13. crawlee/browsers/_playwright_browser.py +2 -1
  14. crawlee/crawlers/__init__.py +5 -1
  15. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  16. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +53 -17
  17. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  18. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +20 -49
  19. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  20. crawlee/crawlers/_basic/_basic_crawler.py +138 -124
  21. crawlee/crawlers/_basic/_context_utils.py +24 -0
  22. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  23. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  24. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  25. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  26. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -22
  27. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  28. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  29. crawlee/crawlers/_playwright/_types.py +12 -2
  30. crawlee/errors.py +4 -0
  31. crawlee/events/_event_manager.py +12 -6
  32. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  33. crawlee/http_clients/_base.py +4 -0
  34. crawlee/http_clients/_curl_impersonate.py +68 -14
  35. crawlee/http_clients/_httpx.py +16 -6
  36. crawlee/http_clients/_impit.py +25 -10
  37. crawlee/otel/crawler_instrumentor.py +1 -3
  38. crawlee/request_loaders/_sitemap_request_loader.py +18 -5
  39. crawlee/router.py +13 -3
  40. crawlee/sessions/_cookies.py +13 -8
  41. crawlee/sessions/_models.py +3 -3
  42. crawlee/statistics/_models.py +51 -9
  43. crawlee/statistics/_statistics.py +2 -21
  44. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  45. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  46. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  47. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  48. crawlee/storage_clients/_file_system/_request_queue_client.py +5 -4
  49. crawlee/storage_clients/_redis/_client_mixin.py +1 -4
  50. crawlee/storage_clients/_redis/_dataset_client.py +6 -2
  51. crawlee/storage_clients/_redis/_key_value_store_client.py +3 -5
  52. crawlee/storage_clients/_redis/_request_queue_client.py +5 -8
  53. crawlee/storage_clients/_redis/_storage_client.py +12 -9
  54. crawlee/storage_clients/_redis/_utils.py +1 -1
  55. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  56. crawlee/storage_clients/_sql/_storage_client.py +0 -9
  57. crawlee/storage_clients/models.py +8 -3
  58. crawlee/storages/_storage_instance_manager.py +103 -44
  59. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +10 -16
  60. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +63 -62
  61. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  62. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  63. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0

crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar
 
-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
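
A minimal sketch of the intended use, with a hypothetical subclass; the real subclasses also wire up a parser and a `_context_pipeline`, which is omitted here:

    from typing_extensions import Unpack


    class MyHttpCrawler(AbstractHttpCrawler):
        def __init__(self, **kwargs: Unpack[HttpCrawlerOptions]) -> None:
            # `navigation_timeout` is now part of the typed, forwarded kwargs;
            # `parser=...` stands in for a concrete AbstractHttpParser instance.
            super().__init__(parser=..., **kwargs)
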
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(
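
Hedged usage sketch (not shown in this diff): concrete HTTP crawlers such as `ParselCrawler` and `BeautifulSoupCrawler` forward these constructor options, so the new knob should be reachable from user code; when omitted, the budget defaults to one minute as set above.

    from datetime import timedelta

    from crawlee.crawlers import ParselCrawler

    # Assumed API surface: pass the shared navigation budget through the subclass constructor.
    crawler = ParselCrawler(navigation_timeout=timedelta(seconds=30))
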
@@ -82,9 +100,7 @@ class AbstractHttpCrawler(
         this method simplifies cases where `TParseResult` is used for both generic parameters.
         """
 
-        class _ParsedHttpCrawler(
-            AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]
-        ):
+        class _ParsedHttpCrawler(AbstractHttpCrawler):
             def __init__(
                 self,
                 parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,
@@ -112,9 +128,17 @@
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        for hook in self._pre_navigation_hooks:
-            await hook(context)
-        yield context
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     async def _parse_http_response(
         self, context: HttpCrawlingContext
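
`SharedTimeout` itself is not shown in this diff (it comes from `crawlee/_utils/time.py`, +41 lines in the file list). A minimal illustrative sketch of the behaviour implied by its use here, one time budget shared across several sequential `async with` blocks, could look like this; the real implementation may also enforce cancellation and differ in detail:

    import time
    from datetime import timedelta


    class SharedTimeoutSketch:
        """Illustrative only: a budget that shrinks each time a block runs under it."""

        def __init__(self, timeout: timedelta) -> None:
            self._remaining = timeout

        async def __aenter__(self) -> timedelta:
            if self._remaining <= timedelta(0):
                raise TimeoutError('Shared navigation timeout exhausted.')
            self._entered_at = time.monotonic()
            return self._remaining  # the caller may pass this on as its own timeout

        async def __aexit__(self, *exc_info: object) -> None:
            self._remaining -= timedelta(seconds=time.monotonic() - self._entered_at)
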
@@ -165,11 +189,18 @@
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
             kwargs.setdefault('strategy', 'same-hostname')
+            strategy = kwargs.get('strategy', 'same-hostname')
 
             links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-            links_iterator = to_absolute_url_iterator(
-                context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+
+            # Get base URL from <base> tag if present
+            extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+            base_url: str = (
+                str(extracted_base_urls[0])
+                if extracted_base_urls
+                else context.request.loaded_url or context.request.url
             )
+            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
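
The practical effect: when a page declares `<base href="...">`, relative links are now resolved against that value instead of the page URL. A plain-urllib illustration with made-up URLs:

    from urllib.parse import urljoin

    page_url = 'https://example.com/docs/page.html'
    base_href = 'https://static.example.com/mirror/'  # value of the page's <base href="...">

    print(urljoin(page_url, 'about.html'))   # https://example.com/docs/about.html (previous behaviour)
    print(urljoin(base_href, 'about.html'))  # https://static.example.com/mirror/about.html (new behaviour)
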
@@ -177,7 +208,9 @@
                 skipped = iter([])
 
             for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
+                request_options = RequestOptions(
+                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                )
 
                 if transform_request_function:
                     transform_request_options = transform_request_function(request_options)
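
Because the resolved strategy is now stored on each `RequestOptions`, a `transform_request_function` can inspect or override it per request. A hedged sketch (the surrounding enqueue call and URL pattern are assumptions):

    def widen_docs_strategy(request_options: RequestOptions) -> RequestOptions:
        # Illustrative only: broaden the strategy for documentation links, keep the rest as-is.
        if '/docs/' in request_options['url']:
            request_options['enqueue_strategy'] = 'same-domain'
        return request_options
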
@@ -216,13 +249,16 @@
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        result = await self._http_client.crawl(
-            request=context.request,
-            session=context.session,
-            proxy_info=context.proxy_info,
-            statistics=self._statistics,
-        )
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )
 
+        context.request.state = RequestState.AFTER_NAV
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
 
     async def _handle_status_code_response(
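
A worked example of the shared budget (numbers are illustrative): whatever the pre-navigation hooks consume is subtracted from the timeout handed to the HTTP client.

    from datetime import timedelta

    navigation_timeout = timedelta(minutes=1)   # the default set in __init__ above
    spent_in_hooks = timedelta(seconds=12)      # hypothetical time spent in pre-navigation hooks

    remaining_timeout = navigation_timeout - spent_in_hooks
    assert remaining_timeout == timedelta(seconds=48)  # value passed as `timeout=` to `_http_client.crawl()`
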

crawlee/crawlers/_adaptive_playwright/__init__.py

@@ -11,13 +11,16 @@ _install_import_hook(__name__)
 
 # The following imports are wrapped in try_import to handle optional dependencies,
 # ensuring the module can still function even if these dependencies are missing.
-with _try_import(__name__, 'BeautifulSoupCrawler'):
+with _try_import(__name__, 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor'):
     from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor
-with _try_import(__name__, 'BeautifulSoupCrawlingContext'):
+with _try_import(__name__, 'AdaptivePlaywrightCrawler'):
     from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
+with _try_import(__name__, 'AdaptivePlaywrightCrawlerStatisticState'):
+    from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawlerStatisticState
 
 __all__ = [
     'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
     'RenderingType',
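
A small import sketch grounded in the new `__all__` entry (the +5/-1 change to `crawlee/crawlers/__init__.py` in the file list suggests the same symbol is re-exported one level higher as well, though that hunk is not shown here):

    from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawlerStatisticState
    from crawlee.statistics import Statistics

    # Assumed usage: the re-exported state model can back a custom Statistics instance.
    statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
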

crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

@@ -27,23 +27,16 @@ from crawlee.crawlers import (
 )
 from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
 from crawlee.crawlers._parsel._parsel_parser import ParselParser
+from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
 from crawlee.statistics import Statistics, StatisticsState
 
-from ._adaptive_playwright_crawler_statistics import (
-    AdaptivePlaywrightCrawlerStatisticState,
-)
+from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
 from ._adaptive_playwright_crawling_context import (
     AdaptivePlaywrightCrawlingContext,
     AdaptivePlaywrightPreNavCrawlingContext,
 )
-from ._rendering_type_predictor import (
-    DefaultRenderingTypePredictor,
-    RenderingType,
-    RenderingTypePredictor,
-)
-from ._result_comparator import (
-    create_default_comparator,
-)
+from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
+from ._result_comparator import create_default_comparator
 
 if TYPE_CHECKING:
     from types import TracebackType
@@ -51,7 +44,6 @@ if TYPE_CHECKING:
     from typing_extensions import Unpack
 
     from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions
-    from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
 
 
 TStaticParseResult = TypeVar('TStaticParseResult')
@@ -157,10 +149,12 @@ class AdaptivePlaywrightCrawler(
         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
 
-        super().__init__(statistics=statistics, **kwargs)
+        adaptive_statistics = statistics or Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
+
+        super().__init__(statistics=adaptive_statistics, **kwargs)
 
         # Sub crawlers related.
-        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {}
+        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()
 
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
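
Hedged usage sketch: with this change, constructing the crawler without a `statistics` argument should yield a state object that already carries the adaptive counters used below (shown here via the `with_parsel_static_parser` factory, assumed unchanged by this diff):

    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()

    # The adaptive counters now exist by default, without passing a custom Statistics.
    print(crawler.statistics.state.rendering_type_mispredictions)
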
@@ -290,11 +284,14 @@
             use_state_function = context.use_state
 
             # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
-            result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+            result = RequestHandlerRunResult(
+                key_value_store_getter=self.get_key_value_store,
+                request=context.request,
+            )
             context_linked_to_result = BasicCrawlingContext(
-                request=deepcopy(context.request),
-                session=deepcopy(context.session),
-                proxy_info=deepcopy(context.proxy_info),
+                request=result.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
                 send_request=context.send_request,
                 add_requests=result.add_requests,
                 push_data=result.push_data,
@@ -314,7 +311,7 @@
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result, run_context=context_linked_to_result)
+            return SubCrawlerRun(result=result)
         except Exception as e:
             return SubCrawlerRun(exception=e)
 
@@ -332,7 +329,7 @@
                 )
                 await self.router(adaptive_crawling_context)
 
-            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)
+            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]
 
         if rendering_type == 'client only':
 
@@ -342,7 +339,7 @@
                 )
                 await self.router(adaptive_crawling_context)
 
-            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)
+            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]
 
         raise RuntimeError(
             f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}'
@@ -370,8 +367,7 @@
             self.track_http_only_request_handler_runs()
 
             static_run = await self._crawl_one(rendering_type='static', context=context)
-            if static_run.result and static_run.run_context and self.result_checker(static_run.result):
-                self._update_context_from_copy(context, static_run.run_context)
+            if static_run.result and self.result_checker(static_run.result):
                 self._context_result_map[context] = static_run.result
                 return
             if static_run.exception:
@@ -402,7 +398,7 @@
         if pw_run.exception is not None:
             raise pw_run.exception
 
-        if pw_run.result and pw_run.run_context:
+        if pw_run.result:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
@@ -414,7 +410,6 @@
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)
 
-            self._update_context_from_copy(context, pw_run.run_context)
             self._context_result_map[context] = pw_run.result
 
     def pre_navigation_hook(
@@ -451,32 +446,8 @@
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1
 
-    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
-        """Update mutable fields of `context` from `context_copy`.
-
-        Uses object.__setattr__ to bypass frozen dataclass restrictions,
-        allowing state synchronization after isolated crawler execution.
-        """
-        updating_attributes = {
-            'request': ('headers', 'user_data'),
-            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
-        }
-
-        for attr, sub_attrs in updating_attributes.items():
-            original_sub_obj = getattr(context, attr)
-            copy_sub_obj = getattr(context_copy, attr)
-
-            # Check that both sub objects are not None
-            if original_sub_obj is None or copy_sub_obj is None:
-                continue
-
-            for sub_attr in sub_attrs:
-                new_value = getattr(copy_sub_obj, sub_attr)
-                object.__setattr__(original_sub_obj, sub_attr, new_value)
-
 
 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
-    run_context: BasicCrawlingContext | None = None

crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

@@ -17,7 +17,7 @@
     from playwright.async_api import Page, Response
     from typing_extensions import Self
 
-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions
 
 
 TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests is useful only on pre-navigation contexts. It is useless here.
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.