crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee has been flagged as possibly problematic by the registry diff service.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +35 -33
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +106 -34
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +17 -1
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +4 -2
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +219 -126
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/events/_event_manager.py +4 -4
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +43 -4
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
- crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +13 -11
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
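The file list above introduces two entirely new storage backends, a Redis client (`crawlee/storage_clients/_redis/`) and a SQL client (`crawlee/storage_clients/_sql/`), alongside the existing file-system and memory clients. A hedged sketch of how such a backend might be plugged in, assuming the new module exports a `RedisStorageClient` and that its constructor accepts a connection string (neither detail is confirmed by this listing):

import asyncio

from crawlee import service_locator
from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.storage_clients import RedisStorageClient  # assumed export of the new _redis module


async def main() -> None:
    # Route datasets, key-value stores and request queues to Redis instead of the default
    # file-system backend. The constructor argument is illustrative, not confirmed by this diff.
    service_locator.set_storage_client(RedisStorageClient(connection_string='redis://localhost:6379/0'))

    crawler = ParselCrawler()

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())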
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic

 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar

 from crawlee._request import Request, RequestOptions
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,9 +34,24 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
-
+    BasicCrawler[TCrawlingContext, StatisticsState],
+    ABC,
+    Generic[TCrawlingContext, TParseResult, TSelectResult],
 ):
     """A web crawler for performing HTTP requests.

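`HttpCrawlerOptions` is a generic TypedDict: its purpose is to type the keyword arguments that subclasses forward to `AbstractHttpCrawler.__init__` via `Unpack`. A self-contained sketch of that pattern, deliberately using made-up names rather than crawlee's own classes:

from datetime import timedelta

from typing_extensions import NotRequired, TypedDict, Unpack


class FetcherOptions(TypedDict):
    """Keyword arguments accepted by `build_fetcher`, all optional."""

    navigation_timeout: NotRequired[timedelta | None]


def build_fetcher(**kwargs: Unpack[FetcherOptions]) -> timedelta:
    # Fall back to a default when the caller does not provide the key.
    return kwargs.get('navigation_timeout') or timedelta(minutes=1)


print(build_fetcher(navigation_timeout=timedelta(seconds=30)))  # 0:00:30
print(build_fetcher())                                          # 0:01:00
# A misspelled key such as build_fetcher(navigation_timout=...) is rejected by type checkers.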
@@ -54,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

         if '_context_pipeline' not in kwargs:
             raise ValueError(
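Since the concrete HTTP crawlers forward their keyword arguments to this constructor, the new timeout should be settable at construction time. A hedged usage sketch, assuming `ParselCrawler` exposes the option (the `_parsel_crawler.py` entry in the file list suggests it was updated accordingly):

import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(
        # Caps the pre-navigation hooks plus the HTTP request at 30 seconds per request;
        # per the constructor above, the default budget is one minute.
        navigation_timeout=timedelta(seconds=30),
    )

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Title: {context.selector.css("title::text").get()}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())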
@@ -110,9 +130,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-
-
-
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     async def _parse_http_response(
         self, context: HttpCrawlingContext
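Pre-navigation hooks now run inside the same per-request `SharedTimeout`, so a slow hook eats into the budget of the HTTP request that follows. Registering a hook itself looks unchanged; a short sketch, assuming the existing `pre_navigation_hook` registration method on the HTTP crawlers:

from crawlee._types import BasicCrawlingContext
from crawlee.crawlers import ParselCrawler

crawler = ParselCrawler()


async def log_outgoing_request(context: BasicCrawlingContext) -> None:
    # Runs before the HTTP request is sent; its runtime now counts against the shared budget.
    context.log.info(f'About to fetch {context.request.url}')


crawler.pre_navigation_hook(log_outgoing_request)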
@@ -165,7 +193,15 @@ class AbstractHttpCrawler(
         kwargs.setdefault('strategy', 'same-hostname')

         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
+        )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
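The effect of honoring `<base href>`: relative links found in the page now resolve against the declared base URL instead of the loaded URL. In plain `urllib.parse` terms:

from urllib.parse import urljoin

loaded_url = 'https://example.com/articles/page-1'
base_href = 'https://cdn.example.com/mirror/'

# Without a <base> tag the loaded URL is the reference point...
print(urljoin(loaded_url, 'next'))  # https://example.com/articles/next
# ...with <base href="https://cdn.example.com/mirror/"> the base takes precedence.
print(urljoin(base_href, 'next'))   # https://cdn.example.com/mirror/next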
@@ -212,12 +248,14 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-
-
-
-
-
-
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )

         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

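`SharedTimeout` itself lives in the new `crawlee/_utils/time.py` and is not shown in this section. Judging only from the call sites above (entered once per hook and once around the HTTP request, yielding the remaining budget), it behaves like a draining time budget. A rough, non-authoritative sketch of that idea, not the library's implementation:

import time
from datetime import timedelta


class DrainingTimeout:
    """Toy stand-in for the real SharedTimeout: each `async with` draws down one shared budget."""

    def __init__(self, budget: timedelta) -> None:
        self._remaining = budget.total_seconds()
        self._entered_at = 0.0

    async def __aenter__(self) -> timedelta:
        if self._remaining <= 0:
            raise TimeoutError('Navigation time budget exhausted')
        self._entered_at = time.monotonic()
        return timedelta(seconds=self._remaining)

    async def __aexit__(self, *exc_info: object) -> None:
        self._remaining -= time.monotonic() - self._entered_at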
crawlee/crawlers/_abstract_http/_abstract_http_parser.py

@@ -16,7 +16,7 @@ if TYPE_CHECKING:


 @docs_group('HTTP parsers')
-class AbstractHttpParser(Generic[TParseResult, TSelectResult]
+class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
     """Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking."""

     @abstractmethod
crawlee/crawlers/_abstract_http/_http_crawling_context.py

@@ -31,7 +31,7 @@ class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):

 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
-class ParsedHttpCrawlingContext(Generic[TParseResult]
+class ParsedHttpCrawlingContext(HttpCrawlingContext, Generic[TParseResult]):
     """The crawling context used by `AbstractHttpCrawler`.

     It provides access to key objects as well as utility functions for handling crawling tasks.
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override

-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (

@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self

     async def __aexit__(

@@ -85,8 +84,8 @@

 @docs_group('Crawlers')
 class AdaptivePlaywrightCrawler(
-    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
     BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState],
+    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
 ):
     """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.

@@ -149,15 +148,15 @@ class AdaptivePlaywrightCrawler(
                 non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)

         # Sub crawlers related.
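With this change, an adaptive (browser-backed) crawler defaults to a single concurrent request unless the caller provides concurrency settings; an explicit value still wins. A short sketch, typically placed inside an async entry point:

from crawlee import ConcurrencySettings
from crawlee.crawlers import AdaptivePlaywrightCrawler

# Overrides the new desired_concurrency=1 default applied in the constructor above.
crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
    concurrency_settings=ConcurrencySettings(desired_concurrency=4),
)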
@@ -166,11 +165,11 @@ class AdaptivePlaywrightCrawler(
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)

@@ -315,7 +314,7 @@ class AdaptivePlaywrightCrawler(
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result)
+            return SubCrawlerRun(result=result, run_context=context_linked_to_result)
         except Exception as e:
             return SubCrawlerRun(exception=e)

@@ -371,7 +370,8 @@ class AdaptivePlaywrightCrawler(
             self.track_http_only_request_handler_runs()

             static_run = await self._crawl_one(rendering_type='static', context=context)
-            if static_run.result and self.result_checker(static_run.result):
+            if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+                self._update_context_from_copy(context, static_run.run_context)
                 self._context_result_map[context] = static_run.result
                 return
             if static_run.exception:

@@ -402,13 +402,10 @@ class AdaptivePlaywrightCrawler(
         if pw_run.exception is not None:
             raise pw_run.exception

-        if pw_run.result:
-            self._context_result_map[context] = pw_run.result
-
+        if pw_run.result and pw_run.run_context:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                 if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                     detection_result = 'static'
                 else:

@@ -417,6 +414,9 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)

+            self._update_context_from_copy(context, pw_run.run_context)
+            self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,

@@ -451,8 +451,32 @@ class AdaptivePlaywrightCrawler(
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1

+    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+        """Update mutable fields of `context` from `context_copy`.
+
+        Uses object.__setattr__ to bypass frozen dataclass restrictions,
+        allowing state synchronization after isolated crawler execution.
+        """
+        updating_attributes = {
+            'request': ('headers', 'user_data'),
+            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+        }
+
+        for attr, sub_attrs in updating_attributes.items():
+            original_sub_obj = getattr(context, attr)
+            copy_sub_obj = getattr(context_copy, attr)
+
+            # Check that both sub objects are not None
+            if original_sub_obj is None or copy_sub_obj is None:
+                continue
+
+            for sub_attr in sub_attrs:
+                new_value = getattr(copy_sub_obj, sub_attr)
+                object.__setattr__(original_sub_obj, sub_attr, new_value)
+

 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
+    run_context: BasicCrawlingContext | None = None
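The new `_update_context_from_copy` helper relies on `object.__setattr__` to write through the frozen-dataclass guard, since the crawling contexts are frozen and plain assignment would raise. A minimal standalone demonstration of that mechanism:

from dataclasses import dataclass, field


@dataclass(frozen=True)
class Ctx:
    headers: dict[str, str] = field(default_factory=dict)


original = Ctx(headers={'accept': 'text/html'})
copy_after_run = Ctx(headers={'accept': 'text/html', 'x-sub-crawler': 'static'})

# original.headers = copy_after_run.headers  # would raise dataclasses.FrozenInstanceError
object.__setattr__(original, 'headers', copy_after_run.headers)
print(original.headers)  # {'accept': 'text/html', 'x-sub-crawler': 'static'}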
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py

@@ -12,7 +12,7 @@ from crawlee.statistics import StatisticsState
 class AdaptivePlaywrightCrawlerStatisticState(StatisticsState):
     """Statistic data about a crawler run with additional information related to adaptive crawling."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')

     http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0
     """Number representing how many times static http based crawling was used."""
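The new `ConfigDict` turns on pydantic's `validate_by_name` and `validate_by_alias` switches (and keeps `ser_json_inf_nan='constants'` so infinities survive JSON serialization). With both flags on, an aliased field accepts either the field name or the alias during validation; these options require pydantic 2.11 or newer:

from pydantic import BaseModel, ConfigDict, Field


class RunStats(BaseModel):
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    http_runs: int = Field(default=0, alias='http_only_request_handler_runs')


print(RunStats(http_runs=3).http_runs)                        # populated by field name
print(RunStats(http_only_request_handler_runs=5).http_runs)   # populated by alias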
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from playwright.async_api import Page, Response
     from typing_extensions import Self

-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


 TStaticParseResult = TypeVar('TStaticParseResult')

@@ -31,7 +31,8 @@ class AdaptiveContextError(RuntimeError):
 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
 class AdaptivePlaywrightCrawlingContext(
-
+    ParsedHttpCrawlingContext[TStaticParseResult],
+    Generic[TStaticParseResult, TStaticSelectResult],
 ):
     _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult]
     """The crawling context used by `AdaptivePlaywrightCrawler`.

@@ -189,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,

@@ -211,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""

+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
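The new `goto_options` field lets pre-navigation code influence the Playwright `Page.goto()` call. How the dict is initialized is not visible in this section, so the guard and the chosen key below are assumptions; `wait_until` is a standard Playwright goto option:

from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightPreNavCrawlingContext

crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()


async def tune_navigation(context: AdaptivePlaywrightPreNavCrawlingContext) -> None:
    # Hypothetical: only touch the options if the browser-based sub-crawler has provided a dict.
    if context.goto_options is not None:
        context.goto_options['wait_until'] = 'networkidle'


crawler.pre_navigation_hook(tune_navigation)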
crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py

@@ -32,7 +32,7 @@ FeatureVector = tuple[float, float]


 class RenderingTypePredictorState(BaseModel):
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     model: Annotated[
         LogisticRegression,