crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/request_loaders/_sitemap_request_loader.py CHANGED

@@ -1,20 +1,27 @@
 from __future__ import annotations

 import asyncio
+from collections import deque
 from contextlib import suppress
 from logging import getLogger
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Annotated, Any

-from
+from pydantic import BaseModel, ConfigDict, Field
+from typing_extensions import override
+
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
-from crawlee._utils.
+from crawlee._utils.recoverable_state import RecoverableState
+from crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
 from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
+    from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -23,12 +30,77 @@ if TYPE_CHECKING:
 logger = getLogger(__name__)


+class SitemapRequestLoaderState(BaseModel):
+    """State model for persisting sitemap request loader data.
+
+    The crawler processes one sitemap at a time. The current sitemap is stored in `in_progress_sitemap_url`.
+    The `parse_sitemap` function parses the sitemap and returns elements as an async iterator. Each element retrieved
+    from the iterator is processed based on its type. If the element is a `NestedSitemap`, its URL is added to
+    `pending_sitemap_urls` if it hasn't been processed yet (not in `processed_sitemap_urls`). If the element is a
+    `SitemapUrl`, the system checks whether it already exists in `current_sitemap_processed_urls`. If it exists,
+    the loader was restarted from a saved state and the URL is skipped.
+
+    If the URL is new, it is first added to `url_queue`, then to `current_sitemap_processed_urls`, and `total_count` is
+    incremented by 1. When all elements from the current sitemap iterator have been processed, `in_progress_sitemap_url`
+    is set to `None`, the sitemap URL is added to `processed_sitemap_urls`, and `current_sitemap_processed_urls` is
+    cleared. The next sitemap is retrieved from `pending_sitemap_urls`, skipping any URLs that already exist in
+    `processed_sitemap_urls`. If `pending_sitemap_urls` is empty, `completed` is set to `True`.
+
+    When `fetch_next_request` is called, a URL is extracted from `url_queue` and placed in `in_progress`.
+    When `mark_request_as_handled` is called for the extracted URL, it is removed from `in_progress` and
+    `handled_count` is incremented by 1.
+
+    During initial startup or restart after persistence, state validation occurs in `_get_state`. If both
+    `pending_sitemap_urls` and `in_progress_sitemap_url` are empty and `completed` is False, this indicates a
+    fresh start. In this case, `self._sitemap_urls` are moved to `pending_sitemap_urls`. Otherwise, the system is
+    restarting from a persisted state. If `in_progress` contains any URLs, they are moved back to `url_queue` and
+    `in_progress` is cleared.
+    """
+
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
+
+    url_queue: Annotated[deque[str], Field(alias='urlQueue')]
+    """Queue of URLs extracted from sitemaps and ready for processing."""
+
+    in_progress: Annotated[set[str], Field(alias='inProgress')] = set()
+    """Set of request URLs currently being processed."""
+
+    pending_sitemap_urls: Annotated[deque[str], Field(alias='pendingSitemapUrls')]
+    """Queue of sitemap URLs that need to be fetched and processed."""
+
+    in_progress_sitemap_url: Annotated[str | None, Field(alias='inProgressSitemapUrl')] = None
+    """The sitemap URL currently being processed."""
+
+    current_sitemap_processed_urls: Annotated[set[str], Field(alias='currentSitemapProcessedUrls')] = set()
+    """URLs from the current sitemap that have been added to the queue."""
+
+    processed_sitemap_urls: Annotated[set[str], Field(alias='processedSitemapUrls')] = set()
+    """Set of processed sitemap URLs."""
+
+    completed: Annotated[bool, Field(alias='sitemapCompleted')] = False
+    """Whether all sitemaps have been fully processed."""
+
+    total_count: Annotated[int, Field(alias='totalCount')] = 0
+    """Total number of URLs found and added to the queue from all processed sitemaps."""
+
+    handled_count: Annotated[int, Field(alias='handledCount')] = 0
+    """Number of URLs that have been successfully handled."""
+
+
 @docs_group('Request loaders')
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).

+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
+
+    The loader supports state persistence, allowing it to resume from where it left off
+    after interruption when a `persist_state_key` is provided during initialization.
     """

     def __init__(
@@ -40,7 +112,8 @@ class SitemapRequestLoader(RequestLoader):
         include: list[re.Pattern[Any] | Glob] | None = None,
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
-
+        persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.

@@ -50,27 +123,68 @@ class SitemapRequestLoader(RequestLoader):
             include: List of glob or regex patterns to include URLs.
             exclude: List of glob or regex patterns to exclude URLs.
             max_buffer_size: Maximum number of URLs to buffer in memory.
-            parse_sitemap_options: Options for parsing sitemaps, such as `SitemapSource` and `max_urls`.
             http_client: the instance of `HttpClient` to use for fetching sitemaps.
+            persist_state_key: A key for persisting the loader's state in the KeyValueStore.
+                When provided, allows resuming from where it left off after interruption.
+                If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
-
         self._sitemap_urls = sitemap_urls
         self._include = include
         self._exclude = exclude
         self._proxy_info = proxy_info
-        self.
+        self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function
+
+        # Synchronization for queue operations
+        self._queue_has_capacity = asyncio.Event()
+        self._queue_has_capacity.set()
+        self._queue_lock = asyncio.Lock()
+
+        # Initialize recoverable state
+        self._state = RecoverableState(
+            default_state=SitemapRequestLoaderState(
+                url_queue=deque(),
+                pending_sitemap_urls=deque(),
+            ),
+            persistence_enabled=bool(persist_state_key),
+            persist_state_key=persist_state_key or '',
+            logger=logger,
+        )
+
+        # Start background loading
+        self._loading_task = asyncio.create_task(self._load_sitemaps())

-
-
+    async def _get_state(self) -> SitemapRequestLoaderState:
+        """Initialize and return the current state."""
+        async with self._queue_lock:
+            if self._state.is_initialized:
+                return self._state.current_value

-
-        self._url_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=max_buffer_size)
-        self._in_progress: set[str] = set()
-        self._processed_urls: set[str] = set()
+            await self._state.initialize()

-
-
+            # Initialize pending sitemaps on first run
+            has_sitemap_for_processing = (
+                self._state.current_value.pending_sitemap_urls or self._state.current_value.in_progress_sitemap_url
+            )
+            if not has_sitemap_for_processing and not self._state.current_value.completed:
+                self._state.current_value.pending_sitemap_urls.extend(self._sitemap_urls)
+
+            if self._state.current_value.in_progress:
+                self._state.current_value.url_queue.extendleft(self._state.current_value.in_progress)
+                self._state.current_value.in_progress.clear()
+
+            if (
+                self._state.current_value.url_queue
+                and len(self._state.current_value.url_queue) >= self._max_buffer_size
+            ):
+                # Notify that the queue is full
+                self._queue_has_capacity.clear()
+
+            return self._state.current_value

     def _check_url_patterns(
         self,
@@ -105,73 +219,157 @@ class SitemapRequestLoader(RequestLoader):
     async def _load_sitemaps(self) -> None:
         """Load URLs from sitemaps in the background."""
         try:
-
-
-
-
-
-
-
-
-                url = item.loc
-
-                # Skip if already processed
-                if url in self._processed_urls:
+            # Get actual state
+            while (state := await self._get_state()) and (state.pending_sitemap_urls or state.in_progress_sitemap_url):
+                # Get sitemap URL for parsing
+                sitemap_url = state.in_progress_sitemap_url
+                if not sitemap_url:
+                    sitemap_url = state.pending_sitemap_urls.popleft()
+                    # Skip processed urls
+                    if sitemap_url in state.processed_sitemap_urls:
                         continue
-
-
-
+                state.in_progress_sitemap_url = sitemap_url
+
+                parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)
+
+                async for item in parse_sitemap(
+                    [SitemapSource(type='url', url=sitemap_url)],
+                    self._http_client,
+                    proxy_info=self._proxy_info,
+                    options=parse_options,
+                ):
+                    if isinstance(item, NestedSitemap):
+                        # Add nested sitemap to queue
+                        if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
+                            state.pending_sitemap_urls.append(item.loc)
                         continue

-
-
-
+                    if isinstance(item, SitemapUrl):
+                        url = item.loc
+
+                        state = await self._get_state()
+
+                        # Skip if already processed
+                        if url in state.current_sitemap_processed_urls:
+                            continue
+
+                        # Check if URL should be included
+                        if not self._check_url_patterns(url, self._include, self._exclude):
+                            continue
+
+                        # Check if we have capacity in the queue
+                        await self._queue_has_capacity.wait()
+
+                        state = await self._get_state()
+                        async with self._queue_lock:
+                            state.url_queue.append(url)
+                            state.current_sitemap_processed_urls.add(url)
+                            state.total_count += 1
+                            if len(state.url_queue) >= self._max_buffer_size:
+                                # Notify that the queue is full
+                                self._queue_has_capacity.clear()
+
+                # Clear current sitemap after processing
+                state = await self._get_state()
+                current_sitemap_url = state.in_progress_sitemap_url
+                state.in_progress_sitemap_url = None
+                if current_sitemap_url:
+                    state.processed_sitemap_urls.add(current_sitemap_url)
+                state.current_sitemap_processed_urls.clear()
+
+            # Mark as completed after processing all sitemap urls
+            state.completed = True

         except Exception:
             logger.exception('Error loading sitemaps')
             raise

+    @override
     async def get_total_count(self) -> int:
         """Return the total number of URLs found so far."""
-
+        state = await self._get_state()
+        return state.total_count

+    @override
+    async def get_handled_count(self) -> int:
+        """Return the number of URLs that have been handled."""
+        state = await self._get_state()
+        return state.handled_count
+
+    @override
     async def is_empty(self) -> bool:
         """Check if there are no more URLs to process."""
-
+        state = await self._get_state()
+        return not state.url_queue

+    @override
     async def is_finished(self) -> bool:
         """Check if all URLs have been processed."""
-
+        state = await self._get_state()
+        return not state.url_queue and len(state.in_progress) == 0 and self._loading_task.done()

+    @override
     async def fetch_next_request(self) -> Request | None:
         """Fetch the next request to process."""
-        while not (
-
-
+        while not (await self.is_finished()):
+            state = await self._get_state()
+            if not state.url_queue:
+                await asyncio.sleep(0.1)
                 continue

-
+            async with self._queue_lock:
+                url = state.url_queue.popleft()
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
+                state.in_progress.add(request.url)
+                if len(state.url_queue) < self._max_buffer_size:
+                    self._queue_has_capacity.set()

-            request = Request.from_url(url)
-            self._in_progress.add(request.id)
                 return request

         return None

+    @override
     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
         """Mark a request as successfully handled."""
-
-
-
+        state = await self._get_state()
+        if request.url in state.in_progress:
+            state.in_progress.remove(request.url)
+            state.handled_count += 1
         return None

-    async def get_handled_count(self) -> int:
-        """Return the number of handled requests."""
-        return self._handled_count
-
     async def abort_loading(self) -> None:
         """Abort the sitemap loading process."""
         if self._loading_task and not self._loading_task.done():
             self._loading_task.cancel()
             with suppress(asyncio.CancelledError):
                 await self._loading_task
+
+    async def start(self) -> None:
+        """Start the sitemap loading process."""
+        if self._loading_task and not self._loading_task.done():
+            return
+        self._loading_task = asyncio.create_task(self._load_sitemaps())
+
+    async def close(self) -> None:
+        """Close the request loader."""
+        await self.abort_loading()
+        await self._state.teardown()
+
+    async def __aenter__(self) -> SitemapRequestLoader:
+        """Enter the context manager."""
+        await self.start()
+        return self
+
+    async def __aexit__(
+        self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None
+    ) -> None:
+        """Exit the context manager."""
+        await self.close()
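For orientation, a minimal usage sketch of the reworked loader follows. The constructor keywords (`persist_state_key`, `transform_request_function`), the `'skip'`/`'unchanged'` return values, and the async-context-manager protocol are taken from the diff above; the sitemap URL, the state key, and the manual fetch loop are illustrative assumptions rather than part of this release.

import asyncio

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def skip_images(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop obvious binary URLs; leave everything else untouched.
    if options['url'].endswith(('.jpg', '.png')):
        return 'skip'
    return 'unchanged'


async def main() -> None:
    # __aenter__ starts the background parsing task; __aexit__ aborts it and
    # tears down the recoverable state.
    async with SitemapRequestLoader(
        ['https://crawlee.dev/sitemap.xml'],  # illustrative sitemap URL
        http_client=HttpxHttpClient(),
        persist_state_key='sitemap-loader-state',  # enables resume after interruption
        transform_request_function=skip_images,
    ) as loader:
        while not await loader.is_finished():
            request = await loader.fetch_next_request()
            if request is None:
                continue
            print('Fetched:', request.url)
            await loader.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())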
crawlee/router.py CHANGED

@@ -1,13 +1,17 @@
 from __future__ import annotations

+import asyncio
 from collections.abc import Awaitable, Callable
 from typing import Generic, TypeVar

+from crawlee._request import RequestState
 from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group

 __all__ = ['Router']

+from crawlee.errors import UserHandlerTimeoutError
+
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
 RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]

@@ -89,13 +93,19 @@ class Router(Generic[TCrawlingContext]):

     async def __call__(self, context: TCrawlingContext) -> None:
         """Invoke a request handler that matches the request label (or the default)."""
+        context.request.state = RequestState.REQUEST_HANDLER
         if context.request.label is None or context.request.label not in self._handlers_by_label:
             if self._default_handler is None:
                 raise RuntimeError(
                     f'No handler matches label `{context.request.label}` and no default handler is configured'
                 )

-
+            user_defined_handler = self._default_handler
+        else:
+            user_defined_handler = self._handlers_by_label[context.request.label]

-
-
+        try:
+            return await user_defined_handler(context)
+        except asyncio.TimeoutError as e:
+            # Timeout in handler, but not timeout of handler.
+            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
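The router change above funnels both the default and the label-specific handlers through a single code path and wraps an `asyncio.TimeoutError` raised inside a handler in `UserHandlerTimeoutError`. A short sketch of the standard `Router` wiring this affects; the label name and crawler class are illustrative:

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.router import Router

router = Router[HttpCrawlingContext]()


@router.default_handler
async def default_handler(context: HttpCrawlingContext) -> None:
    # Requests without a matching label end up here; a timeout raised inside
    # this coroutine now surfaces as UserHandlerTimeoutError.
    context.log.info(f'Default handler: {context.request.url}')


@router.handler('DETAIL')
async def detail_handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'Detail page: {context.request.url}')


crawler = HttpCrawler(request_handler=router)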
crawlee/sessions/_cookies.py CHANGED

@@ -10,6 +10,7 @@ from crawlee._utils.docs import docs_group

 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from typing import TypeGuard


 @docs_group('Session management')
@@ -66,17 +67,18 @@ class SessionCookies:

         self._jar = CookieJar()

-        if isinstance(cookies,
-            for key, value in cookies.items():
-                self.set(key, value)
-
-        elif isinstance(cookies, list):
+        if isinstance(cookies, list):
             for item in cookies:
                 self.set(**item)

         elif isinstance(cookies, SessionCookies):
             for cookie in cookies.jar:
-                self.
+                self._jar.set_cookie(cookie)
+
+        elif isinstance(cookies, dict):
+            cookies_dict: dict[str, str] = cookies
+            for key, value in cookies_dict.items():
+                self.set(key, value)

     @property
     def jar(self) -> CookieJar:
@@ -151,8 +153,8 @@ class SessionCookies:
         if cookie.expires:
             cookie_dict['expires'] = cookie.expires

-        if (same_site := cookie.get_nonstandard_attr('SameSite')) and same_site
-            cookie_dict['same_site'] = same_site
+        if (same_site := cookie.get_nonstandard_attr('SameSite')) and self._is_valid_same_site(same_site):
+            cookie_dict['same_site'] = same_site

         return cookie_dict

@@ -273,3 +275,6 @@ class SessionCookies:
         """Return hash based on the cookies key attributes."""
         cookie_tuples = frozenset((cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in self._jar)
         return hash(cookie_tuples)
+
+    def _is_valid_same_site(self, value: str | None) -> TypeGuard[Literal['Lax', 'None', 'Strict']]:
+        return value in {'Lax', 'None', 'Strict'}
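The constructor above now distinguishes list, `SessionCookies`, and dict inputs, and `same_site` values are narrowed with a `TypeGuard`. A brief sketch of the accepted inputs, assuming the usual `crawlee.sessions.SessionCookies` export and the `set()` keyword arguments; the cookie names and domain are illustrative:

from crawlee.sessions import SessionCookies

# Plain dict input - handled by the dict branch of the constructor.
cookies = SessionCookies({'session_id': 'abc123'})

# Additional cookie with explicit attributes.
cookies.set('locale', 'en-US', domain='example.com', path='/')

# Copy-construction goes through the SessionCookies branch and clones the jar.
copied = SessionCookies(cookies)

for cookie in copied.jar:
    print(cookie.name, cookie.value, cookie.domain)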
crawlee/sessions/_models.py CHANGED

@@ -20,7 +20,7 @@ from ._session import Session
 class SessionModel(BaseModel):
     """Model for a Session object."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     id: Annotated[str, Field(alias='id')]
     max_age: Annotated[timedelta, Field(alias='maxAge')]
@@ -38,7 +38,7 @@ class SessionModel(BaseModel):
 class SessionPoolModel(BaseModel):
     """Model for a SessionPool object."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     max_pool_size: Annotated[int, Field(alias='maxPoolSize')]

@@ -63,19 +63,19 @@ class SessionPoolModel(BaseModel):
         ),
     ]

-    @computed_field(alias='sessionCount')
+    @computed_field(alias='sessionCount')
     @property
     def session_count(self) -> int:
         """Get the total number of sessions currently maintained in the pool."""
         return len(self.sessions)

-    @computed_field(alias='usableSessionCount')
+    @computed_field(alias='usableSessionCount')
     @property
     def usable_session_count(self) -> int:
         """Get the number of sessions that are currently usable."""
         return len([session for _, session in self.sessions.items() if session.is_usable])

-    @computed_field(alias='retiredSessionCount')
+    @computed_field(alias='retiredSessionCount')
     @property
     def retired_session_count(self) -> int:
         """Get the number of sessions that are no longer usable."""
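The `validate_by_name`/`validate_by_alias` config and the aliased `@computed_field` properties follow standard Pydantic v2.11+ behavior. A self-contained sketch of what this configuration buys; the model and field names below are hypothetical, not taken from crawlee:

from pydantic import BaseModel, ConfigDict, Field, computed_field


class PoolStats(BaseModel):
    # Accept both field names and camelCase aliases on input,
    # mirroring the config now used by SessionModel / SessionPoolModel.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    max_pool_size: int = Field(alias='maxPoolSize')
    usable: int = Field(alias='usableCount')

    @computed_field(alias='retiredCount')  # serialized under the alias
    @property
    def retired(self) -> int:
        return self.max_pool_size - self.usable


stats = PoolStats(maxPoolSize=10, usable=7)    # alias input
same = PoolStats(max_pool_size=10, usable=7)   # field-name input is also accepted
print(stats.model_dump(by_alias=True))
# {'maxPoolSize': 10, 'usableCount': 7, 'retiredCount': 3}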
crawlee/sessions/_session_pool.py CHANGED

@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.

-        This is
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.

         Args:

crawlee/statistics/_error_snapshotter.py CHANGED

@@ -32,7 +32,7 @@ class ErrorSnapshotter:
         """Capture error snapshot and save it to key value store.

         It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
         returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
         an exception.

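The clarified `add_session` docstring covers injecting an externally created session into the pool. A hedged sketch of that flow; the session id and pool size are illustrative and the surrounding setup assumes the usual `crawlee.sessions` exports:

import asyncio

from crawlee.sessions import Session, SessionPool


async def main() -> None:
    async with SessionPool(max_pool_size=10) as pool:
        # Add an externally created session, e.g. one pre-loaded with cookies;
        # the pool otherwise creates sessions on its own.
        custom = Session(id='pre-authenticated')
        pool.add_session(custom)

        session = await pool.get_session()
        print(session.id)


asyncio.run(main())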