crawlee 0.6.13b43__py3-none-any.whl → 1.1.2b4__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to their public registries. It is provided for informational purposes only.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +87 -25
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
- crawlee/crawlers/_basic/_basic_crawler.py +139 -96
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +52 -10
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/events/_event_manager.py +3 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +5 -4
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -7
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -8
- crawlee/storage_clients/_file_system/_request_queue_client.py +31 -15
- crawlee/storage_clients/_file_system/_storage_client.py +2 -2
- crawlee/storage_clients/_memory/_dataset_client.py +4 -5
- crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +5 -3
- crawlee/storages/_key_value_store.py +11 -6
- crawlee/storages/_request_queue.py +5 -3
- crawlee/storages/_storage_instance_manager.py +54 -68
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/METADATA +17 -5
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/RECORD +80 -58
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/WHEEL +1 -1
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/licenses/LICENSE +0 -0
crawlee/events/_event_manager.py
CHANGED
@@ -130,11 +130,13 @@ class EventManager:
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
         self._active = False

     @overload
crawlee/fingerprint_suite/_header_generator.py
CHANGED
@@ -11,9 +11,9 @@ if TYPE_CHECKING:


 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type == 'chromium':
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
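The hunk above is why the branded 'chrome' channel is now accepted alongside 'chromium'. A tiny sketch of the resulting mapping, assuming the helper is imported from the internal module named in this diff (not public API):

from crawlee.fingerprint_suite._header_generator import (
    fingerprint_browser_type_from_playwright_browser_type,
)

# Both Chromium-based identifiers now resolve to the 'chrome' fingerprint profile.
assert fingerprint_browser_type_from_playwright_browser_type('chromium') == 'chrome'
assert fingerprint_browser_type_from_playwright_browser_type('chrome') == 'chrome'
assert fingerprint_browser_type_from_playwright_browser_type('firefox') == 'firefox'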
crawlee/http_clients/_base.py
CHANGED
@@ -104,6 +104,7 @@ class HttpClient(ABC):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         """Perform the crawling for a given request.

@@ -114,6 +115,7 @@ class HttpClient(ABC):
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
             statistics: The statistics object to register status codes.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
@@ -132,6 +134,7 @@ class HttpClient(ABC):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         """Send an HTTP request via the client.

@@ -144,6 +147,7 @@ class HttpClient(ABC):
             payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
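Both abstract methods gain an optional `timeout` argument so a single request can be given a time budget; the concrete clients below translate it into their native timeout types and re-raise expiry as `asyncio.TimeoutError`. A rough usage sketch, not taken from the package, assuming `ImpitHttpClient` is exported from `crawlee.http_clients` and using a placeholder URL:

import asyncio
from datetime import timedelta

from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    client = ImpitHttpClient()
    try:
        # Per-request budget passed as a timedelta; expiry surfaces as asyncio.TimeoutError.
        response = await client.send_request('https://example.com', timeout=timedelta(seconds=10))
        print(response.status_code)
    except asyncio.TimeoutError:
        print('Request did not finish within 10 seconds.')


asyncio.run(main())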
crawlee/http_clients/_curl_impersonate.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from typing import TYPE_CHECKING, Any

@@ -10,6 +11,7 @@ from curl_cffi.requests.cookies import Cookies as CurlCookies
 from curl_cffi.requests.cookies import CurlMorsel
 from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
 from curl_cffi.requests.exceptions import RequestException as CurlRequestError
+from curl_cffi.requests.exceptions import Timeout
 from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
 from typing_extensions import override

@@ -147,6 +149,7 @@ class CurlImpersonateHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)

@@ -157,7 +160,10 @@ class CurlImpersonateHttpClient(HttpClient):
                 headers=request.headers,
                 data=request.payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -186,6 +192,7 @@ class CurlImpersonateHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -200,7 +207,10 @@ class CurlImpersonateHttpClient(HttpClient):
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -241,6 +251,8 @@ class CurlImpersonateHttpClient(HttpClient):
                 stream=True,
                 timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
crawlee/http_clients/_httpx.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, cast
@@ -146,6 +147,7 @@ class HttpxHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)
         headers = self._combine_headers(request.headers)
@@ -157,10 +159,13 @@ class HttpxHttpClient(HttpClient):
             content=request.payload,
             cookies=session.cookies.jar if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
+            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -185,6 +190,7 @@ class HttpxHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         client = self._get_client(proxy_info.url if proxy_info else None)

@@ -195,10 +201,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
+            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -228,10 +237,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
-            timeout=timeout,
+            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
         )

-        response = await client.send(http_request, stream=True)
+        try:
+            response = await client.send(http_request, stream=True)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc

         try:
             yield _HttpxResponse(response)
@@ -246,7 +258,7 @@ class HttpxHttpClient(HttpClient):
         headers: HttpHeaders | dict[str, str] | None,
         payload: HttpPayload | None,
         session: Session | None = None,
-        timeout: timedelta | None = None,
+        timeout: httpx.Timeout | None = None,
     ) -> httpx.Request:
         """Build an `httpx.Request` using the provided parameters."""
         if isinstance(headers, dict) or headers is None:
@@ -254,15 +266,13 @@ class HttpxHttpClient(HttpClient):

         headers = self._combine_headers(headers)

-        httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None
-
         return client.build_request(
             url=url,
             method=method,
             headers=dict(headers) if headers else None,
             content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-            timeout=
+            timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
         )

     def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
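All three bundled clients follow the same pattern: the library-specific timeout exception is caught and re-raised as `asyncio.TimeoutError`, so retry logic upstream only has to handle one exception type. A standalone sketch of that translation using plain httpx (not the package's wrapper class):

import asyncio

import httpx


async def fetch(client: httpx.AsyncClient, url: str) -> httpx.Response:
    # Translate the library-specific timeout error into asyncio.TimeoutError,
    # mirroring the pattern added to the crawlee HTTP clients in this release.
    try:
        return await client.get(url, timeout=10.0)
    except httpx.TimeoutException as exc:
        raise asyncio.TimeoutError from exc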
crawlee/http_clients/_impit.py
CHANGED
@@ -6,7 +6,7 @@ from logging import getLogger
 from typing import TYPE_CHECKING, Any, TypedDict

 from cachetools import LRUCache
-from impit import AsyncClient, Browser, HTTPError, Response, TransportError
+from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
 from impit import ProxyError as ImpitProxyError
 from typing_extensions import override

@@ -125,6 +125,7 @@ class ImpitHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

@@ -134,7 +135,10 @@ class ImpitHttpClient(HttpClient):
                 method=request.method,
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -157,6 +161,7 @@ class ImpitHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -165,8 +170,14 @@ class ImpitHttpClient(HttpClient):

         try:
             response = await client.request(
-                method=method,
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -189,14 +200,18 @@ class ImpitHttpClient(HttpClient):
     ) -> AsyncGenerator[HttpResponse]:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

-
-
-
-
-
-
-
-
+        try:
+            response = await client.request(
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
+                stream=True,
+            )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
+
         try:
             yield _ImpitResponse(response)
         finally:
crawlee/otel/crawler_instrumentor.py
CHANGED
@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

         if request_handling_instrumentation:

-            async def
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined] # valid in our context
                     attributes={
@@ -111,8 +111,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
             # Handpicked interesting methods to instrument
             self._instrumented.extend(
                 [
-                    (_Middleware, 'action',
-                    (_Middleware, 'cleanup',
+                    (_Middleware, 'action', middleware_wrapper),
+                    (_Middleware, 'cleanup', middleware_wrapper),
                     (ContextPipeline, '__call__', context_pipeline_wrapper),
                     (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                     (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml
CHANGED
@@ -5,8 +5,8 @@
 # % endif
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
-# % elif cookiecutter.http_client == '
-# % do extras.append('
+# % elif cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
 # % endif

 [project]
crawlee/request_loaders/_sitemap_request_loader.py
CHANGED
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -90,6 +91,11 @@ class SitemapRequestLoaderState(BaseModel):
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).

+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

@@ -107,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.

@@ -120,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
@@ -127,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function

         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -308,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):

             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
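The new `transform_request_function` hook lets callers rewrite or drop sitemap URLs before they become requests: returning 'skip' drops the URL, 'unchanged' keeps it as-is, and a modified `RequestOptions` dict replaces it. A hedged wiring sketch, assuming `ImpitHttpClient` as the HTTP client, a placeholder sitemap URL, and that `label` is one of the available `RequestOptions` keys:

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop image listings, label everything else so a router could dispatch on it.
    if '/images/' in options['url']:
        return 'skip'
    options['label'] = 'FROM_SITEMAP'
    return options


loader = SitemapRequestLoader(
    sitemap_urls=['https://example.com/sitemap.xml'],
    http_client=ImpitHttpClient(),
    transform_request_function=transform,
)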
crawlee/sessions/_session_pool.py
CHANGED
@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.

-        This is
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.

         Args:
crawlee/statistics/_error_snapshotter.py
CHANGED
@@ -32,7 +32,7 @@ class ErrorSnapshotter:
         """Capture error snapshot and save it to key value store.

         It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
         returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
         an exception.

crawlee/statistics/_models.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Annotated, Any
@@ -76,7 +77,6 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
     errors: dict[str, Any] = Field(default_factory=dict)
     retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
     requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
@@ -93,6 +93,37 @@
         ),
     ] = {}

+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
     @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
     @property
     def request_total_duration(self) -> timedelta:
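With `crawler_runtime` now computed rather than stored, the value persisted by earlier runs is loaded into `_runtime_offset` and the live portion of the current run is added on top. A small plain-datetime illustration of the arithmetic the property performs (not the actual class):

from datetime import datetime, timedelta, timezone

# Runtime accumulated by previous runs; on load it becomes _runtime_offset.
runtime_offset = timedelta(minutes=5)

# Current run: started 90 seconds ago and still going.
crawler_last_started_at = datetime.now(timezone.utc) - timedelta(seconds=90)
crawler_finished_at = None

finished_at = crawler_finished_at or datetime.now(timezone.utc)
crawler_runtime = runtime_offset + finished_at - crawler_last_started_at
print(crawler_runtime)  # roughly 0:06:30, previous runs plus the current run so far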
crawlee/statistics/_statistics.py
CHANGED
@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations

+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
 from crawlee.statistics._error_tracker import ErrorTracker

 if TYPE_CHECKING:
+    from collections.abc import Callable, Coroutine
     from types import TracebackType

+    from crawlee.storages import KeyValueStore
+
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
 logger = getLogger(__name__)
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool | Literal['explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -80,8 +85,6 @@
         self._id = Statistics.__next_id
         Statistics.__next_id += 1

-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -92,9 +95,10 @@

         self._state = RecoverableState(
             default_state=state_model(stats_id=self._id),
-            persist_state_key=persist_state_key or f'
+            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             logger=logger,
         )

@@ -110,8 +114,8 @@
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
             persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
-            persist_state_kvs_name=self._state._persist_state_kvs_name,  # noqa: SLF001
             persist_state_key=self._state._persist_state_key,  # noqa: SLF001
+            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
             log_message=self._log_message,
             periodic_message_logger=self._periodic_message_logger,
             state_model=state_model,
@@ -125,6 +129,7 @@
         persistence_enabled: bool = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -136,6 +141,7 @@
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_key=persist_state_key,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             log_message=log_message,
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,
@@ -158,14 +164,17 @@
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')

-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None

+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True

+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self

     async def __aexit__(
@@ -182,13 +191,14 @@
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')

-
-
-        await self._state.teardown()
+        if not self.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')

+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
-
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
         self._active = False
+        await self._state.teardown()

     @property
     def state(self) -> TStatisticsState:
@@ -247,11 +257,7 @@

     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-
-            raise RuntimeError('The Statistics object is not initialized')
-
-        crawler_runtime = datetime.now(timezone.utc) - self._instance_start
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)

@@ -262,7 +268,7 @@
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],
@@ -282,21 +288,6 @@
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())

-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
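The new `persist_state_kvs_factory` parameter lets the crawler hand `Statistics` the exact `KeyValueStore` to persist into instead of resolving one by name. A sketch of supplying such a factory, assuming `Statistics.with_default_state` and `KeyValueStore.open` behave as in current releases:

from crawlee.statistics import Statistics
from crawlee.storages import KeyValueStore


async def open_stats_store() -> KeyValueStore:
    # Any zero-argument coroutine returning a KeyValueStore satisfies the factory type.
    return await KeyValueStore.open(name='crawl-statistics')


statistics = Statistics.with_default_state(
    persistence_enabled=True,
    persist_state_kvs_factory=open_stats_store,
)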
crawlee/storage_clients/__init__.py
CHANGED
@@ -1,9 +1,25 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+# These imports have only mandatory dependencies, so they are imported directly.
 from ._base import StorageClient
 from ._file_system import FileSystemStorageClient
 from ._memory import MemoryStorageClient

+_install_import_hook(__name__)
+
+# The following imports are wrapped in try_import to handle optional dependencies,
+# ensuring the module can still function even if these dependencies are missing.
+with _try_import(__name__, 'SqlStorageClient'):
+    from ._sql import SqlStorageClient
+
+with _try_import(__name__, 'RedisStorageClient'):
+    from ._redis import RedisStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'RedisStorageClient',
+    'SqlStorageClient',
     'StorageClient',
 ]
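`SqlStorageClient` and `RedisStorageClient` are registered through an import hook, so `crawlee.storage_clients` keeps importing cleanly when the optional SQL or Redis dependencies are absent; requesting a missing client is reported as an ImportError pointing at the missing optional dependency. A minimal sketch of that behaviour (the exact error message depends on the import hook):

from crawlee.storage_clients import MemoryStorageClient

# Always available: backed only by mandatory dependencies.
memory_client = MemoryStorageClient()

try:
    # Resolved lazily; fails when the optional Redis dependency is not installed.
    from crawlee.storage_clients import RedisStorageClient
except ImportError as exc:
    print(f'Redis storage backend unavailable: {exc}')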