crawlee 1.0.5b18__py3-none-any.whl → 1.2.2b24__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recurring_task.py +12 -3
- crawlee/_utils/sitemap.py +12 -5
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/browsers/_browser_pool.py +1 -1
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +53 -17
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +20 -49
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +138 -124
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -22
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +1 -3
- crawlee/request_loaders/_sitemap_request_loader.py +18 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +2 -21
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +5 -4
- crawlee/storage_clients/_redis/_client_mixin.py +1 -4
- crawlee/storage_clients/_redis/_dataset_client.py +6 -2
- crawlee/storage_clients/_redis/_key_value_store_client.py +3 -5
- crawlee/storage_clients/_redis/_request_queue_client.py +5 -8
- crawlee/storage_clients/_redis/_storage_client.py +12 -9
- crawlee/storage_clients/_redis/_utils.py +1 -1
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_storage_client.py +0 -9
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +10 -16
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +63 -62
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/http_clients/_impit.py
CHANGED
@@ -6,7 +6,7 @@ from logging import getLogger
 from typing import TYPE_CHECKING, Any, TypedDict

 from cachetools import LRUCache
-from impit import AsyncClient, Browser, HTTPError, Response, TransportError
+from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
 from impit import ProxyError as ImpitProxyError
 from typing_extensions import override

@@ -125,6 +125,7 @@ class ImpitHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

@@ -134,7 +135,10 @@ class ImpitHttpClient(HttpClient):
                 method=request.method,
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -157,6 +161,7 @@ class ImpitHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -165,8 +170,14 @@ class ImpitHttpClient(HttpClient):

         try:
             response = await client.request(
-                method=method,
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -189,14 +200,18 @@ class ImpitHttpClient(HttpClient):
     ) -> AsyncGenerator[HttpResponse]:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

-
-
-
-
-
-
-
-
+        try:
+            response = await client.request(
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
+                stream=True,
+            )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
+
         try:
             yield _ImpitResponse(response)
         finally:
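The change above threads a per-request `timeout` through to impit and normalizes impit's `TimeoutException` into `asyncio.TimeoutError`. A minimal sketch of that mapping pattern, using a hypothetical `fetch` helper rather than the real `ImpitHttpClient` methods (the request kwargs shown are the ones used in the diff):

```python
import asyncio
from datetime import timedelta

from impit import AsyncClient, TimeoutException  # names taken from the diff above


async def fetch(url: str, timeout: timedelta | None = None) -> int:
    """Hypothetical helper mirroring the pattern added in _impit.py."""
    client = AsyncClient()
    try:
        response = await client.request(
            method='GET',
            url=url,
            # impit takes the timeout in seconds, crawlee passes a timedelta.
            timeout=timeout.total_seconds() if timeout else None,
        )
    except TimeoutException as exc:
        # Normalize the client-specific exception so callers only handle asyncio.TimeoutError.
        raise asyncio.TimeoutError from exc
    return response.status_code
```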
crawlee/otel/crawler_instrumentor.py
CHANGED
@@ -3,9 +3,7 @@ from __future__ import annotations
 import inspect
 from typing import TYPE_CHECKING, Any

-from opentelemetry.instrumentation.instrumentor import (
-    BaseInstrumentor,
-)
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
 from opentelemetry.instrumentation.utils import unwrap
 from opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME
 from opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD
crawlee/request_loaders/_sitemap_request_loader.py
CHANGED
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -112,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.

@@ -125,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
@@ -132,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function

         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -224,7 +230,7 @@ class SitemapRequestLoader(RequestLoader):
                 continue
             state.in_progress_sitemap_url = sitemap_url

-            parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True)
+            parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)

             async for item in parse_sitemap(
                 [SitemapSource(type='url', url=sitemap_url)],
@@ -313,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):

             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
crawlee/router.py
CHANGED
@@ -1,13 +1,17 @@
 from __future__ import annotations

+import asyncio
 from collections.abc import Awaitable, Callable
 from typing import Generic, TypeVar

+from crawlee._request import RequestState
 from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group

 __all__ = ['Router']

+from crawlee.errors import UserHandlerTimeoutError
+
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
 RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]

@@ -89,13 +93,19 @@ class Router(Generic[TCrawlingContext]):

     async def __call__(self, context: TCrawlingContext) -> None:
         """Invoke a request handler that matches the request label (or the default)."""
+        context.request.state = RequestState.REQUEST_HANDLER
         if context.request.label is None or context.request.label not in self._handlers_by_label:
             if self._default_handler is None:
                 raise RuntimeError(
                     f'No handler matches label `{context.request.label}` and no default handler is configured'
                 )

-
+            user_defined_handler = self._default_handler
+        else:
+            user_defined_handler = self._handlers_by_label[context.request.label]

-
-
+        try:
+            return await user_defined_handler(context)
+        except asyncio.TimeoutError as e:
+            # Timeout in handler, but not timeout of handler.
+            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
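With this router change, an `asyncio.TimeoutError` escaping a user handler is re-raised as `UserHandlerTimeoutError`, so the crawler can tell a timeout raised inside the handler apart from the handler itself exceeding its time budget. A small sketch of a handler where that distinction matters; `slow_call` is a made-up stand-in, and the context type and import path simply mirror the diff above:

```python
import asyncio

from crawlee._types import BasicCrawlingContext  # import path as used in the diff
from crawlee.router import Router

router = Router[BasicCrawlingContext]()


async def slow_call(context: BasicCrawlingContext) -> None:
    await asyncio.sleep(10)  # stand-in for slow user work


@router.default_handler
async def default_handler(context: BasicCrawlingContext) -> None:
    # If this inner timeout fires, the TimeoutError propagates out of the handler
    # and the router wraps it in UserHandlerTimeoutError (see the change above).
    async with asyncio.timeout(5):
        await slow_call(context)
```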
crawlee/sessions/_cookies.py
CHANGED
@@ -10,6 +10,7 @@ from crawlee._utils.docs import docs_group

 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from typing import TypeGuard


 @docs_group('Session management')
@@ -66,17 +67,18 @@ class SessionCookies:

         self._jar = CookieJar()

-        if isinstance(cookies,
-            for key, value in cookies.items():
-                self.set(key, value)
-
-        elif isinstance(cookies, list):
+        if isinstance(cookies, list):
             for item in cookies:
                 self.set(**item)

         elif isinstance(cookies, SessionCookies):
             for cookie in cookies.jar:
-                self.
+                self._jar.set_cookie(cookie)
+
+        elif isinstance(cookies, dict):
+            cookies_dict: dict[str, str] = cookies
+            for key, value in cookies_dict.items():
+                self.set(key, value)

     @property
     def jar(self) -> CookieJar:
@@ -151,8 +153,8 @@ class SessionCookies:
         if cookie.expires:
             cookie_dict['expires'] = cookie.expires

-        if (same_site := cookie.get_nonstandard_attr('SameSite')) and same_site
-            cookie_dict['same_site'] = same_site
+        if (same_site := cookie.get_nonstandard_attr('SameSite')) and self._is_valid_same_site(same_site):
+            cookie_dict['same_site'] = same_site

         return cookie_dict

@@ -273,3 +275,6 @@ class SessionCookies:
         """Return hash based on the cookies key attributes."""
         cookie_tuples = frozenset((cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in self._jar)
         return hash(cookie_tuples)
+
+    def _is_valid_same_site(self, value: str | None) -> TypeGuard[Literal['Lax', 'None', 'Strict']]:
+        return value in {'Lax', 'None', 'Strict'}
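The `TypeGuard` helper added above lets type checkers narrow the raw `SameSite` attribute to the literal values the cookie dict accepts. A standalone sketch of the same narrowing pattern, independent of the crawlee classes:

```python
from typing import Literal, TypeGuard

SameSite = Literal['Lax', 'None', 'Strict']


def is_valid_same_site(value: str | None) -> TypeGuard[SameSite]:
    # True only for the three allowed values; type checkers then treat
    # `value` as the SameSite literal inside the guarded branch.
    return value in {'Lax', 'None', 'Strict'}


raw: str | None = 'Lax'
if is_valid_same_site(raw):
    same_site: SameSite = raw  # no cast or type: ignore needed here
```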
crawlee/sessions/_models.py
CHANGED
@@ -63,19 +63,19 @@ class SessionPoolModel(BaseModel):
         ),
     ]

-    @computed_field(alias='sessionCount')
+    @computed_field(alias='sessionCount')
     @property
     def session_count(self) -> int:
         """Get the total number of sessions currently maintained in the pool."""
         return len(self.sessions)

-    @computed_field(alias='usableSessionCount')
+    @computed_field(alias='usableSessionCount')
     @property
     def usable_session_count(self) -> int:
         """Get the number of sessions that are currently usable."""
         return len([session for _, session in self.sessions.items() if session.is_usable])

-    @computed_field(alias='retiredSessionCount')
+    @computed_field(alias='retiredSessionCount')
     @property
     def retired_session_count(self) -> int:
         """Get the number of sessions that are no longer usable."""
crawlee/statistics/_models.py
CHANGED
@@ -1,9 +1,10 @@
 from __future__ import annotations

 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
-from typing import Annotated, Any
+from typing import TYPE_CHECKING, Annotated, Any

 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field
 from typing_extensions import override
@@ -76,10 +77,20 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-
-
-
-
+
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        errors: dict[str, Any] = {}
+        retry_errors: dict[str, Any] = {}
+        requests_with_status_code: dict[str, int] = {}
+    else:
+        errors: Annotated[dict[str, Any], Field(default_factory=dict)]
+        retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)]
+        requests_with_status_code: Annotated[
+            dict[str, int],
+            Field(alias='requestsWithStatusCode', default_factory=dict),
+        ]
+
     stats_persisted_at: Annotated[
         datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc))
     ] = None
@@ -93,22 +104,53 @@ class StatisticsState(BaseModel):
         ),
     ] = {}

-
+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)
     @property
     def request_total_duration(self) -> timedelta:
         return self.request_total_finished_duration + self.request_total_failed_duration

-    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
+    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_failed_duration(self) -> timedelta | None:
         return (self.request_total_failed_duration / self.requests_failed) if self.requests_failed else None

-    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
+    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_finished_duration(self) -> timedelta | None:
         return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None

-    @computed_field(alias='requestsTotal')
+    @computed_field(alias='requestsTotal')
     @property
     def requests_total(self) -> int:
         return self.requests_failed + self.requests_finished
crawlee/statistics/_statistics.py
CHANGED
@@ -110,9 +110,6 @@ class Statistics(Generic[TStatisticsState]):
         # Flag to indicate the context state.
         self._active = False

-        # Pre-existing runtime offset, that can be non-zero when restoring serialized state from KVS.
-        self._runtime_offset = timedelta(seconds=0)
-
     def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
@@ -168,8 +165,8 @@ class Statistics(Generic[TStatisticsState]):
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')

         await self._state.initialize()
-
-        self.
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None

         # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
@@ -200,10 +197,6 @@ class Statistics(Generic[TStatisticsState]):
         # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
         self.state.crawler_finished_at = datetime.now(timezone.utc)
-        self.state.crawler_runtime = (
-            self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
-        )
-
         self._active = False
         await self._state.teardown()

@@ -262,20 +255,8 @@ class Statistics(Generic[TStatisticsState]):

         del self._requests_in_progress[request_id_or_key]

-    def _update_crawler_runtime(self) -> None:
-        current_run_duration = (
-            (datetime.now(timezone.utc) - self.state.crawler_last_started_at)
-            if self.state.crawler_last_started_at
-            else timedelta()
-        )
-        self.state.crawler_runtime = current_run_duration + self._runtime_offset
-
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self._active:
-            # Only update state when active. If not, just report the last known runtime.
-            self._update_crawler_runtime()
-
         total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)
crawlee/storage_clients/_base/_dataset_client.py
CHANGED
@@ -87,8 +87,8 @@ class DatasetClient(ABC):

         The backend method for the `Dataset.iterate_items` call.
         """
-        # This syntax is to make
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:
+        if False:
             yield 0
crawlee/storage_clients/_base/_key_value_store_client.py
CHANGED
@@ -72,10 +72,10 @@ class KeyValueStoreClient(ABC):

         The backend method for the `KeyValueStore.iterate_keys` call.
         """
-        # This syntax is to make
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:
+        if False:
            yield 0

    @abstractmethod
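Both base-client comments refer to the mypy-documented trick for abstract async iterators: an unreachable `yield` after `raise NotImplementedError` turns the abstract method into an async generator, so concrete overrides type-check as `AsyncIterator` implementations. An illustrative reduction of the idea, not the actual crawlee base classes:

```python
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator


class ItemClient(ABC):
    @abstractmethod
    async def iterate_items(self) -> AsyncIterator[dict]:
        raise NotImplementedError
        # Unreachable, but it makes this method an async generator for type checkers.
        if False:
            yield {}


class InMemoryItemClient(ItemClient):
    async def iterate_items(self) -> AsyncIterator[dict]:
        for item in ({'a': 1}, {'b': 2}):
            yield item
```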
crawlee/storage_clients/_file_system/_dataset_client.py
CHANGED
@@ -134,7 +134,7 @@ class FileSystemDatasetClient(DatasetClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = DatasetMetadata(**file_content)
@@ -163,7 +163,7 @@ class FileSystemDatasetClient(DatasetClient):

         # If the dataset directory exists, reconstruct the client from the metadata file.
         if path_to_dataset.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -473,9 +473,10 @@ class FileSystemDatasetClient(DatasetClient):
         """
         # Retrieve and sort all JSON files in the dataset directory numerically.
         files = await asyncio.to_thread(
-            sorted
-
-
+            lambda: sorted(
+                self.path_to_dataset.glob('*.json'),
+                key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
+            )
         )

         # Remove the metadata file from the list if present.
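The file-system clients now wrap whole expressions in a lambda before handing them to `asyncio.to_thread`, so both the directory scan and the sort run in the worker thread rather than on the event loop. A standalone sketch of that pattern; the directory path is arbitrary:

```python
import asyncio
from pathlib import Path


async def list_dataset_files(directory: Path) -> list[Path]:
    # The lambda defers the glob and the sort, so both execute off the event loop.
    return await asyncio.to_thread(
        lambda: sorted(
            directory.glob('*.json'),
            key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
        )
    )


if __name__ == '__main__':
    print(asyncio.run(list_dataset_files(Path('.'))))
```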
crawlee/storage_clients/_file_system/_key_value_store_client.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import asyncio
+import functools
 import json
 import shutil
 import urllib.parse
@@ -133,7 +134,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = KeyValueStoreMetadata(**file_content)
@@ -162,7 +163,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):

         # If the key-value store directory exists, reconstruct the client from the metadata file.
         if path_to_kvs.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -239,7 +240,9 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         # Read the metadata file
         async with self._lock:
             try:
-                file = await asyncio.to_thread(
+                file = await asyncio.to_thread(
+                    functools.partial(record_metadata_filepath.open, mode='r', encoding='utf-8'),
+                )
             except FileNotFoundError:
                 logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                 return None
@@ -373,7 +376,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):

         # List and sort all files *inside* a brief lock, then release it immediately:
         async with self._lock:
-            files = sorted(await asyncio.to_thread(list
+            files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*'))))

         count = 0

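Where keyword arguments such as `mode` and `encoding` are needed, the key-value store and request-queue clients bind them to the `open` call with `functools.partial` before passing the callable to `asyncio.to_thread`. A minimal sketch of that pattern with an illustrative path and helper:

```python
import asyncio
import functools
import json
from pathlib import Path


async def read_metadata(path: Path) -> dict:
    # functools.partial bundles the keyword arguments with path.open; to_thread
    # then runs the blocking open() call in a worker thread.
    file = await asyncio.to_thread(functools.partial(path.open, mode='r', encoding='utf-8'))
    try:
        return json.load(file)
    finally:
        file.close()
```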
crawlee/storage_clients/_file_system/_request_queue_client.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import asyncio
+import functools
 import json
 import shutil
 from collections import deque
@@ -197,7 +198,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
@@ -232,7 +233,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):

         # If the RQ directory exists, reconstruct the client from the metadata file.
         if path_to_rq.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(path_to_metadata.open, encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -756,7 +757,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True)

         # List all the json files.
-        files = await asyncio.to_thread(list
+        files = await asyncio.to_thread(lambda: list(path_to_rq.glob('*.json')))

         # Filter out metadata file and non-file entries.
         filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
@@ -775,7 +776,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(functools.partial(file_path.open, mode='r', encoding='utf-8'))
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
crawlee/storage_clients/_redis/_client_mixin.py
CHANGED
@@ -179,7 +179,7 @@ class RedisClientMixin:
         """Create a new Redis pipeline."""
         async with self._redis.pipeline() as pipe:
             try:
-                pipe.multi()
+                pipe.multi()
                 yield pipe
             finally:
                 if with_execute:
@@ -187,7 +187,6 @@ class RedisClientMixin:

     async def _create_storage(self, pipeline: Pipeline) -> None:
         """Create the actual storage structure in Redis."""
-        _ = pipeline  # To avoid unused variable mypy error

     async def _create_script(self, script_name: str) -> AsyncScript:
         """Load a Lua script from a file and return a Script object."""
@@ -262,8 +261,6 @@ class RedisClientMixin:
             pipeline: The Redis pipeline to use for the update.
             **kwargs: Storage-specific update parameters.
         """
-        _ = pipeline  # To avoid unused variable mypy error
-        _ = kwargs

     async def _update_metadata(
         self,
crawlee/storage_clients/_redis/_dataset_client.py
CHANGED
@@ -179,13 +179,15 @@ class RedisDatasetClient(DatasetClient, RedisClientMixin):
             case (True, int(), None):
                 json_path += f'[:-{offset}]'
             case (True, int(), int()):
-
+                # ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887.
+                json_path += f'[-{offset + limit}:-{offset}]'  # ty: ignore[unsupported-operator]
             case (False, 0, int()):
                 json_path += f'[:{limit}]'
             case (False, int(), None):
                 json_path += f'[{offset}:]'
             case (False, int(), int()):
-
+                # ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887.
+                json_path += f'[{offset}:{offset + limit}]'  # ty: ignore[unsupported-operator]

         if json_path == '$':
             json_path = '$[*]'
@@ -195,6 +197,8 @@ class RedisDatasetClient(DatasetClient, RedisClientMixin):
         if data is None:
             data = []

+        data = [item for item in data if isinstance(item, dict)]
+
         if skip_empty:
             data = [item for item in data if item]

crawlee/storage_clients/_redis/_key_value_store_client.py
CHANGED
@@ -144,7 +144,7 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):

         async with self._get_pipeline() as pipe:
             # redis-py typing issue
-            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  #
+            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  # ty: ignore[invalid-argument-type]

             await await_redis_response(
                 pipe.hset(
@@ -174,9 +174,7 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):

         # Query the record by key
         # redis-py typing issue
-        value_bytes: bytes | None = await await_redis_response(
-            self._redis.hget(self._items_key, key)  # type: ignore[arg-type]
-        )
+        value_bytes: bytes | None = await await_redis_response(self._redis.hget(self._items_key, key))  # ty: ignore[invalid-assignment]

         if value_bytes is None:
             logger.warning(f'Value for key "{key}" is missing.')
@@ -225,7 +223,7 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):
             raise TypeError('The items data was received in an incorrect format.')

         # Get all keys, sorted alphabetically
-        keys = sorted(items_data.keys())
+        keys = sorted(items_data.keys())  # ty: ignore[invalid-argument-type]

         # Apply exclusive_start_key filter if provided
         if exclusive_start_key is not None: