crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +156 -131
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_key_value_store.py +5 -2
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/storage_clients/_redis/_storage_client.py

@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+import warnings
+from typing import Literal
+
+from redis.asyncio import Redis
+from typing_extensions import override
+
+from crawlee._utils.docs import docs_group
+from crawlee.configuration import Configuration
+from crawlee.storage_clients._base import StorageClient
+
+from ._dataset_client import RedisDatasetClient
+from ._key_value_store_client import RedisKeyValueStoreClient
+from ._request_queue_client import RedisRequestQueueClient
+
+
+@docs_group('Storage clients')
+class RedisStorageClient(StorageClient):
+    """Redis implementation of the storage client.
+
+    This storage client provides access to datasets, key-value stores, and request queues that persist data
+    to a Redis database v8.0+. Each storage type uses Redis-specific data structures and key patterns for
+    efficient storage and retrieval.
+
+    The client accepts either a Redis connection string or a pre-configured Redis client instance.
+    Exactly one of these parameters must be provided during initialization.
+
+    Storage types use the following Redis data structures:
+    - **Datasets**: Redis JSON arrays for item storage with metadata in JSON objects
+    - **Key-value stores**: Redis hashes for key-value pairs with separate metadata storage
+    - **Request queues**: Redis lists for FIFO queuing, hashes for request data and in-progress tracking,
+      and Bloom filters for request deduplication
+
+    Warning:
+        This is an experimental feature. The behavior and interface may change in future versions.
+    """
+
+    def __init__(
+        self,
+        *,
+        connection_string: str | None = None,
+        redis: Redis | None = None,
+        queue_dedup_strategy: Literal['default', 'bloom'] = 'default',
+        queue_bloom_error_rate: float = 1e-7,
+    ) -> None:
+        """Initialize the Redis storage client.
+
+        Args:
+            connection_string: Redis connection string (e.g., "redis://localhost:6379").
+                Supports standard Redis URL format with optional database selection.
+            redis: Pre-configured Redis client instance.
+            queue_dedup_strategy: Strategy for request queue deduplication. Options are:
+                - 'default': Uses Redis sets for exact deduplication.
+                - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage.
+                  At the default error rate of 1e-7, roughly 1 request in 10 million is falsely considered a duplicate.
+            queue_bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant if
+                `queue_dedup_strategy` is set to 'bloom'.
+        """
+        if redis is None and connection_string is None:
+            raise ValueError('Either redis or connection_string must be provided.')
+
+        if redis is not None and connection_string is not None:
+            raise ValueError('Either redis or connection_string must be provided, not both.')
+
+        if isinstance(redis, Redis) and connection_string is None:
+            self._redis = redis
+
+        if isinstance(connection_string, str) and redis is None:
+            self._redis = Redis.from_url(connection_string)
+
+        self._redis: Redis  # to help type checker
+        self._queue_dedup_strategy = queue_dedup_strategy
+        self._queue_bloom_error_rate = queue_bloom_error_rate
+
+        # Emit the experimental-feature warning only once, at construction time.
+        warnings.warn(
+            (
+                'RedisStorageClient is experimental and its API, behavior, and key structure may change in future '
+                'releases.'
+            ),
+            category=UserWarning,
+            stacklevel=2,
+        )
+
+    @override
+    async def create_dataset_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: Configuration | None = None,
+    ) -> RedisDatasetClient:
+        configuration = configuration or Configuration.get_global_configuration()
+
+        client = await RedisDatasetClient.open(
+            id=id,
+            name=name,
+            alias=alias,
+            redis=self._redis,
+        )
+
+        await self._purge_if_needed(client, configuration)
+        return client
+
+    @override
+    async def create_kvs_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: Configuration | None = None,
+    ) -> RedisKeyValueStoreClient:
+        configuration = configuration or Configuration.get_global_configuration()
+
+        client = await RedisKeyValueStoreClient.open(
+            id=id,
+            name=name,
+            alias=alias,
+            redis=self._redis,
+        )
+
+        await self._purge_if_needed(client, configuration)
+        return client
+
+    @override
+    async def create_rq_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: Configuration | None = None,
+    ) -> RedisRequestQueueClient:
+        configuration = configuration or Configuration.get_global_configuration()
+
+        client = await RedisRequestQueueClient.open(
+            id=id,
+            name=name,
+            alias=alias,
+            redis=self._redis,
+            dedup_strategy=self._queue_dedup_strategy,
+            bloom_error_rate=self._queue_bloom_error_rate,
+        )
+
+        await self._purge_if_needed(client, configuration)
+        return client
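For context, here is a minimal sketch of how this new client could be wired into a crawler. It assumes `RedisStorageClient` is re-exported from `crawlee.storage_clients` (the `__init__.py` additions above suggest it is) and that crawlers accept a `storage_client` argument, as in the rest of the 1.x storage-client API:

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.storage_clients import RedisStorageClient


async def main() -> None:
    # Pass exactly one of `connection_string` or `redis`; the 'bloom' strategy
    # trades exact deduplication for lower memory usage at the configured error rate.
    storage_client = RedisStorageClient(
        connection_string='redis://localhost:6379',
        queue_dedup_strategy='bloom',
        queue_bloom_error_rate=1e-7,
    )

    crawler = ParselCrawler(storage_client=storage_client)

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```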
crawlee/storage_clients/_redis/_utils.py

@@ -0,0 +1,23 @@
+from collections.abc import Awaitable
+from pathlib import Path
+from typing import TypeVar, overload
+
+T = TypeVar('T')
+
+
+@overload
+async def await_redis_response(response: Awaitable[T]) -> T: ...
+@overload
+async def await_redis_response(response: T) -> T: ...
+
+
+async def await_redis_response(response: Awaitable[T] | T) -> T:
+    """Resolve redis-py's ambiguous typing, where a response may or may not be awaitable."""
+    return await response if isinstance(response, Awaitable) else response
+
+
+def read_lua_script(script_name: str) -> str:
+    """Read a Lua script from a file."""
+    file_path = Path(__file__).parent / 'lua_scripts' / script_name
+    with file_path.open(mode='r', encoding='utf-8') as file:
+        return file.read()
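The overloads exist because redis-py's asyncio client types many command results as `Awaitable[T] | T`. A small sketch of the intended call pattern (`_utils` is a private module, imported here purely for illustration):

```python
from redis.asyncio import Redis

from crawlee.storage_clients._redis._utils import await_redis_response


async def queue_length(redis: Redis, queue_key: str) -> int:
    # `llen` is typed as `Awaitable[int] | int`; the helper awaits the result
    # only when it is actually awaitable.
    return await await_redis_response(redis.llen(queue_key))
```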
crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua

@@ -0,0 +1,36 @@
+local added_filter_key = KEYS[1]
+local queue_key = KEYS[2]
+local data_key = KEYS[3]
+
+local forefront = ARGV[1] == '1'
+local unique_keys = cjson.decode(ARGV[2])
+local requests_data = cjson.decode(ARGV[3])
+
+-- Add and check which unique keys are actually new using Bloom filter
+local bf_results = redis.call('bf.madd', added_filter_key, unpack(unique_keys))
+
+local actually_added = {}
+local hset_args = {}
+
+-- Process the results
+for i, unique_key in ipairs(unique_keys) do
+    if bf_results[i] == 1 then
+        -- This key was added by us (did not exist before)
+        table.insert(hset_args, unique_key)
+        table.insert(hset_args, requests_data[unique_key])
+        table.insert(actually_added, unique_key)
+    end
+end
+
+-- Add only those that are actually new
+if #actually_added > 0 then
+    redis.call('hset', data_key, unpack(hset_args))
+
+    if forefront then
+        redis.call('lpush', queue_key, unpack(actually_added))
+    else
+        redis.call('rpush', queue_key, unpack(actually_added))
+    end
+end
+
+return cjson.encode(actually_added)
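A hedged sketch of how such a script might be driven from Python via redis-py's `register_script`. The key suffixes (`:added`, `:queue`, `:data`) are hypothetical placeholders rather than the package's actual key schema, and `BF.MADD` requires the Bloom filter data type shipped with Redis 8+ / Redis Stack:

```python
import json

from redis.asyncio import Redis

from crawlee.storage_clients._redis._utils import read_lua_script


async def add_requests(
    redis: Redis, prefix: str, requests: dict[str, str], *, forefront: bool = False
) -> list[str]:
    """Hypothetical driver; `requests` maps unique_key -> serialized request."""
    script = redis.register_script(read_lua_script('atomic_bloom_add_requests.lua'))
    # KEYS: Bloom filter, queue list, request-data hash; ARGV: forefront flag,
    # JSON array of unique keys, JSON object of request payloads.
    added = await script(
        keys=[f'{prefix}:added', f'{prefix}:queue', f'{prefix}:data'],
        args=['1' if forefront else '0', json.dumps(list(requests)), json.dumps(requests)],
    )
    return json.loads(added)
```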
crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua

@@ -0,0 +1,49 @@
+local queue_key = KEYS[1]
+local in_progress_key = KEYS[2]
+local data_key = KEYS[3]
+local client_id = ARGV[1]
+local blocked_until_timestamp = ARGV[2]
+local batch_size = tonumber(ARGV[3])
+
+-- Pop a batch of unique keys from the queue
+local batch_result = redis.call('LMPOP', 1, queue_key, 'LEFT', 'COUNT', batch_size)
+if not batch_result then
+    return nil
+end
+local unique_keys = batch_result[2]
+
+-- Get requests data
+local requests_data = redis.call('HMGET', data_key, unpack(unique_keys))
+if not requests_data then
+    -- Data missing, skip this request
+    return nil
+end
+
+-- Prepare results and update in_progress
+local final_result = {}
+local in_progress_hmset = {}
+local pending_decrement = 0
+local in_progress_data = cjson.encode({
+    client_id = client_id,
+    blocked_until_timestamp = tonumber(blocked_until_timestamp)
+})
+for i = 1, #unique_keys do
+    local unique_key = unique_keys[i]
+    local request_data = requests_data[i]
+
+    if request_data then
+        -- Add to in_progress hash
+        table.insert(in_progress_hmset, unique_key)
+        table.insert(in_progress_hmset, in_progress_data)
+
+        table.insert(final_result, request_data)
+    end
+end
+
+-- Update in_progress hash
+if #in_progress_hmset > 0 then
+    redis.call('HMSET', in_progress_key, unpack(in_progress_hmset))
+end
+
+-- Return result with requests data
+return final_result
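The `blocked_until_timestamp` argument implements a lease: a fetched request stays in the in-progress hash, invisible to other clients, until the timestamp passes. A hypothetical caller (the lease duration and key names are assumptions, not the package's actual values):

```python
import json
import time

from redis.asyncio import Redis

from crawlee.storage_clients._redis._utils import read_lua_script


async def fetch_batch(redis: Redis, prefix: str, client_id: str, batch_size: int = 25) -> list[dict]:
    """Hypothetical sketch of driving the fetch script above."""
    script = redis.register_script(read_lua_script('atomic_fetch_request.lua'))
    # Lease the requests for 5 minutes; the reclaim script below returns them
    # to the queue if they are not marked handled in time.
    blocked_until = time.time() + 300
    raw = await script(
        keys=[f'{prefix}:queue', f'{prefix}:in_progress', f'{prefix}:data'],
        args=[client_id, str(blocked_until), str(batch_size)],
    )
    return [json.loads(item) for item in (raw or [])]
```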
crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua

@@ -0,0 +1,37 @@
+local added_filter_key = KEYS[1]
+local queue_key = KEYS[2]
+local data_key = KEYS[3]
+
+local forefront = ARGV[1] == '1'
+local unique_keys = cjson.decode(ARGV[2])
+local requests_data = cjson.decode(ARGV[3])
+
+-- Add and check which unique keys are actually new using Redis set
+local actually_added = {}
+local hset_args = {}
+
+-- Process each unique key
+for _, unique_key in ipairs(unique_keys) do
+    -- Try to add the key to the set, returns 1 if added, 0 if already existed
+    local set_result = redis.call('sadd', added_filter_key, unique_key)
+
+    if set_result == 1 then
+        -- This key was added by us (did not exist before)
+        table.insert(hset_args, unique_key)
+        table.insert(hset_args, requests_data[unique_key])
+        table.insert(actually_added, unique_key)
+    end
+end
+
+-- Add only those that are actually new
+if #actually_added > 0 then
+    redis.call('hset', data_key, unpack(hset_args))
+
+    if forefront then
+        redis.call('lpush', queue_key, unpack(actually_added))
+    else
+        redis.call('rpush', queue_key, unpack(actually_added))
+    end
+end
+
+return cjson.encode(actually_added)
crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua

@@ -0,0 +1,34 @@
+local in_progress_key = KEYS[1]
+local queue_key = KEYS[2]
+local data_key = KEYS[3]
+local current_time = tonumber(ARGV[1])
+
+local max_reclaim = 1000
+
+local cursor = "0"
+local count = 0
+
+repeat
+    local result = redis.call('hscan', in_progress_key, cursor, 'COUNT', 100)
+    cursor = result[1]
+    local entries = result[2]
+
+    for i = 1, #entries, 2 do
+        if count >= max_reclaim then
+            break
+        end
+
+        local unique_key = entries[i]
+        local data = cjson.decode(entries[i + 1])
+
+        -- Check if timed out
+        if current_time > data.blocked_until_timestamp then
+            -- Atomically remove from in_progress and add back to queue
+            redis.call('hdel', in_progress_key, unique_key)
+            redis.call('rpush', queue_key, unique_key)
+            count = count + 1
+        end
+    end
+until cursor == "0" or count >= max_reclaim
+
+return count
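Reclamation only makes sense when run periodically. A sketch of a recurring driver (the interval and key names are assumptions, not the package's actual scheduling):

```python
import asyncio
import time

from redis.asyncio import Redis

from crawlee.storage_clients._redis._utils import read_lua_script


async def reclaim_stale(redis: Redis, prefix: str, interval: float = 60.0) -> None:
    """Hypothetical periodic driver for the reclaim script above."""
    script = redis.register_script(read_lua_script('reclaim_stale_requests.lua'))
    while True:
        # Each pass is capped at 1000 reclaimed requests by the script itself.
        reclaimed = await script(
            keys=[f'{prefix}:in_progress', f'{prefix}:queue', f'{prefix}:data'],
            args=[str(time.time())],
        )
        if reclaimed:
            print(f'Returned {reclaimed} stale requests to the queue')
        await asyncio.sleep(interval)
```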
crawlee/storage_clients/_redis/py.typed (file without changes)
crawlee/storage_clients/_sql/_client_mixin.py

@@ -105,7 +105,7 @@ class SqlClientMixin(ABC):
         else:
             stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)
         result = await session.execute(stmt)
-        orm_metadata = result.scalar_one_or_none()
+        orm_metadata = result.scalar_one_or_none()
 
         if orm_metadata:
             client = cls(id=orm_metadata.id, storage_client=storage_client)
crawlee/storage_clients/_sql/_db_models.py

@@ -205,9 +205,8 @@ class RequestDb(Base):
             'idx_fetch_available',
             'request_queue_id',
             'is_handled',
-            'time_blocked_until',
             'sequence_number',
-            postgresql_where=text('is_handled
+            postgresql_where=text('is_handled is false'),
         ),
     )
 
crawlee/storage_clients/models.py

@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from datetime import datetime
-from typing import Annotated, Any, Generic
+from typing import TYPE_CHECKING, Annotated, Any, Generic
 
 from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
 from typing_extensions import TypeVar
@@ -127,8 +127,13 @@ class DatasetItemsListPage(BaseModel):
     desc: Annotated[bool, Field(default=False)]
     """Indicates if the returned list is in descending order."""
 
-    items: Annotated[list[dict], Field(default_factory=list)]
-    """The list of dataset items returned on this page."""
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        items: list[dict] = []
+        """The list of dataset items returned on this page."""
+    else:
+        items: Annotated[list[dict], Field(default_factory=list)]
+        """The list of dataset items returned on this page."""
 
 
 @docs_group('Storage data')
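The `TYPE_CHECKING` split works because type checkers only ever see the plain attribute with a default, while Pydantic still receives the `default_factory` at runtime. A self-contained demonstration of the same pattern:

```python
from typing import TYPE_CHECKING, Annotated

from pydantic import BaseModel, Field


class Page(BaseModel):
    if TYPE_CHECKING:
        # What type checkers see: a plain attribute with a default value.
        items: list[dict] = []
    else:
        # What actually runs: each instance gets a fresh list from the factory.
        items: Annotated[list[dict], Field(default_factory=list)]


a, b = Page(), Page()
a.items.append({'x': 1})
assert b.items == []  # defaults are not shared between instances
```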
crawlee/storages/_key_value_store.py

@@ -281,11 +281,14 @@ class KeyValueStore(Storage):
         if key in cache:
             return cache[key].current_value.root
 
+        async def kvs_factory() -> KeyValueStore:
+            return self
+
         cache[key] = recoverable_state = RecoverableState(
             default_state=AutosavedValue(default_value),
-            persistence_enabled=True,
-            persist_state_kvs_id=self.id,
             persist_state_key=key,
+            persistence_enabled=True,
+            persist_state_kvs_factory=kvs_factory,
             logger=logger,
         )
 
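This switches `RecoverableState` from re-resolving a key-value store by id to receiving a factory that returns the already-open instance. The public entry point that appears to sit on top of this code is `KeyValueStore.get_auto_saved_value`; a hedged usage sketch:

```python
from crawlee.storages import KeyValueStore


async def count_runs() -> None:
    kvs = await KeyValueStore.open()
    # The autosaved state now persists into this very store instance via the
    # factory above, instead of being looked up again by id.
    state = await kvs.get_auto_saved_value('run-state', {'runs': 0})
    state['runs'] += 1
```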
crawlee/storages/_storage_instance_manager.py

@@ -1,9 +1,11 @@
 from __future__ import annotations
 
+from asyncio import Lock
 from collections import defaultdict
 from collections.abc import Coroutine, Hashable
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, TypeVar
+from weakref import WeakValueDictionary
 
 from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient
@@ -76,6 +78,7 @@ class StorageInstanceManager:
 
     def __init__(self) -> None:
        self._cache: _StorageCache = _StorageCache()
+        self._opener_locks: WeakValueDictionary[tuple, Lock] = WeakValueDictionary()
 
     async def open_storage_instance(
         self,
@@ -119,63 +122,71 @@ class StorageInstanceManager:
         if not any([name, alias, id]):
             alias = self._DEFAULT_STORAGE_ALIAS
 
-        # Check cache
-        if
+        # Check cache without lock first for performance.
+        if cached_instance := self._get_from_cache(
+            cls,
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client_cache_key=storage_client_cache_key,
+        ):
+            return cached_instance
 
-            raise RuntimeError('Cached instance type mismatch.')
+        # Validate storage name
+        if name is not None:
+            validate_storage_name(name)
 
-        ):
+        # Acquire lock for this opener
+        opener_lock_key = (cls, str(id or name or alias), storage_client_cache_key)
+        if not (lock := self._opener_locks.get(opener_lock_key)):
+            lock = Lock()
+            self._opener_locks[opener_lock_key] = lock
+
+        async with lock:
+            # Another task could have created the storage while we were waiting for the lock - check if that
+            # happened
+            if cached_instance := self._get_from_cache(
+                cls,
+                id=id,
+                name=name,
+                alias=alias,
+                storage_client_cache_key=storage_client_cache_key,
+            ):
                 return cached_instance
-            raise RuntimeError('Cached instance type mismatch.')
 
+            # Check for conflicts between named and alias storages
+            self._check_name_alias_conflict(
+                cls,
+                name=name,
+                alias=alias,
+                storage_client_cache_key=storage_client_cache_key,
             )
 
-                f'Use a different name or drop the existing alias storage first.'
-            )
+            # Create new instance
+            client: KeyValueStoreClient | DatasetClient | RequestQueueClient
+            client = await client_opener_coro
 
-        if name is not None:
-            validate_storage_name(name)
-
-        # Create new instance
-        client: KeyValueStoreClient | DatasetClient | RequestQueueClient
-        client = await client_opener_coro
+            metadata = await client.get_metadata()
 
+            instance = cls(client, metadata.id, metadata.name)  # type: ignore[call-arg]
+            instance_name = getattr(instance, 'name', None)
 
-        self._cache.by_id[cls][instance.id][storage_client_cache_key] = instance
+            # Cache the instance.
+            # Note: No awaits in this section. All cache entries must be written
+            # atomically to ensure pre-checks outside the lock see consistent state.
+
+            # Always cache by id.
+            self._cache.by_id[cls][instance.id][storage_client_cache_key] = instance
 
+            # Cache named storage.
+            if instance_name is not None:
+                self._cache.by_name[cls][instance_name][storage_client_cache_key] = instance
 
+            # Cache unnamed storage.
+            if alias is not None:
+                self._cache.by_alias[cls][alias][storage_client_cache_key] = instance
 
+            return instance
 
         finally:
             # Make sure the client opener is closed.
@@ -193,3 +204,51 @@ class StorageInstanceManager:
     def clear_cache(self) -> None:
         """Clear all cached storage instances."""
         self._cache = _StorageCache()
+
+    def _get_from_cache(
+        self,
+        cls: type[T],
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        storage_client_cache_key: Hashable = '',
+    ) -> T | None:
+        """Get a storage instance from the cache."""
+        if id is not None and (cached_instance := self._cache.by_id[cls][id].get(storage_client_cache_key)):
+            if isinstance(cached_instance, cls):
+                return cached_instance
+            raise RuntimeError('Cached instance type mismatch.')
+
+        if name is not None and (cached_instance := self._cache.by_name[cls][name].get(storage_client_cache_key)):
+            if isinstance(cached_instance, cls):
+                return cached_instance
+            raise RuntimeError('Cached instance type mismatch.')
+
+        if alias is not None and (cached_instance := self._cache.by_alias[cls][alias].get(storage_client_cache_key)):
+            if isinstance(cached_instance, cls):
+                return cached_instance
+            raise RuntimeError('Cached instance type mismatch.')
+
+        return None
+
+    def _check_name_alias_conflict(
+        self,
+        cls: type[T],
+        *,
+        name: str | None = None,
+        alias: str | None = None,
+        storage_client_cache_key: Hashable = '',
+    ) -> None:
+        """Check for conflicts between named and alias storages."""
+        if alias and (self._cache.by_name[cls][alias].get(storage_client_cache_key)):
+            raise ValueError(
+                f'Cannot create alias storage "{alias}" because a named storage with the same name already exists. '
+                f'Use a different alias or drop the existing named storage first.'
+            )
+
+        if name and (self._cache.by_alias[cls][name].get(storage_client_cache_key)):
+            raise ValueError(
+                f'Cannot create named storage "{name}" because an alias storage with the same name already exists. '
+                f'Use a different name or drop the existing alias storage first.'
+            )
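Taken together, the refactor above is a per-key double-checked locking pattern: an unlocked cache probe for the fast path, a `WeakValueDictionary` of locks so idle locks can be garbage-collected, and a re-check after acquiring the lock. A distilled, self-contained sketch of the same idea:

```python
import asyncio
from weakref import WeakValueDictionary

_locks: WeakValueDictionary[str, asyncio.Lock] = WeakValueDictionary()
_cache: dict[str, object] = {}


async def get_or_create(key: str) -> object:
    # Fast path: check the cache without taking a lock.
    if key in _cache:
        return _cache[key]

    # One lock per key; the local reference keeps it alive while in use.
    if not (lock := _locks.get(key)):
        lock = asyncio.Lock()
        _locks[key] = lock

    async with lock:
        # Another task may have created the value while we waited.
        if key in _cache:
            return _cache[key]
        value = object()  # stand-in for the expensive open operation
        _cache[key] = value
        return value
```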
{crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlee
-Version: 1.0.3b6
+Version: 1.2.2b24
 Summary: Crawlee for Python
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://crawlee.dev/python/docs/changelog

@@ -223,15 +223,17 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Software Development :: Libraries
 Requires-Python: >=3.10
+Requires-Dist: async-timeout>=5.0.1
 Requires-Dist: cachetools>=5.5.0
 Requires-Dist: colorama>=0.4.0
-Requires-Dist: impit>=0.
+Requires-Dist: impit>=0.8.0
 Requires-Dist: more-itertools>=10.2.0
 Requires-Dist: protego>=0.5.0
 Requires-Dist: psutil>=6.0.0
-Requires-Dist: pydantic-settings
+Requires-Dist: pydantic-settings>=2.12.0
 Requires-Dist: pydantic>=2.11.0
 Requires-Dist: pyee>=9.0.0
 Requires-Dist: tldextract>=5.1.0

@@ -263,6 +265,7 @@ Requires-Dist: opentelemetry-sdk>=1.34.1; extra == 'all'
 Requires-Dist: opentelemetry-semantic-conventions>=0.54; extra == 'all'
 Requires-Dist: parsel>=1.10.0; extra == 'all'
 Requires-Dist: playwright>=1.27.0; extra == 'all'
+Requires-Dist: redis[hiredis]>=7.0.0; extra == 'all'
 Requires-Dist: rich>=13.9.0; extra == 'all'
 Requires-Dist: scikit-learn>=1.6.0; extra == 'all'
 Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'all'

@@ -296,6 +299,8 @@ Provides-Extra: playwright
 Requires-Dist: apify-fingerprint-datapoints>=0.0.2; extra == 'playwright'
 Requires-Dist: browserforge>=1.2.3; extra == 'playwright'
 Requires-Dist: playwright>=1.27.0; extra == 'playwright'
+Provides-Extra: redis
+Requires-Dist: redis[hiredis]>=7.0.0; extra == 'redis'
 Provides-Extra: sql-postgres
 Requires-Dist: asyncpg>=0.24.0; extra == 'sql-postgres'
 Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'sql-postgres'

@@ -319,19 +324,12 @@ Description-Content-Type: text/markdown
 <a href="https://trendshift.io/repositories/11169" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11169" alt="apify%2Fcrawlee-python | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 </p>
 
-<p align=center>
-</a>
-<a href="https://pypi.org/project/crawlee/" rel="nofollow">
-    <img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI - Python Version" style="max-width: 100%;">
-</a>
-<a href="https://discord.gg/jyEM2PRvMU" rel="nofollow">
-    <img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on discord" style="max-width: 100%;">
-</a>
+<p align="center">
+    <a href="https://badge.fury.io/py/crawlee" rel="nofollow"><img src="https://badge.fury.io/py/crawlee.svg" alt="PyPI package version"></a>
+    <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/dm/crawlee" alt="PyPI package downloads"></a>
+    <a href="https://codecov.io/gh/apify/crawlee-python"><img src="https://codecov.io/gh/apify/crawlee-python/graph/badge.svg?token=cCju61iPQG" alt="Codecov report"></a>
+    <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI Python version"></a>
+    <a href="https://discord.gg/jyEM2PRvMU" rel="nofollow"><img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on Discord"></a>
 </p>
 
 Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**
|