crawlee 1.0.3b6__py3-none-any.whl → 1.0.5b18__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- crawlee/_service_locator.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +33 -13
- crawlee/crawlers/_basic/_basic_crawler.py +23 -12
- crawlee/crawlers/_playwright/_playwright_crawler.py +11 -4
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/request_loaders/_sitemap_request_loader.py +5 -0
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_statistics.py +41 -31
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_file_system/_request_queue_client.py +24 -6
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/_sql/_storage_client.py +9 -0
- crawlee/storages/_key_value_store.py +5 -2
- {crawlee-1.0.3b6.dist-info → crawlee-1.0.5b18.dist-info}/METADATA +9 -5
- {crawlee-1.0.3b6.dist-info → crawlee-1.0.5b18.dist-info}/RECORD +43 -31
- {crawlee-1.0.3b6.dist-info → crawlee-1.0.5b18.dist-info}/WHEEL +0 -0
- {crawlee-1.0.3b6.dist-info → crawlee-1.0.5b18.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.3b6.dist-info → crawlee-1.0.5b18.dist-info}/licenses/LICENSE +0 -0
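
The headline change in this version range is a new Redis-backed storage module (`crawlee/storage_clients/_redis/`) with dataset, key-value store, and request-queue clients plus the Lua scripts they depend on; two of those files are shown in full below. As a hedged wiring sketch: the class name `RedisStorageClient` and its constructor argument are assumptions inferred from `_storage_client.py` and the updated `crawlee/storage_clients/__init__.py`, not confirmed by this diff, while the `redis.asyncio.Redis` connection mirrors what the clients below accept.

```python
import asyncio

from redis.asyncio import Redis

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.storage_clients import RedisStorageClient  # assumed export, see _redis/__init__.py


async def main() -> None:
    # Assumption: the storage client wraps a redis.asyncio.Redis instance,
    # as the dataset and key-value store clients below do.
    storage_client = RedisStorageClient(redis=Redis.from_url('redis://localhost:6379'))

    crawler = HttpCrawler(storage_client=storage_client)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        # Pushed items would land in the Redis-backed dataset.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```
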
crawlee/storage_clients/_redis/_dataset_client.py
@@ -0,0 +1,325 @@
+from __future__ import annotations
+
+from logging import getLogger
+from typing import TYPE_CHECKING, Any, cast
+
+from typing_extensions import NotRequired, override
+
+from crawlee.storage_clients._base import DatasetClient
+from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
+
+from ._client_mixin import MetadataUpdateParams, RedisClientMixin
+from ._utils import await_redis_response
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from redis.asyncio import Redis
+    from redis.asyncio.client import Pipeline
+
+logger = getLogger(__name__)
+
+
+class _DatasetMetadataUpdateParams(MetadataUpdateParams):
+    """Parameters for updating dataset metadata."""
+
+    new_item_count: NotRequired[int]
+    delta_item_count: NotRequired[int]
+
+
+class RedisDatasetClient(DatasetClient, RedisClientMixin):
+    """Redis implementation of the dataset client.
+
+    This client persists dataset items to Redis using JSON arrays for efficient storage and retrieval.
+    Items are stored as JSON objects with automatic ordering preservation through Redis list operations.
+
+    The dataset data is stored in Redis using the following key pattern:
+    - `datasets:{name}:items` - Redis JSON array containing all dataset items.
+    - `datasets:{name}:metadata` - Redis JSON object containing dataset metadata.
+
+    Items must be JSON-serializable dictionaries. Single items or lists of items can be pushed to the dataset.
+    The item ordering is preserved through Redis JSON array operations. All operations provide atomic consistency
+    through Redis transactions and pipeline operations.
+    """
+
+    _DEFAULT_NAME = 'default'
+    """Default Dataset name key prefix when none provided."""
+
+    _MAIN_KEY = 'datasets'
+    """Main Redis key prefix for Dataset."""
+
+    _CLIENT_TYPE = 'Dataset'
+    """Human-readable client type for error messages."""
+
+    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:
+        """Initialize a new instance.
+
+        Preferably use the `RedisDatasetClient.open` class method to create a new instance.
+
+        Args:
+            storage_name: Internal storage name used for Redis keys.
+            storage_id: Unique identifier for the dataset.
+            redis: Redis client instance.
+        """
+        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)
+
+    @property
+    def _items_key(self) -> str:
+        """Return the Redis key for the items of this dataset."""
+        return f'{self._MAIN_KEY}:{self._storage_name}:items'
+
+    @classmethod
+    async def open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        redis: Redis,
+    ) -> RedisDatasetClient:
+        """Open or create a new Redis dataset client.
+
+        This method attempts to open an existing dataset from the Redis database. If a dataset with the specified
+        ID or name exists, it loads the metadata from the database. If no existing store is found, a new one
+        is created.
+
+        Args:
+            id: The ID of the dataset. If not provided, a random ID will be generated.
+            name: The name of the dataset for named (global scope) storages.
+            alias: The alias of the dataset for unnamed (run scope) storages.
+            redis: Redis client instance.
+
+        Returns:
+            An instance for the opened or created storage client.
+        """
+        return await cls._open(
+            id=id,
+            name=name,
+            alias=alias,
+            redis=redis,
+            metadata_model=DatasetMetadata,
+            extra_metadata_fields={'item_count': 0},
+            instance_kwargs={},
+        )
+
+    @override
+    async def get_metadata(self) -> DatasetMetadata:
+        return await self._get_metadata(DatasetMetadata)
+
+    @override
+    async def drop(self) -> None:
+        await self._drop(extra_keys=[self._items_key])
+
+    @override
+    async def purge(self) -> None:
+        await self._purge(
+            extra_keys=[self._items_key],
+            metadata_kwargs=_DatasetMetadataUpdateParams(
+                new_item_count=0, update_accessed_at=True, update_modified_at=True
+            ),
+        )
+
+    @override
+    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
+        if isinstance(data, dict):
+            data = [data]
+
+        async with self._get_pipeline() as pipe:
+            pipe.json().arrappend(self._items_key, '$', *data)
+            await self._update_metadata(
+                pipe,
+                **_DatasetMetadataUpdateParams(
+                    update_accessed_at=True, update_modified_at=True, delta_item_count=len(data)
+                ),
+            )
+
+    @override
+    async def get_data(
+        self,
+        *,
+        offset: int = 0,
+        limit: int | None = 999_999_999_999,
+        clean: bool = False,
+        desc: bool = False,
+        fields: list[str] | None = None,
+        omit: list[str] | None = None,
+        unwind: list[str] | None = None,
+        skip_empty: bool = False,
+        skip_hidden: bool = False,
+        flatten: list[str] | None = None,
+        view: str | None = None,
+    ) -> DatasetItemsListPage:
+        # Check for unsupported arguments and log a warning if found
+        unsupported_args: dict[str, Any] = {
+            'clean': clean,
+            'fields': fields,
+            'omit': omit,
+            'unwind': unwind,
+            'skip_hidden': skip_hidden,
+            'flatten': flatten,
+            'view': view,
+        }
+        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}
+
+        if unsupported:
+            logger.warning(
+                f'The arguments {list(unsupported.keys())} of get_data are not supported '
+                f'by the {self.__class__.__name__} client.'
+            )
+
+        metadata = await self.get_metadata()
+
+        total = metadata.item_count
+        json_path = '$'
+
+        # Apply sorting and pagination
+        match (desc, offset, limit):
+            case (True, 0, int()):
+                json_path += f'[-{limit}:]'
+            case (True, int(), None):
+                json_path += f'[:-{offset}]'
+            case (True, int(), int()):
+                json_path += f'[-{offset + limit}:-{offset}]'
+            case (False, 0, int()):
+                json_path += f'[:{limit}]'
+            case (False, int(), None):
+                json_path += f'[{offset}:]'
+            case (False, int(), int()):
+                json_path += f'[{offset}:{offset + limit}]'
+
+        if json_path == '$':
+            json_path = '$[*]'
+
+        data = await await_redis_response(self._redis.json().get(self._items_key, json_path))
+
+        if data is None:
+            data = []
+
+        if skip_empty:
+            data = [item for item in data if item]
+
+        if desc:
+            data = list(reversed(data))
+
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))
+
+        return DatasetItemsListPage(
+            count=len(data),
+            offset=offset,
+            limit=limit or (total - offset),
+            total=total,
+            desc=desc,
+            items=data,
+        )
+
+    @override
+    async def iterate_items(
+        self,
+        *,
+        offset: int = 0,
+        limit: int | None = None,
+        clean: bool = False,
+        desc: bool = False,
+        fields: list[str] | None = None,
+        omit: list[str] | None = None,
+        unwind: list[str] | None = None,
+        skip_empty: bool = False,
+        skip_hidden: bool = False,
+    ) -> AsyncIterator[dict[str, Any]]:
+        """Iterate over dataset items one by one.
+
+        This method yields items individually instead of loading all items at once,
+        which is more memory efficient for large datasets.
+        """
+        # Log warnings for unsupported arguments
+        unsupported_args: dict[str, Any] = {
+            'clean': clean,
+            'fields': fields,
+            'omit': omit,
+            'unwind': unwind,
+            'skip_hidden': skip_hidden,
+        }
+        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}
+
+        if unsupported:
+            logger.warning(
+                f'The arguments {list(unsupported.keys())} of iterate_items are not supported '
+                f'by the {self.__class__.__name__} client.'
+            )
+
+        metadata = await self.get_metadata()
+        total_items = metadata.item_count
+
+        # Calculate actual range based on parameters
+        start_idx = offset
+        end_idx = min(total_items, offset + limit) if limit is not None else total_items
+
+        # Update accessed_at timestamp
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))
+
+        # Process items in batches for better network efficiency
+        batch_size = 100
+
+        for batch_start in range(start_idx, end_idx, batch_size):
+            batch_end = min(batch_start + batch_size, end_idx)
+
+            # Build JsonPath for batch slice
+            if desc:
+                # For descending order, we need to reverse the slice calculation
+                desc_batch_start = total_items - batch_end
+                desc_batch_end = total_items - batch_start
+                json_path = f'$[{desc_batch_start}:{desc_batch_end}]'
+            else:
+                json_path = f'$[{batch_start}:{batch_end}]'
+
+            # Get batch of items
+            batch_items = await await_redis_response(self._redis.json().get(self._items_key, json_path))
+
+            # Handle case where batch_items might be None or not a list
+            if batch_items is None:
+                continue
+
+            # Reverse batch if desc order (since we got items in normal order but need desc)
+            items_iter = reversed(batch_items) if desc else iter(batch_items)
+
+            # Yield items from batch
+            for item in items_iter:
+                # Apply skip_empty filter
+                if skip_empty and not item:
+                    continue
+
+                yield cast('dict[str, Any]', item)
+
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(pipe, **_DatasetMetadataUpdateParams(update_accessed_at=True))
+
+    @override
+    async def _create_storage(self, pipeline: Pipeline) -> None:
+        """Create the main dataset keys in Redis."""
+        # Create an empty JSON array for items
+        await await_redis_response(pipeline.json().set(self._items_key, '$', []))
+
+    @override
+    async def _specific_update_metadata(
+        self,
+        pipeline: Pipeline,
+        *,
+        new_item_count: int | None = None,
+        delta_item_count: int | None = None,
+        **_kwargs: Any,
+    ) -> None:
+        """Update the dataset metadata in the database.
+
+        Args:
+            pipeline: The Redis pipeline to use for the update.
+            new_item_count: If provided, update the item count to this value.
+            delta_item_count: If provided, increment the item count by this value.
+        """
+        if new_item_count is not None:
+            await await_redis_response(
+                pipeline.json().set(self.metadata_key, '$.item_count', new_item_count, nx=False, xx=True)
+            )
+        elif delta_item_count is not None:
+            await await_redis_response(pipeline.json().numincrby(self.metadata_key, '$.item_count', delta_item_count))
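
The docstring above spells out the dataset key layout (`datasets:{name}:items` as a RedisJSON array plus a `datasets:{name}:metadata` JSON object), and `get_data`/`iterate_items` page through it with JSONPath slices. As a quick way to verify that layout from outside the crawler, here is a hedged sketch using plain redis-py against a Redis Stack instance; the `default` storage name mirrors `_DEFAULT_NAME` above and should be replaced with the actual storage name in use.

```python
import asyncio

from redis.asyncio import Redis


async def main() -> None:
    # Requires the RedisJSON module (Redis Stack); the key name assumes the default dataset.
    redis = Redis.from_url('redis://localhost:6379')
    items_key = 'datasets:default:items'

    # First page of 10 items - the same '$[offset:offset+limit]' slice that get_data() builds.
    first_page = await redis.json().get(items_key, '$[0:10]')

    # Last 10 items - matching the '$[-limit:]' branch used when desc=True and offset=0.
    last_page = await redis.json().get(items_key, '$[-10:]')

    print(first_page, last_page)


asyncio.run(main())
```
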
crawlee/storage_clients/_redis/_key_value_store_client.py
@@ -0,0 +1,264 @@
+from __future__ import annotations
+
+import json
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+from typing_extensions import override
+
+from crawlee._utils.file import infer_mime_type
+from crawlee.storage_clients._base import KeyValueStoreClient
+from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
+
+from ._client_mixin import MetadataUpdateParams, RedisClientMixin
+from ._utils import await_redis_response
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from redis.asyncio import Redis
+
+logger = getLogger(__name__)
+
+
+class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):
+    """Redis implementation of the key-value store client.
+
+    This client persists key-value data to Redis using hash data structures for efficient storage and retrieval.
+    Keys are mapped to values with automatic content type detection and size tracking for metadata management.
+
+    The key-value store data is stored in Redis using the following key pattern:
+    - `key_value_stores:{name}:items` - Redis hash containing key-value pairs (values stored as binary data).
+    - `key_value_stores:{name}:metadata_items` - Redis hash containing metadata for each key.
+    - `key_value_stores:{name}:metadata` - Redis JSON object containing store metadata.
+
+    Values are serialized based on their type: JSON objects are stored as UTF-8 encoded JSON strings,
+    text values as UTF-8 encoded strings, and binary data as-is. The implementation automatically handles
+    content type detection and maintains metadata about each record including size and MIME type information.
+
+    All operations are atomic through Redis hash operations and pipeline transactions. The client supports
+    concurrent access through Redis's built-in atomic operations for hash fields.
+    """
+
+    _DEFAULT_NAME = 'default'
+    """Default Key-Value Store name key prefix when none provided."""
+
+    _MAIN_KEY = 'key_value_stores'
+    """Main Redis key prefix for Key-Value Store."""
+
+    _CLIENT_TYPE = 'Key-value store'
+    """Human-readable client type for error messages."""
+
+    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:
+        """Initialize a new instance.
+
+        Preferably use the `RedisKeyValueStoreClient.open` class method to create a new instance.
+        """
+        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)
+
+    @property
+    def _items_key(self) -> str:
+        """Return the Redis key for the items of KVS."""
+        return f'{self._MAIN_KEY}:{self._storage_name}:items'
+
+    @property
+    def _metadata_items_key(self) -> str:
+        """Return the Redis key for the items metadata of KVS."""
+        return f'{self._MAIN_KEY}:{self._storage_name}:metadata_items'
+
+    @classmethod
+    async def open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        redis: Redis,
+    ) -> RedisKeyValueStoreClient:
+        """Open or create a new Redis key-value store client.
+
+        This method attempts to open an existing key-value store from the Redis database. If a store with the specified
+        ID or name exists, it loads the metadata from the database. If no existing store is found, a new one
+        is created.
+
+        Args:
+            id: The ID of the key-value store. If not provided, a random ID will be generated.
+            name: The name of the key-value store for named (global scope) storages.
+            alias: The alias of the key-value store for unnamed (run scope) storages.
+            redis: Redis client instance.
+
+        Returns:
+            An instance for the opened or created storage client.
+        """
+        return await cls._open(
+            id=id,
+            name=name,
+            alias=alias,
+            redis=redis,
+            metadata_model=KeyValueStoreMetadata,
+            extra_metadata_fields={},
+            instance_kwargs={},
+        )
+
+    @override
+    async def get_metadata(self) -> KeyValueStoreMetadata:
+        return await self._get_metadata(KeyValueStoreMetadata)
+
+    @override
+    async def drop(self) -> None:
+        await self._drop(extra_keys=[self._items_key, self._metadata_items_key])
+
+    @override
+    async def purge(self) -> None:
+        await self._purge(
+            extra_keys=[self._items_key, self._metadata_items_key],
+            metadata_kwargs=MetadataUpdateParams(update_accessed_at=True, update_modified_at=True),
+        )
+
+    @override
+    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
+        # Special handling for None values
+        if value is None:
+            content_type = 'application/x-none'  # Special content type to identify None values
+            value_bytes = b''
+        else:
+            content_type = content_type or infer_mime_type(value)
+
+            # Serialize the value to bytes.
+            if 'application/json' in content_type:
+                value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8')
+            elif isinstance(value, str):
+                value_bytes = value.encode('utf-8')
+            elif isinstance(value, (bytes, bytearray)):
+                value_bytes = value
+            else:
+                # Fallback: attempt to convert to string and encode.
+                value_bytes = str(value).encode('utf-8')
+
+        size = len(value_bytes)
+        item_metadata = KeyValueStoreRecordMetadata(
+            key=key,
+            content_type=content_type,
+            size=size,
+        )
+
+        async with self._get_pipeline() as pipe:
+            # redis-py typing issue
+            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  # type: ignore[arg-type]
+
+            await await_redis_response(
+                pipe.hset(
+                    self._metadata_items_key,
+                    key,
+                    item_metadata.model_dump_json(),
+                )
+            )
+            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))
+
+    @override
+    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
+        serialized_metadata_item = await await_redis_response(self._redis.hget(self._metadata_items_key, key))
+
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True))
+
+        if not isinstance(serialized_metadata_item, (str, bytes, bytearray)):
+            logger.warning(f'Metadata for key "{key}" is missing or invalid.')
+            return None
+
+        metadata_item = KeyValueStoreRecordMetadata.model_validate_json(serialized_metadata_item)
+
+        # Handle None values
+        if metadata_item.content_type == 'application/x-none':
+            return KeyValueStoreRecord(value=None, **metadata_item.model_dump())
+
+        # Query the record by key
+        # redis-py typing issue
+        value_bytes: bytes | None = await await_redis_response(
+            self._redis.hget(self._items_key, key)  # type: ignore[arg-type]
+        )
+
+        if value_bytes is None:
+            logger.warning(f'Value for key "{key}" is missing.')
+            return None
+
+        # Handle JSON values
+        if 'application/json' in metadata_item.content_type:
+            try:
+                value = json.loads(value_bytes.decode('utf-8'))
+            except (json.JSONDecodeError, UnicodeDecodeError):
+                logger.warning(f'Failed to decode JSON value for key "{key}"')
+                return None
+        # Handle text values
+        elif metadata_item.content_type.startswith('text/'):
+            try:
+                value = value_bytes.decode('utf-8')
+            except UnicodeDecodeError:
+                logger.warning(f'Failed to decode text value for key "{key}"')
+                return None
+        # Handle binary values
+        else:
+            value = value_bytes
+
+        return KeyValueStoreRecord(value=value, **metadata_item.model_dump())
+
+    @override
+    async def delete_value(self, *, key: str) -> None:
+        async with self._get_pipeline() as pipe:
+            await await_redis_response(pipe.hdel(self._items_key, key))
+            await await_redis_response(pipe.hdel(self._metadata_items_key, key))
+            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))
+
+    @override
+    async def iterate_keys(
+        self,
+        *,
+        exclusive_start_key: str | None = None,
+        limit: int | None = None,
+    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
+        items_data = await await_redis_response(self._redis.hgetall(self._metadata_items_key))
+
+        if not items_data:
+            return  # No items to iterate over
+
+        if not isinstance(items_data, dict):
+            raise TypeError('The items data was received in an incorrect format.')
+
+        # Get all keys, sorted alphabetically
+        keys = sorted(items_data.keys())
+
+        # Apply exclusive_start_key filter if provided
+        if exclusive_start_key is not None:
+            bytes_exclusive_start_key = exclusive_start_key.encode()
+            keys = [k for k in keys if k > bytes_exclusive_start_key]
+
+        # Apply limit if provided
+        if limit is not None:
+            keys = keys[:limit]
+
+        # Yield metadata for each key
+        for key in keys:
+            record = items_data[key]
+            yield KeyValueStoreRecordMetadata.model_validate_json(record)
+
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(
+                pipe,
+                **MetadataUpdateParams(update_accessed_at=True),
+            )
+
+    @override
+    async def get_public_url(self, *, key: str) -> str:
+        raise NotImplementedError('Public URLs are not supported for memory key-value stores.')
+
+    @override
+    async def record_exists(self, *, key: str) -> bool:
+        async with self._get_pipeline(with_execute=False) as pipe:
+            await await_redis_response(pipe.hexists(self._items_key, key))
+            await self._update_metadata(
+                pipe,
+                **MetadataUpdateParams(update_accessed_at=True),
+            )
+        results = await pipe.execute()
+
+        return bool(results[0])
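
For the key-value store, the same kind of out-of-band check can be done against the two hashes documented above (`key_value_stores:{name}:items` for the raw bytes, `key_value_stores:{name}:metadata_items` for per-key metadata). The hedged sketch below mirrors the decoding rules in `get_value`; the `default` store name and the `my-key` key are placeholders.

```python
import asyncio
import json

from redis.asyncio import Redis


async def main() -> None:
    # The 'default' name mirrors _DEFAULT_NAME above; substitute the actual storage name.
    redis = Redis.from_url('redis://localhost:6379')
    base = 'key_value_stores:default'
    key = 'my-key'

    raw_meta = await redis.hget(f'{base}:metadata_items', key)
    raw_value = await redis.hget(f'{base}:items', key)

    if raw_meta is None or raw_value is None:
        print('record not found')
        return

    meta = json.loads(raw_meta)

    # Decode the way get_value() does: JSON and text/* values are UTF-8, everything else stays binary.
    if 'application/x-none' == meta['content_type']:
        value = None
    elif 'application/json' in meta['content_type']:
        value = json.loads(raw_value.decode('utf-8'))
    elif meta['content_type'].startswith('text/'):
        value = raw_value.decode('utf-8')
    else:
        value = raw_value

    print(meta, value)


asyncio.run(main())
```
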