crawlee 0.6.13b31__py3-none-any.whl → 1.1.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlee might be problematic.

Files changed (82)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +34 -22
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +86 -33
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +15 -0
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/system.py +3 -3
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +21 -15
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +2 -0
  17. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
  18. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  19. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  21. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  22. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
  23. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  24. crawlee/crawlers/_basic/_basic_crawler.py +124 -37
  25. crawlee/crawlers/_playwright/_playwright_crawler.py +17 -5
  26. crawlee/events/_event_manager.py +3 -1
  27. crawlee/events/_types.py +6 -6
  28. crawlee/fingerprint_suite/_header_generator.py +2 -2
  29. crawlee/fingerprint_suite/_types.py +2 -2
  30. crawlee/otel/crawler_instrumentor.py +3 -3
  31. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  32. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  33. crawlee/request_loaders/_request_list.py +1 -1
  34. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  35. crawlee/sessions/_models.py +2 -2
  36. crawlee/sessions/_session_pool.py +1 -1
  37. crawlee/statistics/_error_snapshotter.py +1 -1
  38. crawlee/statistics/_models.py +33 -2
  39. crawlee/statistics/_statistics.py +24 -33
  40. crawlee/storage_clients/__init__.py +16 -0
  41. crawlee/storage_clients/_base/_storage_client.py +13 -0
  42. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +29 -25
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +53 -34
  45. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  46. crawlee/storage_clients/_file_system/_utils.py +0 -0
  47. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  48. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  49. crawlee/storage_clients/_memory/_request_queue_client.py +16 -4
  50. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  51. crawlee/storage_clients/_redis/__init__.py +6 -0
  52. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  53. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  54. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  55. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  56. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  57. crawlee/storage_clients/_redis/_utils.py +23 -0
  58. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  59. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  60. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  61. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  62. crawlee/storage_clients/_redis/py.typed +0 -0
  63. crawlee/storage_clients/_sql/__init__.py +6 -0
  64. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  65. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  66. crawlee/storage_clients/_sql/_db_models.py +268 -0
  67. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  68. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  69. crawlee/storage_clients/_sql/_storage_client.py +291 -0
  70. crawlee/storage_clients/_sql/py.typed +0 -0
  71. crawlee/storage_clients/models.py +10 -10
  72. crawlee/storages/_base.py +5 -1
  73. crawlee/storages/_dataset.py +12 -2
  74. crawlee/storages/_key_value_store.py +17 -4
  75. crawlee/storages/_request_queue.py +10 -2
  76. crawlee/storages/_storage_instance_manager.py +133 -71
  77. crawlee/storages/_utils.py +11 -0
  78. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +17 -6
  79. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +82 -59
  80. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
  81. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
  82. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
crawlee/sessions/_models.py

@@ -20,7 +20,7 @@ from ._session import Session
 class SessionModel(BaseModel):
     """Model for a Session object."""
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     id: Annotated[str, Field(alias='id')]
     max_age: Annotated[timedelta, Field(alias='maxAge')]
@@ -38,7 +38,7 @@ class SessionModel(BaseModel):
 class SessionPoolModel(BaseModel):
     """Model for a SessionPool object."""
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     max_pool_size: Annotated[int, Field(alias='maxPoolSize')]
 
crawlee/sessions/_session_pool.py

@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.
 
-        This is intened only for the cases when you want to add a session that was created outside of the pool.
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.
 
         Args:
crawlee/statistics/_error_snapshotter.py

@@ -32,7 +32,7 @@ class ErrorSnapshotter:
     """Capture error snapshot and save it to key value store.
 
     It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-    it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler`
+    it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
     returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
     an exception.
 
crawlee/statistics/_models.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Annotated, Any
@@ -57,7 +58,7 @@ class FinalStatistics:
 class StatisticsState(BaseModel):
     """Statistic data about a crawler run."""
 
-    model_config = ConfigDict(populate_by_name=True, ser_json_inf_nan='constants')
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')
     stats_id: Annotated[int | None, Field(alias='statsId')] = None
 
     requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0
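The `populate_by_name` to `validate_by_name`/`validate_by_alias` switch above (also applied to the session models earlier in this diff) uses the newer Pydantic config flags. A minimal sketch, assuming Pydantic 2.11 or later; the `ExampleState` model is illustrative, not part of the package:

    from typing import Annotated
    from pydantic import BaseModel, ConfigDict, Field

    class ExampleState(BaseModel):
        # Accept both the Python field name and the camelCase alias on input.
        model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
        requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0

    # Both spellings validate; serializing by alias still requires model_dump(by_alias=True).
    assert ExampleState(requests_finished=5).requests_finished == 5
    assert ExampleState(requestsFinished=5).requests_finished == 5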
@@ -76,7 +77,6 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
     errors: dict[str, Any] = Field(default_factory=dict)
     retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
     requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
@@ -93,6 +93,37 @@ class StatisticsState(BaseModel):
         ),
     ] = {}
 
+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
     @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
     @property
     def request_total_duration(self) -> timedelta:
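A short worked example (hypothetical values, not from the diff) of how the new runtime bookkeeping behaves: runtime persisted by earlier runs becomes the offset, and the live runtime is that offset plus the time elapsed in the current run, exactly as the `crawler_runtime` property computes it:

    from datetime import datetime, timedelta, timezone

    runtime_offset = timedelta(minutes=5)        # restored from a previous run in model_post_init
    last_started_at = datetime.now(timezone.utc)  # set when the new run starts
    finished_at = last_started_at + timedelta(minutes=2)  # pretend the run ends two minutes later

    # Mirrors the property: offset + (finished_at or now) - last_started_at.
    crawler_runtime = runtime_offset + finished_at - last_started_at
    assert crawler_runtime == timedelta(minutes=7)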
crawlee/statistics/_statistics.py

@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations
 
+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
 from crawlee.statistics._error_tracker import ErrorTracker
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, Coroutine
     from types import TracebackType
 
+    from crawlee.storages import KeyValueStore
+
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
 logger = getLogger(__name__)
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool | Literal['explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -80,8 +85,6 @@
         self._id = Statistics.__next_id
         Statistics.__next_id += 1
 
-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -92,9 +95,10 @@
 
         self._state = RecoverableState(
             default_state=state_model(stats_id=self._id),
-            persist_state_key=persist_state_key or f'SDK_CRAWLER_STATISTICS_{self._id}',
+            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             logger=logger,
         )
 
@@ -110,8 +114,8 @@
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
             persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
-            persist_state_kvs_name=self._state._persist_state_kvs_name,  # noqa: SLF001
             persist_state_key=self._state._persist_state_key,  # noqa: SLF001
+            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
             log_message=self._log_message,
             periodic_message_logger=self._periodic_message_logger,
             state_model=state_model,
@@ -125,6 +129,7 @@
         persistence_enabled: bool = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -136,6 +141,7 @@
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_key=persist_state_key,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             log_message=log_message,
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,
@@ -158,14 +164,17 @@
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')
 
-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-        self._after_initialize()
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None
 
+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True
 
+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self
 
     async def __aexit__(
@@ -182,13 +191,14 @@
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
-        self._state.current_value.crawler_finished_at = datetime.now(timezone.utc)
-
-        await self._state.teardown()
+        if not self.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
 
+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
-
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
         self._active = False
+        await self._state.teardown()
 
     @property
     def state(self) -> TStatisticsState:
@@ -247,11 +257,7 @@
 
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self._instance_start is None:
-            raise RuntimeError('The Statistics object is not initialized')
-
-        crawler_runtime = datetime.now(timezone.utc) - self._instance_start
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)
 
@@ -262,7 +268,7 @@
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],
@@ -282,21 +288,6 @@
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())
 
-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
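A minimal sketch of how the new persist_state_kvs_factory hook might be supplied, assuming crawlee's public KeyValueStore.open and Statistics.with_default_state APIs; the store name is illustrative and the factory is simply passed through to the RecoverableState shown above:

    from crawlee.statistics import Statistics
    from crawlee.storages import KeyValueStore

    async def open_statistics_kvs() -> KeyValueStore:
        # The factory coroutine returns the store in which statistics are persisted.
        return await KeyValueStore.open(name='my-statistics-store')

    statistics = Statistics.with_default_state(
        persistence_enabled=True,
        persist_state_kvs_factory=open_statistics_kvs,
    )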
crawlee/storage_clients/__init__.py

@@ -1,9 +1,25 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+# These imports have only mandatory dependencies, so they are imported directly.
 from ._base import StorageClient
 from ._file_system import FileSystemStorageClient
 from ._memory import MemoryStorageClient
 
+_install_import_hook(__name__)
+
+# The following imports are wrapped in try_import to handle optional dependencies,
+# ensuring the module can still function even if these dependencies are missing.
+with _try_import(__name__, 'SqlStorageClient'):
+    from ._sql import SqlStorageClient
+
+with _try_import(__name__, 'RedisStorageClient'):
+    from ._redis import RedisStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'RedisStorageClient',
+    'SqlStorageClient',
     'StorageClient',
 ]
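The try_import wrappers make SqlStorageClient and RedisStorageClient importable only when their optional dependencies are installed. A hedged usage sketch; the extra name and the connection_string argument are assumptions, not taken from this diff:

    # e.g. pip install 'crawlee[sql]'  (extra name assumed)
    from crawlee import service_locator
    from crawlee.storage_clients import SqlStorageClient

    # Register the client globally so crawlers and storages resolve it by default.
    service_locator.set_storage_client(
        SqlStorageClient(connection_string='sqlite+aiosqlite:///crawlee.db')  # argument assumed
    )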
crawlee/storage_clients/_base/_storage_client.py

@@ -6,6 +6,8 @@ from typing import TYPE_CHECKING
 from crawlee._utils.docs import docs_group
 
 if TYPE_CHECKING:
+    from collections.abc import Hashable
+
     from crawlee.configuration import Configuration
 
     from ._dataset_client import DatasetClient
@@ -28,12 +30,21 @@ class StorageClient(ABC):
     (where applicable), and consistent access patterns across all storage types it supports.
     """
 
+    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:  # noqa: ARG002
+        """Return a cache key that can differentiate between different storages of this and other clients.
+
+        Can be based on configuration or on the client itself. By default, returns a module and name of the client
+        class.
+        """
+        return f'{self.__class__.__module__}.{self.__class__.__name__}'
+
     @abstractmethod
     async def create_dataset_client(
         self,
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> DatasetClient:
         """Create a dataset client."""
@@ -44,6 +55,7 @@
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> KeyValueStoreClient:
         """Create a key-value store client."""
@@ -54,6 +66,7 @@
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> RequestQueueClient:
         """Create a request queue client."""
crawlee/storage_clients/_file_system/_dataset_client.py

@@ -9,11 +9,12 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 from pydantic import ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
 
@@ -56,7 +57,7 @@ class FileSystemDatasetClient(DatasetClient):
         self,
         *,
         metadata: DatasetMetadata,
-        storage_dir: Path,
+        path_to_dataset: Path,
         lock: asyncio.Lock,
     ) -> None:
         """Initialize a new instance.
@@ -65,8 +66,8 @@ class FileSystemDatasetClient(DatasetClient):
         """
         self._metadata = metadata
 
-        self._storage_dir = storage_dir
-        """The base directory where the storage data are being persisted."""
+        self._path_to_dataset = path_to_dataset
+        """The full path to the dataset directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -78,10 +79,7 @@ class FileSystemDatasetClient(DatasetClient):
     @property
     def path_to_dataset(self) -> Path:
         """The full path to the dataset directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_dataset
 
     @property
     def path_to_metadata(self) -> Path:
@@ -94,8 +92,9 @@ class FileSystemDatasetClient(DatasetClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemDatasetClient:
+    ) -> Self:
         """Open or create a file system dataset client.
 
         This method attempts to open an existing dataset from the file system. If a dataset with the specified ID
@@ -104,17 +103,21 @@ class FileSystemDatasetClient(DatasetClient):
 
         Args:
             id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
-            name: The name of the dataset to open. If not provided, uses the default dataset.
+            name: The name of the dataset for named (global scope) storages.
+            alias: The alias of the dataset for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a dataset with the specified ID is not found, or if metadata is invalid.
+            ValueError: If a dataset with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-        storage_dir = Path(configuration.storage_dir)
-        dataset_base_path = storage_dir / cls._STORAGE_SUBDIR
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not dataset_base_path.exists():
            await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True)
@@ -126,19 +129,19 @@ class FileSystemDatasetClient(DatasetClient):
                 if not dataset_dir.is_dir():
                     continue
 
-                metadata_path = dataset_dir / METADATA_FILENAME
-                if not metadata_path.exists():
+                path_to_metadata = dataset_dir / METADATA_FILENAME
+                if not path_to_metadata.exists():
                     continue
 
                 try:
-                    file = await asyncio.to_thread(metadata_path.open)
+                    file = await asyncio.to_thread(path_to_metadata.open)
                     try:
                         file_content = json.load(file)
                         metadata = DatasetMetadata(**file_content)
                         if metadata.id == id:
                             client = cls(
                                 metadata=metadata,
-                                storage_dir=storage_dir,
+                                path_to_dataset=dataset_base_path / dataset_dir,
                                 lock=asyncio.Lock(),
                             )
                             await client._update_metadata(update_accessed_at=True)
@@ -152,16 +155,15 @@ class FileSystemDatasetClient(DatasetClient):
             if not found:
                 raise ValueError(f'Dataset with ID "{id}" not found')
 
-        # Get a new instance by name.
+        # Get a new instance by name or alias.
         else:
-            dataset_path = (
-                dataset_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else dataset_base_path / name
-            )
-            metadata_path = dataset_path / METADATA_FILENAME
+            dataset_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_dataset = dataset_base_path / dataset_dir
+            path_to_metadata = path_to_dataset / METADATA_FILENAME
 
             # If the dataset directory exists, reconstruct the client from the metadata file.
-            if dataset_path.exists() and metadata_path.exists():
-                file = await asyncio.to_thread(open, metadata_path)
+            if path_to_dataset.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(open, path_to_metadata)
                 try:
                     file_content = json.load(file)
                 finally:
@@ -169,11 +171,11 @@ class FileSystemDatasetClient(DatasetClient):
                 try:
                     metadata = DatasetMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for dataset "{name}"') from exc
+                    raise ValueError(f'Invalid metadata file for dataset "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-                    storage_dir=storage_dir,
+                    path_to_dataset=path_to_dataset,
                     lock=asyncio.Lock(),
                 )
 
@@ -192,7 +194,7 @@ class FileSystemDatasetClient(DatasetClient):
             )
             client = cls(
                 metadata=metadata,
-                storage_dir=storage_dir,
+                path_to_dataset=path_to_dataset,
                 lock=asyncio.Lock(),
             )
             await client._update_metadata()
crawlee/storage_clients/_file_system/_key_value_store_client.py

@@ -10,11 +10,12 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 from pydantic import ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import KeyValueStoreClient
 from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
 
@@ -55,7 +56,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         self,
         *,
         metadata: KeyValueStoreMetadata,
-        storage_dir: Path,
+        path_to_kvs: Path,
         lock: asyncio.Lock,
     ) -> None:
         """Initialize a new instance.
@@ -64,8 +65,8 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         """
         self._metadata = metadata
 
-        self._storage_dir = storage_dir
-        """The base directory where the storage data are being persisted."""
+        self._path_to_kvs = path_to_kvs
+        """The full path to the key-value store directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -77,10 +78,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
     @property
     def path_to_kvs(self) -> Path:
         """The full path to the key-value store directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_kvs
 
     @property
     def path_to_metadata(self) -> Path:
@@ -93,8 +91,9 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemKeyValueStoreClient:
+    ) -> Self:
         """Open or create a file system key-value store client.
 
         This method attempts to open an existing key-value store from the file system. If a KVS with the specified
@@ -103,17 +102,21 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         Args:
             id: The ID of the key-value store to open. If provided, searches for existing store by ID.
-            name: The name of the key-value store to open. If not provided, uses the default store.
+            name: The name of the key-value store for named (global scope) storages.
+            alias: The alias of the key-value store for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a store with the specified ID is not found, or if metadata is invalid.
+            ValueError: If a store with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-        storage_dir = Path(configuration.storage_dir)
-        kvs_base_path = storage_dir / cls._STORAGE_SUBDIR
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not kvs_base_path.exists():
             await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True)
@@ -125,19 +128,19 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 if not kvs_dir.is_dir():
                     continue
 
-                metadata_path = kvs_dir / METADATA_FILENAME
-                if not metadata_path.exists():
+                path_to_metadata = kvs_dir / METADATA_FILENAME
+                if not path_to_metadata.exists():
                     continue
 
                 try:
-                    file = await asyncio.to_thread(metadata_path.open)
+                    file = await asyncio.to_thread(path_to_metadata.open)
                     try:
                         file_content = json.load(file)
                         metadata = KeyValueStoreMetadata(**file_content)
                         if metadata.id == id:
                             client = cls(
                                 metadata=metadata,
-                                storage_dir=storage_dir,
+                                path_to_kvs=kvs_base_path / kvs_dir,
                                 lock=asyncio.Lock(),
                             )
                             await client._update_metadata(update_accessed_at=True)
@@ -151,14 +154,15 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
             if not found:
                 raise ValueError(f'Key-value store with ID "{id}" not found.')
 
-        # Get a new instance by name.
+        # Get a new instance by name or alias.
         else:
-            kvs_path = kvs_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else kvs_base_path / name
-            metadata_path = kvs_path / METADATA_FILENAME
+            kvs_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_kvs = kvs_base_path / kvs_dir
+            path_to_metadata = path_to_kvs / METADATA_FILENAME
 
             # If the key-value store directory exists, reconstruct the client from the metadata file.
-            if kvs_path.exists() and metadata_path.exists():
-                file = await asyncio.to_thread(open, metadata_path)
+            if path_to_kvs.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(open, path_to_metadata)
                 try:
                     file_content = json.load(file)
                 finally:
@@ -166,11 +170,11 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 try:
                     metadata = KeyValueStoreMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for key-value store "{name}"') from exc
+                    raise ValueError(f'Invalid metadata file for key-value store "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-                    storage_dir=storage_dir,
+                    path_to_kvs=path_to_kvs,
                     lock=asyncio.Lock(),
                 )
 
@@ -188,7 +192,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
             )
             client = cls(
                 metadata=metadata,
-                storage_dir=storage_dir,
+                path_to_kvs=path_to_kvs,
                 lock=asyncio.Lock(),
            )
             await client._update_metadata()
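The raise_if_too_many_kwargs helper imported by both file-system clients above lives in the new crawlee/_utils/raise_if_too_many_kwargs.py (12 lines, not shown in this diff). A plausible sketch, reconstructed only from the call sites raise_if_too_many_kwargs(id=id, name=name, alias=alias); the actual signature and message may differ:

    def raise_if_too_many_kwargs(**kwargs: object) -> None:
        """Raise if more than one of the given keyword arguments is set (not None)."""
        provided = [key for key, value in kwargs.items() if value is not None]
        if len(provided) > 1:
            raise ValueError(f'Only one of {", ".join(kwargs)} can be provided, got: {", ".join(provided)}.')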