PyPI - crawlee - Versions diffs - 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl - Mend

crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82) hide show

crawlee/__init__.py +2 -1
crawlee/_browserforge_workaround.py +7 -3
crawlee/_request.py +32 -13
crawlee/_service_locator.py +4 -4
crawlee/_types.py +44 -5
crawlee/_utils/context.py +3 -3
crawlee/_utils/file.py +8 -1
crawlee/_utils/globs.py +4 -4
crawlee/_utils/recoverable_state.py +32 -8
crawlee/_utils/recurring_task.py +27 -3
crawlee/_utils/robots.py +17 -5
crawlee/_utils/sitemap.py +13 -6
crawlee/_utils/system.py +27 -11
crawlee/_utils/time.py +41 -1
crawlee/_utils/urls.py +9 -2
crawlee/browsers/_browser_pool.py +5 -2
crawlee/browsers/_playwright_browser.py +2 -1
crawlee/browsers/_playwright_browser_controller.py +1 -1
crawlee/browsers/_playwright_browser_plugin.py +17 -3
crawlee/browsers/_types.py +1 -1
crawlee/configuration.py +3 -1
crawlee/crawlers/__init__.py +5 -1
crawlee/crawlers/_abstract_http/__init__.py +2 -1
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
crawlee/crawlers/_basic/_basic_crawler.py +156 -131
crawlee/crawlers/_basic/_context_utils.py +24 -0
crawlee/crawlers/_basic/_logging_utils.py +23 -4
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
crawlee/crawlers/_playwright/_types.py +12 -2
crawlee/errors.py +4 -0
crawlee/events/_event_manager.py +12 -6
crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
crawlee/fingerprint_suite/_header_generator.py +2 -2
crawlee/http_clients/_base.py +4 -0
crawlee/http_clients/_curl_impersonate.py +68 -14
crawlee/http_clients/_httpx.py +16 -6
crawlee/http_clients/_impit.py +25 -10
crawlee/otel/crawler_instrumentor.py +4 -6
crawlee/request_loaders/_sitemap_request_loader.py +23 -5
crawlee/router.py +13 -3
crawlee/sessions/_cookies.py +13 -8
crawlee/sessions/_models.py +3 -3
crawlee/sessions/_session_pool.py +1 -1
crawlee/statistics/_error_snapshotter.py +1 -1
crawlee/statistics/_models.py +51 -9
crawlee/statistics/_statistics.py +24 -33
crawlee/storage_clients/__init__.py +4 -0
crawlee/storage_clients/_base/_dataset_client.py +2 -2
crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
crawlee/storage_clients/_redis/__init__.py +6 -0
crawlee/storage_clients/_redis/_client_mixin.py +292 -0
crawlee/storage_clients/_redis/_dataset_client.py +329 -0
crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
crawlee/storage_clients/_redis/_storage_client.py +149 -0
crawlee/storage_clients/_redis/_utils.py +23 -0
crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
crawlee/storage_clients/_redis/py.typed +0 -0
crawlee/storage_clients/_sql/_client_mixin.py +1 -1
crawlee/storage_clients/_sql/_db_models.py +1 -2
crawlee/storage_clients/models.py +8 -3
crawlee/storages/_key_value_store.py +5 -2
crawlee/storages/_storage_instance_manager.py +103 -44
{crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
{crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
{crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
{crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
{crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0

crawlee/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from importlib import metadata
-from ._request import Request, RequestOptions
+from ._request import Request, RequestOptions, RequestState
 from ._service_locator import service_locator
 from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
 from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
     'HttpHeaders',
     'Request',
     'RequestOptions',
+    'RequestState',
     'RequestTransformAction',
     'SkippedReason',
     'service_locator',

crawlee/_browserforge_workaround.py CHANGED Viewed

@@ -1,4 +1,8 @@
 # ruff: noqa: N802, PLC0415
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from collections.abc import Callable
 def patch_browserforge() -> None:
@@ -12,7 +16,7 @@ def patch_browserforge() -> None:
     import apify_fingerprint_datapoints
     from browserforge import download
-    download.DATA_DIRS: dict[str, Path] = {  # type:ignore[misc]
+    download.DATA_DIRS = {
         'headers': apify_fingerprint_datapoints.get_header_network().parent,
         'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent,
     }
@@ -20,7 +24,7 @@ def patch_browserforge() -> None:
     def DownloadIfNotExists(**flags: bool) -> None:
         pass
-    download.DownloadIfNotExists = DownloadIfNotExists
+    download.DownloadIfNotExists: Callable[..., None] = DownloadIfNotExists
     import browserforge.bayesian_network
@@ -33,7 +37,7 @@ def patch_browserforge() -> None:
                 path = download.DATA_DIRS['fingerprints'] / download.DATA_FILES['fingerprints'][path.name]
             super().__init__(path)
-    browserforge.bayesian_network.BayesianNetwork = BayesianNetwork  # type:ignore[misc]
+    browserforge.bayesian_network.BayesianNetwork: BayesianNetwork = BayesianNetwork
     import browserforge.headers.generator
     browserforge.headers.generator.DATA_DIR = download.DATA_DIRS['headers']

crawlee/_request.py CHANGED Viewed

@@ -34,14 +34,14 @@ class RequestState(IntEnum):
 class CrawleeRequestData(BaseModel):
     """Crawlee-specific configuration stored in the `user_data`."""
-    max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+    max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
     """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
     `BasicCrawler`."""
     enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
     """The strategy that was used for enqueuing the request."""
-    state: RequestState | None = None
+    state: RequestState = RequestState.UNPROCESSED
     """Describes the request's current lifecycle state."""
     session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -93,7 +93,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
     def __delitem__(self, key: str) -> None:
         del self.__pydantic_extra__[key]
-    def __iter__(self) -> Iterator[str]:  # type: ignore[override]
+    def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]
         yield from self.__pydantic_extra__
     def __len__(self) -> int:
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
     always_enqueue: NotRequired[bool]
     user_data: NotRequired[dict[str, JsonSerializable]]
     no_retry: NotRequired[bool]
+    enqueue_strategy: NotRequired[EnqueueStrategy]
+    max_retries: NotRequired[int | None]
 @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):
     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
-    unique_key: Annotated[str, Field(alias='uniqueKey')]
+    unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
     to the same URL.
@@ -178,21 +180,22 @@ class Request(BaseModel):
     and specify which URLs shall be considered equal.
     """
-    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+    url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
     """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
     and fragments."""
-    method: HttpMethod = 'GET'
+    method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
     """HTTP request method."""
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
         PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+        Field(frozen=True),
     ] = None
     """HTTP request payload."""
-    # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
     if TYPE_CHECKING:
         headers: HttpHeaders = HttpHeaders()
         """HTTP request headers."""
@@ -250,6 +253,8 @@ class Request(BaseModel):
         keep_url_fragment: bool = False,
         use_extended_unique_key: bool = False,
         always_enqueue: bool = False,
+        enqueue_strategy: EnqueueStrategy | None = None,
+        max_retries: int | None = None,
         **kwargs: Any,
     ) -> Self:
         """Create a new `Request` instance from a URL.
@@ -277,6 +282,9 @@ class Request(BaseModel):
                 `unique_key` computation. This is only relevant when `unique_key` is not provided.
             always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                 Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+            enqueue_strategy: The strategy that will be used for enqueuing the request.
+            max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+                option of `BasicCrawler`.
             **kwargs: Additional request properties.
         """
         if unique_key is not None and always_enqueue:
@@ -299,7 +307,21 @@ class Request(BaseModel):
         )
         if always_enqueue:
-            unique_key = f'{unique_key}_{crypto_random_object_id()}'
+            unique_key = f'{crypto_random_object_id()}|{unique_key}'
+        user_data_dict = kwargs.pop('user_data', {}) or {}
+        crawlee_data_dict = user_data_dict.get('__crawlee', {})
+        if max_retries is not None:
+            crawlee_data_dict['maxRetries'] = max_retries
+        if enqueue_strategy is not None:
+            crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+        crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+        if crawlee_data:
+            user_data_dict['__crawlee'] = crawlee_data
         request = cls(
             url=url,
@@ -307,6 +329,7 @@ class Request(BaseModel):
             method=method,
             headers=headers,
             payload=payload,
+            user_data=user_data_dict,
             **kwargs,
         )
@@ -352,7 +375,7 @@ class Request(BaseModel):
         self.crawlee_data.crawl_depth = new_value
     @property
-    def state(self) -> RequestState | None:
+    def state(self) -> RequestState:
         """Crawlee-specific request handling state."""
         return self.crawlee_data.state
@@ -365,10 +388,6 @@ class Request(BaseModel):
         """Crawlee-specific limit on the number of retries of the request."""
         return self.crawlee_data.max_retries
-    @max_retries.setter
-    def max_retries(self, new_max_retries: int) -> None:
-        self.crawlee_data.max_retries = new_max_retries
     @property
     def session_rotation_count(self) -> int | None:
         """Crawlee-specific number of finished session rotations for the request."""

crawlee/_service_locator.py CHANGED Viewed

@@ -38,7 +38,7 @@ class ServiceLocator:
     def get_configuration(self) -> Configuration:
         """Get the configuration."""
         if self._configuration is None:
-            logger.warning('No configuration set, implicitly creating and using default Configuration.')
+            logger.debug('No configuration set, implicitly creating and using default Configuration.')
             self._configuration = Configuration()
         return self._configuration
@@ -63,9 +63,9 @@ class ServiceLocator:
     def get_event_manager(self) -> EventManager:
         """Get the event manager."""
         if self._event_manager is None:
-            logger.warning('No event manager set, implicitly creating and using default LocalEventManager.')
+            logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
             if self._configuration is None:
-                logger.warning(
+                logger.debug(
                     'Implicit creation of event manager will implicitly set configuration as side effect. '
                     'It is advised to explicitly first set the configuration instead.'
                 )
@@ -93,7 +93,7 @@ class ServiceLocator:
     def get_storage_client(self) -> StorageClient:
         """Get the storage client."""
         if self._storage_client is None:
-            logger.warning('No storage client set, implicitly creating and using default FileSystemStorageClient.')
+            logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
             if self._configuration is None:
                 logger.warning(
                     'Implicit creation of storage client will implicitly set configuration as side effect. '

crawlee/_types.py CHANGED Viewed

@@ -2,6 +2,7 @@ from __future__ import annotations
 import dataclasses
 from collections.abc import Callable, Iterator, Mapping
+from copy import deepcopy
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
@@ -15,7 +16,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence
-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -61,14 +62,14 @@ class HttpHeaders(RootModel, Mapping[str, str]):
     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
-    # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
     if TYPE_CHECKING:
         root: dict[str, str] = {}
     else:
         root: Annotated[
             dict[str, str],
             PlainValidator(lambda value: _normalize_headers(value)),
-            Field(default_factory=dict),
+            Field(default_factory=lambda: dict[str, str]()),
         ]
     def __getitem__(self, key: str) -> str:
@@ -90,7 +91,7 @@ class HttpHeaders(RootModel, Mapping[str, str]):
         combined_headers = {**other, **self.root}
         return HttpHeaders(combined_headers)
-    def __iter__(self) -> Iterator[str]:  # type: ignore[override]
+    def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]
         yield from self.root
     def __len__(self) -> int:
@@ -260,12 +261,24 @@ class KeyValueStoreChangeRecords:
 class RequestHandlerRunResult:
     """Record of calls to storage-related context helpers."""
-    def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None:
+    def __init__(
+        self,
+        *,
+        key_value_store_getter: GetKeyValueStoreFunction,
+        request: Request,
+    ) -> None:
         self._key_value_store_getter = key_value_store_getter
         self.add_requests_calls = list[AddRequestsKwargs]()
         self.push_data_calls = list[PushDataFunctionCall]()
         self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()
+        # Isolated copies for handler execution
+        self._request = deepcopy(request)
+    @property
+    def request(self) -> Request:
+        return self._request
     async def add_requests(
         self,
         requests: Sequence[str | Request],
@@ -315,6 +328,14 @@ class RequestHandlerRunResult:
         return self.key_value_store_changes[id, name, alias]
+    def apply_request_changes(self, target: Request) -> None:
+        """Apply tracked changes from handler copy to original request."""
+        if self.request.user_data != target.user_data:
+            target.user_data = self.request.user_data
+        if self.request.headers != target.headers:
+            target.headers = self.request.headers
 @docs_group('Functions')
 class AddRequestsFunction(Protocol):
@@ -643,6 +664,24 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        modifications = dict[str, Any]()
+        if push_data is not None:
+            modifications['push_data'] = push_data
+        if add_requests is not None:
+            modifications['add_requests'] = add_requests
+        if get_key_value_store is not None:
+            modifications['get_key_value_store'] = get_key_value_store
+        return dataclasses.replace(self, **modifications)
 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""

crawlee/_utils/context.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from __future__ import annotations
-import asyncio
+import inspect
 from collections.abc import Callable
 from functools import wraps
-from typing import Any, TypeVar
+from typing import Any, TypeVar, cast
 T = TypeVar('T', bound=Callable[..., Any])
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:
         return await method(self, *args, **kwargs)
-    return async_wrapper if asyncio.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
+    return cast('T', async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper)

crawlee/_utils/file.py CHANGED Viewed

@@ -163,7 +163,14 @@ async def export_csv_to_stream(
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
-    writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+    writer = csv.writer(dst, **kwargs)
     write_header = True
     # Iterate over the dataset and write to CSV.

crawlee/_utils/globs.py CHANGED Viewed

@@ -33,12 +33,12 @@ def _translate(
     HACK: This function is copied from CPython stdlib source. It will be released in Python 3.13 as `glob.translate`
     """
-    if not seps:
-        seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep
+    _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps
-    escaped_seps = ''.join(map(re.escape, seps))
-    any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
+    escaped_seps = ''.join(map(re.escape, _seps))
+    any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps
     not_sep = f'[^{escaped_seps}]'
     if include_hidden:
         one_last_segment = f'{not_sep}+'
         one_segment = f'{one_last_segment}{any_sep}'

crawlee/_utils/recoverable_state.py CHANGED Viewed

@@ -4,12 +4,14 @@ from typing import TYPE_CHECKING, Generic, Literal, TypeVar
 from pydantic import BaseModel
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.events._types import Event, EventPersistStateData
 if TYPE_CHECKING:
     import logging
+    from collections.abc import Callable, Coroutine
-    from crawlee.storages._key_value_store import KeyValueStore
+    from crawlee.storages import KeyValueStore
 TStateModel = TypeVar('TStateModel', bound=BaseModel)
@@ -37,6 +39,7 @@ class RecoverableState(Generic[TStateModel]):
         persistence_enabled: Literal[True, False, 'explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_kvs_id: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         logger: logging.Logger,
     ) -> None:
         """Initialize a new recoverable state object.
@@ -51,16 +54,40 @@ class RecoverableState(Generic[TStateModel]):
                 If neither a name nor and id are supplied, the default store will be used.
             persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.
                 If neither a name nor and id are supplied, the default store will be used.
+            persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If
+                not provided, a system-wide KeyValueStore will be used, based on service locator configuration.
             logger: A logger instance for logging operations related to state persistence
         """
+        raise_if_too_many_kwargs(
+            persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_id=persist_state_kvs_id,
+            persist_state_kvs_factory=persist_state_kvs_factory,
+        )
+        if not persist_state_kvs_factory:
+            logger.debug(
+                'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '
+                'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '
+                'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid '
+                'global side effects.'
+            )
         self._default_state = default_state
         self._state_type: type[TStateModel] = self._default_state.__class__
         self._state: TStateModel | None = None
         self._persistence_enabled = persistence_enabled
         self._persist_state_key = persist_state_key
-        self._persist_state_kvs_name = persist_state_kvs_name
-        self._persist_state_kvs_id = persist_state_kvs_id
-        self._key_value_store: 'KeyValueStore | None' = None  # noqa: UP037
+        if persist_state_kvs_factory is None:
+            async def kvs_factory() -> KeyValueStore:
+                from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+                return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)
+            self._persist_state_kvs_factory = kvs_factory
+        else:
+            self._persist_state_kvs_factory = persist_state_kvs_factory
+        self._key_value_store: KeyValueStore | None = None
         self._log = logger
     async def initialize(self) -> TStateModel:
@@ -77,11 +104,8 @@ class RecoverableState(Generic[TStateModel]):
             return self.current_value
         # Import here to avoid circular imports.
-        from crawlee.storages._key_value_store import KeyValueStore  # noqa: PLC0415
-        self._key_value_store = await KeyValueStore.open(
-            name=self._persist_state_kvs_name, id=self._persist_state_kvs_id
-        )
+        self._key_value_store = await self._persist_state_kvs_factory()
         await self._load_saved_state()

crawlee/_utils/recurring_task.py CHANGED Viewed

@@ -1,12 +1,16 @@
 from __future__ import annotations
 import asyncio
+import inspect
 from logging import getLogger
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from collections.abc import Callable
     from datetime import timedelta
+    from types import TracebackType
+    from typing_extensions import Self
 logger = getLogger(__name__)
@@ -21,11 +25,27 @@ class RecurringTask:
     """
     def __init__(self, func: Callable, delay: timedelta) -> None:
-        logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...')
+        logger.debug(
+            'Calling RecurringTask.__init__(func={%s}, delay={%s})...',
+            func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
+            delay,
+        )
         self.func = func
         self.delay = delay
         self.task: asyncio.Task | None = None
+    async def __aenter__(self) -> Self:
+        self.start()
+        return self
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        await self.stop()
     async def _wrapper(self) -> None:
         """Continuously execute the provided function with the specified delay.
@@ -34,12 +54,16 @@ class RecurringTask:
         """
         sleep_time_secs = self.delay.total_seconds()
         while True:
-            await self.func() if asyncio.iscoroutinefunction(self.func) else self.func()
+            await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
             await asyncio.sleep(sleep_time_secs)
     def start(self) -> None:
         """Start the recurring task execution."""
-        self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}')
+        name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
+        self.task = asyncio.create_task(
+            self._wrapper(),
+            name=f'Task-recurring-{name}',
+        )
     async def stop(self) -> None:
         """Stop the recurring task execution."""

crawlee/_utils/robots.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+from logging import getLogger
 from typing import TYPE_CHECKING
 from protego import Protego
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
     from crawlee.proxy_configuration import ProxyInfo
+logger = getLogger(__name__)
 class RobotsTxtFile:
     def __init__(
         self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ class RobotsTxtFile:
             http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
-        response = await http_client.send_request(url, proxy_info=proxy_info)
-        body = (
-            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
-        )
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)
+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))
+        except Exception as e:
+            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
-        robots = Protego.parse(body.decode('utf-8'))
+            robots = Protego.parse('User-agent: *\nAllow: /')
         return cls(url, robots, http_client=http_client, proxy_info=proxy_info)

crawlee/_utils/sitemap.py CHANGED Viewed

@@ -335,7 +335,7 @@ async def _fetch_and_process_sitemap(
                         # Check if the first chunk is a valid gzip header
                         if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
                             decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
-                            first_chunk = False
+                        first_chunk = False
                         chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
                         text_chunk = decoder.decode(chunk)
@@ -430,10 +430,17 @@ async def parse_sitemap(
     up to the specified maximum depth.
     """
     # Set default options
-    options = options or {}
-    emit_nested_sitemaps = options.get('emit_nested_sitemaps', False)
-    max_depth = options.get('max_depth', float('inf'))
-    sitemap_retries = options.get('sitemap_retries', 3)
+    default_timeout = timedelta(seconds=30)
+    if options:
+        emit_nested_sitemaps = options['emit_nested_sitemaps']
+        max_depth = options['max_depth']
+        sitemap_retries = options['sitemap_retries']
+        timeout = options.get('timeout', default_timeout)
+    else:
+        emit_nested_sitemaps = False
+        max_depth = float('inf')
+        sitemap_retries = 3
+        timeout = default_timeout
     # Setup working state
     sources = list(initial_sources)
@@ -472,7 +479,7 @@ async def parse_sitemap(
                 sitemap_retries,
                 emit_nested_sitemaps=emit_nested_sitemaps,
                 proxy_info=proxy_info,
-                timeout=options.get('timeout', timedelta(seconds=30)),
+                timeout=timeout,
             ):
                 yield result
         else:

crawlee/_utils/system.py CHANGED Viewed

@@ -5,7 +5,7 @@ import sys
 from contextlib import suppress
 from datetime import datetime, timezone
 from logging import getLogger
-from typing import Annotated
+from typing import TYPE_CHECKING, Annotated
 import psutil
 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
@@ -41,11 +41,19 @@ class CpuInfo(BaseModel):
     used_ratio: Annotated[float, Field(alias='usedRatio')]
     """The ratio of CPU currently in use, represented as a float between 0 and 1."""
-    created_at: datetime = Field(
-        alias='createdAt',
-        default_factory=lambda: datetime.now(timezone.utc),
-    )
-    """The time at which the measurement was taken."""
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        created_at: datetime = datetime.now(timezone.utc)
+        """The time at which the measurement was taken."""
+    else:
+        created_at: Annotated[
+            datetime,
+            Field(
+                alias='createdAt',
+                default_factory=lambda: datetime.now(timezone.utc),
+            ),
+        ]
+        """The time at which the measurement was taken."""
 class MemoryUsageInfo(BaseModel):
@@ -61,11 +69,19 @@ class MemoryUsageInfo(BaseModel):
     ]
     """Memory usage of the current Python process and its children."""
-    created_at: datetime = Field(
-        alias='createdAt',
-        default_factory=lambda: datetime.now(timezone.utc),
-    )
-    """The time at which the measurement was taken."""
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        created_at: datetime = datetime.now(timezone.utc)
+        """The time at which the measurement was taken."""
+    else:
+        created_at: Annotated[
+            datetime,
+            Field(
+                alias='createdAt',
+                default_factory=lambda: datetime.now(timezone.utc),
+            ),
+        ]
+        """The time at which the measurement was taken."""
 class MemoryInfo(MemoryUsageInfo):

crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl