PyPI - crawlee - Versions diffs - 0.6.13b43__py3-none-any.whl → 1.1.1b1__py3-none-any.whl - Mend

crawlee 0.6.13b43__py3-none-any.whl → 1.1.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crawlee might be problematic. Click here for more details.

Files changed (69) hide show

crawlee/_request.py +32 -21
crawlee/_service_locator.py +4 -4
crawlee/_types.py +67 -24
crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
crawlee/_utils/recoverable_state.py +32 -8
crawlee/_utils/recurring_task.py +15 -0
crawlee/_utils/robots.py +17 -5
crawlee/_utils/sitemap.py +1 -1
crawlee/_utils/urls.py +9 -2
crawlee/browsers/_browser_pool.py +4 -1
crawlee/browsers/_playwright_browser_controller.py +21 -15
crawlee/browsers/_playwright_browser_plugin.py +17 -3
crawlee/browsers/_types.py +1 -1
crawlee/configuration.py +3 -1
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
crawlee/crawlers/_basic/_basic_crawler.py +51 -14
crawlee/crawlers/_playwright/_playwright_crawler.py +16 -4
crawlee/events/_event_manager.py +3 -1
crawlee/fingerprint_suite/_header_generator.py +2 -2
crawlee/otel/crawler_instrumentor.py +3 -3
crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
crawlee/request_loaders/_sitemap_request_loader.py +22 -4
crawlee/sessions/_session_pool.py +1 -1
crawlee/statistics/_error_snapshotter.py +1 -1
crawlee/statistics/_models.py +32 -1
crawlee/statistics/_statistics.py +24 -33
crawlee/storage_clients/__init__.py +16 -0
crawlee/storage_clients/_base/_storage_client.py +5 -4
crawlee/storage_clients/_file_system/_dataset_client.py +4 -5
crawlee/storage_clients/_file_system/_key_value_store_client.py +4 -5
crawlee/storage_clients/_file_system/_request_queue_client.py +28 -12
crawlee/storage_clients/_file_system/_storage_client.py +2 -2
crawlee/storage_clients/_memory/_dataset_client.py +4 -5
crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
crawlee/storage_clients/_redis/__init__.py +6 -0
crawlee/storage_clients/_redis/_client_mixin.py +295 -0
crawlee/storage_clients/_redis/_dataset_client.py +325 -0
crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
crawlee/storage_clients/_redis/_storage_client.py +146 -0
crawlee/storage_clients/_redis/_utils.py +23 -0
crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
crawlee/storage_clients/_redis/py.typed +0 -0
crawlee/storage_clients/_sql/__init__.py +6 -0
crawlee/storage_clients/_sql/_client_mixin.py +385 -0
crawlee/storage_clients/_sql/_dataset_client.py +310 -0
crawlee/storage_clients/_sql/_db_models.py +268 -0
crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
crawlee/storage_clients/_sql/_storage_client.py +291 -0
crawlee/storage_clients/_sql/py.typed +0 -0
crawlee/storage_clients/models.py +10 -10
crawlee/storages/_base.py +3 -1
crawlee/storages/_dataset.py +5 -3
crawlee/storages/_key_value_store.py +11 -6
crawlee/storages/_request_queue.py +5 -3
crawlee/storages/_storage_instance_manager.py +54 -68
crawlee/storages/_utils.py +11 -0
{crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +16 -5
{crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +69 -47
{crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
{crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
{crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0

crawlee/_request.py CHANGED Viewed

@@ -185,9 +185,6 @@ class Request(BaseModel):
     method: HttpMethod = 'GET'
     """HTTP request method."""
-    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
-    """HTTP request headers."""
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
@@ -195,23 +192,37 @@ class Request(BaseModel):
     ] = None
     """HTTP request payload."""
-    user_data: Annotated[
-        dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
-        Field(alias='userData', default_factory=lambda: UserData()),
-        PlainValidator(user_data_adapter.validate_python),
-        PlainSerializer(
-            lambda instance: user_data_adapter.dump_python(
-                instance,
-                by_alias=True,
-                exclude_none=True,
-                exclude_unset=True,
-                exclude_defaults=True,
-            )
-        ),
-    ] = {}
-    """Custom user data assigned to the request. Use this to save any request related data to the
-    request's scope, keeping them accessible on retries, failures etc.
-    """
+    # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+    if TYPE_CHECKING:
+        headers: HttpHeaders = HttpHeaders()
+        """HTTP request headers."""
+        user_data: dict[str, JsonSerializable] = {}
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """
+    else:
+        headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
+        """HTTP request headers."""
+        user_data: Annotated[
+            dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
+            Field(alias='userData', default_factory=lambda: UserData()),
+            PlainValidator(user_data_adapter.validate_python),
+            PlainSerializer(
+                lambda instance: user_data_adapter.dump_python(
+                    instance,
+                    by_alias=True,
+                    exclude_none=True,
+                    exclude_unset=True,
+                    exclude_defaults=True,
+                )
+            ),
+        ]
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """
     retry_count: Annotated[int, Field(alias='retryCount')] = 0
     """Number of times the request has been retried."""
@@ -288,7 +299,7 @@ class Request(BaseModel):
         )
         if always_enqueue:
-            unique_key = f'{unique_key}_{crypto_random_object_id()}'
+            unique_key = f'{crypto_random_object_id()}|{unique_key}'
         request = cls(
             url=url,

crawlee/_service_locator.py CHANGED Viewed

@@ -38,7 +38,7 @@ class ServiceLocator:
     def get_configuration(self) -> Configuration:
         """Get the configuration."""
         if self._configuration is None:
-            logger.warning('No configuration set, implicitly creating and using default Configuration.')
+            logger.debug('No configuration set, implicitly creating and using default Configuration.')
             self._configuration = Configuration()
         return self._configuration
@@ -63,9 +63,9 @@ class ServiceLocator:
     def get_event_manager(self) -> EventManager:
         """Get the event manager."""
         if self._event_manager is None:
-            logger.warning('No event manager set, implicitly creating and using default LocalEventManager.')
+            logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
             if self._configuration is None:
-                logger.warning(
+                logger.debug(
                     'Implicit creation of event manager will implicitly set configuration as side effect. '
                     'It is advised to explicitly first set the configuration instead.'
                 )
@@ -93,7 +93,7 @@ class ServiceLocator:
     def get_storage_client(self) -> StorageClient:
         """Get the storage client."""
         if self._storage_client is None:
-            logger.warning('No storage client set, implicitly creating and using default FileSystemStorageClient.')
+            logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
             if self._configuration is None:
                 logger.warning(
                     'Implicit creation of storage client will implicitly set configuration as side effect. '

crawlee/_types.py CHANGED Viewed

@@ -3,17 +3,7 @@ from __future__ import annotations
 import dataclasses
 from collections.abc import Callable, Iterator, Mapping
 from dataclasses import dataclass
-from typing import (
-    TYPE_CHECKING,
-    Annotated,
-    Any,
-    Literal,
-    Protocol,
-    TypedDict,
-    TypeVar,
-    cast,
-    overload,
-)
+from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
 from pydantic import ConfigDict, Field, PlainValidator, RootModel
@@ -71,11 +61,15 @@ class HttpHeaders(RootModel, Mapping[str, str]):
     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
-    root: Annotated[
-        dict[str, str],
-        PlainValidator(lambda value: _normalize_headers(value)),
-        Field(default_factory=dict),
-    ] = {}
+    # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+    if TYPE_CHECKING:
+        root: dict[str, str] = {}
+    else:
+        root: Annotated[
+            dict[str, str],
+            PlainValidator(lambda value: _normalize_headers(value)),
+            Field(default_factory=dict),
+        ]
     def __getitem__(self, key: str) -> str:
         return self.root[key.lower()]
@@ -110,9 +104,9 @@ class ConcurrencySettings:
     def __init__(
         self,
         min_concurrency: int = 1,
-        max_concurrency: int = 200,
+        max_concurrency: int = 100,
         max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int | None = None,
+        desired_concurrency: int = 10,
     ) -> None:
         """Initialize a new instance.
@@ -125,21 +119,24 @@ class ConcurrencySettings:
             desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
                 if there is a large enough supply of them. By default, it is `min_concurrency`.
         """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
         if min_concurrency < 1:
             raise ValueError('min_concurrency must be 1 or larger')
         if max_concurrency < min_concurrency:
             raise ValueError('max_concurrency cannot be less than min_concurrency')
+        if desired_concurrency < min_concurrency:
+            raise ValueError('desired_concurrency cannot be less than min_concurrency')
+        if desired_concurrency > max_concurrency:
+            raise ValueError('desired_concurrency cannot be greater than max_concurrency')
         if max_tasks_per_minute <= 0:
             raise ValueError('max_tasks_per_minute must be positive')
         self.min_concurrency = min_concurrency
         self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
+        self.desired_concurrency = desired_concurrency
         self.max_tasks_per_minute = max_tasks_per_minute
@@ -180,6 +177,17 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
     requests: Sequence[str | Request]
     """Requests to be added to the `RequestManager`."""
+    rq_id: str | None
+    """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""
+    rq_name: str | None
+    """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+    rq_alias: str | None
+    """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
 class PushDataKwargs(TypedDict):
     """Keyword arguments for dataset's `push_data` method."""
@@ -261,10 +269,18 @@ class RequestHandlerRunResult:
     async def add_requests(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         """Track a call to the `add_requests` context helper."""
-        self.add_requests_calls.append(AddRequestsKwargs(requests=requests, **kwargs))
+        specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+        if specified_params > 1:
+            raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')
+        self.add_requests_calls.append(
+            AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
+        )
     async def push_data(
         self,
@@ -311,12 +327,21 @@ class AddRequestsFunction(Protocol):
     def __call__(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.
         Args:
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """
@@ -344,12 +369,21 @@ class EnqueueLinksFunction(Protocol):
         label: str | None = None,
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...
     @overload
     def __call__(
-        self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs]
+        self,
+        *,
+        requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...
     def __call__(
@@ -360,6 +394,9 @@ class EnqueueLinksFunction(Protocol):
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call enqueue links function.
@@ -377,6 +414,12 @@ class EnqueueLinksFunction(Protocol):
                 - `'skip'` to exclude the request from being enqueued,
                 - `'unchanged'` to use the original request options without modification.
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """

crawlee/_utils/raise_if_too_many_kwargs.py ADDED Viewed

@@ -0,0 +1,12 @@
+from typing import Any
+def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
+    """Raise ValueError if there are more non-None kwargs then max_kwargs."""
+    none_kwargs_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]
+    if len(none_kwargs_names) > max_kwargs:
+        all_kwargs_names = [f'"{kwarg_name}"' for kwarg_name in kwargs]
+        raise ValueError(
+            f'Only one of {", ".join(all_kwargs_names)} can be specified, but following arguments were '
+            f'specified: {", ".join(none_kwargs_names)}.'
+        )

crawlee/_utils/recoverable_state.py CHANGED Viewed

@@ -4,12 +4,14 @@ from typing import TYPE_CHECKING, Generic, Literal, TypeVar
 from pydantic import BaseModel
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.events._types import Event, EventPersistStateData
 if TYPE_CHECKING:
     import logging
+    from collections.abc import Callable, Coroutine
-    from crawlee.storages._key_value_store import KeyValueStore
+    from crawlee.storages import KeyValueStore
 TStateModel = TypeVar('TStateModel', bound=BaseModel)
@@ -37,6 +39,7 @@ class RecoverableState(Generic[TStateModel]):
         persistence_enabled: Literal[True, False, 'explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_kvs_id: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         logger: logging.Logger,
     ) -> None:
         """Initialize a new recoverable state object.
@@ -51,16 +54,40 @@ class RecoverableState(Generic[TStateModel]):
                 If neither a name nor and id are supplied, the default store will be used.
             persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.
                 If neither a name nor and id are supplied, the default store will be used.
+            persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If
+                not provided, a system-wide KeyValueStore will be used, based on service locator configuration.
             logger: A logger instance for logging operations related to state persistence
         """
+        raise_if_too_many_kwargs(
+            persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_id=persist_state_kvs_id,
+            persist_state_kvs_factory=persist_state_kvs_factory,
+        )
+        if not persist_state_kvs_factory:
+            logger.debug(
+                'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '
+                'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '
+                'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid '
+                'global side effects.'
+            )
         self._default_state = default_state
         self._state_type: type[TStateModel] = self._default_state.__class__
         self._state: TStateModel | None = None
         self._persistence_enabled = persistence_enabled
         self._persist_state_key = persist_state_key
-        self._persist_state_kvs_name = persist_state_kvs_name
-        self._persist_state_kvs_id = persist_state_kvs_id
-        self._key_value_store: 'KeyValueStore | None' = None  # noqa: UP037
+        if persist_state_kvs_factory is None:
+            async def kvs_factory() -> KeyValueStore:
+                from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+                return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)
+            self._persist_state_kvs_factory = kvs_factory
+        else:
+            self._persist_state_kvs_factory = persist_state_kvs_factory
+        self._key_value_store: KeyValueStore | None = None
         self._log = logger
     async def initialize(self) -> TStateModel:
@@ -77,11 +104,8 @@ class RecoverableState(Generic[TStateModel]):
             return self.current_value
         # Import here to avoid circular imports.
-        from crawlee.storages._key_value_store import KeyValueStore  # noqa: PLC0415
-        self._key_value_store = await KeyValueStore.open(
-            name=self._persist_state_kvs_name, id=self._persist_state_kvs_id
-        )
+        self._key_value_store = await self._persist_state_kvs_factory()
         await self._load_saved_state()

crawlee/_utils/recurring_task.py CHANGED Viewed

@@ -7,6 +7,9 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from collections.abc import Callable
     from datetime import timedelta
+    from types import TracebackType
+    from typing_extensions import Self
 logger = getLogger(__name__)
@@ -26,6 +29,18 @@ class RecurringTask:
         self.delay = delay
         self.task: asyncio.Task | None = None
+    async def __aenter__(self) -> Self:
+        self.start()
+        return self
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        await self.stop()
     async def _wrapper(self) -> None:
         """Continuously execute the provided function with the specified delay.

crawlee/_utils/robots.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+from logging import getLogger
 from typing import TYPE_CHECKING
 from protego import Protego
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
     from crawlee.proxy_configuration import ProxyInfo
+logger = getLogger(__name__)
 class RobotsTxtFile:
     def __init__(
         self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ class RobotsTxtFile:
             http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
-        response = await http_client.send_request(url, proxy_info=proxy_info)
-        body = (
-            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
-        )
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)
+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))
+        except Exception as e:
+            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
-        robots = Protego.parse(body.decode('utf-8'))
+            robots = Protego.parse('User-agent: *\nAllow: /')
         return cls(url, robots, http_client=http_client, proxy_info=proxy_info)

crawlee/_utils/sitemap.py CHANGED Viewed

@@ -335,7 +335,7 @@ async def _fetch_and_process_sitemap(
                         # Check if the first chunk is a valid gzip header
                         if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
                             decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
-                            first_chunk = False
+                        first_chunk = False
                         chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
                         text_chunk = decoder.decode(chunk)

crawlee/_utils/urls.py CHANGED Viewed

@@ -7,6 +7,7 @@ from yarl import URL
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger
 def is_url_absolute(url: str) -> bool:
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))
-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
     """Convert an iterator of relative URLs to absolute URLs using a base URL."""
     for url in urls:
         if is_url_absolute(url):
             yield url
         else:
-            yield convert_to_absolute_url(base_url, url)
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url
 _http_url_adapter = TypeAdapter(AnyHttpUrl)

crawlee/browsers/_browser_pool.py CHANGED Viewed

@@ -118,7 +118,10 @@ class BrowserPool:
         """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
         Args:
-            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided

crawlee/browsers/_playwright_browser_controller.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from __future__ import annotations
+from asyncio import Lock
 from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Any, cast
@@ -77,6 +78,19 @@ class PlaywrightBrowserController(BrowserController):
         self._total_opened_pages = 0
+        self._context_creation_lock: Lock | None = None
+    async def _get_context_creation_lock(self) -> Lock:
+        """Get context checking and creation lock.
+        It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
+        memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
+        """
+        if self._context_creation_lock:
+            return self._context_creation_lock
+        self._context_creation_lock = Lock()
+        return self._context_creation_lock
     @property
     @override
     def pages(self) -> list[Page]:
@@ -137,12 +151,6 @@ class PlaywrightBrowserController(BrowserController):
         Raises:
             ValueError: If the browser has reached the maximum number of open pages.
         """
-        if not self._browser_context:
-            self._browser_context = await self._create_browser_context(
-                browser_new_context_options=browser_new_context_options,
-                proxy_info=proxy_info,
-            )
         if not self.has_free_capacity:
             raise ValueError('Cannot open more pages in this browser.')
@@ -154,11 +162,12 @@ class PlaywrightBrowserController(BrowserController):
             )
             page = await new_context.new_page()
         else:
-            if not self._browser_context:
-                self._browser_context = await self._create_browser_context(
-                    browser_new_context_options=browser_new_context_options,
-                    proxy_info=proxy_info,
-                )
+            async with await self._get_context_creation_lock():
+                if not self._browser_context:
+                    self._browser_context = await self._create_browser_context(
+                        browser_new_context_options=browser_new_context_options,
+                        proxy_info=proxy_info,
+                    )
             page = await self._browser_context.new_page()
         # Handle page close event
@@ -169,7 +178,6 @@ class PlaywrightBrowserController(BrowserController):
         self._last_page_opened_at = datetime.now(timezone.utc)
         self._total_opened_pages += 1
         return page
     @override
@@ -206,10 +214,9 @@ class PlaywrightBrowserController(BrowserController):
         `self._fingerprint_generator` is available.
         """
         browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
         if proxy_info:
             if browser_new_context_options.get('proxy'):
-                logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.")
+                logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")
             browser_new_context_options['proxy'] = ProxySettings(
                 server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
@@ -244,5 +251,4 @@ class PlaywrightBrowserController(BrowserController):
         browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
             'extra_http_headers', extra_http_headers
         )
         return await self._browser.new_context(**browser_new_context_options)

crawlee/browsers/_playwright_browser_plugin.py CHANGED Viewed

@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
     It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
     for creating new browser instances and provides a unified interface for interacting with different browser types
-    (chromium, firefox, and webkit). This class integrates configuration options for browser launches (headless mode,
-    executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
+    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
+    mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
     browser instance, ensuring that resource limits are respected.
     """
@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
         """Initialize a new instance.
         Args:
-            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                 storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
             'chromium_sandbox': not config.disable_browser_sandbox,
         }
+        if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
+            raise ValueError(
+                'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
+            )
+        # Map 'chrome' to 'chromium' with the 'chrome' channel.
+        if browser_type == 'chrome':
+            browser_type = 'chromium'
+            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+            default_launch_browser_options['channel'] = 'chrome'
         self._browser_type: BrowserType = browser_type
         self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
         self._browser_new_context_options = browser_new_context_options or {}

crawlee/browsers/_types.py CHANGED Viewed

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Literal
 if TYPE_CHECKING:
     from playwright.async_api import Page
-BrowserType = Literal['chromium', 'firefox', 'webkit']
+BrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome']
 @dataclass

crawlee/configuration.py CHANGED Viewed

@@ -28,7 +28,9 @@ class Configuration(BaseSettings):
     Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
     """
-    model_config = SettingsConfigDict(validate_by_name=True, validate_by_alias=True)
+    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
+    model_config = SettingsConfigDict(populate_by_name=True)
     internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
     """Timeout for the internal asynchronous operations."""

crawlee 0.6.13b43__py3-none-any.whl → 1.1.1b1__py3-none-any.whl

Potentially problematic release.

crawlee 0.6.13b43__py3-none-any.whl → 1.1.1b1__py3-none-any.whl