crawlee 0.6.13b31__py3-none-any.whl → 1.1.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +34 -22
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +86 -33
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +124 -37
- crawlee/crawlers/_playwright/_playwright_crawler.py +17 -5
- crawlee/events/_event_manager.py +3 -1
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +1 -1
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +33 -2
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +29 -25
- crawlee/storage_clients/_file_system/_request_queue_client.py +53 -34
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +16 -4
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +291 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +10 -2
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +17 -6
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +82 -59
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
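The most visible additions in the file list are the new Redis- and SQL-backed storage clients (`crawlee/storage_clients/_redis/` and `crawlee/storage_clients/_sql/`). The sketch below shows the general pattern for plugging a non-default storage client into a crawler; it uses `MemoryStorageClient`, which already exists in `crawlee.storage_clients`, because the export names and constructor arguments of the new Redis/SQL clients are not shown in this diff and should be treated as unknowns until verified against the package.

import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.storage_clients import MemoryStorageClient


async def main() -> None:
    # Any storage client implementation can be passed here; the new Redis/SQL
    # clients added in this release are expected to slot in the same way.
    crawler = HttpCrawler(storage_client=MemoryStorageClient())

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())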
crawlee/crawlers/_basic/_basic_crawler.py
CHANGED

@@ -27,6 +27,7 @@ from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locator
 from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
 from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
 from crawlee._request import Request, RequestOptions, RequestState
+from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
@@ -204,7 +205,7 @@ class _BasicCrawlerOptions(TypedDict):
     Returning `None` suppresses the status message."""


-class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
+class _BasicCrawlerOptionsGeneric(TypedDict, Generic[TCrawlingContext, TStatisticsState]):
     """Generic options the `BasicCrawler` constructor."""

     request_handler: NotRequired[Callable[[TCrawlingContext], Awaitable[None]]]
@@ -219,9 +220,9 @@ class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):


 class BasicCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _BasicCrawlerOptions,
     _BasicCrawlerOptionsGeneric[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `BasicCrawler` constructor.

@@ -346,14 +347,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            _logger: A logger instance, typically provided by a subclass, for consistent logging labels.
                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
        """
-
-
-
-
-
-            service_locator.set_event_manager(event_manager)
+        implicit_event_manager_with_explicit_config = False
+        if not configuration:
+            configuration = service_locator.get_configuration()
+        elif not event_manager:
+            implicit_event_manager_with_explicit_config = True

-
+        if not storage_client:
+            storage_client = service_locator.get_storage_client()
+
+        if not event_manager:
+            event_manager = service_locator.get_event_manager()
+
+        self._service_locator = ServiceLocator(
+            configuration=configuration, storage_client=storage_client, event_manager=event_manager
+        )
+
+        config = self._service_locator.get_configuration()

        # Core components
        self._request_manager = request_manager
@@ -419,17 +429,31 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
        httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
        self._logger = _logger or logging.getLogger(__name__)
+        if implicit_event_manager_with_explicit_config:
+            self._logger.warning(
+                'No event manager set, implicitly using event manager from global service_locator.'
+                'It is advised to explicitly set the event manager if explicit configuration is used as well.'
+            )
        self._statistics_log_format = statistics_log_format

        # Statistics
-
-
-
-
-
-
-
-
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )

        # Additional context managers to enter and exit
        self._additional_context_managers = _additional_context_managers or []
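The constructor hunks above replace the old pattern of mutating the global `service_locator` with a crawler-local `ServiceLocator`, and add a warning when a configuration is passed explicitly while the event manager is still taken from the global locator. A minimal caller-side sketch, assuming the standard `Configuration` and `LocalEventManager` classes:

from crawlee.configuration import Configuration
from crawlee.crawlers import BasicCrawler
from crawlee.events import LocalEventManager

# Passing an explicit configuration together with an explicit event manager keeps all
# services crawler-local and avoids the new "implicitly using event manager from global
# service_locator" warning emitted by the code above.
crawler = BasicCrawler(
    configuration=Configuration(),
    event_manager=LocalEventManager(),
)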
@@ -548,7 +572,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
    async def get_request_manager(self) -> RequestManager:
        """Return the configured request manager. If none is configured, open and return the default request queue."""
        if not self._request_manager:
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )

        return self._request_manager

@@ -557,18 +584,32 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        *,
        id: str | None = None,
        name: str | None = None,
+        alias: str | None = None,
    ) -> Dataset:
        """Return the `Dataset` with the given ID or name. If none is provided, return the default one."""
-        return await Dataset.open(
+        return await Dataset.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

    async def get_key_value_store(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
+        alias: str | None = None,
    ) -> KeyValueStore:
        """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS."""
-        return await KeyValueStore.open(
+        return await KeyValueStore.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

    def error_handler(
        self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
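`get_dataset` and `get_key_value_store` gain an `alias` parameter next to `id` and `name`. Per the docstring changes later in this file, `name` refers to a global named storage while `alias` addresses an unnamed, run-scoped one. A small illustrative sketch (the names and alias values are made up):

from crawlee.crawlers import BasicCrawler


async def open_scoped_storages(crawler: BasicCrawler) -> None:
    # Named storage shared across runs vs. run-scoped storage addressed by alias.
    shared = await crawler.get_dataset(name='products')
    scratch = await crawler.get_key_value_store(alias='per-run-state')
    await shared.push_data({'example': True})
    await scratch.set_value('started', True)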
@@ -627,7 +668,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        request_manager = await self.get_request_manager()
        if purge_request_queue and isinstance(request_manager, RequestQueue):
            await request_manager.drop()
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )

        if requests is not None:
            await self.add_requests(requests)
@@ -654,7 +698,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        except CancelledError:
            pass
        finally:
-            await self._crawler_state_rec_task.stop()
            if threading.current_thread() is threading.main_thread():
                with suppress(NotImplementedError):
                    asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -684,9 +727,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        return final_statistics

    async def _run_crawler(self) -> None:
-        event_manager =
-
-        self._crawler_state_rec_task.start()
+        event_manager = self._service_locator.get_event_manager()

        # Collect the context managers to be entered. Context managers that are already active are excluded,
        # as they were likely entered by the caller, who will also be responsible for exiting them.
@@ -698,6 +739,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                self._statistics,
                self._session_pool if self._use_session_pool else None,
                self._http_client,
+                self._crawler_state_rec_task,
                *self._additional_context_managers,
            )
            if cm and getattr(cm, 'active', False) is False
@@ -772,6 +814,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        self,
        dataset_id: str | None = None,
        dataset_name: str | None = None,
+        dataset_alias: str | None = None,
        **kwargs: Unpack[GetDataKwargs],
    ) -> DatasetItemsListPage:
        """Retrieve data from a `Dataset`.
@@ -781,13 +824,20 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

        Args:
            dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
            kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method.

        Returns:
            The retrieved data.
        """
-        dataset = await Dataset.open(
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )
        return await dataset.get_data(**kwargs)

    async def export_data(
@@ -795,6 +845,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        path: str | Path,
        dataset_id: str | None = None,
        dataset_name: str | None = None,
+        dataset_alias: str | None = None,
    ) -> None:
        """Export all items from a Dataset to a JSON or CSV file.

@@ -804,10 +855,17 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

        Args:
            path: The destination file path. Must end with '.json' or '.csv'.
-            dataset_id: The ID of the Dataset to export from.
-            dataset_name: The name of the Dataset to export from
+            dataset_id: The ID of the Dataset to export from.
+            dataset_name: The name of the Dataset to export from (global scope, named storage).
+            dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
        """
-        dataset = await
+        dataset = await Dataset.open(
+            id=dataset_id,
+            name=dataset_name,
+            alias=dataset_alias,
+            storage_client=self._service_locator.get_storage_client(),
+            configuration=self._service_locator.get_configuration(),
+        )

        path = path if isinstance(path, Path) else Path(path)
        dst = path.open('w', newline='')
@@ -824,6 +882,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        data: list[dict[str, Any]] | dict[str, Any],
        dataset_id: str | None = None,
        dataset_name: str | None = None,
+        dataset_alias: str | None = None,
        **kwargs: Unpack[PushDataKwargs],
    ) -> None:
        """Push data to a `Dataset`.
@@ -834,10 +893,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        Args:
            data: The data to push to the `Dataset`.
            dataset_id: The ID of the `Dataset`.
-            dataset_name: The name of the `Dataset
+            dataset_name: The name of the `Dataset` (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` (run scope, unnamed storage).
            kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method.
        """
-        dataset = await self.get_dataset(id=dataset_id, name=dataset_name)
+        dataset = await self.get_dataset(id=dataset_id, name=dataset_name, alias=dataset_alias)
        await dataset.push_data(data, **kwargs)

    def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) -> bool:
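`get_data`, `export_data`, and `push_data` grow a matching `dataset_alias` parameter. A hedged usage sketch (alias and file name are illustrative):

from crawlee.crawlers import BasicCrawler


async def collect_and_export(crawler: BasicCrawler) -> None:
    # Push to a run-scoped dataset addressed by alias, then export it to a file.
    await crawler.push_data({'url': 'https://crawlee.dev'}, dataset_alias='run-results')
    await crawler.export_data('results.json', dataset_alias='run-results')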
@@ -894,6 +954,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
            | None = None,
            requests: Sequence[str | Request] | None = None,
+            rq_id: str | None = None,
+            rq_name: str | None = None,
+            rq_alias: str | None = None,
            **kwargs: Unpack[EnqueueLinksKwargs],
        ) -> None:
            kwargs.setdefault('strategy', 'same-hostname')
@@ -905,7 +968,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                    '`transform_request_function` arguments when `requests` is provided.'
                )
                # Add directly passed requests.
-                await context.add_requests(
+                await context.add_requests(
+                    requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+                )
            else:
                # Add requests from extracted links.
                await context.add_requests(
@@ -914,7 +979,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                        label=label,
                        user_data=user_data,
                        transform_request_function=transform_request_function,
+                        **kwargs,
                    ),
+                    rq_id=rq_id,
+                    rq_name=rq_name,
+                    rq_alias=rq_alias,
                    **kwargs,
                )

@@ -1191,10 +1260,28 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
        result = self._context_result_map[context]

-
+        base_request_manager = await self.get_request_manager()
+
        origin = context.request.loaded_url or context.request.url

        for add_requests_call in result.add_requests_calls:
+            rq_id = add_requests_call.get('rq_id')
+            rq_name = add_requests_call.get('rq_name')
+            rq_alias = add_requests_call.get('rq_alias')
+            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+            if specified_params > 1:
+                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
+            if rq_id or rq_name or rq_alias:
+                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
+                    id=rq_id,
+                    name=rq_name,
+                    alias=rq_alias,
+                    storage_client=self._service_locator.get_storage_client(),
+                    configuration=self._service_locator.get_configuration(),
+                )
+            else:
+                request_manager = base_request_manager
+
            requests = list[Request]()

            base_url = url if (url := add_requests_call.get('base_url')) else origin
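The `rq_id` / `rq_name` / `rq_alias` arguments threaded through `enqueue_links`, `add_requests`, and `_commit_request_handler_result` let a handler route newly discovered requests into a request queue other than the crawler's default one, with a guard that only one of the three may be given. A sketch of how this would look from a request handler (the queue name is illustrative):

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

crawler = BasicCrawler()


@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
    # Route follow-up requests into a separate, named request queue instead of the
    # crawler's own; passing more than one of rq_id / rq_name / rq_alias raises ValueError.
    await context.add_requests(['https://crawlee.dev/docs'], rq_name='secondary-queue')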
@@ -1226,8 +1313,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
    ) -> None:
        """Store key value store changes recorded in result."""
-        for (id, name), changes in result.key_value_store_changes.items():
-            store = await get_kvs(id=id, name=name)
+        for (id, name, alias), changes in result.key_value_store_changes.items():
+            store = await get_kvs(id=id, name=name, alias=alias)
            for key, value in changes.updates.items():
                await store.set_value(key, value.content, value.content_type)

@@ -1520,7 +1607,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

    async def _crawler_state_task(self) -> None:
        """Emit a persist state event with the given migration status."""
-        event_manager =
+        event_manager = self._service_locator.get_event_manager()

        current_state = self.statistics.state

crawlee/crawlers/_playwright/_playwright_crawler.py
CHANGED

@@ -12,6 +12,7 @@ from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
@@ -113,7 +114,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]):
        browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
        user_data_dir: Path to a user data directory, which stores browser session data like cookies
            and local storage.
-        browser_type: The type of browser to launch
+        browser_type: The type of browser to launch:
+            - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+            - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+              the system.
            This option should not be used if `browser_pool` is provided.
        browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
            directly to Playwright's `browser_type.launch` method. For more details, refer to the
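The docstring now documents a fourth `browser_type` value, 'chrome', which drives a locally installed Google Chrome instead of a Playwright-managed browser. A hedged sketch (requires Playwright plus a local Chrome installation):

from crawlee.crawlers import PlaywrightCrawler

# 'chromium', 'firefox' and 'webkit' use Playwright-managed browsers; 'chrome' launches
# the locally installed Google Chrome, so it only works where Chrome is present.
crawler = PlaywrightCrawler(browser_type='chrome', headless=True)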
@@ -152,7 +156,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]):
        ):
            raise ValueError(
                'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir`
+                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                '`fingerprint_generator` arguments when `browser_pool` is provided.'
            )

@@ -194,6 +198,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]):

        kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
        super().__init__(**kwargs)

    async def _open_page(
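This hunk makes browser-based crawlers start with `desired_concurrency=1` unless the caller provides their own settings. To restore more aggressive parallelism, pass `ConcurrencySettings` explicitly (the numbers below are just an example):

from crawlee import ConcurrencySettings
from crawlee.crawlers import PlaywrightCrawler

# Overrides the new browser-crawler default of desired_concurrency=1.
crawler = PlaywrightCrawler(
    concurrency_settings=ConcurrencySettings(desired_concurrency=4, max_concurrency=8),
)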
@@ -361,7 +369,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]):
            links_iterator: Iterator[str] = iter(
                [url for element in elements if (url := await element.get_attribute('href')) is not None]
            )
-            links_iterator = to_absolute_url_iterator(
+            links_iterator = to_absolute_url_iterator(
+                context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+            )

            if robots_txt_file:
                skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -489,7 +499,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
    """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

    browser_type: NotRequired[BrowserType]
-    """The type of browser to launch
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
    This option should not be used if `browser_pool` is provided."""

    browser_launch_options: NotRequired[Mapping[str, Any]]
@@ -509,9 +521,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):


 class PlaywrightCrawlerOptions(
-    Generic[TCrawlingContext, TStatisticsState],
     _PlaywrightCrawlerAdditionalOptions,
     BasicCrawlerOptions[TCrawlingContext, StatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
 ):
     """Arguments for the `AbstractHttpCrawler` constructor.

crawlee/events/_event_manager.py
CHANGED
@@ -130,11 +130,13 @@ class EventManager:
        if not self._active:
            raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
        await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
        self._event_emitter.remove_all_listeners()
        self._listener_tasks.clear()
        self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
        self._active = False

    @overload
crawlee/events/_types.py
CHANGED
@@ -40,7 +40,7 @@ class Event(str, Enum):
 class EventPersistStateData(BaseModel):
     """Data for the persist state event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     is_migrating: Annotated[bool, Field(alias='isMigrating')]

@@ -49,7 +49,7 @@ class EventPersistStateData(BaseModel):
 class EventSystemInfoData(BaseModel):
     """Data for the system info event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     cpu_info: Annotated[CpuInfo, Field(alias='cpuInfo')]
     memory_info: Annotated[
@@ -62,7 +62,7 @@ class EventSystemInfoData(BaseModel):
 class EventMigratingData(BaseModel):
     """Data for the migrating event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     # The remaining time in seconds before the migration is forced and the process is killed
     # Optional because it's not present when the event handler is called manually
@@ -73,21 +73,21 @@ class EventMigratingData(BaseModel):
 class EventAbortingData(BaseModel):
     """Data for the aborting event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventExitData(BaseModel):
     """Data for the exit event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)


 @docs_group('Event data')
 class EventCrawlerStatusData(BaseModel):
     """Data for the crawler status event."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     message: str
     """A message describing the current status of the crawler."""
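All event payload models switch their `model_config` to `validate_by_name=True, validate_by_alias=True`, so payloads can be built from either the Pythonic field names or the camelCase aliases. A small sketch using `EventPersistStateData` (the import path assumes the model is re-exported from `crawlee.events`):

from crawlee.events import EventPersistStateData

# Both spellings validate now: the camelCase alias and the snake_case field name.
by_alias = EventPersistStateData.model_validate({'isMigrating': False})
by_name = EventPersistStateData.model_validate({'is_migrating': False})
assert by_alias == by_name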
crawlee/fingerprint_suite/_header_generator.py
CHANGED

@@ -11,9 +11,9 @@ if TYPE_CHECKING:


 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
crawlee/fingerprint_suite/_types.py
CHANGED

@@ -11,7 +11,7 @@ SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']


 class ScreenOptions(BaseModel):
-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     """Defines the screen constrains for the fingerprint generator."""

@@ -31,7 +31,7 @@ class ScreenOptions(BaseModel):
 class HeaderGeneratorOptions(BaseModel):
     """Collection of header related attributes that can be used by the fingerprint generator."""

-    model_config = ConfigDict(extra='forbid',
+    model_config = ConfigDict(extra='forbid', validate_by_name=True, validate_by_alias=True)

     browsers: list[SupportedBrowserType] | None = None
     """List of BrowserSpecifications to generate the headers for."""
crawlee/otel/crawler_instrumentor.py
CHANGED

@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

        if request_handling_instrumentation:

-            async def
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                with self._tracer.start_as_current_span(
                    name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined]  # valid in our context
                    attributes={
@@ -111,8 +111,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
        # Handpicked interesting methods to instrument
        self._instrumented.extend(
            [
-                (_Middleware, 'action',
-                (_Middleware, 'cleanup',
+                (_Middleware, 'action', middleware_wrapper),
+                (_Middleware, 'cleanup', middleware_wrapper),
                (ContextPipeline, '__call__', context_pipeline_wrapper),
                (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
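For context, `CrawlerInstrumentor` is an OpenTelemetry `BaseInstrumentor`, and the wrapper renamed here is attached to the middleware methods listed above. A hedged sketch of turning it on; it assumes a tracer provider is already configured elsewhere and that `request_handling_instrumentation` is a constructor flag gating these spans, as the hunk suggests:

from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor

# Instruments the handpicked crawler internals listed in the hunk above.
CrawlerInstrumentor(request_handling_instrumentation=True).instrument()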
crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml
CHANGED

@@ -5,8 +5,8 @@
 # % endif
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
-# % elif cookiecutter.http_client == '
-# % do extras.append('
+# % elif cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
 # % endif

 [project]
crawlee/request_loaders/_request_list.py
CHANGED

@@ -17,7 +17,7 @@ logger = getLogger(__name__)


 class RequestListState(BaseModel):
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     next_index: Annotated[int, Field(alias='nextIndex')] = 0
     next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None
crawlee/request_loaders/_sitemap_request_loader.py
CHANGED

@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -56,7 +57,7 @@ class SitemapRequestLoaderState(BaseModel):
    `in_progress` is cleared.
    """

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    url_queue: Annotated[deque[str], Field(alias='urlQueue')]
    """Queue of URLs extracted from sitemaps and ready for processing."""
@@ -90,6 +91,11 @@ class SitemapRequestLoaderState(BaseModel):
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).

+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

@@ -107,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
        exclude: list[re.Pattern[Any] | Glob] | None = None,
        max_buffer_size: int = 200,
        persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
    ) -> None:
        """Initialize the sitemap request loader.

@@ -120,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
            persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                When provided, allows resuming from where it left off after interruption.
                If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
        """
        self._http_client = http_client
        self._sitemap_urls = sitemap_urls
@@ -127,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
        self._exclude = exclude
        self._proxy_info = proxy_info
        self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function

        # Synchronization for queue operations
        self._queue_has_capacity = asyncio.Event()
@@ -308,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):

        async with self._queue_lock:
            url = state.url_queue.popleft()
-
-
+            request_option = RequestOptions(url=url)
+            if self._transform_request_function:
+                transform_request_option = self._transform_request_function(request_option)
+                if transform_request_option == 'skip':
+                    state.total_count -= 1
+                    continue
+                if transform_request_option != 'unchanged':
+                    request_option = transform_request_option
+            request = Request.from_url(**request_option)
            state.in_progress.add(request.url)
            if len(state.url_queue) < self._max_buffer_size:
                self._queue_has_capacity.set()
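The final hunk wires the new `transform_request_function` into URL consumption: returning 'skip' drops a URL, 'unchanged' keeps it as-is, and a returned `RequestOptions` dict replaces the original before `Request.from_url(**request_option)` is called. A runnable sketch, assuming `HttpxHttpClient` is available in your installation (the sitemap URL and label are illustrative):

import asyncio

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop PDF links entirely, label everything else so handlers can route on it.
    if options['url'].endswith('.pdf'):
        return 'skip'
    options['label'] = 'FROM_SITEMAP'
    return options


async def main() -> None:
    loader = SitemapRequestLoader(
        sitemap_urls=['https://crawlee.dev/sitemap.xml'],
        http_client=HttpxHttpClient(),
        transform_request_function=transform,
    )
    request = await loader.fetch_next_request()
    print(request)


if __name__ == '__main__':
    asyncio.run(main())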