crawlee 0.6.13b43__py3-none-any.whl → 1.1.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +67 -24
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
- crawlee/crawlers/_basic/_basic_crawler.py +51 -14
- crawlee/crawlers/_playwright/_playwright_crawler.py +16 -4
- crawlee/events/_event_manager.py +3 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +5 -4
- crawlee/storage_clients/_file_system/_dataset_client.py +4 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_file_system/_request_queue_client.py +28 -12
- crawlee/storage_clients/_file_system/_storage_client.py +2 -2
- crawlee/storage_clients/_memory/_dataset_client.py +4 -5
- crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +291 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +5 -3
- crawlee/storages/_key_value_store.py +11 -6
- crawlee/storages/_request_queue.py +5 -3
- crawlee/storages/_storage_instance_manager.py +54 -68
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +16 -5
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +69 -47
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED

@@ -167,7 +167,9 @@ class AbstractHttpCrawler(
         kwargs.setdefault('strategy', 'same-hostname')

         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
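Both the HTTP and the Playwright crawler (see the matching hunk further below) now pass `logger=context.log` into `to_absolute_url_iterator`, so links that cannot be resolved are logged instead of being dropped silently. A minimal sketch of what such a helper can look like — an illustrative stand-in, not crawlee's internal `crawlee/_utils/urls.py` implementation:

    import logging
    from collections.abc import Iterator
    from urllib.parse import urljoin


    def to_absolute_urls(base_url: str, urls: Iterator[str], logger: logging.Logger) -> Iterator[str]:
        """Resolve relative links against the page URL, logging ones that cannot be used."""
        for url in urls:
            absolute = urljoin(base_url, url)
            if not absolute.startswith(('http://', 'https://')):
                logger.debug('Skipping non-HTTP link: %s', url)
                continue
            yield absolute


    # The base URL is the *loaded* URL when available, so redirects are respected.
    links = to_absolute_urls(
        'https://example.com/section/',
        iter(['page.html', '/root.html', 'mailto:x@example.com']),
        logging.getLogger(__name__),
    )
    print(list(links))  # ['https://example.com/section/page.html', 'https://example.com/root.html']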
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
CHANGED

@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override

-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (
@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self

     async def __aexit__(
@@ -149,15 +148,15 @@ class AdaptivePlaywrightCrawler(
                 non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)

         # Sub crawlers related.
@@ -166,11 +165,11 @@ class AdaptivePlaywrightCrawler(
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
@@ -315,7 +314,7 @@ class AdaptivePlaywrightCrawler(
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result)
+            return SubCrawlerRun(result=result, run_context=context_linked_to_result)
         except Exception as e:
             return SubCrawlerRun(exception=e)

@@ -371,7 +370,8 @@ class AdaptivePlaywrightCrawler(
             self.track_http_only_request_handler_runs()

         static_run = await self._crawl_one(rendering_type='static', context=context)
-        if static_run.result and self.result_checker(static_run.result):
+        if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+            self._update_context_from_copy(context, static_run.run_context)
             self._context_result_map[context] = static_run.result
             return
         if static_run.exception:
@@ -402,13 +402,10 @@ class AdaptivePlaywrightCrawler(
         if pw_run.exception is not None:
             raise pw_run.exception

-        if pw_run.result:
-            self._context_result_map[context] = pw_run.result
-
+        if pw_run.result and pw_run.run_context:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                 if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                     detection_result = 'static'
                 else:
@@ -417,6 +414,9 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)

+            self._update_context_from_copy(context, pw_run.run_context)
+            self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -451,8 +451,32 @@ class AdaptivePlaywrightCrawler(
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1

+    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+        """Update mutable fields of `context` from `context_copy`.
+
+        Uses object.__setattr__ to bypass frozen dataclass restrictions,
+        allowing state synchronization after isolated crawler execution.
+        """
+        updating_attributes = {
+            'request': ('headers', 'user_data'),
+            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+        }
+
+        for attr, sub_attrs in updating_attributes.items():
+            original_sub_obj = getattr(context, attr)
+            copy_sub_obj = getattr(context_copy, attr)
+
+            # Check that both sub objects are not None
+            if original_sub_obj is None or copy_sub_obj is None:
+                continue
+
+            for sub_attr in sub_attrs:
+                new_value = getattr(copy_sub_obj, sub_attr)
+                object.__setattr__(original_sub_obj, sub_attr, new_value)
+

 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
+    run_context: BasicCrawlingContext | None = None
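The new `_update_context_from_copy` writes through the frozen-dataclass guard on the crawling context with `object.__setattr__`. A self-contained demonstration of that mechanism (the dataclass here is illustrative, not crawlee's `BasicCrawlingContext`):

    from dataclasses import dataclass, field


    @dataclass(frozen=True)
    class CrawlContext:
        url: str
        user_data: dict = field(default_factory=dict)


    ctx = CrawlContext(url='https://example.com')

    # `ctx.user_data = {...}` would raise FrozenInstanceError; object.__setattr__
    # bypasses the frozen guard, which is how the adaptive crawler copies state
    # back from the sub-crawler's isolated context after a run.
    object.__setattr__(ctx, 'user_data', {'rendering': 'static'})
    print(ctx.user_data)  # {'rendering': 'static'}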
crawlee/crawlers/_basic/_basic_crawler.py
CHANGED

@@ -437,14 +437,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._statistics_log_format = statistics_log_format

         # Statistics
-
-
-
-
-
-
-
-
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )

         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
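When no `statistics` instance is supplied, the crawler now builds one whose persistence goes through a lazily opened key-value store. A sketch of supplying the same pieces yourself — the keyword names are taken from the call in the hunk above, and the store name is illustrative:

    import logging

    from crawlee.statistics import Statistics
    from crawlee.storages import KeyValueStore


    async def persist_state_factory() -> KeyValueStore:
        # Deferred: the store is only opened when statistics first need to persist.
        return await KeyValueStore.open(name='my-crawler-state')  # hypothetical store name


    stats = Statistics.with_default_state(
        persistence_enabled=True,
        periodic_message_logger=logging.getLogger('my_crawler'),
        log_message='Current request statistics:',
        persist_state_kvs_factory=persist_state_factory,
    )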
@@ -659,7 +668,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         request_manager = await self.get_request_manager()
         if purge_request_queue and isinstance(request_manager, RequestQueue):
             await request_manager.drop()
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )

         if requests is not None:
             await self.add_requests(requests)
@@ -686,7 +698,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             except CancelledError:
                 pass
             finally:
-                await self._crawler_state_rec_task.stop()
                 if threading.current_thread() is threading.main_thread():
                     with suppress(NotImplementedError):
                         asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -718,8 +729,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def _run_crawler(self) -> None:
         event_manager = self._service_locator.get_event_manager()

-        self._crawler_state_rec_task.start()
-
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [
@@ -730,6 +739,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False
@@ -944,6 +954,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
         | None = None,
         requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         kwargs.setdefault('strategy', 'same-hostname')
@@ -955,7 +968,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                     '`transform_request_function` arguments when `requests` is provided.'
                 )
             # Add directly passed requests.
-            await context.add_requests(
+            await context.add_requests(
+                requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+            )
         else:
             # Add requests from extracted links.
             await context.add_requests(
@@ -964,7 +979,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                     label=label,
                     user_data=user_data,
                     transform_request_function=transform_request_function,
+                    **kwargs,
                 ),
+                rq_id=rq_id,
+                rq_name=rq_name,
+                rq_alias=rq_alias,
                 **kwargs,
             )

@@ -1241,10 +1260,28 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
         result = self._context_result_map[context]

-
+        base_request_manager = await self.get_request_manager()
+
         origin = context.request.loaded_url or context.request.url

         for add_requests_call in result.add_requests_calls:
+            rq_id = add_requests_call.get('rq_id')
+            rq_name = add_requests_call.get('rq_name')
+            rq_alias = add_requests_call.get('rq_alias')
+            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+            if specified_params > 1:
+                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
+            if rq_id or rq_name or rq_alias:
+                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
+                    id=rq_id,
+                    name=rq_name,
+                    alias=rq_alias,
+                    storage_client=self._service_locator.get_storage_client(),
+                    configuration=self._service_locator.get_configuration(),
+                )
+            else:
+                request_manager = base_request_manager
+
             requests = list[Request]()

             base_url = url if (url := add_requests_call.get('base_url')) else origin
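Taken together, these hunks let a request handler route enqueued links to a request queue other than the crawler's default by passing exactly one of `rq_id`, `rq_name`, or `rq_alias`; combining them raises the `ValueError` shown above. A hedged usage sketch, assuming the new parameters are exposed through `enqueue_links` as the signature change suggests (queue name and selector are illustrative):

    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

    crawler = ParselCrawler()


    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # Product links go to a separate named queue that another crawler
        # (or a later run) can consume independently of the default queue.
        await context.enqueue_links(selector='a.product', rq_name='products')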
crawlee/crawlers/_playwright/_playwright_crawler.py
CHANGED

@@ -12,6 +12,7 @@ from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
@@ -113,7 +114,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
         user_data_dir: Path to a user data directory, which stores browser session data like cookies
             and local storage.
-        browser_type: The type of browser to launch
+        browser_type: The type of browser to launch:
+            - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+            - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                the system.
             This option should not be used if `browser_pool` is provided.
         browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
             directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -152,7 +156,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         ):
             raise ValueError(
                 'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir`
+                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                 '`fingerprint_generator` arguments when `browser_pool` is provided.'
             )

@@ -194,6 +198,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]

         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(**kwargs)

     async def _open_page(
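`PlaywrightCrawler` (and the adaptive crawler above) now defaults browser-based crawling to `ConcurrencySettings(desired_concurrency=1)` unless the caller supplies their own settings. A sketch of overriding that default, combined with the new `'chrome'` browser type (assumes Google Chrome is installed locally; the concurrency numbers are illustrative):

    from crawlee import ConcurrencySettings
    from crawlee.crawlers import PlaywrightCrawler

    crawler = PlaywrightCrawler(
        browser_type='chrome',  # locally installed Google Chrome, per the docstring change above
        # Any explicit value suppresses the new desired_concurrency=1 default.
        concurrency_settings=ConcurrencySettings(desired_concurrency=4, max_concurrency=8),
    )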
@@ -361,7 +369,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-        links_iterator = to_absolute_url_iterator(
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -489,7 +499,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
     This option should not be used if `browser_pool` is provided."""

     browser_launch_options: NotRequired[Mapping[str, Any]]
crawlee/events/_event_manager.py
CHANGED

@@ -130,11 +130,13 @@ class EventManager:
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
         self._active = False

     @overload
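The close sequence is reordered so the periodic persist-state task stops first and one final event is emitted before listeners are torn down, guaranteeing the latest state is saved. The general "stop the timer, then flush once" pattern, sketched in plain asyncio (names are illustrative, not crawlee's `RecurringTask` API):

    import asyncio
    import contextlib


    class PeriodicPersister:
        """Illustrative stand-in for a recurring persist-state task."""

        def __init__(self, interval: float = 60.0) -> None:
            self._interval = interval
            self._task: asyncio.Task | None = None

        async def persist(self) -> None:
            print('state persisted')

        async def _loop(self) -> None:
            while True:
                await asyncio.sleep(self._interval)
                await self.persist()

        def start(self) -> None:
            self._task = asyncio.create_task(self._loop())

        async def close(self) -> None:
            # Stop the periodic emission first...
            if self._task is not None:
                self._task.cancel()
                with contextlib.suppress(asyncio.CancelledError):
                    await self._task
            # ...then emit one final time so no state written after the
            # last tick is lost - the same ordering as the hunk above.
            await self.persist()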
crawlee/fingerprint_suite/_header_generator.py
CHANGED

@@ -11,9 +11,9 @@ if TYPE_CHECKING:


 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type == 'chromium':
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
crawlee/otel/crawler_instrumentor.py
CHANGED

@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

         if request_handling_instrumentation:

-            async def
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined] # valid in our context
                     attributes={
@@ -111,8 +111,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
         # Handpicked interesting methods to instrument
         self._instrumented.extend(
             [
-                (_Middleware, 'action',
-                (_Middleware, 'cleanup',
+                (_Middleware, 'action', middleware_wrapper),
+                (_Middleware, 'cleanup', middleware_wrapper),
                 (ContextPipeline, '__call__', context_pipeline_wrapper),
                 (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                 (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
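The `(wrapped, instance, args, kwargs)` signature of `middleware_wrapper` is the convention of the `wrapt` library that OpenTelemetry instrumentors build on. A dependency-free sketch of how such a wrapper gets attached to a method — the class, method, and print-based "span" are illustrative stand-ins; real instrumentors typically use `wrapt.wrap_function_wrapper` and a tracer:

    from contextlib import contextmanager


    @contextmanager
    def span(name: str):
        # Stand-in for tracer.start_as_current_span(...).
        print(f'enter span: {name}')
        try:
            yield
        finally:
            print(f'exit span: {name}')


    class Middleware:
        def action(self, value: int) -> int:
            return value * 2


    def action_wrapper(wrapped, instance, args, kwargs):
        # Run the original bound method inside a span named after it.
        with span(wrapped.__name__):
            return wrapped(*args, **kwargs)


    def instrument(cls: type, name: str, wrapper) -> None:
        original = getattr(cls, name)

        def patched(self, *args, **kwargs):
            return wrapper(original.__get__(self, cls), self, args, kwargs)

        setattr(cls, name, patched)


    instrument(Middleware, 'action', action_wrapper)
    print(Middleware().action(21))  # enter span / 42 / exit span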
crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml
CHANGED

@@ -5,8 +5,8 @@
 # % endif
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
-# % elif cookiecutter.http_client == '
-# % do extras.append('
+# % elif cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
 # % endif

 [project]
crawlee/request_loaders/_sitemap_request_loader.py
CHANGED

@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -90,6 +91,11 @@ class SitemapRequestLoaderState(BaseModel):
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).

+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

@@ -107,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.

@@ -120,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
@@ -127,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function

         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -308,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):

             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
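A transform function receives the `RequestOptions` built from each sitemap URL and returns `'skip'` to drop it (the loader also decrements `total_count`), `'unchanged'` to keep it as generated, or a modified options dict. A sketch — the URL patterns and label are illustrative:

    from crawlee import RequestOptions, RequestTransformAction


    def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
        url = options['url']
        if '/blog/' in url:
            return 'skip'  # dropped before a Request is ever created
        if url.endswith('.html'):
            options['label'] = 'PAGE'  # route to a dedicated handler
            return options
        return 'unchanged'  # keep the request exactly as generated


    # Passed at construction time:
    # loader = SitemapRequestLoader(sitemap_urls=[...], http_client=..., transform_request_function=transform)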
crawlee/sessions/_session_pool.py
CHANGED

@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.

-        This is
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.

         Args:
crawlee/statistics/_error_snapshotter.py
CHANGED

@@ -32,7 +32,7 @@ class ErrorSnapshotter:
         """Capture error snapshot and save it to key value store.

         It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
         returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
         an exception.

crawlee/statistics/_models.py
CHANGED

@@ -1,6 +1,7 @@
 from __future__ import annotations

 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Annotated, Any
@@ -76,7 +77,6 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
     errors: dict[str, Any] = Field(default_factory=dict)
     retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
     requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
@@ -93,6 +93,37 @@ class StatisticsState(BaseModel):
         ),
     ] = {}

+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
     @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
     @property
     def request_total_duration(self) -> timedelta:
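`crawler_runtime` is thus no longer a stored field: it is `_runtime_offset` (runtime persisted by earlier runs, captured in `model_post_init`) plus the elapsed time of the current run, with the `crawlerRuntimeMillis` alias preserved through a computed field. The same accumulate-across-restarts idea in a minimal pydantic model (field and alias names here are illustrative, not crawlee's):

    from datetime import datetime, timedelta, timezone

    from pydantic import BaseModel, Field, computed_field


    class RunState(BaseModel):
        last_started_at: datetime | None = None
        finished_at: datetime | None = None
        # Runtime accumulated by previous runs, restored when state is loaded.
        runtime_offset: timedelta = Field(default=timedelta(), exclude=True)

        @computed_field(alias='runtimeMillis')  # serialized in place of a stored field
        def runtime(self) -> timedelta:
            if self.last_started_at:
                end = self.finished_at or datetime.now(timezone.utc)
                return self.runtime_offset + (end - self.last_started_at)
            return self.runtime_offset


    state = RunState(
        runtime_offset=timedelta(seconds=90),  # from a persisted previous run
        last_started_at=datetime.now(timezone.utc) - timedelta(seconds=30),
    )
    print(state.model_dump(by_alias=True)['runtimeMillis'])  # ~0:02:00 total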