crawlee 0.6.13b31__py3-none-any.whl → 1.1.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +34 -22
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +86 -33
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +124 -37
- crawlee/crawlers/_playwright/_playwright_crawler.py +17 -5
- crawlee/events/_event_manager.py +3 -1
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +1 -1
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +33 -2
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +29 -25
- crawlee/storage_clients/_file_system/_request_queue_client.py +53 -34
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +16 -4
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +291 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +10 -2
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +17 -6
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +82 -59
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
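The headline additions in this release are the new Redis- and SQL-backed storage clients (`crawlee/storage_clients/_redis/` and `crawlee/storage_clients/_sql/`). Below is a hedged sketch of how a non-default storage client is typically wired in through the service locator; the `SqlStorageClient` class name and its constructor defaults are inferred from the new package layout above and are not confirmed here, so check the 1.x documentation before copying.

# Hypothetical wiring of one of the new storage backends; the class name and
# constructor arguments are assumptions based on the new package layout.
from crawlee import service_locator
from crawlee.storage_clients import SqlStorageClient  # assumed export


def configure_storage() -> None:
    # Registering the client before any storage is opened makes datasets,
    # key-value stores and request queues use the SQL backend instead of the
    # default file-system one.
    storage_client = SqlStorageClient()  # assumed default: a local SQLite database
    service_locator.set_storage_client(storage_client)


configure_storage()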
crawlee/_utils/robots.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from logging import getLogger
 from typing import TYPE_CHECKING
 
 from protego import Protego
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
     from crawlee.proxy_configuration import ProxyInfo
 
 
+logger = getLogger(__name__)
+
+
 class RobotsTxtFile:
     def __init__(
         self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ class RobotsTxtFile:
             http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
-
-
-
-
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)
+
+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))
+
+        except Exception as e:
+            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
 
-
+            robots = Protego.parse('User-agent: *\nAllow: /')
 
         return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
 
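The new `load` logic falls back to a permissive rule set whenever the robots.txt fetch fails or returns a client error. A minimal standalone sketch of the same pattern, using only `urllib` and Protego rather than crawlee's `HttpClient` (so the fetch call here is illustrative, not crawlee's API):

from urllib.error import URLError
from urllib.request import urlopen

from protego import Protego

ALLOW_ALL = 'User-agent: *\nAllow: /'


def load_robots(url: str) -> Protego:
    """Fetch and parse robots.txt, falling back to an allow-all policy on any failure."""
    try:
        with urlopen(url, timeout=10) as response:
            # Client errors (4xx) raise HTTPError here, which the except clause also catches,
            # mirroring the permissive fallback in the diff above.
            body = response.read().decode('utf-8')
        return Protego.parse(body)
    except (URLError, UnicodeDecodeError) as exc:
        print(f'Failed to fetch robots.txt from "{url}": {exc}')
        return Protego.parse(ALLOW_ALL)


robots = load_robots('https://example.com/robots.txt')
print(robots.can_fetch('https://example.com/some/page', '*'))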
crawlee/_utils/sitemap.py
CHANGED
@@ -335,7 +335,7 @@ async def _fetch_and_process_sitemap(
             # Check if the first chunk is a valid gzip header
             if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
                 decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
-
+            first_chunk = False
 
             chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
             text_chunk = decoder.decode(chunk)
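For context, the sitemap fetcher sniffs the gzip magic bytes (`\x1f\x8b`) on the first chunk and, if present, routes all chunks through a `zlib` decompressor configured for gzip framing. A small self-contained sketch of that chunked decompression pattern (the sitemap payload and chunking are made up for illustration, and the per-chunk decode is simplified compared to crawlee's incremental decoder):

import gzip
import zlib
from collections.abc import Iterable, Iterator


def iter_decoded_chunks(raw_chunks: Iterable[bytes]) -> Iterator[str]:
    """Yield decoded text chunks, transparently decompressing a gzip stream if detected."""
    decompressor = None
    first_chunk = True
    for raw_chunk in raw_chunks:
        # zlib.MAX_WBITS | 16 tells zlib to expect a gzip header and trailer.
        if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
            decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
        first_chunk = False

        chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
        yield chunk.decode('utf-8')


payload = gzip.compress(b'<urlset><url><loc>https://example.com/</loc></url></urlset>')
chunks = [payload[:10], payload[10:]]  # Simulate a streamed response body.
print(''.join(iter_decoded_chunks(chunks)))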
crawlee/_utils/system.py
CHANGED
@@ -36,7 +36,7 @@ else:
 class CpuInfo(BaseModel):
     """Information about the CPU usage."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     used_ratio: Annotated[float, Field(alias='usedRatio')]
     """The ratio of CPU currently in use, represented as a float between 0 and 1."""
@@ -51,7 +51,7 @@ class CpuInfo(BaseModel):
 class MemoryUsageInfo(BaseModel):
     """Information about the memory usage."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     current_size: Annotated[
         ByteSize,
@@ -71,7 +71,7 @@ class MemoryUsageInfo(BaseModel):
 class MemoryInfo(MemoryUsageInfo):
     """Information about system memory."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     total_size: Annotated[
         ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize')
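These `model_config` updates switch the models to pydantic's `validate_by_name`/`validate_by_alias` pair, which accepts input under either the field name or its camelCase alias. A minimal sketch of that behaviour (assuming pydantic ≥ 2.11, where these `ConfigDict` keys exist; the model is a cut-down stand-in, not the real one):

from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class CpuInfo(BaseModel):
    """Cut-down stand-in for the real model, just to show the alias handling."""

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    used_ratio: Annotated[float, Field(alias='usedRatio')]


# Both spellings validate, and serialization can still target the alias.
by_name = CpuInfo(used_ratio=0.42)
by_alias = CpuInfo.model_validate({'usedRatio': 0.42})
print(by_name == by_alias)                # True
print(by_name.model_dump(by_alias=True))  # {'usedRatio': 0.42}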
crawlee/_utils/urls.py
CHANGED
@@ -7,6 +7,7 @@ from yarl import URL
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger
 
 
 def is_url_absolute(url: str) -> bool:
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))
 
 
-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
     """Convert an iterator of relative URLs to absolute URLs using a base URL."""
     for url in urls:
         if is_url_absolute(url):
            yield url
         else:
-
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url
 
 
 _http_url_adapter = TypeAdapter(AnyHttpUrl)
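The new skip-and-log branch leans on yarl's RFC 3986 join semantics: joining a base URL with a reference that carries its own scheme (such as `mailto:`) returns the reference unchanged, which still has no host and therefore stays "non-absolute" and gets skipped. A quick standalone illustration (example URLs are made up):

from yarl import URL

base = URL('https://example.com/docs/')

for link in ['page.html', 'https://other.example/x', 'mailto:team@example.com']:
    joined = base.join(URL(link))
    # yarl treats a URL without a host as non-absolute, so mailto: links stay "relative" and are skipped.
    status = 'kept' if joined.is_absolute() else 'skipped'
    print(f'{link!r:40} -> {str(joined)!r:40} {status}')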
crawlee/browsers/_browser_pool.py
CHANGED
@@ -118,7 +118,10 @@ class BrowserPool:
         """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
 
         Args:
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
from asyncio import Lock
|
|
5
6
|
from datetime import datetime, timedelta, timezone
|
|
6
7
|
from typing import TYPE_CHECKING, Any, cast
|
|
7
8
|
|
|
@@ -77,6 +78,19 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
77
78
|
|
|
78
79
|
self._total_opened_pages = 0
|
|
79
80
|
|
|
81
|
+
self._context_creation_lock: Lock | None = None
|
|
82
|
+
|
|
83
|
+
async def _get_context_creation_lock(self) -> Lock:
|
|
84
|
+
"""Get context checking and creation lock.
|
|
85
|
+
|
|
86
|
+
It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
|
|
87
|
+
memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
|
|
88
|
+
"""
|
|
89
|
+
if self._context_creation_lock:
|
|
90
|
+
return self._context_creation_lock
|
|
91
|
+
self._context_creation_lock = Lock()
|
|
92
|
+
return self._context_creation_lock
|
|
93
|
+
|
|
80
94
|
@property
|
|
81
95
|
@override
|
|
82
96
|
def pages(self) -> list[Page]:
|
|
@@ -137,12 +151,6 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
137
151
|
Raises:
|
|
138
152
|
ValueError: If the browser has reached the maximum number of open pages.
|
|
139
153
|
"""
|
|
140
|
-
if not self._browser_context:
|
|
141
|
-
self._browser_context = await self._create_browser_context(
|
|
142
|
-
browser_new_context_options=browser_new_context_options,
|
|
143
|
-
proxy_info=proxy_info,
|
|
144
|
-
)
|
|
145
|
-
|
|
146
154
|
if not self.has_free_capacity:
|
|
147
155
|
raise ValueError('Cannot open more pages in this browser.')
|
|
148
156
|
|
|
@@ -154,11 +162,12 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
154
162
|
)
|
|
155
163
|
page = await new_context.new_page()
|
|
156
164
|
else:
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
165
|
+
async with await self._get_context_creation_lock():
|
|
166
|
+
if not self._browser_context:
|
|
167
|
+
self._browser_context = await self._create_browser_context(
|
|
168
|
+
browser_new_context_options=browser_new_context_options,
|
|
169
|
+
proxy_info=proxy_info,
|
|
170
|
+
)
|
|
162
171
|
page = await self._browser_context.new_page()
|
|
163
172
|
|
|
164
173
|
# Handle page close event
|
|
@@ -169,7 +178,6 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
169
178
|
self._last_page_opened_at = datetime.now(timezone.utc)
|
|
170
179
|
|
|
171
180
|
self._total_opened_pages += 1
|
|
172
|
-
|
|
173
181
|
return page
|
|
174
182
|
|
|
175
183
|
@override
|
|
@@ -206,10 +214,9 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
206
214
|
`self._fingerprint_generator` is available.
|
|
207
215
|
"""
|
|
208
216
|
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
|
|
209
|
-
|
|
210
217
|
if proxy_info:
|
|
211
218
|
if browser_new_context_options.get('proxy'):
|
|
212
|
-
logger.warning("browser_new_context_options['proxy']
|
|
219
|
+
logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")
|
|
213
220
|
|
|
214
221
|
browser_new_context_options['proxy'] = ProxySettings(
|
|
215
222
|
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
|
|
@@ -244,5 +251,4 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
244
251
|
browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
|
|
245
252
|
'extra_http_headers', extra_http_headers
|
|
246
253
|
)
|
|
247
|
-
|
|
248
254
|
return await self._browser.new_context(**browser_new_context_options)
|
|
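The controller now guards browser-context creation with a lazily created `asyncio.Lock`, so two coroutines racing through page creation cannot both build a context and leave one of them orphaned. A minimal sketch of the same lazy-lock, check-then-create pattern outside crawlee (the `ContextHolder` class and its sleep are stand-ins for the expensive context creation):

import asyncio


class ContextHolder:
    """Creates an expensive shared resource at most once, even under concurrency."""

    def __init__(self) -> None:
        self._context: object | None = None
        self._creation_lock: asyncio.Lock | None = None
        self.creations = 0

    def _get_creation_lock(self) -> asyncio.Lock:
        # Created lazily so the lock is only built once it is actually needed.
        if self._creation_lock is None:
            self._creation_lock = asyncio.Lock()
        return self._creation_lock

    async def get_context(self) -> object:
        async with self._get_creation_lock():
            # Re-check inside the lock: only the first waiter actually creates the context.
            if self._context is None:
                await asyncio.sleep(0.01)  # Stand-in for an expensive create_context() call.
                self.creations += 1
                self._context = object()
        return self._context


async def main() -> None:
    holder = ContextHolder()
    await asyncio.gather(*(holder.get_context() for _ in range(10)))
    print(f'contexts created: {holder.creations}')  # -> contexts created: 1


asyncio.run(main())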
crawlee/browsers/_playwright_browser_plugin.py
CHANGED
@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
 
     It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
     for creating new browser instances and provides a unified interface for interacting with different browser types
-    (chromium, firefox, and
-    executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
+    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
+    mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
     browser instance, ensuring that resource limits are respected.
     """
 
@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
         """Initialize a new instance.
 
         Args:
-            browser_type: The type of browser to launch
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                 storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
             'chromium_sandbox': not config.disable_browser_sandbox,
         }
 
+        if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
+            raise ValueError(
+                'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
+            )
+
+        # Map 'chrome' to 'chromium' with the 'chrome' channel.
+        if browser_type == 'chrome':
+            browser_type = 'chromium'
+            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+            default_launch_browser_options['channel'] = 'chrome'
+
         self._browser_type: BrowserType = browser_type
         self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
         self._browser_new_context_options = browser_new_context_options or {}
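Under the hood the new `'chrome'` option is just Playwright's `channel='chrome'` launch argument on the Chromium driver. A quick standalone Playwright sketch of what that mapping means (requires Google Chrome installed locally; this bypasses crawlee entirely):

import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        # channel='chrome' launches the locally installed Google Chrome
        # instead of the Playwright-managed Chromium build.
        browser = await p.chromium.launch(channel='chrome', headless=True)
        page = await browser.new_page()
        await page.goto('https://example.com')
        print(await page.title())
        await browser.close()


asyncio.run(main())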
crawlee/browsers/_types.py
CHANGED
crawlee/configuration.py
CHANGED
@@ -28,6 +28,8 @@ class Configuration(BaseSettings):
     Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
     """
 
+    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
     model_config = SettingsConfigDict(populate_by_name=True)
 
     internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED
@@ -34,7 +34,9 @@ TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=St
 
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
-
+    BasicCrawler[TCrawlingContext, StatisticsState],
+    ABC,
+    Generic[TCrawlingContext, TParseResult, TSelectResult],
 ):
     """A web crawler for performing HTTP requests.
 
@@ -165,7 +167,9 @@ class AbstractHttpCrawler(
         kwargs.setdefault('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
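The link-filtering line splits the URL stream into robots-skipped and robots-allowed halves with a `partition` helper. Its import is not shown in this hunk, but the call matches `more_itertools.partition`, which routes items failing the predicate into the first iterator and passing ones into the second. A small standalone sketch under that assumption, with an invented predicate in place of `robots_txt_file.is_allowed`:

from more_itertools import partition

allowed_prefixes = ('https://example.com/docs/',)

links = [
    'https://example.com/docs/intro',
    'https://example.com/admin/login',
    'https://example.com/docs/api',
]

# The first iterable receives items where the predicate is False, the second where it is True.
skipped, kept = partition(lambda url: url.startswith(allowed_prefixes), links)

print('kept   :', list(kept))
print('skipped:', list(skipped))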
crawlee/crawlers/_abstract_http/_abstract_http_parser.py
CHANGED
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
 
 
 @docs_group('HTTP parsers')
-class AbstractHttpParser(Generic[TParseResult, TSelectResult]
+class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
     """Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking."""
 
     @abstractmethod
crawlee/crawlers/_abstract_http/_http_crawling_context.py
CHANGED
@@ -31,7 +31,7 @@ class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):
 
 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
-class ParsedHttpCrawlingContext(Generic[TParseResult]
+class ParsedHttpCrawlingContext(HttpCrawlingContext, Generic[TParseResult]):
     """The crawling context used by `AbstractHttpCrawler`.
 
     It provides access to key objects as well as utility functions for handling crawling tasks.
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
CHANGED
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override
 
-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (
@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self
 
     async def __aexit__(
@@ -85,8 +84,8 @@ class _NonPersistentStatistics(Statistics):
 
 @docs_group('Crawlers')
 class AdaptivePlaywrightCrawler(
-    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
     BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState],
+    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
 ):
     """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.
 
@@ -149,15 +148,15 @@ class AdaptivePlaywrightCrawler(
             non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)
 
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)
 
         # Sub crawlers related.
@@ -166,11 +165,11 @@ class AdaptivePlaywrightCrawler(
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}
 
         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}
 
         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
@@ -315,7 +314,7 @@ class AdaptivePlaywrightCrawler(
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result)
+            return SubCrawlerRun(result=result, run_context=context_linked_to_result)
         except Exception as e:
             return SubCrawlerRun(exception=e)
 
@@ -371,7 +370,8 @@ class AdaptivePlaywrightCrawler(
             self.track_http_only_request_handler_runs()
 
             static_run = await self._crawl_one(rendering_type='static', context=context)
-            if static_run.result and self.result_checker(static_run.result):
+            if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+                self._update_context_from_copy(context, static_run.run_context)
                 self._context_result_map[context] = static_run.result
                 return
             if static_run.exception:
@@ -402,13 +402,10 @@ class AdaptivePlaywrightCrawler(
         if pw_run.exception is not None:
             raise pw_run.exception
 
-        if pw_run.result:
-            self._context_result_map[context] = pw_run.result
-
+        if pw_run.result and pw_run.run_context:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                 if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                     detection_result = 'static'
                 else:
@@ -417,6 +414,9 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)
 
+            self._update_context_from_copy(context, pw_run.run_context)
+            self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -451,8 +451,32 @@ class AdaptivePlaywrightCrawler(
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1
 
+    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+        """Update mutable fields of `context` from `context_copy`.
+
+        Uses object.__setattr__ to bypass frozen dataclass restrictions,
+        allowing state synchronization after isolated crawler execution.
+        """
+        updating_attributes = {
+            'request': ('headers', 'user_data'),
+            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+        }
+
+        for attr, sub_attrs in updating_attributes.items():
+            original_sub_obj = getattr(context, attr)
+            copy_sub_obj = getattr(context_copy, attr)
+
+            # Check that both sub objects are not None
+            if original_sub_obj is None or copy_sub_obj is None:
+                continue
+
+            for sub_attr in sub_attrs:
+                new_value = getattr(copy_sub_obj, sub_attr)
+                object.__setattr__(original_sub_obj, sub_attr, new_value)
+
 
 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
+    run_context: BasicCrawlingContext | None = None
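The new `_update_context_from_copy` helper writes back into frozen dataclass instances via `object.__setattr__`, which skips the generated `__setattr__` that raises `FrozenInstanceError`. A tiny standalone illustration of that mechanism (the dataclass here is invented for the example):

from dataclasses import FrozenInstanceError, dataclass, field


@dataclass(frozen=True)
class RequestState:
    url: str
    headers: dict[str, str] = field(default_factory=dict)


original = RequestState(url='https://example.com')
updated_copy = RequestState(url='https://example.com', headers={'x-run': 'playwright'})

try:
    original.headers = updated_copy.headers  # type: ignore[misc]
except FrozenInstanceError:
    print('normal assignment is blocked on a frozen dataclass')

# object.__setattr__ bypasses the frozen guard, mirroring the state sync in the diff above.
object.__setattr__(original, 'headers', updated_copy.headers)
print(original.headers)  # {'x-run': 'playwright'}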
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py
CHANGED
@@ -12,7 +12,7 @@ from crawlee.statistics import StatisticsState
 class AdaptivePlaywrightCrawlerStatisticState(StatisticsState):
     """Statistic data about a crawler run with additional information related to adaptive crawling."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')
 
     http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0
     """Number representing how many times static http based crawling was used."""
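Besides the alias flags, this config adds `ser_json_inf_nan='constants'`, a pydantic v2 option that emits `Infinity`/`NaN` tokens in JSON output instead of the default `null`. A minimal sketch of the difference (the models are invented for the example):

import math

from pydantic import BaseModel, ConfigDict


class WithConstants(BaseModel):
    model_config = ConfigDict(ser_json_inf_nan='constants')
    requests_per_minute: float


class WithDefault(BaseModel):
    requests_per_minute: float


print(WithConstants(requests_per_minute=math.inf).model_dump_json())  # {"requests_per_minute":Infinity}
print(WithDefault(requests_per_minute=math.inf).model_dump_json())    # {"requests_per_minute":null}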
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
CHANGED
@@ -31,7 +31,8 @@ class AdaptiveContextError(RuntimeError):
 
 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
 class AdaptivePlaywrightCrawlingContext(
-
+    ParsedHttpCrawlingContext[TStaticParseResult],
+    Generic[TStaticParseResult, TStaticSelectResult],
 ):
     _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult]
     """The crawling context used by `AdaptivePlaywrightCrawler`.
crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py
CHANGED
@@ -32,7 +32,7 @@ FeatureVector = tuple[float, float]
 
 
 class RenderingTypePredictorState(BaseModel):
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     model: Annotated[
         LogisticRegression,