crawlee 0.6.13b43__py3-none-any.whl → 1.1.2b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (80)
  1. crawlee/_request.py +32 -21
  2. crawlee/_service_locator.py +4 -4
  3. crawlee/_types.py +87 -25
  4. crawlee/_utils/file.py +7 -0
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +15 -0
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/time.py +41 -1
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +21 -15
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +3 -1
  17. crawlee/crawlers/__init__.py +2 -1
  18. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  19. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
  21. crawlee/crawlers/_basic/_basic_crawler.py +139 -96
  22. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  23. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  24. crawlee/crawlers/_playwright/_playwright_crawler.py +52 -10
  25. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  26. crawlee/events/_event_manager.py +3 -1
  27. crawlee/fingerprint_suite/_header_generator.py +2 -2
  28. crawlee/http_clients/_base.py +4 -0
  29. crawlee/http_clients/_curl_impersonate.py +12 -0
  30. crawlee/http_clients/_httpx.py +16 -6
  31. crawlee/http_clients/_impit.py +25 -10
  32. crawlee/otel/crawler_instrumentor.py +3 -3
  33. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  34. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  35. crawlee/request_loaders/_sitemap_request_loader.py +22 -4
  36. crawlee/sessions/_session_pool.py +1 -1
  37. crawlee/statistics/_error_snapshotter.py +1 -1
  38. crawlee/statistics/_models.py +32 -1
  39. crawlee/statistics/_statistics.py +24 -33
  40. crawlee/storage_clients/__init__.py +16 -0
  41. crawlee/storage_clients/_base/_storage_client.py +5 -4
  42. crawlee/storage_clients/_file_system/_dataset_client.py +6 -7
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -8
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +31 -15
  45. crawlee/storage_clients/_file_system/_storage_client.py +2 -2
  46. crawlee/storage_clients/_memory/_dataset_client.py +4 -5
  47. crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
  48. crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
  49. crawlee/storage_clients/_redis/__init__.py +6 -0
  50. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  51. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  52. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  53. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  54. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  55. crawlee/storage_clients/_redis/_utils.py +23 -0
  56. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  57. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  58. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  59. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  60. crawlee/storage_clients/_redis/py.typed +0 -0
  61. crawlee/storage_clients/_sql/__init__.py +6 -0
  62. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  63. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  64. crawlee/storage_clients/_sql/_db_models.py +268 -0
  65. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  66. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  67. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  68. crawlee/storage_clients/_sql/py.typed +0 -0
  69. crawlee/storage_clients/models.py +10 -10
  70. crawlee/storages/_base.py +3 -1
  71. crawlee/storages/_dataset.py +5 -3
  72. crawlee/storages/_key_value_store.py +11 -6
  73. crawlee/storages/_request_queue.py +5 -3
  74. crawlee/storages/_storage_instance_manager.py +54 -68
  75. crawlee/storages/_utils.py +11 -0
  76. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/METADATA +17 -5
  77. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/RECORD +80 -58
  78. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/WHEEL +1 -1
  79. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/entry_points.txt +0 -0
  80. {crawlee-0.6.13b43.dist-info → crawlee-1.1.2b4.dist-info}/licenses/LICENSE +0 -0
crawlee/browsers/_playwright_browser_controller.py CHANGED
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from asyncio import Lock
 from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Any, cast
 
@@ -77,6 +78,19 @@ class PlaywrightBrowserController(BrowserController):
 
         self._total_opened_pages = 0
 
+        self._context_creation_lock: Lock | None = None
+
+    async def _get_context_creation_lock(self) -> Lock:
+        """Get context checking and creation lock.
+
+        It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
+        memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
+        """
+        if self._context_creation_lock:
+            return self._context_creation_lock
+        self._context_creation_lock = Lock()
+        return self._context_creation_lock
+
     @property
     @override
     def pages(self) -> list[Page]:
@@ -137,12 +151,6 @@ class PlaywrightBrowserController(BrowserController):
         Raises:
             ValueError: If the browser has reached the maximum number of open pages.
         """
-        if not self._browser_context:
-            self._browser_context = await self._create_browser_context(
-                browser_new_context_options=browser_new_context_options,
-                proxy_info=proxy_info,
-            )
-
         if not self.has_free_capacity:
             raise ValueError('Cannot open more pages in this browser.')
 
@@ -154,11 +162,12 @@ class PlaywrightBrowserController(BrowserController):
             )
             page = await new_context.new_page()
         else:
-            if not self._browser_context:
-                self._browser_context = await self._create_browser_context(
-                    browser_new_context_options=browser_new_context_options,
-                    proxy_info=proxy_info,
-                )
+            async with await self._get_context_creation_lock():
+                if not self._browser_context:
+                    self._browser_context = await self._create_browser_context(
+                        browser_new_context_options=browser_new_context_options,
+                        proxy_info=proxy_info,
+                    )
             page = await self._browser_context.new_page()
 
         # Handle page close event
@@ -169,7 +178,6 @@ class PlaywrightBrowserController(BrowserController):
         self._last_page_opened_at = datetime.now(timezone.utc)
 
         self._total_opened_pages += 1
-
         return page
 
     @override
@@ -206,10 +214,9 @@ class PlaywrightBrowserController(BrowserController):
            `self._fingerprint_generator` is available.
         """
         browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
-
         if proxy_info:
             if browser_new_context_options.get('proxy'):
-                logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.")
+                logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")
 
             browser_new_context_options['proxy'] = ProxySettings(
                 server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
@@ -244,5 +251,4 @@ class PlaywrightBrowserController(BrowserController):
         browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
             'extra_http_headers', extra_http_headers
         )
-
        return await self._browser.new_context(**browser_new_context_options)
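
The new `_get_context_creation_lock` helper serializes the check-and-create step for the shared browser context, so two concurrent `new_page` calls cannot both create a context and leave one orphaned. Below is a minimal, self-contained sketch of the same lazy double-checked creation pattern under an `asyncio.Lock`; the class and names are illustrative, not crawlee APIs.

```python
import asyncio


class LazyResource:
    """Create a shared resource at most once, even under concurrent access."""

    def __init__(self) -> None:
        self._lock: asyncio.Lock | None = None
        self._resource: object | None = None

    def _get_lock(self) -> asyncio.Lock:
        # The lock itself is created lazily, mirroring the lazy initialization in the diff above.
        if self._lock is None:
            self._lock = asyncio.Lock()
        return self._lock

    async def get(self) -> object:
        async with self._get_lock():
            # Double-checked creation: only the first caller builds the resource,
            # later callers reuse it instead of creating an orphaned duplicate.
            if self._resource is None:
                self._resource = await self._create()
        return self._resource

    async def _create(self) -> object:
        await asyncio.sleep(0)  # stand-in for an expensive async setup call
        return object()


async def main() -> None:
    shared = LazyResource()
    a, b = await asyncio.gather(shared.get(), shared.get())
    assert a is b  # both concurrent callers got the same instance


asyncio.run(main())
```
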
crawlee/browsers/_playwright_browser_plugin.py CHANGED
@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
 
     It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
     for creating new browser instances and provides a unified interface for interacting with different browser types
-    (chromium, firefox, and webkit). This class integrates configuration options for browser launches (headless mode,
-    executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
+    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
+    mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
     browser instance, ensuring that resource limits are respected.
     """
 
@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
         """Initialize a new instance.
 
         Args:
-            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                 storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
             'chromium_sandbox': not config.disable_browser_sandbox,
         }
 
+        if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
+            raise ValueError(
+                'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
+            )
+
+        # Map 'chrome' to 'chromium' with the 'chrome' channel.
+        if browser_type == 'chrome':
+            browser_type = 'chromium'
+            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+            default_launch_browser_options['channel'] = 'chrome'
+
         self._browser_type: BrowserType = browser_type
         self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
         self._browser_new_context_options = browser_new_context_options or {}
crawlee/browsers/_types.py CHANGED
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Literal
 if TYPE_CHECKING:
     from playwright.async_api import Page
 
-BrowserType = Literal['chromium', 'firefox', 'webkit']
+BrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome']
 
 
 @dataclass
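
With `BrowserType` extended to include `'chrome'`, a Playwright-based crawler can drive a locally installed Google Chrome instead of the Playwright-managed Chromium; the plugin rewrites this internally to Chromium with `channel='chrome'` (see the plugin hunk above). A hedged usage sketch, assuming the `playwright` extra and Google Chrome are installed and no custom `executable_path` is configured; the handler body is illustrative:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # 'chrome' launches the locally installed Google Chrome via the Chromium 'chrome' channel.
    crawler = PlaywrightCrawler(browser_type='chrome', headless=True)

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Title: {await context.page.title()}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```
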
crawlee/configuration.py CHANGED
@@ -28,7 +28,9 @@ class Configuration(BaseSettings):
     Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
     """
 
-    model_config = SettingsConfigDict(validate_by_name=True, validate_by_alias=True)
+    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
+    model_config = SettingsConfigDict(populate_by_name=True)
 
     internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
     """Timeout for the internal asynchronous operations."""
crawlee/crawlers/__init__.py CHANGED
@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import
 
-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult
 
@@ -51,6 +51,7 @@ __all__ = [
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',
crawlee/crawlers/_abstract_http/__init__.py CHANGED
@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext
 
 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py CHANGED
@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar
 
 from crawlee._request import Request, RequestOptions
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
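
`HttpCrawlerOptions` is a typed-dict style options class layered on `BasicCrawlerOptions`, so concrete crawlers can forward their constructor keyword arguments (including the new `navigation_timeout`) with type checking. A hedged sketch of such forwarding; the helper function is illustrative and assumes the `parsel` extra is installed:

```python
from datetime import timedelta

from typing_extensions import Unpack

from crawlee.crawlers import HttpCrawlerOptions, ParselCrawler


def build_crawler(**kwargs: Unpack[HttpCrawlerOptions]) -> ParselCrawler:
    # All forwarded keyword arguments are typed by HttpCrawlerOptions, so type
    # checkers can verify options such as `navigation_timeout` end to end.
    return ParselCrawler(**kwargs)


crawler = build_crawler(
    max_requests_per_crawl=10,
    navigation_timeout=timedelta(seconds=30),
)
```
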
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -112,9 +130,17 @@
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        for hook in self._pre_navigation_hooks:
-            await hook(context)
-        yield context
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     async def _parse_http_response(
         self, context: HttpCrawlingContext
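
The pre-navigation hooks and the HTTP request now draw on a single time budget per context: a `SharedTimeout` is created for the context, each hook runs inside it, and the remaining budget is later handed to the HTTP client as the request timeout (see the next hunk). The helper below is only a rough sketch of such a shared budget; it is a hypothetical stand-in, not the actual `crawlee._utils.time.SharedTimeout` implementation:

```python
from __future__ import annotations

import asyncio
import time
from datetime import timedelta
from types import TracebackType


class SharedTimeoutSketch:
    """A time budget shared by several `async with` blocks (hypothetical stand-in)."""

    def __init__(self, budget: timedelta) -> None:
        self._remaining = budget.total_seconds()
        self._entered_at: float | None = None

    @property
    def remaining(self) -> timedelta:
        return timedelta(seconds=max(self._remaining, 0))

    async def __aenter__(self) -> timedelta:
        if self._remaining <= 0:
            raise TimeoutError('Shared time budget exhausted')
        self._entered_at = time.monotonic()
        return self.remaining  # callers may pass this on, e.g. as an HTTP timeout

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        assert self._entered_at is not None
        # Deduct the time spent inside the block from the shared budget.
        self._remaining -= time.monotonic() - self._entered_at
        self._entered_at = None


async def main() -> None:
    budget = SharedTimeoutSketch(timedelta(seconds=60))
    async with budget:          # a pre-navigation hook runs here
        await asyncio.sleep(0.1)
    async with budget as left:  # the actual request gets only what is left
        print(f'remaining for the request: {left.total_seconds():.1f}s')


asyncio.run(main())
```
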
@@ -167,7 +193,15 @@
             kwargs.setdefault('strategy', 'same-hostname')
 
             links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-            links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+
+            # Get base URL from <base> tag if present
+            extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+            base_url: str = (
+                str(extracted_base_urls[0])
+                if extracted_base_urls
+                else context.request.loaded_url or context.request.url
+            )
+            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
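
Link extraction now honours a `<base href>` element when resolving relative links, falling back to the loaded URL as before. A short illustration of why this matters, using only `urllib.parse` (not crawlee internals), with example URLs:

```python
from urllib.parse import urljoin

page_url = 'https://example.com/articles/2024/index.html'
base_href = 'https://example.com/'  # from <base href="https://example.com/">
relative_link = 'news/item-1.html'

# Without the <base> tag the link resolves against the page URL...
print(urljoin(page_url, relative_link))   # https://example.com/articles/2024/news/item-1.html
# ...but the document's <base href> says it should resolve against the site root.
print(urljoin(base_href, relative_link))  # https://example.com/news/item-1.html
```
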
@@ -214,12 +248,14 @@
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        result = await self._http_client.crawl(
-            request=context.request,
-            session=context.session,
-            proxy_info=context.proxy_info,
-            statistics=self._statistics,
-        )
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )
 
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
 
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py CHANGED
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override
 
-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (
@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self
 
     async def __aexit__(
@@ -149,15 +148,15 @@ class AdaptivePlaywrightCrawler(
                 non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)
 
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)
 
         # Sub crawlers related.
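
The adaptive crawler constructor now defaults `concurrency_settings` to `ConcurrencySettings(desired_concurrency=1)` when none is supplied, a conservative default for browser-backed crawling; callers can still override it explicitly. A hedged sketch, assuming the `beautifulsoup` and `playwright` extras are installed and with an illustrative handler:

```python
import asyncio

from crawlee import ConcurrencySettings
from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext


async def main() -> None:
    # Override the new default of desired_concurrency=1 when more parallelism is wanted.
    crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
        concurrency_settings=ConcurrencySettings(desired_concurrency=4),
        max_requests_per_crawl=20,
    )

    @crawler.router.default_handler
    async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```
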
@@ -166,11 +165,11 @@ class AdaptivePlaywrightCrawler(
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler['_logger'] = static_logger
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}
 
         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler['_logger'] = pw_logger
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}
 
         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
@@ -315,7 +314,7 @@ class AdaptivePlaywrightCrawler(
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result)
+            return SubCrawlerRun(result=result, run_context=context_linked_to_result)
         except Exception as e:
             return SubCrawlerRun(exception=e)
 
@@ -371,7 +370,8 @@ class AdaptivePlaywrightCrawler(
             self.track_http_only_request_handler_runs()
 
             static_run = await self._crawl_one(rendering_type='static', context=context)
-            if static_run.result and self.result_checker(static_run.result):
+            if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+                self._update_context_from_copy(context, static_run.run_context)
                 self._context_result_map[context] = static_run.result
                 return
             if static_run.exception:
@@ -402,13 +402,10 @@
         if pw_run.exception is not None:
             raise pw_run.exception
 
-        if pw_run.result:
-            self._context_result_map[context] = pw_run.result
-
+        if pw_run.result and pw_run.run_context:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                 if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                     detection_result = 'static'
                 else:
@@ -417,6 +414,9 @@
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)
 
+            self._update_context_from_copy(context, pw_run.run_context)
+            self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -451,8 +451,32 @@ class AdaptivePlaywrightCrawler(
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1
 
+    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+        """Update mutable fields of `context` from `context_copy`.
+
+        Uses object.__setattr__ to bypass frozen dataclass restrictions,
+        allowing state synchronization after isolated crawler execution.
+        """
+        updating_attributes = {
+            'request': ('headers', 'user_data'),
+            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+        }
+
+        for attr, sub_attrs in updating_attributes.items():
+            original_sub_obj = getattr(context, attr)
+            copy_sub_obj = getattr(context_copy, attr)
+
+            # Check that both sub objects are not None
+            if original_sub_obj is None or copy_sub_obj is None:
+                continue
+
+            for sub_attr in sub_attrs:
+                new_value = getattr(copy_sub_obj, sub_attr)
+                object.__setattr__(original_sub_obj, sub_attr, new_value)
+
 
 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
+    run_context: BasicCrawlingContext | None = None
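
`_update_context_from_copy` writes mutable state back onto the original context even though the context is a frozen dataclass, by going through `object.__setattr__`. The snippet below demonstrates just that mechanism on a toy frozen dataclass; the classes are illustrative, not crawlee's:

```python
from dataclasses import FrozenInstanceError, dataclass, field


@dataclass(frozen=True)
class Context:
    url: str
    user_data: dict[str, str] = field(default_factory=dict)


original = Context(url='https://example.com')
updated = Context(url='https://example.com', user_data={'label': 'DETAIL'})

try:
    original.user_data = updated.user_data  # normal assignment is blocked
except FrozenInstanceError:
    pass

# object.__setattr__ bypasses the frozen dataclass __setattr__, which is how the
# adaptive crawler syncs sub-crawler state back onto the shared context.
object.__setattr__(original, 'user_data', updated.user_data)
assert original.user_data == {'label': 'DETAIL'}
```
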