crawlee 0.6.13b31__py3-none-any.whl → 1.1.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlee might be problematic.

Files changed (82)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +34 -22
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +86 -33
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +15 -0
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/system.py +3 -3
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +21 -15
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +2 -0
  17. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
  18. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  19. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  21. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  22. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
  23. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  24. crawlee/crawlers/_basic/_basic_crawler.py +124 -37
  25. crawlee/crawlers/_playwright/_playwright_crawler.py +17 -5
  26. crawlee/events/_event_manager.py +3 -1
  27. crawlee/events/_types.py +6 -6
  28. crawlee/fingerprint_suite/_header_generator.py +2 -2
  29. crawlee/fingerprint_suite/_types.py +2 -2
  30. crawlee/otel/crawler_instrumentor.py +3 -3
  31. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  32. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  33. crawlee/request_loaders/_request_list.py +1 -1
  34. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  35. crawlee/sessions/_models.py +2 -2
  36. crawlee/sessions/_session_pool.py +1 -1
  37. crawlee/statistics/_error_snapshotter.py +1 -1
  38. crawlee/statistics/_models.py +33 -2
  39. crawlee/statistics/_statistics.py +24 -33
  40. crawlee/storage_clients/__init__.py +16 -0
  41. crawlee/storage_clients/_base/_storage_client.py +13 -0
  42. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +29 -25
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +53 -34
  45. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  46. crawlee/storage_clients/_file_system/_utils.py +0 -0
  47. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  48. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  49. crawlee/storage_clients/_memory/_request_queue_client.py +16 -4
  50. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  51. crawlee/storage_clients/_redis/__init__.py +6 -0
  52. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  53. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  54. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  55. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  56. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  57. crawlee/storage_clients/_redis/_utils.py +23 -0
  58. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  59. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  60. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  61. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  62. crawlee/storage_clients/_redis/py.typed +0 -0
  63. crawlee/storage_clients/_sql/__init__.py +6 -0
  64. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  65. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  66. crawlee/storage_clients/_sql/_db_models.py +268 -0
  67. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  68. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  69. crawlee/storage_clients/_sql/_storage_client.py +291 -0
  70. crawlee/storage_clients/_sql/py.typed +0 -0
  71. crawlee/storage_clients/models.py +10 -10
  72. crawlee/storages/_base.py +5 -1
  73. crawlee/storages/_dataset.py +12 -2
  74. crawlee/storages/_key_value_store.py +17 -4
  75. crawlee/storages/_request_queue.py +10 -2
  76. crawlee/storages/_storage_instance_manager.py +133 -71
  77. crawlee/storages/_utils.py +11 -0
  78. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +17 -6
  79. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +82 -59
  80. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
  81. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
  82. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
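Note: the most substantial additions in the list above are the new Redis-backed (crawlee/storage_clients/_redis/) and SQL-backed (crawlee/storage_clients/_sql/) storage clients. The sketch below shows the existing pattern for plugging a storage client in through the service locator, using the already-available MemoryStorageClient; the new Redis and SQL clients are presumably registered the same way (their exported class names are not confirmed by this file listing alone).

# Minimal sketch of how a storage client is wired in, shown with the existing
# MemoryStorageClient; the Redis/SQL clients added in this release are expected
# to plug in through the same mechanism.
from crawlee import service_locator
from crawlee.storage_clients import MemoryStorageClient

# Register the client globally so storages (datasets, key-value stores,
# request queues) are created through it.
service_locator.set_storage_client(MemoryStorageClient())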
crawlee/_utils/robots.py CHANGED
@@ -1,5 +1,6 @@
  from __future__ import annotations

+ from logging import getLogger
  from typing import TYPE_CHECKING

  from protego import Protego
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
      from crawlee.proxy_configuration import ProxyInfo


+ logger = getLogger(__name__)
+
+
  class RobotsTxtFile:
      def __init__(
          self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ class RobotsTxtFile:
              http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
              proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
          """
-         response = await http_client.send_request(url, proxy_info=proxy_info)
-         body = (
-             b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
-         )
+         try:
+             response = await http_client.send_request(url, proxy_info=proxy_info)
+
+             body = (
+                 b'User-agent: *\nAllow: /'
+                 if is_status_code_client_error(response.status_code)
+                 else await response.read()
+             )
+             robots = Protego.parse(body.decode('utf-8'))
+
+         except Exception as e:
+             logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')

-         robots = Protego.parse(body.decode('utf-8'))
+             robots = Protego.parse('User-agent: *\nAllow: /')

          return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
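Note: the hunk above changes robots.txt loading so that any failure to fetch or parse the file falls back to an allow-all policy instead of propagating the error. A standalone sketch of that fallback behaviour using Protego directly (the helper name is illustrative, not crawlee API):

# Illustrative fallback: when robots.txt cannot be fetched, behave as if
# everything is allowed (the same policy string used in the hunk above).
from protego import Protego


def parse_robots_or_allow_all(body: bytes | None) -> Protego:
    if body is None:  # stands in for "the fetch raised an exception"
        return Protego.parse('User-agent: *\nAllow: /')
    return Protego.parse(body.decode('utf-8'))


rules = parse_robots_or_allow_all(None)
assert rules.can_fetch('https://example.com/any/page', '*')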
crawlee/_utils/sitemap.py CHANGED
@@ -335,7 +335,7 @@ async def _fetch_and_process_sitemap(
      # Check if the first chunk is a valid gzip header
      if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
          decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
-         first_chunk = False
+     first_chunk = False

      chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
      text_chunk = decoder.decode(chunk)
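For context on the hunk above: zlib.decompressobj(zlib.MAX_WBITS | 16) creates a streaming decompressor that understands the gzip container, and the fix clears first_chunk even for sitemaps that are not gzipped. A self-contained sketch of the detect-and-stream pattern (plain Python, not crawlee code):

import gzip
import zlib

payload = gzip.compress(b'<urlset></urlset>')

decompressor = None
first_chunk = True
out = b''
# Feed the payload in two chunks to mimic streaming from an HTTP response.
for raw_chunk in (payload[:10], payload[10:]):
    # The gzip magic bytes are only checked on the very first chunk.
    if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
        decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
    first_chunk = False
    out += decompressor.decompress(raw_chunk) if decompressor else raw_chunk

if decompressor:
    out += decompressor.flush()
assert out == b'<urlset></urlset>'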
crawlee/_utils/system.py CHANGED
@@ -36,7 +36,7 @@ else:
  class CpuInfo(BaseModel):
      """Information about the CPU usage."""

-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

      used_ratio: Annotated[float, Field(alias='usedRatio')]
      """The ratio of CPU currently in use, represented as a float between 0 and 1."""
@@ -51,7 +51,7 @@ class CpuInfo(BaseModel):
  class MemoryUsageInfo(BaseModel):
      """Information about the memory usage."""

-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

      current_size: Annotated[
          ByteSize,
@@ -71,7 +71,7 @@ class MemoryUsageInfo(BaseModel):
  class MemoryInfo(MemoryUsageInfo):
      """Information about system memory."""

-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

      total_size: Annotated[
          ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize')
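The populate_by_name to validate_by_name/validate_by_alias switch above targets the more explicit configuration keys introduced in Pydantic 2.11. A small illustration with a stand-in model (not the crawlee class):

# Stand-in model showing what the new config allows: input is accepted both
# under the Python field name and under its alias.
# Assumes pydantic >= 2.11, where validate_by_name/validate_by_alias exist.
from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class CpuInfoLike(BaseModel):
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    used_ratio: Annotated[float, Field(alias='usedRatio')]


assert CpuInfoLike(used_ratio=0.25).used_ratio == 0.25  # by field name
assert CpuInfoLike(usedRatio=0.25).used_ratio == 0.25   # by alias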
crawlee/_utils/urls.py CHANGED
@@ -7,6 +7,7 @@ from yarl import URL

  if TYPE_CHECKING:
      from collections.abc import Iterator
+     from logging import Logger


  def is_url_absolute(url: str) -> bool:
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
      return str(URL(base_url).join(URL(relative_url)))


- def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+ def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
      """Convert an iterator of relative URLs to absolute URLs using a base URL."""
      for url in urls:
          if is_url_absolute(url):
              yield url
          else:
-             yield convert_to_absolute_url(base_url, url)
+             converted_url = convert_to_absolute_url(base_url, url)
+             # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+             if not is_url_absolute(converted_url):
+                 if logger:
+                     logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                 continue
+             yield converted_url


  _http_url_adapter = TypeAdapter(AnyHttpUrl)
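The new skip-on-failed-conversion branch above exists because joining a base URL with references such as 'mailto:...' does not produce an absolute http(s) URL. A standalone sketch of that behaviour using yarl directly (the helper name is illustrative, not the crawlee function):

from yarl import URL


def absolutize_links(base_url: str, urls: list[str]) -> list[str]:
    absolute: list[str] = []
    for url in urls:
        joined = URL(base_url).join(URL(url))
        # References that keep a non-HTTP scheme (e.g. 'mailto:') are skipped,
        # mirroring the behaviour added in the hunk above.
        if joined.scheme in ('http', 'https'):
            absolute.append(str(joined))
    return absolute


assert absolutize_links('https://example.com/docs/', ['page', 'mailto:team@example.com']) == [
    'https://example.com/docs/page'
]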
crawlee/browsers/_browser_pool.py CHANGED
@@ -118,7 +118,10 @@ class BrowserPool:
          """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.

          Args:
-             browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+             browser_type: The type of browser to launch:
+                 - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                 - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                   the system.
              user_data_dir: Path to a user data directory, which stores browser session data like cookies
                  and local storage.
              browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
crawlee/browsers/_playwright_browser_controller.py CHANGED
@@ -2,6 +2,7 @@

  from __future__ import annotations

+ from asyncio import Lock
  from datetime import datetime, timedelta, timezone
  from typing import TYPE_CHECKING, Any, cast

@@ -77,6 +78,19 @@ class PlaywrightBrowserController(BrowserController):

          self._total_opened_pages = 0

+         self._context_creation_lock: Lock | None = None
+
+     async def _get_context_creation_lock(self) -> Lock:
+         """Get context checking and creation lock.
+
+         It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
+         memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
+         """
+         if self._context_creation_lock:
+             return self._context_creation_lock
+         self._context_creation_lock = Lock()
+         return self._context_creation_lock
+
      @property
      @override
      def pages(self) -> list[Page]:
@@ -137,12 +151,6 @@ class PlaywrightBrowserController(BrowserController):
          Raises:
              ValueError: If the browser has reached the maximum number of open pages.
          """
-         if not self._browser_context:
-             self._browser_context = await self._create_browser_context(
-                 browser_new_context_options=browser_new_context_options,
-                 proxy_info=proxy_info,
-             )
-
          if not self.has_free_capacity:
              raise ValueError('Cannot open more pages in this browser.')

@@ -154,11 +162,12 @@ class PlaywrightBrowserController(BrowserController):
              )
              page = await new_context.new_page()
          else:
-             if not self._browser_context:
-                 self._browser_context = await self._create_browser_context(
-                     browser_new_context_options=browser_new_context_options,
-                     proxy_info=proxy_info,
-                 )
+             async with await self._get_context_creation_lock():
+                 if not self._browser_context:
+                     self._browser_context = await self._create_browser_context(
+                         browser_new_context_options=browser_new_context_options,
+                         proxy_info=proxy_info,
+                     )
              page = await self._browser_context.new_page()

          # Handle page close event
@@ -169,7 +178,6 @@ class PlaywrightBrowserController(BrowserController):
          self._last_page_opened_at = datetime.now(timezone.utc)

          self._total_opened_pages += 1
-
          return page

      @override
@@ -206,10 +214,9 @@ class PlaywrightBrowserController(BrowserController):
          `self._fingerprint_generator` is available.
          """
          browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
-
          if proxy_info:
              if browser_new_context_options.get('proxy'):
-                 logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.")
+                 logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

              browser_new_context_options['proxy'] = ProxySettings(
                  server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
@@ -244,5 +251,4 @@ class PlaywrightBrowserController(BrowserController):
          browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
              'extra_http_headers', extra_http_headers
          )
-
          return await self._browser.new_context(**browser_new_context_options)
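The lazily created asyncio.Lock above is a double-checked initialization guard: without it, two coroutines opening pages concurrently could each create a browser context, leaving one orphaned and never closed. A generic sketch of the pattern (plain asyncio, not crawlee API):

import asyncio


class SharedResourceHolder:
    """Creates an expensive shared resource at most once, even under concurrency."""

    def __init__(self) -> None:
        self._lock: asyncio.Lock | None = None
        self._resource: object | None = None

    def _get_lock(self) -> asyncio.Lock:
        # Created lazily so the lock is bound to the event loop that is actually running.
        if self._lock is None:
            self._lock = asyncio.Lock()
        return self._lock

    async def get(self) -> object:
        async with self._get_lock():
            if self._resource is None:
                # Stands in for the expensive browser-context creation guarded above.
                self._resource = object()
        return self._resource


async def main() -> None:
    holder = SharedResourceHolder()
    first, second = await asyncio.gather(holder.get(), holder.get())
    assert first is second


asyncio.run(main())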
crawlee/browsers/_playwright_browser_plugin.py CHANGED
@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):

      It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
      for creating new browser instances and provides a unified interface for interacting with different browser types
-     (chromium, firefox, and webkit). This class integrates configuration options for browser launches (headless mode,
-     executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
+     (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
+     mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
      browser instance, ensuring that resource limits are respected.
      """

@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
          """Initialize a new instance.

          Args:
-             browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+             browser_type: The type of browser to launch:
+                 - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                 - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                   the system.
              user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                  storage.
              browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
              'chromium_sandbox': not config.disable_browser_sandbox,
          }

+         if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
+             raise ValueError(
+                 'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
+             )
+
+         # Map 'chrome' to 'chromium' with the 'chrome' channel.
+         if browser_type == 'chrome':
+             browser_type = 'chromium'
+             # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+             default_launch_browser_options['channel'] = 'chrome'
+
          self._browser_type: BrowserType = browser_type
          self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
          self._browser_new_context_options = browser_new_context_options or {}
crawlee/browsers/_types.py CHANGED
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Literal
  if TYPE_CHECKING:
      from playwright.async_api import Page

- BrowserType = Literal['chromium', 'firefox', 'webkit']
+ BrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome']


  @dataclass
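Taken together, the browser-related hunks add 'chrome' as a BrowserType that is mapped to Chromium launched with channel='chrome', i.e. the locally installed Google Chrome. A hedged usage sketch, assuming PlaywrightCrawler forwards browser_type to the plugin as in previous releases (Google Chrome must be installed):

# Usage sketch for the new 'chrome' browser type; requires a local Google Chrome
# installation and assumes the usual PlaywrightCrawler keyword arguments.
from crawlee.crawlers import PlaywrightCrawler

crawler = PlaywrightCrawler(
    browser_type='chrome',  # new in this release; 'chromium', 'firefox', 'webkit' still work
    headless=True,
)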
crawlee/configuration.py CHANGED
@@ -28,6 +28,8 @@ class Configuration(BaseSettings):
      Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
      """

+     # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+     # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
      model_config = SettingsConfigDict(populate_by_name=True)

      internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
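For the field shown above, populate_by_name is what allows constructing the settings object with the plain field name in code, while the alias remains available for the CRAWLEE_-prefixed environment variable. A small illustration:

# Both spellings refer to the same setting: the Python field name in code,
# or the CRAWLEE_INTERNAL_TIMEOUT environment variable via its alias.
from datetime import timedelta

from crawlee.configuration import Configuration

config = Configuration(internal_timeout=timedelta(minutes=5))
assert config.internal_timeout == timedelta(minutes=5)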
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py CHANGED
@@ -34,7 +34,9 @@ TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=St

  @docs_group('Crawlers')
  class AbstractHttpCrawler(
-     Generic[TCrawlingContext, TParseResult, TSelectResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC
+     BasicCrawler[TCrawlingContext, StatisticsState],
+     ABC,
+     Generic[TCrawlingContext, TParseResult, TSelectResult],
  ):
      """A web crawler for performing HTTP requests.

@@ -165,7 +167,9 @@ class AbstractHttpCrawler(
          kwargs.setdefault('strategy', 'same-hostname')

          links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-         links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+         links_iterator = to_absolute_url_iterator(
+             context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+         )

          if robots_txt_file:
              skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
crawlee/crawlers/_abstract_http/_abstract_http_parser.py CHANGED
@@ -16,7 +16,7 @@ if TYPE_CHECKING:


  @docs_group('HTTP parsers')
- class AbstractHttpParser(Generic[TParseResult, TSelectResult], ABC):
+ class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
      """Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking."""

      @abstractmethod
crawlee/crawlers/_abstract_http/_http_crawling_context.py CHANGED
@@ -31,7 +31,7 @@ class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):

  @dataclass(frozen=True)
  @docs_group('Crawling contexts')
- class ParsedHttpCrawlingContext(Generic[TParseResult], HttpCrawlingContext):
+ class ParsedHttpCrawlingContext(HttpCrawlingContext, Generic[TParseResult]):
      """The crawling context used by `AbstractHttpCrawler`.

      It provides access to key objects as well as utility functions for handling crawling tasks.
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py CHANGED
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
  from parsel import Selector
  from typing_extensions import Self, TypeVar, override

- from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+ from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
  from crawlee._utils.docs import docs_group
  from crawlee._utils.wait import wait_for
  from crawlee.crawlers import (
@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
      async def __aenter__(self) -> Self:
          self._active = True
          await self._state.initialize()
-         self._after_initialize()
          return self

      async def __aexit__(
@@ -85,8 +84,8 @@ class _NonPersistentStatistics(Statistics):

  @docs_group('Crawlers')
  class AdaptivePlaywrightCrawler(
-     Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
      BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState],
+     Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
  ):
      """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.

@@ -149,15 +148,15 @@ class AdaptivePlaywrightCrawler(
              non-default configuration.
              kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
          """
-         # Some sub crawler kwargs are internally modified. Prepare copies.
-         basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-         basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
          # Adaptive crawling related.
          self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
          self.result_checker = result_checker or (lambda _: True)
          self.result_comparator = result_comparator or create_default_comparator(result_checker)

+         # Set default concurrency settings for browser crawlers if not provided
+         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
          super().__init__(statistics=statistics, **kwargs)

          # Sub crawlers related.
@@ -166,11 +165,11 @@ class AdaptivePlaywrightCrawler(
          # Each sub crawler will use custom logger .
          static_logger = getLogger('Subcrawler_static')
          static_logger.setLevel(logging.ERROR)
-         basic_crawler_kwargs_for_static_crawler['_logger'] = static_logger
+         basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

          pw_logger = getLogger('Subcrawler_playwright')
          pw_logger.setLevel(logging.ERROR)
-         basic_crawler_kwargs_for_pw_crawler['_logger'] = pw_logger
+         basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

          # Initialize sub crawlers to create their pipelines.
          static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
@@ -315,7 +314,7 @@ class AdaptivePlaywrightCrawler(
                  ),
                  logger=self._logger,
              )
-             return SubCrawlerRun(result=result)
+             return SubCrawlerRun(result=result, run_context=context_linked_to_result)
          except Exception as e:
              return SubCrawlerRun(exception=e)

@@ -371,7 +370,8 @@ class AdaptivePlaywrightCrawler(
              self.track_http_only_request_handler_runs()

              static_run = await self._crawl_one(rendering_type='static', context=context)
-             if static_run.result and self.result_checker(static_run.result):
+             if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+                 self._update_context_from_copy(context, static_run.run_context)
                  self._context_result_map[context] = static_run.result
                  return
              if static_run.exception:
@@ -402,13 +402,10 @@ class AdaptivePlaywrightCrawler(
          if pw_run.exception is not None:
              raise pw_run.exception

-         if pw_run.result:
-             self._context_result_map[context] = pw_run.result
-
+         if pw_run.result and pw_run.run_context:
              if should_detect_rendering_type:
                  detection_result: RenderingType
                  static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                  if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                      detection_result = 'static'
                  else:
@@ -417,6 +414,9 @@ class AdaptivePlaywrightCrawler(
                  context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                  self.rendering_type_predictor.store_result(context.request, detection_result)

+             self._update_context_from_copy(context, pw_run.run_context)
+             self._context_result_map[context] = pw_run.result
+
      def pre_navigation_hook(
          self,
          hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -451,8 +451,32 @@ class AdaptivePlaywrightCrawler(
      def track_rendering_type_mispredictions(self) -> None:
          self.statistics.state.rendering_type_mispredictions += 1

+     def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+         """Update mutable fields of `context` from `context_copy`.
+
+         Uses object.__setattr__ to bypass frozen dataclass restrictions,
+         allowing state synchronization after isolated crawler execution.
+         """
+         updating_attributes = {
+             'request': ('headers', 'user_data'),
+             'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+         }
+
+         for attr, sub_attrs in updating_attributes.items():
+             original_sub_obj = getattr(context, attr)
+             copy_sub_obj = getattr(context_copy, attr)
+
+             # Check that both sub objects are not None
+             if original_sub_obj is None or copy_sub_obj is None:
+                 continue
+
+             for sub_attr in sub_attrs:
+                 new_value = getattr(copy_sub_obj, sub_attr)
+                 object.__setattr__(original_sub_obj, sub_attr, new_value)
+

  @dataclass(frozen=True)
  class SubCrawlerRun:
      result: RequestHandlerRunResult | None = None
      exception: Exception | None = None
+     run_context: BasicCrawlingContext | None = None
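The _update_context_from_copy helper above relies on object.__setattr__ to write into frozen dataclass instances after a sub-crawler has run on a copy of the context. A standalone illustration of that technique (illustrative class, not crawlee code):

from dataclasses import dataclass


@dataclass(frozen=True)
class Ctx:
    headers: dict[str, str]


original = Ctx(headers={'accept': 'text/html'})
updated = Ctx(headers={'accept': 'application/json'})

# `original.headers = ...` would raise FrozenInstanceError; object.__setattr__
# bypasses the frozen-dataclass guard, which is exactly what the helper above does.
object.__setattr__(original, 'headers', updated.headers)
assert original.headers == {'accept': 'application/json'}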
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py CHANGED
@@ -12,7 +12,7 @@ from crawlee.statistics import StatisticsState
  class AdaptivePlaywrightCrawlerStatisticState(StatisticsState):
      """Statistic data about a crawler run with additional information related to adaptive crawling."""

-     model_config = ConfigDict(populate_by_name=True, ser_json_inf_nan='constants')
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')

      http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0
      """Number representing how many times static http based crawling was used."""
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py CHANGED
@@ -31,7 +31,8 @@ class AdaptiveContextError(RuntimeError):

  @dataclass(frozen=True)
  @docs_group('Crawling contexts')
  class AdaptivePlaywrightCrawlingContext(
-     Generic[TStaticParseResult, TStaticSelectResult], ParsedHttpCrawlingContext[TStaticParseResult]
+     ParsedHttpCrawlingContext[TStaticParseResult],
+     Generic[TStaticParseResult, TStaticSelectResult],
  ):
      _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult]
      """The crawling context used by `AdaptivePlaywrightCrawler`.
crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py CHANGED
@@ -32,7 +32,7 @@ FeatureVector = tuple[float, float]


  class RenderingTypePredictorState(BaseModel):
-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

      model: Annotated[
          LogisticRegression,