crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlee might be problematic.

Files changed (116)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_autoscaling/snapshotter.py +1 -1
  3. crawlee/_browserforge_workaround.py +7 -3
  4. crawlee/_request.py +64 -43
  5. crawlee/_service_locator.py +44 -24
  6. crawlee/_types.py +128 -36
  7. crawlee/_utils/context.py +3 -3
  8. crawlee/_utils/file.py +8 -1
  9. crawlee/_utils/globs.py +4 -4
  10. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  11. crawlee/_utils/recoverable_state.py +32 -8
  12. crawlee/_utils/recurring_task.py +27 -3
  13. crawlee/_utils/requests.py +0 -26
  14. crawlee/_utils/robots.py +17 -5
  15. crawlee/_utils/sitemap.py +16 -7
  16. crawlee/_utils/system.py +30 -14
  17. crawlee/_utils/time.py +120 -0
  18. crawlee/_utils/urls.py +9 -2
  19. crawlee/browsers/_browser_pool.py +5 -2
  20. crawlee/browsers/_playwright_browser.py +2 -1
  21. crawlee/browsers/_playwright_browser_controller.py +21 -15
  22. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  23. crawlee/browsers/_types.py +1 -1
  24. crawlee/configuration.py +2 -0
  25. crawlee/crawlers/__init__.py +5 -1
  26. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  27. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
  28. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  29. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  30. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  31. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
  32. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  33. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  34. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  35. crawlee/crawlers/_basic/_basic_crawler.py +254 -148
  36. crawlee/crawlers/_basic/_context_utils.py +24 -0
  37. crawlee/crawlers/_basic/_logging_utils.py +27 -4
  38. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  39. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  40. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  41. crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
  42. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  43. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  44. crawlee/crawlers/_playwright/_types.py +12 -2
  45. crawlee/errors.py +4 -0
  46. crawlee/events/_event_manager.py +12 -6
  47. crawlee/events/_types.py +6 -6
  48. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  49. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  50. crawlee/fingerprint_suite/_header_generator.py +2 -2
  51. crawlee/fingerprint_suite/_types.py +2 -2
  52. crawlee/http_clients/_base.py +4 -0
  53. crawlee/http_clients/_curl_impersonate.py +68 -14
  54. crawlee/http_clients/_httpx.py +16 -6
  55. crawlee/http_clients/_impit.py +32 -11
  56. crawlee/otel/crawler_instrumentor.py +4 -6
  57. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  58. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  59. crawlee/request_loaders/_request_list.py +3 -3
  60. crawlee/request_loaders/_request_loader.py +5 -1
  61. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  62. crawlee/router.py +13 -3
  63. crawlee/sessions/_cookies.py +13 -8
  64. crawlee/sessions/_models.py +5 -5
  65. crawlee/sessions/_session_pool.py +1 -1
  66. crawlee/statistics/_error_snapshotter.py +1 -1
  67. crawlee/statistics/_models.py +62 -12
  68. crawlee/statistics/_statistics.py +24 -33
  69. crawlee/storage_clients/__init__.py +16 -0
  70. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  71. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  72. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  73. crawlee/storage_clients/_base/_storage_client.py +13 -0
  74. crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
  75. crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
  76. crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
  77. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  78. crawlee/storage_clients/_file_system/_utils.py +0 -0
  79. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  80. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  81. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  82. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  83. crawlee/storage_clients/_redis/__init__.py +6 -0
  84. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  85. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  86. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  87. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  88. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  89. crawlee/storage_clients/_redis/_utils.py +23 -0
  90. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  91. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  92. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  93. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  94. crawlee/storage_clients/_redis/py.typed +0 -0
  95. crawlee/storage_clients/_sql/__init__.py +6 -0
  96. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  97. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  98. crawlee/storage_clients/_sql/_db_models.py +268 -0
  99. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  100. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  101. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  102. crawlee/storage_clients/_sql/py.typed +0 -0
  103. crawlee/storage_clients/models.py +21 -14
  104. crawlee/storages/_base.py +5 -1
  105. crawlee/storages/_dataset.py +12 -2
  106. crawlee/storages/_key_value_store.py +17 -4
  107. crawlee/storages/_request_queue.py +13 -5
  108. crawlee/storages/_storage_instance_manager.py +196 -75
  109. crawlee/storages/_utils.py +11 -0
  110. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
  111. crawlee-1.3.1b3.dist-info/RECORD +207 -0
  112. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  113. crawlee/_utils/measure_time.py +0 -31
  114. crawlee-0.6.13b15.dist-info/RECORD +0 -183
  115. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  116. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/_utils/sitemap.py CHANGED
@@ -9,6 +9,7 @@ from datetime import datetime, timedelta
 from hashlib import sha256
 from logging import getLogger
 from typing import TYPE_CHECKING, Literal, TypedDict
+from xml.sax import SAXParseException
 from xml.sax.expatreader import ExpatParser
 from xml.sax.handler import ContentHandler
 
@@ -192,7 +193,8 @@ class _XmlSitemapParser:
 
     def close(self) -> None:
         """Clean up resources."""
-        self._parser.close()
+        with suppress(SAXParseException):
+            self._parser.close()
 
 
 def _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapParser | _TxtSitemapParser:
@@ -333,7 +335,7 @@ async def _fetch_and_process_sitemap(
                 # Check if the first chunk is a valid gzip header
                 if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
                     decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
-                    first_chunk = False
+                first_chunk = False
 
                 chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
                 text_chunk = decoder.decode(chunk)
@@ -428,10 +430,17 @@ async def parse_sitemap(
    up to the specified maximum depth.
    """
    # Set default options
-    options = options or {}
-    emit_nested_sitemaps = options.get('emit_nested_sitemaps', False)
-    max_depth = options.get('max_depth', float('inf'))
-    sitemap_retries = options.get('sitemap_retries', 3)
+    default_timeout = timedelta(seconds=30)
+    if options:
+        emit_nested_sitemaps = options['emit_nested_sitemaps']
+        max_depth = options['max_depth']
+        sitemap_retries = options['sitemap_retries']
+        timeout = options.get('timeout', default_timeout)
+    else:
+        emit_nested_sitemaps = False
+        max_depth = float('inf')
+        sitemap_retries = 3
+        timeout = default_timeout
 
    # Setup working state
    sources = list(initial_sources)
@@ -470,7 +479,7 @@ async def parse_sitemap(
                sitemap_retries,
                emit_nested_sitemaps=emit_nested_sitemaps,
                proxy_info=proxy_info,
-                timeout=options.get('timeout', timedelta(seconds=30)),
+                timeout=timeout,
            ):
                yield result
        else:

crawlee/_utils/system.py CHANGED
@@ -5,7 +5,7 @@ import sys
 from contextlib import suppress
 from datetime import datetime, timezone
 from logging import getLogger
-from typing import Annotated
+from typing import TYPE_CHECKING, Annotated
 
 import psutil
 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
@@ -36,22 +36,30 @@ else:
 class CpuInfo(BaseModel):
     """Information about the CPU usage."""
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     used_ratio: Annotated[float, Field(alias='usedRatio')]
     """The ratio of CPU currently in use, represented as a float between 0 and 1."""
 
-    created_at: datetime = Field(
-        alias='createdAt',
-        default_factory=lambda: datetime.now(timezone.utc),
-    )
-    """The time at which the measurement was taken."""
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        created_at: datetime = datetime.now(timezone.utc)
+        """The time at which the measurement was taken."""
+    else:
+        created_at: Annotated[
+            datetime,
+            Field(
+                alias='createdAt',
+                default_factory=lambda: datetime.now(timezone.utc),
+            ),
+        ]
+        """The time at which the measurement was taken."""
 
 
 class MemoryUsageInfo(BaseModel):
     """Information about the memory usage."""
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     current_size: Annotated[
         ByteSize,
@@ -61,17 +69,25 @@ class MemoryUsageInfo(BaseModel):
     ]
     """Memory usage of the current Python process and its children."""
 
-    created_at: datetime = Field(
-        alias='createdAt',
-        default_factory=lambda: datetime.now(timezone.utc),
-    )
-    """The time at which the measurement was taken."""
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        created_at: datetime = datetime.now(timezone.utc)
+        """The time at which the measurement was taken."""
+    else:
+        created_at: Annotated[
+            datetime,
+            Field(
+                alias='createdAt',
+                default_factory=lambda: datetime.now(timezone.utc),
+            ),
+        ]
+        """The time at which the measurement was taken."""
 
 
 class MemoryInfo(MemoryUsageInfo):
     """Information about system memory."""
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     total_size: Annotated[
         ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize')

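The `populate_by_name` → `validate_by_name`/`validate_by_alias` switch above targets the newer Pydantic v2 configuration keys. A minimal standalone sketch of the resulting behaviour (not crawlee code; assumes a Pydantic release recent enough to support these keys, roughly 2.11+):

from datetime import datetime, timezone

from pydantic import BaseModel, ConfigDict, Field


class Sample(BaseModel):
    # Mirrors the new config: inputs may use either the Python field name
    # or the camelCase alias during validation.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    used_ratio: float = Field(alias='usedRatio')
    created_at: datetime = Field(
        alias='createdAt',
        default_factory=lambda: datetime.now(timezone.utc),
    )


print(Sample(usedRatio=0.5))   # populated by alias
print(Sample(used_ratio=0.5))  # populated by field name

Both constructions validate, because name-based and alias-based population are now each enabled explicitly.
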
crawlee/_utils/time.py ADDED
@@ -0,0 +1,120 @@
+from __future__ import annotations
+
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import timedelta
+from typing import TYPE_CHECKING
+
+from async_timeout import Timeout, timeout
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+    from types import TracebackType
+
+_SECONDS_PER_MINUTE = 60
+_SECONDS_PER_HOUR = 3600
+
+
+@dataclass
+class TimerResult:
+    wall: float | None = None
+    cpu: float | None = None
+
+
+@contextmanager
+def measure_time() -> Iterator[TimerResult]:
+    """Measure the execution time (wall-clock and CPU) between the start and end of the with-block."""
+    result = TimerResult()
+    before_wall = time.monotonic()
+    before_cpu = time.thread_time()
+
+    try:
+        yield result
+    finally:
+        after_wall = time.monotonic()
+        after_cpu = time.thread_time()
+        result.wall = after_wall - before_wall
+        result.cpu = after_cpu - before_cpu
+
+
+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations.
+
+    Provides a reusable, non-reentrant context manager interface.
+    """
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
+def format_duration(duration: timedelta | None) -> str:
+    """Format a timedelta into a human-readable string with appropriate units."""
+    if duration is None:
+        return 'None'
+
+    total_seconds = duration.total_seconds()
+
+    if total_seconds == 0:
+        return '0s'
+
+    # For very small durations, show in milliseconds
+    if total_seconds < 1:
+        milliseconds = total_seconds * 1000
+        if milliseconds < 1:
+            microseconds = total_seconds * 1_000_000
+            return f'{microseconds:.1f}μs'
+        return f'{milliseconds:.1f}ms'
+
+    # For durations less than 60 seconds, show in seconds
+    if total_seconds < _SECONDS_PER_MINUTE:
+        return f'{total_seconds:.2f}s'
+
+    # For durations less than 1 hour, show in minutes and seconds
+    if total_seconds < _SECONDS_PER_HOUR:
+        minutes = int(total_seconds // _SECONDS_PER_MINUTE)
+        seconds = total_seconds % _SECONDS_PER_MINUTE
+        if seconds == 0:
+            return f'{minutes}min'
+        return f'{minutes}min {seconds:.1f}s'
+
+    # For longer durations, show in hours, minutes, and seconds
+    hours = int(total_seconds // _SECONDS_PER_HOUR)
+    remaining_seconds = total_seconds % _SECONDS_PER_HOUR
+    minutes = int(remaining_seconds // _SECONDS_PER_MINUTE)
+    seconds = remaining_seconds % _SECONDS_PER_MINUTE
+
+    result = f'{hours}h'
+    if minutes > 0:
+        result += f' {minutes}min'
+    if seconds > 0:
+        result += f' {seconds:.1f}s'
+
+    return result

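Since `SharedTimeout`, `measure_time`, and `format_duration` are new in this release, here is a small usage sketch based directly on the implementation above (assumes crawlee 1.3.x is installed so that `crawlee._utils.time` is importable; printed values are indicative):

import asyncio
from datetime import timedelta

from crawlee._utils.time import SharedTimeout, format_duration, measure_time


async def main() -> None:
    # One 5-second budget shared by two sequential awaits: the second context
    # manager only gets whatever the first one left over.
    shared = SharedTimeout(timedelta(seconds=5))

    async with shared as remaining:
        print('budget before step 1:', format_duration(remaining))
        await asyncio.sleep(1)

    async with shared as remaining:
        print('budget before step 2:', format_duration(remaining))  # roughly 4 seconds left
        await asyncio.sleep(0.5)

    # Wall-clock vs. CPU time of a synchronous block.
    with measure_time() as timing:
        sum(range(1_000_000))
    print('wall:', timing.wall, 'cpu:', timing.cpu)


asyncio.run(main())

Exceeding the remaining budget inside either `async with` block raises the usual `asyncio` timeout error, since `SharedTimeout` delegates to `async_timeout` under the hood.
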
crawlee/_utils/urls.py CHANGED
@@ -7,6 +7,7 @@ from yarl import URL
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger
 
 
 def is_url_absolute(url: str) -> bool:
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))
 
 
-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
    """Convert an iterator of relative URLs to absolute URLs using a base URL."""
    for url in urls:
        if is_url_absolute(url):
            yield url
        else:
-            yield convert_to_absolute_url(base_url, url)
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url
 
 
 _http_url_adapter = TypeAdapter(AnyHttpUrl)

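The extended `to_absolute_url_iterator` now drops links that cannot be resolved to an absolute URL (the hunk's own comment names 'mailto:' as the motivating case) and logs them when a logger is supplied. A usage sketch, assuming crawlee 1.3.x; the exact behaviour for scheme-only links depends on `is_url_absolute`, which is not shown in this diff:

import logging

from crawlee._utils.urls import to_absolute_url_iterator

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('links')

links = iter(['/about', 'https://example.com/pricing', 'mailto:hello@example.com'])
absolute = list(to_absolute_url_iterator('https://example.com', links, logger=logger))
print(absolute)
# Expected (indicative): ['https://example.com/about', 'https://example.com/pricing'],
# with the mailto: entry logged at DEBUG level and skipped.
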
crawlee/browsers/_browser_pool.py CHANGED
@@ -118,7 +118,10 @@ class BrowserPool:
         """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
 
         Args:
-            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a user data directory, which stores browser session data like cookies
                 and local storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -135,7 +138,7 @@ class BrowserPool:
             kwargs: Additional arguments for default constructor.
         """
         plugin_options: dict = defaultdict(dict)
-        plugin_options['browser_launch_options'] = browser_launch_options or {}
+        plugin_options['browser_launch_options'] = dict(browser_launch_options) if browser_launch_options else {}
         plugin_options['browser_new_context_options'] = browser_new_context_options or {}
 
         if headless is not None:

crawlee/browsers/_playwright_browser.py CHANGED
@@ -78,7 +78,8 @@ class PlaywrightPersistentBrowser(Browser):
 
     async def _delete_temp_dir(self, _: BrowserContext | None) -> None:
         if self._temp_dir and self._temp_dir.exists():
-            await asyncio.to_thread(shutil.rmtree, self._temp_dir, ignore_errors=True)
+            temp_dir = self._temp_dir
+            await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True)
 
     @override
     async def close(self, **kwargs: Any) -> None:

crawlee/browsers/_playwright_browser_controller.py CHANGED
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from asyncio import Lock
 from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Any, cast
 
@@ -77,6 +78,19 @@ class PlaywrightBrowserController(BrowserController):
 
         self._total_opened_pages = 0
 
+        self._context_creation_lock: Lock | None = None
+
+    async def _get_context_creation_lock(self) -> Lock:
+        """Get context checking and creation lock.
+
+        It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
+        memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
+        """
+        if self._context_creation_lock:
+            return self._context_creation_lock
+        self._context_creation_lock = Lock()
+        return self._context_creation_lock
+
     @property
     @override
     def pages(self) -> list[Page]:
@@ -137,12 +151,6 @@ class PlaywrightBrowserController(BrowserController):
         Raises:
             ValueError: If the browser has reached the maximum number of open pages.
         """
-        if not self._browser_context:
-            self._browser_context = await self._create_browser_context(
-                browser_new_context_options=browser_new_context_options,
-                proxy_info=proxy_info,
-            )
-
         if not self.has_free_capacity:
             raise ValueError('Cannot open more pages in this browser.')
 
@@ -154,11 +162,12 @@ class PlaywrightBrowserController(BrowserController):
             )
             page = await new_context.new_page()
         else:
-            if not self._browser_context:
-                self._browser_context = await self._create_browser_context(
-                    browser_new_context_options=browser_new_context_options,
-                    proxy_info=proxy_info,
-                )
+            async with await self._get_context_creation_lock():
+                if not self._browser_context:
+                    self._browser_context = await self._create_browser_context(
+                        browser_new_context_options=browser_new_context_options,
+                        proxy_info=proxy_info,
+                    )
             page = await self._browser_context.new_page()
 
         # Handle page close event
@@ -169,7 +178,6 @@ class PlaywrightBrowserController(BrowserController):
         self._last_page_opened_at = datetime.now(timezone.utc)
 
         self._total_opened_pages += 1
-
         return page
 
     @override
@@ -206,10 +214,9 @@ class PlaywrightBrowserController(BrowserController):
        `self._fingerprint_generator` is available.
         """
         browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
-
         if proxy_info:
             if browser_new_context_options.get('proxy'):
-                logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.")
+                logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")
 
             browser_new_context_options['proxy'] = ProxySettings(
                 server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
@@ -244,5 +251,4 @@ class PlaywrightBrowserController(BrowserController):
         browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
             'extra_http_headers', extra_http_headers
         )
-
         return await self._browser.new_context(**browser_new_context_options)

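The lazily created `asyncio.Lock` above serializes the check-and-create of the browser context, so two concurrent page openings cannot both observe `self._browser_context is None` and create an orphaned context. A self-contained sketch of the same pattern (illustrative names, not crawlee code):

import asyncio


class LazyResource:
    """Standalone sketch: guard one-time async creation with a lazily created lock."""

    def __init__(self) -> None:
        self._lock: asyncio.Lock | None = None
        self._context: object | None = None

    def _get_lock(self) -> asyncio.Lock:
        # The lock is created on first use, so no running event loop is required in __init__.
        if self._lock is None:
            self._lock = asyncio.Lock()
        return self._lock

    async def get_context(self) -> object:
        async with self._get_lock():
            if self._context is None:
                # Without the lock, two concurrent callers could both see None here
                # and build two contexts, leaving one of them orphaned.
                await asyncio.sleep(0.1)  # stands in for the expensive creation
                self._context = object()
        return self._context


async def main() -> None:
    resource = LazyResource()
    first, second = await asyncio.gather(resource.get_context(), resource.get_context())
    assert first is second


asyncio.run(main())
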
crawlee/browsers/_playwright_browser_plugin.py CHANGED
@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
 
     It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
     for creating new browser instances and provides a unified interface for interacting with different browser types
-    (chromium, firefox, and webkit). This class integrates configuration options for browser launches (headless mode,
-    executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
+    (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
+    mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
     browser instance, ensuring that resource limits are respected.
     """
 
@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
        """Initialize a new instance.
 
        Args:
-            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            browser_type: The type of browser to launch:
+                - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+                - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                    the system.
             user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
                 storage.
             browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
             'chromium_sandbox': not config.disable_browser_sandbox,
         }
 
+        if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
+            raise ValueError(
+                'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
+            )
+
+        # Map 'chrome' to 'chromium' with the 'chrome' channel.
+        if browser_type == 'chrome':
+            browser_type = 'chromium'
+            # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+            default_launch_browser_options['channel'] = 'chrome'
+
         self._browser_type: BrowserType = browser_type
         self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
         self._browser_new_context_options = browser_new_context_options or {}

crawlee/browsers/_types.py CHANGED
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Literal
 if TYPE_CHECKING:
     from playwright.async_api import Page
 
-BrowserType = Literal['chromium', 'firefox', 'webkit']
+BrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome']
 
 
 @dataclass

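With `BrowserType` now accepting 'chrome', the Playwright-based crawlers can drive a locally installed Google Chrome; per the plugin change above, 'chrome' is mapped internally to chromium with channel='chrome'. A hedged usage sketch, assuming `PlaywrightCrawler` exposes `browser_type` and `headless` keyword arguments as in earlier releases and that Google Chrome is installed:

import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # 'chrome' requires a locally installed Google Chrome; Playwright's managed
    # browsers are used for 'chromium', 'firefox', and 'webkit'.
    crawler = PlaywrightCrawler(browser_type='chrome', headless=True)

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
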
crawlee/configuration.py CHANGED
@@ -28,6 +28,8 @@ class Configuration(BaseSettings):
     Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
     """
 
+    # TODO: https://github.com/pydantic/pydantic-settings/issues/706
+    # Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
     model_config = SettingsConfigDict(populate_by_name=True)
 
     internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None

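For context on the docstring above: `Configuration` fields are aliased (for example `internal_timeout` → `crawlee_internal_timeout`), which is what makes the `CRAWLEE_`-prefixed environment variables work. A rough sketch (the conversion of '60' into a timedelta is Pydantic behaviour, so the printed value is indicative):

import os

from crawlee.configuration import Configuration

# Hypothetical value: pydantic-settings matches environment variables to field
# aliases case-insensitively, so CRAWLEE_INTERNAL_TIMEOUT feeds `internal_timeout`.
os.environ['CRAWLEE_INTERNAL_TIMEOUT'] = '60'

config = Configuration()
print(config.internal_timeout)  # expected: a 60-second timedelta
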
crawlee/crawlers/__init__.py CHANGED
@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import
 
-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult
 
@@ -23,12 +23,14 @@ with _try_import(
     'AdaptivePlaywrightCrawler',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'RenderingType',
     'RenderingTypePrediction',
     'RenderingTypePredictor',
 ):
     from ._adaptive_playwright import (
         AdaptivePlaywrightCrawler,
+        AdaptivePlaywrightCrawlerStatisticState,
         AdaptivePlaywrightCrawlingContext,
         AdaptivePlaywrightPreNavCrawlingContext,
         RenderingType,
@@ -41,6 +43,7 @@ __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
     'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
     'BasicCrawler',
@@ -51,6 +54,7 @@ __all__ = [
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',

crawlee/crawlers/_abstract_http/__init__.py CHANGED
@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext
 
 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]

crawlee/crawlers/_abstract_http/_abstract_http_crawler.py CHANGED
@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar
 
-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,9 +34,24 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
-    Generic[TCrawlingContext, TParseResult, TSelectResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC
+    BasicCrawler[TCrawlingContext, StatisticsState],
+    ABC,
+    Generic[TCrawlingContext, TParseResult, TSelectResult],
 ):
     """A web crawler for performing HTTP requests.
 
@@ -54,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -80,9 +100,7 @@ class AbstractHttpCrawler(
        this method simplifies cases where `TParseResult` is used for both generic parameters.
        """
 
-        class _ParsedHttpCrawler(
-            AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]
-        ):
+        class _ParsedHttpCrawler(AbstractHttpCrawler):
            def __init__(
                self,
                parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,
@@ -110,9 +128,17 @@ class AbstractHttpCrawler(
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        for hook in self._pre_navigation_hooks:
-            await hook(context)
-        yield context
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     async def _parse_http_response(
         self, context: HttpCrawlingContext
@@ -163,9 +189,18 @@ class AbstractHttpCrawler(
            robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
            kwargs.setdefault('strategy', 'same-hostname')
+            strategy = kwargs.get('strategy', 'same-hostname')
 
            links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-            links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+
+            # Get base URL from <base> tag if present
+            extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+            base_url: str = (
+                str(extracted_base_urls[0])
+                if extracted_base_urls
+                else context.request.loaded_url or context.request.url
+            )
+            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
            if robots_txt_file:
                skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -173,7 +208,9 @@ class AbstractHttpCrawler(
                skipped = iter([])
 
            for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
+                request_options = RequestOptions(
+                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                )
 
                if transform_request_function:
                    transform_request_options = transform_request_function(request_options)
@@ -212,13 +249,16 @@ class AbstractHttpCrawler(
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        result = await self._http_client.crawl(
-            request=context.request,
-            session=context.session,
-            proxy_info=context.proxy_info,
-            statistics=self._statistics,
-        )
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )
 
+        context.request.state = RequestState.AFTER_NAV
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
 
     async def _handle_status_code_response(

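The new `navigation_timeout` option (surfaced through `HttpCrawlerOptions`) gives the pre-navigation hooks and the HTTP request a single shared budget via `SharedTimeout`, defaulting to one minute. A usage sketch, assuming the concrete HTTP crawlers such as `ParselCrawler` forward this option to `AbstractHttpCrawler` in 1.3.x:

import asyncio
from datetime import timedelta

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    # The 20-second budget covers all pre-navigation hooks plus the HTTP request itself.
    crawler = ParselCrawler(navigation_timeout=timedelta(seconds=20))

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Title: {context.selector.css("title::text").get()}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
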
crawlee/crawlers/_abstract_http/_abstract_http_parser.py CHANGED
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
 
 
 @docs_group('HTTP parsers')
-class AbstractHttpParser(Generic[TParseResult, TSelectResult], ABC):
+class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
    """Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking."""
 
    @abstractmethod