crawlee-0.6.13b15-py3-none-any.whl → crawlee-1.3.1b3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_autoscaling/snapshotter.py +1 -1
  3. crawlee/_browserforge_workaround.py +7 -3
  4. crawlee/_request.py +64 -43
  5. crawlee/_service_locator.py +44 -24
  6. crawlee/_types.py +128 -36
  7. crawlee/_utils/context.py +3 -3
  8. crawlee/_utils/file.py +8 -1
  9. crawlee/_utils/globs.py +4 -4
  10. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  11. crawlee/_utils/recoverable_state.py +32 -8
  12. crawlee/_utils/recurring_task.py +27 -3
  13. crawlee/_utils/requests.py +0 -26
  14. crawlee/_utils/robots.py +17 -5
  15. crawlee/_utils/sitemap.py +16 -7
  16. crawlee/_utils/system.py +30 -14
  17. crawlee/_utils/time.py +120 -0
  18. crawlee/_utils/urls.py +9 -2
  19. crawlee/browsers/_browser_pool.py +5 -2
  20. crawlee/browsers/_playwright_browser.py +2 -1
  21. crawlee/browsers/_playwright_browser_controller.py +21 -15
  22. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  23. crawlee/browsers/_types.py +1 -1
  24. crawlee/configuration.py +2 -0
  25. crawlee/crawlers/__init__.py +5 -1
  26. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  27. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
  28. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  29. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  30. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  31. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
  32. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  33. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  34. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  35. crawlee/crawlers/_basic/_basic_crawler.py +254 -148
  36. crawlee/crawlers/_basic/_context_utils.py +24 -0
  37. crawlee/crawlers/_basic/_logging_utils.py +27 -4
  38. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  39. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  40. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  41. crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
  42. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  43. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  44. crawlee/crawlers/_playwright/_types.py +12 -2
  45. crawlee/errors.py +4 -0
  46. crawlee/events/_event_manager.py +12 -6
  47. crawlee/events/_types.py +6 -6
  48. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  49. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  50. crawlee/fingerprint_suite/_header_generator.py +2 -2
  51. crawlee/fingerprint_suite/_types.py +2 -2
  52. crawlee/http_clients/_base.py +4 -0
  53. crawlee/http_clients/_curl_impersonate.py +68 -14
  54. crawlee/http_clients/_httpx.py +16 -6
  55. crawlee/http_clients/_impit.py +32 -11
  56. crawlee/otel/crawler_instrumentor.py +4 -6
  57. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  58. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  59. crawlee/request_loaders/_request_list.py +3 -3
  60. crawlee/request_loaders/_request_loader.py +5 -1
  61. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  62. crawlee/router.py +13 -3
  63. crawlee/sessions/_cookies.py +13 -8
  64. crawlee/sessions/_models.py +5 -5
  65. crawlee/sessions/_session_pool.py +1 -1
  66. crawlee/statistics/_error_snapshotter.py +1 -1
  67. crawlee/statistics/_models.py +62 -12
  68. crawlee/statistics/_statistics.py +24 -33
  69. crawlee/storage_clients/__init__.py +16 -0
  70. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  71. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  72. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  73. crawlee/storage_clients/_base/_storage_client.py +13 -0
  74. crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
  75. crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
  76. crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
  77. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  78. crawlee/storage_clients/_file_system/_utils.py +0 -0
  79. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  80. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  81. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  82. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  83. crawlee/storage_clients/_redis/__init__.py +6 -0
  84. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  85. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  86. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  87. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  88. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  89. crawlee/storage_clients/_redis/_utils.py +23 -0
  90. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  91. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  92. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  93. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  94. crawlee/storage_clients/_redis/py.typed +0 -0
  95. crawlee/storage_clients/_sql/__init__.py +6 -0
  96. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  97. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  98. crawlee/storage_clients/_sql/_db_models.py +268 -0
  99. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  100. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  101. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  102. crawlee/storage_clients/_sql/py.typed +0 -0
  103. crawlee/storage_clients/models.py +21 -14
  104. crawlee/storages/_base.py +5 -1
  105. crawlee/storages/_dataset.py +12 -2
  106. crawlee/storages/_key_value_store.py +17 -4
  107. crawlee/storages/_request_queue.py +13 -5
  108. crawlee/storages/_storage_instance_manager.py +196 -75
  109. crawlee/storages/_utils.py +11 -0
  110. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
  111. crawlee-1.3.1b3.dist-info/RECORD +207 -0
  112. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  113. crawlee/_utils/measure_time.py +0 -31
  114. crawlee-0.6.13b15.dist-info/RECORD +0 -183
  115. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  116. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/request_loaders/_sitemap_request_loader.py CHANGED
@@ -1,20 +1,27 @@
 from __future__ import annotations
 
 import asyncio
+from collections import deque
 from contextlib import suppress
 from logging import getLogger
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Annotated, Any
 
-from crawlee import Request
+from pydantic import BaseModel, ConfigDict, Field
+from typing_extensions import override
+
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
-from crawlee._utils.sitemap import ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
+from crawlee._utils.recoverable_state import RecoverableState
+from crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
 from crawlee.request_loaders._request_loader import RequestLoader
 
 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
+    from types import TracebackType
 
+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -23,12 +30,77 @@ if TYPE_CHECKING:
 logger = getLogger(__name__)
 
 
+class SitemapRequestLoaderState(BaseModel):
+    """State model for persisting sitemap request loader data.
+
+    The crawler processes one sitemap at a time. The current sitemap is stored in `in_progress_sitemap_url`.
+    The `parse_sitemap` function parses the sitemap and returns elements as an async iterator. Each element retrieved
+    from the iterator is processed based on its type. If the element is a `NestedSitemap`, its URL is added to
+    `pending_sitemap_urls` if it hasn't been processed yet (not in `processed_sitemap_urls`). If the element is a
+    `SitemapUrl`, the system checks whether it already exists in `current_sitemap_processed_urls`. If it exists,
+    the loader was restarted from a saved state and the URL is skipped.
+
+    If the URL is new, it is first added to `url_queue`, then to `current_sitemap_processed_urls`, and `total_count` is
+    incremented by 1. When all elements from the current sitemap iterator have been processed, `in_progress_sitemap_url`
+    is set to `None`, the sitemap URL is added to `processed_sitemap_urls`, and `current_sitemap_processed_urls` is
+    cleared. The next sitemap is retrieved from `pending_sitemap_urls`, skipping any URLs that already exist in
+    `processed_sitemap_urls`. If `pending_sitemap_urls` is empty, `completed` is set to `True`.
+
+    When `fetch_next_request` is called, a URL is extracted from `url_queue` and placed in `in_progress`.
+    When `mark_request_as_handled` is called for the extracted URL, it is removed from `in_progress` and
+    `handled_count` is incremented by 1.
+
+    During initial startup or restart after persistence, state validation occurs in `_get_state`. If both
+    `pending_sitemap_urls` and `in_progress_sitemap_url` are empty and `completed` is False, this indicates a
+    fresh start. In this case, `self._sitemap_urls` are moved to `pending_sitemap_urls`. Otherwise, the system is
+    restarting from a persisted state. If `in_progress` contains any URLs, they are moved back to `url_queue` and
+    `in_progress` is cleared.
+    """
+
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
+
+    url_queue: Annotated[deque[str], Field(alias='urlQueue')]
+    """Queue of URLs extracted from sitemaps and ready for processing."""
+
+    in_progress: Annotated[set[str], Field(alias='inProgress')] = set()
+    """Set of request URLs currently being processed."""
+
+    pending_sitemap_urls: Annotated[deque[str], Field(alias='pendingSitemapUrls')]
+    """Queue of sitemap URLs that need to be fetched and processed."""
+
+    in_progress_sitemap_url: Annotated[str | None, Field(alias='inProgressSitemapUrl')] = None
+    """The sitemap URL currently being processed."""
+
+    current_sitemap_processed_urls: Annotated[set[str], Field(alias='currentSitemapProcessedUrls')] = set()
+    """URLs from the current sitemap that have been added to the queue."""
+
+    processed_sitemap_urls: Annotated[set[str], Field(alias='processedSitemapUrls')] = set()
+    """Set of processed sitemap URLs."""
+
+    completed: Annotated[bool, Field(alias='sitemapCompleted')] = False
+    """Whether all sitemaps have been fully processed."""
+
+    total_count: Annotated[int, Field(alias='totalCount')] = 0
+    """Total number of URLs found and added to the queue from all processed sitemaps."""
+
+    handled_count: Annotated[int, Field(alias='handledCount')] = 0
+    """Number of URLs that have been successfully handled."""
+
+
 @docs_group('Request loaders')
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).
 
+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
+
+    The loader supports state persistence, allowing it to resume from where it left off
+    after interruption when a `persist_state_key` is provided during initialization.
     """
 
     def __init__(
@@ -40,7 +112,8 @@ class SitemapRequestLoader(RequestLoader):
         include: list[re.Pattern[Any] | Glob] | None = None,
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
-        parse_sitemap_options: ParseSitemapOptions | None = None,
+        persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.
 
@@ -50,27 +123,68 @@
             include: List of glob or regex patterns to include URLs.
             exclude: List of glob or regex patterns to exclude URLs.
             max_buffer_size: Maximum number of URLs to buffer in memory.
-            parse_sitemap_options: Options for parsing sitemaps, such as `SitemapSource` and `max_urls`.
             http_client: the instance of `HttpClient` to use for fetching sitemaps.
+            persist_state_key: A key for persisting the loader's state in the KeyValueStore.
+                When provided, allows resuming from where it left off after interruption.
+                If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
-
         self._sitemap_urls = sitemap_urls
         self._include = include
         self._exclude = exclude
         self._proxy_info = proxy_info
-        self._parse_sitemap_options = parse_sitemap_options or ParseSitemapOptions()
+        self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function
+
+        # Synchronization for queue operations
+        self._queue_has_capacity = asyncio.Event()
+        self._queue_has_capacity.set()
+        self._queue_lock = asyncio.Lock()
+
+        # Initialize recoverable state
+        self._state = RecoverableState(
+            default_state=SitemapRequestLoaderState(
+                url_queue=deque(),
+                pending_sitemap_urls=deque(),
+            ),
+            persistence_enabled=bool(persist_state_key),
+            persist_state_key=persist_state_key or '',
+            logger=logger,
+        )
+
+        # Start background loading
+        self._loading_task = asyncio.create_task(self._load_sitemaps())
 
-        self._handled_count = 0
-        self._total_count = 0
+    async def _get_state(self) -> SitemapRequestLoaderState:
+        """Initialize and return the current state."""
+        async with self._queue_lock:
+            if self._state.is_initialized:
+                return self._state.current_value
 
-        # URL queue and tracking
-        self._url_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=max_buffer_size)
-        self._in_progress: set[str] = set()
-        self._processed_urls: set[str] = set()
+            await self._state.initialize()
 
-        # Loading state
-        self._loading_task = asyncio.create_task(self._load_sitemaps())
+            # Initialize pending sitemaps on first run
+            has_sitemap_for_processing = (
+                self._state.current_value.pending_sitemap_urls or self._state.current_value.in_progress_sitemap_url
+            )
+            if not has_sitemap_for_processing and not self._state.current_value.completed:
+                self._state.current_value.pending_sitemap_urls.extend(self._sitemap_urls)
+
+            if self._state.current_value.in_progress:
+                self._state.current_value.url_queue.extendleft(self._state.current_value.in_progress)
+                self._state.current_value.in_progress.clear()
+
+            if (
+                self._state.current_value.url_queue
+                and len(self._state.current_value.url_queue) >= self._max_buffer_size
+            ):
+                # Notify that the queue is full
+                self._queue_has_capacity.clear()
+
+            return self._state.current_value
 
     def _check_url_patterns(
         self,
@@ -105,73 +219,157 @@ class SitemapRequestLoader(RequestLoader):
     async def _load_sitemaps(self) -> None:
        """Load URLs from sitemaps in the background."""
        try:
-            async for item in parse_sitemap(
-                [SitemapSource(type='url', url=url) for url in self._sitemap_urls],
-                self._http_client,
-                proxy_info=self._proxy_info,
-                options=self._parse_sitemap_options,
-            ):
-                # Only process URL items (not nested sitemaps)
-                if isinstance(item, SitemapUrl):
-                    url = item.loc
-
-                    # Skip if already processed
-                    if url in self._processed_urls:
+            # Get actual state
+            while (state := await self._get_state()) and (state.pending_sitemap_urls or state.in_progress_sitemap_url):
+                # Get sitemap URL for parsing
+                sitemap_url = state.in_progress_sitemap_url
+                if not sitemap_url:
+                    sitemap_url = state.pending_sitemap_urls.popleft()
+                    # Skip processed urls
+                    if sitemap_url in state.processed_sitemap_urls:
                         continue
-
-                    # Check if URL should be included
-                    if not self._check_url_patterns(url, self._include, self._exclude):
+                    state.in_progress_sitemap_url = sitemap_url
+
+                parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)
+
+                async for item in parse_sitemap(
+                    [SitemapSource(type='url', url=sitemap_url)],
+                    self._http_client,
+                    proxy_info=self._proxy_info,
+                    options=parse_options,
+                ):
+                    if isinstance(item, NestedSitemap):
+                        # Add nested sitemap to queue
+                        if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
+                            state.pending_sitemap_urls.append(item.loc)
                         continue
 
-                    await self._url_queue.put(url)
-                    self._processed_urls.add(url)
-                    self._total_count += 1
+                    if isinstance(item, SitemapUrl):
+                        url = item.loc
+
+                        state = await self._get_state()
+
+                        # Skip if already processed
+                        if url in state.current_sitemap_processed_urls:
+                            continue
+
+                        # Check if URL should be included
+                        if not self._check_url_patterns(url, self._include, self._exclude):
+                            continue
+
+                        # Check if we have capacity in the queue
+                        await self._queue_has_capacity.wait()
+
+                        state = await self._get_state()
+                        async with self._queue_lock:
+                            state.url_queue.append(url)
+                            state.current_sitemap_processed_urls.add(url)
+                            state.total_count += 1
+                            if len(state.url_queue) >= self._max_buffer_size:
+                                # Notify that the queue is full
+                                self._queue_has_capacity.clear()
+
+                # Clear current sitemap after processing
+                state = await self._get_state()
+                current_sitemap_url = state.in_progress_sitemap_url
+                state.in_progress_sitemap_url = None
+                if current_sitemap_url:
+                    state.processed_sitemap_urls.add(current_sitemap_url)
+                    state.current_sitemap_processed_urls.clear()
+
+            # Mark as completed after processing all sitemap urls
+            state.completed = True
 
        except Exception:
            logger.exception('Error loading sitemaps')
            raise
 
+    @override
    async def get_total_count(self) -> int:
        """Return the total number of URLs found so far."""
-        return self._total_count
+        state = await self._get_state()
+        return state.total_count
 
+    @override
+    async def get_handled_count(self) -> int:
+        """Return the number of URLs that have been handled."""
+        state = await self._get_state()
+        return state.handled_count
+
+    @override
    async def is_empty(self) -> bool:
        """Check if there are no more URLs to process."""
-        return self._url_queue.empty() and self._loading_task.done()
+        state = await self._get_state()
+        return not state.url_queue
 
+    @override
    async def is_finished(self) -> bool:
        """Check if all URLs have been processed."""
-        return self._url_queue.empty() and len(self._in_progress) == 0 and self._loading_task.done()
+        state = await self._get_state()
+        return not state.url_queue and len(state.in_progress) == 0 and self._loading_task.done()
 
+    @override
    async def fetch_next_request(self) -> Request | None:
        """Fetch the next request to process."""
-        while not (self._loading_task.done() and self._url_queue.empty()):
-            if self._url_queue.empty():
-                await asyncio.sleep(0.5)
+        while not (await self.is_finished()):
+            state = await self._get_state()
+            if not state.url_queue:
+                await asyncio.sleep(0.1)
                continue
 
-            url = await self._url_queue.get()
+            async with self._queue_lock:
+                url = state.url_queue.popleft()
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
+                state.in_progress.add(request.url)
+                if len(state.url_queue) < self._max_buffer_size:
+                    self._queue_has_capacity.set()
 
-            request = Request.from_url(url)
-            self._in_progress.add(request.id)
            return request
 
        return None
 
+    @override
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Mark a request as successfully handled."""
-        if request.id in self._in_progress:
-            self._in_progress.remove(request.id)
-            self._handled_count += 1
+        state = await self._get_state()
+        if request.url in state.in_progress:
+            state.in_progress.remove(request.url)
+            state.handled_count += 1
        return None
 
-    async def get_handled_count(self) -> int:
-        """Return the number of handled requests."""
-        return self._handled_count
-
    async def abort_loading(self) -> None:
        """Abort the sitemap loading process."""
        if self._loading_task and not self._loading_task.done():
            self._loading_task.cancel()
            with suppress(asyncio.CancelledError):
                await self._loading_task
+
+    async def start(self) -> None:
+        """Start the sitemap loading process."""
+        if self._loading_task and not self._loading_task.done():
+            return
+        self._loading_task = asyncio.create_task(self._load_sitemaps())
+
+    async def close(self) -> None:
+        """Close the request loader."""
+        await self.abort_loading()
+        await self._state.teardown()
+
+    async def __aenter__(self) -> SitemapRequestLoader:
+        """Enter the context manager."""
+        await self.start()
+        return self
+
+    async def __aexit__(
+        self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None
+    ) -> None:
+        """Exit the context manager."""
+        await self.close()
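
Taken together, the new constructor options and the context-manager protocol added above can be exercised roughly as follows. This is a minimal sketch, not code from the release: it assumes `SitemapRequestLoader` is importable from `crawlee.request_loaders`, that `HttpxHttpClient` is an acceptable `HttpClient` implementation, and that `RequestOptions` / `RequestTransformAction` are exported from the `crawlee` package as the imports in the hunk suggest.

```python
import asyncio

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop obvious non-page entries; leave everything else untouched.
    if options['url'].endswith(('.jpg', '.png')):
        return 'skip'
    return 'unchanged'


async def main() -> None:
    async with SitemapRequestLoader(
        ['https://crawlee.dev/sitemap.xml'],  # hypothetical sitemap URL
        http_client=HttpxHttpClient(),
        persist_state_key='sitemap-loader-state',  # enables resume after interruption
        transform_request_function=transform,
    ) as loader:
        while not await loader.is_finished():
            request = await loader.fetch_next_request()
            if request is None:
                break
            # ... process the request ...
            await loader.mark_request_as_handled(request)


asyncio.run(main())
```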
crawlee/router.py CHANGED
@@ -1,13 +1,17 @@
 from __future__ import annotations
 
+import asyncio
 from collections.abc import Awaitable, Callable
 from typing import Generic, TypeVar
 
+from crawlee._request import RequestState
 from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group
 
 __all__ = ['Router']
 
+from crawlee.errors import UserHandlerTimeoutError
+
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
 RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]
 
@@ -89,13 +93,19 @@ class Router(Generic[TCrawlingContext]):
 
     async def __call__(self, context: TCrawlingContext) -> None:
         """Invoke a request handler that matches the request label (or the default)."""
+        context.request.state = RequestState.REQUEST_HANDLER
         if context.request.label is None or context.request.label not in self._handlers_by_label:
             if self._default_handler is None:
                 raise RuntimeError(
                     f'No handler matches label `{context.request.label}` and no default handler is configured'
                 )
 
-            return await self._default_handler(context)
+            user_defined_handler = self._default_handler
+        else:
+            user_defined_handler = self._handlers_by_label[context.request.label]
 
-        handler = self._handlers_by_label[context.request.label]
-        return await handler(context)
+        try:
+            return await user_defined_handler(context)
+        except asyncio.TimeoutError as e:
+            # Timeout in handler, but not timeout of handler.
+            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
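
For orientation, the dispatch path changed above is normally driven through the `Router` decorator API. A small sketch, assuming the existing `default_handler` / `handler` decorators and reusing the same `BasicCrawlingContext` import as the hunk; such a router is then typically passed to a crawler as its `request_handler`.

```python
from crawlee._types import BasicCrawlingContext
from crawlee.router import Router

router = Router[BasicCrawlingContext]()


@router.default_handler
async def default_handler(context: BasicCrawlingContext) -> None:
    context.log.info(f'Default handler for {context.request.url}')


@router.handler('DETAIL')
async def detail_handler(context: BasicCrawlingContext) -> None:
    # With the change above, an asyncio.TimeoutError raised inside this body
    # (e.g. by asyncio.wait_for) is re-raised as UserHandlerTimeoutError, so it
    # is distinguishable from the crawler timing the handler itself out.
    context.log.info(f'Detail handler for {context.request.url}')
```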
crawlee/sessions/_cookies.py CHANGED
@@ -10,6 +10,7 @@ from crawlee._utils.docs import docs_group
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from typing import TypeGuard
 
 
 @docs_group('Session management')
@@ -66,17 +67,18 @@ class SessionCookies:
 
         self._jar = CookieJar()
 
-        if isinstance(cookies, dict):
-            for key, value in cookies.items():
-                self.set(key, value)
-
-        elif isinstance(cookies, list):
+        if isinstance(cookies, list):
             for item in cookies:
                 self.set(**item)
 
         elif isinstance(cookies, SessionCookies):
             for cookie in cookies.jar:
-                self.jar.set_cookie(cookie)
+                self._jar.set_cookie(cookie)
+
+        elif isinstance(cookies, dict):
+            cookies_dict: dict[str, str] = cookies
+            for key, value in cookies_dict.items():
+                self.set(key, value)
 
     @property
     def jar(self) -> CookieJar:
@@ -151,8 +153,8 @@ class SessionCookies:
         if cookie.expires:
             cookie_dict['expires'] = cookie.expires
 
-        if (same_site := cookie.get_nonstandard_attr('SameSite')) and same_site in {'Lax', 'None', 'Strict'}:
-            cookie_dict['same_site'] = same_site  # type: ignore[typeddict-item]
+        if (same_site := cookie.get_nonstandard_attr('SameSite')) and self._is_valid_same_site(same_site):
+            cookie_dict['same_site'] = same_site
 
         return cookie_dict
 
@@ -273,3 +275,6 @@ class SessionCookies:
         """Return hash based on the cookies key attributes."""
         cookie_tuples = frozenset((cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in self._jar)
         return hash(cookie_tuples)
+
+    def _is_valid_same_site(self, value: str | None) -> TypeGuard[Literal['Lax', 'None', 'Strict']]:
+        return value in {'Lax', 'None', 'Strict'}
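
The last hunk swaps an inline membership test plus `type: ignore` for a `TypeGuard` helper. A standalone sketch of the same pattern on Python 3.10+ (the `CookieParam` TypedDict and function names here are illustrative, not crawlee's own):

```python
from typing import Literal, TypedDict, TypeGuard

SameSite = Literal['Lax', 'None', 'Strict']


class CookieParam(TypedDict, total=False):
    name: str
    same_site: SameSite


def is_valid_same_site(value: str | None) -> TypeGuard[SameSite]:
    # Narrow an arbitrary string (e.g. a nonstandard cookie attribute) to the Literal type.
    return value in {'Lax', 'None', 'Strict'}


def to_param(name: str, same_site_attr: str | None) -> CookieParam:
    param: CookieParam = {'name': name}
    if same_site_attr and is_valid_same_site(same_site_attr):
        # The assignment now type-checks without a `type: ignore[typeddict-item]`.
        param['same_site'] = same_site_attr
    return param
```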
crawlee/sessions/_models.py CHANGED
@@ -20,7 +20,7 @@ from ._session import Session
 class SessionModel(BaseModel):
     """Model for a Session object."""
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     id: Annotated[str, Field(alias='id')]
     max_age: Annotated[timedelta, Field(alias='maxAge')]
@@ -38,7 +38,7 @@
 class SessionPoolModel(BaseModel):
     """Model for a SessionPool object."""
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     max_pool_size: Annotated[int, Field(alias='maxPoolSize')]
 
@@ -63,19 +63,19 @@ class SessionPoolModel(BaseModel):
         ),
     ]
 
-    @computed_field(alias='sessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='sessionCount')
     @property
     def session_count(self) -> int:
         """Get the total number of sessions currently maintained in the pool."""
         return len(self.sessions)
 
-    @computed_field(alias='usableSessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='usableSessionCount')
     @property
     def usable_session_count(self) -> int:
         """Get the number of sessions that are currently usable."""
         return len([session for _, session in self.sessions.items() if session.is_usable])
 
-    @computed_field(alias='retiredSessionCount')  # type: ignore[prop-decorator]
+    @computed_field(alias='retiredSessionCount')
    @property
    def retired_session_count(self) -> int:
        """Get the number of sessions that are no longer usable."""
crawlee/sessions/_session_pool.py CHANGED
@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.
 
-        This is intened only for the cases when you want to add a session that was created outside of the pool.
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.
 
         Args:
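
A short usage sketch for the method whose docstring is fixed above; the constructor arguments and the session id are illustrative, and the pool is used as an async context manager as in regular crawlee usage.

```python
import asyncio

from crawlee.sessions import Session, SessionPool


async def main() -> None:
    async with SessionPool(max_pool_size=20) as session_pool:
        # Add a session that was created outside of the pool.
        session_pool.add_session(Session(id='my-session'))
        session = await session_pool.get_session()
        print(session.id)


asyncio.run(main())
```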
crawlee/statistics/_error_snapshotter.py CHANGED
@@ -32,7 +32,7 @@ class ErrorSnapshotter:
     """Capture error snapshot and save it to key value store.
 
     It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-    it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler`
+    it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
     returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
     an exception.