crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlee might be problematic.

Files changed (102)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +35 -33
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +106 -34
  5. crawlee/_utils/context.py +2 -2
  6. crawlee/_utils/file.py +7 -0
  7. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  8. crawlee/_utils/recoverable_state.py +32 -8
  9. crawlee/_utils/recurring_task.py +17 -1
  10. crawlee/_utils/requests.py +0 -26
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +4 -2
  13. crawlee/_utils/system.py +3 -3
  14. crawlee/_utils/time.py +120 -0
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +4 -1
  17. crawlee/browsers/_playwright_browser_controller.py +21 -15
  18. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  19. crawlee/browsers/_types.py +1 -1
  20. crawlee/configuration.py +2 -0
  21. crawlee/crawlers/__init__.py +2 -1
  22. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  23. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
  24. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  25. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  28. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  29. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  30. crawlee/crawlers/_basic/_basic_crawler.py +219 -126
  31. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  32. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/events/_event_manager.py +4 -4
  39. crawlee/events/_types.py +6 -6
  40. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/fingerprint_suite/_types.py +2 -2
  43. crawlee/http_clients/_base.py +4 -0
  44. crawlee/http_clients/_curl_impersonate.py +12 -0
  45. crawlee/http_clients/_httpx.py +16 -6
  46. crawlee/http_clients/_impit.py +25 -10
  47. crawlee/otel/crawler_instrumentor.py +3 -3
  48. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  49. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  50. crawlee/request_loaders/_request_list.py +3 -3
  51. crawlee/request_loaders/_request_loader.py +5 -1
  52. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  53. crawlee/sessions/_models.py +2 -2
  54. crawlee/sessions/_session_pool.py +1 -1
  55. crawlee/statistics/_error_snapshotter.py +1 -1
  56. crawlee/statistics/_models.py +43 -4
  57. crawlee/statistics/_statistics.py +24 -33
  58. crawlee/storage_clients/__init__.py +16 -0
  59. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  60. crawlee/storage_clients/_base/_storage_client.py +13 -0
  61. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  62. crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
  63. crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
  64. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  65. crawlee/storage_clients/_file_system/_utils.py +0 -0
  66. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  67. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  68. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  69. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  70. crawlee/storage_clients/_redis/__init__.py +6 -0
  71. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  72. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  73. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  74. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  75. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  76. crawlee/storage_clients/_redis/_utils.py +23 -0
  77. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  78. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  79. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  80. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  81. crawlee/storage_clients/_redis/py.typed +0 -0
  82. crawlee/storage_clients/_sql/__init__.py +6 -0
  83. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  84. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  85. crawlee/storage_clients/_sql/_db_models.py +268 -0
  86. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  87. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  88. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  89. crawlee/storage_clients/_sql/py.typed +0 -0
  90. crawlee/storage_clients/models.py +13 -11
  91. crawlee/storages/_base.py +5 -1
  92. crawlee/storages/_dataset.py +12 -2
  93. crawlee/storages/_key_value_store.py +17 -4
  94. crawlee/storages/_request_queue.py +13 -5
  95. crawlee/storages/_storage_instance_manager.py +133 -71
  96. crawlee/storages/_utils.py +11 -0
  97. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
  98. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
  99. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  100. crawlee/_utils/measure_time.py +0 -31
  101. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  102. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
crawlee/http_clients/_impit.py

@@ -6,7 +6,7 @@ from logging import getLogger
 from typing import TYPE_CHECKING, Any, TypedDict

 from cachetools import LRUCache
-from impit import AsyncClient, Browser, HTTPError, Response, TransportError
+from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
 from impit import ProxyError as ImpitProxyError
 from typing_extensions import override

@@ -125,6 +125,7 @@ class ImpitHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

@@ -134,7 +135,10 @@ class ImpitHttpClient(HttpClient):
                 method=request.method,
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -157,6 +161,7 @@ class ImpitHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -165,8 +170,14 @@ class ImpitHttpClient(HttpClient):

         try:
             response = await client.request(
-                method=method, url=url, content=payload, headers=dict(headers) if headers else None
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -189,14 +200,18 @@ class ImpitHttpClient(HttpClient):
     ) -> AsyncGenerator[HttpResponse]:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

-        response = await client.request(
-            method=method,
-            url=url,
-            content=payload,
-            headers=dict(headers) if headers else None,
-            timeout=timeout.total_seconds() if timeout else None,
-            stream=True,
-        )
+        try:
+            response = await client.request(
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
+                stream=True,
+            )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
+
         try:
             yield _ImpitResponse(response)
         finally:
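Taken together, these hunks add an optional per-request timeout to the impit-based HTTP client and normalize impit's TimeoutException to asyncio.TimeoutError. A minimal usage sketch follows; the constructor call and the printed fields are illustrative assumptions, not part of the diff:

import asyncio
from datetime import timedelta

from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    client = ImpitHttpClient()  # assumed default construction
    try:
        # The diff adds a `timeout` keyword; impit's TimeoutException is re-raised
        # as asyncio.TimeoutError, so callers handle one standard exception type.
        response = await client.send_request('https://example.com', timeout=timedelta(seconds=10))
        print(response.status_code)
    except asyncio.TimeoutError:
        print('Request timed out')


if __name__ == '__main__':
    asyncio.run(main())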
crawlee/otel/crawler_instrumentor.py

@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

         if request_handling_instrumentation:

-            async def middlware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined] # valid in our context
                     attributes={
@@ -111,8 +111,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
            # Handpicked interesting methods to instrument
            self._instrumented.extend(
                [
-                    (_Middleware, 'action', middlware_wrapper),
-                    (_Middleware, 'cleanup', middlware_wrapper),
+                    (_Middleware, 'action', middleware_wrapper),
+                    (_Middleware, 'cleanup', middleware_wrapper),
                    (ContextPipeline, '__call__', context_pipeline_wrapper),
                    (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                    (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml

@@ -5,8 +5,8 @@
 # % endif
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
-# % elif cookiecutter.http_client == 'impit'
-# % do extras.append('impit')
+# % elif cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
 # % endif

 [project]
crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt

@@ -10,4 +10,7 @@ apify
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
 # % endif
+# % if cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
+# % endif
 crawlee[{{ extras | join(',') }}]
crawlee/request_loaders/_request_list.py

@@ -17,7 +17,7 @@ logger = getLogger(__name__)


 class RequestListState(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     next_index: Annotated[int, Field(alias='nextIndex')] = 0
     next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None
@@ -166,7 +166,7 @@ class RequestList(RequestLoader):
             return None

         state = await self._get_state()
-        state.in_progress.add(self._next[0].id)
+        state.in_progress.add(self._next[0].unique_key)
         self._assumed_total_count += 1

         next_request = self._next[0]
@@ -183,7 +183,7 @@ class RequestList(RequestLoader):
     async def mark_request_as_handled(self, request: Request) -> None:
         self._handled_count += 1
         state = await self._get_state()
-        state.in_progress.remove(request.id)
+        state.in_progress.remove(request.unique_key)

     async def _ensure_next_request(self) -> None:
         await self._get_state()
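Both RequestList hunks switch the persisted in-progress set from request.id to request.unique_key. A small illustration of why that key is stable across restarts; the default derivation of unique_key from the URL is an assumption about Request.from_url, not shown in this diff:

from crawlee import Request

req = Request.from_url('https://example.com/page')

# unique_key is derived deterministically from the URL by default (assumed behavior),
# so the same request keeps the same key after a restart, unlike a freshly generated id.
in_progress: set[str] = set()
in_progress.add(req.unique_key)      # mirrors RequestListState.in_progress on fetch
in_progress.remove(req.unique_key)   # mirrors mark_request_as_handled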
crawlee/request_loaders/_request_loader.py

@@ -43,7 +43,11 @@ class RequestLoader(ABC):

     @abstractmethod
     async def fetch_next_request(self) -> Request | None:
-        """Return the next request to be processed, or `null` if there are no more pending requests."""
+        """Return the next request to be processed, or `None` if there are no more pending requests.
+
+        The method should return `None` if and only if `is_finished` would return `True`. In other cases, the method
+        should wait until a request appears.
+        """

     @abstractmethod
     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
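The clarified contract above (return None if and only if is_finished would be True, otherwise wait) is what lets a consumer drain any loader with a plain loop. A minimal sketch, with process() standing in for user code:

from crawlee import Request
from crawlee.request_loaders import RequestLoader


async def process(request: Request) -> None:
    print(request.url)  # placeholder for real handling


async def drain(loader: RequestLoader) -> None:
    # Relies on the documented contract: None is only returned once the loader is finished,
    # so this loop cannot exit early while more requests are still being produced.
    while (request := await loader.fetch_next_request()) is not None:
        await process(request)
        await loader.mark_request_as_handled(request)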
crawlee/request_loaders/_sitemap_request_loader.py

@@ -1,20 +1,27 @@
 from __future__ import annotations

 import asyncio
+from collections import deque
 from contextlib import suppress
 from logging import getLogger
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Annotated, Any

-from crawlee import Request
+from pydantic import BaseModel, ConfigDict, Field
+from typing_extensions import override
+
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
-from crawlee._utils.sitemap import ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
+from crawlee._utils.recoverable_state import RecoverableState
+from crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
 from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
+    from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -23,12 +30,77 @@ if TYPE_CHECKING:
 logger = getLogger(__name__)


+class SitemapRequestLoaderState(BaseModel):
+    """State model for persisting sitemap request loader data.
+
+    The crawler processes one sitemap at a time. The current sitemap is stored in `in_progress_sitemap_url`.
+    The `parse_sitemap` function parses the sitemap and returns elements as an async iterator. Each element retrieved
+    from the iterator is processed based on its type. If the element is a `NestedSitemap`, its URL is added to
+    `pending_sitemap_urls` if it hasn't been processed yet (not in `processed_sitemap_urls`). If the element is a
+    `SitemapUrl`, the system checks whether it already exists in `current_sitemap_processed_urls`. If it exists,
+    the loader was restarted from a saved state and the URL is skipped.
+
+    If the URL is new, it is first added to `url_queue`, then to `current_sitemap_processed_urls`, and `total_count` is
+    incremented by 1. When all elements from the current sitemap iterator have been processed, `in_progress_sitemap_url`
+    is set to `None`, the sitemap URL is added to `processed_sitemap_urls`, and `current_sitemap_processed_urls` is
+    cleared. The next sitemap is retrieved from `pending_sitemap_urls`, skipping any URLs that already exist in
+    `processed_sitemap_urls`. If `pending_sitemap_urls` is empty, `completed` is set to `True`.
+
+    When `fetch_next_request` is called, a URL is extracted from `url_queue` and placed in `in_progress`.
+    When `mark_request_as_handled` is called for the extracted URL, it is removed from `in_progress` and
+    `handled_count` is incremented by 1.
+
+    During initial startup or restart after persistence, state validation occurs in `_get_state`. If both
+    `pending_sitemap_urls` and `in_progress_sitemap_url` are empty and `completed` is False, this indicates a
+    fresh start. In this case, `self._sitemap_urls` are moved to `pending_sitemap_urls`. Otherwise, the system is
+    restarting from a persisted state. If `in_progress` contains any URLs, they are moved back to `url_queue` and
+    `in_progress` is cleared.
+    """
+
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
+
+    url_queue: Annotated[deque[str], Field(alias='urlQueue')]
+    """Queue of URLs extracted from sitemaps and ready for processing."""
+
+    in_progress: Annotated[set[str], Field(alias='inProgress')] = set()
+    """Set of request URLs currently being processed."""
+
+    pending_sitemap_urls: Annotated[deque[str], Field(alias='pendingSitemapUrls')]
+    """Queue of sitemap URLs that need to be fetched and processed."""
+
+    in_progress_sitemap_url: Annotated[str | None, Field(alias='inProgressSitemapUrl')] = None
+    """The sitemap URL currently being processed."""
+
+    current_sitemap_processed_urls: Annotated[set[str], Field(alias='currentSitemapProcessedUrls')] = set()
+    """URLs from the current sitemap that have been added to the queue."""
+
+    processed_sitemap_urls: Annotated[set[str], Field(alias='processedSitemapUrls')] = set()
+    """Set of processed sitemap URLs."""
+
+    completed: Annotated[bool, Field(alias='sitemapCompleted')] = False
+    """Whether all sitemaps have been fully processed."""
+
+    total_count: Annotated[int, Field(alias='totalCount')] = 0
+    """Total number of URLs found and added to the queue from all processed sitemaps."""
+
+    handled_count: Annotated[int, Field(alias='handledCount')] = 0
+    """Number of URLs that have been successfully handled."""
+
+
 @docs_group('Request loaders')
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).

+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
+
+    The loader supports state persistence, allowing it to resume from where it left off
+    after interruption when a `persist_state_key` is provided during initialization.
     """

     def __init__(
@@ -40,7 +112,8 @@ class SitemapRequestLoader(RequestLoader):
         include: list[re.Pattern[Any] | Glob] | None = None,
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
-        parse_sitemap_options: ParseSitemapOptions | None = None,
+        persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.

@@ -50,27 +123,68 @@ class SitemapRequestLoader(RequestLoader):
             include: List of glob or regex patterns to include URLs.
             exclude: List of glob or regex patterns to exclude URLs.
             max_buffer_size: Maximum number of URLs to buffer in memory.
-            parse_sitemap_options: Options for parsing sitemaps, such as `SitemapSource` and `max_urls`.
             http_client: the instance of `HttpClient` to use for fetching sitemaps.
+            persist_state_key: A key for persisting the loader's state in the KeyValueStore.
+                When provided, allows resuming from where it left off after interruption.
+                If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
-
         self._sitemap_urls = sitemap_urls
         self._include = include
         self._exclude = exclude
         self._proxy_info = proxy_info
-        self._parse_sitemap_options = parse_sitemap_options or ParseSitemapOptions()
+        self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function
+
+        # Synchronization for queue operations
+        self._queue_has_capacity = asyncio.Event()
+        self._queue_has_capacity.set()
+        self._queue_lock = asyncio.Lock()
+
+        # Initialize recoverable state
+        self._state = RecoverableState(
+            default_state=SitemapRequestLoaderState(
+                url_queue=deque(),
+                pending_sitemap_urls=deque(),
+            ),
+            persistence_enabled=bool(persist_state_key),
+            persist_state_key=persist_state_key or '',
+            logger=logger,
+        )
+
+        # Start background loading
+        self._loading_task = asyncio.create_task(self._load_sitemaps())

-        self._handled_count = 0
-        self._total_count = 0
+    async def _get_state(self) -> SitemapRequestLoaderState:
+        """Initialize and return the current state."""
+        async with self._queue_lock:
+            if self._state.is_initialized:
+                return self._state.current_value

-        # URL queue and tracking
-        self._url_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=max_buffer_size)
-        self._in_progress: set[str] = set()
-        self._processed_urls: set[str] = set()
+            await self._state.initialize()

-        # Loading state
-        self._loading_task = asyncio.create_task(self._load_sitemaps())
+            # Initialize pending sitemaps on first run
+            has_sitemap_for_processing = (
+                self._state.current_value.pending_sitemap_urls or self._state.current_value.in_progress_sitemap_url
+            )
+            if not has_sitemap_for_processing and not self._state.current_value.completed:
+                self._state.current_value.pending_sitemap_urls.extend(self._sitemap_urls)
+
+            if self._state.current_value.in_progress:
+                self._state.current_value.url_queue.extendleft(self._state.current_value.in_progress)
+                self._state.current_value.in_progress.clear()
+
+            if (
+                self._state.current_value.url_queue
+                and len(self._state.current_value.url_queue) >= self._max_buffer_size
+            ):
+                # Notify that the queue is full
+                self._queue_has_capacity.clear()
+
+            return self._state.current_value

     def _check_url_patterns(
         self,
@@ -105,73 +219,157 @@ class SitemapRequestLoader(RequestLoader):
     async def _load_sitemaps(self) -> None:
         """Load URLs from sitemaps in the background."""
         try:
-            async for item in parse_sitemap(
-                [SitemapSource(type='url', url=url) for url in self._sitemap_urls],
-                self._http_client,
-                proxy_info=self._proxy_info,
-                options=self._parse_sitemap_options,
-            ):
-                # Only process URL items (not nested sitemaps)
-                if isinstance(item, SitemapUrl):
-                    url = item.loc
-
-                    # Skip if already processed
-                    if url in self._processed_urls:
+            # Get actual state
+            while (state := await self._get_state()) and (state.pending_sitemap_urls or state.in_progress_sitemap_url):
+                # Get sitemap URL for parsing
+                sitemap_url = state.in_progress_sitemap_url
+                if not sitemap_url:
+                    sitemap_url = state.pending_sitemap_urls.popleft()
+                    # Skip processed urls
+                    if sitemap_url in state.processed_sitemap_urls:
                         continue
-
-                    # Check if URL should be included
-                    if not self._check_url_patterns(url, self._include, self._exclude):
+                    state.in_progress_sitemap_url = sitemap_url
+
+                parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True)
+
+                async for item in parse_sitemap(
+                    [SitemapSource(type='url', url=sitemap_url)],
+                    self._http_client,
+                    proxy_info=self._proxy_info,
+                    options=parse_options,
+                ):
+                    if isinstance(item, NestedSitemap):
+                        # Add nested sitemap to queue
+                        if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
+                            state.pending_sitemap_urls.append(item.loc)
                         continue

-                    await self._url_queue.put(url)
-                    self._processed_urls.add(url)
-                    self._total_count += 1
+                    if isinstance(item, SitemapUrl):
+                        url = item.loc
+
+                        state = await self._get_state()
+
+                        # Skip if already processed
+                        if url in state.current_sitemap_processed_urls:
+                            continue
+
+                        # Check if URL should be included
+                        if not self._check_url_patterns(url, self._include, self._exclude):
+                            continue
+
+                        # Check if we have capacity in the queue
+                        await self._queue_has_capacity.wait()
+
+                        state = await self._get_state()
+                        async with self._queue_lock:
+                            state.url_queue.append(url)
+                            state.current_sitemap_processed_urls.add(url)
+                            state.total_count += 1
+                            if len(state.url_queue) >= self._max_buffer_size:
+                                # Notify that the queue is full
+                                self._queue_has_capacity.clear()
+
+                # Clear current sitemap after processing
+                state = await self._get_state()
+                current_sitemap_url = state.in_progress_sitemap_url
+                state.in_progress_sitemap_url = None
+                if current_sitemap_url:
+                    state.processed_sitemap_urls.add(current_sitemap_url)
+                state.current_sitemap_processed_urls.clear()
+
+            # Mark as completed after processing all sitemap urls
+            state.completed = True

         except Exception:
             logger.exception('Error loading sitemaps')
             raise

+    @override
     async def get_total_count(self) -> int:
         """Return the total number of URLs found so far."""
-        return self._total_count
+        state = await self._get_state()
+        return state.total_count

+    @override
+    async def get_handled_count(self) -> int:
+        """Return the number of URLs that have been handled."""
+        state = await self._get_state()
+        return state.handled_count
+
+    @override
     async def is_empty(self) -> bool:
         """Check if there are no more URLs to process."""
-        return self._url_queue.empty() and self._loading_task.done()
+        state = await self._get_state()
+        return not state.url_queue

+    @override
     async def is_finished(self) -> bool:
         """Check if all URLs have been processed."""
-        return self._url_queue.empty() and len(self._in_progress) == 0 and self._loading_task.done()
+        state = await self._get_state()
+        return not state.url_queue and len(state.in_progress) == 0 and self._loading_task.done()

+    @override
     async def fetch_next_request(self) -> Request | None:
         """Fetch the next request to process."""
-        while not (self._loading_task.done() and self._url_queue.empty()):
-            if self._url_queue.empty():
-                await asyncio.sleep(0.5)
+        while not (await self.is_finished()):
+            state = await self._get_state()
+            if not state.url_queue:
+                await asyncio.sleep(0.1)
                 continue

-            url = await self._url_queue.get()
+            async with self._queue_lock:
+                url = state.url_queue.popleft()
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
+                state.in_progress.add(request.url)
+                if len(state.url_queue) < self._max_buffer_size:
+                    self._queue_has_capacity.set()

-            request = Request.from_url(url)
-            self._in_progress.add(request.id)
             return request

         return None

+    @override
     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
         """Mark a request as successfully handled."""
-        if request.id in self._in_progress:
-            self._in_progress.remove(request.id)
-            self._handled_count += 1
+        state = await self._get_state()
+        if request.url in state.in_progress:
+            state.in_progress.remove(request.url)
+            state.handled_count += 1
         return None

-    async def get_handled_count(self) -> int:
-        """Return the number of handled requests."""
-        return self._handled_count
-
     async def abort_loading(self) -> None:
         """Abort the sitemap loading process."""
         if self._loading_task and not self._loading_task.done():
             self._loading_task.cancel()
             with suppress(asyncio.CancelledError):
                 await self._loading_task
+
+    async def start(self) -> None:
+        """Start the sitemap loading process."""
+        if self._loading_task and not self._loading_task.done():
+            return
+        self._loading_task = asyncio.create_task(self._load_sitemaps())
+
+    async def close(self) -> None:
+        """Close the request loader."""
+        await self.abort_loading()
+        await self._state.teardown()
+
+    async def __aenter__(self) -> SitemapRequestLoader:
+        """Enter the context manager."""
+        await self.start()
+        return self
+
+    async def __aexit__(
+        self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None
+    ) -> None:
+        """Exit the context manager."""
+        await self.close()
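The reworked SitemapRequestLoader adds state persistence via persist_state_key, a request transform hook, and an async context manager. A hedged usage sketch under those assumptions; keyword arguments are used because the full parameter order is not shown here, and the filtering function is illustrative:

import asyncio

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def skip_pdfs(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Skip binary assets, keep everything else as-is.
    if options['url'].endswith('.pdf'):
        return 'skip'
    return 'unchanged'


async def main() -> None:
    async with SitemapRequestLoader(
        ['https://example.com/sitemap.xml'],
        http_client=ImpitHttpClient(),
        persist_state_key='sitemap-loader-state',  # enables resume after interruption
        transform_request_function=skip_pdfs,
    ) as loader:
        while (request := await loader.fetch_next_request()) is not None:
            print(request.url)
            await loader.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())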
crawlee/sessions/_models.py

@@ -20,7 +20,7 @@ from ._session import Session
 class SessionModel(BaseModel):
     """Model for a Session object."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     id: Annotated[str, Field(alias='id')]
     max_age: Annotated[timedelta, Field(alias='maxAge')]
@@ -38,7 +38,7 @@
 class SessionPoolModel(BaseModel):
     """Model for a SessionPool object."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     max_pool_size: Annotated[int, Field(alias='maxPoolSize')]

crawlee/sessions/_session_pool.py

@@ -163,7 +163,7 @@ class SessionPool:
    def add_session(self, session: Session) -> None:
        """Add an externally created session to the pool.

-        This is intened only for the cases when you want to add a session that was created outside of the pool.
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
        Otherwise, the pool will create new sessions automatically.

        Args:
crawlee/statistics/_error_snapshotter.py

@@ -32,7 +32,7 @@ class ErrorSnapshotter:
        """Capture error snapshot and save it to key value store.

        It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler`
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
        returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
        an exception.
