crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +35 -33
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +106 -34
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +17 -1
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +4 -2
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +219 -126
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/events/_event_manager.py +4 -4
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +43 -4
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
- crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +13 -11
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
crawlee/http_clients/_impit.py
CHANGED
@@ -6,7 +6,7 @@ from logging import getLogger
 from typing import TYPE_CHECKING, Any, TypedDict
 
 from cachetools import LRUCache
-from impit import AsyncClient, Browser, HTTPError, Response, TransportError
+from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
 from impit import ProxyError as ImpitProxyError
 from typing_extensions import override
 
@@ -125,6 +125,7 @@ class ImpitHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)
 
@@ -134,7 +135,10 @@ class ImpitHttpClient(HttpClient):
                 method=request.method,
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -157,6 +161,7 @@ class ImpitHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -165,8 +170,14 @@ class ImpitHttpClient(HttpClient):
 
         try:
             response = await client.request(
-                method=method,
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -189,14 +200,18 @@ class ImpitHttpClient(HttpClient):
     ) -> AsyncGenerator[HttpResponse]:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)
 
-
-
-
-
-
-
-
-
+        try:
+            response = await client.request(
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
+                stream=True,
+            )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
+
         try:
             yield _ImpitResponse(response)
         finally:
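To illustrate the new `timeout` parameter on `crawl` and `send_request`, here is a minimal usage sketch. It is an assumed usage example rather than anything shipped with the package: it presumes `ImpitHttpClient` is exported from `crawlee.http_clients` and that the keyword-only `timeout` added above is accepted; the diff shows the value is converted with `timeout.total_seconds()` and that impit's `TimeoutException` is re-raised as `asyncio.TimeoutError`.

```python
# Hedged sketch of the timeout behaviour added in this release (assumed usage).
import asyncio
from datetime import timedelta

from crawlee.http_clients import ImpitHttpClient


async def main() -> None:
    client = ImpitHttpClient()
    try:
        response = await client.send_request(
            'https://crawlee.dev',
            timeout=timedelta(seconds=5),  # converted internally via total_seconds()
        )
        print(response.status_code)
    except asyncio.TimeoutError:
        print('Request did not finish within 5 seconds')


if __name__ == '__main__':
    asyncio.run(main())
```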
crawlee/otel/crawler_instrumentor.py
CHANGED
@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):
 
         if request_handling_instrumentation:
 
-            async def
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}', # type:ignore[attr-defined] # valid in our context
                     attributes={
@@ -111,8 +111,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
             # Handpicked interesting methods to instrument
             self._instrumented.extend(
                 [
-                    (_Middleware, 'action',
-                    (_Middleware, 'cleanup',
+                    (_Middleware, 'action', middleware_wrapper),
+                    (_Middleware, 'cleanup', middleware_wrapper),
                     (ContextPipeline, '__call__', context_pipeline_wrapper),
                     (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                     (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml
CHANGED
@@ -5,8 +5,8 @@
 # % endif
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
-# % elif cookiecutter.http_client == '
-# % do extras.append('
+# % elif cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
 # % endif
 
 [project]
crawlee/request_loaders/_request_list.py
CHANGED
@@ -17,7 +17,7 @@ logger = getLogger(__name__)
 
 
 class RequestListState(BaseModel):
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     next_index: Annotated[int, Field(alias='nextIndex')] = 0
     next_unique_key: Annotated[str | None, Field(alias='nextUniqueKey')] = None
@@ -166,7 +166,7 @@ class RequestList(RequestLoader):
             return None
 
         state = await self._get_state()
-        state.in_progress.add(self._next[0].
+        state.in_progress.add(self._next[0].unique_key)
         self._assumed_total_count += 1
 
         next_request = self._next[0]
@@ -183,7 +183,7 @@ class RequestList(RequestLoader):
     async def mark_request_as_handled(self, request: Request) -> None:
         self._handled_count += 1
         state = await self._get_state()
-        state.in_progress.remove(request.
+        state.in_progress.remove(request.unique_key)
 
     async def _ensure_next_request(self) -> None:
         await self._get_state()
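The `ConfigDict(validate_by_name=True, validate_by_alias=True)` change above (and the identical change to the session models further down) switches the state models to the newer Pydantic 2.11+ validation flags. A standalone illustration of the effect, not taken from the package itself:

```python
# Illustration only: with both flags enabled, either the Python field name or
# the camelCase alias is accepted during validation (requires Pydantic 2.11+).
from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class State(BaseModel):
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    next_index: Annotated[int, Field(alias='nextIndex')] = 0


print(State.model_validate({'nextIndex': 3}).next_index)   # 3 (by alias)
print(State.model_validate({'next_index': 5}).next_index)  # 5 (by name)
```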
crawlee/request_loaders/_request_loader.py
CHANGED
@@ -43,7 +43,11 @@ class RequestLoader(ABC):
 
     @abstractmethod
     async def fetch_next_request(self) -> Request | None:
-        """Return the next request to be processed, or `
+        """Return the next request to be processed, or `None` if there are no more pending requests.
+
+        The method should return `None` if and only if `is_finished` would return `True`. In other cases, the method
+        should wait until a request appears.
+        """
 
     @abstractmethod
     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
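The expanded docstring pins down the contract between `fetch_next_request` and `is_finished`. A rough consumer sketch of what that contract guarantees, using only the methods shown in this diff (the `drain` helper itself is hypothetical):

```python
# Hypothetical consumer loop: because fetch_next_request returns None only when
# is_finished() would be True, this loop cannot spin forever on a loader that is
# merely empty at the moment but not yet finished.
from crawlee import Request
from crawlee.request_loaders import RequestLoader


async def drain(loader: RequestLoader) -> None:
    while not await loader.is_finished():
        request: Request | None = await loader.fetch_next_request()
        if request is None:
            # Per the contract above, the loader has nothing left to give.
            break
        print(f'Processing {request.url}')
        await loader.mark_request_as_handled(request)
```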
crawlee/request_loaders/_sitemap_request_loader.py
CHANGED
@@ -1,20 +1,27 @@
 from __future__ import annotations
 
 import asyncio
+from collections import deque
 from contextlib import suppress
 from logging import getLogger
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Annotated, Any
 
-from
+from pydantic import BaseModel, ConfigDict, Field
+from typing_extensions import override
+
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
-from crawlee._utils.
+from crawlee._utils.recoverable_state import RecoverableState
+from crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
 from crawlee.request_loaders._request_loader import RequestLoader
 
 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
+    from types import TracebackType
 
+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -23,12 +30,77 @@ if TYPE_CHECKING:
 logger = getLogger(__name__)
 
 
+class SitemapRequestLoaderState(BaseModel):
+    """State model for persisting sitemap request loader data.
+
+    The crawler processes one sitemap at a time. The current sitemap is stored in `in_progress_sitemap_url`.
+    The `parse_sitemap` function parses the sitemap and returns elements as an async iterator. Each element retrieved
+    from the iterator is processed based on its type. If the element is a `NestedSitemap`, its URL is added to
+    `pending_sitemap_urls` if it hasn't been processed yet (not in `processed_sitemap_urls`). If the element is a
+    `SitemapUrl`, the system checks whether it already exists in `current_sitemap_processed_urls`. If it exists,
+    the loader was restarted from a saved state and the URL is skipped.
+
+    If the URL is new, it is first added to `url_queue`, then to `current_sitemap_processed_urls`, and `total_count` is
+    incremented by 1. When all elements from the current sitemap iterator have been processed, `in_progress_sitemap_url`
+    is set to `None`, the sitemap URL is added to `processed_sitemap_urls`, and `current_sitemap_processed_urls` is
+    cleared. The next sitemap is retrieved from `pending_sitemap_urls`, skipping any URLs that already exist in
+    `processed_sitemap_urls`. If `pending_sitemap_urls` is empty, `completed` is set to `True`.
+
+    When `fetch_next_request` is called, a URL is extracted from `url_queue` and placed in `in_progress`.
+    When `mark_request_as_handled` is called for the extracted URL, it is removed from `in_progress` and
+    `handled_count` is incremented by 1.
+
+    During initial startup or restart after persistence, state validation occurs in `_get_state`. If both
+    `pending_sitemap_urls` and `in_progress_sitemap_url` are empty and `completed` is False, this indicates a
+    fresh start. In this case, `self._sitemap_urls` are moved to `pending_sitemap_urls`. Otherwise, the system is
+    restarting from a persisted state. If `in_progress` contains any URLs, they are moved back to `url_queue` and
+    `in_progress` is cleared.
+    """
+
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
+
+    url_queue: Annotated[deque[str], Field(alias='urlQueue')]
+    """Queue of URLs extracted from sitemaps and ready for processing."""
+
+    in_progress: Annotated[set[str], Field(alias='inProgress')] = set()
+    """Set of request URLs currently being processed."""
+
+    pending_sitemap_urls: Annotated[deque[str], Field(alias='pendingSitemapUrls')]
+    """Queue of sitemap URLs that need to be fetched and processed."""
+
+    in_progress_sitemap_url: Annotated[str | None, Field(alias='inProgressSitemapUrl')] = None
+    """The sitemap URL currently being processed."""
+
+    current_sitemap_processed_urls: Annotated[set[str], Field(alias='currentSitemapProcessedUrls')] = set()
+    """URLs from the current sitemap that have been added to the queue."""
+
+    processed_sitemap_urls: Annotated[set[str], Field(alias='processedSitemapUrls')] = set()
+    """Set of processed sitemap URLs."""
+
+    completed: Annotated[bool, Field(alias='sitemapCompleted')] = False
+    """Whether all sitemaps have been fully processed."""
+
+    total_count: Annotated[int, Field(alias='totalCount')] = 0
+    """Total number of URLs found and added to the queue from all processed sitemaps."""
+
+    handled_count: Annotated[int, Field(alias='handledCount')] = 0
+    """Number of URLs that have been successfully handled."""
+
+
 @docs_group('Request loaders')
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).
 
+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
+
+    The loader supports state persistence, allowing it to resume from where it left off
+    after interruption when a `persist_state_key` is provided during initialization.
     """
 
     def __init__(
@@ -40,7 +112,8 @@ class SitemapRequestLoader(RequestLoader):
         include: list[re.Pattern[Any] | Glob] | None = None,
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
-
+        persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.
 
@@ -50,27 +123,68 @@ class SitemapRequestLoader(RequestLoader):
             include: List of glob or regex patterns to include URLs.
             exclude: List of glob or regex patterns to exclude URLs.
             max_buffer_size: Maximum number of URLs to buffer in memory.
-            parse_sitemap_options: Options for parsing sitemaps, such as `SitemapSource` and `max_urls`.
             http_client: the instance of `HttpClient` to use for fetching sitemaps.
+            persist_state_key: A key for persisting the loader's state in the KeyValueStore.
+                When provided, allows resuming from where it left off after interruption.
+                If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
-
         self._sitemap_urls = sitemap_urls
         self._include = include
         self._exclude = exclude
         self._proxy_info = proxy_info
-        self.
+        self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function
+
+        # Synchronization for queue operations
+        self._queue_has_capacity = asyncio.Event()
+        self._queue_has_capacity.set()
+        self._queue_lock = asyncio.Lock()
+
+        # Initialize recoverable state
+        self._state = RecoverableState(
+            default_state=SitemapRequestLoaderState(
+                url_queue=deque(),
+                pending_sitemap_urls=deque(),
+            ),
+            persistence_enabled=bool(persist_state_key),
+            persist_state_key=persist_state_key or '',
+            logger=logger,
+        )
+
+        # Start background loading
+        self._loading_task = asyncio.create_task(self._load_sitemaps())
 
-
-
+    async def _get_state(self) -> SitemapRequestLoaderState:
+        """Initialize and return the current state."""
+        async with self._queue_lock:
+            if self._state.is_initialized:
+                return self._state.current_value
 
-
-        self._url_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=max_buffer_size)
-        self._in_progress: set[str] = set()
-        self._processed_urls: set[str] = set()
+            await self._state.initialize()
 
-
-
+            # Initialize pending sitemaps on first run
+            has_sitemap_for_processing = (
+                self._state.current_value.pending_sitemap_urls or self._state.current_value.in_progress_sitemap_url
+            )
+            if not has_sitemap_for_processing and not self._state.current_value.completed:
+                self._state.current_value.pending_sitemap_urls.extend(self._sitemap_urls)
+
+            if self._state.current_value.in_progress:
+                self._state.current_value.url_queue.extendleft(self._state.current_value.in_progress)
+                self._state.current_value.in_progress.clear()
+
+            if (
+                self._state.current_value.url_queue
+                and len(self._state.current_value.url_queue) >= self._max_buffer_size
+            ):
+                # Notify that the queue is full
+                self._queue_has_capacity.clear()
+
+            return self._state.current_value
 
     def _check_url_patterns(
         self,
@@ -105,73 +219,157 @@ class SitemapRequestLoader(RequestLoader):
     async def _load_sitemaps(self) -> None:
         """Load URLs from sitemaps in the background."""
         try:
-
-
-
-
-
-
-
-
-                        url = item.loc
-
-                        # Skip if already processed
-                        if url in self._processed_urls:
+            # Get actual state
+            while (state := await self._get_state()) and (state.pending_sitemap_urls or state.in_progress_sitemap_url):
+                # Get sitemap URL for parsing
+                sitemap_url = state.in_progress_sitemap_url
+                if not sitemap_url:
+                    sitemap_url = state.pending_sitemap_urls.popleft()
+                    # Skip processed urls
+                    if sitemap_url in state.processed_sitemap_urls:
                         continue
-
-
-
+                state.in_progress_sitemap_url = sitemap_url
+
+                parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True)
+
+                async for item in parse_sitemap(
+                    [SitemapSource(type='url', url=sitemap_url)],
+                    self._http_client,
+                    proxy_info=self._proxy_info,
+                    options=parse_options,
+                ):
+                    if isinstance(item, NestedSitemap):
+                        # Add nested sitemap to queue
+                        if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
+                            state.pending_sitemap_urls.append(item.loc)
                         continue
 
-
-
-
+                    if isinstance(item, SitemapUrl):
+                        url = item.loc
+
+                        state = await self._get_state()
+
+                        # Skip if already processed
+                        if url in state.current_sitemap_processed_urls:
+                            continue
+
+                        # Check if URL should be included
+                        if not self._check_url_patterns(url, self._include, self._exclude):
+                            continue
+
+                        # Check if we have capacity in the queue
+                        await self._queue_has_capacity.wait()
+
+                        state = await self._get_state()
+                        async with self._queue_lock:
+                            state.url_queue.append(url)
+                            state.current_sitemap_processed_urls.add(url)
+                            state.total_count += 1
+                            if len(state.url_queue) >= self._max_buffer_size:
+                                # Notify that the queue is full
+                                self._queue_has_capacity.clear()
+
+                # Clear current sitemap after processing
+                state = await self._get_state()
+                current_sitemap_url = state.in_progress_sitemap_url
+                state.in_progress_sitemap_url = None
+                if current_sitemap_url:
+                    state.processed_sitemap_urls.add(current_sitemap_url)
+                state.current_sitemap_processed_urls.clear()
+
+            # Mark as completed after processing all sitemap urls
+            state.completed = True
 
         except Exception:
             logger.exception('Error loading sitemaps')
             raise
 
+    @override
     async def get_total_count(self) -> int:
         """Return the total number of URLs found so far."""
-
+        state = await self._get_state()
+        return state.total_count
 
+    @override
+    async def get_handled_count(self) -> int:
+        """Return the number of URLs that have been handled."""
+        state = await self._get_state()
+        return state.handled_count
+
+    @override
     async def is_empty(self) -> bool:
         """Check if there are no more URLs to process."""
-
+        state = await self._get_state()
+        return not state.url_queue
 
+    @override
     async def is_finished(self) -> bool:
         """Check if all URLs have been processed."""
-
+        state = await self._get_state()
+        return not state.url_queue and len(state.in_progress) == 0 and self._loading_task.done()
 
+    @override
     async def fetch_next_request(self) -> Request | None:
         """Fetch the next request to process."""
-        while not (
-
-
+        while not (await self.is_finished()):
+            state = await self._get_state()
+            if not state.url_queue:
+                await asyncio.sleep(0.1)
                 continue
 
-
+            async with self._queue_lock:
+                url = state.url_queue.popleft()
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
+                state.in_progress.add(request.url)
+                if len(state.url_queue) < self._max_buffer_size:
+                    self._queue_has_capacity.set()
 
-            request = Request.from_url(url)
-            self._in_progress.add(request.id)
                 return request
 
         return None
 
+    @override
     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
         """Mark a request as successfully handled."""
-
-
-
+        state = await self._get_state()
+        if request.url in state.in_progress:
+            state.in_progress.remove(request.url)
+            state.handled_count += 1
         return None
 
-    async def get_handled_count(self) -> int:
-        """Return the number of handled requests."""
-        return self._handled_count
-
     async def abort_loading(self) -> None:
         """Abort the sitemap loading process."""
         if self._loading_task and not self._loading_task.done():
             self._loading_task.cancel()
             with suppress(asyncio.CancelledError):
                 await self._loading_task
+
+    async def start(self) -> None:
+        """Start the sitemap loading process."""
+        if self._loading_task and not self._loading_task.done():
+            return
+        self._loading_task = asyncio.create_task(self._load_sitemaps())
+
+    async def close(self) -> None:
+        """Close the request loader."""
+        await self.abort_loading()
+        await self._state.teardown()
+
+    async def __aenter__(self) -> SitemapRequestLoader:
+        """Enter the context manager."""
+        await self.start()
+        return self
+
+    async def __aexit__(
+        self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None
+    ) -> None:
        """Exit the context manager."""
+        await self.close()
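A hedged end-to-end sketch of the new loader surface added above (async context manager, `persist_state_key`, `transform_request_function`). Argument names follow the constructor in the diff, but the exact call shape is an assumption, not documented usage:

```python
# Assumed usage of the APIs introduced above: context-manager lifecycle, state
# persisted under `persist_state_key`, and a transform hook returning modified
# RequestOptions or a RequestTransformAction such as 'skip' / 'unchanged'.
from __future__ import annotations

import asyncio

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop PDF links, keep everything else as-is.
    return 'skip' if options['url'].endswith('.pdf') else 'unchanged'


async def main() -> None:
    async with SitemapRequestLoader(
        ['https://crawlee.dev/sitemap.xml'],
        http_client=ImpitHttpClient(),
        persist_state_key='sitemap-loader-state',  # enables resuming after interruption
        transform_request_function=transform,
    ) as loader:
        while not await loader.is_finished():
            request = await loader.fetch_next_request()
            if request is None:
                break
            print(request.url)
            await loader.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())
```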
crawlee/sessions/_models.py
CHANGED
@@ -20,7 +20,7 @@ from ._session import Session
 class SessionModel(BaseModel):
     """Model for a Session object."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     id: Annotated[str, Field(alias='id')]
     max_age: Annotated[timedelta, Field(alias='maxAge')]
@@ -38,7 +38,7 @@ class SessionModel(BaseModel):
 class SessionPoolModel(BaseModel):
     """Model for a SessionPool object."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     max_pool_size: Annotated[int, Field(alias='maxPoolSize')]
 
crawlee/sessions/_session_pool.py
CHANGED
@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.
 
-        This is
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.
 
         Args:
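For context, a brief hedged example of the workflow the clarified `add_session` docstring describes; the `Session` constructor argument is assumed from the broader crawlee API rather than shown in this diff:

```python
# Assumed usage: pre-seed a SessionPool with a session created outside the pool
# (e.g. one carrying cookies obtained elsewhere); the pool continues to create
# its own sessions automatically.
from crawlee.sessions import Session, SessionPool

pool = SessionPool()
external_session = Session(id='login-session')  # hypothetical externally created session
pool.add_session(external_session)
```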
crawlee/statistics/_error_snapshotter.py
CHANGED
@@ -32,7 +32,7 @@ class ErrorSnapshotter:
         """Capture error snapshot and save it to key value store.
 
         It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
         returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
         an exception.
 