crawlee 1.0.0rc1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +2 -1
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +76 -17
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/sitemap.py +3 -1
- crawlee/_utils/system.py +3 -3
- crawlee/browsers/_playwright_browser_controller.py +20 -14
- crawlee/configuration.py +1 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +6 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +107 -27
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +6 -1
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +1 -1
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +228 -48
- crawlee/sessions/_models.py +2 -2
- crawlee/statistics/_models.py +1 -1
- crawlee/storage_clients/__init__.py +12 -0
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +27 -25
- crawlee/storage_clients/_file_system/_key_value_store_client.py +27 -23
- crawlee/storage_clients/_file_system/_request_queue_client.py +84 -98
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +14 -2
- crawlee/storage_clients/_memory/_key_value_store_client.py +14 -2
- crawlee/storage_clients/_memory/_request_queue_client.py +43 -12
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +269 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +299 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +706 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +9 -2
- crawlee/storages/_key_value_store.py +9 -2
- crawlee/storages/_request_queue.py +7 -2
- crawlee/storages/_storage_instance_manager.py +126 -72
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/METADATA +12 -5
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/RECORD +59 -49
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/WHEEL +0 -0
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/licenses/LICENSE +0 -0

crawlee/request_loaders/_sitemap_request_loader.py
CHANGED

```diff
@@ -1,19 +1,25 @@
 from __future__ import annotations
 
 import asyncio
+from collections import deque
 from contextlib import suppress
 from logging import getLogger
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Annotated, Any
+
+from pydantic import BaseModel, ConfigDict, Field
+from typing_extensions import override
 
 from crawlee import Request
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
-from crawlee._utils.
+from crawlee._utils.recoverable_state import RecoverableState
+from crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
 from crawlee.request_loaders._request_loader import RequestLoader
 
 if TYPE_CHECKING:
     import re
     from collections.abc import Sequence
+    from types import TracebackType
 
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
@@ -23,12 +29,72 @@ if TYPE_CHECKING:
 logger = getLogger(__name__)
 
 
+class SitemapRequestLoaderState(BaseModel):
+    """State model for persisting sitemap request loader data.
+
+    The crawler processes one sitemap at a time. The current sitemap is stored in `in_progress_sitemap_url`.
+    The `parse_sitemap` function parses the sitemap and returns elements as an async iterator. Each element retrieved
+    from the iterator is processed based on its type. If the element is a `NestedSitemap`, its URL is added to
+    `pending_sitemap_urls` if it hasn't been processed yet (not in `processed_sitemap_urls`). If the element is a
+    `SitemapUrl`, the system checks whether it already exists in `current_sitemap_processed_urls`. If it exists,
+    the loader was restarted from a saved state and the URL is skipped.
+
+    If the URL is new, it is first added to `url_queue`, then to `current_sitemap_processed_urls`, and `total_count` is
+    incremented by 1. When all elements from the current sitemap iterator have been processed, `in_progress_sitemap_url`
+    is set to `None`, the sitemap URL is added to `processed_sitemap_urls`, and `current_sitemap_processed_urls` is
+    cleared. The next sitemap is retrieved from `pending_sitemap_urls`, skipping any URLs that already exist in
+    `processed_sitemap_urls`. If `pending_sitemap_urls` is empty, `completed` is set to `True`.
+
+    When `fetch_next_request` is called, a URL is extracted from `url_queue` and placed in `in_progress`.
+    When `mark_request_as_handled` is called for the extracted URL, it is removed from `in_progress` and
+    `handled_count` is incremented by 1.
+
+    During initial startup or restart after persistence, state validation occurs in `_get_state`. If both
+    `pending_sitemap_urls` and `in_progress_sitemap_url` are empty and `completed` is False, this indicates a
+    fresh start. In this case, `self._sitemap_urls` are moved to `pending_sitemap_urls`. Otherwise, the system is
+    restarting from a persisted state. If `in_progress` contains any URLs, they are moved back to `url_queue` and
+    `in_progress` is cleared.
+    """
+
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
+
+    url_queue: Annotated[deque[str], Field(alias='urlQueue')]
+    """Queue of URLs extracted from sitemaps and ready for processing."""
+
+    in_progress: Annotated[set[str], Field(alias='inProgress')] = set()
+    """Set of request URLs currently being processed."""
+
+    pending_sitemap_urls: Annotated[deque[str], Field(alias='pendingSitemapUrls')]
+    """Queue of sitemap URLs that need to be fetched and processed."""
+
+    in_progress_sitemap_url: Annotated[str | None, Field(alias='inProgressSitemapUrl')] = None
+    """The sitemap URL currently being processed."""
+
+    current_sitemap_processed_urls: Annotated[set[str], Field(alias='currentSitemapProcessedUrls')] = set()
+    """URLs from the current sitemap that have been added to the queue."""
+
+    processed_sitemap_urls: Annotated[set[str], Field(alias='processedSitemapUrls')] = set()
+    """Set of processed sitemap URLs."""
+
+    completed: Annotated[bool, Field(alias='sitemapCompleted')] = False
+    """Whether all sitemaps have been fully processed."""
+
+    total_count: Annotated[int, Field(alias='totalCount')] = 0
+    """Total number of URLs found and added to the queue from all processed sitemaps."""
+
+    handled_count: Annotated[int, Field(alias='handledCount')] = 0
+    """Number of URLs that have been successfully handled."""
+
+
 @docs_group('Request loaders')
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).
 
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
+
+    The loader supports state persistence, allowing it to resume from where it left off
+    after interruption when a `persist_state_key` is provided during initialization.
     """
 
     def __init__(
@@ -40,7 +106,7 @@ class SitemapRequestLoader(RequestLoader):
         include: list[re.Pattern[Any] | Glob] | None = None,
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
-
+        persist_state_key: str | None = None,
     ) -> None:
         """Initialize the sitemap request loader.
 
@@ -50,27 +116,64 @@ class SitemapRequestLoader(RequestLoader):
             include: List of glob or regex patterns to include URLs.
             exclude: List of glob or regex patterns to exclude URLs.
             max_buffer_size: Maximum number of URLs to buffer in memory.
-            parse_sitemap_options: Options for parsing sitemaps, such as `SitemapSource` and `max_urls`.
             http_client: the instance of `HttpClient` to use for fetching sitemaps.
+            persist_state_key: A key for persisting the loader's state in the KeyValueStore.
+                When provided, allows resuming from where it left off after interruption.
+                If None, no state persistence occurs.
         """
         self._http_client = http_client
-
         self._sitemap_urls = sitemap_urls
         self._include = include
         self._exclude = exclude
         self._proxy_info = proxy_info
-        self.
+        self._max_buffer_size = max_buffer_size
+
+        # Synchronization for queue operations
+        self._queue_has_capacity = asyncio.Event()
+        self._queue_has_capacity.set()
+        self._queue_lock = asyncio.Lock()
+
+        # Initialize recoverable state
+        self._state = RecoverableState(
+            default_state=SitemapRequestLoaderState(
+                url_queue=deque(),
+                pending_sitemap_urls=deque(),
+            ),
+            persistence_enabled=bool(persist_state_key),
+            persist_state_key=persist_state_key or '',
+            logger=logger,
+        )
+
+        # Start background loading
+        self._loading_task = asyncio.create_task(self._load_sitemaps())
 
-
-
+    async def _get_state(self) -> SitemapRequestLoaderState:
+        """Initialize and return the current state."""
+        async with self._queue_lock:
+            if self._state.is_initialized:
+                return self._state.current_value
 
-
-        self._url_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=max_buffer_size)
-        self._in_progress: set[str] = set()
-        self._processed_urls: set[str] = set()
+            await self._state.initialize()
 
-
-
+            # Initialize pending sitemaps on first run
+            has_sitemap_for_processing = (
+                self._state.current_value.pending_sitemap_urls or self._state.current_value.in_progress_sitemap_url
+            )
+            if not has_sitemap_for_processing and not self._state.current_value.completed:
+                self._state.current_value.pending_sitemap_urls.extend(self._sitemap_urls)
+
+            if self._state.current_value.in_progress:
+                self._state.current_value.url_queue.extendleft(self._state.current_value.in_progress)
+                self._state.current_value.in_progress.clear()
+
+            if (
+                self._state.current_value.url_queue
+                and len(self._state.current_value.url_queue) >= self._max_buffer_size
+            ):
+                # Notify that the queue is full
+                self._queue_has_capacity.clear()
+
+            return self._state.current_value
 
     def _check_url_patterns(
         self,
@@ -105,73 +208,150 @@ class SitemapRequestLoader(RequestLoader):
     async def _load_sitemaps(self) -> None:
         """Load URLs from sitemaps in the background."""
         try:
-
-
-
-
-
-
-
-
-                    url = item.loc
-
-                    # Skip if already processed
-                    if url in self._processed_urls:
+            # Get actual state
+            while (state := await self._get_state()) and (state.pending_sitemap_urls or state.in_progress_sitemap_url):
+                # Get sitemap URL for parsing
+                sitemap_url = state.in_progress_sitemap_url
+                if not sitemap_url:
+                    sitemap_url = state.pending_sitemap_urls.popleft()
+                    # Skip processed urls
+                    if sitemap_url in state.processed_sitemap_urls:
                         continue
-
-
-
+                state.in_progress_sitemap_url = sitemap_url
+
+                parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True)
+
+                async for item in parse_sitemap(
+                    [SitemapSource(type='url', url=sitemap_url)],
+                    self._http_client,
+                    proxy_info=self._proxy_info,
+                    options=parse_options,
+                ):
+                    if isinstance(item, NestedSitemap):
+                        # Add nested sitemap to queue
+                        if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
+                            state.pending_sitemap_urls.append(item.loc)
                        continue
 
-
-
-
+                    if isinstance(item, SitemapUrl):
+                        url = item.loc
+
+                        state = await self._get_state()
+
+                        # Skip if already processed
+                        if url in state.current_sitemap_processed_urls:
+                            continue
+
+                        # Check if URL should be included
+                        if not self._check_url_patterns(url, self._include, self._exclude):
+                            continue
+
+                        # Check if we have capacity in the queue
+                        await self._queue_has_capacity.wait()
+
+                        state = await self._get_state()
+                        async with self._queue_lock:
+                            state.url_queue.append(url)
+                            state.current_sitemap_processed_urls.add(url)
+                            state.total_count += 1
+                            if len(state.url_queue) >= self._max_buffer_size:
+                                # Notify that the queue is full
+                                self._queue_has_capacity.clear()
+
+                # Clear current sitemap after processing
+                state = await self._get_state()
+                current_sitemap_url = state.in_progress_sitemap_url
+                state.in_progress_sitemap_url = None
+                if current_sitemap_url:
+                    state.processed_sitemap_urls.add(current_sitemap_url)
+                    state.current_sitemap_processed_urls.clear()
+
+            # Mark as completed after processing all sitemap urls
+            state.completed = True
 
         except Exception:
             logger.exception('Error loading sitemaps')
             raise
 
+    @override
     async def get_total_count(self) -> int:
         """Return the total number of URLs found so far."""
-
+        state = await self._get_state()
+        return state.total_count
 
+    @override
+    async def get_handled_count(self) -> int:
+        """Return the number of URLs that have been handled."""
+        state = await self._get_state()
+        return state.handled_count
+
+    @override
     async def is_empty(self) -> bool:
         """Check if there are no more URLs to process."""
-
+        state = await self._get_state()
+        return not state.url_queue
 
+    @override
    async def is_finished(self) -> bool:
         """Check if all URLs have been processed."""
-
+        state = await self._get_state()
+        return not state.url_queue and len(state.in_progress) == 0 and self._loading_task.done()
 
+    @override
     async def fetch_next_request(self) -> Request | None:
         """Fetch the next request to process."""
-        while not (
-
-
+        while not (await self.is_finished()):
+            state = await self._get_state()
+            if not state.url_queue:
+                await asyncio.sleep(0.1)
                continue
 
-
+            async with self._queue_lock:
+                url = state.url_queue.popleft()
+
+                request = Request.from_url(url)
+                state.in_progress.add(request.url)
+                if len(state.url_queue) < self._max_buffer_size:
+                    self._queue_has_capacity.set()
 
-            request = Request.from_url(url)
-            self._in_progress.add(request.unique_key)
             return request
 
         return None
 
+    @override
     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
         """Mark a request as successfully handled."""
-
-
-
+        state = await self._get_state()
+        if request.url in state.in_progress:
+            state.in_progress.remove(request.url)
+            state.handled_count += 1
         return None
 
-    async def get_handled_count(self) -> int:
-        """Return the number of handled requests."""
-        return self._handled_count
-
     async def abort_loading(self) -> None:
         """Abort the sitemap loading process."""
         if self._loading_task and not self._loading_task.done():
             self._loading_task.cancel()
             with suppress(asyncio.CancelledError):
                 await self._loading_task
+
+    async def start(self) -> None:
+        """Start the sitemap loading process."""
+        if self._loading_task and not self._loading_task.done():
+            return
+        self._loading_task = asyncio.create_task(self._load_sitemaps())
+
+    async def close(self) -> None:
+        """Close the request loader."""
+        await self.abort_loading()
+        await self._state.teardown()
+
+    async def __aenter__(self) -> SitemapRequestLoader:
+        """Enter the context manager."""
+        await self.start()
+        return self
+
+    async def __aexit__(
+        self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None
+    ) -> None:
+        """Exit the context manager."""
+        await self.close()
```
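
For orientation, a minimal usage sketch of the persistence-aware loader introduced above. The exact constructor layout and the choice of `HttpxHttpClient` are assumptions not fixed by this diff; the context-manager protocol and the `fetch_next_request`/`mark_request_as_handled` methods are the ones added or kept in the hunks above.

```python
import asyncio

from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    # persist_state_key enables resuming from the KeyValueStore after an interruption.
    async with SitemapRequestLoader(
        ['https://crawlee.dev/sitemap.xml'],  # example sitemap URL
        http_client=HttpxHttpClient(),
        persist_state_key='sitemap-loader-state',
    ) as loader:
        while not await loader.is_finished():
            request = await loader.fetch_next_request()
            if request is None:
                continue
            # ... process the request here ...
            await loader.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())
```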
crawlee/sessions/_models.py
CHANGED

```diff
@@ -20,7 +20,7 @@ from ._session import Session
 class SessionModel(BaseModel):
     """Model for a Session object."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     id: Annotated[str, Field(alias='id')]
     max_age: Annotated[timedelta, Field(alias='maxAge')]
@@ -38,7 +38,7 @@ class SessionModel(BaseModel):
 class SessionPoolModel(BaseModel):
     """Model for a SessionPool object."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     max_pool_size: Annotated[int, Field(alias='maxPoolSize')]
 
```
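
Both session models move to the pydantic `validate_by_name` / `validate_by_alias` config flags (available in pydantic >= 2.11); the arguments of the previous `ConfigDict(...)` call are truncated in this diff view. A minimal sketch of the behaviour this enables, using a hypothetical model with the same config:

```python
from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class ExampleModel(BaseModel):
    """Hypothetical model mirroring the config used by SessionModel/SessionPoolModel."""

    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    max_pool_size: Annotated[int, Field(alias='maxPoolSize')]


# Both the camelCase alias (used in persisted JSON) and the snake_case field name validate.
assert ExampleModel(maxPoolSize=10).max_pool_size == 10
assert ExampleModel(max_pool_size=10).max_pool_size == 10
```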
crawlee/statistics/_models.py
CHANGED

```diff
@@ -57,7 +57,7 @@ class FinalStatistics:
 class StatisticsState(BaseModel):
     """Statistic data about a crawler run."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')
     stats_id: Annotated[int | None, Field(alias='statsId')] = None
 
     requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0
```
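
`StatisticsState` gets the same alias flags plus `ser_json_inf_nan='constants'`, which serializes infinite or NaN floats as the literal `Infinity`/`NaN` constants instead of `null`. An illustrative sketch with a hypothetical field (not taken from `StatisticsState`):

```python
from pydantic import BaseModel, ConfigDict


class RateModel(BaseModel):
    model_config = ConfigDict(ser_json_inf_nan='constants')

    requests_per_minute: float = float('inf')


# With 'constants', float('inf') is emitted as Infinity rather than null.
print(RateModel().model_dump_json())  # {"requests_per_minute":Infinity}
```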
crawlee/storage_clients/__init__.py
CHANGED

```diff
@@ -1,9 +1,21 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+# These imports have only mandatory dependencies, so they are imported directly.
 from ._base import StorageClient
 from ._file_system import FileSystemStorageClient
 from ._memory import MemoryStorageClient
 
+_install_import_hook(__name__)
+
+# The following imports are wrapped in try_import to handle optional dependencies,
+# ensuring the module can still function even if these dependencies are missing.
+with _try_import(__name__, 'SqlStorageClient'):
+    from ._sql import SqlStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'SqlStorageClient',
     'StorageClient',
 ]
```
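
The new `SqlStorageClient` is re-exported behind crawlee's `try_import` guard, so `crawlee.storage_clients` still imports cleanly when the optional SQL dependencies are absent. A hedged sketch of opting into it globally; constructing it with no arguments is an assumption, since its options live in the new `_sql/_storage_client.py` not shown here:

```python
from crawlee import service_locator
from crawlee.storage_clients import SqlStorageClient

# If the optional SQL extra is not installed, the guarded import above is what
# surfaces a helpful ImportError here (when the symbol is used), instead of
# breaking `import crawlee.storage_clients` itself.
service_locator.set_storage_client(SqlStorageClient())
```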
crawlee/storage_clients/_base/_storage_client.py
CHANGED

```diff
@@ -6,6 +6,8 @@ from typing import TYPE_CHECKING
 from crawlee._utils.docs import docs_group
 
 if TYPE_CHECKING:
+    from collections.abc import Hashable
+
     from crawlee.configuration import Configuration
 
     from ._dataset_client import DatasetClient
@@ -28,12 +30,21 @@ class StorageClient(ABC):
     (where applicable), and consistent access patterns across all storage types it supports.
     """
 
+    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:  # noqa: ARG002
+        """Return a cache key that can differentiate between different storages of this and other clients.
+
+        Can be based on configuration or on the client itself. By default, returns a module and name of the client
+        class.
+        """
+        return f'{self.__class__.__module__}.{self.__class__.__name__}'
+
     @abstractmethod
     async def create_dataset_client(
         self,
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> DatasetClient:
         """Create a dataset client."""
@@ -44,6 +55,7 @@ class StorageClient(ABC):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> KeyValueStoreClient:
         """Create a key-value store client."""
@@ -54,6 +66,7 @@ class StorageClient(ABC):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> RequestQueueClient:
         """Create a request queue client."""
```
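
The new `get_storage_client_cache_key` hook lets a storage client influence how opened storages are cached (see the reworked `crawlee/storages/_storage_instance_manager.py` in the file list). A hypothetical subclass that additionally keys on the configured storage directory, so clients pointing at different directories never share cached instances:

```python
from __future__ import annotations

from collections.abc import Hashable

from crawlee.configuration import Configuration
from crawlee.storage_clients import FileSystemStorageClient


class PerDirectoryFileSystemStorageClient(FileSystemStorageClient):
    """Hypothetical subclass that scopes the cache key to the storage directory."""

    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:
        # Combine the default module/class key with the configured storage directory.
        return (super().get_storage_client_cache_key(configuration), configuration.storage_dir)
```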
crawlee/storage_clients/_file_system/_dataset_client.py
CHANGED

```diff
@@ -14,6 +14,7 @@ from typing_extensions import override
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
 
@@ -56,7 +57,7 @@ class FileSystemDatasetClient(DatasetClient):
         self,
         *,
         metadata: DatasetMetadata,
-
+        path_to_dataset: Path,
         lock: asyncio.Lock,
     ) -> None:
         """Initialize a new instance.
@@ -65,8 +66,8 @@ class FileSystemDatasetClient(DatasetClient):
         """
         self._metadata = metadata
 
-        self.
-        """The
+        self._path_to_dataset = path_to_dataset
+        """The full path to the dataset directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -78,10 +79,7 @@ class FileSystemDatasetClient(DatasetClient):
     @property
     def path_to_dataset(self) -> Path:
         """The full path to the dataset directory."""
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_dataset
 
     @property
     def path_to_metadata(self) -> Path:
@@ -94,6 +92,7 @@ class FileSystemDatasetClient(DatasetClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
     ) -> FileSystemDatasetClient:
         """Open or create a file system dataset client.
@@ -104,17 +103,21 @@ class FileSystemDatasetClient(DatasetClient):
 
         Args:
             id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
-            name: The name of the dataset
+            name: The name of the dataset for named (global scope) storages.
+            alias: The alias of the dataset for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a dataset with the specified ID is not found,
+            ValueError: If a dataset with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-
-
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not dataset_base_path.exists():
             await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True)
@@ -126,19 +129,19 @@ class FileSystemDatasetClient(DatasetClient):
                 if not dataset_dir.is_dir():
                     continue
 
-
-                if not
+                path_to_metadata = dataset_dir / METADATA_FILENAME
+                if not path_to_metadata.exists():
                     continue
 
                 try:
-                    file = await asyncio.to_thread(
+                    file = await asyncio.to_thread(path_to_metadata.open)
                     try:
                         file_content = json.load(file)
                         metadata = DatasetMetadata(**file_content)
                         if metadata.id == id:
                             client = cls(
                                 metadata=metadata,
-
+                                path_to_dataset=dataset_base_path / dataset_dir,
                                 lock=asyncio.Lock(),
                             )
                             await client._update_metadata(update_accessed_at=True)
@@ -152,16 +155,15 @@ class FileSystemDatasetClient(DatasetClient):
             if not found:
                 raise ValueError(f'Dataset with ID "{id}" not found')
 
-        # Get a new instance by name.
+        # Get a new instance by name or alias.
         else:
-
-
-
-            metadata_path = dataset_path / METADATA_FILENAME
+            dataset_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_dataset = dataset_base_path / dataset_dir
+            path_to_metadata = path_to_dataset / METADATA_FILENAME
 
             # If the dataset directory exists, reconstruct the client from the metadata file.
-            if
-                file = await asyncio.to_thread(open,
+            if path_to_dataset.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(open, path_to_metadata)
                 try:
                     file_content = json.load(file)
                 finally:
@@ -169,11 +171,11 @@ class FileSystemDatasetClient(DatasetClient):
                 try:
                     metadata = DatasetMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for dataset "{name}"') from exc
+                    raise ValueError(f'Invalid metadata file for dataset "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-
+                    path_to_dataset=path_to_dataset,
                     lock=asyncio.Lock(),
                 )
 
@@ -192,7 +194,7 @@ class FileSystemDatasetClient(DatasetClient):
             )
             client = cls(
                 metadata=metadata,
-
+                path_to_dataset=path_to_dataset,
                 lock=asyncio.Lock(),
             )
             await client._update_metadata()
```