crawlee-0.6.13b15-py3-none-any.whl → crawlee-1.3.1b3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/_types.py
CHANGED

@@ -2,18 +2,9 @@ from __future__ import annotations
 
 import dataclasses
 from collections.abc import Callable, Iterator, Mapping
+from copy import deepcopy
 from dataclasses import dataclass
-from typing import (
-    TYPE_CHECKING,
-    Annotated,
-    Any,
-    Literal,
-    Protocol,
-    TypedDict,
-    TypeVar,
-    cast,
-    overload,
-)
+from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
 
 from pydantic import ConfigDict, Field, PlainValidator, RootModel
 
@@ -25,7 +16,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence
 
-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack
 
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -69,13 +60,17 @@ def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
 class HttpHeaders(RootModel, Mapping[str, str]):
     """A dictionary-like object representing HTTP headers."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-
-
-
-
-
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        root: dict[str, str] = {}
+    else:
+        root: Annotated[
+            dict[str, str],
+            PlainValidator(lambda value: _normalize_headers(value)),
+            Field(default_factory=lambda: dict[str, str]()),
+        ]
 
     def __getitem__(self, key: str) -> str:
         return self.root[key.lower()]
@@ -96,7 +91,7 @@ class HttpHeaders(RootModel, Mapping[str, str]):
         combined_headers = {**other, **self.root}
         return HttpHeaders(combined_headers)
 
-    def __iter__(self) -> Iterator[str]:  #
+    def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]
        yield from self.root
 
    def __len__(self) -> int:
@@ -110,9 +105,9 @@ class ConcurrencySettings:
    def __init__(
        self,
        min_concurrency: int = 1,
-        max_concurrency: int =
+        max_concurrency: int = 100,
        max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int
+        desired_concurrency: int = 10,
    ) -> None:
        """Initialize a new instance.
 
@@ -125,21 +120,24 @@ class ConcurrencySettings:
            desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
                if there is a large enough supply of them. By default, it is `min_concurrency`.
        """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
        if min_concurrency < 1:
            raise ValueError('min_concurrency must be 1 or larger')
 
        if max_concurrency < min_concurrency:
            raise ValueError('max_concurrency cannot be less than min_concurrency')
 
+        if desired_concurrency < min_concurrency:
+            raise ValueError('desired_concurrency cannot be less than min_concurrency')
+
+        if desired_concurrency > max_concurrency:
+            raise ValueError('desired_concurrency cannot be greater than max_concurrency')
+
        if max_tasks_per_minute <= 0:
            raise ValueError('max_tasks_per_minute must be positive')
 
        self.min_concurrency = min_concurrency
        self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency
+        self.desired_concurrency = desired_concurrency
        self.max_tasks_per_minute = max_tasks_per_minute
 
 
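For reference, the new defaults and bounds can be exercised directly (a minimal sketch, assuming `ConcurrencySettings` stays exported from the top-level `crawlee` package as in previous releases):

    from crawlee import ConcurrencySettings

    # Defaults now sit inside the validated range: min (1) <= desired (10) <= max (100).
    settings = ConcurrencySettings()

    # A desired_concurrency outside [min_concurrency, max_concurrency] is rejected up front.
    try:
        ConcurrencySettings(min_concurrency=5, desired_concurrency=2)
    except ValueError as error:
        print(error)  # desired_concurrency cannot be less than min_concurrency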
@@ -180,6 +178,17 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
    requests: Sequence[str | Request]
    """Requests to be added to the `RequestManager`."""
 
+    rq_id: str | None
+    """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""
+
+    rq_name: str | None
+    """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+
+    rq_alias: str | None
+    """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+
 
 class PushDataKwargs(TypedDict):
    """Keyword arguments for dataset's `push_data` method."""
@@ -189,6 +198,7 @@ class PushDataFunctionCall(PushDataKwargs):
    data: list[dict[str, Any]] | dict[str, Any]
    dataset_id: str | None
    dataset_name: str | None
+    dataset_alias: str | None
 
 
 class KeyValueStoreInterface(Protocol):
@@ -251,25 +261,46 @@ class KeyValueStoreChangeRecords:
 class RequestHandlerRunResult:
    """Record of calls to storage-related context helpers."""
 
-    def __init__(
+    def __init__(
+        self,
+        *,
+        key_value_store_getter: GetKeyValueStoreFunction,
+        request: Request,
+    ) -> None:
        self._key_value_store_getter = key_value_store_getter
        self.add_requests_calls = list[AddRequestsKwargs]()
        self.push_data_calls = list[PushDataFunctionCall]()
-        self.key_value_store_changes = dict[tuple[str | None, str | None], KeyValueStoreChangeRecords]()
+        self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()
+
+        # Isolated copies for handler execution
+        self._request = deepcopy(request)
+
+    @property
+    def request(self) -> Request:
+        return self._request
 
    async def add_requests(
        self,
        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> None:
        """Track a call to the `add_requests` context helper."""
-
+        specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+        if specified_params > 1:
+            raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')
+        self.add_requests_calls.append(
+            AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
+        )
 
    async def push_data(
        self,
        data: list[dict[str, Any]] | dict[str, Any],
        dataset_id: str | None = None,
        dataset_name: str | None = None,
+        dataset_alias: str | None = None,
        **kwargs: Unpack[PushDataKwargs],
    ) -> None:
        """Track a call to the `push_data` context helper."""
@@ -278,6 +309,7 @@ class RequestHandlerRunResult:
                data=data,
                dataset_id=dataset_id,
                dataset_name=dataset_name,
+                dataset_alias=dataset_alias,
                **kwargs,
            )
        )
@@ -287,13 +319,22 @@ class RequestHandlerRunResult:
        *,
        id: str | None = None,
        name: str | None = None,
+        alias: str | None = None,
    ) -> KeyValueStoreInterface:
-        if (id, name) not in self.key_value_store_changes:
-            self.key_value_store_changes[id, name] = KeyValueStoreChangeRecords(
-                await self._key_value_store_getter(id=id, name=name)
+        if (id, name, alias) not in self.key_value_store_changes:
+            self.key_value_store_changes[id, name, alias] = KeyValueStoreChangeRecords(
+                await self._key_value_store_getter(id=id, name=name, alias=alias)
            )
 
-        return self.key_value_store_changes[id, name]
+        return self.key_value_store_changes[id, name, alias]
+
+    def apply_request_changes(self, target: Request) -> None:
+        """Apply tracked changes from handler copy to original request."""
+        if self.request.user_data != target.user_data:
+            target.user_data = self.request.user_data
+
+        if self.request.headers != target.headers:
+            target.headers = self.request.headers
 
 
 @docs_group('Functions')
@@ -307,12 +348,21 @@ class AddRequestsFunction(Protocol):
    def __call__(
        self,
        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> Coroutine[None, None, None]:
        """Call dunder method.
 
        Args:
            requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
            **kwargs: Additional keyword arguments.
        """
 
@@ -340,12 +390,21 @@ class EnqueueLinksFunction(Protocol):
        label: str | None = None,
        user_data: dict[str, Any] | None = None,
        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> Coroutine[None, None, None]: ...
 
    @overload
    def __call__(
-        self,
+        self,
+        *,
+        requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> Coroutine[None, None, None]: ...
 
    def __call__(
@@ -356,6 +415,9 @@ class EnqueueLinksFunction(Protocol):
        user_data: dict[str, Any] | None = None,
        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
        requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
        **kwargs: Unpack[EnqueueLinksKwargs],
    ) -> Coroutine[None, None, None]:
        """Call enqueue links function.
@@ -373,6 +435,12 @@ class EnqueueLinksFunction(Protocol):
                - `'skip'` to exclude the request from being enqueued,
                - `'unchanged'` to use the original request options without modification.
            requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
            **kwargs: Additional keyword arguments.
        """
 
@@ -424,12 +492,14 @@ class GetKeyValueStoreFunction(Protocol):
        *,
        id: str | None = None,
        name: str | None = None,
+        alias: str | None = None,
    ) -> Coroutine[None, None, KeyValueStore]:
        """Call dunder method.
 
        Args:
            id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
        """
 
 
@@ -444,12 +514,14 @@ class GetKeyValueStoreFromRequestHandlerFunction(Protocol):
        *,
        id: str | None = None,
        name: str | None = None,
+        alias: str | None = None,
    ) -> Coroutine[None, None, KeyValueStoreInterface]:
        """Call dunder method.
 
        Args:
            id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
        """
 
 
@@ -466,6 +538,7 @@ class PushDataFunction(Protocol):
        data: list[dict[str, Any]] | dict[str, Any],
        dataset_id: str | None = None,
        dataset_name: str | None = None,
+        dataset_alias: str | None = None,
        **kwargs: Unpack[PushDataKwargs],
    ) -> Coroutine[None, None, None]:
        """Call dunder method.
@@ -473,7 +546,8 @@ class PushDataFunction(Protocol):
        Args:
            data: The data to push to the `Dataset`.
            dataset_id: The ID of the `Dataset` to push the data to.
-            dataset_name: The name of the `Dataset` to push the data to.
+            dataset_name: The name of the `Dataset` to push the data to (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` to push the data to (run scope, unnamed storage).
            **kwargs: Additional keyword arguments.
        """
 
@@ -590,6 +664,24 @@ class BasicCrawlingContext:
        """Return hash of the context. Each context is considered unique."""
        return id(self)
 
+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        modifications = dict[str, Any]()
+
+        if push_data is not None:
+            modifications['push_data'] = push_data
+        if add_requests is not None:
+            modifications['add_requests'] = add_requests
+        if get_key_value_store is not None:
+            modifications['get_key_value_store'] = get_key_value_store
+
+        return dataclasses.replace(self, **modifications)
+
 
 class GetDataKwargs(TypedDict):
    """Keyword arguments for dataset's `get_data` method."""
crawlee/_utils/context.py
CHANGED

@@ -1,9 +1,9 @@
 from __future__ import annotations
 
-import
+import inspect
 from collections.abc import Callable
 from functools import wraps
-from typing import Any, TypeVar
+from typing import Any, TypeVar, cast
 
 T = TypeVar('T', bound=Callable[..., Any])
 
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:
 
        return await method(self, *args, **kwargs)
 
-    return async_wrapper if
+    return cast('T', async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper)
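The rewritten return statement is the standard sync/async dispatch idiom. The same pattern in isolation (a generic sketch, not crawlee's decorator itself):

    import asyncio
    import inspect
    from functools import wraps


    def log_calls(func):
        """Wrap either a sync or an async callable with a single decorator."""

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            print(f'calling {func.__name__}')
            return func(*args, **kwargs)

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            print(f'calling {func.__name__}')
            return await func(*args, **kwargs)

        # inspect.iscoroutinefunction picks the wrapper that preserves the call semantics.
        return async_wrapper if inspect.iscoroutinefunction(func) else sync_wrapper


    @log_calls
    async def fetch() -> str:
        return 'ok'


    print(asyncio.run(fetch()))  # prints 'calling fetch', then 'ok'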
crawlee/_utils/file.py
CHANGED

@@ -163,7 +163,14 @@ async def export_csv_to_stream(
    dst: TextIO,
    **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
-
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+
+    writer = csv.writer(dst, **kwargs)
    write_header = True
 
    # Iterate over the dataset and write to CSV.
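The csv behaviour described in the new comments can be reproduced outside crawlee; here a StringIO stream with newline='\r\n' stands in for a text-mode file handle on Windows:

    import csv
    import io


    def render(lineterminator: str) -> str:
        # newline='\r\n' expands every written '\n' to '\r\n', like a Windows text-mode file.
        buffer = io.StringIO(newline='\r\n')
        csv.writer(buffer, lineterminator=lineterminator).writerow(['url', 'title'])
        return buffer.getvalue()


    print(repr(render('\r\n')))  # csv default -> 'url,title\r\r\n' (doubled line ending)
    print(repr(render('\n')))    # explicit '\n' -> 'url,title\r\n'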
crawlee/_utils/globs.py
CHANGED

@@ -33,12 +33,12 @@ def _translate(
 
    HACK: This function is copied from CPython stdlib source. It will be released in Python 3.13 as `glob.translate`
    """
-    if
-        seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep
+    _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps
 
-    escaped_seps = ''.join(map(re.escape,
-    any_sep = f'[{escaped_seps}]' if len(
+    escaped_seps = ''.join(map(re.escape, _seps))
+    any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps
    not_sep = f'[^{escaped_seps}]'
+
    if include_hidden:
        one_last_segment = f'{not_sep}+'
        one_segment = f'{one_last_segment}{any_sep}'
crawlee/_utils/raise_if_too_many_kwargs.py
ADDED

@@ -0,0 +1,12 @@
+from typing import Any
+
+
+def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
+    """Raise ValueError if there are more non-None kwargs then max_kwargs."""
+    none_kwargs_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]
+    if len(none_kwargs_names) > max_kwargs:
+        all_kwargs_names = [f'"{kwarg_name}"' for kwarg_name in kwargs]
+        raise ValueError(
+            f'Only one of {", ".join(all_kwargs_names)} can be specified, but following arguments were '
+            f'specified: {", ".join(none_kwargs_names)}.'
+        )
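Given the implementation above, the helper's behaviour is straightforward to demonstrate (the argument values are made up):

    from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs

    # At most max_kwargs (default 1) of the passed keyword arguments may be non-None.
    raise_if_too_many_kwargs(rq_id=None, rq_name='my-queue', rq_alias=None)  # passes silently

    try:
        raise_if_too_many_kwargs(rq_id='abc123', rq_name='my-queue', rq_alias=None)
    except ValueError as error:
        print(error)
        # Only one of "rq_id", "rq_name", "rq_alias" can be specified, but following
        # arguments were specified: "rq_id", "rq_name".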
crawlee/_utils/recoverable_state.py
CHANGED

@@ -4,12 +4,14 @@ from typing import TYPE_CHECKING, Generic, Literal, TypeVar
 
 from pydantic import BaseModel
 
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.events._types import Event, EventPersistStateData
 
 if TYPE_CHECKING:
    import logging
+    from collections.abc import Callable, Coroutine
 
-    from crawlee.storages
+    from crawlee.storages import KeyValueStore
 
 TStateModel = TypeVar('TStateModel', bound=BaseModel)
 
@@ -37,6 +39,7 @@ class RecoverableState(Generic[TStateModel]):
        persistence_enabled: Literal[True, False, 'explicit_only'] = False,
        persist_state_kvs_name: str | None = None,
        persist_state_kvs_id: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
        logger: logging.Logger,
    ) -> None:
        """Initialize a new recoverable state object.
@@ -51,16 +54,40 @@ class RecoverableState(Generic[TStateModel]):
                If neither a name nor and id are supplied, the default store will be used.
            persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.
                If neither a name nor and id are supplied, the default store will be used.
+            persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If
+                not provided, a system-wide KeyValueStore will be used, based on service locator configuration.
            logger: A logger instance for logging operations related to state persistence
        """
+        raise_if_too_many_kwargs(
+            persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_id=persist_state_kvs_id,
+            persist_state_kvs_factory=persist_state_kvs_factory,
+        )
+        if not persist_state_kvs_factory:
+            logger.debug(
+                'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '
+                'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '
+                'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid '
+                'global side effects.'
+            )
+
        self._default_state = default_state
        self._state_type: type[TStateModel] = self._default_state.__class__
        self._state: TStateModel | None = None
        self._persistence_enabled = persistence_enabled
        self._persist_state_key = persist_state_key
-
-
-
+        if persist_state_kvs_factory is None:
+
+            async def kvs_factory() -> KeyValueStore:
+                from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+                return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)
+
+            self._persist_state_kvs_factory = kvs_factory
+        else:
+            self._persist_state_kvs_factory = persist_state_kvs_factory
+
+        self._key_value_store: KeyValueStore | None = None
        self._log = logger
 
    async def initialize(self) -> TStateModel:
@@ -77,11 +104,8 @@ class RecoverableState(Generic[TStateModel]):
            return self.current_value
 
        # Import here to avoid circular imports.
-        from crawlee.storages._key_value_store import KeyValueStore  # noqa: PLC0415
 
-        self._key_value_store = await
-            name=self._persist_state_kvs_name, id=self._persist_state_kvs_id
-        )
+        self._key_value_store = await self._persist_state_kvs_factory()
 
        await self._load_saved_state()
 
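A sketch of supplying the new `persist_state_kvs_factory` instead of a store name or id. The state model, the store name, and the `default_state` / `persist_state_key` argument names are assumptions based on the signature fragments visible above:

    import logging

    from pydantic import BaseModel

    from crawlee._utils.recoverable_state import RecoverableState
    from crawlee.storages import KeyValueStore


    class CrawlProgress(BaseModel):
        pages_done: int = 0


    async def open_state_store() -> KeyValueStore:
        # An explicit factory keeps state persistence off the service_locator-configured default store.
        return await KeyValueStore.open(name='crawl-progress')


    state = RecoverableState(
        default_state=CrawlProgress(),
        persist_state_key='CRAWL_PROGRESS',
        persistence_enabled=True,
        persist_state_kvs_factory=open_state_store,
        logger=logging.getLogger(__name__),
    )

    # Inside an async context: progress = await state.initialize(); progress.pages_done += 1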
crawlee/_utils/recurring_task.py
CHANGED

@@ -1,12 +1,16 @@
 from __future__ import annotations
 
 import asyncio
+import inspect
 from logging import getLogger
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
    from collections.abc import Callable
    from datetime import timedelta
+    from types import TracebackType
+
+    from typing_extensions import Self
 
 logger = getLogger(__name__)
 
@@ -21,11 +25,27 @@ class RecurringTask:
    """
 
    def __init__(self, func: Callable, delay: timedelta) -> None:
-        logger.debug(
+        logger.debug(
+            'Calling RecurringTask.__init__(func={%s}, delay={%s})...',
+            func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
+            delay,
+        )
        self.func = func
        self.delay = delay
        self.task: asyncio.Task | None = None
 
+    async def __aenter__(self) -> Self:
+        self.start()
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        await self.stop()
+
    async def _wrapper(self) -> None:
        """Continuously execute the provided function with the specified delay.
 
@@ -34,12 +54,16 @@ class RecurringTask:
        """
        sleep_time_secs = self.delay.total_seconds()
        while True:
-            await self.func() if
+            await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
            await asyncio.sleep(sleep_time_secs)
 
    def start(self) -> None:
        """Start the recurring task execution."""
-        self.
+        name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
+        self.task = asyncio.create_task(
+            self._wrapper(),
+            name=f'Task-recurring-{name}',
+        )
 
    async def stop(self) -> None:
        """Stop the recurring task execution."""
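With `__aenter__` starting the task and `__aexit__` awaiting `stop()`, a recurring job can now be scoped with `async with` (a small runnable sketch; the heartbeat function is illustrative):

    import asyncio
    from datetime import timedelta

    from crawlee._utils.recurring_task import RecurringTask


    async def heartbeat() -> None:
        print('still alive')


    async def main() -> None:
        # Entering the block calls start(); leaving it awaits stop().
        async with RecurringTask(heartbeat, delay=timedelta(seconds=1)):
            await asyncio.sleep(3)  # heartbeat fires about once per second while the block runs


    asyncio.run(main())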
crawlee/_utils/requests.py
CHANGED

@@ -1,8 +1,5 @@
 from __future__ import annotations
 
-import re
-from base64 import b64encode
-from hashlib import sha256
 from logging import getLogger
 from typing import TYPE_CHECKING
 
@@ -16,29 +13,6 @@ if TYPE_CHECKING:
 logger = getLogger(__name__)
 
 
-def unique_key_to_request_id(unique_key: str, *, request_id_length: int = 15) -> str:
-    """Generate a deterministic request ID based on a unique key.
-
-    Args:
-        unique_key: The unique key to convert into a request ID.
-        request_id_length: The length of the request ID.
-
-    Returns:
-        A URL-safe, truncated request ID based on the unique key.
-    """
-    # Encode the unique key and compute its SHA-256 hash
-    hashed_key = sha256(unique_key.encode('utf-8')).digest()
-
-    # Encode the hash in base64 and decode it to get a string
-    base64_encoded = b64encode(hashed_key).decode('utf-8')
-
-    # Remove characters that are not URL-safe ('+', '/', or '=')
-    url_safe_key = re.sub(r'(\+|\/|=)', '', base64_encoded)
-
-    # Truncate the key to the desired length
-    return url_safe_key[:request_id_length]
-
-
 def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
    """Normalize a URL.
 
crawlee/_utils/robots.py
CHANGED

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from logging import getLogger
 from typing import TYPE_CHECKING
 
 from protego import Protego
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
    from crawlee.proxy_configuration import ProxyInfo
 
 
+logger = getLogger(__name__)
+
+
 class RobotsTxtFile:
    def __init__(
        self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ class RobotsTxtFile:
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
-
-
-
-
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)
+
+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))
+
+        except Exception as e:
+            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
 
-
+            robots = Protego.parse('User-agent: *\nAllow: /')
 
        return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
 
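The fallback branch means an unreachable or error-returning robots.txt degrades to an allow-everything policy. What that permissive document does can be checked with Protego directly (the example URL is arbitrary):

    from protego import Protego

    # The same rules the fallback parses when fetching fails or the server returns a 4xx status.
    robots = Protego.parse('User-agent: *\nAllow: /')

    print(robots.can_fetch('https://example.com/private/page', '*'))  # True: everything is allowed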