crawlee 1.0.1b9__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +62 -32
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +52 -19
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +160 -134
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +8 -7
- crawlee/storage_clients/_file_system/_key_value_store_client.py +9 -6
- crawlee/storage_clients/_file_system/_request_queue_client.py +31 -12
- crawlee/storage_clients/_memory/_dataset_client.py +2 -2
- crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_dataset_client.py +2 -2
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
- crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
- crawlee/storage_clients/_sql/_storage_client.py +1 -1
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +3 -0
- crawlee/storages/_key_value_store.py +8 -2
- crawlee/storages/_request_queue.py +3 -0
- crawlee/storages/_storage_instance_manager.py +109 -42
- crawlee/storages/_utils.py +11 -0
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +14 -16
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/RECORD +93 -79
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/__init__.py
CHANGED

@@ -1,6 +1,6 @@
 from importlib import metadata
 
-from ._request import Request, RequestOptions
+from ._request import Request, RequestOptions, RequestState
 from ._service_locator import service_locator
 from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
 from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
     'HttpHeaders',
     'Request',
     'RequestOptions',
+    'RequestState',
     'RequestTransformAction',
     'SkippedReason',
     'service_locator',
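`RequestState` is now re-exported from the top-level package. A minimal usage sketch of what that enables; the assertion relies on the new `UNPROCESSED` default visible in the `_request.py` diff below:

```python
from crawlee import Request, RequestState

request = Request.from_url('https://example.com')

# Newly created requests start in the UNPROCESSED state, per the new default
# on CrawleeRequestData.state shown further down in this diff.
assert request.state == RequestState.UNPROCESSED
```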
crawlee/_browserforge_workaround.py
CHANGED

@@ -1,4 +1,8 @@
 # ruff: noqa: N802, PLC0415
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
 
 
 def patch_browserforge() -> None:
@@ -12,7 +16,7 @@ def patch_browserforge() -> None:
     import apify_fingerprint_datapoints
     from browserforge import download
 
-    download.DATA_DIRS
+    download.DATA_DIRS = {
         'headers': apify_fingerprint_datapoints.get_header_network().parent,
         'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent,
     }
@@ -20,7 +24,7 @@ def patch_browserforge() -> None:
     def DownloadIfNotExists(**flags: bool) -> None:
         pass
 
-    download.DownloadIfNotExists = DownloadIfNotExists
+    download.DownloadIfNotExists: Callable[..., None] = DownloadIfNotExists
 
     import browserforge.bayesian_network
 
@@ -33,7 +37,7 @@ def patch_browserforge() -> None:
            path = download.DATA_DIRS['fingerprints'] / download.DATA_FILES['fingerprints'][path.name]
            super().__init__(path)
 
-    browserforge.bayesian_network.BayesianNetwork = BayesianNetwork
+    browserforge.bayesian_network.BayesianNetwork: BayesianNetwork = BayesianNetwork
     import browserforge.headers.generator
 
     browserforge.headers.generator.DATA_DIR = download.DATA_DIRS['headers']
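The workaround above redirects browserforge to the locally bundled `apify_fingerprint_datapoints` files and disables its downloader. Conceptually the patch boils down to the following sketch (simplified from the diff, not the exact module code):

```python
import apify_fingerprint_datapoints
from browserforge import download

# Point browserforge at the bundled data files instead of its download cache.
download.DATA_DIRS = {
    'headers': apify_fingerprint_datapoints.get_header_network().parent,
    'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent,
}

# No-op the downloader so nothing is fetched over the network at runtime.
download.DownloadIfNotExists = lambda **flags: None
```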
crawlee/_request.py
CHANGED

@@ -34,14 +34,14 @@ class RequestState(IntEnum):
 class CrawleeRequestData(BaseModel):
     """Crawlee-specific configuration stored in the `user_data`."""
 
-    max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+    max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
     """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
     `BasicCrawler`."""
 
     enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
     """The strategy that was used for enqueuing the request."""
 
-    state: RequestState
+    state: RequestState = RequestState.UNPROCESSED
     """Describes the request's current lifecycle state."""
 
     session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -93,7 +93,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
     def __delitem__(self, key: str) -> None:
         del self.__pydantic_extra__[key]
 
-    def __iter__(self) -> Iterator[str]: #
+    def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override]
         yield from self.__pydantic_extra__
 
     def __len__(self) -> int:
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
     always_enqueue: NotRequired[bool]
     user_data: NotRequired[dict[str, JsonSerializable]]
     no_retry: NotRequired[bool]
+    enqueue_strategy: NotRequired[EnqueueStrategy]
+    max_retries: NotRequired[int | None]
 
 
 @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):
 
     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-    unique_key: Annotated[str, Field(alias='uniqueKey')]
+    unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
     to the same URL.
@@ -178,40 +180,52 @@ class Request(BaseModel):
     and specify which URLs shall be considered equal.
     """
 
-    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+    url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
     """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
     and fragments."""
 
-    method: HttpMethod = 'GET'
+    method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
     """HTTP request method."""
 
-    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
-    """HTTP request headers."""
-
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
         PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+        Field(frozen=True),
     ] = None
     """HTTP request payload."""
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        headers: HttpHeaders = HttpHeaders()
+        """HTTP request headers."""
+
+        user_data: dict[str, JsonSerializable] = {}
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """
+
+    else:
+        headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
+        """HTTP request headers."""
+
+        user_data: Annotated[
+            dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience
+            Field(alias='userData', default_factory=lambda: UserData()),
+            PlainValidator(user_data_adapter.validate_python),
+            PlainSerializer(
+                lambda instance: user_data_adapter.dump_python(
+                    instance,
+                    by_alias=True,
+                    exclude_none=True,
+                    exclude_unset=True,
+                    exclude_defaults=True,
+                )
+            ),
+        ]
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """
 
     retry_count: Annotated[int, Field(alias='retryCount')] = 0
     """Number of times the request has been retried."""
@@ -239,6 +253,8 @@ class Request(BaseModel):
         keep_url_fragment: bool = False,
         use_extended_unique_key: bool = False,
         always_enqueue: bool = False,
+        enqueue_strategy: EnqueueStrategy | None = None,
+        max_retries: int | None = None,
         **kwargs: Any,
     ) -> Self:
         """Create a new `Request` instance from a URL.
@@ -266,6 +282,9 @@ class Request(BaseModel):
                 `unique_key` computation. This is only relevant when `unique_key` is not provided.
             always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                 Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+            enqueue_strategy: The strategy that will be used for enqueuing the request.
+            max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+                option of `BasicCrawler`.
             **kwargs: Additional request properties.
         """
         if unique_key is not None and always_enqueue:
@@ -288,7 +307,21 @@ class Request(BaseModel):
             )
 
         if always_enqueue:
-            unique_key = f'{
+            unique_key = f'{crypto_random_object_id()}|{unique_key}'
+
+        user_data_dict = kwargs.pop('user_data', {}) or {}
+        crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+        if max_retries is not None:
+            crawlee_data_dict['maxRetries'] = max_retries
+
+        if enqueue_strategy is not None:
+            crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+        crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+        if crawlee_data:
+            user_data_dict['__crawlee'] = crawlee_data
 
         request = cls(
             url=url,
@@ -296,6 +329,7 @@ class Request(BaseModel):
             method=method,
             headers=headers,
             payload=payload,
+            user_data=user_data_dict,
             **kwargs,
         )
 
@@ -341,7 +375,7 @@ class Request(BaseModel):
         self.crawlee_data.crawl_depth = new_value
 
     @property
-    def state(self) -> RequestState
+    def state(self) -> RequestState:
         """Crawlee-specific request handling state."""
         return self.crawlee_data.state
 
@@ -354,10 +388,6 @@ class Request(BaseModel):
         """Crawlee-specific limit on the number of retries of the request."""
         return self.crawlee_data.max_retries
 
-    @max_retries.setter
-    def max_retries(self, new_max_retries: int) -> None:
-        self.crawlee_data.max_retries = new_max_retries
-
     @property
     def session_rotation_count(self) -> int | None:
         """Crawlee-specific number of finished session rotations for the request."""
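The new `enqueue_strategy` and `max_retries` keyword arguments of `Request.from_url` end up in the request's `__crawlee` user data. A hedged usage sketch; the `'same-domain'` literal is an assumption about the values accepted by `EnqueueStrategy` and may differ in your version:

```python
from crawlee import Request

request = Request.from_url(
    'https://example.com',
    max_retries=2,                   # per-request override of BasicCrawler's max_request_retries
    enqueue_strategy='same-domain',  # assumed EnqueueStrategy value, adjust to your version
)

# Both values are stored in the Crawlee-specific user data and exposed as properties.
print(request.max_retries)
```

Since the `max_retries` setter was removed in this version, the per-request limit now has to be supplied when the request is created.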
crawlee/_service_locator.py
CHANGED

@@ -38,7 +38,7 @@ class ServiceLocator:
     def get_configuration(self) -> Configuration:
         """Get the configuration."""
         if self._configuration is None:
-            logger.
+            logger.debug('No configuration set, implicitly creating and using default Configuration.')
             self._configuration = Configuration()
 
         return self._configuration
@@ -63,9 +63,9 @@ class ServiceLocator:
     def get_event_manager(self) -> EventManager:
         """Get the event manager."""
         if self._event_manager is None:
-            logger.
+            logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
             if self._configuration is None:
-                logger.
+                logger.debug(
                     'Implicit creation of event manager will implicitly set configuration as side effect. '
                     'It is advised to explicitly first set the configuration instead.'
                 )
@@ -93,7 +93,7 @@ class ServiceLocator:
     def get_storage_client(self) -> StorageClient:
         """Get the storage client."""
         if self._storage_client is None:
-            logger.
+            logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
             if self._configuration is None:
                 logger.warning(
                     'Implicit creation of storage client will implicitly set configuration as side effect. '
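These messages only changed their log level, but they still encode the recommended order of operations: set the configuration explicitly before anything resolves services implicitly. A small sketch, assuming the `set_configuration` setter that `ServiceLocator` exposes:

```python
from crawlee import service_locator
from crawlee.configuration import Configuration

# Set the configuration first, so later implicit service creation does not
# have to fall back to a default instance as a side effect.
service_locator.set_configuration(Configuration())

config = service_locator.get_configuration()  # returns the explicitly set instance
```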
crawlee/_types.py
CHANGED

@@ -2,18 +2,9 @@ from __future__ import annotations
 
 import dataclasses
 from collections.abc import Callable, Iterator, Mapping
+from copy import deepcopy
 from dataclasses import dataclass
-from typing import
-    TYPE_CHECKING,
-    Annotated,
-    Any,
-    Literal,
-    Protocol,
-    TypedDict,
-    TypeVar,
-    cast,
-    overload,
-)
+from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
 
 from pydantic import ConfigDict, Field, PlainValidator, RootModel
 
@@ -25,7 +16,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence
 
-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack
 
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -71,11 +62,15 @@ class HttpHeaders(RootModel, Mapping[str, str]):
 
     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-
-
-
-
-
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        root: dict[str, str] = {}
+    else:
+        root: Annotated[
+            dict[str, str],
+            PlainValidator(lambda value: _normalize_headers(value)),
+            Field(default_factory=lambda: dict[str, str]()),
+        ]
 
     def __getitem__(self, key: str) -> str:
         return self.root[key.lower()]
@@ -96,7 +91,7 @@ class HttpHeaders(RootModel, Mapping[str, str]):
         combined_headers = {**other, **self.root}
         return HttpHeaders(combined_headers)
 
-    def __iter__(self) -> Iterator[str]: #
+    def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override]
         yield from self.root
 
     def __len__(self) -> int:
@@ -266,12 +261,24 @@ class KeyValueStoreChangeRecords:
 class RequestHandlerRunResult:
     """Record of calls to storage-related context helpers."""
 
-    def __init__(
+    def __init__(
+        self,
+        *,
+        key_value_store_getter: GetKeyValueStoreFunction,
+        request: Request,
+    ) -> None:
         self._key_value_store_getter = key_value_store_getter
         self.add_requests_calls = list[AddRequestsKwargs]()
         self.push_data_calls = list[PushDataFunctionCall]()
         self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()
 
+        # Isolated copies for handler execution
+        self._request = deepcopy(request)
+
+    @property
+    def request(self) -> Request:
+        return self._request
+
     async def add_requests(
         self,
         requests: Sequence[str | Request],
@@ -321,6 +328,14 @@ class RequestHandlerRunResult:
 
         return self.key_value_store_changes[id, name, alias]
 
+    def apply_request_changes(self, target: Request) -> None:
+        """Apply tracked changes from handler copy to original request."""
+        if self.request.user_data != target.user_data:
+            target.user_data = self.request.user_data
+
+        if self.request.headers != target.headers:
+            target.headers = self.request.headers
+
 
 @docs_group('Functions')
 class AddRequestsFunction(Protocol):
@@ -649,6 +664,24 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
 
+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        modifications = dict[str, Any]()
+
+        if push_data is not None:
+            modifications['push_data'] = push_data
+        if add_requests is not None:
+            modifications['add_requests'] = add_requests
+        if get_key_value_store is not None:
+            modifications['get_key_value_store'] = get_key_value_store
+
+        return dataclasses.replace(self, **modifications)
+
 
 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""
crawlee/_utils/context.py
CHANGED

@@ -1,9 +1,9 @@
 from __future__ import annotations
 
-import
+import inspect
 from collections.abc import Callable
 from functools import wraps
-from typing import Any, TypeVar
+from typing import Any, TypeVar, cast
 
 T = TypeVar('T', bound=Callable[..., Any])
 
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:
 
         return await method(self, *args, **kwargs)
 
-    return async_wrapper if
+    return cast('T', async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper)
crawlee/_utils/file.py
CHANGED

@@ -163,7 +163,14 @@ async def export_csv_to_stream(
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
-
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+
+    writer = csv.writer(dst, **kwargs)
     write_header = True
 
     # Iterate over the dataset and write to CSV.
crawlee/_utils/globs.py
CHANGED

@@ -33,12 +33,12 @@ def _translate(
 
     HACK: This function is copied from CPython stdlib source. It will be released in Python 3.13 as `glob.translate`
     """
-    if
-    seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep
+    _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps
 
-    escaped_seps = ''.join(map(re.escape,
-    any_sep = f'[{escaped_seps}]' if len(
+    escaped_seps = ''.join(map(re.escape, _seps))
+    any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps
     not_sep = f'[^{escaped_seps}]'
+
     if include_hidden:
         one_last_segment = f'{not_sep}+'
         one_segment = f'{one_last_segment}{any_sep}'
crawlee/_utils/recoverable_state.py
CHANGED

@@ -4,12 +4,14 @@ from typing import TYPE_CHECKING, Generic, Literal, TypeVar
 
 from pydantic import BaseModel
 
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.events._types import Event, EventPersistStateData
 
 if TYPE_CHECKING:
     import logging
+    from collections.abc import Callable, Coroutine
 
-    from crawlee.storages
+    from crawlee.storages import KeyValueStore
 
 TStateModel = TypeVar('TStateModel', bound=BaseModel)
 
@@ -37,6 +39,7 @@ class RecoverableState(Generic[TStateModel]):
         persistence_enabled: Literal[True, False, 'explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_kvs_id: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         logger: logging.Logger,
     ) -> None:
         """Initialize a new recoverable state object.
@@ -51,16 +54,40 @@ class RecoverableState(Generic[TStateModel]):
                 If neither a name nor and id are supplied, the default store will be used.
             persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.
                 If neither a name nor and id are supplied, the default store will be used.
+            persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If
+                not provided, a system-wide KeyValueStore will be used, based on service locator configuration.
             logger: A logger instance for logging operations related to state persistence
         """
+        raise_if_too_many_kwargs(
+            persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_id=persist_state_kvs_id,
+            persist_state_kvs_factory=persist_state_kvs_factory,
+        )
+        if not persist_state_kvs_factory:
+            logger.debug(
+                'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '
+                'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '
+                'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid '
+                'global side effects.'
+            )
+
         self._default_state = default_state
         self._state_type: type[TStateModel] = self._default_state.__class__
         self._state: TStateModel | None = None
         self._persistence_enabled = persistence_enabled
         self._persist_state_key = persist_state_key
-
-
-
+        if persist_state_kvs_factory is None:
+
+            async def kvs_factory() -> KeyValueStore:
+                from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+                return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)
+
+            self._persist_state_kvs_factory = kvs_factory
+        else:
+            self._persist_state_kvs_factory = persist_state_kvs_factory
+
+        self._key_value_store: KeyValueStore | None = None
         self._log = logger
 
     async def initialize(self) -> TStateModel:
@@ -77,11 +104,8 @@ class RecoverableState(Generic[TStateModel]):
             return self.current_value
 
         # Import here to avoid circular imports.
-        from crawlee.storages._key_value_store import KeyValueStore  # noqa: PLC0415
 
-        self._key_value_store = await
-            name=self._persist_state_kvs_name, id=self._persist_state_kvs_id
-        )
+        self._key_value_store = await self._persist_state_kvs_factory()
 
         await self._load_saved_state()
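The new `persist_state_kvs_factory` lets callers supply the `KeyValueStore` explicitly instead of relying on the service locator. A hedged sketch based only on the parameters visible in this diff (`RecoverableState` is an internal utility, so the full signature may differ):

```python
import logging

from pydantic import BaseModel

from crawlee._utils.recoverable_state import RecoverableState
from crawlee.storages import KeyValueStore


class CrawlProgress(BaseModel):
    pages_done: int = 0


async def kvs_factory() -> KeyValueStore:
    # Open a dedicated store instead of the service locator's default one.
    return await KeyValueStore.open(name='crawler-state')


state = RecoverableState(
    default_state=CrawlProgress(),
    persist_state_key='CRAWL_PROGRESS',
    persistence_enabled=True,
    persist_state_kvs_factory=kvs_factory,
    logger=logging.getLogger(__name__),
)
```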
crawlee/_utils/recurring_task.py
CHANGED

@@ -1,12 +1,16 @@
 from __future__ import annotations
 
 import asyncio
+import inspect
 from logging import getLogger
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from collections.abc import Callable
     from datetime import timedelta
+    from types import TracebackType
+
+    from typing_extensions import Self
 
 logger = getLogger(__name__)
 
@@ -21,11 +25,27 @@ class RecurringTask:
     """
 
     def __init__(self, func: Callable, delay: timedelta) -> None:
-        logger.debug(
+        logger.debug(
+            'Calling RecurringTask.__init__(func={%s}, delay={%s})...',
+            func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
+            delay,
+        )
         self.func = func
         self.delay = delay
         self.task: asyncio.Task | None = None
 
+    async def __aenter__(self) -> Self:
+        self.start()
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        await self.stop()
+
     async def _wrapper(self) -> None:
         """Continuously execute the provided function with the specified delay.
 
@@ -34,12 +54,16 @@ class RecurringTask:
         """
         sleep_time_secs = self.delay.total_seconds()
         while True:
-            await self.func() if
+            await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
             await asyncio.sleep(sleep_time_secs)
 
     def start(self) -> None:
         """Start the recurring task execution."""
-        self.
+        name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
+        self.task = asyncio.create_task(
+            self._wrapper(),
+            name=f'Task-recurring-{name}',
+        )
 
     async def stop(self) -> None:
         """Stop the recurring task execution."""
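With the new `__aenter__`/`__aexit__` methods, a `RecurringTask` can be scoped with `async with`, starting on enter and stopping on exit. A usage sketch (the import path mirrors the file location; the class is an internal helper):

```python
import asyncio
from datetime import timedelta

from crawlee._utils.recurring_task import RecurringTask


async def heartbeat() -> None:
    print('still running...')


async def main() -> None:
    # The task starts on __aenter__ and is stopped by stop() on __aexit__.
    async with RecurringTask(heartbeat, timedelta(seconds=5)):
        await asyncio.sleep(12)


asyncio.run(main())
```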
crawlee/_utils/robots.py
CHANGED

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from logging import getLogger
 from typing import TYPE_CHECKING
 
 from protego import Protego
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
     from crawlee.proxy_configuration import ProxyInfo
 
 
+logger = getLogger(__name__)
+
+
 class RobotsTxtFile:
     def __init__(
         self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
@@ -56,12 +60,20 @@ class RobotsTxtFile:
             http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
-
-
-
-
+        try:
+            response = await http_client.send_request(url, proxy_info=proxy_info)
+
+            body = (
+                b'User-agent: *\nAllow: /'
+                if is_status_code_client_error(response.status_code)
+                else await response.read()
+            )
+            robots = Protego.parse(body.decode('utf-8'))
+
+        except Exception as e:
+            logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
 
-
+            robots = Protego.parse('User-agent: *\nAllow: /')
 
         return cls(url, robots, http_client=http_client, proxy_info=proxy_info)