crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from importlib import metadata
 
-from ._request import Request, RequestOptions
+from ._request import Request, RequestOptions, RequestState
 from ._service_locator import service_locator
 from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
 from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
     'HttpHeaders',
     'Request',
     'RequestOptions',
+    'RequestState',
     'RequestTransformAction',
     'SkippedReason',
     'service_locator',
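With this change, `RequestState` is re-exported from the package root next to `Request` and `RequestOptions`. A minimal usage sketch (it leans on the new default request state shown further down in the crawlee/_request.py diff):

    from crawlee import Request, RequestState

    request = Request.from_url('https://example.com')
    # RequestState is an IntEnum describing the request lifecycle; a freshly
    # created request should report RequestState.UNPROCESSED.
    print(request.state is RequestState.UNPROCESSED)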
crawlee/_autoscaling/snapshotter.py
CHANGED
@@ -113,7 +113,7 @@ class Snapshotter:
         Args:
             config: The `Configuration` instance. Uses the global (default) one if not provided.
         """
-        config = service_locator.get_configuration()
+        config = config or service_locator.get_configuration()
 
         # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided,
         # it uses that value. Otherwise, it calculates the `max_memory_size` as a proportion of the system's
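The changed line is a straightforward fall-back: an explicitly passed `Configuration` wins, otherwise the global one registered in the service locator is used. A minimal sketch of the same pattern outside the Snapshotter (the helper name is illustrative, not part of crawlee):

    from crawlee import service_locator
    from crawlee.configuration import Configuration


    def resolve_config(config: Configuration | None = None) -> Configuration:
        # Prefer the explicitly supplied configuration; otherwise fall back to
        # the global one, which the service locator creates lazily if missing.
        return config or service_locator.get_configuration()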
crawlee/_browserforge_workaround.py
CHANGED
@@ -1,4 +1,8 @@
 # ruff: noqa: N802, PLC0415
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
 
 
 def patch_browserforge() -> None:
@@ -12,7 +16,7 @@ def patch_browserforge() -> None:
     import apify_fingerprint_datapoints
     from browserforge import download
 
-    download.DATA_DIRS
+    download.DATA_DIRS = {
         'headers': apify_fingerprint_datapoints.get_header_network().parent,
         'fingerprints': apify_fingerprint_datapoints.get_fingerprint_network().parent,
     }
@@ -20,7 +24,7 @@ def patch_browserforge() -> None:
     def DownloadIfNotExists(**flags: bool) -> None:
         pass
 
-    download.DownloadIfNotExists = DownloadIfNotExists
+    download.DownloadIfNotExists: Callable[..., None] = DownloadIfNotExists
 
     import browserforge.bayesian_network
 
@@ -33,7 +37,7 @@ def patch_browserforge() -> None:
             path = download.DATA_DIRS['fingerprints'] / download.DATA_FILES['fingerprints'][path.name]
             super().__init__(path)
 
-    browserforge.bayesian_network.BayesianNetwork = BayesianNetwork
+    browserforge.bayesian_network.BayesianNetwork: BayesianNetwork = BayesianNetwork
     import browserforge.headers.generator
 
     browserforge.headers.generator.DATA_DIR = download.DATA_DIRS['headers']
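For context, the whole workaround is applied by calling the single function defined in this module; a minimal sketch (this is a private helper that crawlee is expected to invoke itself, so calling it manually is rarely needed):

    # Redirects browserforge's data directories to the bundled
    # apify_fingerprint_datapoints files and turns its download step into a no-op.
    from crawlee._browserforge_workaround import patch_browserforge

    patch_browserforge()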
crawlee/_request.py
CHANGED
@@ -11,7 +11,7 @@ from yarl import URL
 from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.docs import docs_group
-from crawlee._utils.requests import compute_unique_key
+from crawlee._utils.requests import compute_unique_key
 from crawlee._utils.urls import validate_http_url
 
 if TYPE_CHECKING:
@@ -34,14 +34,14 @@ class RequestState(IntEnum):
 class CrawleeRequestData(BaseModel):
     """Crawlee-specific configuration stored in the `user_data`."""
 
-    max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+    max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
     """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
     `BasicCrawler`."""
 
     enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
     """The strategy that was used for enqueuing the request."""
 
-    state: RequestState
+    state: RequestState = RequestState.UNPROCESSED
     """Describes the request's current lifecycle state."""
 
     session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -93,7 +93,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
     def __delitem__(self, key: str) -> None:
         del self.__pydantic_extra__[key]
 
-    def __iter__(self) -> Iterator[str]:  #
+    def __iter__(self) -> Iterator[str]:  # ty: ignore[invalid-method-override]
         yield from self.__pydantic_extra__
 
     def __len__(self) -> int:
@@ -117,6 +117,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
 user_data_adapter = TypeAdapter(UserData)
 
 
+@docs_group('Other')
 class RequestOptions(TypedDict):
     """Options that can be used to customize request creation.
 
@@ -136,6 +137,8 @@ class RequestOptions(TypedDict):
     always_enqueue: NotRequired[bool]
     user_data: NotRequired[dict[str, JsonSerializable]]
     no_retry: NotRequired[bool]
+    enqueue_strategy: NotRequired[EnqueueStrategy]
+    max_retries: NotRequired[int | None]
 
 
 @docs_group('Storage data')
@@ -163,13 +166,9 @@ class Request(BaseModel):
     ```
     """
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-
-    """A unique identifier for the request. Note that this is not used for deduplication, and should not be confused
-    with `unique_key`."""
-
-    unique_key: Annotated[str, Field(alias='uniqueKey')]
+    unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
     to the same URL.
 
@@ -181,40 +180,52 @@ class Request(BaseModel):
     and specify which URLs shall be considered equal.
     """
 
-    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+    url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
     """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
     and fragments."""
 
-    method: HttpMethod = 'GET'
+    method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
    """HTTP request method."""
 
-    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
-    """HTTP request headers."""
-
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
         PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+        Field(frozen=True),
     ] = None
     """HTTP request payload."""
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        headers: HttpHeaders = HttpHeaders()
+        """HTTP request headers."""
+
+        user_data: dict[str, JsonSerializable] = {}
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """
+
+    else:
+        headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
+        """HTTP request headers."""
+
+        user_data: Annotated[
+            dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
+            Field(alias='userData', default_factory=lambda: UserData()),
+            PlainValidator(user_data_adapter.validate_python),
+            PlainSerializer(
+                lambda instance: user_data_adapter.dump_python(
+                    instance,
+                    by_alias=True,
+                    exclude_none=True,
+                    exclude_unset=True,
+                    exclude_defaults=True,
+                )
+            ),
+        ]
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """
 
     retry_count: Annotated[int, Field(alias='retryCount')] = 0
     """Number of times the request has been retried."""
@@ -239,10 +250,11 @@
         label: str | None = None,
         session_id: str | None = None,
         unique_key: str | None = None,
-        id: str | None = None,
         keep_url_fragment: bool = False,
         use_extended_unique_key: bool = False,
         always_enqueue: bool = False,
+        enqueue_strategy: EnqueueStrategy | None = None,
+        max_retries: int | None = None,
         **kwargs: Any,
     ) -> Self:
         """Create a new `Request` instance from a URL.
@@ -264,14 +276,15 @@
                 raised.
             unique_key: A unique key identifying the request. If not provided, it is automatically computed based on
                 the URL and other parameters. Requests with the same `unique_key` are treated as identical.
-            id: A unique identifier for the request. If not provided, it is automatically generated from the
-                `unique_key`.
             keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in
                 the `unique_key` computation. This is only relevant when `unique_key` is not provided.
             use_extended_unique_key: Determines whether to include the HTTP method, ID Session and payload in the
                 `unique_key` computation. This is only relevant when `unique_key` is not provided.
            always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+            enqueue_strategy: The strategy that will be used for enqueuing the request.
+            max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+                option of `BasicCrawler`.
             **kwargs: Additional request properties.
         """
         if unique_key is not None and always_enqueue:
@@ -294,17 +307,29 @@
             )
 
         if always_enqueue:
-            unique_key = f'{
+            unique_key = f'{crypto_random_object_id()}|{unique_key}'
+
+        user_data_dict = kwargs.pop('user_data', {}) or {}
+        crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+        if max_retries is not None:
+            crawlee_data_dict['maxRetries'] = max_retries
 
-
+        if enqueue_strategy is not None:
+            crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+        crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+        if crawlee_data:
+            user_data_dict['__crawlee'] = crawlee_data
 
         request = cls(
             url=url,
             unique_key=unique_key,
-            id=id,
             method=method,
             headers=headers,
             payload=payload,
+            user_data=user_data_dict,
             **kwargs,
         )
 
@@ -350,7 +375,7 @@
         self.crawlee_data.crawl_depth = new_value
 
     @property
-    def state(self) -> RequestState
+    def state(self) -> RequestState:
        """Crawlee-specific request handling state."""
        return self.crawlee_data.state
 
@@ -363,10 +388,6 @@
        """Crawlee-specific limit on the number of retries of the request."""
        return self.crawlee_data.max_retries
 
-    @max_retries.setter
-    def max_retries(self, new_max_retries: int) -> None:
-        self.crawlee_data.max_retries = new_max_retries
-
     @property
     def session_rotation_count(self) -> int | None:
        """Crawlee-specific number of finished session rotations for the request."""
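The new `max_retries` and `enqueue_strategy` parameters of `Request.from_url()` (and the matching `RequestOptions` keys) are stored in the request's `__crawlee` user data. A minimal sketch based on the signature above:

    from crawlee import Request

    # Override BasicCrawler's global `max_request_retries` for this one request;
    # an `enqueue_strategy` keyword is accepted the same way (see the new
    # parameters in the diff above).
    request = Request.from_url('https://example.com/products', max_retries=1)

    print(request.max_retries)  # -> 1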
crawlee/_service_locator.py
CHANGED
@@ -11,6 +11,10 @@ from crawlee.storage_clients import FileSystemStorageClient, StorageClient
 if TYPE_CHECKING:
     from crawlee.storages._storage_instance_manager import StorageInstanceManager
 
+from logging import getLogger
+
+logger = getLogger(__name__)
+
 
 @docs_group('Configuration')
 class ServiceLocator:
@@ -19,23 +23,24 @@ class ServiceLocator:
     All services are initialized to its default value lazily.
     """
 
-
-        self._configuration: Configuration | None = None
-        self._event_manager: EventManager | None = None
-        self._storage_client: StorageClient | None = None
-        self._storage_instance_manager: StorageInstanceManager | None = None
+    global_storage_instance_manager: StorageInstanceManager | None = None
 
-
-        self
-
-
+    def __init__(
+        self,
+        configuration: Configuration | None = None,
+        event_manager: EventManager | None = None,
+        storage_client: StorageClient | None = None,
+    ) -> None:
+        self._configuration = configuration
+        self._event_manager = event_manager
+        self._storage_client = storage_client
 
     def get_configuration(self) -> Configuration:
         """Get the configuration."""
         if self._configuration is None:
+            logger.debug('No configuration set, implicitly creating and using default Configuration.')
             self._configuration = Configuration()
 
-        self._configuration_was_retrieved = True
         return self._configuration
 
     def set_configuration(self, configuration: Configuration) -> None:
@@ -47,7 +52,10 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the configuration has already been retrieved before.
         """
-        if self.
+        if self._configuration is configuration:
+            # Same instance, no need to anything
+            return
+        if self._configuration:
             raise ServiceConflictError(Configuration, configuration, self._configuration)
 
         self._configuration = configuration
@@ -55,13 +63,14 @@ class ServiceLocator:
     def get_event_manager(self) -> EventManager:
         """Get the event manager."""
         if self._event_manager is None:
-
-
-
-
-
+            logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
+            if self._configuration is None:
+                logger.debug(
+                    'Implicit creation of event manager will implicitly set configuration as side effect. '
+                    'It is advised to explicitly first set the configuration instead.'
+                )
+            self._event_manager = LocalEventManager().from_config(config=self._configuration)
 
-        self._event_manager_was_retrieved = True
         return self._event_manager
 
     def set_event_manager(self, event_manager: EventManager) -> None:
@@ -73,7 +82,10 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the event manager has already been retrieved before.
         """
-        if self.
+        if self._event_manager is event_manager:
+            # Same instance, no need to anything
+            return
+        if self._event_manager:
             raise ServiceConflictError(EventManager, event_manager, self._event_manager)
 
         self._event_manager = event_manager
@@ -81,9 +93,14 @@ class ServiceLocator:
     def get_storage_client(self) -> StorageClient:
         """Get the storage client."""
         if self._storage_client is None:
+            logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
+            if self._configuration is None:
+                logger.warning(
+                    'Implicit creation of storage client will implicitly set configuration as side effect. '
+                    'It is advised to explicitly first set the configuration instead.'
+                )
             self._storage_client = FileSystemStorageClient()
 
-        self._storage_client_was_retrieved = True
         return self._storage_client
 
     def set_storage_client(self, storage_client: StorageClient) -> None:
@@ -95,21 +112,24 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the storage client has already been retrieved before.
         """
-        if self.
+        if self._storage_client is storage_client:
+            # Same instance, no need to anything
+            return
+        if self._storage_client:
             raise ServiceConflictError(StorageClient, storage_client, self._storage_client)
 
         self._storage_client = storage_client
 
     @property
     def storage_instance_manager(self) -> StorageInstanceManager:
-        """Get the storage instance manager."""
-        if
+        """Get the storage instance manager. It is global manager shared by all instances of ServiceLocator."""
+        if ServiceLocator.global_storage_instance_manager is None:
             # Import here to avoid circular imports.
             from crawlee.storages._storage_instance_manager import StorageInstanceManager  # noqa: PLC0415
 
-
+            ServiceLocator.global_storage_instance_manager = StorageInstanceManager()
 
-        return
+        return ServiceLocator.global_storage_instance_manager
 
 
 service_locator = ServiceLocator()
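Taken together, the new constructor and the identity checks in the setters mean services can be injected up front, and re-registering the same instance is a no-op rather than a `ServiceConflictError`. A minimal sketch against the private module shown above:

    from crawlee._service_locator import ServiceLocator
    from crawlee.configuration import Configuration

    config = Configuration()

    # Services may now be passed directly to the constructor...
    locator = ServiceLocator(configuration=config)

    # ...and setting the very same instance again returns quietly instead of
    # raising ServiceConflictError.
    locator.set_configuration(config)

    assert locator.get_configuration() is config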