crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic. Click here for more details.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
+
import functools
|
|
4
5
|
import json
|
|
5
6
|
import shutil
|
|
6
7
|
import urllib.parse
|
|
@@ -10,11 +11,12 @@ from pathlib import Path
|
|
|
10
11
|
from typing import TYPE_CHECKING, Any
|
|
11
12
|
|
|
12
13
|
from pydantic import ValidationError
|
|
13
|
-
from typing_extensions import override
|
|
14
|
+
from typing_extensions import Self, override
|
|
14
15
|
|
|
15
16
|
from crawlee._consts import METADATA_FILENAME
|
|
16
17
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
17
18
|
from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
|
|
19
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
18
20
|
from crawlee.storage_clients._base import KeyValueStoreClient
|
|
19
21
|
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
|
|
20
22
|
|
|
@@ -55,7 +57,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
55
57
|
self,
|
|
56
58
|
*,
|
|
57
59
|
metadata: KeyValueStoreMetadata,
|
|
58
|
-
|
|
60
|
+
path_to_kvs: Path,
|
|
59
61
|
lock: asyncio.Lock,
|
|
60
62
|
) -> None:
|
|
61
63
|
"""Initialize a new instance.
|
|
@@ -64,8 +66,8 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
64
66
|
"""
|
|
65
67
|
self._metadata = metadata
|
|
66
68
|
|
|
67
|
-
self.
|
|
68
|
-
"""The
|
|
69
|
+
self._path_to_kvs = path_to_kvs
|
|
70
|
+
"""The full path to the key-value store directory."""
|
|
69
71
|
|
|
70
72
|
self._lock = lock
|
|
71
73
|
"""A lock to ensure that only one operation is performed at a time."""
|
|
@@ -77,10 +79,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
77
79
|
@property
|
|
78
80
|
def path_to_kvs(self) -> Path:
|
|
79
81
|
"""The full path to the key-value store directory."""
|
|
80
|
-
|
|
81
|
-
return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
|
|
82
|
-
|
|
83
|
-
return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
|
|
82
|
+
return self._path_to_kvs
|
|
84
83
|
|
|
85
84
|
@property
|
|
86
85
|
def path_to_metadata(self) -> Path:
|
|
@@ -93,8 +92,9 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
93
92
|
*,
|
|
94
93
|
id: str | None,
|
|
95
94
|
name: str | None,
|
|
95
|
+
alias: str | None,
|
|
96
96
|
configuration: Configuration,
|
|
97
|
-
) ->
|
|
97
|
+
) -> Self:
|
|
98
98
|
"""Open or create a file system key-value store client.
|
|
99
99
|
|
|
100
100
|
This method attempts to open an existing key-value store from the file system. If a KVS with the specified
|
|
@@ -103,17 +103,21 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
103
103
|
|
|
104
104
|
Args:
|
|
105
105
|
id: The ID of the key-value store to open. If provided, searches for existing store by ID.
|
|
106
|
-
name: The name of the key-value store
|
|
106
|
+
name: The name of the key-value store for named (global scope) storages.
|
|
107
|
+
alias: The alias of the key-value store for unnamed (run scope) storages.
|
|
107
108
|
configuration: The configuration object containing storage directory settings.
|
|
108
109
|
|
|
109
110
|
Returns:
|
|
110
111
|
An instance for the opened or created storage client.
|
|
111
112
|
|
|
112
113
|
Raises:
|
|
113
|
-
ValueError: If a store with the specified ID is not found,
|
|
114
|
+
ValueError: If a store with the specified ID is not found, if metadata is invalid,
|
|
115
|
+
or if both name and alias are provided.
|
|
114
116
|
"""
|
|
115
|
-
|
|
116
|
-
|
|
117
|
+
# Validate input parameters.
|
|
118
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
119
|
+
|
|
120
|
+
kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
|
|
117
121
|
|
|
118
122
|
if not kvs_base_path.exists():
|
|
119
123
|
await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True)
|
|
@@ -125,19 +129,19 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
125
129
|
if not kvs_dir.is_dir():
|
|
126
130
|
continue
|
|
127
131
|
|
|
128
|
-
|
|
129
|
-
if not
|
|
132
|
+
path_to_metadata = kvs_dir / METADATA_FILENAME
|
|
133
|
+
if not path_to_metadata.exists():
|
|
130
134
|
continue
|
|
131
135
|
|
|
132
136
|
try:
|
|
133
|
-
file = await asyncio.to_thread(
|
|
137
|
+
file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
|
|
134
138
|
try:
|
|
135
139
|
file_content = json.load(file)
|
|
136
140
|
metadata = KeyValueStoreMetadata(**file_content)
|
|
137
141
|
if metadata.id == id:
|
|
138
142
|
client = cls(
|
|
139
143
|
metadata=metadata,
|
|
140
|
-
|
|
144
|
+
path_to_kvs=kvs_base_path / kvs_dir,
|
|
141
145
|
lock=asyncio.Lock(),
|
|
142
146
|
)
|
|
143
147
|
await client._update_metadata(update_accessed_at=True)
|
|
@@ -151,14 +155,15 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
151
155
|
if not found:
|
|
152
156
|
raise ValueError(f'Key-value store with ID "{id}" not found.')
|
|
153
157
|
|
|
154
|
-
# Get a new instance by name.
|
|
158
|
+
# Get a new instance by name or alias.
|
|
155
159
|
else:
|
|
156
|
-
|
|
157
|
-
|
|
160
|
+
kvs_dir = Path(name) if name else Path(alias) if alias else Path('default')
|
|
161
|
+
path_to_kvs = kvs_base_path / kvs_dir
|
|
162
|
+
path_to_metadata = path_to_kvs / METADATA_FILENAME
|
|
158
163
|
|
|
159
164
|
# If the key-value store directory exists, reconstruct the client from the metadata file.
|
|
160
|
-
if
|
|
161
|
-
file = await asyncio.to_thread(open,
|
|
165
|
+
if path_to_kvs.exists() and path_to_metadata.exists():
|
|
166
|
+
file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
|
|
162
167
|
try:
|
|
163
168
|
file_content = json.load(file)
|
|
164
169
|
finally:
|
|
@@ -166,11 +171,11 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
166
171
|
try:
|
|
167
172
|
metadata = KeyValueStoreMetadata(**file_content)
|
|
168
173
|
except ValidationError as exc:
|
|
169
|
-
raise ValueError(f'Invalid metadata file for key-value store "{name}"') from exc
|
|
174
|
+
raise ValueError(f'Invalid metadata file for key-value store "{name or alias}"') from exc
|
|
170
175
|
|
|
171
176
|
client = cls(
|
|
172
177
|
metadata=metadata,
|
|
173
|
-
|
|
178
|
+
path_to_kvs=path_to_kvs,
|
|
174
179
|
lock=asyncio.Lock(),
|
|
175
180
|
)
|
|
176
181
|
|
|
@@ -188,7 +193,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
188
193
|
)
|
|
189
194
|
client = cls(
|
|
190
195
|
metadata=metadata,
|
|
191
|
-
|
|
196
|
+
path_to_kvs=path_to_kvs,
|
|
192
197
|
lock=asyncio.Lock(),
|
|
193
198
|
)
|
|
194
199
|
await client._update_metadata()
|
|
@@ -235,7 +240,9 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
235
240
|
# Read the metadata file
|
|
236
241
|
async with self._lock:
|
|
237
242
|
try:
|
|
238
|
-
file = await asyncio.to_thread(
|
|
243
|
+
file = await asyncio.to_thread(
|
|
244
|
+
functools.partial(record_metadata_filepath.open, mode='r', encoding='utf-8'),
|
|
245
|
+
)
|
|
239
246
|
except FileNotFoundError:
|
|
240
247
|
logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
|
|
241
248
|
return None
|
|
@@ -369,7 +376,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
369
376
|
|
|
370
377
|
# List and sort all files *inside* a brief lock, then release it immediately:
|
|
371
378
|
async with self._lock:
|
|
372
|
-
files = sorted(await asyncio.to_thread(list
|
|
379
|
+
files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*'))))
|
|
373
380
|
|
|
374
381
|
count = 0
|
|
375
382
|
|