crawlee-1.0.0rc1-py3-none-any.whl → crawlee-1.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +2 -1
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +76 -17
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/sitemap.py +3 -1
- crawlee/_utils/system.py +3 -3
- crawlee/browsers/_playwright_browser_controller.py +20 -14
- crawlee/configuration.py +1 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +6 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +107 -27
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +6 -1
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +1 -1
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +228 -48
- crawlee/sessions/_models.py +2 -2
- crawlee/statistics/_models.py +1 -1
- crawlee/storage_clients/__init__.py +12 -0
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +27 -25
- crawlee/storage_clients/_file_system/_key_value_store_client.py +27 -23
- crawlee/storage_clients/_file_system/_request_queue_client.py +84 -98
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +14 -2
- crawlee/storage_clients/_memory/_key_value_store_client.py +14 -2
- crawlee/storage_clients/_memory/_request_queue_client.py +43 -12
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +269 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +299 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +706 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +9 -2
- crawlee/storages/_key_value_store.py +9 -2
- crawlee/storages/_request_queue.py +7 -2
- crawlee/storages/_storage_instance_manager.py +126 -72
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/METADATA +12 -5
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/RECORD +59 -49
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/WHEEL +0 -0
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/licenses/LICENSE +0 -0
crawlee/storage_clients/_file_system/_key_value_store_client.py

@@ -15,6 +15,7 @@ from typing_extensions import override
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import KeyValueStoreClient
 from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
 
@@ -55,7 +56,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         self,
         *,
         metadata: KeyValueStoreMetadata,
-        storage_dir: Path,
+        path_to_kvs: Path,
         lock: asyncio.Lock,
     ) -> None:
         """Initialize a new instance.
@@ -64,8 +65,8 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         """
         self._metadata = metadata
 
-        self._storage_dir = storage_dir
-        """The base directory where the storage data are stored."""
+        self._path_to_kvs = path_to_kvs
+        """The full path to the key-value store directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -77,10 +78,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
     @property
     def path_to_kvs(self) -> Path:
         """The full path to the key-value store directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_kvs
 
     @property
     def path_to_metadata(self) -> Path:
@@ -93,6 +91,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
     ) -> FileSystemKeyValueStoreClient:
         """Open or create a file system key-value store client.
@@ -103,17 +102,21 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         Args:
             id: The ID of the key-value store to open. If provided, searches for existing store by ID.
-            name: The name of the key-value store
+            name: The name of the key-value store for named (global scope) storages.
+            alias: The alias of the key-value store for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a store with the specified ID is not found,
+            ValueError: If a store with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-
-
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not kvs_base_path.exists():
             await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True)
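The validation call above relies on the new crawlee/_utils/raise_if_too_many_kwargs.py helper (+12 lines in the file list), whose body is not included in this diff. A minimal sketch consistent with its call sites, assuming it simply counts the keyword arguments that are not None:

# Hypothetical reconstruction; the actual 12-line implementation is not shown in this diff.
def raise_if_too_many_kwargs(**kwargs: object) -> None:
    """Raise a ValueError when more than one of the given keyword arguments is set."""
    provided = [name for name, value in kwargs.items() if value is not None]
    if len(provided) > 1:
        raise ValueError(f'Only one of ({", ".join(provided)}) can be specified, not several at once.')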
@@ -125,19 +128,19 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
             if not kvs_dir.is_dir():
                 continue
 
-
-            if not
+            path_to_metadata = kvs_dir / METADATA_FILENAME
+            if not path_to_metadata.exists():
                 continue
 
             try:
-                file = await asyncio.to_thread(
+                file = await asyncio.to_thread(path_to_metadata.open)
                 try:
                     file_content = json.load(file)
                     metadata = KeyValueStoreMetadata(**file_content)
                     if metadata.id == id:
                         client = cls(
                             metadata=metadata,
-
+                            path_to_kvs=kvs_base_path / kvs_dir,
                             lock=asyncio.Lock(),
                         )
                         await client._update_metadata(update_accessed_at=True)
@@ -151,14 +154,15 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         if not found:
             raise ValueError(f'Key-value store with ID "{id}" not found.')
 
-        # Get a new instance by name.
+        # Get a new instance by name or alias.
         else:
-
-
+            kvs_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_kvs = kvs_base_path / kvs_dir
+            path_to_metadata = path_to_kvs / METADATA_FILENAME
 
             # If the key-value store directory exists, reconstruct the client from the metadata file.
-            if
-            file = await asyncio.to_thread(open,
+            if path_to_kvs.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(open, path_to_metadata)
                 try:
                     file_content = json.load(file)
                 finally:
@@ -166,11 +170,11 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 try:
                     metadata = KeyValueStoreMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for key-value store "{name}"') from exc
+                    raise ValueError(f'Invalid metadata file for key-value store "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-
+                    path_to_kvs=path_to_kvs,
                     lock=asyncio.Lock(),
                 )
 
@@ -188,7 +192,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
             )
             client = cls(
                 metadata=metadata,
-
+                path_to_kvs=path_to_kvs,
                 lock=asyncio.Lock(),
             )
             await client._update_metadata()
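Taken together, the key-value store changes make path resolution explicit at open time: named stores persist under {storage_dir}/key_value_stores/{name}, alias stores under .../{alias}, and the default store under .../default. A hedged usage sketch, assuming the public KeyValueStore.open() in 1.0.1 forwards the same name/alias parameters down to this client:

import asyncio

from crawlee.storages import KeyValueStore


async def main() -> None:
    named = await KeyValueStore.open(name='my-config')  # named (global scope) storage
    scratch = await KeyValueStore.open(alias='scratch')  # unnamed (run scope) storage
    default = await KeyValueStore.open()  # neither id, name, nor alias -> 'default' directory

    # Passing both name and alias would hit raise_if_too_many_kwargs and raise a ValueError.
    await named.set_value('answer', 42)


asyncio.run(main())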
crawlee/storage_clients/_file_system/_request_queue_client.py

@@ -17,6 +17,7 @@ from crawlee import Request
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee._utils.recoverable_state import RecoverableState
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import (
@@ -89,7 +90,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self,
         *,
         metadata: RequestQueueMetadata,
-        storage_dir: Path,
+        path_to_rq: Path,
         lock: asyncio.Lock,
     ) -> None:
         """Initialize a new instance.
@@ -98,8 +99,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         self._metadata = metadata
 
-        self._storage_dir = storage_dir
-        """The base directory where the storage data are stored."""
+        self._path_to_rq = path_to_rq
+        """The full path to the request queue directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -115,9 +116,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
 
         self._state = RecoverableState[RequestQueueState](
             default_state=RequestQueueState(),
-            persist_state_key='
+            persist_state_key=f'__RQ_STATE_{self._metadata.id}',
             persistence_enabled=True,
-            persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}',
             logger=logger,
         )
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""
@@ -129,10 +129,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
     @property
     def path_to_rq(self) -> Path:
         """The full path to the request queue directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_rq
 
     @property
     def path_to_metadata(self) -> Path:
@@ -145,6 +142,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
     ) -> FileSystemRequestQueueClient:
         """Open or create a file system request queue client.
@@ -155,17 +153,21 @@ class FileSystemRequestQueueClient(RequestQueueClient):
 
         Args:
             id: The ID of the request queue to open. If provided, searches for existing queue by ID.
-            name: The name of the request queue
+            name: The name of the request queue for named (global scope) storages.
+            alias: The alias of the request queue for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a queue with the specified ID is not found,
+            ValueError: If a queue with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-
-
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not rq_base_path.exists():
             await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True)
@@ -177,12 +179,12 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             if not rq_dir.is_dir():
                 continue
 
-
-            if not
+            path_to_metadata = rq_dir / METADATA_FILENAME
+            if not path_to_metadata.exists():
                 continue
 
             try:
-                file = await asyncio.to_thread(
+                file = await asyncio.to_thread(path_to_metadata.open)
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
@@ -190,7 +192,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                     if metadata.id == id:
                         client = cls(
                             metadata=metadata,
-
+                            path_to_rq=rq_base_path / rq_dir,
                             lock=asyncio.Lock(),
                         )
                         await client._state.initialize()
@@ -206,14 +208,15 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         if not found:
             raise ValueError(f'Request queue with ID "{id}" not found')
 
-        # Open an existing RQ by its name, or create a new one if not found.
+        # Open an existing RQ by its name or alias, or create a new one if not found.
         else:
-
-
+            rq_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_rq = rq_base_path / rq_dir
+            path_to_metadata = path_to_rq / METADATA_FILENAME
 
             # If the RQ directory exists, reconstruct the client from the metadata file.
-            if
-            file = await asyncio.to_thread(open,
+            if path_to_rq.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(open, path_to_metadata)
                 try:
                     file_content = json.load(file)
                 finally:
@@ -221,13 +224,11 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 try:
                     metadata = RequestQueueMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for request queue "{name}"') from exc
-
-                metadata.name = name
+                    raise ValueError(f'Invalid metadata file for request queue "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-
+                    path_to_rq=path_to_rq,
                     lock=asyncio.Lock(),
                 )
 
@@ -251,7 +252,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             )
             client = cls(
                 metadata=metadata,
-
+                path_to_rq=path_to_rq,
                 lock=asyncio.Lock(),
             )
             await client._state.initialize()
@@ -312,28 +313,43 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         unprocessed_requests = list[UnprocessedRequest]()
         state = self._state.current_value
 
-
-        existing_unique_keys: dict[str, Path] = {}
-        existing_request_files = await self._get_request_files(self.path_to_rq)
+        all_requests = state.forefront_requests | state.regular_requests
 
-        for request_file in existing_request_files:
-            existing_request = await self._parse_request_file(request_file)
-            if existing_request is not None:
-                existing_unique_keys[existing_request.unique_key] = request_file
+        requests_to_enqueue = {}
 
-        #
+        # Determine which requests can be added or are modified.
         for request in requests:
-
-
-
-
-
-
+            # Check if the request has already been handled.
+            if request.unique_key in state.handled_requests:
+                processed_requests.append(
+                    ProcessedRequest(
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=True,
+                    )
+                )
+            # Check if the request is already in progress.
+            # Or if the request is already in the queue and the `forefront` flag is not used, we do not change the
+            # position of the request.
+            elif (request.unique_key in state.in_progress_requests) or (
+                request.unique_key in all_requests and not forefront
+            ):
+                processed_requests.append(
+                    ProcessedRequest(
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=False,
+                    )
+                )
+            # These requests must either be added or update their position.
+            else:
+                requests_to_enqueue[request.unique_key] = request
 
-
-
+        # Process each request in the batch.
+        for request in requests_to_enqueue.values():
+            # If the request is not already in the RQ, this is a new request.
+            if request.unique_key not in all_requests:
                 request_path = self._get_request_path(request.unique_key)
-
                 # Add sequence number to ensure FIFO ordering using state.
                 if forefront:
                     sequence_number = state.forefront_sequence_counter
@@ -352,9 +368,6 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 new_total_request_count += 1
                 new_pending_request_count += 1
 
-                # Add to our index for subsequent requests in this batch
-                existing_unique_keys[request.unique_key] = self._get_request_path(request.unique_key)
-
                 processed_requests.append(
                     ProcessedRequest(
                         unique_key=request.unique_key,
@@ -363,57 +376,33 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                     )
                 )
 
-                # If the request already exists in the RQ
-
-                #
-
-
-
-                # If the request is already in the RQ and handled, just continue with the next one.
-                if was_already_present and was_already_handled:
-                    processed_requests.append(
-                        ProcessedRequest(
-                            unique_key=request.unique_key,
-                            was_already_present=True,
-                            was_already_handled=True,
-                        )
-                    )
+            # If the request already exists in the RQ and use the forefront flag to update its position
+            elif forefront:
+                # If the request is among `regular`, remove it from its current position.
+                if request.unique_key in state.regular_requests:
+                    state.regular_requests.pop(request.unique_key)
 
-                # If the request is already in
-
-
-
-
-
-
-
-
-                    state.forefront_sequence_counter += 1
-                elif (
-                    existing_request.unique_key not in state.forefront_requests
-                    and existing_request.unique_key not in state.regular_requests
-                ):
-                    # Keep as regular if not already forefront
-                    state.regular_requests[existing_request.unique_key] = state.sequence_counter
-                    state.sequence_counter += 1
-
-                    processed_requests.append(
-                        ProcessedRequest(
-                            unique_key=request.unique_key,
-                            was_already_present=True,
-                            was_already_handled=False,
-                        )
+                # If the request is already in `forefront`, we just need to update its position.
+                state.forefront_requests[request.unique_key] = state.forefront_sequence_counter
+                state.forefront_sequence_counter += 1
+
+                processed_requests.append(
+                    ProcessedRequest(
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=False,
                     )
+                )
 
-
-
-
-
-
-
-
-                )
+            else:
+                logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.')
+                unprocessed_requests.append(
+                    UnprocessedRequest(
+                        unique_key=request.unique_key,
+                        url=request.url,
+                        method=request.method,
                     )
+                )
 
         await self._update_metadata(
             update_modified_at=True,
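The rewrite above replaces the old approach of re-reading request files from disk with a pure state-based classification: each incoming request is either already handled, already queued or in progress (position kept unless `forefront` is set), newly enqueued, or re-prioritized. A behavioral sketch (the `add_batch_of_requests` name matches this class's public API; the surrounding async context and the opened `rq_client` are assumed):

from crawlee import Request

# Inside an async function, with rq_client an opened FileSystemRequestQueueClient:
batch = [Request.from_url('https://example.com/a'), Request.from_url('https://example.com/b')]
await rq_client.add_batch_of_requests(batch)  # /a then /b, FIFO order

# Re-adding /b without forefront reports was_already_present and keeps its position;
# with forefront=True it is moved to the front of the queue instead.
await rq_client.add_batch_of_requests([Request.from_url('https://example.com/b')], forefront=True)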
@@ -752,10 +741,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         files = await asyncio.to_thread(list, path_to_rq.glob('*.json'))
 
         # Filter out metadata file and non-file entries.
-        filtered = filter(
-            lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME,
-            files,
-        )
+        filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
 
         return list(filtered)
 
crawlee/storage_clients/_file_system/_storage_client.py

@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from typing_extensions import override
 
 from crawlee._utils.docs import docs_group
@@ -10,6 +12,9 @@ from ._dataset_client import FileSystemDatasetClient
 from ._key_value_store_client import FileSystemKeyValueStoreClient
 from ._request_queue_client import FileSystemRequestQueueClient
 
+if TYPE_CHECKING:
+    from collections.abc import Hashable
+
 
 @docs_group('Storage clients')
 class FileSystemStorageClient(StorageClient):
@@ -29,16 +34,22 @@ class FileSystemStorageClient(StorageClient):
     Use it only when running a single crawler process at a time.
     """
 
+    @override
+    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:
+        # Even different client instances should return same storage if the storage_dir is the same.
+        return super().get_storage_client_cache_key(configuration), configuration.storage_dir
+
     @override
     async def create_dataset_client(
         self,
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> FileSystemDatasetClient:
         configuration = configuration or Configuration.get_global_configuration()
-        client = await FileSystemDatasetClient.open(id=id, name=name, configuration=configuration)
+        client = await FileSystemDatasetClient.open(id=id, name=name, alias=alias, configuration=configuration)
         await self._purge_if_needed(client, configuration)
         return client
 
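The new cache-key override means two Configuration objects pointing at different storage directories no longer resolve to the same cached storage instance, even through a single client object. A sketch of the intent (constructing Configuration with a storage_dir keyword is an assumption based on the configuration.storage_dir attribute used above):

from crawlee.configuration import Configuration
from crawlee.storage_clients import FileSystemStorageClient

client = FileSystemStorageClient()
key_a = client.get_storage_client_cache_key(Configuration(storage_dir='./storage_a'))
key_b = client.get_storage_client_cache_key(Configuration(storage_dir='./storage_b'))

# Different storage_dir values produce different cache keys, hence distinct storages.
assert key_a != key_b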
@@ -48,10 +59,11 @@ class FileSystemStorageClient(StorageClient):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> FileSystemKeyValueStoreClient:
         configuration = configuration or Configuration.get_global_configuration()
-        client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration)
+        client = await FileSystemKeyValueStoreClient.open(id=id, name=name, alias=alias, configuration=configuration)
         await self._purge_if_needed(client, configuration)
         return client
 
@@ -61,9 +73,10 @@ class FileSystemStorageClient(StorageClient):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> FileSystemRequestQueueClient:
         configuration = configuration or Configuration.get_global_configuration()
-        client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=configuration)
+        client = await FileSystemRequestQueueClient.open(id=id, name=name, alias=alias, configuration=configuration)
         await self._purge_if_needed(client, configuration)
         return client

crawlee/storage_clients/_file_system/_utils.py: file without changes
crawlee/storage_clients/_memory/_dataset_client.py

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any
 from typing_extensions import override
 
 from crawlee._utils.crypto import crypto_random_object_id
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
 
@@ -53,6 +54,7 @@ class MemoryDatasetClient(DatasetClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
     ) -> MemoryDatasetClient:
         """Open or create a new memory dataset client.
 
@@ -60,14 +62,24 @@ class MemoryDatasetClient(DatasetClient):
         datasets don't check for existing datasets with the same name or ID since all data exists only in memory
         and is lost when the process terminates.
 
+        Alias does not have any effect on the memory storage client implementation, because unnamed storages
+        are supported by default, since data are not persisted.
+
         Args:
             id: The ID of the dataset. If not provided, a random ID will be generated.
-            name: The name of the dataset
+            name: The name of the dataset for named (global scope) storages.
+            alias: The alias of the dataset for unnamed (run scope) storages.
 
         Returns:
             An instance for the opened or created storage client.
+
+        Raises:
+            ValueError: If both name and alias are provided, or if neither id, name, nor alias is provided.
         """
-        #
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        # Create a new dataset
         dataset_id = id or crypto_random_object_id()
         now = datetime.now(timezone.utc)
 
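Because the memory client persists nothing, alias only affects how an instance is looked up, never where data lives. A short sketch, assuming the public Dataset.open() accepts alias and a storage_client override as the client signatures in this diff suggest:

import asyncio

from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    dataset = await Dataset.open(alias='scratch', storage_client=MemoryStorageClient())
    await dataset.push_data({'url': 'https://example.com'})  # held in memory, gone on exit


asyncio.run(main())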
crawlee/storage_clients/_memory/_key_value_store_client.py

@@ -8,6 +8,7 @@ from typing_extensions import override
 
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import infer_mime_type
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import KeyValueStoreClient
 from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
 
@@ -51,6 +52,7 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
     ) -> MemoryKeyValueStoreClient:
         """Open or create a new memory key-value store client.
 
@@ -58,14 +60,24 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
         memory KVS don't check for existing stores with the same name or ID since all data exists only in memory
         and is lost when the process terminates.
 
+        Alias does not have any effect on the memory storage client implementation, because unnamed storages
+        are supported by default, since data are not persisted.
+
         Args:
             id: The ID of the key-value store. If not provided, a random ID will be generated.
-            name: The name of the key-value store
+            name: The name of the key-value store for named (global scope) storages.
+            alias: The alias of the key-value store for unnamed (run scope) storages.
 
         Returns:
             An instance for the opened or created storage client.
+
+        Raises:
+            ValueError: If both name and alias are provided.
         """
-        #
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        # Create a new key-value store
         store_id = id or crypto_random_object_id()
         now = datetime.now(timezone.utc)
 