crawlee 1.0.1b8__py3-none-any.whl → 1.0.5b18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. crawlee/_request.py +31 -20
  2. crawlee/_service_locator.py +4 -4
  3. crawlee/_types.py +10 -16
  4. crawlee/_utils/recoverable_state.py +32 -8
  5. crawlee/_utils/recurring_task.py +15 -0
  6. crawlee/_utils/robots.py +17 -5
  7. crawlee/_utils/sitemap.py +1 -1
  8. crawlee/_utils/urls.py +9 -2
  9. crawlee/browsers/_browser_pool.py +4 -1
  10. crawlee/browsers/_playwright_browser_controller.py +1 -1
  11. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  12. crawlee/browsers/_types.py +1 -1
  13. crawlee/configuration.py +3 -1
  14. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
  15. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +33 -13
  16. crawlee/crawlers/_basic/_basic_crawler.py +23 -12
  17. crawlee/crawlers/_playwright/_playwright_crawler.py +11 -4
  18. crawlee/fingerprint_suite/_header_generator.py +2 -2
  19. crawlee/otel/crawler_instrumentor.py +3 -3
  20. crawlee/request_loaders/_sitemap_request_loader.py +5 -0
  21. crawlee/sessions/_session_pool.py +1 -1
  22. crawlee/statistics/_error_snapshotter.py +1 -1
  23. crawlee/statistics/_statistics.py +41 -31
  24. crawlee/storage_clients/__init__.py +4 -0
  25. crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
  26. crawlee/storage_clients/_file_system/_key_value_store_client.py +2 -2
  27. crawlee/storage_clients/_file_system/_request_queue_client.py +26 -8
  28. crawlee/storage_clients/_memory/_dataset_client.py +2 -2
  29. crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
  30. crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
  31. crawlee/storage_clients/_redis/__init__.py +6 -0
  32. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  33. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  34. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  35. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  36. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  37. crawlee/storage_clients/_redis/_utils.py +23 -0
  38. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  39. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  40. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  41. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  42. crawlee/storage_clients/_redis/py.typed +0 -0
  43. crawlee/storage_clients/_sql/_dataset_client.py +2 -2
  44. crawlee/storage_clients/_sql/_db_models.py +1 -2
  45. crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
  46. crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
  47. crawlee/storage_clients/_sql/_storage_client.py +10 -1
  48. crawlee/storages/_base.py +3 -1
  49. crawlee/storages/_dataset.py +3 -0
  50. crawlee/storages/_key_value_store.py +8 -2
  51. crawlee/storages/_request_queue.py +3 -0
  52. crawlee/storages/_storage_instance_manager.py +9 -1
  53. crawlee/storages/_utils.py +11 -0
  54. {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/METADATA +9 -5
  55. {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/RECORD +58 -45
  56. {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/WHEEL +0 -0
  57. {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/entry_points.txt +0 -0
  58. {crawlee-1.0.1b8.dist-info → crawlee-1.0.5b18.dist-info}/licenses/LICENSE +0 -0
crawlee/storage_clients/_redis/_storage_client.py ADDED
@@ -0,0 +1,146 @@
+ from __future__ import annotations
+
+ import warnings
+ from typing import Literal
+
+ from redis.asyncio import Redis
+ from typing_extensions import override
+
+ from crawlee._utils.docs import docs_group
+ from crawlee.configuration import Configuration
+ from crawlee.storage_clients._base import StorageClient
+
+ from ._dataset_client import RedisDatasetClient
+ from ._key_value_store_client import RedisKeyValueStoreClient
+ from ._request_queue_client import RedisRequestQueueClient
+
+
+ @docs_group('Storage clients')
+ class RedisStorageClient(StorageClient):
+     """Redis implementation of the storage client.
+
+     This storage client provides access to datasets, key-value stores, and request queues that persist data
+     to a Redis database v8.0+. Each storage type uses Redis-specific data structures and key patterns for
+     efficient storage and retrieval.
+
+     The client accepts either a Redis connection string or a pre-configured Redis client instance.
+     Exactly one of these parameters must be provided during initialization.
+
+     Storage types use the following Redis data structures:
+     - **Datasets**: Redis JSON arrays for item storage with metadata in JSON objects
+     - **Key-value stores**: Redis hashes for key-value pairs with separate metadata storage
+     - **Request queues**: Redis lists for FIFO queuing, hashes for request data and in-progress tracking,
+       and Bloom filters for request deduplication
+
+     Warning:
+         This is an experimental feature. The behavior and interface may change in future versions.
+     """
+
+     def __init__(
+         self,
+         *,
+         connection_string: str | None = None,
+         redis: Redis | None = None,
+         queue_dedup_strategy: Literal['default', 'bloom'] = 'default',
+         queue_bloom_error_rate: float = 1e-7,
+     ) -> None:
+         """Initialize the Redis storage client.
+
+         Args:
+             connection_string: Redis connection string (e.g., "redis://localhost:6379").
+                 Supports the standard Redis URL format with optional database selection.
+             redis: Pre-configured Redis client instance.
+             queue_dedup_strategy: Strategy for request queue deduplication. Options are:
+                 - 'default': Uses Redis sets for exact deduplication.
+                 - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage.
+                   With the default error rate, roughly 1 in 10 million requests is falsely considered
+                   a duplicate.
+             queue_bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant
+                 if `queue_dedup_strategy` is set to 'bloom'.
+         """
+         match (redis, connection_string):
+             case (None, None):
+                 raise ValueError('Either redis or connection_string must be provided.')
+             case (Redis(), None):
+                 self._redis = redis
+             case (None, str()):
+                 self._redis = Redis.from_url(connection_string)
+             case (Redis(), str()):
+                 raise ValueError('Either redis or connection_string must be provided, not both.')
+
+         self._queue_dedup_strategy = queue_dedup_strategy
+         self._queue_bloom_error_rate = queue_bloom_error_rate
+
+         # Emit the experimental-feature warning only once, at construction time
+         warnings.warn(
+             (
+                 'RedisStorageClient is experimental and its API, behavior, and key structure may change in future '
+                 'releases.'
+             ),
+             category=UserWarning,
+             stacklevel=2,
+         )
+
+     @override
+     async def create_dataset_client(
+         self,
+         *,
+         id: str | None = None,
+         name: str | None = None,
+         alias: str | None = None,
+         configuration: Configuration | None = None,
+     ) -> RedisDatasetClient:
+         configuration = configuration or Configuration.get_global_configuration()
+
+         client = await RedisDatasetClient.open(
+             id=id,
+             name=name,
+             alias=alias,
+             redis=self._redis,
+         )
+
+         await self._purge_if_needed(client, configuration)
+         return client
+
+     @override
+     async def create_kvs_client(
+         self,
+         *,
+         id: str | None = None,
+         name: str | None = None,
+         alias: str | None = None,
+         configuration: Configuration | None = None,
+     ) -> RedisKeyValueStoreClient:
+         configuration = configuration or Configuration.get_global_configuration()
+
+         client = await RedisKeyValueStoreClient.open(
+             id=id,
+             name=name,
+             alias=alias,
+             redis=self._redis,
+         )
+
+         await self._purge_if_needed(client, configuration)
+         return client
+
+     @override
+     async def create_rq_client(
+         self,
+         *,
+         id: str | None = None,
+         name: str | None = None,
+         alias: str | None = None,
+         configuration: Configuration | None = None,
+     ) -> RedisRequestQueueClient:
+         configuration = configuration or Configuration.get_global_configuration()
+
+         client = await RedisRequestQueueClient.open(
+             id=id,
+             name=name,
+             alias=alias,
+             redis=self._redis,
+             dedup_strategy=self._queue_dedup_strategy,
+             bloom_error_rate=self._queue_bloom_error_rate,
+         )
+
+         await self._purge_if_needed(client, configuration)
+         return client
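
For orientation, a minimal usage sketch built from the constructor above. It assumes `RedisStorageClient` is re-exported from `crawlee.storage_clients` (consistent with the `__init__.py` additions in files #24 and #31) and that the client is registered globally through `service_locator`, the usual crawlee pattern:

```python
import asyncio

from crawlee import service_locator
from crawlee.storage_clients import RedisStorageClient


async def main() -> None:
    # Exactly one of `connection_string` / `redis` may be given (see the match statement above).
    storage_client = RedisStorageClient(
        connection_string='redis://localhost:6379',
        queue_dedup_strategy='bloom',  # probabilistic dedup with lower memory usage
        queue_bloom_error_rate=1e-7,  # ~1 false duplicate per 10 million requests
    )

    # Make it the default backend for datasets, key-value stores, and request queues.
    service_locator.set_storage_client(storage_client)


asyncio.run(main())
```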
crawlee/storage_clients/_redis/_utils.py ADDED
@@ -0,0 +1,23 @@
+ from collections.abc import Awaitable
+ from pathlib import Path
+ from typing import TypeVar, overload
+
+ T = TypeVar('T')
+
+
+ @overload
+ async def await_redis_response(response: Awaitable[T]) -> T: ...
+ @overload
+ async def await_redis_response(response: T) -> T: ...
+
+
+ async def await_redis_response(response: Awaitable[T] | T) -> T:
+     """Normalize the ambiguous typing of redis-py commands, which may return a value or an awaitable."""
+     return await response if isinstance(response, Awaitable) else response
+
+
+ def read_lua_script(script_name: str) -> str:
+     """Read a Lua script from a file."""
+     file_path = Path(__file__).parent / 'lua_scripts' / script_name
+     with file_path.open('r', encoding='utf-8') as file:
+         return file.read()
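
To see why `await_redis_response` exists: redis-py shares many method signatures between its sync and async clients, so commands are often typed as returning either a plain value or an awaitable. A hedged illustration (the helper lives in a private module, so the import path is shown for demonstration only):

```python
from redis.asyncio import Redis

from crawlee.storage_clients._redis._utils import await_redis_response


async def queue_length(redis: Redis, queue_key: str) -> int:
    # `llen` is typed as possibly-awaitable; the helper collapses both
    # shapes to a plain `int` without needing `type: ignore` comments.
    return await await_redis_response(redis.llen(queue_key))
```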
crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua ADDED
@@ -0,0 +1,36 @@
+ local added_filter_key = KEYS[1]
+ local queue_key = KEYS[2]
+ local data_key = KEYS[3]
+
+ local forefront = ARGV[1] == '1'
+ local unique_keys = cjson.decode(ARGV[2])
+ local requests_data = cjson.decode(ARGV[3])
+
+ -- Add and check which unique keys are actually new using Bloom filter
+ local bf_results = redis.call('bf.madd', added_filter_key, unpack(unique_keys))
+
+ local actually_added = {}
+ local hset_args = {}
+
+ -- Process the results
+ for i, unique_key in ipairs(unique_keys) do
+     if bf_results[i] == 1 then
+         -- This key was added by us (did not exist before)
+         table.insert(hset_args, unique_key)
+         table.insert(hset_args, requests_data[unique_key])
+         table.insert(actually_added, unique_key)
+     end
+ end
+
+ -- Add only those that are actually new
+ if #actually_added > 0 then
+     redis.call('hset', data_key, unpack(hset_args))
+
+     if forefront then
+         redis.call('lpush', queue_key, unpack(actually_added))
+     else
+         redis.call('rpush', queue_key, unpack(actually_added))
+     end
+ end
+
+ return cjson.encode(actually_added)
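
A sketch of how a script like this can be driven from Python using redis-py's `register_script`, which caches the script by SHA and invokes it via EVALSHA. The key names below are hypothetical placeholders; the real key layout is defined in `_request_queue_client.py`:

```python
import json

from redis.asyncio import Redis

from crawlee.storage_clients._redis._utils import read_lua_script


async def add_requests(redis: Redis, prefix: str, requests: dict[str, str]) -> list[str]:
    """Add requests keyed by unique_key; return the unique keys that were actually new."""
    script = redis.register_script(read_lua_script('atomic_bloom_add_requests.lua'))
    result = await script(
        keys=[f'{prefix}:added_filter', f'{prefix}:queue', f'{prefix}:data'],  # hypothetical key names
        args=['0', json.dumps(list(requests)), json.dumps(requests)],  # '0' = append to the tail
    )
    return json.loads(result)
```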
crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua ADDED
@@ -0,0 +1,49 @@
+ local queue_key = KEYS[1]
+ local in_progress_key = KEYS[2]
+ local data_key = KEYS[3]
+ local client_id = ARGV[1]
+ local blocked_until_timestamp = ARGV[2]
+ local batch_size = tonumber(ARGV[3])
+
+ -- Pop a batch of unique keys from the queue
+ local batch_result = redis.call('LMPOP', 1, queue_key, 'LEFT', 'COUNT', batch_size)
+ if not batch_result then
+     return nil
+ end
+ local unique_keys = batch_result[2]
+
+ -- Get requests data
+ local requests_data = redis.call('HMGET', data_key, unpack(unique_keys))
+ if not requests_data then
+     -- Data missing, skip this batch
+     return nil
+ end
+
+ -- Prepare results and update in_progress
+ local final_result = {}
+ local in_progress_hmset = {}
+ local pending_decrement = 0
+ local in_progress_data = cjson.encode({
+     client_id = client_id,
+     blocked_until_timestamp = tonumber(blocked_until_timestamp)
+ })
+ for i = 1, #unique_keys do
+     local unique_key = unique_keys[i]
+     local request_data = requests_data[i]
+
+     if request_data then
+         -- Add to in_progress hash
+         table.insert(in_progress_hmset, unique_key)
+         table.insert(in_progress_hmset, in_progress_data)
+
+         table.insert(final_result, request_data)
+     end
+ end
+
+ -- Update in_progress hash
+ if #in_progress_hmset > 0 then
+     redis.call('HMSET', in_progress_key, unpack(in_progress_hmset))
+ end
+
+ -- Return result with requests data
+ return final_result
crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua ADDED
@@ -0,0 +1,37 @@
+ local added_filter_key = KEYS[1]
+ local queue_key = KEYS[2]
+ local data_key = KEYS[3]
+
+ local forefront = ARGV[1] == '1'
+ local unique_keys = cjson.decode(ARGV[2])
+ local requests_data = cjson.decode(ARGV[3])
+
+ -- Add and check which unique keys are actually new using Redis set
+ local actually_added = {}
+ local hset_args = {}
+
+ -- Process each unique key
+ for _, unique_key in ipairs(unique_keys) do
+     -- Try to add the key to the set, returns 1 if added, 0 if already existed
+     local set_result = redis.call('sadd', added_filter_key, unique_key)
+
+     if set_result == 1 then
+         -- This key was added by us (did not exist before)
+         table.insert(hset_args, unique_key)
+         table.insert(hset_args, requests_data[unique_key])
+         table.insert(actually_added, unique_key)
+     end
+ end
+
+ -- Add only those that are actually new
+ if #actually_added > 0 then
+     redis.call('hset', data_key, unpack(hset_args))
+
+     if forefront then
+         redis.call('lpush', queue_key, unpack(actually_added))
+     else
+         redis.call('rpush', queue_key, unpack(actually_added))
+     end
+ end
+
+ return cjson.encode(actually_added)
crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua ADDED
@@ -0,0 +1,34 @@
+ local in_progress_key = KEYS[1]
+ local queue_key = KEYS[2]
+ local data_key = KEYS[3]
+ local current_time = tonumber(ARGV[1])
+
+ local max_reclaim = 1000
+
+ local cursor = "0"
+ local count = 0
+
+ repeat
+     local result = redis.call('hscan', in_progress_key, cursor, 'COUNT', 100)
+     cursor = result[1]
+     local entries = result[2]
+
+     for i = 1, #entries, 2 do
+         if count >= max_reclaim then
+             break
+         end
+
+         local unique_key = entries[i]
+         local data = cjson.decode(entries[i + 1])
+
+         -- Check if timed out
+         if current_time > data.blocked_until_timestamp then
+             -- Atomically remove from in_progress and add back to queue
+             redis.call('hdel', in_progress_key, unique_key)
+             redis.call('rpush', queue_key, unique_key)
+             count = count + 1
+         end
+     end
+ until cursor == "0" or count >= max_reclaim
+
+ return count
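
This script is the crash-recovery counterpart of the fetch script above: any request whose `blocked_until_timestamp` has passed is removed from the in-progress hash and pushed back onto the tail of the pending list. A hedged sketch of invoking it periodically (key names are hypothetical; the real client presumably schedules this through `crawlee._utils.recurring_task`, which also changed in this release):

```python
import time

from redis.asyncio import Redis

from crawlee.storage_clients._redis._utils import read_lua_script


async def reclaim_stale(redis: Redis, prefix: str) -> int:
    """Return stale in-progress requests to the queue; reclaims at most 1000 per call."""
    script = redis.register_script(read_lua_script('reclaim_stale_requests.lua'))
    reclaimed = await script(
        keys=[f'{prefix}:in_progress', f'{prefix}:queue', f'{prefix}:data'],  # hypothetical key names
        args=[str(time.time())],  # compared against each entry's blocked_until_timestamp
    )
    return int(reclaimed)
```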
crawlee/storage_clients/_redis/py.typed ADDED
File without changes (empty PEP 561 type-marker file).
crawlee/storage_clients/_sql/_dataset_client.py CHANGED
@@ -4,7 +4,7 @@ from logging import getLogger
  from typing import TYPE_CHECKING, Any
 
  from sqlalchemy import Select, insert, select
- from typing_extensions import override
+ from typing_extensions import Self, override
 
  from crawlee.storage_clients._base import DatasetClient
  from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
@@ -78,7 +78,7 @@ class SqlDatasetClient(DatasetClient, SqlClientMixin):
          name: str | None,
          alias: str | None,
          storage_client: SqlStorageClient,
-     ) -> SqlDatasetClient:
+     ) -> Self:
          """Open an existing dataset or create a new one.
 
          Args:
crawlee/storage_clients/_sql/_db_models.py CHANGED
@@ -205,9 +205,8 @@ class RequestDb(Base):
              'idx_fetch_available',
              'request_queue_id',
              'is_handled',
-             'time_blocked_until',
              'sequence_number',
-             postgresql_where=text('is_handled = false'),
+             postgresql_where=text('is_handled is false'),
          ),
      )
 
crawlee/storage_clients/_sql/_key_value_store_client.py CHANGED
@@ -2,10 +2,10 @@ from __future__ import annotations
 
  import json
  from logging import getLogger
- from typing import TYPE_CHECKING, Any
+ from typing import TYPE_CHECKING, Any, cast
 
- from sqlalchemy import delete, select
- from typing_extensions import override
+ from sqlalchemy import CursorResult, delete, select
+ from typing_extensions import Self, override
 
  from crawlee._utils.file import infer_mime_type
  from crawlee.storage_clients._base import KeyValueStoreClient
@@ -77,7 +77,7 @@ class SqlKeyValueStoreClient(KeyValueStoreClient, SqlClientMixin):
          name: str | None,
          alias: str | None,
          storage_client: SqlStorageClient,
-     ) -> SqlKeyValueStoreClient:
+     ) -> Self:
          """Open or create a SQL key-value store client.
 
          This method attempts to open an existing key-value store from the SQL database. If a KVS with the specified
@@ -227,6 +227,7 @@ class SqlKeyValueStoreClient(KeyValueStoreClient, SqlClientMixin):
          async with self.get_session(with_simple_commit=True) as session:
              # Delete the record if it exists
              result = await session.execute(stmt)
+             result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
 
              # Update metadata if we actually deleted something
              if result.rowcount > 0:
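
The `cast('CursorResult', ...)` line recurring through these SQL clients appears to be a typing workaround: `AsyncSession.execute()` is annotated as returning a generic `Result`, which does not expose `rowcount`, while the object actually returned for DML statements is a `CursorResult`. The conditional cast is a no-op at runtime and only narrows the static type for the `rowcount` checks that follow.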
crawlee/storage_clients/_sql/_request_queue_client.py CHANGED
@@ -5,12 +5,12 @@ from datetime import datetime, timedelta, timezone
  from functools import lru_cache
  from hashlib import sha256
  from logging import getLogger
- from typing import TYPE_CHECKING, Any
+ from typing import TYPE_CHECKING, Any, cast
 
- from sqlalchemy import func, or_, select, update
+ from sqlalchemy import CursorResult, func, or_, select, update
  from sqlalchemy.exc import SQLAlchemyError
  from sqlalchemy.orm import load_only
- from typing_extensions import NotRequired, override
+ from typing_extensions import NotRequired, Self, override
 
  from crawlee import Request
  from crawlee._utils.crypto import crypto_random_object_id
@@ -119,7 +119,7 @@ class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):
          name: str | None,
          alias: str | None,
          storage_client: SqlStorageClient,
-     ) -> SqlRequestQueueClient:
+     ) -> Self:
          """Open an existing request queue or create a new one.
 
          This method first tries to find an existing queue by ID or name.
@@ -231,6 +231,7 @@ class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):
 
          async with self.get_session() as session:
              result = await session.execute(stmt)
+             result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
              existing_requests = {req.request_id: req for req in result.scalars()}
              state = await self._get_state(session)
              insert_values: list[dict] = []
@@ -498,9 +499,12 @@ class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):
          )
          async with self.get_session() as session:
              result = await session.execute(stmt)
+             result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
+
              if result.rowcount == 0:
                  logger.warning(f'Request {request.unique_key} not found in database.')
                  return None
+
              await self._update_metadata(
                  session,
                  **_QueueMetadataUpdateParams(
@@ -542,14 +546,24 @@ class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):
              block_until = now + timedelta(seconds=self._BLOCK_REQUEST_TIME)
              # Extend blocking for a forefront request; it is considered blocked by the current client.
              stmt = stmt.values(
-                 sequence_number=new_sequence, time_blocked_until=block_until, client_key=self.client_key
+                 sequence_number=new_sequence,
+                 time_blocked_until=block_until,
+                 client_key=self.client_key,
+                 data=request.model_dump_json(),
              )
          else:
              new_sequence = state.sequence_counter
              state.sequence_counter += 1
-             stmt = stmt.values(sequence_number=new_sequence, time_blocked_until=None, client_key=None)
+             stmt = stmt.values(
+                 sequence_number=new_sequence,
+                 time_blocked_until=None,
+                 client_key=None,
+                 data=request.model_dump_json(),
+             )
 
          result = await session.execute(stmt)
+         result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
+
          if result.rowcount == 0:
              logger.warning(f'Request {request.unique_key} not found in database.')
              return None
crawlee/storage_clients/_sql/_storage_client.py CHANGED
@@ -1,5 +1,6 @@
  from __future__ import annotations
 
+ import sys
  import warnings
  from datetime import timedelta
  from pathlib import Path
@@ -149,7 +150,7 @@ class SqlStorageClient(StorageClient):
          # Raise an error if the new version creates breaking changes in the database schema.
          if db_version and db_version != __version__:
              warnings.warn(
-                 f'Database version {db_version.version} does not match library version {__version__}. '
+                 f'Database version {db_version} does not match library version {__version__}. '
                  'This may lead to unexpected behavior. Drop the db if you want to make sure that '
                  'everything will work fine.',
                  category=UserWarning,
@@ -268,6 +269,14 @@ class SqlStorageClient(StorageClient):
                  'Unsupported database. Supported: sqlite, postgresql. Consider using a different database.'
              )
 
+         # TODO: https://github.com/apify/crawlee-python/issues/1555
+         if 'postgresql' in connection_string and sys.version_info >= (3, 14):
+             raise ValueError(
+                 'SqlStorageClient cannot use PostgreSQL with Python 3.14 '
+                 'due to asyncpg compatibility limitations. '
+                 'Please use Python 3.13 or earlier, or switch to SQLite.'
+             )
+
          self._engine = create_async_engine(
              connection_string,
              future=True,
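
For context, a hedged sketch of the configuration path this check guards, assuming `SqlStorageClient` is exported from `crawlee.storage_clients` and accepts a SQLAlchemy-style `connection_string` (both consistent with the code above):

```python
from crawlee.storage_clients import SqlStorageClient

# SQLite works on any supported Python version.
sqlite_client = SqlStorageClient(connection_string='sqlite+aiosqlite:///crawlee.db')

# PostgreSQL is rejected on Python >= 3.14 by the new check above (asyncpg limitation).
postgres_client = SqlStorageClient(
    connection_string='postgresql+asyncpg://user:password@localhost:5432/crawlee',
)
```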
crawlee/storages/_base.py CHANGED
@@ -44,7 +44,9 @@ class Storage(ABC):
 
      Args:
          id: The storage ID.
-         name: The storage name (global scope, persists across runs).
+         name: The storage name (global scope, persists across runs). The name may contain only the letters
+             "a" through "z", the digits "0" through "9", and the hyphen ("-"), with hyphens allowed only
+             in the middle of the string (e.g. "my-value-1").
          alias: The storage alias (run scope, creates unnamed storage).
          configuration: Configuration object used during the storage creation or restoration process.
          storage_client: Underlying storage client to use. If not provided, the default global storage client
crawlee/storages/_dataset.py CHANGED
@@ -12,6 +12,7 @@ from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
 
  from ._base import Storage
  from ._key_value_store import KeyValueStore
+ from ._utils import validate_storage_name
 
  if TYPE_CHECKING:
      from collections.abc import AsyncIterator
@@ -75,6 +76,8 @@ class Dataset(Storage):
              id: The unique identifier of the storage.
              name: The name of the storage, if available.
          """
+         validate_storage_name(name)
+
          self._client = client
          self._id = id
          self._name = name
crawlee/storages/_key_value_store.py CHANGED
@@ -15,6 +15,7 @@ from crawlee._utils.recoverable_state import RecoverableState
  from crawlee.storage_clients.models import KeyValueStoreMetadata
 
  from ._base import Storage
+ from ._utils import validate_storage_name
 
  if TYPE_CHECKING:
      from collections.abc import AsyncIterator
@@ -84,6 +85,8 @@ class KeyValueStore(Storage):
              id: The unique identifier of the storage.
              name: The name of the storage, if available.
          """
+         validate_storage_name(name)
+
          self._client = client
          self._id = id
          self._name = name
@@ -278,11 +281,14 @@ class KeyValueStore(Storage):
          if key in cache:
              return cache[key].current_value.root
 
+         async def kvs_factory() -> KeyValueStore:
+             return self
+
          cache[key] = recoverable_state = RecoverableState(
              default_state=AutosavedValue(default_value),
-             persistence_enabled=True,
-             persist_state_kvs_id=self.id,
              persist_state_key=key,
+             persistence_enabled=True,
+             persist_state_kvs_factory=kvs_factory,
              logger=logger,
          )
 
crawlee/storages/_request_queue.py CHANGED
@@ -13,6 +13,7 @@ from crawlee._utils.wait import wait_for_all_tasks_for_finish
  from crawlee.request_loaders import RequestManager
 
  from ._base import Storage
+ from ._utils import validate_storage_name
 
  if TYPE_CHECKING:
      from collections.abc import Sequence
@@ -80,6 +81,8 @@ class RequestQueue(Storage, RequestManager):
              id: The unique identifier of the storage.
              name: The name of the storage, if available.
          """
+         validate_storage_name(name)
+
          self._client = client
          self._id = id
          self._name = name
crawlee/storages/_storage_instance_manager.py CHANGED
@@ -8,6 +8,8 @@ from typing import TYPE_CHECKING, TypeVar
  from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
  from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient
 
+ from ._utils import validate_storage_name
+
  if TYPE_CHECKING:
      from ._base import Storage
 
@@ -90,7 +92,9 @@ class StorageInstanceManager:
      Args:
          cls: The storage class to instantiate.
          id: Storage ID.
-         name: Storage name. (global scope, persists across runs).
+         name: Storage name (global scope, persists across runs). The name may contain only the letters
+             "a" through "z", the digits "0" through "9", and the hyphen ("-"), with hyphens allowed only
+             in the middle of the string (e.g. "my-value-1").
          alias: Storage alias (run scope, creates unnamed storage).
          client_opener_coro: Coroutine to open the storage client when storage instance not found in cache.
          storage_client_cache_key: Additional optional key from storage client to differentiate cache entries.
@@ -146,6 +150,10 @@ class StorageInstanceManager:
              f'Use a different name or drop the existing alias storage first.'
          )
 
+         # Validate storage name
+         if name is not None:
+             validate_storage_name(name)
+
          # Create new instance
          client: KeyValueStoreClient | DatasetClient | RequestQueueClient
          client = await client_opener_coro
crawlee/storages/_utils.py ADDED
@@ -0,0 +1,11 @@
+ import re
+
+ NAME_REGEX = re.compile(r'^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])$')
+
+
+ def validate_storage_name(name: str | None) -> None:
+     if name and not NAME_REGEX.match(name):
+         raise ValueError(
+             f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" '
+             'through "9", and the hyphen ("-"), but only in the middle of the string (e.g. "my-value-1").'
+         )
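
Behavior implied by `NAME_REGEX` above (note that the pattern also accepts uppercase letters, even though the error message mentions only "a" through "z"; the import path is private and shown for illustration):

```python
from crawlee.storages._utils import validate_storage_name

validate_storage_name('my-value-1')  # OK: alphanumerics with interior hyphens
validate_storage_name('a')           # OK: a single alphanumeric character
validate_storage_name(None)          # OK: unnamed (alias) storages skip validation
validate_storage_name('-my-value')   # raises ValueError: hyphen at the start
validate_storage_name('my_value')    # raises ValueError: underscore is not allowed
```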