crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_service_locator.py +4 -4
  5. crawlee/_types.py +44 -5
  6. crawlee/_utils/context.py +3 -3
  7. crawlee/_utils/file.py +8 -1
  8. crawlee/_utils/globs.py +4 -4
  9. crawlee/_utils/recoverable_state.py +32 -8
  10. crawlee/_utils/recurring_task.py +27 -3
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +13 -6
  13. crawlee/_utils/system.py +27 -11
  14. crawlee/_utils/time.py +41 -1
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +5 -2
  17. crawlee/browsers/_playwright_browser.py +2 -1
  18. crawlee/browsers/_playwright_browser_controller.py +1 -1
  19. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  20. crawlee/browsers/_types.py +1 -1
  21. crawlee/configuration.py +3 -1
  22. crawlee/crawlers/__init__.py +5 -1
  23. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  24. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
  25. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  28. crawlee/crawlers/_basic/_basic_crawler.py +156 -131
  29. crawlee/crawlers/_basic/_context_utils.py +24 -0
  30. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  31. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  32. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/errors.py +4 -0
  39. crawlee/events/_event_manager.py +12 -6
  40. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/http_clients/_base.py +4 -0
  43. crawlee/http_clients/_curl_impersonate.py +68 -14
  44. crawlee/http_clients/_httpx.py +16 -6
  45. crawlee/http_clients/_impit.py +25 -10
  46. crawlee/otel/crawler_instrumentor.py +4 -6
  47. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  48. crawlee/router.py +13 -3
  49. crawlee/sessions/_cookies.py +13 -8
  50. crawlee/sessions/_models.py +3 -3
  51. crawlee/sessions/_session_pool.py +1 -1
  52. crawlee/statistics/_error_snapshotter.py +1 -1
  53. crawlee/statistics/_models.py +51 -9
  54. crawlee/statistics/_statistics.py +24 -33
  55. crawlee/storage_clients/__init__.py +4 -0
  56. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  57. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  58. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  59. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  60. crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
  61. crawlee/storage_clients/_redis/__init__.py +6 -0
  62. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  63. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  64. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  65. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  66. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  67. crawlee/storage_clients/_redis/_utils.py +23 -0
  68. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  69. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  70. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  71. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  72. crawlee/storage_clients/_redis/py.typed +0 -0
  73. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  74. crawlee/storage_clients/_sql/_db_models.py +1 -2
  75. crawlee/storage_clients/models.py +8 -3
  76. crawlee/storages/_key_value_store.py +5 -2
  77. crawlee/storages/_storage_instance_manager.py +103 -44
  78. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
  79. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
  80. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  81. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  82. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/storage_clients/_redis/_storage_client.py
@@ -0,0 +1,149 @@
+ from __future__ import annotations
+
+ import warnings
+ from typing import Literal
+
+ from redis.asyncio import Redis
+ from typing_extensions import override
+
+ from crawlee._utils.docs import docs_group
+ from crawlee.configuration import Configuration
+ from crawlee.storage_clients._base import StorageClient
+
+ from ._dataset_client import RedisDatasetClient
+ from ._key_value_store_client import RedisKeyValueStoreClient
+ from ._request_queue_client import RedisRequestQueueClient
+
+
+ @docs_group('Storage clients')
+ class RedisStorageClient(StorageClient):
+     """Redis implementation of the storage client.
+
+     This storage client provides access to datasets, key-value stores, and request queues that persist data
+     to a Redis database v8.0+. Each storage type uses Redis-specific data structures and key patterns for
+     efficient storage and retrieval.
+
+     The client accepts either a Redis connection string or a pre-configured Redis client instance.
+     Exactly one of these parameters must be provided during initialization.
+
+     Storage types use the following Redis data structures:
+     - **Datasets**: Redis JSON arrays for item storage with metadata in JSON objects
+     - **Key-value stores**: Redis hashes for key-value pairs with separate metadata storage
+     - **Request queues**: Redis lists for FIFO queuing, hashes for request data and in-progress tracking,
+       and Bloom filters for request deduplication
+
+     Warning:
+         This is an experimental feature. The behavior and interface may change in future versions.
+     """
+
+     def __init__(
+         self,
+         *,
+         connection_string: str | None = None,
+         redis: Redis | None = None,
+         queue_dedup_strategy: Literal['default', 'bloom'] = 'default',
+         queue_bloom_error_rate: float = 1e-7,
+     ) -> None:
+         """Initialize the Redis storage client.
+
+         Args:
+             connection_string: Redis connection string (e.g., "redis://localhost:6379").
+                 Supports standard Redis URL format with optional database selection.
+             redis: Pre-configured Redis client instance.
+             queue_dedup_strategy: Strategy for request queue deduplication. Options are:
+                 - 'default': Uses Redis sets for exact deduplication.
+                 - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When using
+                   this approach, approximately 1 in 1e-7 requests will be falsely considered duplicate.
+             queue_bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant if
+                 `queue_dedup_strategy` is set to 'bloom'.
+         """
+         if redis is None and connection_string is None:
+             raise ValueError('Either redis or connection_string must be provided.')
+
+         if redis is not None and connection_string is not None:
+             raise ValueError('Either redis or connection_string must be provided, not both.')
+
+         if isinstance(redis, Redis) and connection_string is None:
+             self._redis = redis
+
+         if isinstance(connection_string, str) and redis is None:
+             self._redis = Redis.from_url(connection_string)
+
+         self._redis: Redis  # to help type checker
+         self._queue_dedup_strategy = queue_dedup_strategy
+         self._queue_bloom_error_rate = queue_bloom_error_rate
+
+         # Call the notification only once
+         warnings.warn(
+             (
+                 'RedisStorageClient is experimental and its API, behavior, and key structure may change in future '
+                 'releases.'
+             ),
+             category=UserWarning,
+             stacklevel=2,
+         )
+
+     @override
+     async def create_dataset_client(
+         self,
+         *,
+         id: str | None = None,
+         name: str | None = None,
+         alias: str | None = None,
+         configuration: Configuration | None = None,
+     ) -> RedisDatasetClient:
+         configuration = configuration or Configuration.get_global_configuration()
+
+         client = await RedisDatasetClient.open(
+             id=id,
+             name=name,
+             alias=alias,
+             redis=self._redis,
+         )
+
+         await self._purge_if_needed(client, configuration)
+         return client
+
+     @override
+     async def create_kvs_client(
+         self,
+         *,
+         id: str | None = None,
+         name: str | None = None,
+         alias: str | None = None,
+         configuration: Configuration | None = None,
+     ) -> RedisKeyValueStoreClient:
+         configuration = configuration or Configuration.get_global_configuration()
+
+         client = await RedisKeyValueStoreClient.open(
+             id=id,
+             name=name,
+             alias=alias,
+             redis=self._redis,
+         )
+
+         await self._purge_if_needed(client, configuration)
+         return client
+
+     @override
+     async def create_rq_client(
+         self,
+         *,
+         id: str | None = None,
+         name: str | None = None,
+         alias: str | None = None,
+         configuration: Configuration | None = None,
+     ) -> RedisRequestQueueClient:
+         configuration = configuration or Configuration.get_global_configuration()
+
+         client = await RedisRequestQueueClient.open(
+             id=id,
+             name=name,
+             alias=alias,
+             redis=self._redis,
+             dedup_strategy=self._queue_dedup_strategy,
+             bloom_error_rate=self._queue_bloom_error_rate,
+         )
+
+         await self._purge_if_needed(client, configuration)
+         return client
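For orientation, a minimal usage sketch of the client added above. It assumes RedisStorageClient is re-exported from crawlee.storage_clients (the __init__.py entries in the file list suggest this) and that a Redis 8+ server is reachable at the placeholder URL; crawlers normally call the factory methods internally, so the direct create_dataset_client() call below only illustrates what this file adds.

    import asyncio

    from crawlee.storage_clients import RedisStorageClient  # assumed re-export, see __init__.py in the file list


    async def main() -> None:
        # Exactly one of `connection_string` or `redis` may be passed (enforced in __init__ above).
        storage_client = RedisStorageClient(
            connection_string='redis://localhost:6379',  # placeholder URL
            queue_dedup_strategy='bloom',                # probabilistic dedup, lower memory
            queue_bloom_error_rate=1e-7,                 # tolerated false-positive rate
        )

        # Calling a factory method directly just demonstrates the API shown in this file.
        dataset_client = await storage_client.create_dataset_client(name='example-dataset')
        print(await dataset_client.get_metadata())


    asyncio.run(main())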
crawlee/storage_clients/_redis/_utils.py
@@ -0,0 +1,23 @@
+ from collections.abc import Awaitable
+ from pathlib import Path
+ from typing import TypeVar, overload
+
+ T = TypeVar('T')
+
+
+ @overload
+ async def await_redis_response(response: Awaitable[T]) -> T: ...
+ @overload
+ async def await_redis_response(response: T) -> T: ...
+
+
+ async def await_redis_response(response: Awaitable[T] | T) -> T:
+     """Solve the problem of ambiguous typing for redis."""
+     return await response if isinstance(response, Awaitable) else response
+
+
+ def read_lua_script(script_name: str) -> str:
+     """Read a Lua script from a file."""
+     file_path = Path(__file__).parent / 'lua_scripts' / script_name
+     with file_path.open(mode='r', encoding='utf-8') as file:
+         return file.read()
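A short sketch of how these helpers are typically combined elsewhere in the Redis storage clients; the concrete call sites are not part of this excerpt, so the redis instance and key name below are placeholders.

    from redis.asyncio import Redis

    from crawlee.storage_clients._redis._utils import await_redis_response, read_lua_script


    async def example(redis: Redis) -> None:
        # Several redis-py asyncio commands are typed as `Awaitable[T] | T`;
        # wrapping them keeps type checkers happy without changing behavior.
        key_count = await await_redis_response(redis.exists('placeholder:key'))
        print(key_count)

        # Load one of the bundled scripts from the lua_scripts/ directory.
        script_source = read_lua_script('reclaim_stale_requests.lua')
        print(len(script_source))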
crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua
@@ -0,0 +1,36 @@
+ local added_filter_key = KEYS[1]
+ local queue_key = KEYS[2]
+ local data_key = KEYS[3]
+
+ local forefront = ARGV[1] == '1'
+ local unique_keys = cjson.decode(ARGV[2])
+ local requests_data = cjson.decode(ARGV[3])
+
+ -- Add and check which unique keys are actually new using Bloom filter
+ local bf_results = redis.call('bf.madd', added_filter_key, unpack(unique_keys))
+
+ local actually_added = {}
+ local hset_args = {}
+
+ -- Process the results
+ for i, unique_key in ipairs(unique_keys) do
+     if bf_results[i] == 1 then
+         -- This key was added by us (did not exist before)
+         table.insert(hset_args, unique_key)
+         table.insert(hset_args, requests_data[unique_key])
+         table.insert(actually_added, unique_key)
+     end
+ end
+
+ -- Add only those that are actually new
+ if #actually_added > 0 then
+     redis.call('hset', data_key, unpack(hset_args))
+
+     if forefront then
+         redis.call('lpush', queue_key, unpack(actually_added))
+     else
+         redis.call('rpush', queue_key, unpack(actually_added))
+     end
+ end
+
+ return cjson.encode(actually_added)
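The script receives its keys and JSON-encoded arguments from Python. A simplified sketch of how it could be registered and invoked with redis-py follows; only the KEYS/ARGV contract comes from the script itself, while the key names, the request payload shape, and the real wiring inside _request_queue_client.py are assumptions (bf.madd also requires Redis 8+ or the RedisBloom module).

    import json

    from redis.asyncio import Redis

    from crawlee.storage_clients._redis._utils import read_lua_script


    async def add_requests_example(redis: Redis) -> list[str]:
        # register_script() caches the script server-side by its SHA.
        add_requests = redis.register_script(read_lua_script('atomic_bloom_add_requests.lua'))

        # Illustrative payloads keyed by unique key; the real request serialization differs.
        requests_data = {'https://example.com': json.dumps({'url': 'https://example.com'})}

        raw = await add_requests(
            # KEYS: Bloom filter of seen keys, pending-queue list, request-data hash.
            keys=['rq:placeholder:added', 'rq:placeholder:queue', 'rq:placeholder:data'],
            # ARGV: forefront flag, JSON list of unique keys, JSON map of request payloads.
            args=['0', json.dumps(list(requests_data)), json.dumps(requests_data)],
        )
        # The script returns a JSON array of the unique keys that were actually new.
        return json.loads(raw)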
crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua
@@ -0,0 +1,49 @@
+ local queue_key = KEYS[1]
+ local in_progress_key = KEYS[2]
+ local data_key = KEYS[3]
+ local client_id = ARGV[1]
+ local blocked_until_timestamp = ARGV[2]
+ local batch_size = tonumber(ARGV[3])
+
+ -- Pop batch unique_key from queue
+ local batch_result = redis.call('LMPOP', 1, queue_key, 'LEFT', 'COUNT', batch_size)
+ if not batch_result then
+     return nil
+ end
+ local unique_keys = batch_result[2]
+
+ -- Get requests data
+ local requests_data = redis.call('HMGET', data_key, unpack(unique_keys))
+ if not requests_data then
+     -- Data missing, skip this request
+     return nil
+ end
+
+ -- Prepare results and update in_progress
+ local final_result = {}
+ local in_progress_hmset = {}
+ local pending_decrement = 0
+ local in_progress_data = cjson.encode({
+     client_id = client_id,
+     blocked_until_timestamp = tonumber(blocked_until_timestamp)
+ })
+ for i = 1, #unique_keys do
+     local unique_key = unique_keys[i]
+     local request_data = requests_data[i]
+
+     if request_data then
+         -- Add to in_progress hash
+         table.insert(in_progress_hmset, unique_key)
+         table.insert(in_progress_hmset, in_progress_data)
+
+         table.insert(final_result, request_data)
+     end
+ end
+
+ -- Update in_progress hash
+ if #in_progress_hmset > 0 then
+     redis.call('HMSET', in_progress_key, unpack(in_progress_hmset))
+ end
+
+ -- Return result with requests data
+ return final_result
1
+ local added_filter_key = KEYS[1]
2
+ local queue_key = KEYS[2]
3
+ local data_key = KEYS[3]
4
+
5
+ local forefront = ARGV[1] == '1'
6
+ local unique_keys = cjson.decode(ARGV[2])
7
+ local requests_data = cjson.decode(ARGV[3])
8
+
9
+ -- Add and check which unique keys are actually new using Redis set
10
+ local actually_added = {}
11
+ local hset_args = {}
12
+
13
+ -- Process each unique key
14
+ for _, unique_key in ipairs(unique_keys) do
15
+ -- Try to add the key to the set, returns 1 if added, 0 if already existed
16
+ local set_result = redis.call('sadd', added_filter_key, unique_key)
17
+
18
+ if set_result == 1 then
19
+ -- This key was added by us (did not exist before)
20
+ table.insert(hset_args, unique_key)
21
+ table.insert(hset_args, requests_data[unique_key])
22
+ table.insert(actually_added, unique_key)
23
+ end
24
+ end
25
+
26
+ -- Add only those that are actually new
27
+ if #actually_added > 0 then
28
+ redis.call('hset', data_key, unpack(hset_args))
29
+
30
+ if forefront then
31
+ redis.call('lpush', queue_key, unpack(actually_added))
32
+ else
33
+ redis.call('rpush', queue_key, unpack(actually_added))
34
+ end
35
+ end
36
+
37
+ return cjson.encode(actually_added)
crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua
@@ -0,0 +1,34 @@
+ local in_progress_key = KEYS[1]
+ local queue_key = KEYS[2]
+ local data_key = KEYS[3]
+ local current_time = tonumber(ARGV[1])
+
+ local max_reclaim = 1000
+
+ local cursor = "0"
+ local count = 0
+
+ repeat
+     local result = redis.call('hscan', in_progress_key, cursor, 'COUNT', 100)
+     cursor = result[1]
+     local entries = result[2]
+
+     for i = 1, #entries, 2 do
+         if count >= max_reclaim then
+             break
+         end
+
+         local unique_key = entries[i]
+         local data = cjson.decode(entries[i + 1])
+
+         -- Check if timed out
+         if current_time > data.blocked_until_timestamp then
+             -- Atomically remove from in_progress and add back to queue
+             redis.call('hdel', in_progress_key, unique_key)
+             redis.call('rpush', queue_key, unique_key)
+             count = count + 1
+         end
+     end
+ until cursor == "0" or count >= max_reclaim
+
+ return count
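This reclaim script compares the blocked_until_timestamp recorded by the fetch script against a caller-supplied current time, so both sides must agree on the unit. A hedged sketch of a periodic reclaim call; the key names, the schedule, and the epoch-milliseconds unit are assumptions, the real loop lives in the request queue client.

    import asyncio
    import time

    from redis.asyncio import Redis

    from crawlee.storage_clients._redis._utils import read_lua_script


    async def reclaim_periodically(redis: Redis, interval: float = 30.0) -> None:
        reclaim = redis.register_script(read_lua_script('reclaim_stale_requests.lua'))

        while True:
            reclaimed = await reclaim(
                # KEYS: in-progress hash, pending-queue list, request-data hash.
                keys=['rq:placeholder:in-progress', 'rq:placeholder:queue', 'rq:placeholder:data'],
                args=[int(time.time() * 1000)],  # assumed epoch milliseconds
            )
            print(f'Reclaimed {reclaimed} stale requests')
            await asyncio.sleep(interval)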
crawlee/storage_clients/_redis/py.typed (file without changes)
crawlee/storage_clients/_sql/_client_mixin.py
@@ -105,7 +105,7 @@ class SqlClientMixin(ABC):
              else:
                  stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)
                  result = await session.execute(stmt)
-                 orm_metadata = result.scalar_one_or_none()  # type: ignore[assignment]
+                 orm_metadata = result.scalar_one_or_none()

              if orm_metadata:
                  client = cls(id=orm_metadata.id, storage_client=storage_client)
crawlee/storage_clients/_sql/_db_models.py
@@ -205,9 +205,8 @@ class RequestDb(Base):
              'idx_fetch_available',
              'request_queue_id',
              'is_handled',
-             'time_blocked_until',
              'sequence_number',
-             postgresql_where=text('is_handled = false'),
+             postgresql_where=text('is_handled is false'),
          ),
      )

crawlee/storage_clients/models.py
@@ -1,7 +1,7 @@
  from __future__ import annotations

  from datetime import datetime
- from typing import Annotated, Any, Generic
+ from typing import TYPE_CHECKING, Annotated, Any, Generic

  from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
  from typing_extensions import TypeVar
@@ -127,8 +127,13 @@ class DatasetItemsListPage(BaseModel):
      desc: Annotated[bool, Field(default=False)]
      """Indicates if the returned list is in descending order."""

-     items: Annotated[list[dict], Field(default_factory=list)]
-     """The list of dataset items returned on this page."""
+     # Workaround for Pydantic and type checkers when using Annotated with default_factory
+     if TYPE_CHECKING:
+         items: list[dict] = []
+         """The list of dataset items returned on this page."""
+     else:
+         items: Annotated[list[dict], Field(default_factory=list)]
+         """The list of dataset items returned on this page."""


  @docs_group('Storage data')
crawlee/storages/_key_value_store.py
@@ -281,11 +281,14 @@ class KeyValueStore(Storage):
          if key in cache:
              return cache[key].current_value.root

+         async def kvs_factory() -> KeyValueStore:
+             return self
+
          cache[key] = recoverable_state = RecoverableState(
              default_state=AutosavedValue(default_value),
-             persistence_enabled=True,
-             persist_state_kvs_id=self.id,
              persist_state_key=key,
+             persistence_enabled=True,
+             persist_state_kvs_factory=kvs_factory,
              logger=logger,
          )

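For context, this hunk backs the key-value store's auto-saved values: the RecoverableState now receives a factory returning the store itself instead of a store id. A hedged sketch of the caller-facing behavior; the get_auto_saved_value name and the dict-shaped state are assumptions based on the surrounding method, which is not fully shown here.

    from crawlee.storages import KeyValueStore


    async def track_progress() -> None:
        kvs = await KeyValueStore.open()

        # The returned dict is wrapped in a RecoverableState and periodically
        # persisted back into this same store via the kvs_factory above.
        state = await kvs.get_auto_saved_value('crawl-progress', {'processed': 0})
        state['processed'] += 1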
crawlee/storages/_storage_instance_manager.py
@@ -1,9 +1,11 @@
  from __future__ import annotations

+ from asyncio import Lock
  from collections import defaultdict
  from collections.abc import Coroutine, Hashable
  from dataclasses import dataclass, field
  from typing import TYPE_CHECKING, TypeVar
+ from weakref import WeakValueDictionary

  from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
  from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient
@@ -76,6 +78,7 @@ class StorageInstanceManager:

      def __init__(self) -> None:
          self._cache: _StorageCache = _StorageCache()
+         self._opener_locks: WeakValueDictionary[tuple, Lock] = WeakValueDictionary()

      async def open_storage_instance(
          self,
@@ -119,63 +122,71 @@ class StorageInstanceManager:
              if not any([name, alias, id]):
                  alias = self._DEFAULT_STORAGE_ALIAS

-             # Check cache
-             if id is not None and (cached_instance := self._cache.by_id[cls][id].get(storage_client_cache_key)):
-                 if isinstance(cached_instance, cls):
-                     return cached_instance
-                 raise RuntimeError('Cached instance type mismatch.')
+             # Check cache without lock first for performance.
+             if cached_instance := self._get_from_cache(
+                 cls,
+                 id=id,
+                 name=name,
+                 alias=alias,
+                 storage_client_cache_key=storage_client_cache_key,
+             ):
+                 return cached_instance

-             if name is not None and (cached_instance := self._cache.by_name[cls][name].get(storage_client_cache_key)):
-                 if isinstance(cached_instance, cls):
-                     return cached_instance
-                 raise RuntimeError('Cached instance type mismatch.')
+             # Validate storage name
+             if name is not None:
+                 validate_storage_name(name)

-             if alias is not None and (
-                 cached_instance := self._cache.by_alias[cls][alias].get(storage_client_cache_key)
-             ):
-                 if isinstance(cached_instance, cls):
+             # Acquire lock for this opener
+             opener_lock_key = (cls, str(id or name or alias), storage_client_cache_key)
+             if not (lock := self._opener_locks.get(opener_lock_key)):
+                 lock = Lock()
+                 self._opener_locks[opener_lock_key] = lock
+
+             async with lock:
+                 # Another task could have created the storage while we were waiting for the lock - check if that
+                 # happened
+                 if cached_instance := self._get_from_cache(
+                     cls,
+                     id=id,
+                     name=name,
+                     alias=alias,
+                     storage_client_cache_key=storage_client_cache_key,
+                 ):
                      return cached_instance
-                 raise RuntimeError('Cached instance type mismatch.')

-             # Check for conflicts between named and alias storages
-             if alias and (self._cache.by_name[cls][alias].get(storage_client_cache_key)):
-                 raise ValueError(
-                     f'Cannot create alias storage "{alias}" because a named storage with the same name already exists. '
-                     f'Use a different alias or drop the existing named storage first.'
+                 # Check for conflicts between named and alias storages
+                 self._check_name_alias_conflict(
+                     cls,
+                     name=name,
+                     alias=alias,
+                     storage_client_cache_key=storage_client_cache_key,
                  )

-             if name and (self._cache.by_alias[cls][name].get(storage_client_cache_key)):
-                 raise ValueError(
-                     f'Cannot create named storage "{name}" because an alias storage with the same name already exists. '
-                     f'Use a different name or drop the existing alias storage first.'
-                 )
+                 # Create new instance
+                 client: KeyValueStoreClient | DatasetClient | RequestQueueClient
+                 client = await client_opener_coro

-             # Validate storage name
-             if name is not None:
-                 validate_storage_name(name)
-
-             # Create new instance
-             client: KeyValueStoreClient | DatasetClient | RequestQueueClient
-             client = await client_opener_coro
+                 metadata = await client.get_metadata()

-             metadata = await client.get_metadata()
+                 instance = cls(client, metadata.id, metadata.name)  # type: ignore[call-arg]
+                 instance_name = getattr(instance, 'name', None)

-             instance = cls(client, metadata.id, metadata.name)  # type: ignore[call-arg]
-             instance_name = getattr(instance, 'name', None)
+                 # Cache the instance.
+                 # Note: No awaits in this section. All cache entries must be written
+                 # atomically to ensure pre-checks outside the lock see consistent state.

-             # Cache the instance.
-             # Always cache by id.
-             self._cache.by_id[cls][instance.id][storage_client_cache_key] = instance
+                 # Always cache by id.
+                 self._cache.by_id[cls][instance.id][storage_client_cache_key] = instance

-             # Cache named storage.
-             if instance_name is not None:
-                 self._cache.by_name[cls][instance_name][storage_client_cache_key] = instance
+                 # Cache named storage.
+                 if instance_name is not None:
+                     self._cache.by_name[cls][instance_name][storage_client_cache_key] = instance

-             # Cache unnamed storage.
-             if alias is not None:
-                 self._cache.by_alias[cls][alias][storage_client_cache_key] = instance
+                 # Cache unnamed storage.
+                 if alias is not None:
+                     self._cache.by_alias[cls][alias][storage_client_cache_key] = instance

-             return instance
+                 return instance

          finally:
              # Make sure the client opener is closed.
@@ -193,3 +204,51 @@ class StorageInstanceManager:
      def clear_cache(self) -> None:
          """Clear all cached storage instances."""
          self._cache = _StorageCache()
+
+     def _get_from_cache(
+         self,
+         cls: type[T],
+         *,
+         id: str | None = None,
+         name: str | None = None,
+         alias: str | None = None,
+         storage_client_cache_key: Hashable = '',
+     ) -> T | None:
+         """Get a storage instance from the cache."""
+         if id is not None and (cached_instance := self._cache.by_id[cls][id].get(storage_client_cache_key)):
+             if isinstance(cached_instance, cls):
+                 return cached_instance
+             raise RuntimeError('Cached instance type mismatch.')
+
+         if name is not None and (cached_instance := self._cache.by_name[cls][name].get(storage_client_cache_key)):
+             if isinstance(cached_instance, cls):
+                 return cached_instance
+             raise RuntimeError('Cached instance type mismatch.')
+
+         if alias is not None and (cached_instance := self._cache.by_alias[cls][alias].get(storage_client_cache_key)):
+             if isinstance(cached_instance, cls):
+                 return cached_instance
+             raise RuntimeError('Cached instance type mismatch.')
+
+         return None
+
+     def _check_name_alias_conflict(
+         self,
+         cls: type[T],
+         *,
+         name: str | None = None,
+         alias: str | None = None,
+         storage_client_cache_key: Hashable = '',
+     ) -> None:
+         """Check for conflicts between named and alias storages."""
+         if alias and (self._cache.by_name[cls][alias].get(storage_client_cache_key)):
+             raise ValueError(
+                 f'Cannot create alias storage "{alias}" because a named storage with the same name already exists. '
+                 f'Use a different alias or drop the existing named storage first.'
+             )
+
+         if name and (self._cache.by_alias[cls][name].get(storage_client_cache_key)):
+             raise ValueError(
+                 f'Cannot create named storage "{name}" because an alias storage with the same name already exists. '
+                 f'Use a different name or drop the existing alias storage first.'
+             )
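Taken together, the StorageInstanceManager changes implement per-opener double-checked locking: a lock-free cache lookup, a per-key asyncio.Lock kept in a WeakValueDictionary so unused locks can be garbage-collected, and a second lookup under the lock before the storage is created and cached. A stripped-down illustration of the same pattern, independent of the crawlee classes:

    import asyncio
    from collections.abc import Awaitable, Callable
    from weakref import WeakValueDictionary


    class InstanceCache:
        def __init__(self) -> None:
            self._items: dict[str, object] = {}
            self._locks: WeakValueDictionary[str, asyncio.Lock] = WeakValueDictionary()

        async def get_or_create(self, key: str, factory: Callable[[], Awaitable[object]]) -> object:
            # Fast path: no lock needed when the item is already cached.
            if (item := self._items.get(key)) is not None:
                return item

            # One lock per key; the weak dictionary drops it once no task holds it.
            if (lock := self._locks.get(key)) is None:
                lock = asyncio.Lock()
                self._locks[key] = lock

            async with lock:
                # Re-check: another task may have created the item while we waited.
                if (item := self._items.get(key)) is not None:
                    return item
                item = await factory()
                self._items[key] = item
                return item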
{crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlee
- Version: 1.0.3b6
+ Version: 1.2.2b24
  Summary: Crawlee for Python
  Project-URL: Apify Homepage, https://apify.com
  Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
@@ -223,15 +223,17 @@ Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
  Classifier: Topic :: Software Development :: Libraries
  Requires-Python: >=3.10
+ Requires-Dist: async-timeout>=5.0.1
  Requires-Dist: cachetools>=5.5.0
  Requires-Dist: colorama>=0.4.0
- Requires-Dist: impit>=0.6.1
+ Requires-Dist: impit>=0.8.0
  Requires-Dist: more-itertools>=10.2.0
  Requires-Dist: protego>=0.5.0
  Requires-Dist: psutil>=6.0.0
- Requires-Dist: pydantic-settings!=2.7.0,!=2.7.1,!=2.8.0,>=2.2.0
+ Requires-Dist: pydantic-settings>=2.12.0
  Requires-Dist: pydantic>=2.11.0
  Requires-Dist: pyee>=9.0.0
  Requires-Dist: tldextract>=5.1.0
@@ -263,6 +265,7 @@ Requires-Dist: opentelemetry-sdk>=1.34.1; extra == 'all'
  Requires-Dist: opentelemetry-semantic-conventions>=0.54; extra == 'all'
  Requires-Dist: parsel>=1.10.0; extra == 'all'
  Requires-Dist: playwright>=1.27.0; extra == 'all'
+ Requires-Dist: redis[hiredis]>=7.0.0; extra == 'all'
  Requires-Dist: rich>=13.9.0; extra == 'all'
  Requires-Dist: scikit-learn>=1.6.0; extra == 'all'
  Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'all'
@@ -296,6 +299,8 @@ Provides-Extra: playwright
  Requires-Dist: apify-fingerprint-datapoints>=0.0.2; extra == 'playwright'
  Requires-Dist: browserforge>=1.2.3; extra == 'playwright'
  Requires-Dist: playwright>=1.27.0; extra == 'playwright'
+ Provides-Extra: redis
+ Requires-Dist: redis[hiredis]>=7.0.0; extra == 'redis'
  Provides-Extra: sql-postgres
  Requires-Dist: asyncpg>=0.24.0; extra == 'sql-postgres'
  Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'sql-postgres'
@@ -319,19 +324,12 @@ Description-Content-Type: text/markdown
  <a href="https://trendshift.io/repositories/11169" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11169" alt="apify%2Fcrawlee-python | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
  </p>

- <p align=center>
-     <a href="https://badge.fury.io/py/crawlee" rel="nofollow">
-         <img src="https://badge.fury.io/py/crawlee.svg" alt="PyPI version" style="max-width: 100%;">
-     </a>
-     <a href="https://pypi.org/project/crawlee/" rel="nofollow">
-         <img src="https://img.shields.io/pypi/dm/crawlee" alt="PyPI - Downloads" style="max-width: 100%;">
-     </a>
-     <a href="https://pypi.org/project/crawlee/" rel="nofollow">
-         <img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI - Python Version" style="max-width: 100%;">
-     </a>
-     <a href="https://discord.gg/jyEM2PRvMU" rel="nofollow">
-         <img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on discord" style="max-width: 100%;">
-     </a>
+ <p align="center">
+     <a href="https://badge.fury.io/py/crawlee" rel="nofollow"><img src="https://badge.fury.io/py/crawlee.svg" alt="PyPI package version"></a>
+     <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/dm/crawlee" alt="PyPI package downloads"></a>
+     <a href="https://codecov.io/gh/apify/crawlee-python"><img src="https://codecov.io/gh/apify/crawlee-python/graph/badge.svg?token=cCju61iPQG" alt="Codecov report"></a>
+     <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI Python version"></a>
+     <a href="https://discord.gg/jyEM2PRvMU" rel="nofollow"><img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on Discord"></a>
  </p>

  Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**