crawlee 1.0.5b12__py3-none-any.whl → 1.0.5b14__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self
 
     async def __aexit__(
@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations
 
+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -84,8 +85,6 @@ class Statistics(Generic[TStatisticsState]):
         self._id = Statistics.__next_id
         Statistics.__next_id += 1
 
-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -111,6 +110,9 @@ class Statistics(Generic[TStatisticsState]):
         # Flag to indicate the context state.
         self._active = False
 
+        # Pre-existing runtime offset, which can be non-zero when restoring serialized state from the KVS.
+        self._runtime_offset = timedelta(seconds=0)
+
     def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
@@ -165,14 +167,17 @@ class Statistics(Generic[TStatisticsState]):
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')
 
-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-        self._after_initialize()
 
+        self._runtime_offset = self.state.crawler_runtime
+
+        # Start periodic logging and let it print the initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True
 
+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self
 
     async def __aexit__(
@@ -191,14 +196,16 @@ class Statistics(Generic[TStatisticsState]):
 
         if not self.state.crawler_last_started_at:
             raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
-        self.state.crawler_finished_at = datetime.now(timezone.utc)
-        self.state.crawler_runtime += self.state.crawler_finished_at - self.state.crawler_last_started_at
-
-        await self._state.teardown()
 
+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime.
         await self._periodic_logger.stop()
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
+        self.state.crawler_runtime = (
+            self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
+        )
 
         self._active = False
+        await self._state.teardown()
 
     @property
     def state(self) -> TStatisticsState:
@@ -255,10 +262,19 @@ class Statistics(Generic[TStatisticsState]):
 
         del self._requests_in_progress[request_id_or_key]
 
+    def _update_crawler_runtime(self) -> None:
+        current_run_duration = (
+            (datetime.now(timezone.utc) - self.state.crawler_last_started_at)
+            if self.state.crawler_last_started_at
+            else timedelta()
+        )
+        self.state.crawler_runtime = current_run_duration + self._runtime_offset
+
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self._instance_start is None:
-            raise RuntimeError('The Statistics object is not initialized')
+        if self._active:
+            # Only update state when active. If not, just report the last known runtime.
+            self._update_crawler_runtime()
 
         total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
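
Taken together, the statistics changes (including the removal of _after_initialize, whose resume logic moves into __aenter__ and _update_crawler_runtime) make crawler_runtime accumulate correctly across resumed runs: _runtime_offset captures the runtime restored from persisted state, and each reading taken while the crawler is active adds only the current run's duration on top of it. A minimal sketch of the same accounting, with made-up timings rather than code from the package:

from datetime import datetime, timedelta, timezone

# Hypothetical resumed run: 90 s of runtime was restored from persisted state.
runtime_offset = timedelta(seconds=90)        # __aenter__: self._runtime_offset = self.state.crawler_runtime
last_started_at = datetime.now(timezone.utc)  # __aenter__: state.crawler_last_started_at

# What _update_crawler_runtime() computes on each calculate() call while active:
crawler_runtime = runtime_offset + (datetime.now(timezone.utc) - last_started_at)

# After 30 s of crawling, crawler_runtime reports ~120 s in total, not 30 s.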
@@ -291,21 +307,6 @@ class Statistics(Generic[TStatisticsState]):
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())
 
-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
@@ -13,9 +13,13 @@ _install_import_hook(__name__)
 with _try_import(__name__, 'SqlStorageClient'):
     from ._sql import SqlStorageClient
 
+with _try_import(__name__, 'RedisStorageClient'):
+    from ._redis import RedisStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'RedisStorageClient',
     'SqlStorageClient',
     'StorageClient',
 ]
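
The new export makes the Redis-backed storage client selectable in the same way as the existing clients. A usage sketch under stated assumptions: the mixin below receives a redis.asyncio.Redis instance, but the exact RedisStorageClient constructor signature is not visible in this diff, so the redis keyword argument is a guess to be checked against the 1.0.5b14 API reference:

from redis.asyncio import Redis

from crawlee.storage_clients import RedisStorageClient

# Assumed constructor argument, inferred from the `redis: Redis` parameter in the mixin below.
storage_client = RedisStorageClient(redis=Redis.from_url('redis://localhost:6379'))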
@@ -0,0 +1,6 @@
+from ._dataset_client import RedisDatasetClient
+from ._key_value_store_client import RedisKeyValueStoreClient
+from ._request_queue_client import RedisRequestQueueClient
+from ._storage_client import RedisStorageClient
+
+__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']
@@ -0,0 +1,295 @@
+from __future__ import annotations
+
+import asyncio
+from contextlib import asynccontextmanager
+from datetime import datetime, timezone
+from logging import getLogger
+from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, overload
+
+from crawlee._utils.crypto import crypto_random_object_id
+
+from ._utils import await_redis_response, read_lua_script
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from redis.asyncio import Redis
+    from redis.asyncio.client import Pipeline
+    from redis.commands.core import AsyncScript
+    from typing_extensions import NotRequired, Self
+
+    from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata
+
+
+logger = getLogger(__name__)
+
+
+class MetadataUpdateParams(TypedDict, total=False):
+    """Parameters for updating metadata."""
+
+    update_accessed_at: NotRequired[bool]
+    update_modified_at: NotRequired[bool]
+
+
+class RedisClientMixin:
+    """Mixin class for Redis clients.
+
+    This mixin provides common Redis operations and basic methods for Redis storage clients.
+    """
+
+    _DEFAULT_NAME = 'default'
+    """Default storage name used in the key prefix when none is provided."""
+
+    _MAIN_KEY: ClassVar[str]
+    """Main Redis key prefix for this storage type."""
+
+    _CLIENT_TYPE: ClassVar[str]
+    """Human-readable client type for error messages."""
+
+    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:
+        self._storage_name = storage_name
+        self._storage_id = storage_id
+        self._redis = redis
+
+        self._scripts_loaded = False
+
+    @property
+    def redis(self) -> Redis:
+        """Return the Redis client instance."""
+        return self._redis
+
+    @property
+    def metadata_key(self) -> str:
+        """Return the Redis key for the metadata of this storage."""
+        return f'{self._MAIN_KEY}:{self._storage_name}:metadata'
+
+    @classmethod
+    async def _get_metadata_by_name(cls, name: str, redis: Redis, *, with_wait: bool = False) -> dict | None:
+        """Retrieve metadata by storage name.
+
+        Args:
+            name: The name of the storage.
+            redis: The Redis client instance.
+            with_wait: Whether to wait for the storage to be created if it doesn't exist.
+        """
+        if with_wait:
+            # Wait for the creation signal (max 30 seconds).
+            await await_redis_response(redis.blpop([f'{cls._MAIN_KEY}:{name}:created_signal'], timeout=30))
+            # Signal consumed, push it back for other waiters.
+            await await_redis_response(redis.lpush(f'{cls._MAIN_KEY}:{name}:created_signal', 1))
+
+        response = await await_redis_response(redis.json().get(f'{cls._MAIN_KEY}:{name}:metadata'))
+        data = response[0] if response is not None and isinstance(response, list) else response
+        if data is not None and not isinstance(data, dict):
+            raise TypeError('The metadata data was received in an incorrect format.')
+        return data
+
+    @classmethod
+    async def _get_metadata_name_by_id(cls, id: str, redis: Redis) -> str | None:
+        """Retrieve storage name by ID from the id_to_name index.
+
+        Args:
+            id: The ID of the storage.
+            redis: The Redis client instance.
+        """
+        name = await await_redis_response(redis.hget(f'{cls._MAIN_KEY}:id_to_name', id))
+        if isinstance(name, str) or name is None:
+            return name
+        if isinstance(name, bytes):
+            return name.decode('utf-8')
+        return None
+
+    @classmethod
+    async def _open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],
+        redis: Redis,
+        extra_metadata_fields: dict[str, Any],
+        instance_kwargs: dict[str, Any],
+    ) -> Self:
+        """Open or create a new Redis storage client.
+
+        Args:
+            id: The ID of the storage. If not provided, a random ID will be generated.
+            name: The name of the storage for named (global scope) storages.
+            alias: The alias of the storage for unnamed (run scope) storages.
+            redis: Redis client instance.
+            metadata_model: Pydantic model for metadata validation.
+            extra_metadata_fields: Storage-specific metadata fields.
+            instance_kwargs: Additional arguments for the client constructor.
+
+        Returns:
+            An instance for the opened or created storage client.
+        """
+        internal_name = name or alias or cls._DEFAULT_NAME
+        storage_id: str | None = None
+        # Determine whether the storage exists, by ID or by name.
+        if id:
+            storage_name = await cls._get_metadata_name_by_id(id=id, redis=redis)
+            storage_id = id
+            if storage_name is None:
+                raise ValueError(f'{cls._CLIENT_TYPE} with ID "{id}" does not exist.')
+        else:
+            metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis)
+            storage_name = internal_name if metadata_data is not None else None
+            storage_id = metadata_data['id'] if metadata_data is not None else None
+        # If both storage_name and storage_id are found, open the existing storage.
+        if storage_name and storage_id:
+            client = cls(storage_name=storage_name, storage_id=storage_id, redis=redis, **instance_kwargs)
+            async with client._get_pipeline() as pipe:
+                await client._update_metadata(pipe, update_accessed_at=True)
+        # Otherwise, create a new storage.
+        else:
+            now = datetime.now(timezone.utc)
+            metadata = metadata_model(
+                id=crypto_random_object_id(),
+                name=name,
+                created_at=now,
+                accessed_at=now,
+                modified_at=now,
+                **extra_metadata_fields,
+            )
+            client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis, **instance_kwargs)
+            created = await client._create_metadata_and_storage(internal_name, metadata.model_dump())
+            # Creation probably failed due to a race condition; try to open the existing storage by name.
+            if not created:
+                metadata_data = await cls._get_metadata_by_name(name=internal_name, redis=redis, with_wait=True)
+                client = cls(storage_name=internal_name, storage_id=metadata.id, redis=redis, **instance_kwargs)
+
+        # Ensure Lua scripts are loaded.
+        await client._ensure_scripts_loaded()
+        return client
+
+    async def _load_scripts(self) -> None:
+        """Load Lua scripts in Redis."""
+        return
+
+    async def _ensure_scripts_loaded(self) -> None:
+        """Ensure Lua scripts are loaded in Redis."""
+        if not self._scripts_loaded:
+            await self._load_scripts()
+            self._scripts_loaded = True
+
+    @asynccontextmanager
+    async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pipeline]:
+        """Create a new Redis pipeline."""
+        async with self._redis.pipeline() as pipe:
+            try:
+                pipe.multi()  # type: ignore[no-untyped-call]
+                yield pipe
+            finally:
+                if with_execute:
+                    await pipe.execute()
+
+    async def _create_storage(self, pipeline: Pipeline) -> None:
+        """Create the actual storage structure in Redis."""
+        _ = pipeline  # To avoid an unused-variable mypy error.
+
+    async def _create_script(self, script_name: str) -> AsyncScript:
+        """Load a Lua script from a file and return a Script object."""
+        script_content = await asyncio.to_thread(read_lua_script, script_name)
+
+        return self._redis.register_script(script_content)
+
+    async def _create_metadata_and_storage(self, storage_name: str, metadata: dict) -> bool:
+        index_id_to_name = f'{self._MAIN_KEY}:id_to_name'
+        index_name_to_id = f'{self._MAIN_KEY}:name_to_id'
+        metadata['created_at'] = metadata['created_at'].isoformat()
+        metadata['accessed_at'] = metadata['accessed_at'].isoformat()
+        metadata['modified_at'] = metadata['modified_at'].isoformat()
+
+        # Try to create the name_to_id index entry; if it already exists, return False.
+        name_to_id = await await_redis_response(self._redis.hsetnx(index_name_to_id, storage_name, metadata['id']))
+        # If the name already exists, return False: probably a parallel creation attempt.
+        if not name_to_id:
+            return False
+
+        # Create the id_to_name index entry, metadata, and storage structure in a transaction.
+        async with self._get_pipeline() as pipe:
+            await await_redis_response(pipe.hsetnx(index_id_to_name, metadata['id'], storage_name))
+            await await_redis_response(pipe.json().set(self.metadata_key, '$', metadata))
+            await await_redis_response(pipe.lpush(f'{self._MAIN_KEY}:{storage_name}:created_signal', 1))
+
+            await self._create_storage(pipe)
+
+        return True
+
+    async def _drop(self, extra_keys: list[str]) -> None:
+        async with self._get_pipeline() as pipe:
+            await pipe.delete(self.metadata_key)
+            await pipe.hdel(f'{self._MAIN_KEY}:id_to_name', self._storage_id)
+            await pipe.hdel(f'{self._MAIN_KEY}:name_to_id', self._storage_name)
+            await pipe.delete(f'{self._MAIN_KEY}:{self._storage_name}:created_signal')
+            for key in extra_keys:
+                await pipe.delete(key)
+
+    async def _purge(self, extra_keys: list[str], metadata_kwargs: MetadataUpdateParams) -> None:
+        async with self._get_pipeline() as pipe:
+            for key in extra_keys:
+                await pipe.delete(key)
+            await self._update_metadata(pipe, **metadata_kwargs)
+            await self._create_storage(pipe)
+
+    @overload
+    async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -> DatasetMetadata: ...
+    @overload
+    async def _get_metadata(self, metadata_model: type[KeyValueStoreMetadata]) -> KeyValueStoreMetadata: ...
+    @overload
+    async def _get_metadata(self, metadata_model: type[RequestQueueMetadata]) -> RequestQueueMetadata: ...
+
+    async def _get_metadata(
+        self, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata]
+    ) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata:
+        """Retrieve client metadata."""
+        metadata_dict = await self._get_metadata_by_name(name=self._storage_name, redis=self._redis)
+        if metadata_dict is None:
+            raise ValueError(f'{self._CLIENT_TYPE} with name "{self._storage_name}" does not exist.')
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(pipe, update_accessed_at=True)
+
+        return metadata_model.model_validate(metadata_dict)
+
+    async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) -> None:
+        """Queue storage-specific metadata updates on the pipeline.
+
+        Must be implemented by concrete classes.
+
+        Args:
+            pipeline: The Redis pipeline to use for the update.
+            **kwargs: Storage-specific update parameters.
+        """
+        _ = pipeline  # To avoid an unused-variable mypy error.
+        _ = kwargs
+
+    async def _update_metadata(
+        self,
+        pipeline: Pipeline,
+        *,
+        update_accessed_at: bool = False,
+        update_modified_at: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        """Update storage metadata combining common and specific fields.
+
+        Args:
+            pipeline: The Redis pipeline to use for the update.
+            update_accessed_at: Whether to update the accessed_at timestamp.
+            update_modified_at: Whether to update the modified_at timestamp.
+            **kwargs: Additional arguments for _specific_update_metadata.
+        """
+        now = datetime.now(timezone.utc)
+
+        if update_accessed_at:
+            await await_redis_response(
+                pipeline.json().set(self.metadata_key, '$.accessed_at', now.isoformat(), nx=False, xx=True)
+            )
+        if update_modified_at:
+            await await_redis_response(
+                pipeline.json().set(self.metadata_key, '$.modified_at', now.isoformat(), nx=False, xx=True)
+            )
+
+        await self._specific_update_metadata(pipeline, **kwargs)
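
The creation path in _create_metadata_and_storage is guarded against concurrent creators: HSETNX on the name_to_id index lets exactly one caller win the name, and the created_signal list lets the losers block in BLPOP until the winner has finished writing metadata; the consumed signal is pushed back so that later waiters are released as well. A standalone sketch of that handshake, using hypothetical key names rather than the mixin's real prefixes:

import asyncio
from redis.asyncio import Redis

async def create_or_wait(redis: Redis, name: str, new_id: str) -> None:
    # Exactly one concurrent caller gets 1 back from HSETNX and becomes the creator.
    if await redis.hsetnx('demo:name_to_id', name, new_id):
        ...  # write metadata and build the storage structures here
        await redis.lpush(f'demo:{name}:created_signal', 1)  # announce readiness
    else:
        # Losers wait (up to 30 seconds) for the creator's signal, then push it
        # back so that any other waiters are released too.
        await redis.blpop([f'demo:{name}:created_signal'], timeout=30)
        await redis.lpush(f'demo:{name}:created_signal', 1)

async def main() -> None:
    redis = Redis.from_url('redis://localhost:6379')
    # Two concurrent openers of the same name: one creates, the other waits.
    await asyncio.gather(
        create_or_wait(redis, 'my-storage', 'id-1'),
        create_or_wait(redis, 'my-storage', 'id-2'),
    )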