crawlee 1.0.2b3__py3-none-any.whl → 1.1.2b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +30 -17
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +17 -1
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +47 -11
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +33 -13
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +126 -112
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +55 -11
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/events/_event_manager.py +4 -4
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
- crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
- crawlee/storage_clients/_file_system/_request_queue_client.py +27 -9
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/_sql/_key_value_store_client.py +3 -2
- crawlee/storage_clients/_sql/_request_queue_client.py +18 -4
- crawlee/storage_clients/_sql/_storage_client.py +1 -1
- crawlee/storages/_key_value_store.py +5 -2
- {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +8 -3
- {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +66 -54
- {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
- {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.2b3.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
crawlee/request_loaders/_sitemap_request_loader.py
CHANGED

```diff
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override
 
-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader
 
 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType
 
+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -90,6 +91,11 @@ class SitemapRequestLoaderState(BaseModel):
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).
 
+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
 
@@ -107,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.
 
@@ -120,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
@@ -127,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function
 
         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -308,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):
 
             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
```
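The new hook can be exercised as follows — a minimal sketch based only on the signatures visible in this diff. `HttpxHttpClient` stands in for any of crawlee's HTTP clients, and the `user_data` field is assumed to be part of `RequestOptions`:

```python
from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop category pages entirely; the loader decrements its total count for skips.
    if '/category/' in options['url']:
        return 'skip'
    # Otherwise tag the request and hand back the modified options.
    options['user_data'] = {'source': 'sitemap'}
    return options


loader = SitemapRequestLoader(
    sitemap_urls=['https://example.com/sitemap.xml'],
    http_client=HttpxHttpClient(),
    transform_request_function=transform,
)
```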
crawlee/sessions/_session_pool.py
CHANGED

```diff
@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.
 
-        This is
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.
 
         Args:
```
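As the repaired docstring says, `add_session` exists solely for injecting sessions created outside the pool. A short usage sketch (the session id is hypothetical):

```python
import asyncio

from crawlee.sessions import Session, SessionPool


async def main() -> None:
    async with SessionPool() as pool:
        # Hand the pool a pre-configured session instead of letting it create one.
        pool.add_session(Session(id='preconfigured-session'))


asyncio.run(main())
```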
crawlee/statistics/_error_snapshotter.py
CHANGED

```diff
@@ -32,7 +32,7 @@ class ErrorSnapshotter:
         """Capture error snapshot and save it to key value store.
 
         It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
         returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
         an exception.
 
```
crawlee/statistics/_models.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Annotated, Any
@@ -76,7 +77,6 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
     errors: dict[str, Any] = Field(default_factory=dict)
     retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
     requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
@@ -93,6 +93,37 @@ class StatisticsState(BaseModel):
         ),
     ] = {}
 
+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
     @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
     @property
     def request_total_duration(self) -> timedelta:
```
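The net effect: `crawler_runtime` is no longer a stored field but is derived from `_runtime_offset` (runtime persisted by previous runs) plus the elapsed time of the current run. A small self-contained sketch of the arithmetic, with hypothetical values:

```python
from datetime import datetime, timedelta, timezone

# Hypothetical scenario: a previous run persisted 90 s of runtime, and the
# current (resumed) run has been active for roughly 30 s.
runtime_offset = timedelta(seconds=90)                              # restored from persisted state
last_started_at = datetime.now(timezone.utc) - timedelta(seconds=30)
finished_at = datetime.now(timezone.utc)                            # falls back to "now" while running

# Mirrors the crawler_runtime property above: offset plus the live run's elapsed time.
crawler_runtime = runtime_offset + finished_at - last_started_at
assert timedelta(seconds=119) < crawler_runtime < timedelta(seconds=121)
```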
crawlee/statistics/_statistics.py
CHANGED

```diff
@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations
 
+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
 from crawlee.statistics._error_tracker import ErrorTracker
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, Coroutine
    from types import TracebackType
 
+    from crawlee.storages import KeyValueStore
+
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
 logger = getLogger(__name__)
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool | Literal['explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -80,8 +85,6 @@ class Statistics(Generic[TStatisticsState]):
         self._id = Statistics.__next_id
         Statistics.__next_id += 1
 
-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -92,9 +95,10 @@ class Statistics(Generic[TStatisticsState]):
 
         self._state = RecoverableState(
             default_state=state_model(stats_id=self._id),
-            persist_state_key=persist_state_key or f'
+            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             logger=logger,
         )
 
@@ -110,8 +114,8 @@ class Statistics(Generic[TStatisticsState]):
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
             persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
-            persist_state_kvs_name=self._state._persist_state_kvs_name,  # noqa: SLF001
             persist_state_key=self._state._persist_state_key,  # noqa: SLF001
+            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
             log_message=self._log_message,
             periodic_message_logger=self._periodic_message_logger,
             state_model=state_model,
@@ -125,6 +129,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -136,6 +141,7 @@ class Statistics(Generic[TStatisticsState]):
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_key=persist_state_key,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             log_message=log_message,
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,
```
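The new `persist_state_kvs_factory` lets callers decide asynchronously which key-value store receives the persisted statistics. A usage sketch, assuming the classmethod extended above is `Statistics.with_default_state` (its name sits outside the hunk) and using a hypothetical store name:

```python
from crawlee.statistics import Statistics
from crawlee.storages import KeyValueStore


async def kvs_factory() -> KeyValueStore:
    # Hypothetical named store; any coroutine returning a KeyValueStore works.
    return await KeyValueStore.open(name='crawl-stats')


statistics = Statistics.with_default_state(
    persistence_enabled=True,
    persist_state_kvs_factory=kvs_factory,
)
```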
```diff
@@ -158,14 +164,17 @@
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')
 
-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None
 
+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True
 
+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self
 
     async def __aexit__(
@@ -182,13 +191,14 @@
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
-
-
-        await self._state.teardown()
+        if not self.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
 
+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
-
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
         self._active = False
+        await self._state.teardown()
 
     @property
     def state(self) -> TStatisticsState:
```
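With these changes the async context manager owns the full timing lifecycle: `__aenter__` stamps `crawler_last_started_at` (and clears `crawler_finished_at`), while `__aexit__` stamps `crawler_finished_at` before tearing the state down, freezing the runtime. A minimal sketch, again assuming `with_default_state` is the factory classmethod:

```python
import asyncio

from crawlee.statistics import Statistics


async def main() -> None:
    async with Statistics.with_default_state() as statistics:
        await asyncio.sleep(0.1)  # stand-in for actual crawling work
    # crawler_finished_at is now set, so the reported runtime no longer grows.
    print(statistics.calculate().crawler_runtime)


asyncio.run(main())
```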
```diff
@@ -247,11 +257,7 @@
 
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-
-        raise RuntimeError('The Statistics object is not initialized')
-
-        crawler_runtime = datetime.now(timezone.utc) - self._instance_start
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)
 
@@ -262,7 +268,7 @@
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],
@@ -282,21 +288,6 @@
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())
 
-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
```
crawlee/storage_clients/__init__.py
CHANGED

```diff
@@ -13,9 +13,13 @@ _install_import_hook(__name__)
 with _try_import(__name__, 'SqlStorageClient'):
     from ._sql import SqlStorageClient
 
+with _try_import(__name__, 'RedisStorageClient'):
+    from ._redis import RedisStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'RedisStorageClient',
     'SqlStorageClient',
     'StorageClient',
 ]
```
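`RedisStorageClient` is wired in through the same lazy `_try_import` mechanism as the SQL client, so a missing optional redis dependency surfaces as an informative error rather than breaking `crawlee.storage_clients` at import time. A sketch of registering it globally; the constructor lives in the new `_storage_client.py` (146 lines, not shown in this excerpt), so the `connection_string` parameter below is an assumption for illustration only:

```python
from crawlee import service_locator
from crawlee.storage_clients import RedisStorageClient

# Assumed constructor argument -- check _storage_client.py for the real signature.
storage_client = RedisStorageClient(connection_string='redis://localhost:6379')

# Registered this way, all storages (datasets, key-value stores, request queues)
# opened afterwards are backed by Redis.
service_locator.set_storage_client(storage_client)
```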
crawlee/storage_clients/_file_system/_dataset_client.py
CHANGED

```diff
@@ -134,7 +134,7 @@ class FileSystemDatasetClient(DatasetClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = DatasetMetadata(**file_content)
@@ -163,7 +163,7 @@ class FileSystemDatasetClient(DatasetClient):
 
         # If the dataset directory exists, reconstruct the client from the metadata file.
         if path_to_dataset.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
```
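All of these `open` call sites (here and in the key-value store and request queue clients below) gain an explicit `encoding='utf-8'`. Without it, Python falls back to `locale.getpreferredencoding(False)`, so metadata JSON written as UTF-8 can fail to decode on platforms with a legacy default (e.g. cp1252 on some Windows setups). The pattern, reduced to a standalone sketch:

```python
import asyncio
import json
from pathlib import Path


async def read_metadata(path: Path) -> dict:
    # Explicit encoding makes the read deterministic across platforms;
    # a bare `open(path)` default depends on the process locale.
    file = await asyncio.to_thread(open, path, 'r', encoding='utf-8')
    try:
        return await asyncio.to_thread(json.load, file)
    finally:
        await asyncio.to_thread(file.close)
```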
crawlee/storage_clients/_file_system/_key_value_store_client.py
CHANGED

```diff
@@ -133,7 +133,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = KeyValueStoreMetadata(**file_content)
@@ -162,7 +162,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         # If the key-value store directory exists, reconstruct the client from the metadata file.
         if path_to_kvs.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -239,7 +239,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         # Read the metadata file
         async with self._lock:
             try:
-                file = await asyncio.to_thread(open, record_metadata_filepath)
+                file = await asyncio.to_thread(open, record_metadata_filepath, 'r', encoding='utf-8')
             except FileNotFoundError:
                 logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                 return None
```
crawlee/storage_clients/_file_system/_request_queue_client.py
CHANGED

```diff
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
     from collections.abc import Sequence
 
     from crawlee.configuration import Configuration
+    from crawlee.storages import KeyValueStore
 
 logger = getLogger(__name__)
 
@@ -92,6 +93,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         metadata: RequestQueueMetadata,
         path_to_rq: Path,
         lock: asyncio.Lock,
+        recoverable_state: RecoverableState[RequestQueueState],
     ) -> None:
         """Initialize a new instance.
 
@@ -114,12 +116,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self._is_empty_cache: bool | None = None
         """Cache for is_empty result: None means unknown, True/False is cached state."""
 
-        self._state =
-            default_state=RequestQueueState(),
-            persist_state_key=f'__RQ_STATE_{self._metadata.id}',
-            persistence_enabled=True,
-            logger=logger,
-        )
+        self._state = recoverable_state
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""
 
     @override
@@ -136,6 +133,22 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """The full path to the request queue metadata file."""
         return self.path_to_rq / METADATA_FILENAME
 
+    @classmethod
+    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
+        async def kvs_factory() -> KeyValueStore:
+            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415 avoid circular import
+            from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+            return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
+
+        return RecoverableState[RequestQueueState](
+            default_state=RequestQueueState(),
+            persist_state_key=f'__RQ_STATE_{id}',
+            persist_state_kvs_factory=kvs_factory,
+            persistence_enabled=True,
+            logger=logger,
+        )
+
     @classmethod
     async def open(
         cls,
```
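Moving state construction into `_create_recoverable_state` replaces the `RecoverableState` hard-coded in `__init__` and routes persistence through an injected `kvs_factory`. The function-level imports defer `crawlee.storages` and `crawlee.storage_clients` until the factory actually runs, which is what breaks the circular import. The same pattern in isolation, using only the signatures visible in this diff (the `CounterState` model is hypothetical):

```python
from logging import getLogger

from pydantic import BaseModel

from crawlee._utils.recoverable_state import RecoverableState
from crawlee.storages import KeyValueStore


class CounterState(BaseModel):
    processed: int = 0


async def kvs_factory() -> KeyValueStore:
    # Deferred import: resolved only when the state is first initialized.
    from crawlee.storage_clients import MemoryStorageClient

    return await KeyValueStore.open(storage_client=MemoryStorageClient())


state = RecoverableState[CounterState](
    default_state=CounterState(),
    persist_state_key='__COUNTER_STATE',
    persist_state_kvs_factory=kvs_factory,
    persistence_enabled=True,
    logger=getLogger(__name__),
)
```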
```diff
@@ -184,7 +197,7 @@
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
@@ -194,6 +207,9 @@
                 metadata=metadata,
                 path_to_rq=rq_base_path / rq_dir,
                 lock=asyncio.Lock(),
+                recoverable_state=await cls._create_recoverable_state(
+                    id=id, configuration=configuration
+                ),
             )
             await client._state.initialize()
             await client._discover_existing_requests()
@@ -216,7 +232,7 @@
 
         # If the RQ directory exists, reconstruct the client from the metadata file.
         if path_to_rq.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -230,6 +246,7 @@
             metadata=metadata,
             path_to_rq=path_to_rq,
             lock=asyncio.Lock(),
+            recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
         )
 
         await client._state.initialize()
@@ -254,6 +271,7 @@
             metadata=metadata,
             path_to_rq=path_to_rq,
             lock=asyncio.Lock(),
+            recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
         )
         await client._state.initialize()
         await client._update_metadata()
@@ -757,7 +775,7 @@
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open, file_path)
+            file = await asyncio.to_thread(open, file_path, 'r', encoding='utf-8')
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
```
crawlee/storage_clients/_redis/__init__.py
ADDED

```diff
@@ -0,0 +1,6 @@
+from ._dataset_client import RedisDatasetClient
+from ._key_value_store_client import RedisKeyValueStoreClient
+from ._request_queue_client import RedisRequestQueueClient
+from ._storage_client import RedisStorageClient
+
+__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']
```