crawlee 1.0.5b18__py3-none-any.whl → 1.2.2b24__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recurring_task.py +12 -3
- crawlee/_utils/sitemap.py +12 -5
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/browsers/_browser_pool.py +1 -1
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +53 -17
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +20 -49
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +138 -124
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -22
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +1 -3
- crawlee/request_loaders/_sitemap_request_loader.py +18 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +2 -21
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +5 -4
- crawlee/storage_clients/_redis/_client_mixin.py +1 -4
- crawlee/storage_clients/_redis/_dataset_client.py +6 -2
- crawlee/storage_clients/_redis/_key_value_store_client.py +3 -5
- crawlee/storage_clients/_redis/_request_queue_client.py +5 -8
- crawlee/storage_clients/_redis/_storage_client.py +12 -9
- crawlee/storage_clients/_redis/_utils.py +1 -1
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_storage_client.py +0 -9
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +10 -16
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +63 -62
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/http_clients/_impit.py
CHANGED
@@ -6,7 +6,7 @@ from logging import getLogger
 from typing import TYPE_CHECKING, Any, TypedDict

 from cachetools import LRUCache
-from impit import AsyncClient, Browser, HTTPError, Response, TransportError
+from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
 from impit import ProxyError as ImpitProxyError
 from typing_extensions import override

@@ -125,6 +125,7 @@ class ImpitHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

@@ -134,7 +135,10 @@ class ImpitHttpClient(HttpClient):
                 method=request.method,
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -157,6 +161,7 @@ class ImpitHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -165,8 +170,14 @@ class ImpitHttpClient(HttpClient):

         try:
             response = await client.request(
-                method=method,
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -189,14 +200,18 @@ class ImpitHttpClient(HttpClient):
     ) -> AsyncGenerator[HttpResponse]:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

-
-
-
-
-
-
-
-
+        try:
+            response = await client.request(
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
+                stream=True,
+            )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
+
         try:
             yield _ImpitResponse(response)
         finally:
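The change above threads a per-request `timeout` through to impit and normalizes impit's `TimeoutException` into `asyncio.TimeoutError`. A minimal sketch of that mapping pattern, using a hypothetical `fetch` helper rather than the real `ImpitHttpClient` methods (the request kwargs shown are the ones used in the diff):

```python
import asyncio
from datetime import timedelta

from impit import AsyncClient, TimeoutException  # names taken from the diff above


async def fetch(url: str, timeout: timedelta | None = None) -> int:
    """Hypothetical helper mirroring the pattern added in _impit.py."""
    client = AsyncClient()
    try:
        response = await client.request(
            method='GET',
            url=url,
            # impit takes the timeout in seconds, crawlee passes a timedelta.
            timeout=timeout.total_seconds() if timeout else None,
        )
    except TimeoutException as exc:
        # Normalize the client-specific exception so callers only handle asyncio.TimeoutError.
        raise asyncio.TimeoutError from exc
    return response.status_code
```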
crawlee/otel/crawler_instrumentor.py
CHANGED
@@ -3,9 +3,7 @@ from __future__ import annotations
 import inspect
 from typing import TYPE_CHECKING, Any

-from opentelemetry.instrumentation.instrumentor import (
-    BaseInstrumentor,
-)
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
 from opentelemetry.instrumentation.utils import unwrap
 from opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME
 from opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD
crawlee/request_loaders/_sitemap_request_loader.py
CHANGED
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -112,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.

@@ -125,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
@@ -132,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function

         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -224,7 +230,7 @@ class SitemapRequestLoader(RequestLoader):
                 continue
             state.in_progress_sitemap_url = sitemap_url

-            parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True)
+            parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)

             async for item in parse_sitemap(
                 [SitemapSource(type='url', url=sitemap_url)],
@@ -313,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):

             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
crawlee/router.py
CHANGED
@@ -1,13 +1,17 @@
 from __future__ import annotations

+import asyncio
 from collections.abc import Awaitable, Callable
 from typing import Generic, TypeVar

+from crawlee._request import RequestState
 from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group

 __all__ = ['Router']

+from crawlee.errors import UserHandlerTimeoutError
+
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
 RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]

@@ -89,13 +93,19 @@ class Router(Generic[TCrawlingContext]):

     async def __call__(self, context: TCrawlingContext) -> None:
         """Invoke a request handler that matches the request label (or the default)."""
+        context.request.state = RequestState.REQUEST_HANDLER
         if context.request.label is None or context.request.label not in self._handlers_by_label:
             if self._default_handler is None:
                 raise RuntimeError(
                     f'No handler matches label `{context.request.label}` and no default handler is configured'
                 )

-
+            user_defined_handler = self._default_handler
+        else:
+            user_defined_handler = self._handlers_by_label[context.request.label]

-
-
+        try:
+            return await user_defined_handler(context)
+        except asyncio.TimeoutError as e:
+            # Timeout in handler, but not timeout of handler.
+            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
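With this router change, an `asyncio.TimeoutError` escaping a user handler is re-raised as `UserHandlerTimeoutError`, so the crawler can tell a timeout raised inside the handler apart from the handler itself exceeding its time budget. A small sketch of a handler where that distinction matters; `slow_call` is a made-up stand-in, and the context type and import path simply mirror the diff above:

```python
import asyncio

from crawlee._types import BasicCrawlingContext  # import path as used in the diff
from crawlee.router import Router

router = Router[BasicCrawlingContext]()


async def slow_call(context: BasicCrawlingContext) -> None:
    await asyncio.sleep(10)  # stand-in for slow user work


@router.default_handler
async def default_handler(context: BasicCrawlingContext) -> None:
    # If this inner timeout fires, the TimeoutError propagates out of the handler
    # and the router wraps it in UserHandlerTimeoutError (see the change above).
    async with asyncio.timeout(5):
        await slow_call(context)
```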
crawlee/sessions/_cookies.py
CHANGED
@@ -10,6 +10,7 @@ from crawlee._utils.docs import docs_group

 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from typing import TypeGuard


 @docs_group('Session management')
@@ -66,17 +67,18 @@ class SessionCookies:

         self._jar = CookieJar()

-        if isinstance(cookies,
-            for key, value in cookies.items():
-                self.set(key, value)
-
-        elif isinstance(cookies, list):
+        if isinstance(cookies, list):
             for item in cookies:
                 self.set(**item)

         elif isinstance(cookies, SessionCookies):
             for cookie in cookies.jar:
-                self.
+                self._jar.set_cookie(cookie)
+
+        elif isinstance(cookies, dict):
+            cookies_dict: dict[str, str] = cookies
+            for key, value in cookies_dict.items():
+                self.set(key, value)

     @property
     def jar(self) -> CookieJar:
@@ -151,8 +153,8 @@ class SessionCookies:
         if cookie.expires:
             cookie_dict['expires'] = cookie.expires

-        if (same_site := cookie.get_nonstandard_attr('SameSite')) and same_site
-            cookie_dict['same_site'] = same_site
+        if (same_site := cookie.get_nonstandard_attr('SameSite')) and self._is_valid_same_site(same_site):
+            cookie_dict['same_site'] = same_site

         return cookie_dict

@@ -273,3 +275,6 @@ class SessionCookies:
         """Return hash based on the cookies key attributes."""
         cookie_tuples = frozenset((cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in self._jar)
         return hash(cookie_tuples)
+
+    def _is_valid_same_site(self, value: str | None) -> TypeGuard[Literal['Lax', 'None', 'Strict']]:
+        return value in {'Lax', 'None', 'Strict'}
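The `TypeGuard` helper added above lets type checkers narrow the raw `SameSite` attribute to the literal values the cookie dict accepts. A standalone sketch of the same narrowing pattern, independent of the crawlee classes:

```python
from typing import Literal, TypeGuard

SameSite = Literal['Lax', 'None', 'Strict']


def is_valid_same_site(value: str | None) -> TypeGuard[SameSite]:
    # True only for the three allowed values; type checkers then treat
    # `value` as the SameSite literal inside the guarded branch.
    return value in {'Lax', 'None', 'Strict'}


raw: str | None = 'Lax'
if is_valid_same_site(raw):
    same_site: SameSite = raw  # no cast or type: ignore needed here
```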
crawlee/sessions/_models.py
CHANGED
@@ -63,19 +63,19 @@ class SessionPoolModel(BaseModel):
         ),
     ]

-    @computed_field(alias='sessionCount')
+    @computed_field(alias='sessionCount')
     @property
     def session_count(self) -> int:
         """Get the total number of sessions currently maintained in the pool."""
         return len(self.sessions)

-    @computed_field(alias='usableSessionCount')
+    @computed_field(alias='usableSessionCount')
     @property
     def usable_session_count(self) -> int:
         """Get the number of sessions that are currently usable."""
         return len([session for _, session in self.sessions.items() if session.is_usable])

-    @computed_field(alias='retiredSessionCount')
+    @computed_field(alias='retiredSessionCount')
     @property
     def retired_session_count(self) -> int:
         """Get the number of sessions that are no longer usable."""
crawlee/statistics/_models.py
CHANGED
@@ -1,9 +1,10 @@
 from __future__ import annotations

 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
-from typing import Annotated, Any
+from typing import TYPE_CHECKING, Annotated, Any

 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field
 from typing_extensions import override
@@ -76,10 +77,20 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-
-
-
-
+
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        errors: dict[str, Any] = {}
+        retry_errors: dict[str, Any] = {}
+        requests_with_status_code: dict[str, int] = {}
+    else:
+        errors: Annotated[dict[str, Any], Field(default_factory=dict)]
+        retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)]
+        requests_with_status_code: Annotated[
+            dict[str, int],
+            Field(alias='requestsWithStatusCode', default_factory=dict),
+        ]
+
     stats_persisted_at: Annotated[
         datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc))
     ] = None
@@ -93,22 +104,53 @@ class StatisticsState(BaseModel):
         ),
     ] = {}

-
+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)
     @property
     def request_total_duration(self) -> timedelta:
         return self.request_total_finished_duration + self.request_total_failed_duration

-    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
+    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_failed_duration(self) -> timedelta | None:
         return (self.request_total_failed_duration / self.requests_failed) if self.requests_failed else None

-    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
+    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_finished_duration(self) -> timedelta | None:
         return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None

-    @computed_field(alias='requestsTotal')
+    @computed_field(alias='requestsTotal')
     @property
     def requests_total(self) -> int:
         return self.requests_failed + self.requests_finished
crawlee/statistics/_statistics.py
CHANGED
@@ -110,9 +110,6 @@ class Statistics(Generic[TStatisticsState]):
         # Flag to indicate the context state.
         self._active = False

-        # Pre-existing runtime offset, that can be non-zero when restoring serialized state from KVS.
-        self._runtime_offset = timedelta(seconds=0)
-
     def replace_state_model(self, state_model: type[TNewStatisticsState]) -> Statistics[TNewStatisticsState]:
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
@@ -168,8 +165,8 @@ class Statistics(Generic[TStatisticsState]):
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')

         await self._state.initialize()
-
-        self.
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None

         # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
@@ -200,10 +197,6 @@ class Statistics(Generic[TStatisticsState]):
         # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
         self.state.crawler_finished_at = datetime.now(timezone.utc)
-        self.state.crawler_runtime = (
-            self._runtime_offset + self.state.crawler_finished_at - self.state.crawler_last_started_at
-        )
-
         self._active = False
         await self._state.teardown()

@@ -262,20 +255,8 @@ class Statistics(Generic[TStatisticsState]):

         del self._requests_in_progress[request_id_or_key]

-    def _update_crawler_runtime(self) -> None:
-        current_run_duration = (
-            (datetime.now(timezone.utc) - self.state.crawler_last_started_at)
-            if self.state.crawler_last_started_at
-            else timedelta()
-        )
-        self.state.crawler_runtime = current_run_duration + self._runtime_offset
-
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self._active:
-            # Only update state when active. If not, just report the last known runtime.
-            self._update_crawler_runtime()
-
         total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)
crawlee/storage_clients/_base/_dataset_client.py
CHANGED
@@ -87,8 +87,8 @@ class DatasetClient(ABC):

         The backend method for the `Dataset.iterate_items` call.
         """
-        # This syntax is to make
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:
+        if False:
             yield 0
crawlee/storage_clients/_base/_key_value_store_client.py
CHANGED
@@ -72,10 +72,10 @@ class KeyValueStoreClient(ABC):

         The backend method for the `KeyValueStore.iterate_keys` call.
         """
-        # This syntax is to make
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:
+        if False:
            yield 0

    @abstractmethod
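Both base-client comments refer to the mypy-documented trick for abstract async iterators: an unreachable `yield` after `raise NotImplementedError` turns the abstract method into an async generator, so concrete overrides type-check as `AsyncIterator` implementations. An illustrative reduction of the idea, not the actual crawlee base classes:

```python
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator


class ItemClient(ABC):
    @abstractmethod
    async def iterate_items(self) -> AsyncIterator[dict]:
        raise NotImplementedError
        # Unreachable, but it makes this method an async generator for type checkers.
        if False:
            yield {}


class InMemoryItemClient(ItemClient):
    async def iterate_items(self) -> AsyncIterator[dict]:
        for item in ({'a': 1}, {'b': 2}):
            yield item
```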
crawlee/storage_clients/_file_system/_dataset_client.py
CHANGED
@@ -134,7 +134,7 @@ class FileSystemDatasetClient(DatasetClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = DatasetMetadata(**file_content)
@@ -163,7 +163,7 @@ class FileSystemDatasetClient(DatasetClient):

         # If the dataset directory exists, reconstruct the client from the metadata file.
         if path_to_dataset.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -473,9 +473,10 @@ class FileSystemDatasetClient(DatasetClient):
         """
         # Retrieve and sort all JSON files in the dataset directory numerically.
         files = await asyncio.to_thread(
-            sorted
-
-
+            lambda: sorted(
+                self.path_to_dataset.glob('*.json'),
+                key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
+            )
         )

         # Remove the metadata file from the list if present.
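The file-system clients now wrap whole expressions in a lambda before handing them to `asyncio.to_thread`, so both the directory scan and the sort run in the worker thread rather than on the event loop. A standalone sketch of that pattern; the directory path is arbitrary:

```python
import asyncio
from pathlib import Path


async def list_dataset_files(directory: Path) -> list[Path]:
    # The lambda defers the glob and the sort, so both execute off the event loop.
    return await asyncio.to_thread(
        lambda: sorted(
            directory.glob('*.json'),
            key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
        )
    )


if __name__ == '__main__':
    print(asyncio.run(list_dataset_files(Path('.'))))
```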
crawlee/storage_clients/_file_system/_key_value_store_client.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import asyncio
+import functools
 import json
 import shutil
 import urllib.parse
@@ -133,7 +134,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = KeyValueStoreMetadata(**file_content)
@@ -162,7 +163,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):

         # If the key-value store directory exists, reconstruct the client from the metadata file.
         if path_to_kvs.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -239,7 +240,9 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         # Read the metadata file
         async with self._lock:
             try:
-                file = await asyncio.to_thread(
+                file = await asyncio.to_thread(
+                    functools.partial(record_metadata_filepath.open, mode='r', encoding='utf-8'),
+                )
             except FileNotFoundError:
                 logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                 return None
@@ -373,7 +376,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):

         # List and sort all files *inside* a brief lock, then release it immediately:
         async with self._lock:
-            files = sorted(await asyncio.to_thread(list
+            files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*'))))

         count = 0

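Where keyword arguments such as `mode` and `encoding` are needed, the key-value store and request-queue clients bind them to the `open` call with `functools.partial` before passing the callable to `asyncio.to_thread`. A minimal sketch of that pattern with an illustrative path and helper:

```python
import asyncio
import functools
import json
from pathlib import Path


async def read_metadata(path: Path) -> dict:
    # functools.partial bundles the keyword arguments with path.open; to_thread
    # then runs the blocking open() call in a worker thread.
    file = await asyncio.to_thread(functools.partial(path.open, mode='r', encoding='utf-8'))
    try:
        return json.load(file)
    finally:
        file.close()
```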
crawlee/storage_clients/_file_system/_request_queue_client.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import asyncio
+import functools
 import json
 import shutil
 from collections import deque
@@ -197,7 +198,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
@@ -232,7 +233,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):

         # If the RQ directory exists, reconstruct the client from the metadata file.
         if path_to_rq.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(path_to_metadata.open, encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -756,7 +757,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True)

         # List all the json files.
-        files = await asyncio.to_thread(list
+        files = await asyncio.to_thread(lambda: list(path_to_rq.glob('*.json')))

         # Filter out metadata file and non-file entries.
         filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
@@ -775,7 +776,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(functools.partial(file_path.open, mode='r', encoding='utf-8'))
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
crawlee/storage_clients/_redis/_client_mixin.py
CHANGED
@@ -179,7 +179,7 @@ class RedisClientMixin:
         """Create a new Redis pipeline."""
         async with self._redis.pipeline() as pipe:
             try:
-                pipe.multi()
+                pipe.multi()
                 yield pipe
             finally:
                 if with_execute:
@@ -187,7 +187,6 @@ class RedisClientMixin:

     async def _create_storage(self, pipeline: Pipeline) -> None:
         """Create the actual storage structure in Redis."""
-        _ = pipeline  # To avoid unused variable mypy error

     async def _create_script(self, script_name: str) -> AsyncScript:
         """Load a Lua script from a file and return a Script object."""
@@ -262,8 +261,6 @@ class RedisClientMixin:
             pipeline: The Redis pipeline to use for the update.
             **kwargs: Storage-specific update parameters.
         """
-        _ = pipeline  # To avoid unused variable mypy error
-        _ = kwargs

     async def _update_metadata(
         self,
crawlee/storage_clients/_redis/_dataset_client.py
CHANGED
@@ -179,13 +179,15 @@ class RedisDatasetClient(DatasetClient, RedisClientMixin):
             case (True, int(), None):
                 json_path += f'[:-{offset}]'
             case (True, int(), int()):
-
+                # ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887.
+                json_path += f'[-{offset + limit}:-{offset}]'  # ty: ignore[unsupported-operator]
             case (False, 0, int()):
                 json_path += f'[:{limit}]'
             case (False, int(), None):
                 json_path += f'[{offset}:]'
             case (False, int(), int()):
-
+                # ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887.
+                json_path += f'[{offset}:{offset + limit}]'  # ty: ignore[unsupported-operator]

         if json_path == '$':
             json_path = '$[*]'
@@ -195,6 +197,8 @@ class RedisDatasetClient(DatasetClient, RedisClientMixin):
         if data is None:
             data = []

+        data = [item for item in data if isinstance(item, dict)]
+
         if skip_empty:
             data = [item for item in data if item]

crawlee/storage_clients/_redis/_key_value_store_client.py
CHANGED
@@ -144,7 +144,7 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):

         async with self._get_pipeline() as pipe:
             # redis-py typing issue
-            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  #
+            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  # ty: ignore[invalid-argument-type]

             await await_redis_response(
                 pipe.hset(
@@ -174,9 +174,7 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):

         # Query the record by key
         # redis-py typing issue
-        value_bytes: bytes | None = await await_redis_response(
-            self._redis.hget(self._items_key, key)  # type: ignore[arg-type]
-        )
+        value_bytes: bytes | None = await await_redis_response(self._redis.hget(self._items_key, key))  # ty: ignore[invalid-assignment]

         if value_bytes is None:
             logger.warning(f'Value for key "{key}" is missing.')
@@ -225,7 +223,7 @@ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):
             raise TypeError('The items data was received in an incorrect format.')

         # Get all keys, sorted alphabetically
-        keys = sorted(items_data.keys())
+        keys = sorted(items_data.keys())  # ty: ignore[invalid-argument-type]

         # Apply exclusive_start_key filter if provided
         if exclusive_start_key is not None: