crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +156 -131
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_key_value_store.py +5 -2
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/http_clients/_httpx.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, cast
@@ -146,6 +147,7 @@ class HttpxHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)
         headers = self._combine_headers(request.headers)
@@ -157,10 +159,13 @@ class HttpxHttpClient(HttpClient):
             content=request.payload,
             cookies=session.cookies.jar if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
+            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -185,6 +190,7 @@ class HttpxHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         client = self._get_client(proxy_info.url if proxy_info else None)

@@ -195,10 +201,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
+            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -228,10 +237,13 @@ class HttpxHttpClient(HttpClient):
             headers=headers,
             payload=payload,
             session=session,
-            timeout=timeout,
+            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
         )

-
+        try:
+            response = await client.send(http_request, stream=True)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc

         try:
             yield _HttpxResponse(response)
@@ -246,7 +258,7 @@ class HttpxHttpClient(HttpClient):
         headers: HttpHeaders | dict[str, str] | None,
         payload: HttpPayload | None,
         session: Session | None = None,
-        timeout:
+        timeout: httpx.Timeout | None = None,
     ) -> httpx.Request:
         """Build an `httpx.Request` using the provided parameters."""
         if isinstance(headers, dict) or headers is None:
@@ -254,15 +266,13 @@ class HttpxHttpClient(HttpClient):

         headers = self._combine_headers(headers)

-        httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None
-
         return client.build_request(
             url=url,
             method=method,
             headers=dict(headers) if headers else None,
             content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-            timeout=
+            timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
         )

     def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
crawlee/http_clients/_impit.py
CHANGED
@@ -6,7 +6,7 @@ from logging import getLogger
 from typing import TYPE_CHECKING, Any, TypedDict

 from cachetools import LRUCache
-from impit import AsyncClient, Browser, HTTPError, Response, TransportError
+from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
 from impit import ProxyError as ImpitProxyError
 from typing_extensions import override

@@ -125,6 +125,7 @@ class ImpitHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

@@ -134,7 +135,10 @@ class ImpitHttpClient(HttpClient):
                 method=request.method,
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -157,6 +161,7 @@ class ImpitHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})
@@ -165,8 +170,14 @@ class ImpitHttpClient(HttpClient):

         try:
             response = await client.request(
-                method=method,
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
@@ -189,14 +200,18 @@ class ImpitHttpClient(HttpClient):
     ) -> AsyncGenerator[HttpResponse]:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

-
-
-
-
-
-
-
-
+        try:
+            response = await client.request(
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
+                stream=True,
+            )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
+
         try:
             yield _ImpitResponse(response)
         finally:
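The impit client gets the same normalization. Below is a rough standalone sketch that assumes only the API surface visible in the diff above (`AsyncClient`, `TimeoutException`, a per-request `timeout` in seconds); the `fetch` helper is illustrative and not part of crawlee.

import asyncio
from datetime import timedelta

from impit import AsyncClient, TimeoutException


async def fetch(url: str, timeout: timedelta | None = None):
    client = AsyncClient()
    try:
        # The timeout is passed in seconds, mirroring the calls in the diff above.
        return await client.request(
            method='GET',
            url=url,
            timeout=timeout.total_seconds() if timeout else None,
        )
    except TimeoutException as exc:
        # Normalize to the standard-library timeout type, as ImpitHttpClient does.
        raise asyncio.TimeoutError from exc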
crawlee/otel/crawler_instrumentor.py
CHANGED
@@ -3,9 +3,7 @@ from __future__ import annotations
 import inspect
 from typing import TYPE_CHECKING, Any

-from opentelemetry.instrumentation.instrumentor import
-    BaseInstrumentor,
-)
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
 from opentelemetry.instrumentation.utils import unwrap
 from opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME
 from opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD
@@ -69,7 +67,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

         if request_handling_instrumentation:

-            async def
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined]  # valid in our context
                     attributes={
@@ -111,8 +109,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
         # Handpicked interesting methods to instrument
         self._instrumented.extend(
             [
-                (_Middleware, 'action',
-                (_Middleware, 'cleanup',
+                (_Middleware, 'action', middleware_wrapper),
+                (_Middleware, 'cleanup', middleware_wrapper),
                 (ContextPipeline, '__call__', context_pipeline_wrapper),
                 (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                 (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
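For context, the instrumentor is typically activated once at startup. This is a hedged usage sketch only: the `request_handling_instrumentation` flag appears in the diff above, but passing it as a constructor keyword is an assumption; consult the crawlee OpenTelemetry guide for the supported options.

from crawlee.otel.crawler_instrumentor import CrawlerInstrumentor

# `instrument()` is inherited from OpenTelemetry's BaseInstrumentor.
# The constructor keyword below is assumed from the diff, not confirmed.
CrawlerInstrumentor(request_handling_instrumentation=True).instrument()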
crawlee/request_loaders/_sitemap_request_loader.py
CHANGED
@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -90,6 +91,11 @@ class SitemapRequestLoaderState(BaseModel):
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).

+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

@@ -107,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.

@@ -120,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
@@ -127,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function

         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -219,7 +230,7 @@ class SitemapRequestLoader(RequestLoader):
                     continue
                 state.in_progress_sitemap_url = sitemap_url

-            parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True)
+            parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3)

             async for item in parse_sitemap(
                 [SitemapSource(type='url', url=sitemap_url)],
@@ -308,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):

             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
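As a usage illustration of the new `transform_request_function` parameter, here is a hedged sketch based on the signature and the 'skip'/'unchanged' handling shown in the diff. The URL-filtering logic, the label value, and the choice of `ImpitHttpClient` are illustrative, not prescribed by crawlee.

import asyncio

from crawlee import RequestOptions, RequestTransformAction
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Drop URLs outside the product section and label the rest (example logic only).
    if '/product/' not in options['url']:
        return 'skip'
    options['label'] = 'PRODUCT'
    return options


async def main() -> None:
    loader = SitemapRequestLoader(
        sitemap_urls=['https://example.com/sitemap.xml'],
        http_client=ImpitHttpClient(),
        transform_request_function=transform,
    )
    # The loader can then feed a crawler, e.g. via a RequestManagerTandem.
    ...


asyncio.run(main())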
crawlee/router.py
CHANGED
@@ -1,13 +1,17 @@
 from __future__ import annotations

+import asyncio
 from collections.abc import Awaitable, Callable
 from typing import Generic, TypeVar

+from crawlee._request import RequestState
 from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group

 __all__ = ['Router']

+from crawlee.errors import UserHandlerTimeoutError
+
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
 RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]

@@ -89,13 +93,19 @@ class Router(Generic[TCrawlingContext]):

     async def __call__(self, context: TCrawlingContext) -> None:
         """Invoke a request handler that matches the request label (or the default)."""
+        context.request.state = RequestState.REQUEST_HANDLER
         if context.request.label is None or context.request.label not in self._handlers_by_label:
             if self._default_handler is None:
                 raise RuntimeError(
                     f'No handler matches label `{context.request.label}` and no default handler is configured'
                 )

-
+            user_defined_handler = self._default_handler
+        else:
+            user_defined_handler = self._handlers_by_label[context.request.label]

-
-
+        try:
+            return await user_defined_handler(context)
+        except asyncio.TimeoutError as e:
+            # Timeout in handler, but not timeout of handler.
+            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
crawlee/sessions/_cookies.py
CHANGED
@@ -10,6 +10,7 @@ from crawlee._utils.docs import docs_group

 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from typing import TypeGuard


 @docs_group('Session management')
@@ -66,17 +67,18 @@ class SessionCookies:

         self._jar = CookieJar()

-        if isinstance(cookies,
-            for key, value in cookies.items():
-                self.set(key, value)
-
-        elif isinstance(cookies, list):
+        if isinstance(cookies, list):
             for item in cookies:
                 self.set(**item)

         elif isinstance(cookies, SessionCookies):
             for cookie in cookies.jar:
-                self.
+                self._jar.set_cookie(cookie)
+
+        elif isinstance(cookies, dict):
+            cookies_dict: dict[str, str] = cookies
+            for key, value in cookies_dict.items():
+                self.set(key, value)

     @property
     def jar(self) -> CookieJar:
@@ -151,8 +153,8 @@ class SessionCookies:
         if cookie.expires:
             cookie_dict['expires'] = cookie.expires

-        if (same_site := cookie.get_nonstandard_attr('SameSite')) and same_site
-            cookie_dict['same_site'] = same_site
+        if (same_site := cookie.get_nonstandard_attr('SameSite')) and self._is_valid_same_site(same_site):
+            cookie_dict['same_site'] = same_site

         return cookie_dict

@@ -273,3 +275,6 @@ class SessionCookies:
         """Return hash based on the cookies key attributes."""
         cookie_tuples = frozenset((cookie.name, cookie.value, cookie.domain, cookie.path) for cookie in self._jar)
         return hash(cookie_tuples)
+
+    def _is_valid_same_site(self, value: str | None) -> TypeGuard[Literal['Lax', 'None', 'Strict']]:
+        return value in {'Lax', 'None', 'Strict'}
crawlee/sessions/_models.py
CHANGED
@@ -63,19 +63,19 @@ class SessionPoolModel(BaseModel):
         ),
     ]

-    @computed_field(alias='sessionCount')
+    @computed_field(alias='sessionCount')
     @property
     def session_count(self) -> int:
         """Get the total number of sessions currently maintained in the pool."""
         return len(self.sessions)

-    @computed_field(alias='usableSessionCount')
+    @computed_field(alias='usableSessionCount')
     @property
     def usable_session_count(self) -> int:
         """Get the number of sessions that are currently usable."""
         return len([session for _, session in self.sessions.items() if session.is_usable])

-    @computed_field(alias='retiredSessionCount')
+    @computed_field(alias='retiredSessionCount')
     @property
     def retired_session_count(self) -> int:
         """Get the number of sessions that are no longer usable."""
crawlee/sessions/_session_pool.py
CHANGED
@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.

-        This is
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.

         Args:
crawlee/statistics/_error_snapshotter.py
CHANGED
@@ -32,7 +32,7 @@ class ErrorSnapshotter:
         """Capture error snapshot and save it to key value store.

         It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
         returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
         an exception.

crawlee/statistics/_models.py
CHANGED
@@ -1,9 +1,10 @@
 from __future__ import annotations

 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
-from typing import Annotated, Any
+from typing import TYPE_CHECKING, Annotated, Any

 from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field
 from typing_extensions import override
@@ -76,10 +77,20 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-
-
-
-
+
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        errors: dict[str, Any] = {}
+        retry_errors: dict[str, Any] = {}
+        requests_with_status_code: dict[str, int] = {}
+    else:
+        errors: Annotated[dict[str, Any], Field(default_factory=dict)]
+        retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)]
+        requests_with_status_code: Annotated[
+            dict[str, int],
+            Field(alias='requestsWithStatusCode', default_factory=dict),
+        ]
+
     stats_persisted_at: Annotated[
         datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc))
     ] = None
@@ -93,22 +104,53 @@ class StatisticsState(BaseModel):
         ),
     ] = {}

-
+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)
     @property
     def request_total_duration(self) -> timedelta:
         return self.request_total_finished_duration + self.request_total_failed_duration

-    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
+    @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_failed_duration(self) -> timedelta | None:
         return (self.request_total_failed_duration / self.requests_failed) if self.requests_failed else None

-    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
+    @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None)
     @property
     def request_avg_finished_duration(self) -> timedelta | None:
         return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None

-    @computed_field(alias='requestsTotal')
+    @computed_field(alias='requestsTotal')
     @property
     def requests_total(self) -> int:
         return self.requests_failed + self.requests_finished