crawlee 1.1.1b1__py3-none-any.whl → 1.1.2b4__py3-none-any.whl
This diff shows the changes between publicly released versions of the crawlee package as they appear in their respective public registries. It is provided for informational purposes only.
- crawlee/_types.py +20 -1
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/time.py +41 -1
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +46 -12
- crawlee/crawlers/_basic/_basic_crawler.py +107 -101
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +40 -10
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
- crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
- crawlee/storage_clients/_file_system/_request_queue_client.py +3 -3
- crawlee/storage_clients/_sql/_storage_client.py +0 -9
- {crawlee-1.1.1b1.dist-info → crawlee-1.1.2b4.dist-info}/METADATA +4 -3
- {crawlee-1.1.1b1.dist-info → crawlee-1.1.2b4.dist-info}/RECORD +24 -24
- {crawlee-1.1.1b1.dist-info → crawlee-1.1.2b4.dist-info}/WHEEL +1 -1
- {crawlee-1.1.1b1.dist-info → crawlee-1.1.2b4.dist-info}/entry_points.txt +0 -0
- {crawlee-1.1.1b1.dist-info → crawlee-1.1.2b4.dist-info}/licenses/LICENSE +0 -0

crawlee/crawlers/_playwright/_playwright_crawler.py
CHANGED

@@ -3,19 +3,25 @@ from __future__ import annotations
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal

+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
-from crawlee._types import
+from crawlee._types import (
+    BasicCrawlingContext,
+    ConcurrencySettings,
+)
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline

@@ -44,7 +50,6 @@ if TYPE_CHECKING:

     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,

@@ -106,6 +111,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.

@@ -134,12 +140,16 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
         if configuration is not None:
             service_locator.set_configuration(configuration)

+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(

@@ -202,6 +212,8 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+
         super().__init__(**kwargs)

     async def _open_page(

@@ -228,10 +240,18 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             block_requests=partial(block_requests, page=crawlee_page.page),
         )

-
-
-
-
+        context_id = id(pre_navigation_context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            async with browser_page_context(crawlee_page.page):
+                for hook in self._pre_navigation_hooks:
+                    async with self._shared_navigation_timeouts[context_id]:
+                        await hook(pre_navigation_context)
+
+            yield pre_navigation_context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     def _prepare_request_interceptor(
         self,

@@ -266,6 +286,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.

         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,

@@ -297,7 +318,13 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         # Set route_handler only for current request
         await context.page.route(context.request.url, route_handler)

-
+        try:
+            async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                response = await context.page.goto(
+                    context.request.url, timeout=remaining_timeout.total_seconds() * 1000
+                )
+        except playwright.async_api.TimeoutError as exc:
+            raise asyncio.TimeoutError from exc

         if response is None:
             raise SessionError(f'Failed to load the URL: {context.request.url}')

@@ -369,9 +396,12 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             links_iterator: Iterator[str] = iter(
                 [url for element in elements if (url := await element.get_attribute('href')) is not None]
             )
-
-
-            )
+
+            # Get base URL from <base> tag if present
+            extracted_base_url = await context.page.evaluate('document.baseURI')
+            base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
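A minimal usage sketch of the new `navigation_timeout` option (not part of the diff): it relies only on the signature and docstring in the hunks above plus the standard crawlee entry points (`PlaywrightCrawler`, `router.default_handler`, `run`); the target URL is illustrative.

import asyncio
from datetime import timedelta

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # Cap pre-navigation hooks plus page.goto() at 30 seconds per request;
    # per the hunks above, omitting the argument falls back to timedelta(minutes=1).
    crawler = PlaywrightCrawler(navigation_timeout=timedelta(seconds=30))

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Visited {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())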
crawlee/crawlers/_playwright/_playwright_http_client.py
CHANGED

@@ -59,6 +59,7 @@ class PlaywrightHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')


@@ -72,6 +73,7 @@ class PlaywrightHttpClient(HttpClient):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved

@@ -87,7 +89,11 @@ class PlaywrightHttpClient(HttpClient):

         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url,
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )

         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
crawlee/http_clients/_base.py
CHANGED

@@ -104,6 +104,7 @@ class HttpClient(ABC):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         """Perform the crawling for a given request.


@@ -114,6 +115,7 @@ class HttpClient(ABC):
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
             statistics: The statistics object to register status codes.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.

@@ -132,6 +134,7 @@ class HttpClient(ABC):
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         """Send an HTTP request via the client.


@@ -144,6 +147,7 @@ class HttpClient(ABC):
             payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
+            timeout: Maximum time allowed to process the request.

         Raises:
             ProxyError: Raised if a proxy-related error occurs.
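A hedged caller-side sketch of the widened `HttpClient` interface: a single request can now be bounded by a `timedelta`, and the concrete client diffs below convert backend-specific timeout errors into `asyncio.TimeoutError`. `HttpxHttpClient`, `send_request` and `status_code` are the existing crawlee names, not additions of this release.

import asyncio
from datetime import timedelta

from crawlee.http_clients import HttpxHttpClient


async def fetch_with_budget(url: str) -> None:
    client = HttpxHttpClient()
    try:
        # `timeout` is the keyword-only parameter added in this release.
        response = await client.send_request(url, timeout=timedelta(seconds=10))
    except asyncio.TimeoutError:
        print(f'{url} exceeded the 10 s budget')
    else:
        print(f'{url} -> {response.status_code}')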
crawlee/http_clients/_curl_impersonate.py
CHANGED

@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from typing import TYPE_CHECKING, Any


@@ -10,6 +11,7 @@ from curl_cffi.requests.cookies import Cookies as CurlCookies
 from curl_cffi.requests.cookies import CurlMorsel
 from curl_cffi.requests.exceptions import ProxyError as CurlProxyError
 from curl_cffi.requests.exceptions import RequestException as CurlRequestError
+from curl_cffi.requests.exceptions import Timeout
 from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME
 from typing_extensions import override


@@ -147,6 +149,7 @@ class CurlImpersonateHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)


@@ -157,7 +160,10 @@
                 headers=request.headers,
                 data=request.payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc

@@ -186,6 +192,7 @@
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})

@@ -200,7 +207,10 @@
                 headers=dict(headers) if headers else None,
                 data=payload,
                 cookies=session.cookies.jar if session else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc

@@ -241,6 +251,8 @@
                 stream=True,
                 timeout=timeout.total_seconds() if timeout else None,
             )
+        except Timeout as exc:
+            raise asyncio.TimeoutError from exc
         except CurlRequestError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc
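The exception-normalization pattern the three hunks above repeat, shown in isolation (a sketch, not library code): `curl_cffi`'s `Timeout` is re-raised as `asyncio.TimeoutError`, presumably so the calling crawler can handle timeouts uniformly across HTTP backends.

import asyncio

from curl_cffi.requests.exceptions import Timeout


async def await_with_normalized_timeout(awaitable):
    # Re-raise the backend-specific timeout as the standard asyncio one,
    # keeping the original exception chained as the cause.
    try:
        return await awaitable
    except Timeout as exc:
        raise asyncio.TimeoutError from exc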
crawlee/http_clients/_httpx.py
CHANGED

@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from contextlib import asynccontextmanager
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, cast

@@ -146,6 +147,7 @@ class HttpxHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None)
         headers = self._combine_headers(request.headers)

@@ -157,10 +159,13 @@
             content=request.payload,
             cookies=session.cookies.jar if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
+            timeout=timeout.total_seconds() if timeout is not None else httpx.USE_CLIENT_DEFAULT,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc

@@ -185,6 +190,7 @@
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         client = self._get_client(proxy_info.url if proxy_info else None)


@@ -195,10 +201,13 @@
             headers=headers,
             payload=payload,
             session=session,
+            timeout=httpx.Timeout(timeout.total_seconds()) if timeout is not None else None,
         )

         try:
             response = await client.send(http_request)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except httpx.TransportError as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc

@@ -228,10 +237,13 @@
             headers=headers,
             payload=payload,
             session=session,
-            timeout=timeout,
+            timeout=httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None,
         )

-
+        try:
+            response = await client.send(http_request, stream=True)
+        except httpx.TimeoutException as exc:
+            raise asyncio.TimeoutError from exc

         try:
             yield _HttpxResponse(response)

@@ -246,7 +258,7 @@
         headers: HttpHeaders | dict[str, str] | None,
         payload: HttpPayload | None,
         session: Session | None = None,
-        timeout:
+        timeout: httpx.Timeout | None = None,
     ) -> httpx.Request:
         """Build an `httpx.Request` using the provided parameters."""
         if isinstance(headers, dict) or headers is None:

@@ -254,15 +266,13 @@

         headers = self._combine_headers(headers)

-        httpx_timeout = httpx.Timeout(None, connect=timeout.total_seconds()) if timeout else None
-
         return client.build_request(
             url=url,
             method=method,
             headers=dict(headers) if headers else None,
             content=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
-            timeout=
+            timeout=timeout if timeout else httpx.USE_CLIENT_DEFAULT,
         )

     def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
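A note on the two `httpx.Timeout` shapes used above (plain httpx API, shown here only for illustration): the non-streaming paths cap the whole exchange, while the streaming path caps only connection establishment so that long streamed reads are not cut off mid-body.

import httpx

whole_request = httpx.Timeout(10.0)               # connect, read, write and pool each capped at 10 s
connect_only = httpx.Timeout(None, connect=10.0)  # only the connect phase is capped; reads may run longer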
crawlee/http_clients/_impit.py
CHANGED

@@ -6,7 +6,7 @@ from logging import getLogger
 from typing import TYPE_CHECKING, Any, TypedDict

 from cachetools import LRUCache
-from impit import AsyncClient, Browser, HTTPError, Response, TransportError
+from impit import AsyncClient, Browser, HTTPError, Response, TimeoutException, TransportError
 from impit import ProxyError as ImpitProxyError
 from typing_extensions import override


@@ -125,6 +125,7 @@ class ImpitHttpClient(HttpClient):
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)


@@ -134,7 +135,10 @@
                 method=request.method,
                 content=request.payload,
                 headers=dict(request.headers) if request.headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc

@@ -157,6 +161,7 @@
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         if isinstance(headers, dict) or headers is None:
             headers = HttpHeaders(headers or {})

@@ -165,8 +170,14 @@

         try:
             response = await client.request(
-                method=method,
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
             )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
         except (TransportError, HTTPError) as exc:
             if self._is_proxy_error(exc):
                 raise ProxyError from exc

@@ -189,14 +200,18 @@
     ) -> AsyncGenerator[HttpResponse]:
         client = self._get_client(proxy_info.url if proxy_info else None, session.cookies.jar if session else None)

-
-
-
-
-
-
-
-
+        try:
+            response = await client.request(
+                method=method,
+                url=url,
+                content=payload,
+                headers=dict(headers) if headers else None,
+                timeout=timeout.total_seconds() if timeout else None,
+                stream=True,
+            )
+        except TimeoutException as exc:
+            raise asyncio.TimeoutError from exc
+
         try:
             yield _ImpitResponse(response)
         finally:
crawlee/storage_clients/_file_system/_dataset_client.py
CHANGED

@@ -134,7 +134,7 @@ class FileSystemDatasetClient(DatasetClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = DatasetMetadata(**file_content)

@@ -163,7 +163,7 @@ class FileSystemDatasetClient(DatasetClient):

         # If the dataset directory exists, reconstruct the client from the metadata file.
         if path_to_dataset.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
crawlee/storage_clients/_file_system/_key_value_store_client.py
CHANGED

@@ -133,7 +133,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = KeyValueStoreMetadata(**file_content)

@@ -162,7 +162,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):

         # If the key-value store directory exists, reconstruct the client from the metadata file.
         if path_to_kvs.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:

@@ -239,7 +239,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         # Read the metadata file
         async with self._lock:
             try:
-                file = await asyncio.to_thread(open, record_metadata_filepath)
+                file = await asyncio.to_thread(open, record_metadata_filepath, 'r', encoding='utf-8')
             except FileNotFoundError:
                 logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                 return None
crawlee/storage_clients/_file_system/_request_queue_client.py
CHANGED

@@ -197,7 +197,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue

             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)

@@ -232,7 +232,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):

         # If the RQ directory exists, reconstruct the client from the metadata file.
         if path_to_rq.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open, path_to_metadata)
+            file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:

@@ -775,7 +775,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open, file_path)
+            file = await asyncio.to_thread(open, file_path, 'r', encoding='utf-8')
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
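Why the repeated `'r', encoding='utf-8'` change across the three file-system storage clients matters (an illustrative sketch; the file name is hypothetical): without an explicit encoding, `open()` falls back to the platform's preferred locale encoding, so UTF-8 metadata written on one machine can fail to parse on another, e.g. a Windows host defaulting to cp1252.

import json
import locale

print(locale.getpreferredencoding(False))  # what text-mode open() uses when encoding= is omitted

with open('__metadata__.json', encoding='utf-8') as f:  # hypothetical metadata file
    metadata = json.load(f)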
crawlee/storage_clients/_sql/_storage_client.py
CHANGED

@@ -1,6 +1,5 @@
 from __future__ import annotations

-import sys
 import warnings
 from datetime import timedelta
 from pathlib import Path

@@ -269,14 +268,6 @@ class SqlStorageClient(StorageClient):
                 'Unsupported database. Supported: sqlite, postgresql. Consider using a different database.'
             )

-        # TODO: https://github.com/apify/crawlee-python/issues/1555
-        if 'postgresql' in connection_string and sys.version_info >= (3, 14):
-            raise ValueError(
-                'SqlStorageClient cannot use PostgreSQL with Python 3.14 '
-                'due to asyncpg compatibility limitations. '
-                'Please use Python 3.13 or earlier, or switch to SQLite.'
-            )
-
         self._engine = create_async_engine(
             connection_string,
             future=True,
{crawlee-1.1.1b1.dist-info → crawlee-1.1.2b4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlee
-Version: 1.1.1b1
+Version: 1.1.2b4
 Summary: Crawlee for Python
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://crawlee.dev/python/docs/changelog

@@ -226,6 +226,7 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Software Development :: Libraries
 Requires-Python: >=3.10
+Requires-Dist: async-timeout>=5.0.1
 Requires-Dist: cachetools>=5.5.0
 Requires-Dist: colorama>=0.4.0
 Requires-Dist: impit>=0.8.0

@@ -247,7 +248,7 @@ Requires-Dist: scikit-learn>=1.6.0; extra == 'adaptive-crawler'
 Provides-Extra: all
 Requires-Dist: aiosqlite>=0.21.0; extra == 'all'
 Requires-Dist: apify-fingerprint-datapoints>=0.0.2; extra == 'all'
-Requires-Dist: asyncpg>=0.24.0;
+Requires-Dist: asyncpg>=0.24.0; extra == 'all'
 Requires-Dist: beautifulsoup4[lxml]>=4.12.0; extra == 'all'
 Requires-Dist: browserforge>=1.2.3; extra == 'all'
 Requires-Dist: cookiecutter>=2.6.0; extra == 'all'

@@ -301,7 +302,7 @@ Requires-Dist: playwright>=1.27.0; extra == 'playwright'
 Provides-Extra: redis
 Requires-Dist: redis[hiredis]>=7.0.0; extra == 'redis'
 Provides-Extra: sql-postgres
-Requires-Dist: asyncpg>=0.24.0;
+Requires-Dist: asyncpg>=0.24.0; extra == 'sql-postgres'
 Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'sql-postgres'
 Provides-Extra: sql-sqlite
 Requires-Dist: aiosqlite>=0.21.0; extra == 'sql-sqlite'