crawlee 1.1.2b4-py3-none-any.whl → 1.2.1b7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/__init__.py +2 -1
- crawlee/_request.py +29 -10
- crawlee/_types.py +22 -1
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/recurring_task.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +10 -33
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +31 -20
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_playwright/_playwright_crawler.py +19 -8
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +1 -3
- crawlee/router.py +13 -3
- {crawlee-1.1.2b4.dist-info → crawlee-1.2.1b7.dist-info}/METADATA +7 -14
- {crawlee-1.1.2b4.dist-info → crawlee-1.2.1b7.dist-info}/RECORD +22 -21
- {crawlee-1.1.2b4.dist-info → crawlee-1.2.1b7.dist-info}/WHEEL +0 -0
- {crawlee-1.1.2b4.dist-info → crawlee-1.2.1b7.dist-info}/entry_points.txt +0 -0
- {crawlee-1.1.2b4.dist-info → crawlee-1.2.1b7.dist-info}/licenses/LICENSE +0 -0
crawlee/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from importlib import metadata
 
-from ._request import Request, RequestOptions
+from ._request import Request, RequestOptions, RequestState
 from ._service_locator import service_locator
 from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
 from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
     'HttpHeaders',
     'Request',
     'RequestOptions',
+    'RequestState',
    'RequestTransformAction',
    'SkippedReason',
    'service_locator',
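The practical effect of this hunk is that `RequestState` is re-exported from the package root, so user code no longer has to reach into the private `crawlee._request` module. A minimal sketch (the URL is a placeholder; the `UNPROCESSED` default comes from the `_request.py` hunk below):

```python
from crawlee import Request, RequestState

# Fresh requests start in the UNPROCESSED state and move through the lifecycle
# (BEFORE_NAV, REQUEST_HANDLER, AFTER_NAV, DONE, ...) as the crawler processes them.
request = Request.from_url('https://crawlee.dev')
print(request.state is RequestState.UNPROCESSED)  # expected: True
```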
crawlee/_request.py
CHANGED
@@ -34,14 +34,14 @@ class RequestState(IntEnum):
 class CrawleeRequestData(BaseModel):
     """Crawlee-specific configuration stored in the `user_data`."""
 
-    max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+    max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
     """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
     `BasicCrawler`."""
 
     enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
     """The strategy that was used for enqueuing the request."""
 
-    state: RequestState
+    state: RequestState = RequestState.UNPROCESSED
     """Describes the request's current lifecycle state."""
 
     session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
     always_enqueue: NotRequired[bool]
     user_data: NotRequired[dict[str, JsonSerializable]]
     no_retry: NotRequired[bool]
+    enqueue_strategy: NotRequired[EnqueueStrategy]
+    max_retries: NotRequired[int | None]
 
 
 @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):
 
     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
-    unique_key: Annotated[str, Field(alias='uniqueKey')]
+    unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
     to the same URL.
 
@@ -178,17 +180,18 @@
     and specify which URLs shall be considered equal.
     """
 
-    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+    url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
     """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
     and fragments."""
 
-    method: HttpMethod = 'GET'
+    method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
     """HTTP request method."""
 
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
         PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+        Field(frozen=True),
     ] = None
     """HTTP request payload."""
 
@@ -250,6 +253,8 @@
         keep_url_fragment: bool = False,
         use_extended_unique_key: bool = False,
         always_enqueue: bool = False,
+        enqueue_strategy: EnqueueStrategy | None = None,
+        max_retries: int | None = None,
         **kwargs: Any,
     ) -> Self:
         """Create a new `Request` instance from a URL.
@@ -277,6 +282,9 @@
                 `unique_key` computation. This is only relevant when `unique_key` is not provided.
             always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
                 Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+            enqueue_strategy: The strategy that will be used for enqueuing the request.
+            max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+                option of `BasicCrawler`.
             **kwargs: Additional request properties.
         """
         if unique_key is not None and always_enqueue:
@@ -301,12 +309,27 @@
         if always_enqueue:
             unique_key = f'{crypto_random_object_id()}|{unique_key}'
 
+        user_data_dict = kwargs.pop('user_data', {}) or {}
+        crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+        if max_retries is not None:
+            crawlee_data_dict['maxRetries'] = max_retries
+
+        if enqueue_strategy is not None:
+            crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+        crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+        if crawlee_data:
+            user_data_dict['__crawlee'] = crawlee_data
+
         request = cls(
             url=url,
             unique_key=unique_key,
             method=method,
             headers=headers,
             payload=payload,
+            user_data=user_data_dict,
             **kwargs,
         )
 
@@ -352,7 +375,7 @@
         self.crawlee_data.crawl_depth = new_value
 
     @property
-    def state(self) -> RequestState
+    def state(self) -> RequestState:
         """Crawlee-specific request handling state."""
         return self.crawlee_data.state
 
@@ -365,10 +388,6 @@
         """Crawlee-specific limit on the number of retries of the request."""
         return self.crawlee_data.max_retries
 
-    @max_retries.setter
-    def max_retries(self, new_max_retries: int) -> None:
-        self.crawlee_data.max_retries = new_max_retries
-
     @property
     def session_rotation_count(self) -> int | None:
         """Crawlee-specific number of finished session rotations for the request."""
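A hedged usage sketch of the new `from_url` keyword arguments; the URL and values are placeholders. Note that the `max_retries` setter is removed above, so the per-request retry limit now has to be supplied at creation time:

```python
from crawlee import Request

# Both values end up in the '__crawlee' section of user_data, which is what the
# new CrawleeRequestData handling inside from_url() builds.
request = Request.from_url(
    'https://crawlee.dev',
    max_retries=2,                     # overrides BasicCrawler's max_request_retries for this request
    enqueue_strategy='same-hostname',  # same literal values as enqueue_links(strategy=...)
)

print(request.max_retries)       # 2
print(request.enqueue_strategy)  # 'same-hostname'
```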
crawlee/_types.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import dataclasses
 from collections.abc import Callable, Iterator, Mapping
+from copy import deepcopy
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
 
@@ -260,12 +261,24 @@ class KeyValueStoreChangeRecords:
 class RequestHandlerRunResult:
     """Record of calls to storage-related context helpers."""
 
-    def __init__(
+    def __init__(
+        self,
+        *,
+        key_value_store_getter: GetKeyValueStoreFunction,
+        request: Request,
+    ) -> None:
         self._key_value_store_getter = key_value_store_getter
         self.add_requests_calls = list[AddRequestsKwargs]()
         self.push_data_calls = list[PushDataFunctionCall]()
         self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()
 
+        # Isolated copies for handler execution
+        self._request = deepcopy(request)
+
+    @property
+    def request(self) -> Request:
+        return self._request
+
     async def add_requests(
         self,
         requests: Sequence[str | Request],
@@ -315,6 +328,14 @@ class RequestHandlerRunResult:
 
         return self.key_value_store_changes[id, name, alias]
 
+    def apply_request_changes(self, target: Request) -> None:
+        """Apply tracked changes from handler copy to original request."""
+        if self.request.user_data != target.user_data:
+            target.user_data = self.request.user_data
+
+        if self.request.headers != target.headers:
+            target.headers = self.request.headers
+
 
 @docs_group('Functions')
 class AddRequestsFunction(Protocol):
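A hedged sketch of the new isolation behaviour. `RequestHandlerRunResult` is an internal class, so the private import path and the placeholder getter below are for illustration only (the getter is stored but never invoked in this sketch):

```python
from crawlee import Request
from crawlee._types import RequestHandlerRunResult


async def kvs_getter_placeholder(*args, **kwargs):  # hypothetical stand-in, never called here
    raise NotImplementedError


original = Request.from_url('https://crawlee.dev')
result = RequestHandlerRunResult(key_value_store_getter=kvs_getter_placeholder, request=original)

# Handlers now receive an isolated deep copy of the request...
result.request.user_data['seen'] = True
assert 'seen' not in original.user_data

# ...and the crawler syncs mutated user_data / headers back afterwards.
result.apply_request_changes(target=original)
assert original.user_data['seen'] is True
```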
crawlee/_utils/context.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import asyncio
+import inspect
 from collections.abc import Callable
 from functools import wraps
 from typing import Any, TypeVar
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:
 
         return await method(self, *args, **kwargs)
 
-    return async_wrapper if asyncio.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
+    return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
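For context, the decorator keeps the usual sync/async dispatch pattern, now keyed on `inspect.iscoroutinefunction` instead of the `asyncio` variant (newer Python versions also teach `inspect.iscoroutinefunction` about `unittest.mock.AsyncMock`, per the CPython issue cited in the removed `_event_manager.py` note further below, which is presumably the motivation). A minimal re-creation of the pattern with illustrative names, not the library's:

```python
import inspect
from functools import wraps


def dispatch_by_coroutine(method):
    """Pick a sync or async wrapper at decoration time, like ensure_context does."""

    @wraps(method)
    def sync_wrapper(self, *args, **kwargs):
        return method(self, *args, **kwargs)

    @wraps(method)
    async def async_wrapper(self, *args, **kwargs):
        return await method(self, *args, **kwargs)

    # The decision is made once, when the method is decorated.
    return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper
```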
crawlee/_utils/recurring_task.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import inspect
 from logging import getLogger
 from typing import TYPE_CHECKING
 
@@ -49,7 +50,7 @@ class RecurringTask:
         """
         sleep_time_secs = self.delay.total_seconds()
         while True:
-            await self.func() if asyncio.iscoroutinefunction(self.func) else self.func()
+            await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
             await asyncio.sleep(sleep_time_secs)
 
     def start(self) -> None:
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED
@@ -10,7 +10,7 @@ from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypeVar
 
-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
 from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
@@ -191,6 +191,7 @@ class AbstractHttpCrawler(
         robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
         kwargs.setdefault('strategy', 'same-hostname')
+        strategy = kwargs.get('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
 
@@ -209,7 +210,9 @@ class AbstractHttpCrawler(
             skipped = iter([])
 
         for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-            request_options = RequestOptions(
+            request_options = RequestOptions(
+                url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+            )
 
             if transform_request_function:
                 transform_request_options = transform_request_function(request_options)
@@ -257,6 +260,7 @@ class AbstractHttpCrawler(
             timeout=remaining_timeout,
         )
 
+        context.request.state = RequestState.AFTER_NAV
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
 
     async def _handle_status_code_response(
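The user-visible effect is that the strategy passed to `enqueue_links` is now recorded on every request it produces, via the new `enqueue_strategy` key of `RequestOptions`. A hedged sketch with one of the HTTP-based crawlers; the URL and limits are placeholders:

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # 'same-hostname' is both the default strategy and the value now recorded
        # in each enqueued request's Crawlee metadata.
        await context.enqueue_links(strategy='same-hostname')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```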
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
CHANGED
@@ -290,11 +290,14 @@
            use_state_function = context.use_state

            # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
-            result = RequestHandlerRunResult(
+            result = RequestHandlerRunResult(
+                key_value_store_getter=self.get_key_value_store,
+                request=context.request,
+            )
            context_linked_to_result = BasicCrawlingContext(
-                request=
-                session=
-                proxy_info=
+                request=result.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
                send_request=context.send_request,
                add_requests=result.add_requests,
                push_data=result.push_data,
@@ -314,7 +317,7 @@ class AdaptivePlaywrightCrawler(
                ),
                logger=self._logger,
            )
-            return SubCrawlerRun(result=result
+            return SubCrawlerRun(result=result)
        except Exception as e:
            return SubCrawlerRun(exception=e)

@@ -370,8 +373,7 @@ class AdaptivePlaywrightCrawler(
            self.track_http_only_request_handler_runs()

            static_run = await self._crawl_one(rendering_type='static', context=context)
-            if static_run.result and
-                self._update_context_from_copy(context, static_run.run_context)
+            if static_run.result and self.result_checker(static_run.result):
                self._context_result_map[context] = static_run.result
                return
            if static_run.exception:
@@ -402,7 +404,7 @@ class AdaptivePlaywrightCrawler(
        if pw_run.exception is not None:
            raise pw_run.exception

-        if pw_run.result
+        if pw_run.result:
            if should_detect_rendering_type:
                detection_result: RenderingType
                static_run = await self._crawl_one('static', context=context, state=old_state_copy)
@@ -414,7 +416,6 @@ class AdaptivePlaywrightCrawler(
                context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                self.rendering_type_predictor.store_result(context.request, detection_result)

-            self._update_context_from_copy(context, pw_run.run_context)
            self._context_result_map[context] = pw_run.result

    def pre_navigation_hook(
@@ -451,32 +452,8 @@ class AdaptivePlaywrightCrawler(
    def track_rendering_type_mispredictions(self) -> None:
        self.statistics.state.rendering_type_mispredictions += 1

-    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
-        """Update mutable fields of `context` from `context_copy`.
-
-        Uses object.__setattr__ to bypass frozen dataclass restrictions,
-        allowing state synchronization after isolated crawler execution.
-        """
-        updating_attributes = {
-            'request': ('headers', 'user_data'),
-            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
-        }
-
-        for attr, sub_attrs in updating_attributes.items():
-            original_sub_obj = getattr(context, attr)
-            copy_sub_obj = getattr(context_copy, attr)
-
-            # Check that both sub objects are not None
-            if original_sub_obj is None or copy_sub_obj is None:
-                continue
-
-            for sub_attr in sub_attrs:
-                new_value = getattr(copy_sub_obj, sub_attr)
-                object.__setattr__(original_sub_obj, sub_attr, new_value)
-

 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
-    run_context: BasicCrawlingContext | None = None
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
CHANGED
@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from playwright.async_api import Page, Response
     from typing_extensions import Self
 
-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions
 
 
 TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
crawlee/crawlers/_basic/_basic_crawler.py
CHANGED
@@ -59,6 +59,7 @@ from crawlee.errors import (
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
@@ -68,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
 from ._context_pipeline import ContextPipeline
+from ._context_utils import swaped_context
 from ._logging_utils import (
     get_one_line_error_summary_if_possible,
     reduce_asyncio_timeout_error_to_relevant_traceback_parts,
@@ -1037,7 +1039,12 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         warning_flag = True
 
         for request in request_iterator:
-
+            if isinstance(request, Request):
+                if request.enqueue_strategy != strategy:
+                    request.enqueue_strategy = strategy
+                target_url = request.url
+            else:
+                target_url = request
             parsed_target_url = urlparse(target_url)
 
             if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
@@ -1134,7 +1141,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             request.retry_count += 1
             reduced_error = str(error).split('\n')[0]
             self.log.warning(
-                f'Retrying request to {context.request.url} due to: {reduced_error}'
+                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                 f'{get_one_line_error_summary_if_possible(error)}'
             )
             await self._statistics.error_tracker.add(error=error, context=context)
@@ -1152,6 +1159,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
             await request_manager.reclaim_request(request)
         else:
+            request.state = RequestState.ERROR
             await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)
@@ -1167,8 +1175,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 f'{self._internal_timeout.total_seconds()} seconds',
                 logger=self._logger,
             )
-
-            context.request.state = RequestState.DONE
         except UserDefinedErrorHandlerError:
             context.request.state = RequestState.ERROR
             raise
@@ -1201,8 +1207,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-            await self._mark_request_as_handled(request)
             request.state = RequestState.SKIPPED
+            await self._mark_request_as_handled(request)
 
         url = request.url if isinstance(request, Request) else request
 
@@ -1222,10 +1228,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         if (
             isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
             and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
             used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.
+            used_traceback_parts.extend(traceback_parts[-1:])
 
         return ''.join(used_traceback_parts).strip('\n')
 
@@ -1320,6 +1327,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)
 
+        result.apply_request_changes(target=context.request)
+
     @staticmethod
     async def _commit_key_value_store_changes(
         result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
@@ -1385,10 +1394,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         else:
             session = await self._get_session()
         proxy_info = await self._get_proxy_info(request, session)
-        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)
 
         context = BasicCrawlingContext(
-            request=request,
+            request=result.request,
             session=session,
             proxy_info=proxy_info,
             send_request=self._prepare_send_request_function(session, proxy_info),
@@ -1405,26 +1414,26 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         try:
             request.state = RequestState.REQUEST_HANDLER
 
-            self._check_request_collision(context.request, context.session)
-
             try:
-
+                with swaped_context(context, request):
+                    self._check_request_collision(request, session)
+                    await self._run_request_handler(context=context)
             except asyncio.TimeoutError as e:
                 raise RequestHandlerError(e, context) from e
 
             await self._commit_request_handler_result(context)
 
-            await self._mark_request_as_handled(request)
-
             request.state = RequestState.DONE
 
-
-
+            await self._mark_request_as_handled(request)
+
+            if session and session.is_usable:
+                session.mark_good()
 
             self._statistics.record_request_processing_finish(request.unique_key)
 
         except RequestCollisionError as request_error:
-
+            request.no_retry = True
             await self._handle_request_error(context, request_error)
 
         except RequestHandlerError as primary_error:
@@ -1439,7 +1448,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)
 
         except SessionError as session_error:
-            if not
+            if not session:
                 raise RuntimeError('SessionError raised in a crawling context without a session') from session_error
 
             if self._error_handler:
@@ -1449,10 +1458,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
             self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)
 
-
+            if session:
+                session.retire()
 
             # Increment session rotation count.
-
+            request.session_rotation_count = (request.session_rotation_count or 0) + 1
 
             await request_manager.reclaim_request(request)
             await self._statistics.error_tracker_retry.add(error=session_error, context=context)
@@ -1483,6 +1493,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise
 
     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
+        context.request.state = RequestState.BEFORE_NAV
         await self._context_pipeline(
             context,
             lambda final_context: wait_for(
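One observable consequence of the reshuffled state handling is that `Request.state` is now maintained across the whole pipeline (`BEFORE_NAV` before the context pipeline runs, `REQUEST_HANDLER` while a handler executes, and `DONE`, `ERROR`, or `SKIPPED` at the end). A hedged sketch of reading it from a handler; the URL is a placeholder:

```python
import asyncio

from crawlee import RequestState
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler(max_request_retries=1)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # The router flips the state right before invoking us, so this is
        # expected to log RequestState.REQUEST_HANDLER.
        context.log.info(f'state={context.request.state!r} (expected {RequestState.REQUEST_HANDLER!r})')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```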
crawlee/crawlers/_basic/_context_utils.py
ADDED
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from crawlee._request import Request
+
+    from ._basic_crawling_context import BasicCrawlingContext
+
+
+@contextmanager
+def swaped_context(
+    context: BasicCrawlingContext,
+    request: Request,
+) -> Iterator[None]:
+    """Replace context's isolated copies with originals after handler execution."""
+    try:
+        yield
+    finally:
+        # Restore original context state to avoid side effects between different handlers.
+        object.__setattr__(context, 'request', request)
crawlee/crawlers/_basic/_logging_utils.py
CHANGED
@@ -2,9 +2,21 @@ import asyncio
 import re
 import traceback
 
+import crawlee.errors
+
 
 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,20 @@
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )
 
 
 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-
+        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+        # code and third line the topmost user error
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
     elif 'playwright._impl._errors.Error' in str(error.__class__):
         # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
         # point to deep internals.
crawlee/crawlers/_playwright/_playwright_crawler.py
CHANGED
@@ -13,7 +13,7 @@ from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar
 
 from crawlee import service_locator
-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._types import (
     BasicCrawlingContext,
     ConcurrencySettings,
@@ -35,6 +35,7 @@ from crawlee.statistics import StatisticsState
 from ._playwright_crawling_context import PlaywrightCrawlingContext
 from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
 from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
+from ._types import GotoOptions
 from ._utils import block_requests, infinite_scroll
 
 TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
@@ -108,6 +109,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         user_data_dir: str | Path | None = None,
         browser_launch_options: Mapping[str, Any] | None = None,
         browser_new_context_options: Mapping[str, Any] | None = None,
+        goto_options: GotoOptions | None = None,
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
@@ -142,6 +144,8 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
                 This option should not be used if `browser_pool` is provided.
             navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
                 the request handler)
+            goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
+                not supported, use `navigation_timeout` instead.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
@@ -213,6 +217,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
 
         self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+        self._goto_options = goto_options or GotoOptions()
 
         super().__init__(**kwargs)
 
@@ -238,6 +243,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             log=context.log,
             page=crawlee_page.page,
             block_requests=partial(block_requests, page=crawlee_page.page),
+            goto_options=GotoOptions(**self._goto_options),
         )
 
         context_id = id(pre_navigation_context)
@@ -321,8 +327,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         try:
             async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
                 response = await context.page.goto(
-                    context.request.url, timeout=remaining_timeout.total_seconds() * 1000
+                    context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
                 )
+                context.request.state = RequestState.AFTER_NAV
         except playwright.async_api.TimeoutError as exc:
             raise asyncio.TimeoutError from exc
 
@@ -351,6 +358,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             extract_links=extract_links,
             enqueue_links=self._create_enqueue_links_function(context, extract_links),
             block_requests=partial(block_requests, page=context.page),
+            goto_options=context.goto_options,
         )
 
         if context.session:
@@ -391,6 +399,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
         kwargs.setdefault('strategy', 'same-hostname')
+        strategy = kwargs.get('strategy', 'same-hostname')
 
         elements = await context.page.query_selector_all(selector)
         links_iterator: Iterator[str] = iter(
@@ -409,17 +418,19 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
             skipped = iter([])
 
         for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-
+            request_options = RequestOptions(
+                url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+            )
 
             if transform_request_function:
-
-                if
+                transform_request_options = transform_request_function(request_options)
+                if transform_request_options == 'skip':
                     continue
-                if
-
+                if transform_request_options != 'unchanged':
+                    request_options = transform_request_options
 
             try:
-                request = Request.from_url(**
+                request = Request.from_url(**request_options)
             except ValidationError as exc:
                 context.log.debug(
                     f'Skipping URL "{url}" due to invalid format: {exc}. '
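A hedged sketch of the new `goto_options` parameter (Playwright must be installed; the URL is a placeholder). `GotoOptions` is a `TypedDict`, so a plain dict with the same keys is accepted; per the docstring above, use `navigation_timeout` rather than a `timeout` key:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        headless=True,
        # Forwarded to page.goto() on every navigation, on top of the timeout
        # derived from navigation_timeout.
        goto_options={'wait_until': 'domcontentloaded', 'referer': 'https://crawlee.dev'},
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Loaded {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```

The same options are also exposed on the pre-navigation context (`context.goto_options`, see the next file), so a pre-navigation hook can presumably tweak them per request before `page.goto()` runs.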
crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py
CHANGED
@@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group
 if TYPE_CHECKING:
     from playwright.async_api import Page
 
-    from ._types import BlockRequestsFunction
+    from ._types import BlockRequestsFunction, GotoOptions
 
 
 @dataclass(frozen=True)
@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     async def get_snapshot(self) -> PageSnapshot:
         """Get snapshot of crawled page."""
         html = None
crawlee/crawlers/_playwright/_types.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Protocol
+from typing import TYPE_CHECKING, Literal, Protocol, TypedDict
 
 from crawlee import HttpHeaders
 from crawlee._utils.docs import docs_group
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
 
     from playwright.async_api import APIResponse, Response
-    from typing_extensions import Self
+    from typing_extensions import NotRequired, Self
 
 
 @docs_group('Functions')
@@ -58,3 +58,13 @@ class PlaywrightHttpResponse:
         _content = await response.body()
 
         return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
+
+
+class GotoOptions(TypedDict):
+    """Keyword arguments for Playwright's `Page.goto()` method."""
+
+    wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
+    """When to consider operation succeeded, defaults to 'load' event."""
+
+    referer: NotRequired[str]
+    """Referer header value."""
crawlee/errors.py
CHANGED
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
     """Wraps an exception thrown from an user-defined error handler."""
 
 
+class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
+    """Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out."""
+
+
 @docs_group('Errors')
 class SessionError(Exception):
     """Errors of `SessionError` type will trigger a session rotation.
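A hedged sketch of the distinction the new class draws: a timeout raised inside the user's own handler code (simulated here with `asyncio.wait_for`) is re-raised by the router as `UserHandlerTimeoutError`, so logging can tell it apart from the user-defined handler itself timing out; the URL is a placeholder:

```python
import asyncio

from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler(max_request_retries=1)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # This TimeoutError originates in user code, not in the crawler's own
        # timeout machinery, so the router wraps it in UserHandlerTimeoutError.
        await asyncio.wait_for(asyncio.sleep(1), timeout=0.01)

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```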
crawlee/events/_event_manager.py
CHANGED
@@ -174,11 +174,9 @@ class EventManager:
         # to avoid blocking the event loop
         coro = (
             listener(*bound_args.args, **bound_args.kwargs)
-            if asyncio.iscoroutinefunction(listener)
+            if inspect.iscoroutinefunction(listener)
             else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
         )
-        # Note: use `asyncio.iscoroutinefunction` rather then `inspect.iscoroutinefunction` since it works with
-        # unittests.mock.AsyncMock. See https://github.com/python/cpython/issues/84753.
 
         listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}')
         self._listener_tasks.add(listener_task)
crawlee/router.py
CHANGED
@@ -1,13 +1,17 @@
 from __future__ import annotations
 
+import asyncio
 from collections.abc import Awaitable, Callable
 from typing import Generic, TypeVar
 
+from crawlee._request import RequestState
 from crawlee._types import BasicCrawlingContext
 from crawlee._utils.docs import docs_group
 
 __all__ = ['Router']
 
+from crawlee.errors import UserHandlerTimeoutError
+
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
 RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]
 
@@ -89,13 +93,19 @@ class Router(Generic[TCrawlingContext]):
 
     async def __call__(self, context: TCrawlingContext) -> None:
         """Invoke a request handler that matches the request label (or the default)."""
+        context.request.state = RequestState.REQUEST_HANDLER
         if context.request.label is None or context.request.label not in self._handlers_by_label:
             if self._default_handler is None:
                 raise RuntimeError(
                     f'No handler matches label `{context.request.label}` and no default handler is configured'
                 )
 
-
+            user_defined_handler = self._default_handler
+        else:
+            user_defined_handler = self._handlers_by_label[context.request.label]
 
-
-
+        try:
+            return await user_defined_handler(context)
+        except asyncio.TimeoutError as e:
+            # Timeout in handler, but not timeout of handler.
+            raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
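For reference, a hedged sketch of how the router is typically wired up, which is the code path the hunk above changes; the labels, URL, and limits are placeholders:

```python
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.router import Router

router = Router[ParselCrawlingContext]()


@router.default_handler
async def list_handler(context: ParselCrawlingContext) -> None:
    # Requests without a label (or with an unknown one) land here.
    await context.enqueue_links(label='DETAIL', strategy='same-hostname')


@router.handler('DETAIL')
async def detail_handler(context: ParselCrawlingContext) -> None:
    await context.push_data({'url': context.request.url})


async def main() -> None:
    crawler = ParselCrawler(request_handler=router, max_requests_per_crawl=20)
    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```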
{crawlee-1.1.2b4.dist-info → crawlee-1.2.1b7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlee
-Version: 1.1.2b4
+Version: 1.2.1b7
 Summary: Crawlee for Python
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
@@ -324,19 +324,12 @@ Description-Content-Type: text/markdown
     <a href="https://trendshift.io/repositories/11169" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11169" alt="apify%2Fcrawlee-python | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 </p>
 
-<p align=center>
-
-
-
-
-
-    </a>
-    <a href="https://pypi.org/project/crawlee/" rel="nofollow">
-        <img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI - Python Version" style="max-width: 100%;">
-    </a>
-    <a href="https://discord.gg/jyEM2PRvMU" rel="nofollow">
-        <img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on discord" style="max-width: 100%;">
-    </a>
+<p align="center">
+    <a href="https://badge.fury.io/py/crawlee" rel="nofollow"><img src="https://badge.fury.io/py/crawlee.svg" alt="PyPI package version"></a>
+    <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/dm/crawlee" alt="PyPI package downloads"></a>
+    <a href="https://codecov.io/gh/apify/crawlee-python"><img src="https://codecov.io/gh/apify/crawlee-python/graph/badge.svg?token=cCju61iPQG" alt="Codecov report"></a>
+    <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI Python version"></a>
+    <a href="https://discord.gg/jyEM2PRvMU" rel="nofollow"><img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on Discord"></a>
 </p>
 
 Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**
{crawlee-1.1.2b4.dist-info → crawlee-1.2.1b7.dist-info}/RECORD
CHANGED
@@ -1,16 +1,16 @@
-crawlee/__init__.py,sha256=
+crawlee/__init__.py,sha256=ECFcNbLQp3HX-o6K4eMo38rZQ5NnZg7udvEEkjkqnuw,548
 crawlee/_browserforge_workaround.py,sha256=FYQaqpqfZGYkx-A8evF9nsHnj4KK4IMtjNq3LtmX_vA,1664
 crawlee/_cli.py,sha256=czuEsGD8QYEiq5gtMcBxrL08hQ5OJQQkMVhAr1pvDaQ,10353
 crawlee/_consts.py,sha256=RQ96gx7V-WPH91cVsMUz76X5UZUNDNhCudtlyGkxFVk,133
 crawlee/_log_config.py,sha256=VyxoEfWCq_9fyicmmJbjiZ5KC91onMcAtX2L4oKX4m4,5999
-crawlee/_request.py,sha256=
+crawlee/_request.py,sha256=M8hTSs5dJTBBW0JIDh0QSUhWyEWarEg86Un9kX12qy4,17374
 crawlee/_service_locator.py,sha256=SJ8ABYtclBl7rz8kfZ2jZkIgKq5oNIoGT7WmN8ApTzo,5058
-crawlee/_types.py,sha256=
+crawlee/_types.py,sha256=_CQyq1BmvuHr0p25NFn6rHbgsiuR65o8gLxCCuQWfAg,30534
 crawlee/configuration.py,sha256=DWS2z1FC6Ua93W2tStK3R1ZKZbZjVQYWGiGFbZFaRtA,8064
-crawlee/errors.py,sha256=
+crawlee/errors.py,sha256=fnAFpyvJKMDq3VDGr1iq1E-JqnfoOEI7cd8YjDaqb9s,4062
 crawlee/proxy_configuration.py,sha256=rqf67yerXvLvraBaAHW04nvf5ECze3wMQbK7LlqXucM,10386
 crawlee/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlee/router.py,sha256=
+crawlee/router.py,sha256=HbKxE22r8ZVu93tIxBdGObMa3fGPcuSvKthqibimekU,4252
 crawlee/_autoscaling/__init__.py,sha256=t6Z44gU488C0UmkBCTtwsgAR8iqJcv2g4ZlC4NYh0ZI,182
 crawlee/_autoscaling/_types.py,sha256=xnrRHXYOVn7GwELLVHi_y7B-Ic7u3hPkYl3P-LT3Fhk,5453
 crawlee/_autoscaling/autoscaled_pool.py,sha256=Bcu2jDgK2SYMnZN5xfjs8Oxti0ZxrktjydWv3J0Hz48,12214
@@ -21,7 +21,7 @@ crawlee/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlee/_utils/blocked.py,sha256=sxN99AouFXMoe6uG1EvCTCmKMGk73DBMUk9nOkWK86I,863
 crawlee/_utils/byte_size.py,sha256=zs4qWUEDgTGDqYfUJ7t5edWNYYJCG8Y1EyJ9GASfRL4,3744
 crawlee/_utils/console.py,sha256=vAIM8AO7cT-HdXg44eR8zQyHAHk8X8G7J1KKFCBL2LY,2242
-crawlee/_utils/context.py,sha256=
+crawlee/_utils/context.py,sha256=LFIXjJQBhv94j1prbK-2yjH3EXg5jPOfVqW8P6cwNIY,1726
 crawlee/_utils/crypto.py,sha256=tYzn2z91KgV3ugxz4CKtSTcCjW-3FC8un7hpDNCl6rs,757
 crawlee/_utils/docs.py,sha256=S09-3xAQAlUvrmPpBXVJpE8wblB8LtS6QduLNncfqdQ,1130
 crawlee/_utils/file.py,sha256=FJHTC25qSWQs3ZhCZrLgs0cUwA9K81MlQRGEmcWKAQU,5758
@@ -30,7 +30,7 @@ crawlee/_utils/html_to_text.py,sha256=1iykT-OXd2xXNy7isHVWHqPxe23X82CGQBHIfbZbZk
 crawlee/_utils/models.py,sha256=EqM50Uc-xvxKlLCLA2lPpRduzfKvT0z_-Q-UWG8aTRQ,1955
 crawlee/_utils/raise_if_too_many_kwargs.py,sha256=J2gaUJmsmNwexohuehXw_mdYKv-eWiui6WUHFsQ3qTQ,597
 crawlee/_utils/recoverable_state.py,sha256=c1D2ZecxEliGZzhqYz9_oU5CF2Hm0UKvpOHqO6CDJRE,9032
-crawlee/_utils/recurring_task.py,sha256=
+crawlee/_utils/recurring_task.py,sha256=_injmSsvG4p0xS4nBtoZZIR02syBG8JcLkuwgNDL8Nc,2143
 crawlee/_utils/requests.py,sha256=yOjai7bHR9_duPJ0ck-L76y9AnKZr49JBfSOQv9kvJc,5048
 crawlee/_utils/robots.py,sha256=DBU5ni4Y-p7bIKMbLd_ws8wgHSFc4K8zPVF3JvH_pkw,4661
 crawlee/_utils/sitemap.py,sha256=UI9EJiFiyFvV5_flVUtdsEVz8ZsJeRERPtcx8ZsqjTU,16632
@@ -53,22 +53,23 @@ crawlee/crawlers/__init__.py,sha256=jNFMsPizSgCN0ARYSmHs9Ppk8yvGgjUH5PxUeDchljE,
 crawlee/crawlers/_types.py,sha256=xbGTJQirgz5wUbfr12afMR4q-_5AWP7ngF2e8K5P8l0,355
 crawlee/crawlers/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlee/crawlers/_abstract_http/__init__.py,sha256=h8jVWcPbDXzWHill1Vm7J7iliJW0hIrea0gkg-Hkb-M,319
-crawlee/crawlers/_abstract_http/_abstract_http_crawler.py,sha256=
+crawlee/crawlers/_abstract_http/_abstract_http_crawler.py,sha256=Y12SBNAiF8QNJH83s6pPoao1W5ZSUhxHRHHKjE0qZhk,13174
 crawlee/crawlers/_abstract_http/_abstract_http_parser.py,sha256=Y5o_hiW_0mQAte5GFqkUxscwKEFpWrBYRsLKP1cfBwE,3521
 crawlee/crawlers/_abstract_http/_http_crawling_context.py,sha256=Rno_uJ8ivmyRxFQv2MyY_z9B5WPHSEd5MAPz31_1ZIo,2179
 crawlee/crawlers/_abstract_http/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlee/crawlers/_adaptive_playwright/__init__.py,sha256=LREq9WR9BKsE8S8lSsEhlCoNjQaLhlJ9yo8y_6a8o4c,1072
-crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=
+crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=qAtZUwzGMwASwl5NKLAOsYnVA03IpZkk-BLKm3SwHoM,21588
 crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py,sha256=_At8T8S3JLGPA-1AeCFGrpE-FuCDW9sazrXt9U0tK6U,1048
-crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py,sha256=
+crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py,sha256=tejw-yfA8zVR8L-shIZOTFoMUQOI5Kt7FBJa8H0q4H0,10664
 crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkbIN_059jUyCG8Z6XAb_FBLClIKw7z-aDvjon2I,10834
 crawlee/crawlers/_adaptive_playwright/_result_comparator.py,sha256=NAfw5VKzTnkvARtLr_zrZj6UGeMp05Voc6Oi8oPxU3w,1747
 crawlee/crawlers/_adaptive_playwright/_utils.py,sha256=EUYVz5i2YkLpL_gbVRp9BAD5u6w1xJ_AFzc_qB9bdDQ,1102
 crawlee/crawlers/_basic/__init__.py,sha256=LPln8SiBBXSMqrApiFUfpqz3hvqxN5HUa1cHQXMVKgU,280
-crawlee/crawlers/_basic/_basic_crawler.py,sha256=
+crawlee/crawlers/_basic/_basic_crawler.py,sha256=wTZW_1vM2A1x14VADRBsUr0TJzKfGoJODeHX0gOZnnY,73914
 crawlee/crawlers/_basic/_basic_crawling_context.py,sha256=fjxm2RQXMDkDlWu38dQ3xn5rrGUOhJXkXiqkgbFJFk4,155
 crawlee/crawlers/_basic/_context_pipeline.py,sha256=vM8EEvnCoguERjRV3oyrxUq2Ln2F9DzY7P5dAEiuMHo,5869
-crawlee/crawlers/_basic/
+crawlee/crawlers/_basic/_context_utils.py,sha256=U1s0nl7EW9k-JrZA2VM7d_aWnE7Je3lXK04RFrXvRC4,655
+crawlee/crawlers/_basic/_logging_utils.py,sha256=6Q206Sv0RzHztwu5y5XSdUpZhpqQ5-zSapQzUY9GxCo,4014
 crawlee/crawlers/_basic/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 crawlee/crawlers/_beautifulsoup/__init__.py,sha256=7pL273ashA7yYDrH6nokYZ7SAMUAezilGIWdfThi_Co,822
 crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py,sha256=Q8Sb_lflpdYIwDZ1fIeuquPzdDG2zCnKsrcj8fe8n6k,3056
@@ -85,14 +86,14 @@ crawlee/crawlers/_parsel/_parsel_crawling_context.py,sha256=sZB26RcRLjSoD15myEOM
 crawlee/crawlers/_parsel/_parsel_parser.py,sha256=yWBfuXUHMriK4DRnyrXTQoGeqX5WV9bOEkBp_g0YCvQ,1540
 crawlee/crawlers/_parsel/_utils.py,sha256=MbRwx-cdjlq1zLzFYf64M3spOGQ6yxum4FvP0sdqA_Q,2693
 crawlee/crawlers/_playwright/__init__.py,sha256=6Cahe6VEF82o8CYiP8Cmp58Cmb6Rb8uMeyy7wnwe5ms,837
-crawlee/crawlers/_playwright/_playwright_crawler.py,sha256=
+crawlee/crawlers/_playwright/_playwright_crawler.py,sha256=WappMIb0w-AnS745vlJpQNxwibKS7ok6_5a6iAcoTDs,26207
 crawlee/crawlers/_playwright/_playwright_crawling_context.py,sha256=Oi0tMBXHaEDlFjqG01DzgB7Ck52bjVjz-X__eMioxas,1249
 crawlee/crawlers/_playwright/_playwright_http_client.py,sha256=4mvaCI9Zum7znbm0F-ZZ6T1FEqZ-N-cvPOk1iqtcUSo,4164
-crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py,sha256=
-crawlee/crawlers/_playwright/_types.py,sha256=
+crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py,sha256=NFenJKgXcPuifaVYc2sdU5AV2BX6836GUuqFTE2Q0lU,1545
+crawlee/crawlers/_playwright/_types.py,sha256=D4MaRWgYdps1CwgNWURJRLKkJk_9Oyue70jvkHAxnEU,2534
 crawlee/crawlers/_playwright/_utils.py,sha256=FQ_-LYo7DGHsNHRrTtWt3mC06VzQvQ2wkGqpA2wBzYU,3441
 crawlee/events/__init__.py,sha256=YMgOXKI0LsXfImKQy06PZ2Vdjy-uD_-acioagHft1do,577
-crawlee/events/_event_manager.py,sha256=
+crawlee/events/_event_manager.py,sha256=M8nKPc2BJo8RIBVHaG9BYuks0jwt5v3BFYQLA7IvolI,11380
 crawlee/events/_local_event_manager.py,sha256=CSiMJ6a_BwX0PPwtffEOtHm21dmALJz1zifo3AuMAk8,3708
 crawlee/events/_types.py,sha256=MKsI014OOKKhjPJRrvWYrezIDGoLjGGhWXrkqYw26Ns,3313
 crawlee/events/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -199,8 +200,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
 crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
 crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
 crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlee-1.
-crawlee-1.
-crawlee-1.
-crawlee-1.
-crawlee-1.
+crawlee-1.2.1b7.dist-info/METADATA,sha256=YPjeW0r_pqD_lHRtFfJ8GL84Z4t1IvEgu1uBtxc4IuY,29526
+crawlee-1.2.1b7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+crawlee-1.2.1b7.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
+crawlee-1.2.1b7.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+crawlee-1.2.1b7.dist-info/RECORD,,
{crawlee-1.1.2b4.dist-info → crawlee-1.2.1b7.dist-info}/WHEEL
File without changes
{crawlee-1.1.2b4.dist-info → crawlee-1.2.1b7.dist-info}/entry_points.txt
File without changes
{crawlee-1.1.2b4.dist-info → crawlee-1.2.1b7.dist-info}/licenses/LICENSE
File without changes