crawlee 1.1.2b4__py3-none-any.whl → 1.2.1b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


crawlee/__init__.py CHANGED
@@ -1,6 +1,6 @@
  from importlib import metadata

- from ._request import Request, RequestOptions
+ from ._request import Request, RequestOptions, RequestState
  from ._service_locator import service_locator
  from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
  from ._utils.globs import Glob
@@ -14,6 +14,7 @@ __all__ = [
  'HttpHeaders',
  'Request',
  'RequestOptions',
+ 'RequestState',
  'RequestTransformAction',
  'SkippedReason',
  'service_locator',
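
Note: `RequestState` is now re-exported from the package root, so request lifecycle states can be inspected without importing from the private `crawlee._request` module. A minimal sketch of the new import path (the `UNPROCESSED` default comes from the `CrawleeRequestData` change below):

    from crawlee import Request, RequestState

    request = Request.from_url('https://crawlee.dev')
    # Freshly created requests now start in an explicit state instead of `None`.
    assert request.state == RequestState.UNPROCESSED
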
crawlee/_request.py CHANGED
@@ -34,14 +34,14 @@ class RequestState(IntEnum):
  class CrawleeRequestData(BaseModel):
  """Crawlee-specific configuration stored in the `user_data`."""

- max_retries: Annotated[int | None, Field(alias='maxRetries')] = None
+ max_retries: Annotated[int | None, Field(alias='maxRetries', frozen=True)] = None
  """Maximum number of retries for this request. Allows to override the global `max_request_retries` option of
  `BasicCrawler`."""

  enqueue_strategy: Annotated[EnqueueStrategy | None, Field(alias='enqueueStrategy')] = None
  """The strategy that was used for enqueuing the request."""

- state: RequestState | None = None
+ state: RequestState = RequestState.UNPROCESSED
  """Describes the request's current lifecycle state."""

  session_rotation_count: Annotated[int | None, Field(alias='sessionRotationCount')] = None
@@ -137,6 +137,8 @@ class RequestOptions(TypedDict):
  always_enqueue: NotRequired[bool]
  user_data: NotRequired[dict[str, JsonSerializable]]
  no_retry: NotRequired[bool]
+ enqueue_strategy: NotRequired[EnqueueStrategy]
+ max_retries: NotRequired[int | None]


  @docs_group('Storage data')
@@ -166,7 +168,7 @@ class Request(BaseModel):

  model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

- unique_key: Annotated[str, Field(alias='uniqueKey')]
+ unique_key: Annotated[str, Field(alias='uniqueKey', frozen=True)]
  """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
  to the same URL.

@@ -178,17 +180,18 @@ class Request(BaseModel):
  and specify which URLs shall be considered equal.
  """

- url: Annotated[str, BeforeValidator(validate_http_url), Field()]
+ url: Annotated[str, BeforeValidator(validate_http_url), Field(frozen=True)]
  """The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
  and fragments."""

- method: HttpMethod = 'GET'
+ method: Annotated[HttpMethod, Field(frozen=True)] = 'GET'
  """HTTP request method."""

  payload: Annotated[
  HttpPayload | None,
  BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
  PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
+ Field(frozen=True),
  ] = None
  """HTTP request payload."""

@@ -250,6 +253,8 @@ class Request(BaseModel):
  keep_url_fragment: bool = False,
  use_extended_unique_key: bool = False,
  always_enqueue: bool = False,
+ enqueue_strategy: EnqueueStrategy | None = None,
+ max_retries: int | None = None,
  **kwargs: Any,
  ) -> Self:
  """Create a new `Request` instance from a URL.
@@ -277,6 +282,9 @@
  `unique_key` computation. This is only relevant when `unique_key` is not provided.
  always_enqueue: If set to `True`, the request will be enqueued even if it is already present in the queue.
  Using this is not allowed when a custom `unique_key` is also provided and will result in a `ValueError`.
+ enqueue_strategy: The strategy that will be used for enqueuing the request.
+ max_retries: Maximum number of retries for this request. Allows to override the global `max_request_retries`
+ option of `BasicCrawler`.
  **kwargs: Additional request properties.
  """
  if unique_key is not None and always_enqueue:
@@ -301,12 +309,27 @@
  if always_enqueue:
  unique_key = f'{crypto_random_object_id()}|{unique_key}'

+ user_data_dict = kwargs.pop('user_data', {}) or {}
+ crawlee_data_dict = user_data_dict.get('__crawlee', {})
+
+ if max_retries is not None:
+ crawlee_data_dict['maxRetries'] = max_retries
+
+ if enqueue_strategy is not None:
+ crawlee_data_dict['enqueueStrategy'] = enqueue_strategy
+
+ crawlee_data = CrawleeRequestData(**crawlee_data_dict)
+
+ if crawlee_data:
+ user_data_dict['__crawlee'] = crawlee_data
+
  request = cls(
  url=url,
  unique_key=unique_key,
  method=method,
  headers=headers,
  payload=payload,
+ user_data=user_data_dict,
  **kwargs,
  )

@@ -352,7 +375,7 @@
  self.crawlee_data.crawl_depth = new_value

  @property
- def state(self) -> RequestState | None:
+ def state(self) -> RequestState:
  """Crawlee-specific request handling state."""
  return self.crawlee_data.state

@@ -365,10 +388,6 @@
  """Crawlee-specific limit on the number of retries of the request."""
  return self.crawlee_data.max_retries

- @max_retries.setter
- def max_retries(self, new_max_retries: int) -> None:
- self.crawlee_data.max_retries = new_max_retries
-
  @property
  def session_rotation_count(self) -> int | None:
  """Crawlee-specific number of finished session rotations for the request."""
crawlee/_types.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations

  import dataclasses
  from collections.abc import Callable, Iterator, Mapping
+ from copy import deepcopy
  from dataclasses import dataclass
  from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload

@@ -260,12 +261,24 @@ class KeyValueStoreChangeRecords:
  class RequestHandlerRunResult:
  """Record of calls to storage-related context helpers."""

- def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None:
+ def __init__(
+ self,
+ *,
+ key_value_store_getter: GetKeyValueStoreFunction,
+ request: Request,
+ ) -> None:
  self._key_value_store_getter = key_value_store_getter
  self.add_requests_calls = list[AddRequestsKwargs]()
  self.push_data_calls = list[PushDataFunctionCall]()
  self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()

+ # Isolated copies for handler execution
+ self._request = deepcopy(request)
+
+ @property
+ def request(self) -> Request:
+ return self._request
+
  async def add_requests(
  self,
  requests: Sequence[str | Request],
@@ -315,6 +328,14 @@

  return self.key_value_store_changes[id, name, alias]

+ def apply_request_changes(self, target: Request) -> None:
+ """Apply tracked changes from handler copy to original request."""
+ if self.request.user_data != target.user_data:
+ target.user_data = self.request.user_data
+
+ if self.request.headers != target.headers:
+ target.headers = self.request.headers
+

  @docs_group('Functions')
  class AddRequestsFunction(Protocol):
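
For orientation, this is roughly how the new constructor argument and `apply_request_changes()` are wired together elsewhere in this diff: the handler operates on a deep copy held by the run result, and only `user_data`/`headers` differences are copied back afterwards. A simplified sketch against this internal API (not a public extension point; the key-value-store getter is a placeholder):

    from crawlee import Request
    from crawlee._types import RequestHandlerRunResult

    async def kvs_getter(*args, **kwargs):  # placeholder, not exercised in this sketch
        raise NotImplementedError

    original = Request.from_url('https://crawlee.dev')
    result = RequestHandlerRunResult(key_value_store_getter=kvs_getter, request=original)

    # A handler mutates the isolated copy...
    result.request.user_data['seen'] = True

    # ...and the crawler reconciles the copy with the original request afterwards.
    result.apply_request_changes(target=original)
    assert original.user_data['seen'] is True
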
crawlee/_utils/context.py CHANGED
@@ -1,6 +1,6 @@
  from __future__ import annotations

- import asyncio
+ import inspect
  from collections.abc import Callable
  from functools import wraps
  from typing import Any, TypeVar
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:

  return await method(self, *args, **kwargs)

- return async_wrapper if asyncio.iscoroutinefunction(method) else sync_wrapper # type: ignore[return-value]
+ return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper # type: ignore[return-value]
crawlee/_utils/recurring_task.py CHANGED
@@ -1,6 +1,7 @@
  from __future__ import annotations

  import asyncio
+ import inspect
  from logging import getLogger
  from typing import TYPE_CHECKING

@@ -49,7 +50,7 @@ class RecurringTask:
  """
  sleep_time_secs = self.delay.total_seconds()
  while True:
- await self.func() if asyncio.iscoroutinefunction(self.func) else self.func()
+ await self.func() if inspect.iscoroutinefunction(self.func) else self.func()
  await asyncio.sleep(sleep_time_secs)

  def start(self) -> None:
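
This file, `crawlee/_utils/context.py` above, and the event manager below all switch from `asyncio.iscoroutinefunction` to `inspect.iscoroutinefunction`; the sync-or-async dispatch stays the same, only the helper changes, since the `asyncio` variant is deprecated in recent CPython releases. A small standalone sketch of the pattern, outside Crawlee:

    import asyncio
    import inspect

    async def run_callback(func) -> None:
        """Await `func` if it is a coroutine function, otherwise call it synchronously."""
        if inspect.iscoroutinefunction(func):
            await func()
        else:
            func()

    async def main() -> None:
        async def async_cb() -> None:
            print('async callback')

        await run_callback(async_cb)                         # awaited
        await run_callback(lambda: print('sync callback'))   # called directly

    asyncio.run(main())
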
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py CHANGED
@@ -10,7 +10,7 @@ from more_itertools import partition
  from pydantic import ValidationError
  from typing_extensions import NotRequired, TypeVar

- from crawlee._request import Request, RequestOptions
+ from crawlee._request import Request, RequestOptions, RequestState
  from crawlee._utils.docs import docs_group
  from crawlee._utils.time import SharedTimeout
  from crawlee._utils.urls import to_absolute_url_iterator
@@ -191,6 +191,7 @@ class AbstractHttpCrawler(
  robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

  kwargs.setdefault('strategy', 'same-hostname')
+ strategy = kwargs.get('strategy', 'same-hostname')

  links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))

@@ -209,7 +210,9 @@ class AbstractHttpCrawler(
  skipped = iter([])

  for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
- request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
+ request_options = RequestOptions(
+ url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+ )

  if transform_request_function:
  transform_request_options = transform_request_function(request_options)
@@ -257,6 +260,7 @@ class AbstractHttpCrawler(
  timeout=remaining_timeout,
  )

+ context.request.state = RequestState.AFTER_NAV
  yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

  async def _handle_status_code_response(
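
Because the resolved strategy is now written into each `RequestOptions`, a `transform_request_function` passed to `enqueue_links` can see (or override) the enqueue strategy per link. A hedged example with `ParselCrawler`; any `AbstractHttpCrawler` subclass behaves the same:

    from crawlee import RequestOptions, RequestTransformAction
    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

    crawler = ParselCrawler()

    def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
        # `enqueue_strategy` is pre-filled with the strategy used by `enqueue_links`.
        if options.get('enqueue_strategy') == 'same-hostname' and '/blog/' in options['url']:
            options['label'] = 'BLOG'
            return options
        return 'unchanged'

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.enqueue_links(transform_request_function=transform)
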
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py CHANGED
@@ -290,11 +290,14 @@ class AdaptivePlaywrightCrawler(
  use_state_function = context.use_state

  # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
- result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+ result = RequestHandlerRunResult(
+ key_value_store_getter=self.get_key_value_store,
+ request=context.request,
+ )
  context_linked_to_result = BasicCrawlingContext(
- request=deepcopy(context.request),
- session=deepcopy(context.session),
- proxy_info=deepcopy(context.proxy_info),
+ request=result.request,
+ session=context.session,
+ proxy_info=context.proxy_info,
  send_request=context.send_request,
  add_requests=result.add_requests,
  push_data=result.push_data,
@@ -314,7 +317,7 @@ class AdaptivePlaywrightCrawler(
  ),
  logger=self._logger,
  )
- return SubCrawlerRun(result=result, run_context=context_linked_to_result)
+ return SubCrawlerRun(result=result)
  except Exception as e:
  return SubCrawlerRun(exception=e)

@@ -370,8 +373,7 @@ class AdaptivePlaywrightCrawler(
  self.track_http_only_request_handler_runs()

  static_run = await self._crawl_one(rendering_type='static', context=context)
- if static_run.result and static_run.run_context and self.result_checker(static_run.result):
- self._update_context_from_copy(context, static_run.run_context)
+ if static_run.result and self.result_checker(static_run.result):
  self._context_result_map[context] = static_run.result
  return
  if static_run.exception:
@@ -402,7 +404,7 @@ class AdaptivePlaywrightCrawler(
  if pw_run.exception is not None:
  raise pw_run.exception

- if pw_run.result and pw_run.run_context:
+ if pw_run.result:
  if should_detect_rendering_type:
  detection_result: RenderingType
  static_run = await self._crawl_one('static', context=context, state=old_state_copy)
@@ -414,7 +416,6 @@ class AdaptivePlaywrightCrawler(
  context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
  self.rendering_type_predictor.store_result(context.request, detection_result)

- self._update_context_from_copy(context, pw_run.run_context)
  self._context_result_map[context] = pw_run.result

  def pre_navigation_hook(
@@ -451,32 +452,8 @@ class AdaptivePlaywrightCrawler(
  def track_rendering_type_mispredictions(self) -> None:
  self.statistics.state.rendering_type_mispredictions += 1

- def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
- """Update mutable fields of `context` from `context_copy`.
-
- Uses object.__setattr__ to bypass frozen dataclass restrictions,
- allowing state synchronization after isolated crawler execution.
- """
- updating_attributes = {
- 'request': ('headers', 'user_data'),
- 'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
- }
-
- for attr, sub_attrs in updating_attributes.items():
- original_sub_obj = getattr(context, attr)
- copy_sub_obj = getattr(context_copy, attr)
-
- # Check that both sub objects are not None
- if original_sub_obj is None or copy_sub_obj is None:
- continue
-
- for sub_attr in sub_attrs:
- new_value = getattr(copy_sub_obj, sub_attr)
- object.__setattr__(original_sub_obj, sub_attr, new_value)
-

  @dataclass(frozen=True)
  class SubCrawlerRun:
  result: RequestHandlerRunResult | None = None
  exception: Exception | None = None
- run_context: BasicCrawlingContext | None = None
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py CHANGED
@@ -17,7 +17,7 @@ if TYPE_CHECKING:
  from playwright.async_api import Page, Response
  from typing_extensions import Self

- from crawlee.crawlers._playwright._types import BlockRequestsFunction
+ from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions


  TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
  http_response = await PlaywrightHttpResponse.from_playwright_response(
  response=context.response, protocol=protocol_guess or ''
  )
- # block_requests is useful only on pre-navigation contexts. It is useless here.
+ # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
  context_kwargs.pop('block_requests')
+ context_kwargs.pop('goto_options')
  return cls(
  parsed_content=await parser.parse(http_response),
  http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
  block_requests: BlockRequestsFunction | None = None
  """Blocks network requests matching specified URL patterns."""

+ goto_options: GotoOptions | None = None
+ """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
  @property
  def page(self) -> Page:
  """The Playwright `Page` object for the current page.
crawlee/crawlers/_basic/_basic_crawler.py CHANGED
@@ -59,6 +59,7 @@ from crawlee.errors import (
  RequestHandlerError,
  SessionError,
  UserDefinedErrorHandlerError,
+ UserHandlerTimeoutError,
  )
  from crawlee.events._types import Event, EventCrawlerStatusData
  from crawlee.http_clients import ImpitHttpClient
@@ -68,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
  from crawlee.storages import Dataset, KeyValueStore, RequestQueue

  from ._context_pipeline import ContextPipeline
+ from ._context_utils import swaped_context
  from ._logging_utils import (
  get_one_line_error_summary_if_possible,
  reduce_asyncio_timeout_error_to_relevant_traceback_parts,
@@ -1037,7 +1039,12 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  warning_flag = True

  for request in request_iterator:
- target_url = request.url if isinstance(request, Request) else request
+ if isinstance(request, Request):
+ if request.enqueue_strategy != strategy:
+ request.enqueue_strategy = strategy
+ target_url = request.url
+ else:
+ target_url = request
  parsed_target_url = urlparse(target_url)

  if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
@@ -1134,7 +1141,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
  request.retry_count += 1
  reduced_error = str(error).split('\n')[0]
  self.log.warning(
- f'Retrying request to {context.request.url} due to: {reduced_error}'
+ f'Retrying request to {context.request.url} due to: {reduced_error}. '
  f'{get_one_line_error_summary_if_possible(error)}'
  )
  await self._statistics.error_tracker.add(error=error, context=context)
@@ -1152,6 +1159,7 @@

  await request_manager.reclaim_request(request)
  else:
+ request.state = RequestState.ERROR
  await self._mark_request_as_handled(request)
  await self._handle_failed_request(context, error)
  self._statistics.record_request_processing_failure(request.unique_key)
@@ -1167,8 +1175,6 @@
  f'{self._internal_timeout.total_seconds()} seconds',
  logger=self._logger,
  )
-
- context.request.state = RequestState.DONE
  except UserDefinedErrorHandlerError:
  context.request.state = RequestState.ERROR
  raise
@@ -1201,8 +1207,8 @@
  self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
  ) -> None:
  if need_mark and isinstance(request, Request):
- await self._mark_request_as_handled(request)
  request.state = RequestState.SKIPPED
+ await self._mark_request_as_handled(request)

  url = request.url if isinstance(request, Request) else request

@@ -1222,10 +1228,11 @@

  if (
  isinstance(error, asyncio.exceptions.TimeoutError)
+ and traceback_parts
  and self._request_handler_timeout_text in traceback_parts[-1]
- ):
+ ) or isinstance(error, UserHandlerTimeoutError):
  used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
- used_traceback_parts.append(traceback_parts[-1])
+ used_traceback_parts.extend(traceback_parts[-1:])

  return ''.join(used_traceback_parts).strip('\n')

@@ -1320,6 +1327,8 @@

  await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)

+ result.apply_request_changes(target=context.request)
+
  @staticmethod
  async def _commit_key_value_store_changes(
  result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
@@ -1385,10 +1394,10 @@
  else:
  session = await self._get_session()
  proxy_info = await self._get_proxy_info(request, session)
- result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+ result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)

  context = BasicCrawlingContext(
- request=request,
+ request=result.request,
  session=session,
  proxy_info=proxy_info,
  send_request=self._prepare_send_request_function(session, proxy_info),
@@ -1405,26 +1414,26 @@
  try:
  request.state = RequestState.REQUEST_HANDLER

- self._check_request_collision(context.request, context.session)
-
  try:
- await self._run_request_handler(context=context)
+ with swaped_context(context, request):
+ self._check_request_collision(request, session)
+ await self._run_request_handler(context=context)
  except asyncio.TimeoutError as e:
  raise RequestHandlerError(e, context) from e

  await self._commit_request_handler_result(context)

- await self._mark_request_as_handled(request)
-
  request.state = RequestState.DONE

- if context.session and context.session.is_usable:
- context.session.mark_good()
+ await self._mark_request_as_handled(request)
+
+ if session and session.is_usable:
+ session.mark_good()

  self._statistics.record_request_processing_finish(request.unique_key)

  except RequestCollisionError as request_error:
- context.request.no_retry = True
+ request.no_retry = True
  await self._handle_request_error(context, request_error)

  except RequestHandlerError as primary_error:
@@ -1439,7 +1448,7 @@
  await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)

  except SessionError as session_error:
- if not context.session:
+ if not session:
  raise RuntimeError('SessionError raised in a crawling context without a session') from session_error

  if self._error_handler:
@@ -1449,10 +1458,11 @@
  exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
  self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)

- context.session.retire()
+ if session:
+ session.retire()

  # Increment session rotation count.
- context.request.session_rotation_count = (context.request.session_rotation_count or 0) + 1
+ request.session_rotation_count = (request.session_rotation_count or 0) + 1

  await request_manager.reclaim_request(request)
  await self._statistics.error_tracker_retry.add(error=session_error, context=context)
@@ -1483,6 +1493,7 @@
  raise

  async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
+ context.request.state = RequestState.BEFORE_NAV
  await self._context_pipeline(
  context,
  lambda final_context: wait_for(
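
With the router and navigation layers also updated below, a request now carries an explicit lifecycle marker (`UNPROCESSED`, `BEFORE_NAV`, `AFTER_NAV`, `REQUEST_HANDLER`, then `DONE`, or `ERROR`/`SKIPPED` on the failure paths). A hedged sketch of reading it once retries are exhausted:

    from crawlee import RequestState
    from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

    crawler = HttpCrawler()

    @crawler.failed_request_handler
    async def failed(context: HttpCrawlingContext, error: Exception) -> None:
        # By this point the crawler has tagged the request as RequestState.ERROR.
        context.log.warning(
            f'Giving up on {context.request.url} in state {context.request.state.name}: {error}'
        )
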
crawlee/crawlers/_basic/_context_utils.py ADDED
@@ -0,0 +1,24 @@
+ from __future__ import annotations
+
+ from contextlib import contextmanager
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+ from collections.abc import Iterator
+
+ from crawlee._request import Request
+
+ from ._basic_crawling_context import BasicCrawlingContext
+
+
+ @contextmanager
+ def swaped_context(
+ context: BasicCrawlingContext,
+ request: Request,
+ ) -> Iterator[None]:
+ """Replace context's isolated copies with originals after handler execution."""
+ try:
+ yield
+ finally:
+ # Restore original context state to avoid side effects between different handlers.
+ object.__setattr__(context, 'request', request)
crawlee/crawlers/_basic/_logging_utils.py CHANGED
@@ -2,9 +2,21 @@ import asyncio
  import re
  import traceback

+ import crawlee.errors
+

  def _get_only_innermost_exception(error: BaseException) -> BaseException:
- """Get innermost exception by following __cause__ and __context__ attributes of exception."""
+ """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+ If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+ """
+ if type(error) is crawlee.errors.UserHandlerTimeoutError:
+ if error.__cause__:
+ return error.__cause__
+ if error.__context__:
+ return error.__context__
+ return error
+
  if error.__cause__:
  return _get_only_innermost_exception(error.__cause__)
  if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:


  def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
- timeout_error: asyncio.exceptions.TimeoutError,
+ timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
  ) -> list[str]:
  innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
  return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
  def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
  innermost_error = _get_only_innermost_exception(error)
  return traceback.format_exception(
- type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
+ type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
  )


  def get_one_line_error_summary_if_possible(error: Exception) -> str:
  if isinstance(error, asyncio.exceptions.TimeoutError):
- most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+ relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+ most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+ elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+ # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+ # code and third line the topmost user error
+ traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+ relevant_index_from_start = 3
+ most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
  elif 'playwright._impl._errors.Error' in str(error.__class__):
  # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
  # point to deep internals.
crawlee/crawlers/_playwright/_playwright_crawler.py CHANGED
@@ -13,7 +13,7 @@ from pydantic import ValidationError
  from typing_extensions import NotRequired, TypedDict, TypeVar

  from crawlee import service_locator
- from crawlee._request import Request, RequestOptions
+ from crawlee._request import Request, RequestOptions, RequestState
  from crawlee._types import (
  BasicCrawlingContext,
  ConcurrencySettings,
@@ -35,6 +35,7 @@ from crawlee.statistics import StatisticsState
  from ._playwright_crawling_context import PlaywrightCrawlingContext
  from ._playwright_http_client import PlaywrightHttpClient, browser_page_context
  from ._playwright_pre_nav_crawling_context import PlaywrightPreNavCrawlingContext
+ from ._types import GotoOptions
  from ._utils import block_requests, infinite_scroll

  TCrawlingContext = TypeVar('TCrawlingContext', bound=PlaywrightCrawlingContext)
@@ -108,6 +109,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
  user_data_dir: str | Path | None = None,
  browser_launch_options: Mapping[str, Any] | None = None,
  browser_new_context_options: Mapping[str, Any] | None = None,
+ goto_options: GotoOptions | None = None,
  fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
  headless: bool | None = None,
  use_incognito_pages: bool | None = None,
@@ -142,6 +144,8 @@
  This option should not be used if `browser_pool` is provided.
  navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
  the request handler)
+ goto_options: Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is
+ not supported, use `navigation_timeout` instead.
  kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
  """
  configuration = kwargs.pop('configuration', None)
@@ -213,6 +217,7 @@
  kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

  self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+ self._goto_options = goto_options or GotoOptions()

  super().__init__(**kwargs)

@@ -238,6 +243,7 @@
  log=context.log,
  page=crawlee_page.page,
  block_requests=partial(block_requests, page=crawlee_page.page),
+ goto_options=GotoOptions(**self._goto_options),
  )

  context_id = id(pre_navigation_context)
@@ -321,8 +327,9 @@
  try:
  async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
  response = await context.page.goto(
- context.request.url, timeout=remaining_timeout.total_seconds() * 1000
+ context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options
  )
+ context.request.state = RequestState.AFTER_NAV
  except playwright.async_api.TimeoutError as exc:
  raise asyncio.TimeoutError from exc

@@ -351,6 +358,7 @@
  extract_links=extract_links,
  enqueue_links=self._create_enqueue_links_function(context, extract_links),
  block_requests=partial(block_requests, page=context.page),
+ goto_options=context.goto_options,
  )

  if context.session:
@@ -391,6 +399,7 @@
  robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

  kwargs.setdefault('strategy', 'same-hostname')
+ strategy = kwargs.get('strategy', 'same-hostname')

  elements = await context.page.query_selector_all(selector)
  links_iterator: Iterator[str] = iter(
@@ -409,17 +418,19 @@
  skipped = iter([])

  for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
- request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
+ request_options = RequestOptions(
+ url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+ )

  if transform_request_function:
- transform_request_option = transform_request_function(request_option)
- if transform_request_option == 'skip':
+ transform_request_options = transform_request_function(request_options)
+ if transform_request_options == 'skip':
  continue
- if transform_request_option != 'unchanged':
- request_option = transform_request_option
+ if transform_request_options != 'unchanged':
+ request_options = transform_request_options

  try:
- request = Request.from_url(**request_option)
+ request = Request.from_url(**request_options)
  except ValidationError as exc:
  context.log.debug(
  f'Skipping URL "{url}" due to invalid format: {exc}. '
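
The new `goto_options` constructor argument is forwarded to every `page.goto()` call, while `timeout` stays under the crawler's `navigation_timeout`. A hedged usage sketch:

    import asyncio

    from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

    crawler = PlaywrightCrawler(
        # Passed through to Playwright's `page.goto()`; use `navigation_timeout` for timeouts.
        goto_options={'wait_until': 'domcontentloaded'},
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Loaded {context.request.url}')

    asyncio.run(crawler.run(['https://crawlee.dev']))
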
crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py CHANGED
@@ -9,7 +9,7 @@ from crawlee._utils.docs import docs_group
  if TYPE_CHECKING:
  from playwright.async_api import Page

- from ._types import BlockRequestsFunction
+ from ._types import BlockRequestsFunction, GotoOptions


  @dataclass(frozen=True)
@@ -26,6 +26,9 @@ class PlaywrightPreNavCrawlingContext(BasicCrawlingContext):
  block_requests: BlockRequestsFunction
  """Blocks network requests matching specified URL patterns."""

+ goto_options: GotoOptions
+ """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
  async def get_snapshot(self) -> PageSnapshot:
  """Get snapshot of crawled page."""
  html = None
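
Since each pre-navigation context receives its own copy of the configured options, a pre-navigation hook can tweak them per request before `goto()` runs. A hedged sketch:

    from crawlee.crawlers import PlaywrightCrawler, PlaywrightPreNavCrawlingContext

    crawler = PlaywrightCrawler()

    @crawler.pre_navigation_hook
    async def tune_navigation(context: PlaywrightPreNavCrawlingContext) -> None:
        # Adjust Playwright's goto() options for this request only; `timeout` is still
        # governed by the crawler's `navigation_timeout`.
        if context.request.label == 'LISTING':
            context.goto_options['wait_until'] = 'networkidle'
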
crawlee/crawlers/_playwright/_types.py CHANGED
@@ -1,7 +1,7 @@
  from __future__ import annotations

  from dataclasses import dataclass
- from typing import TYPE_CHECKING, Protocol
+ from typing import TYPE_CHECKING, Literal, Protocol, TypedDict

  from crawlee import HttpHeaders
  from crawlee._utils.docs import docs_group
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
  from collections.abc import AsyncGenerator

  from playwright.async_api import APIResponse, Response
- from typing_extensions import Self
+ from typing_extensions import NotRequired, Self


  @docs_group('Functions')
@@ -58,3 +58,13 @@ class PlaywrightHttpResponse:
  _content = await response.body()

  return cls(http_version=http_version, status_code=status_code, headers=headers, _content=_content)
+
+
+ class GotoOptions(TypedDict):
+ """Keyword arguments for Playwright's `Page.goto()` method."""
+
+ wait_until: NotRequired[Literal['domcontentloaded', 'load', 'networkidle', 'commit']]
+ """When to consider operation succeeded, defaults to 'load' event."""
+
+ referer: NotRequired[str]
+ """Referer header value."""
crawlee/errors.py CHANGED
@@ -29,6 +29,10 @@ class UserDefinedErrorHandlerError(Exception):
  """Wraps an exception thrown from an user-defined error handler."""


+ class UserHandlerTimeoutError(UserDefinedErrorHandlerError):
+ """Raised when a router fails due to user raised timeout. This is different from user-defined handler timing out."""
+
+
  @docs_group('Errors')
  class SessionError(Exception):
  """Errors of `SessionError` type will trigger a session rotation.
crawlee/events/_event_manager.py CHANGED
@@ -174,11 +174,9 @@ class EventManager:
  # to avoid blocking the event loop
  coro = (
  listener(*bound_args.args, **bound_args.kwargs)
- if asyncio.iscoroutinefunction(listener)
+ if inspect.iscoroutinefunction(listener)
  else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs)
  )
- # Note: use `asyncio.iscoroutinefunction` rather then `inspect.iscoroutinefunction` since it works with
- # unittests.mock.AsyncMock. See https://github.com/python/cpython/issues/84753.

  listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}')
  self._listener_tasks.add(listener_task)
crawlee/router.py CHANGED
@@ -1,13 +1,17 @@
  from __future__ import annotations

+ import asyncio
  from collections.abc import Awaitable, Callable
  from typing import Generic, TypeVar

+ from crawlee._request import RequestState
  from crawlee._types import BasicCrawlingContext
  from crawlee._utils.docs import docs_group

  __all__ = ['Router']

+ from crawlee.errors import UserHandlerTimeoutError
+
  TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext)
  RequestHandler = Callable[[TCrawlingContext], Awaitable[None]]

@@ -89,13 +93,19 @@ class Router(Generic[TCrawlingContext]):

  async def __call__(self, context: TCrawlingContext) -> None:
  """Invoke a request handler that matches the request label (or the default)."""
+ context.request.state = RequestState.REQUEST_HANDLER
  if context.request.label is None or context.request.label not in self._handlers_by_label:
  if self._default_handler is None:
  raise RuntimeError(
  f'No handler matches label `{context.request.label}` and no default handler is configured'
  )

- return await self._default_handler(context)
+ user_defined_handler = self._default_handler
+ else:
+ user_defined_handler = self._handlers_by_label[context.request.label]

- handler = self._handlers_by_label[context.request.label]
- return await handler(context)
+ try:
+ return await user_defined_handler(context)
+ except asyncio.TimeoutError as e:
+ # Timeout in handler, but not timeout of handler.
+ raise UserHandlerTimeoutError('Timeout raised by user defined handler') from e
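
The router now stamps `RequestState.REQUEST_HANDLER` before dispatching and rewraps an `asyncio.TimeoutError` raised inside a user handler as `UserHandlerTimeoutError`, so it is not confused with the crawler-enforced handler timeout. A hedged sketch of a handler that would take this path:

    import asyncio

    from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

    crawler = HttpCrawler()

    @crawler.router.handler('API')
    async def api_handler(context: HttpCrawlingContext) -> None:
        # If this user-side timeout fires, the router raises UserHandlerTimeoutError
        # (chained from the asyncio.TimeoutError) instead of a plain handler timeout.
        response = await asyncio.wait_for(
            context.send_request('https://crawlee.dev/slow-endpoint'),
            timeout=5,
        )
        context.log.info(f'Got status {response.status_code}')
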
crawlee-1.1.2b4.dist-info/METADATA → crawlee-1.2.1b7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: crawlee
- Version: 1.1.2b4
+ Version: 1.2.1b7
  Summary: Crawlee for Python
  Project-URL: Apify Homepage, https://apify.com
  Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
@@ -324,19 +324,12 @@ Description-Content-Type: text/markdown
  <a href="https://trendshift.io/repositories/11169" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11169" alt="apify%2Fcrawlee-python | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
  </p>

- <p align=center>
- <a href="https://badge.fury.io/py/crawlee" rel="nofollow">
- <img src="https://badge.fury.io/py/crawlee.svg" alt="PyPI version" style="max-width: 100%;">
- </a>
- <a href="https://pypi.org/project/crawlee/" rel="nofollow">
- <img src="https://img.shields.io/pypi/dm/crawlee" alt="PyPI - Downloads" style="max-width: 100%;">
- </a>
- <a href="https://pypi.org/project/crawlee/" rel="nofollow">
- <img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI - Python Version" style="max-width: 100%;">
- </a>
- <a href="https://discord.gg/jyEM2PRvMU" rel="nofollow">
- <img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on discord" style="max-width: 100%;">
- </a>
+ <p align="center">
+ <a href="https://badge.fury.io/py/crawlee" rel="nofollow"><img src="https://badge.fury.io/py/crawlee.svg" alt="PyPI package version"></a>
+ <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/dm/crawlee" alt="PyPI package downloads"></a>
+ <a href="https://codecov.io/gh/apify/crawlee-python"><img src="https://codecov.io/gh/apify/crawlee-python/graph/badge.svg?token=cCju61iPQG" alt="Codecov report"></a>
+ <a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI Python version"></a>
+ <a href="https://discord.gg/jyEM2PRvMU" rel="nofollow"><img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on Discord"></a>
  </p>

  Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**
crawlee-1.1.2b4.dist-info/RECORD → crawlee-1.2.1b7.dist-info/RECORD CHANGED
@@ -1,16 +1,16 @@
- crawlee/__init__.py,sha256=GdTXZXJsxj4Cb53y6raz4tlaUWLO918fKKshw91eE70,514
+ crawlee/__init__.py,sha256=ECFcNbLQp3HX-o6K4eMo38rZQ5NnZg7udvEEkjkqnuw,548
  crawlee/_browserforge_workaround.py,sha256=FYQaqpqfZGYkx-A8evF9nsHnj4KK4IMtjNq3LtmX_vA,1664
  crawlee/_cli.py,sha256=czuEsGD8QYEiq5gtMcBxrL08hQ5OJQQkMVhAr1pvDaQ,10353
  crawlee/_consts.py,sha256=RQ96gx7V-WPH91cVsMUz76X5UZUNDNhCudtlyGkxFVk,133
  crawlee/_log_config.py,sha256=VyxoEfWCq_9fyicmmJbjiZ5KC91onMcAtX2L4oKX4m4,5999
- crawlee/_request.py,sha256=fnUofyFMV3HJwfcLjYr2BCZ5K9mEwl6vZd8Pr309wCE,16458
+ crawlee/_request.py,sha256=M8hTSs5dJTBBW0JIDh0QSUhWyEWarEg86Un9kX12qy4,17374
  crawlee/_service_locator.py,sha256=SJ8ABYtclBl7rz8kfZ2jZkIgKq5oNIoGT7WmN8ApTzo,5058
- crawlee/_types.py,sha256=93yoGr_KqMDIkq3__3QYpIAJmEzZvDoilHAF7_X4J4A,29933
+ crawlee/_types.py,sha256=_CQyq1BmvuHr0p25NFn6rHbgsiuR65o8gLxCCuQWfAg,30534
  crawlee/configuration.py,sha256=DWS2z1FC6Ua93W2tStK3R1ZKZbZjVQYWGiGFbZFaRtA,8064
- crawlee/errors.py,sha256=RhFNA_uT615nVBHf9TylpX5YWwtDuHUUEV8LPT4CYa4,3878
+ crawlee/errors.py,sha256=fnAFpyvJKMDq3VDGr1iq1E-JqnfoOEI7cd8YjDaqb9s,4062
  crawlee/proxy_configuration.py,sha256=rqf67yerXvLvraBaAHW04nvf5ECze3wMQbK7LlqXucM,10386
  crawlee/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- crawlee/router.py,sha256=DE0CU_hzXS8RXpYu2v-699hAzQF-KUlSwlX1xgtjuH4,3825
+ crawlee/router.py,sha256=HbKxE22r8ZVu93tIxBdGObMa3fGPcuSvKthqibimekU,4252
  crawlee/_autoscaling/__init__.py,sha256=t6Z44gU488C0UmkBCTtwsgAR8iqJcv2g4ZlC4NYh0ZI,182
  crawlee/_autoscaling/_types.py,sha256=xnrRHXYOVn7GwELLVHi_y7B-Ic7u3hPkYl3P-LT3Fhk,5453
  crawlee/_autoscaling/autoscaled_pool.py,sha256=Bcu2jDgK2SYMnZN5xfjs8Oxti0ZxrktjydWv3J0Hz48,12214
@@ -21,7 +21,7 @@ crawlee/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  crawlee/_utils/blocked.py,sha256=sxN99AouFXMoe6uG1EvCTCmKMGk73DBMUk9nOkWK86I,863
  crawlee/_utils/byte_size.py,sha256=zs4qWUEDgTGDqYfUJ7t5edWNYYJCG8Y1EyJ9GASfRL4,3744
  crawlee/_utils/console.py,sha256=vAIM8AO7cT-HdXg44eR8zQyHAHk8X8G7J1KKFCBL2LY,2242
- crawlee/_utils/context.py,sha256=FsRh394RpNZTRovuVHVlyVTZ8AbSLGe-K7RpLgU9AX8,1726
+ crawlee/_utils/context.py,sha256=LFIXjJQBhv94j1prbK-2yjH3EXg5jPOfVqW8P6cwNIY,1726
  crawlee/_utils/crypto.py,sha256=tYzn2z91KgV3ugxz4CKtSTcCjW-3FC8un7hpDNCl6rs,757
  crawlee/_utils/docs.py,sha256=S09-3xAQAlUvrmPpBXVJpE8wblB8LtS6QduLNncfqdQ,1130
  crawlee/_utils/file.py,sha256=FJHTC25qSWQs3ZhCZrLgs0cUwA9K81MlQRGEmcWKAQU,5758
@@ -30,7 +30,7 @@ crawlee/_utils/html_to_text.py,sha256=1iykT-OXd2xXNy7isHVWHqPxe23X82CGQBHIfbZbZk
  crawlee/_utils/models.py,sha256=EqM50Uc-xvxKlLCLA2lPpRduzfKvT0z_-Q-UWG8aTRQ,1955
  crawlee/_utils/raise_if_too_many_kwargs.py,sha256=J2gaUJmsmNwexohuehXw_mdYKv-eWiui6WUHFsQ3qTQ,597
  crawlee/_utils/recoverable_state.py,sha256=c1D2ZecxEliGZzhqYz9_oU5CF2Hm0UKvpOHqO6CDJRE,9032
- crawlee/_utils/recurring_task.py,sha256=sQMiURuDXbwwfAcIXK8V4NXncSxIBxsqN1cZWX7DLyg,2128
+ crawlee/_utils/recurring_task.py,sha256=_injmSsvG4p0xS4nBtoZZIR02syBG8JcLkuwgNDL8Nc,2143
  crawlee/_utils/requests.py,sha256=yOjai7bHR9_duPJ0ck-L76y9AnKZr49JBfSOQv9kvJc,5048
  crawlee/_utils/robots.py,sha256=DBU5ni4Y-p7bIKMbLd_ws8wgHSFc4K8zPVF3JvH_pkw,4661
  crawlee/_utils/sitemap.py,sha256=UI9EJiFiyFvV5_flVUtdsEVz8ZsJeRERPtcx8ZsqjTU,16632
@@ -53,22 +53,23 @@ crawlee/crawlers/__init__.py,sha256=jNFMsPizSgCN0ARYSmHs9Ppk8yvGgjUH5PxUeDchljE,
  crawlee/crawlers/_types.py,sha256=xbGTJQirgz5wUbfr12afMR4q-_5AWP7ngF2e8K5P8l0,355
  crawlee/crawlers/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  crawlee/crawlers/_abstract_http/__init__.py,sha256=h8jVWcPbDXzWHill1Vm7J7iliJW0hIrea0gkg-Hkb-M,319
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py,sha256=AfrEea3Ez2RvS6G6BFBfB6gg9aAFu-84RWqWryNHFrc,12977
+ crawlee/crawlers/_abstract_http/_abstract_http_crawler.py,sha256=Y12SBNAiF8QNJH83s6pPoao1W5ZSUhxHRHHKjE0qZhk,13174
  crawlee/crawlers/_abstract_http/_abstract_http_parser.py,sha256=Y5o_hiW_0mQAte5GFqkUxscwKEFpWrBYRsLKP1cfBwE,3521
  crawlee/crawlers/_abstract_http/_http_crawling_context.py,sha256=Rno_uJ8ivmyRxFQv2MyY_z9B5WPHSEd5MAPz31_1ZIo,2179
  crawlee/crawlers/_abstract_http/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  crawlee/crawlers/_adaptive_playwright/__init__.py,sha256=LREq9WR9BKsE8S8lSsEhlCoNjQaLhlJ9yo8y_6a8o4c,1072
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=nPFB9Q_3xQDJprb24NIQO53gf56J8wXjbM9C-58iiZ8,22862
+ crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=qAtZUwzGMwASwl5NKLAOsYnVA03IpZkk-BLKm3SwHoM,21588
  crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py,sha256=_At8T8S3JLGPA-1AeCFGrpE-FuCDW9sazrXt9U0tK6U,1048
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py,sha256=9FlHIUC05IzUhJsVldQvpnDnj1jk8GJpqC98mPLN_fw,10431
+ crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py,sha256=tejw-yfA8zVR8L-shIZOTFoMUQOI5Kt7FBJa8H0q4H0,10664
  crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkbIN_059jUyCG8Z6XAb_FBLClIKw7z-aDvjon2I,10834
  crawlee/crawlers/_adaptive_playwright/_result_comparator.py,sha256=NAfw5VKzTnkvARtLr_zrZj6UGeMp05Voc6Oi8oPxU3w,1747
  crawlee/crawlers/_adaptive_playwright/_utils.py,sha256=EUYVz5i2YkLpL_gbVRp9BAD5u6w1xJ_AFzc_qB9bdDQ,1102
  crawlee/crawlers/_basic/__init__.py,sha256=LPln8SiBBXSMqrApiFUfpqz3hvqxN5HUa1cHQXMVKgU,280
- crawlee/crawlers/_basic/_basic_crawler.py,sha256=uL9QDGis__8z0W0u6ShsJMpvlrMmIi3MaucOvIxh0iI,73437
+ crawlee/crawlers/_basic/_basic_crawler.py,sha256=wTZW_1vM2A1x14VADRBsUr0TJzKfGoJODeHX0gOZnnY,73914
  crawlee/crawlers/_basic/_basic_crawling_context.py,sha256=fjxm2RQXMDkDlWu38dQ3xn5rrGUOhJXkXiqkgbFJFk4,155
  crawlee/crawlers/_basic/_context_pipeline.py,sha256=vM8EEvnCoguERjRV3oyrxUq2Ln2F9DzY7P5dAEiuMHo,5869
- crawlee/crawlers/_basic/_logging_utils.py,sha256=jp5mEwSq5a_BgzUhNPJ9WrIDcoIeYGbeHstcRqCcP0s,3093
+ crawlee/crawlers/_basic/_context_utils.py,sha256=U1s0nl7EW9k-JrZA2VM7d_aWnE7Je3lXK04RFrXvRC4,655
+ crawlee/crawlers/_basic/_logging_utils.py,sha256=6Q206Sv0RzHztwu5y5XSdUpZhpqQ5-zSapQzUY9GxCo,4014
  crawlee/crawlers/_basic/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  crawlee/crawlers/_beautifulsoup/__init__.py,sha256=7pL273ashA7yYDrH6nokYZ7SAMUAezilGIWdfThi_Co,822
  crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py,sha256=Q8Sb_lflpdYIwDZ1fIeuquPzdDG2zCnKsrcj8fe8n6k,3056
@@ -85,14 +86,14 @@ crawlee/crawlers/_parsel/_parsel_crawling_context.py,sha256=sZB26RcRLjSoD15myEOM
  crawlee/crawlers/_parsel/_parsel_parser.py,sha256=yWBfuXUHMriK4DRnyrXTQoGeqX5WV9bOEkBp_g0YCvQ,1540
  crawlee/crawlers/_parsel/_utils.py,sha256=MbRwx-cdjlq1zLzFYf64M3spOGQ6yxum4FvP0sdqA_Q,2693
  crawlee/crawlers/_playwright/__init__.py,sha256=6Cahe6VEF82o8CYiP8Cmp58Cmb6Rb8uMeyy7wnwe5ms,837
- crawlee/crawlers/_playwright/_playwright_crawler.py,sha256=PHoU2qn_SlUM0mGeiPVfLPaKgXIjzvPn-ZDI3rdGVII,25546
+ crawlee/crawlers/_playwright/_playwright_crawler.py,sha256=WappMIb0w-AnS745vlJpQNxwibKS7ok6_5a6iAcoTDs,26207
  crawlee/crawlers/_playwright/_playwright_crawling_context.py,sha256=Oi0tMBXHaEDlFjqG01DzgB7Ck52bjVjz-X__eMioxas,1249
  crawlee/crawlers/_playwright/_playwright_http_client.py,sha256=4mvaCI9Zum7znbm0F-ZZ6T1FEqZ-N-cvPOk1iqtcUSo,4164
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py,sha256=fEI2laWhmJdWiGoMF5JBLBsim9NtENfagZt6FFd2Rgo,1387
- crawlee/crawlers/_playwright/_types.py,sha256=hMKA9K9gjzQuwwbnmmfJsQrwR-kq235jH-WBXWeqkGo,2174
+ crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py,sha256=NFenJKgXcPuifaVYc2sdU5AV2BX6836GUuqFTE2Q0lU,1545
+ crawlee/crawlers/_playwright/_types.py,sha256=D4MaRWgYdps1CwgNWURJRLKkJk_9Oyue70jvkHAxnEU,2534
  crawlee/crawlers/_playwright/_utils.py,sha256=FQ_-LYo7DGHsNHRrTtWt3mC06VzQvQ2wkGqpA2wBzYU,3441
  crawlee/events/__init__.py,sha256=YMgOXKI0LsXfImKQy06PZ2Vdjy-uD_-acioagHft1do,577
- crawlee/events/_event_manager.py,sha256=wjZTYIKBI8daKUkOVxUrbPHuU8LnFpUtWStdkts7r3U,11588
+ crawlee/events/_event_manager.py,sha256=M8nKPc2BJo8RIBVHaG9BYuks0jwt5v3BFYQLA7IvolI,11380
  crawlee/events/_local_event_manager.py,sha256=CSiMJ6a_BwX0PPwtffEOtHm21dmALJz1zifo3AuMAk8,3708
  crawlee/events/_types.py,sha256=MKsI014OOKKhjPJRrvWYrezIDGoLjGGhWXrkqYw26Ns,3313
  crawlee/events/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -199,8 +200,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
  crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
  crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
  crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- crawlee-1.1.2b4.dist-info/METADATA,sha256=xHqoYBVYvhy6i_bpLm0VF44jWovuibr2Xl2bl4wfRPU,29508
- crawlee-1.1.2b4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- crawlee-1.1.2b4.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
- crawlee-1.1.2b4.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
- crawlee-1.1.2b4.dist-info/RECORD,,
+ crawlee-1.2.1b7.dist-info/METADATA,sha256=YPjeW0r_pqD_lHRtFfJ8GL84Z4t1IvEgu1uBtxc4IuY,29526
+ crawlee-1.2.1b7.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ crawlee-1.2.1b7.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
+ crawlee-1.2.1b7.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+ crawlee-1.2.1b7.dist-info/RECORD,,