crawlee 1.0.5b18__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_types.py +44 -5
  5. crawlee/_utils/context.py +3 -3
  6. crawlee/_utils/file.py +8 -1
  7. crawlee/_utils/globs.py +4 -4
  8. crawlee/_utils/recurring_task.py +12 -3
  9. crawlee/_utils/sitemap.py +12 -5
  10. crawlee/_utils/system.py +27 -11
  11. crawlee/_utils/time.py +41 -1
  12. crawlee/browsers/_browser_pool.py +1 -1
  13. crawlee/browsers/_playwright_browser.py +2 -1
  14. crawlee/crawlers/__init__.py +5 -1
  15. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  16. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +53 -17
  17. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  18. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +20 -49
  19. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  20. crawlee/crawlers/_basic/_basic_crawler.py +138 -124
  21. crawlee/crawlers/_basic/_context_utils.py +24 -0
  22. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  23. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  24. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  25. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  26. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -22
  27. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  28. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  29. crawlee/crawlers/_playwright/_types.py +12 -2
  30. crawlee/errors.py +4 -0
  31. crawlee/events/_event_manager.py +12 -6
  32. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  33. crawlee/http_clients/_base.py +4 -0
  34. crawlee/http_clients/_curl_impersonate.py +68 -14
  35. crawlee/http_clients/_httpx.py +16 -6
  36. crawlee/http_clients/_impit.py +25 -10
  37. crawlee/otel/crawler_instrumentor.py +1 -3
  38. crawlee/request_loaders/_sitemap_request_loader.py +18 -5
  39. crawlee/router.py +13 -3
  40. crawlee/sessions/_cookies.py +13 -8
  41. crawlee/sessions/_models.py +3 -3
  42. crawlee/statistics/_models.py +51 -9
  43. crawlee/statistics/_statistics.py +2 -21
  44. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  45. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  46. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  47. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  48. crawlee/storage_clients/_file_system/_request_queue_client.py +5 -4
  49. crawlee/storage_clients/_redis/_client_mixin.py +1 -4
  50. crawlee/storage_clients/_redis/_dataset_client.py +6 -2
  51. crawlee/storage_clients/_redis/_key_value_store_client.py +3 -5
  52. crawlee/storage_clients/_redis/_request_queue_client.py +5 -8
  53. crawlee/storage_clients/_redis/_storage_client.py +12 -9
  54. crawlee/storage_clients/_redis/_utils.py +1 -1
  55. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  56. crawlee/storage_clients/_sql/_storage_client.py +0 -9
  57. crawlee/storage_clients/models.py +8 -3
  58. crawlee/storages/_storage_instance_manager.py +103 -44
  59. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +10 -16
  60. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +63 -62
  61. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  62. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  63. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0

crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

@@ -3,14 +3,16 @@ from __future__ import annotations
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar
 
-from crawlee._request import Request, RequestOptions
+from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,6 +34,19 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=ParsedHttpCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 
 
+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
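
A minimal sketch of the intended use, with a hypothetical subclass; the real subclasses also wire up a parser and a `_context_pipeline`, which is omitted here:

    from typing_extensions import Unpack


    class MyHttpCrawler(AbstractHttpCrawler):
        def __init__(self, **kwargs: Unpack[HttpCrawlerOptions]) -> None:
            # `navigation_timeout` is now part of the typed, forwarded kwargs;
            # `parser=...` stands in for a concrete AbstractHttpParser instance.
            super().__init__(parser=..., **kwargs)
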
@@ -56,10 +71,13 @@ class AbstractHttpCrawler(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
 
         if '_context_pipeline' not in kwargs:
             raise ValueError(
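
Hedged usage sketch (not shown in this diff): concrete HTTP crawlers such as `ParselCrawler` and `BeautifulSoupCrawler` forward these constructor options, so the new knob should be reachable from user code; when omitted, the budget defaults to one minute as set above.

    from datetime import timedelta

    from crawlee.crawlers import ParselCrawler

    # Assumed API surface: pass the shared navigation budget through the subclass constructor.
    crawler = ParselCrawler(navigation_timeout=timedelta(seconds=30))
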
@@ -82,9 +100,7 @@ class AbstractHttpCrawler(
         this method simplifies cases where `TParseResult` is used for both generic parameters.
         """
 
-        class _ParsedHttpCrawler(
-            AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult]
-        ):
+        class _ParsedHttpCrawler(AbstractHttpCrawler):
             def __init__(
                 self,
                 parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,
@@ -112,9 +128,17 @@
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        for hook in self._pre_navigation_hooks:
-            await hook(context)
-        yield context
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)
 
     async def _parse_http_response(
         self, context: HttpCrawlingContext
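
`SharedTimeout` itself is not shown in this diff (it comes from `crawlee/_utils/time.py`, +41 lines in the file list). A minimal illustrative sketch of the behaviour implied by its use here, one time budget shared across several sequential `async with` blocks, could look like this; the real implementation may also enforce cancellation and differ in detail:

    import time
    from datetime import timedelta


    class SharedTimeoutSketch:
        """Illustrative only: a budget that shrinks each time a block runs under it."""

        def __init__(self, timeout: timedelta) -> None:
            self._remaining = timeout

        async def __aenter__(self) -> timedelta:
            if self._remaining <= timedelta(0):
                raise TimeoutError('Shared navigation timeout exhausted.')
            self._entered_at = time.monotonic()
            return self._remaining  # the caller may pass this on as its own timeout

        async def __aexit__(self, *exc_info: object) -> None:
            self._remaining -= timedelta(seconds=time.monotonic() - self._entered_at)
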
@@ -165,11 +189,18 @@
             robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
             kwargs.setdefault('strategy', 'same-hostname')
+            strategy = kwargs.get('strategy', 'same-hostname')
 
             links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-            links_iterator = to_absolute_url_iterator(
-                context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+
+            # Get base URL from <base> tag if present
+            extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+            base_url: str = (
+                str(extracted_base_urls[0])
+                if extracted_base_urls
+                else context.request.loaded_url or context.request.url
             )
+            links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
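
The practical effect: when a page declares `<base href="...">`, relative links are now resolved against that value instead of the page URL. A plain-urllib illustration with made-up URLs:

    from urllib.parse import urljoin

    page_url = 'https://example.com/docs/page.html'
    base_href = 'https://static.example.com/mirror/'  # value of the page's <base href="...">

    print(urljoin(page_url, 'about.html'))   # https://example.com/docs/about.html (previous behaviour)
    print(urljoin(base_href, 'about.html'))  # https://static.example.com/mirror/about.html (new behaviour)
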
@@ -177,7 +208,9 @@
                 skipped = iter([])
 
             for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-                request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
+                request_options = RequestOptions(
+                    url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+                )
 
                 if transform_request_function:
                     transform_request_options = transform_request_function(request_options)
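
Because the resolved strategy is now stored on each `RequestOptions`, a `transform_request_function` can inspect or override it per request. A hedged sketch (the surrounding enqueue call and URL pattern are assumptions):

    def widen_docs_strategy(request_options: RequestOptions) -> RequestOptions:
        # Illustrative only: broaden the strategy for documentation links, keep the rest as-is.
        if '/docs/' in request_options['url']:
            request_options['enqueue_strategy'] = 'same-domain'
        return request_options
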
@@ -216,13 +249,16 @@
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        result = await self._http_client.crawl(
-            request=context.request,
-            session=context.session,
-            proxy_info=context.proxy_info,
-            statistics=self._statistics,
-        )
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )
 
+        context.request.state = RequestState.AFTER_NAV
         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)
 
     async def _handle_status_code_response(
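
A worked example of the shared budget (numbers are illustrative): whatever the pre-navigation hooks consume is subtracted from the timeout handed to the HTTP client.

    from datetime import timedelta

    navigation_timeout = timedelta(minutes=1)   # the default set in __init__ above
    spent_in_hooks = timedelta(seconds=12)      # hypothetical time spent in pre-navigation hooks

    remaining_timeout = navigation_timeout - spent_in_hooks
    assert remaining_timeout == timedelta(seconds=48)  # value passed as `timeout=` to `_http_client.crawl()`
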

crawlee/crawlers/_adaptive_playwright/__init__.py

@@ -11,13 +11,16 @@ _install_import_hook(__name__)
 
 # The following imports are wrapped in try_import to handle optional dependencies,
 # ensuring the module can still function even if these dependencies are missing.
-with _try_import(__name__, 'BeautifulSoupCrawler'):
+with _try_import(__name__, 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor'):
     from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor
-with _try_import(__name__, 'BeautifulSoupCrawlingContext'):
+with _try_import(__name__, 'AdaptivePlaywrightCrawler'):
     from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
+with _try_import(__name__, 'AdaptivePlaywrightCrawlerStatisticState'):
+    from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawlerStatisticState
 
 __all__ = [
     'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
     'RenderingType',
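
A small import sketch grounded in the new `__all__` entry (the +5/-1 change to `crawlee/crawlers/__init__.py` in the file list suggests the same symbol is re-exported one level higher as well, though that hunk is not shown here):

    from crawlee.crawlers._adaptive_playwright import AdaptivePlaywrightCrawlerStatisticState
    from crawlee.statistics import Statistics

    # Assumed usage: the re-exported state model can back a custom Statistics instance.
    statistics = Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
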

crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

@@ -27,23 +27,16 @@ from crawlee.crawlers import (
 )
 from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
 from crawlee.crawlers._parsel._parsel_parser import ParselParser
+from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
 from crawlee.statistics import Statistics, StatisticsState
 
-from ._adaptive_playwright_crawler_statistics import (
-    AdaptivePlaywrightCrawlerStatisticState,
-)
+from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
 from ._adaptive_playwright_crawling_context import (
     AdaptivePlaywrightCrawlingContext,
     AdaptivePlaywrightPreNavCrawlingContext,
 )
-from ._rendering_type_predictor import (
-    DefaultRenderingTypePredictor,
-    RenderingType,
-    RenderingTypePredictor,
-)
-from ._result_comparator import (
-    create_default_comparator,
-)
+from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
+from ._result_comparator import create_default_comparator
 
 if TYPE_CHECKING:
     from types import TracebackType
@@ -51,7 +44,6 @@ if TYPE_CHECKING:
     from typing_extensions import Unpack
 
     from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions
-    from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
 
 
 TStaticParseResult = TypeVar('TStaticParseResult')
@@ -157,10 +149,12 @@ class AdaptivePlaywrightCrawler(
         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
 
-        super().__init__(statistics=statistics, **kwargs)
+        adaptive_statistics = statistics or Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
+
+        super().__init__(statistics=adaptive_statistics, **kwargs)
 
         # Sub crawlers related.
-        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {}
+        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()
 
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
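
Hedged usage sketch: with this change, constructing the crawler without a `statistics` argument should yield a state object that already carries the adaptive counters used below (shown here via the `with_parsel_static_parser` factory, assumed unchanged by this diff):

    crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser()

    # The adaptive counters now exist by default, without passing a custom Statistics.
    print(crawler.statistics.state.rendering_type_mispredictions)
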
@@ -290,11 +284,14 @@
             use_state_function = context.use_state
 
             # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
-            result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+            result = RequestHandlerRunResult(
+                key_value_store_getter=self.get_key_value_store,
+                request=context.request,
+            )
             context_linked_to_result = BasicCrawlingContext(
-                request=deepcopy(context.request),
-                session=deepcopy(context.session),
-                proxy_info=deepcopy(context.proxy_info),
+                request=result.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
                 send_request=context.send_request,
                 add_requests=result.add_requests,
                 push_data=result.push_data,
@@ -314,7 +311,7 @@
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result, run_context=context_linked_to_result)
+            return SubCrawlerRun(result=result)
         except Exception as e:
             return SubCrawlerRun(exception=e)
 
@@ -332,7 +329,7 @@
                 )
                 await self.router(adaptive_crawling_context)
 
-            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)
+            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]
 
         if rendering_type == 'client only':
 
@@ -342,7 +339,7 @@
                 )
                 await self.router(adaptive_crawling_context)
 
-            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)
+            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]
 
         raise RuntimeError(
             f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}'
@@ -370,8 +367,7 @@
             self.track_http_only_request_handler_runs()
 
             static_run = await self._crawl_one(rendering_type='static', context=context)
-            if static_run.result and static_run.run_context and self.result_checker(static_run.result):
-                self._update_context_from_copy(context, static_run.run_context)
+            if static_run.result and self.result_checker(static_run.result):
                 self._context_result_map[context] = static_run.result
                 return
             if static_run.exception:
@@ -402,7 +398,7 @@
         if pw_run.exception is not None:
             raise pw_run.exception
 
-        if pw_run.result and pw_run.run_context:
+        if pw_run.result:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
@@ -414,7 +410,6 @@
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)
 
-            self._update_context_from_copy(context, pw_run.run_context)
             self._context_result_map[context] = pw_run.result
 
     def pre_navigation_hook(
@@ -451,32 +446,8 @@
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1
 
-    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
-        """Update mutable fields of `context` from `context_copy`.
-
-        Uses object.__setattr__ to bypass frozen dataclass restrictions,
-        allowing state synchronization after isolated crawler execution.
-        """
-        updating_attributes = {
-            'request': ('headers', 'user_data'),
-            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
-        }
-
-        for attr, sub_attrs in updating_attributes.items():
-            original_sub_obj = getattr(context, attr)
-            copy_sub_obj = getattr(context_copy, attr)
-
-            # Check that both sub objects are not None
-            if original_sub_obj is None or copy_sub_obj is None:
-                continue
-
-            for sub_attr in sub_attrs:
-                new_value = getattr(copy_sub_obj, sub_attr)
-                object.__setattr__(original_sub_obj, sub_attr, new_value)
-
 
 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
-    run_context: BasicCrawlingContext | None = None

crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py

@@ -17,7 +17,7 @@
     from playwright.async_api import Page, Response
     from typing_extensions import Self
 
-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions
 
 
 TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests is useful only on pre-navigation contexts. It is useless here.
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.