crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry diff service flags this version of crawlee as possibly problematic.
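The file-level diff below can be reproduced locally. A minimal sketch using only the standard library, assuming both wheels have already been downloaded (for example with `pip download crawlee==0.6.13b15` and `pip download crawlee==1.3.1b3`); the filenames follow the standard wheel naming scheme:

```python
import difflib
import zipfile

OLD = 'crawlee-0.6.13b15-py3-none-any.whl'
NEW = 'crawlee-1.3.1b3-py3-none-any.whl'

with zipfile.ZipFile(OLD) as old, zipfile.ZipFile(NEW) as new:
    old_names, new_names = set(old.namelist()), set(new.namelist())
    print('added:', sorted(new_names - old_names))
    print('removed:', sorted(old_names - new_names))
    # Unified diff for every file present in both wheels.
    for name in sorted(old_names & new_names):
        a = old.read(name).decode('utf-8', 'replace').splitlines()
        b = new.read(name).decode('utf-8', 'replace').splitlines()
        for line in difflib.unified_diff(a, b, f'old/{name}', f'new/{name}', lineterm=''):
            print(line)
```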

Files changed (116)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_autoscaling/snapshotter.py +1 -1
  3. crawlee/_browserforge_workaround.py +7 -3
  4. crawlee/_request.py +64 -43
  5. crawlee/_service_locator.py +44 -24
  6. crawlee/_types.py +128 -36
  7. crawlee/_utils/context.py +3 -3
  8. crawlee/_utils/file.py +8 -1
  9. crawlee/_utils/globs.py +4 -4
  10. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  11. crawlee/_utils/recoverable_state.py +32 -8
  12. crawlee/_utils/recurring_task.py +27 -3
  13. crawlee/_utils/requests.py +0 -26
  14. crawlee/_utils/robots.py +17 -5
  15. crawlee/_utils/sitemap.py +16 -7
  16. crawlee/_utils/system.py +30 -14
  17. crawlee/_utils/time.py +120 -0
  18. crawlee/_utils/urls.py +9 -2
  19. crawlee/browsers/_browser_pool.py +5 -2
  20. crawlee/browsers/_playwright_browser.py +2 -1
  21. crawlee/browsers/_playwright_browser_controller.py +21 -15
  22. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  23. crawlee/browsers/_types.py +1 -1
  24. crawlee/configuration.py +2 -0
  25. crawlee/crawlers/__init__.py +5 -1
  26. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  27. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
  28. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  29. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  30. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  31. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
  32. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  33. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  34. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  35. crawlee/crawlers/_basic/_basic_crawler.py +254 -148
  36. crawlee/crawlers/_basic/_context_utils.py +24 -0
  37. crawlee/crawlers/_basic/_logging_utils.py +27 -4
  38. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  39. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  40. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  41. crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
  42. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  43. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  44. crawlee/crawlers/_playwright/_types.py +12 -2
  45. crawlee/errors.py +4 -0
  46. crawlee/events/_event_manager.py +12 -6
  47. crawlee/events/_types.py +6 -6
  48. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  49. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  50. crawlee/fingerprint_suite/_header_generator.py +2 -2
  51. crawlee/fingerprint_suite/_types.py +2 -2
  52. crawlee/http_clients/_base.py +4 -0
  53. crawlee/http_clients/_curl_impersonate.py +68 -14
  54. crawlee/http_clients/_httpx.py +16 -6
  55. crawlee/http_clients/_impit.py +32 -11
  56. crawlee/otel/crawler_instrumentor.py +4 -6
  57. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  58. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  59. crawlee/request_loaders/_request_list.py +3 -3
  60. crawlee/request_loaders/_request_loader.py +5 -1
  61. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  62. crawlee/router.py +13 -3
  63. crawlee/sessions/_cookies.py +13 -8
  64. crawlee/sessions/_models.py +5 -5
  65. crawlee/sessions/_session_pool.py +1 -1
  66. crawlee/statistics/_error_snapshotter.py +1 -1
  67. crawlee/statistics/_models.py +62 -12
  68. crawlee/statistics/_statistics.py +24 -33
  69. crawlee/storage_clients/__init__.py +16 -0
  70. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  71. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  72. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  73. crawlee/storage_clients/_base/_storage_client.py +13 -0
  74. crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
  75. crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
  76. crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
  77. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  78. crawlee/storage_clients/_file_system/_utils.py +0 -0
  79. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  80. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  81. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  82. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  83. crawlee/storage_clients/_redis/__init__.py +6 -0
  84. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  85. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  86. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  87. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  88. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  89. crawlee/storage_clients/_redis/_utils.py +23 -0
  90. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  91. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  92. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  93. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  94. crawlee/storage_clients/_redis/py.typed +0 -0
  95. crawlee/storage_clients/_sql/__init__.py +6 -0
  96. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  97. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  98. crawlee/storage_clients/_sql/_db_models.py +268 -0
  99. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  100. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  101. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  102. crawlee/storage_clients/_sql/py.typed +0 -0
  103. crawlee/storage_clients/models.py +21 -14
  104. crawlee/storages/_base.py +5 -1
  105. crawlee/storages/_dataset.py +12 -2
  106. crawlee/storages/_key_value_store.py +17 -4
  107. crawlee/storages/_request_queue.py +13 -5
  108. crawlee/storages/_storage_instance_manager.py +196 -75
  109. crawlee/storages/_utils.py +11 -0
  110. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
  111. crawlee-1.3.1b3.dist-info/RECORD +207 -0
  112. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  113. crawlee/_utils/measure_time.py +0 -31
  114. crawlee-0.6.13b15.dist-info/RECORD +0 -183
  115. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  116. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
@@ -31,7 +31,7 @@ class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):
 
 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
-class ParsedHttpCrawlingContext(Generic[TParseResult], HttpCrawlingContext):
+class ParsedHttpCrawlingContext(HttpCrawlingContext, Generic[TParseResult]):
     """The crawling context used by `AbstractHttpCrawler`.
 
     It provides access to key objects as well as utility functions for handling crawling tasks.
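Several hunks in this release (here, and in `AdaptivePlaywrightCrawler` and `AdaptivePlaywrightCrawlingContext` below) reorder base classes so that `Generic[...]` comes last. The order matters because it determines the method resolution order; a minimal self-contained sketch with hypothetical class names showing the difference:

```python
from typing import Generic, TypeVar

T = TypeVar('T')

class Base:
    def describe(self) -> str:
        return 'base'

# Generic last (the convention adopted in this release): the concrete base
# precedes Generic in the MRO.
class GenericLast(Base, Generic[T]):
    pass

print([c.__name__ for c in GenericLast.__mro__])
# ['GenericLast', 'Base', 'Generic', 'object']

# Generic first: Generic is linearized ahead of the concrete base.
class GenericFirst(Generic[T], Base):
    pass

print([c.__name__ for c in GenericFirst.__mro__])
# ['GenericFirst', 'Generic', 'Base', 'object']
```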
@@ -11,13 +11,16 @@ _install_import_hook(__name__)
 
 # The following imports are wrapped in try_import to handle optional dependencies,
 # ensuring the module can still function even if these dependencies are missing.
-with _try_import(__name__, 'BeautifulSoupCrawler'):
+with _try_import(__name__, 'RenderingType', 'RenderingTypePrediction', 'RenderingTypePredictor'):
     from ._rendering_type_predictor import RenderingType, RenderingTypePrediction, RenderingTypePredictor
-with _try_import(__name__, 'BeautifulSoupCrawlingContext'):
+with _try_import(__name__, 'AdaptivePlaywrightCrawler'):
     from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawler
+with _try_import(__name__, 'AdaptivePlaywrightCrawlerStatisticState'):
+    from ._adaptive_playwright_crawler import AdaptivePlaywrightCrawlerStatisticState
 
 __all__ = [
     'AdaptivePlaywrightCrawler',
+    'AdaptivePlaywrightCrawlerStatisticState',
     'AdaptivePlaywrightCrawlingContext',
     'AdaptivePlaywrightPreNavCrawlingContext',
     'RenderingType',
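`_try_import` is crawlee's internal guard for optional extras, and the fix above makes each guard name the symbols it actually imports. A rough sketch of the general pattern (not crawlee's actual implementation) for letting a module import cleanly when an optional dependency is absent:

```python
from collections.abc import Iterator
from contextlib import contextmanager

@contextmanager
def try_import(module_name: str, *symbols: str) -> Iterator[None]:
    """Suppress ImportError so the module still loads; the named symbols
    simply stay undefined until the optional extra is installed."""
    try:
        yield
    except ImportError:
        print(f'{module_name}: optional symbols unavailable: {", ".join(symbols)}')

# Usage mirroring the __init__.py above; bs4 is only present with an extra.
with try_import(__name__, 'BeautifulSoup'):
    from bs4 import BeautifulSoup  # noqa: F401
```

The real hook additionally records which names failed, so accessing a missing symbol later raises a helpful error instead of a bare `NameError`.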
@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override
 
-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (
@@ -27,23 +27,16 @@ from crawlee.crawlers import (
 )
 from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser
 from crawlee.crawlers._parsel._parsel_parser import ParselParser
+from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
 from crawlee.statistics import Statistics, StatisticsState
 
-from ._adaptive_playwright_crawler_statistics import (
-    AdaptivePlaywrightCrawlerStatisticState,
-)
+from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState
 from ._adaptive_playwright_crawling_context import (
     AdaptivePlaywrightCrawlingContext,
     AdaptivePlaywrightPreNavCrawlingContext,
 )
-from ._rendering_type_predictor import (
-    DefaultRenderingTypePredictor,
-    RenderingType,
-    RenderingTypePredictor,
-)
-from ._result_comparator import (
-    create_default_comparator,
-)
+from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor
+from ._result_comparator import create_default_comparator
 
 if TYPE_CHECKING:
     from types import TracebackType
@@ -51,7 +44,6 @@ if TYPE_CHECKING:
     from typing_extensions import Unpack
 
     from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions
-    from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions
 
 
 TStaticParseResult = TypeVar('TStaticParseResult')
@@ -71,7 +63,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self
 
     async def __aexit__(
@@ -85,8 +76,8 @@ class _NonPersistentStatistics(Statistics):
 
 @docs_group('Crawlers')
 class AdaptivePlaywrightCrawler(
-    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
     BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState],
+    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
 ):
     """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.
 
@@ -149,28 +140,30 @@ class AdaptivePlaywrightCrawler(
                 non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)
 
-        super().__init__(statistics=statistics, **kwargs)
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
+        adaptive_statistics = statistics or Statistics(state_model=AdaptivePlaywrightCrawlerStatisticState)
+
+        super().__init__(statistics=adaptive_statistics, **kwargs)
 
         # Sub crawlers related.
-        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {}
+        playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions()
 
         # Each sub crawler will use a custom logger.
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler['_logger'] = static_logger
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}
 
         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler['_logger'] = pw_logger
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}
 
         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
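The new default caps the adaptive crawler at one desired concurrent task, matching the resource cost of browser-based crawling, and the check treats an explicit `None` the same as an omitted argument, so any user-supplied settings win. A hedged usage sketch; `with_beautifulsoup_static_parser` is crawlee's documented factory for this crawler, but treat the exact keyword values as illustrative:

```python
from crawlee import ConcurrencySettings
from crawlee.crawlers import AdaptivePlaywrightCrawler

# Passing explicit settings overrides the new desired_concurrency=1 default.
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
    concurrency_settings=ConcurrencySettings(desired_concurrency=4, max_concurrency=8),
)
```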
@@ -291,11 +284,14 @@ class AdaptivePlaywrightCrawler(
         use_state_function = context.use_state
 
         # New result is created and injected to newly created context. This is done to ensure isolation of sub crawlers.
-        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+        result = RequestHandlerRunResult(
+            key_value_store_getter=self.get_key_value_store,
+            request=context.request,
+        )
         context_linked_to_result = BasicCrawlingContext(
-            request=deepcopy(context.request),
-            session=deepcopy(context.session),
-            proxy_info=deepcopy(context.proxy_info),
+            request=result.request,
+            session=context.session,
+            proxy_info=context.proxy_info,
             send_request=context.send_request,
             add_requests=result.add_requests,
             push_data=result.push_data,
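The `RequestHandlerRunResult` buffer is what lets both sub-crawlers handle the same request without committing side effects twice: each run writes into its own result object, and the adaptive crawler later commits only the winning run. A minimal sketch of that buffering idea, with hypothetical names rather than crawlee's exact classes:

```python
from dataclasses import dataclass, field
from typing import Any

@dataclass
class RunResultBuffer:
    """Collects handler side effects instead of applying them immediately."""
    pushed_data: list[dict[str, Any]] = field(default_factory=list)
    added_requests: list[str] = field(default_factory=list)

    async def push_data(self, data: dict[str, Any]) -> None:
        self.pushed_data.append(data)

    async def add_requests(self, urls: list[str]) -> None:
        self.added_requests.extend(urls)

async def commit(result: RunResultBuffer) -> None:
    # Only the run selected by the result comparator gets committed to the
    # real dataset and request queue; the losing run's buffer is dropped.
    for item in result.pushed_data:
        ...  # e.g. dataset.push_data(item)
    for url in result.added_requests:
        ...  # e.g. request_queue.add_request(url)
```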
@@ -333,7 +329,7 @@ class AdaptivePlaywrightCrawler(
                 )
                 await self.router(adaptive_crawling_context)
 
-            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)
+            return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]
 
         if rendering_type == 'client only':
 
@@ -343,7 +339,7 @@ class AdaptivePlaywrightCrawler(
                 )
                 await self.router(adaptive_crawling_context)
 
-            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)
+            return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router)  # ty: ignore[invalid-argument-type]
 
         raise RuntimeError(
             f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}'
@@ -403,12 +399,9 @@ class AdaptivePlaywrightCrawler(
             raise pw_run.exception
 
         if pw_run.result:
-            self._context_result_map[context] = pw_run.result
-
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                 if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                     detection_result = 'static'
                 else:
@@ -417,6 +410,8 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)
 
+            self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -12,7 +12,7 @@ from crawlee.statistics import StatisticsState
 class AdaptivePlaywrightCrawlerStatisticState(StatisticsState):
     """Statistic data about a crawler run with additional information related to adaptive crawling."""
 
-    model_config = ConfigDict(populate_by_name=True, ser_json_inf_nan='constants')
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')
 
     http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0
     """Number representing how many times static http based crawling was used."""
@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from playwright.async_api import Page, Response
     from typing_extensions import Self
 
-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions
 
 
 TStaticParseResult = TypeVar('TStaticParseResult')
@@ -31,7 +31,8 @@ class AdaptiveContextError(RuntimeError):
 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
 class AdaptivePlaywrightCrawlingContext(
-    Generic[TStaticParseResult, TStaticSelectResult], ParsedHttpCrawlingContext[TStaticParseResult]
+    ParsedHttpCrawlingContext[TStaticParseResult],
+    Generic[TStaticParseResult, TStaticSelectResult],
 ):
     _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult]
     """The crawling context used by `AdaptivePlaywrightCrawler`.
@@ -189,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests is useful only on pre-navigation contexts. It is useless here.
+        # block_requests and goto_options are useful only on pre-navigation contexts. They are useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,
@@ -211,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
@@ -32,7 +32,7 @@ FeatureVector = tuple[float, float]
 
 
 class RenderingTypePredictorState(BaseModel):
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     model: Annotated[
         LogisticRegression,