crawlee 0.6.13b43__py3-none-any.whl → 1.1.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +67 -24
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
- crawlee/crawlers/_basic/_basic_crawler.py +51 -14
- crawlee/crawlers/_playwright/_playwright_crawler.py +16 -4
- crawlee/events/_event_manager.py +3 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +5 -4
- crawlee/storage_clients/_file_system/_dataset_client.py +4 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_file_system/_request_queue_client.py +28 -12
- crawlee/storage_clients/_file_system/_storage_client.py +2 -2
- crawlee/storage_clients/_memory/_dataset_client.py +4 -5
- crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +291 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +5 -3
- crawlee/storages/_key_value_store.py +11 -6
- crawlee/storages/_request_queue.py +5 -3
- crawlee/storages/_storage_instance_manager.py +54 -68
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +16 -5
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +69 -47
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED

@@ -167,7 +167,9 @@ class AbstractHttpCrawler(
         kwargs.setdefault('strategy', 'same-hostname')

         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
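Both the HTTP and the Playwright crawler (see the matching hunk further below) now pass `logger=context.log` into `to_absolute_url_iterator`, so links that cannot be resolved are logged instead of being dropped silently. A minimal sketch of what such a helper can look like — an illustrative stand-in, not crawlee's internal `crawlee/_utils/urls.py` implementation:

    import logging
    from collections.abc import Iterator
    from urllib.parse import urljoin


    def to_absolute_urls(base_url: str, urls: Iterator[str], logger: logging.Logger) -> Iterator[str]:
        """Resolve relative links against the page URL, logging ones that cannot be used."""
        for url in urls:
            absolute = urljoin(base_url, url)
            if not absolute.startswith(('http://', 'https://')):
                logger.debug('Skipping non-HTTP link: %s', url)
                continue
            yield absolute


    # The base URL is the *loaded* URL when available, so redirects are respected.
    links = to_absolute_urls(
        'https://example.com/section/',
        iter(['page.html', '/root.html', 'mailto:x@example.com']),
        logging.getLogger(__name__),
    )
    print(list(links))  # ['https://example.com/section/page.html', 'https://example.com/root.html']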
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
CHANGED

@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override

-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (
@@ -71,7 +71,6 @@ class _NonPersistentStatistics(Statistics):
     async def __aenter__(self) -> Self:
         self._active = True
         await self._state.initialize()
-        self._after_initialize()
         return self

     async def __aexit__(
@@ -149,15 +148,15 @@ class AdaptivePlaywrightCrawler(
                 non-default configuration.
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
-        # Some sub crawler kwargs are internally modified. Prepare copies.
-        basic_crawler_kwargs_for_static_crawler = deepcopy(kwargs)
-        basic_crawler_kwargs_for_pw_crawler = deepcopy(kwargs)
-
         # Adaptive crawling related.
         self.rendering_type_predictor = rendering_type_predictor or DefaultRenderingTypePredictor()
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)

         # Sub crawlers related.
@@ -166,11 +165,11 @@ class AdaptivePlaywrightCrawler(
         # Each sub crawler will use custom logger .
         static_logger = getLogger('Subcrawler_static')
         static_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_static_crawler
+        basic_crawler_kwargs_for_static_crawler: _BasicCrawlerOptions = {'_logger': static_logger, **kwargs}

         pw_logger = getLogger('Subcrawler_playwright')
         pw_logger.setLevel(logging.ERROR)
-        basic_crawler_kwargs_for_pw_crawler
+        basic_crawler_kwargs_for_pw_crawler: _BasicCrawlerOptions = {'_logger': pw_logger, **kwargs}

         # Initialize sub crawlers to create their pipelines.
         static_crawler_class = AbstractHttpCrawler.create_parsed_http_crawler_class(static_parser=static_parser)
@@ -315,7 +314,7 @@ class AdaptivePlaywrightCrawler(
                 ),
                 logger=self._logger,
             )
-            return SubCrawlerRun(result=result)
+            return SubCrawlerRun(result=result, run_context=context_linked_to_result)
         except Exception as e:
             return SubCrawlerRun(exception=e)

@@ -371,7 +370,8 @@ class AdaptivePlaywrightCrawler(
             self.track_http_only_request_handler_runs()

         static_run = await self._crawl_one(rendering_type='static', context=context)
-        if static_run.result and self.result_checker(static_run.result):
+        if static_run.result and static_run.run_context and self.result_checker(static_run.result):
+            self._update_context_from_copy(context, static_run.run_context)
             self._context_result_map[context] = static_run.result
             return
         if static_run.exception:
@@ -402,13 +402,10 @@ class AdaptivePlaywrightCrawler(
         if pw_run.exception is not None:
             raise pw_run.exception

-        if pw_run.result:
-            self._context_result_map[context] = pw_run.result
-
+        if pw_run.result and pw_run.run_context:
             if should_detect_rendering_type:
                 detection_result: RenderingType
                 static_run = await self._crawl_one('static', context=context, state=old_state_copy)
-
                 if static_run.result and self.result_comparator(static_run.result, pw_run.result):
                     detection_result = 'static'
                 else:
@@ -417,6 +414,9 @@ class AdaptivePlaywrightCrawler(
                 context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
                 self.rendering_type_predictor.store_result(context.request, detection_result)

+            self._update_context_from_copy(context, pw_run.run_context)
+            self._context_result_map[context] = pw_run.result
+
     def pre_navigation_hook(
         self,
         hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -451,8 +451,32 @@ class AdaptivePlaywrightCrawler(
     def track_rendering_type_mispredictions(self) -> None:
         self.statistics.state.rendering_type_mispredictions += 1

+    def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
+        """Update mutable fields of `context` from `context_copy`.
+
+        Uses object.__setattr__ to bypass frozen dataclass restrictions,
+        allowing state synchronization after isolated crawler execution.
+        """
+        updating_attributes = {
+            'request': ('headers', 'user_data'),
+            'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
+        }
+
+        for attr, sub_attrs in updating_attributes.items():
+            original_sub_obj = getattr(context, attr)
+            copy_sub_obj = getattr(context_copy, attr)
+
+            # Check that both sub objects are not None
+            if original_sub_obj is None or copy_sub_obj is None:
+                continue
+
+            for sub_attr in sub_attrs:
+                new_value = getattr(copy_sub_obj, sub_attr)
+                object.__setattr__(original_sub_obj, sub_attr, new_value)
+

 @dataclass(frozen=True)
 class SubCrawlerRun:
     result: RequestHandlerRunResult | None = None
     exception: Exception | None = None
+    run_context: BasicCrawlingContext | None = None
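The new `_update_context_from_copy` writes through the frozen-dataclass guard on the crawling context with `object.__setattr__`. A self-contained demonstration of that mechanism (the dataclass here is illustrative, not crawlee's `BasicCrawlingContext`):

    from dataclasses import dataclass, field


    @dataclass(frozen=True)
    class CrawlContext:
        url: str
        user_data: dict = field(default_factory=dict)


    ctx = CrawlContext(url='https://example.com')

    # `ctx.user_data = {...}` would raise FrozenInstanceError; object.__setattr__
    # bypasses the frozen guard, which is how the adaptive crawler copies state
    # back from the sub-crawler's isolated context after a run.
    object.__setattr__(ctx, 'user_data', {'rendering': 'static'})
    print(ctx.user_data)  # {'rendering': 'static'}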
crawlee/crawlers/_basic/_basic_crawler.py
CHANGED

@@ -437,14 +437,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._statistics_log_format = statistics_log_format

         # Statistics
-
-
-
-
-
-
-
-
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )

         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
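When no `statistics` instance is supplied, the crawler now builds one whose persistence goes through a lazily opened key-value store. A sketch of supplying the same pieces yourself — the keyword names are taken from the call in the hunk above, and the store name is illustrative:

    import logging

    from crawlee.statistics import Statistics
    from crawlee.storages import KeyValueStore


    async def persist_state_factory() -> KeyValueStore:
        # Deferred: the store is only opened when statistics first need to persist.
        return await KeyValueStore.open(name='my-crawler-state')  # hypothetical store name


    stats = Statistics.with_default_state(
        persistence_enabled=True,
        periodic_message_logger=logging.getLogger('my_crawler'),
        log_message='Current request statistics:',
        persist_state_kvs_factory=persist_state_factory,
    )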
@@ -659,7 +668,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         request_manager = await self.get_request_manager()
         if purge_request_queue and isinstance(request_manager, RequestQueue):
             await request_manager.drop()
-            self._request_manager = await RequestQueue.open(
+            self._request_manager = await RequestQueue.open(
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )

         if requests is not None:
             await self.add_requests(requests)
@@ -686,7 +698,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             except CancelledError:
                 pass
             finally:
-                await self._crawler_state_rec_task.stop()
                 if threading.current_thread() is threading.main_thread():
                     with suppress(NotImplementedError):
                         asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -718,8 +729,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def _run_crawler(self) -> None:
         event_manager = self._service_locator.get_event_manager()

-        self._crawler_state_rec_task.start()
-
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [
@@ -730,6 +739,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False
@@ -944,6 +954,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
         | None = None,
         requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         kwargs.setdefault('strategy', 'same-hostname')
@@ -955,7 +968,9 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                     '`transform_request_function` arguments when `requests` is provided.'
                 )
             # Add directly passed requests.
-            await context.add_requests(
+            await context.add_requests(
+                requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs
+            )
         else:
             # Add requests from extracted links.
             await context.add_requests(
@@ -964,7 +979,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                     label=label,
                     user_data=user_data,
                     transform_request_function=transform_request_function,
+                    **kwargs,
                 ),
+                rq_id=rq_id,
+                rq_name=rq_name,
+                rq_alias=rq_alias,
                 **kwargs,
             )

@@ -1241,10 +1260,28 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
         result = self._context_result_map[context]

-
+        base_request_manager = await self.get_request_manager()
+
         origin = context.request.loaded_url or context.request.url

         for add_requests_call in result.add_requests_calls:
+            rq_id = add_requests_call.get('rq_id')
+            rq_name = add_requests_call.get('rq_name')
+            rq_alias = add_requests_call.get('rq_alias')
+            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+            if specified_params > 1:
+                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
+            if rq_id or rq_name or rq_alias:
+                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
+                    id=rq_id,
+                    name=rq_name,
+                    alias=rq_alias,
+                    storage_client=self._service_locator.get_storage_client(),
+                    configuration=self._service_locator.get_configuration(),
+                )
+            else:
+                request_manager = base_request_manager
+
             requests = list[Request]()

             base_url = url if (url := add_requests_call.get('base_url')) else origin
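Taken together, these hunks let a request handler route enqueued links to a request queue other than the crawler's default by passing exactly one of `rq_id`, `rq_name`, or `rq_alias`; combining them raises the `ValueError` shown above. A hedged usage sketch, assuming the new parameters are exposed through `enqueue_links` as the signature change suggests (queue name and selector are illustrative):

    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

    crawler = ParselCrawler()


    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # Product links go to a separate named queue that another crawler
        # (or a later run) can consume independently of the default queue.
        await context.enqueue_links(selector='a.product', rq_name='products')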
crawlee/crawlers/_playwright/_playwright_crawler.py
CHANGED

@@ -12,6 +12,7 @@ from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
@@ -113,7 +114,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
         user_data_dir: Path to a user data directory, which stores browser session data like cookies
             and local storage.
-        browser_type: The type of browser to launch
+        browser_type: The type of browser to launch:
+            - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+            - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
+                the system.
             This option should not be used if `browser_pool` is provided.
         browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
             directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -152,7 +156,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         ):
             raise ValueError(
                 'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
-                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir`
+                '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
                 '`fingerprint_generator` arguments when `browser_pool` is provided.'
             )

@@ -194,6 +198,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]

         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(**kwargs)

     async def _open_page(
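`PlaywrightCrawler` (and the adaptive crawler above) now defaults browser-based crawling to `ConcurrencySettings(desired_concurrency=1)` unless the caller supplies their own settings. A sketch of overriding that default, combined with the new `'chrome'` browser type (assumes Google Chrome is installed locally; the concurrency numbers are illustrative):

    from crawlee import ConcurrencySettings
    from crawlee.crawlers import PlaywrightCrawler

    crawler = PlaywrightCrawler(
        browser_type='chrome',  # locally installed Google Chrome, per the docstring change above
        # Any explicit value suppresses the new desired_concurrency=1 default.
        concurrency_settings=ConcurrencySettings(desired_concurrency=4, max_concurrency=8),
    )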
@@ -361,7 +369,9 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-        links_iterator = to_absolute_url_iterator(
+        links_iterator = to_absolute_url_iterator(
+            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+        )

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
@@ -489,7 +499,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
     """A `BrowserPool` instance to be used for launching the browsers and getting pages."""

     browser_type: NotRequired[BrowserType]
-    """The type of browser to launch
+    """The type of browser to launch:
+    - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
+    - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
     This option should not be used if `browser_pool` is provided."""

     browser_launch_options: NotRequired[Mapping[str, Any]]
crawlee/events/_event_manager.py
CHANGED

@@ -130,11 +130,13 @@ class EventManager:
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')

+        # Stop persist state event periodic emission and manually emit last one to ensure latest state is saved.
+        await self._emit_persist_state_event_rec_task.stop()
+        await self._emit_persist_state_event()
         await self.wait_for_all_listeners_to_complete(timeout=self._close_timeout)
         self._event_emitter.remove_all_listeners()
         self._listener_tasks.clear()
         self._listeners_to_wrappers.clear()
-        await self._emit_persist_state_event_rec_task.stop()
         self._active = False

     @overload
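The close sequence is reordered so the periodic persist-state task stops first and one final event is emitted before listeners are torn down, guaranteeing the latest state is saved. The general "stop the timer, then flush once" pattern, sketched in plain asyncio (names are illustrative, not crawlee's `RecurringTask` API):

    import asyncio
    import contextlib


    class PeriodicPersister:
        """Illustrative stand-in for a recurring persist-state task."""

        def __init__(self, interval: float = 60.0) -> None:
            self._interval = interval
            self._task: asyncio.Task | None = None

        async def persist(self) -> None:
            print('state persisted')

        async def _loop(self) -> None:
            while True:
                await asyncio.sleep(self._interval)
                await self.persist()

        def start(self) -> None:
            self._task = asyncio.create_task(self._loop())

        async def close(self) -> None:
            # Stop the periodic emission first...
            if self._task is not None:
                self._task.cancel()
                with contextlib.suppress(asyncio.CancelledError):
                    await self._task
            # ...then emit one final time so no state written after the
            # last tick is lost - the same ordering as the hunk above.
            await self.persist()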
crawlee/fingerprint_suite/_header_generator.py
CHANGED

@@ -11,9 +11,9 @@ if TYPE_CHECKING:


 def fingerprint_browser_type_from_playwright_browser_type(
-    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
+    playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
 ) -> SupportedBrowserType:
-    if playwright_browser_type == 'chromium':
+    if playwright_browser_type in {'chromium', 'chrome'}:
         return 'chrome'
     if playwright_browser_type == 'firefox':
         return 'firefox'
crawlee/otel/crawler_instrumentor.py
CHANGED

@@ -69,7 +69,7 @@ class CrawlerInstrumentor(BaseInstrumentor):

         if request_handling_instrumentation:

-            async def
+            async def middleware_wrapper(wrapped: Any, instance: _Middleware, args: Any, kwargs: Any) -> Any:
                 with self._tracer.start_as_current_span(
                     name=f'{instance.generator.__name__}, {wrapped.__name__}',  # type:ignore[attr-defined] # valid in our context
                     attributes={
@@ -111,8 +111,8 @@ class CrawlerInstrumentor(BaseInstrumentor):
         # Handpicked interesting methods to instrument
         self._instrumented.extend(
             [
-                (_Middleware, 'action',
-                (_Middleware, 'cleanup',
+                (_Middleware, 'action', middleware_wrapper),
+                (_Middleware, 'cleanup', middleware_wrapper),
                 (ContextPipeline, '__call__', context_pipeline_wrapper),
                 (BasicCrawler, '_BasicCrawler__run_task_function', self._simple_async_wrapper),
                 (BasicCrawler, '_commit_request_handler_result', _commit_request_handler_result_wrapper),
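The `(wrapped, instance, args, kwargs)` signature of `middleware_wrapper` is the convention of the `wrapt` library that OpenTelemetry instrumentors build on. A dependency-free sketch of how such a wrapper gets attached to a method — the class, method, and print-based "span" are illustrative stand-ins; real instrumentors typically use `wrapt.wrap_function_wrapper` and a tracer:

    from contextlib import contextmanager


    @contextmanager
    def span(name: str):
        # Stand-in for tracer.start_as_current_span(...).
        print(f'enter span: {name}')
        try:
            yield
        finally:
            print(f'exit span: {name}')


    class Middleware:
        def action(self, value: int) -> int:
            return value * 2


    def action_wrapper(wrapped, instance, args, kwargs):
        # Run the original bound method inside a span named after it.
        with span(wrapped.__name__):
            return wrapped(*args, **kwargs)


    def instrument(cls: type, name: str, wrapper) -> None:
        original = getattr(cls, name)

        def patched(self, *args, **kwargs):
            return wrapper(original.__get__(self, cls), self, args, kwargs)

        setattr(cls, name, patched)


    instrument(Middleware, 'action', action_wrapper)
    print(Middleware().action(21))  # enter span / 42 / exit span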
crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml
CHANGED

@@ -5,8 +5,8 @@
 # % endif
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
-# % elif cookiecutter.http_client == '
-# % do extras.append('
+# % elif cookiecutter.http_client == 'httpx'
+# % do extras.append('httpx')
 # % endif

 [project]
crawlee/request_loaders/_sitemap_request_loader.py
CHANGED

@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Annotated, Any
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@ from crawlee.request_loaders._request_loader import RequestLoader

 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType

+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -90,6 +91,11 @@ class SitemapRequestLoaderState(BaseModel):
 class SitemapRequestLoader(RequestLoader):
     """A request loader that reads URLs from sitemap(s).

+    The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
+    (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
+    Note that HTML pages containing links are not supported - those should be handled by regular crawlers
+    and the `enqueue_links` functionality.
+
     The loader fetches and parses sitemaps in the background, allowing crawling to start
     before all URLs are loaded. It supports filtering URLs using glob and regex patterns.

@@ -107,6 +113,7 @@ class SitemapRequestLoader(RequestLoader):
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.

@@ -120,6 +127,9 @@ class SitemapRequestLoader(RequestLoader):
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
@@ -127,6 +137,7 @@ class SitemapRequestLoader(RequestLoader):
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function

         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -308,8 +319,15 @@ class SitemapRequestLoader(RequestLoader):

             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
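A transform function receives the `RequestOptions` built from each sitemap URL and returns `'skip'` to drop it (the loader also decrements `total_count`), `'unchanged'` to keep it as generated, or a modified options dict. A sketch — the URL patterns and label are illustrative:

    from crawlee import RequestOptions, RequestTransformAction


    def transform(options: RequestOptions) -> RequestOptions | RequestTransformAction:
        url = options['url']
        if '/blog/' in url:
            return 'skip'  # dropped before a Request is ever created
        if url.endswith('.html'):
            options['label'] = 'PAGE'  # route to a dedicated handler
            return options
        return 'unchanged'  # keep the request exactly as generated


    # Passed at construction time:
    # loader = SitemapRequestLoader(sitemap_urls=[...], http_client=..., transform_request_function=transform)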
crawlee/sessions/_session_pool.py
CHANGED

@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.

-        This is
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.

         Args:
crawlee/statistics/_error_snapshotter.py
CHANGED

@@ -32,7 +32,7 @@ class ErrorSnapshotter:
         """Capture error snapshot and save it to key value store.

         It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-        it returns `KeyValueStoreChangeRecords` which is
+        it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
         returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
         an exception.

crawlee/statistics/_models.py
CHANGED

@@ -1,6 +1,7 @@
 from __future__ import annotations

 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Annotated, Any
@@ -76,7 +77,6 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
     errors: dict[str, Any] = Field(default_factory=dict)
     retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
     requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
@@ -93,6 +93,37 @@ class StatisticsState(BaseModel):
         ),
     ] = {}

+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
     @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
     @property
     def request_total_duration(self) -> timedelta:
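`crawler_runtime` is thus no longer a stored field: it is `_runtime_offset` (runtime persisted by earlier runs, captured in `model_post_init`) plus the elapsed time of the current run, with the `crawlerRuntimeMillis` alias preserved through a computed field. The same accumulate-across-restarts idea in a minimal pydantic model (field and alias names here are illustrative, not crawlee's):

    from datetime import datetime, timedelta, timezone

    from pydantic import BaseModel, Field, computed_field


    class RunState(BaseModel):
        last_started_at: datetime | None = None
        finished_at: datetime | None = None
        # Runtime accumulated by previous runs, restored when state is loaded.
        runtime_offset: timedelta = Field(default=timedelta(), exclude=True)

        @computed_field(alias='runtimeMillis')  # serialized in place of a stored field
        def runtime(self) -> timedelta:
            if self.last_started_at:
                end = self.finished_at or datetime.now(timezone.utc)
                return self.runtime_offset + (end - self.last_started_at)
            return self.runtime_offset


    state = RunState(
        runtime_offset=timedelta(seconds=90),  # from a persisted previous run
        last_started_at=datetime.now(timezone.utc) - timedelta(seconds=30),
    )
    print(state.model_dump(by_alias=True)['runtimeMillis'])  # ~0:02:00 total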