crawlee 1.0.4b8__py3-none-any.whl → 1.0.5b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic. Click here for more details.
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +31 -6
- crawlee/request_loaders/_sitemap_request_loader.py +5 -0
- {crawlee-1.0.4b8.dist-info → crawlee-1.0.5b1.dist-info}/METADATA +1 -1
- {crawlee-1.0.4b8.dist-info → crawlee-1.0.5b1.dist-info}/RECORD +7 -7
- {crawlee-1.0.4b8.dist-info → crawlee-1.0.5b1.dist-info}/WHEEL +0 -0
- {crawlee-1.0.4b8.dist-info → crawlee-1.0.5b1.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.4b8.dist-info → crawlee-1.0.5b1.dist-info}/licenses/LICENSE +0 -0
|
@@ -315,7 +315,7 @@ class AdaptivePlaywrightCrawler(
|
|
|
315
315
|
),
|
|
316
316
|
logger=self._logger,
|
|
317
317
|
)
|
|
318
|
-
return SubCrawlerRun(result=result)
|
|
318
|
+
return SubCrawlerRun(result=result, run_context=context_linked_to_result)
|
|
319
319
|
except Exception as e:
|
|
320
320
|
return SubCrawlerRun(exception=e)
|
|
321
321
|
|
|
@@ -371,7 +371,8 @@ class AdaptivePlaywrightCrawler(
|
|
|
371
371
|
self.track_http_only_request_handler_runs()
|
|
372
372
|
|
|
373
373
|
static_run = await self._crawl_one(rendering_type='static', context=context)
|
|
374
|
-
if static_run.result and self.result_checker(static_run.result):
|
|
374
|
+
if static_run.result and static_run.run_context and self.result_checker(static_run.result):
|
|
375
|
+
self._update_context_from_copy(context, static_run.run_context)
|
|
375
376
|
self._context_result_map[context] = static_run.result
|
|
376
377
|
return
|
|
377
378
|
if static_run.exception:
|
|
@@ -402,13 +403,10 @@ class AdaptivePlaywrightCrawler(
|
|
|
402
403
|
if pw_run.exception is not None:
|
|
403
404
|
raise pw_run.exception
|
|
404
405
|
|
|
405
|
-
if pw_run.result:
|
|
406
|
-
self._context_result_map[context] = pw_run.result
|
|
407
|
-
|
|
406
|
+
if pw_run.result and pw_run.run_context:
|
|
408
407
|
if should_detect_rendering_type:
|
|
409
408
|
detection_result: RenderingType
|
|
410
409
|
static_run = await self._crawl_one('static', context=context, state=old_state_copy)
|
|
411
|
-
|
|
412
410
|
if static_run.result and self.result_comparator(static_run.result, pw_run.result):
|
|
413
411
|
detection_result = 'static'
|
|
414
412
|
else:
|
|
@@ -417,6 +415,9 @@ class AdaptivePlaywrightCrawler(
|
|
|
417
415
|
context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
|
|
418
416
|
self.rendering_type_predictor.store_result(context.request, detection_result)
|
|
419
417
|
|
|
418
|
+
self._update_context_from_copy(context, pw_run.run_context)
|
|
419
|
+
self._context_result_map[context] = pw_run.result
|
|
420
|
+
|
|
420
421
|
def pre_navigation_hook(
|
|
421
422
|
self,
|
|
422
423
|
hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
|
|
@@ -451,8 +452,32 @@ class AdaptivePlaywrightCrawler(
|
|
|
451
452
|
def track_rendering_type_mispredictions(self) -> None:
|
|
452
453
|
self.statistics.state.rendering_type_mispredictions += 1
|
|
453
454
|
|
|
455
|
+
def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
|
|
456
|
+
"""Update mutable fields of `context` from `context_copy`.
|
|
457
|
+
|
|
458
|
+
Uses object.__setattr__ to bypass frozen dataclass restrictions,
|
|
459
|
+
allowing state synchronization after isolated crawler execution.
|
|
460
|
+
"""
|
|
461
|
+
updating_attributes = {
|
|
462
|
+
'request': ('headers', 'user_data'),
|
|
463
|
+
'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
for attr, sub_attrs in updating_attributes.items():
|
|
467
|
+
original_sub_obj = getattr(context, attr)
|
|
468
|
+
copy_sub_obj = getattr(context_copy, attr)
|
|
469
|
+
|
|
470
|
+
# Check that both sub objects are not None
|
|
471
|
+
if original_sub_obj is None or copy_sub_obj is None:
|
|
472
|
+
continue
|
|
473
|
+
|
|
474
|
+
for sub_attr in sub_attrs:
|
|
475
|
+
new_value = getattr(copy_sub_obj, sub_attr)
|
|
476
|
+
object.__setattr__(original_sub_obj, sub_attr, new_value)
|
|
477
|
+
|
|
454
478
|
|
|
455
479
|
@dataclass(frozen=True)
|
|
456
480
|
class SubCrawlerRun:
|
|
457
481
|
result: RequestHandlerRunResult | None = None
|
|
458
482
|
exception: Exception | None = None
|
|
483
|
+
run_context: BasicCrawlingContext | None = None
|
|
@@ -90,6 +90,11 @@ class SitemapRequestLoaderState(BaseModel):
|
|
|
90
90
|
class SitemapRequestLoader(RequestLoader):
|
|
91
91
|
"""A request loader that reads URLs from sitemap(s).
|
|
92
92
|
|
|
93
|
+
The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
|
|
94
|
+
(https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
|
|
95
|
+
Note that HTML pages containing links are not supported - those should be handled by regular crawlers
|
|
96
|
+
and the `enqueue_links` functionality.
|
|
97
|
+
|
|
93
98
|
The loader fetches and parses sitemaps in the background, allowing crawling to start
|
|
94
99
|
before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
|
|
95
100
|
|
|
@@ -58,7 +58,7 @@ crawlee/crawlers/_abstract_http/_abstract_http_parser.py,sha256=Y5o_hiW_0mQAte5G
|
|
|
58
58
|
crawlee/crawlers/_abstract_http/_http_crawling_context.py,sha256=Rno_uJ8ivmyRxFQv2MyY_z9B5WPHSEd5MAPz31_1ZIo,2179
|
|
59
59
|
crawlee/crawlers/_abstract_http/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
60
60
|
crawlee/crawlers/_adaptive_playwright/__init__.py,sha256=LREq9WR9BKsE8S8lSsEhlCoNjQaLhlJ9yo8y_6a8o4c,1072
|
|
61
|
-
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=
|
|
61
|
+
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=ME90JLkScWj_ynUymA59f832vEvvVpkP01cYfEc8m-Y,22895
|
|
62
62
|
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py,sha256=_At8T8S3JLGPA-1AeCFGrpE-FuCDW9sazrXt9U0tK6U,1048
|
|
63
63
|
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py,sha256=9FlHIUC05IzUhJsVldQvpnDnj1jk8GJpqC98mPLN_fw,10431
|
|
64
64
|
crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkbIN_059jUyCG8Z6XAb_FBLClIKw7z-aDvjon2I,10834
|
|
@@ -137,7 +137,7 @@ crawlee/request_loaders/_request_list.py,sha256=SIalHBMuFanE5GLnFocI0QCppWUiJQjr
|
|
|
137
137
|
crawlee/request_loaders/_request_loader.py,sha256=2Bg-AWWkIV1W-Dwjqo91dPY8nmc7H3teQy7d6OSgliQ,3620
|
|
138
138
|
crawlee/request_loaders/_request_manager.py,sha256=qFizyJuV2meIb9iiPfuii7ciuERMrp4SldAufiH46dc,3000
|
|
139
139
|
crawlee/request_loaders/_request_manager_tandem.py,sha256=lv-s94KPsoQAqx1KaXFch96ejhO147uOflF3UK5ORTk,4058
|
|
140
|
-
crawlee/request_loaders/_sitemap_request_loader.py,sha256=
|
|
140
|
+
crawlee/request_loaders/_sitemap_request_loader.py,sha256=s65D_N0mZxeIrGJEjqUYfu1uYj2AXSOkmErSnfAHv2A,15554
|
|
141
141
|
crawlee/sessions/__init__.py,sha256=dJdelbL-6MK5sW4SMU4QrjFbb9kRZ9uRnN-VS3R5-8Y,190
|
|
142
142
|
crawlee/sessions/_cookies.py,sha256=ihYbmpXfCzClzXDT7M2wefB_3KVzcMUdIzTZo6uGk6Y,9356
|
|
143
143
|
crawlee/sessions/_models.py,sha256=JMRQgDUP30XUdZ32isncHowOsXvK9jC_m9QYegbBI1E,2916
|
|
@@ -187,8 +187,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
|
|
|
187
187
|
crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
|
|
188
188
|
crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
|
|
189
189
|
crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
190
|
-
crawlee-1.0.
|
|
191
|
-
crawlee-1.0.4b8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
192
|
-
crawlee-1.0.4b8.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
|
|
193
|
-
crawlee-1.0.4b8.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
194
|
-
crawlee-1.0.4b8.dist-info/RECORD,,
|
|
190
|
+
crawlee-1.0.5b1.dist-info/METADATA,sha256=_yVzpir-oFvR6kVrhdjuxf1a9eUAJvuZu52sdRIV0_s,29314
|
|
191
|
+
crawlee-1.0.5b1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
192
|
+
crawlee-1.0.5b1.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
|
|
193
|
+
crawlee-1.0.5b1.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
194
|
+
crawlee-1.0.5b1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|