crawlee 1.0.4b8__py3-none-any.whl → 1.0.5b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlee might be problematic. Click here for more details.

@@ -315,7 +315,7 @@ class AdaptivePlaywrightCrawler(
315
315
  ),
316
316
  logger=self._logger,
317
317
  )
318
- return SubCrawlerRun(result=result)
318
+ return SubCrawlerRun(result=result, run_context=context_linked_to_result)
319
319
  except Exception as e:
320
320
  return SubCrawlerRun(exception=e)
321
321
 
@@ -371,7 +371,8 @@ class AdaptivePlaywrightCrawler(
371
371
  self.track_http_only_request_handler_runs()
372
372
 
373
373
  static_run = await self._crawl_one(rendering_type='static', context=context)
374
- if static_run.result and self.result_checker(static_run.result):
374
+ if static_run.result and static_run.run_context and self.result_checker(static_run.result):
375
+ self._update_context_from_copy(context, static_run.run_context)
375
376
  self._context_result_map[context] = static_run.result
376
377
  return
377
378
  if static_run.exception:
@@ -402,13 +403,10 @@ class AdaptivePlaywrightCrawler(
402
403
  if pw_run.exception is not None:
403
404
  raise pw_run.exception
404
405
 
405
- if pw_run.result:
406
- self._context_result_map[context] = pw_run.result
407
-
406
+ if pw_run.result and pw_run.run_context:
408
407
  if should_detect_rendering_type:
409
408
  detection_result: RenderingType
410
409
  static_run = await self._crawl_one('static', context=context, state=old_state_copy)
411
-
412
410
  if static_run.result and self.result_comparator(static_run.result, pw_run.result):
413
411
  detection_result = 'static'
414
412
  else:
@@ -417,6 +415,9 @@ class AdaptivePlaywrightCrawler(
417
415
  context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
418
416
  self.rendering_type_predictor.store_result(context.request, detection_result)
419
417
 
418
+ self._update_context_from_copy(context, pw_run.run_context)
419
+ self._context_result_map[context] = pw_run.result
420
+
420
421
  def pre_navigation_hook(
421
422
  self,
422
423
  hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -451,8 +452,32 @@ class AdaptivePlaywrightCrawler(
451
452
  def track_rendering_type_mispredictions(self) -> None:
452
453
  self.statistics.state.rendering_type_mispredictions += 1
453
454
 
455
+ def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
456
+ """Update mutable fields of `context` from `context_copy`.
457
+
458
+ Uses object.__setattr__ to bypass frozen dataclass restrictions,
459
+ allowing state synchronization after isolated crawler execution.
460
+ """
461
+ updating_attributes = {
462
+ 'request': ('headers', 'user_data'),
463
+ 'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
464
+ }
465
+
466
+ for attr, sub_attrs in updating_attributes.items():
467
+ original_sub_obj = getattr(context, attr)
468
+ copy_sub_obj = getattr(context_copy, attr)
469
+
470
+ # Check that both sub objects are not None
471
+ if original_sub_obj is None or copy_sub_obj is None:
472
+ continue
473
+
474
+ for sub_attr in sub_attrs:
475
+ new_value = getattr(copy_sub_obj, sub_attr)
476
+ object.__setattr__(original_sub_obj, sub_attr, new_value)
477
+
454
478
 
455
479
  @dataclass(frozen=True)
456
480
  class SubCrawlerRun:
457
481
  result: RequestHandlerRunResult | None = None
458
482
  exception: Exception | None = None
483
+ run_context: BasicCrawlingContext | None = None
@@ -90,6 +90,11 @@ class SitemapRequestLoaderState(BaseModel):
90
90
  class SitemapRequestLoader(RequestLoader):
91
91
  """A request loader that reads URLs from sitemap(s).
92
92
 
93
+ The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
94
+ (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
95
+ Note that HTML pages containing links are not supported - those should be handled by regular crawlers
96
+ and the `enqueue_links` functionality.
97
+
93
98
  The loader fetches and parses sitemaps in the background, allowing crawling to start
94
99
  before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
95
100
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crawlee
3
- Version: 1.0.4b8
3
+ Version: 1.0.5b1
4
4
  Summary: Crawlee for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
@@ -58,7 +58,7 @@ crawlee/crawlers/_abstract_http/_abstract_http_parser.py,sha256=Y5o_hiW_0mQAte5G
58
58
  crawlee/crawlers/_abstract_http/_http_crawling_context.py,sha256=Rno_uJ8ivmyRxFQv2MyY_z9B5WPHSEd5MAPz31_1ZIo,2179
59
59
  crawlee/crawlers/_abstract_http/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
60
  crawlee/crawlers/_adaptive_playwright/__init__.py,sha256=LREq9WR9BKsE8S8lSsEhlCoNjQaLhlJ9yo8y_6a8o4c,1072
61
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=bMKlIv-yS_vkMIJSQoVt5v730fKF_spKfW01kq2sZFs,21593
61
+ crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=ME90JLkScWj_ynUymA59f832vEvvVpkP01cYfEc8m-Y,22895
62
62
  crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py,sha256=_At8T8S3JLGPA-1AeCFGrpE-FuCDW9sazrXt9U0tK6U,1048
63
63
  crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py,sha256=9FlHIUC05IzUhJsVldQvpnDnj1jk8GJpqC98mPLN_fw,10431
64
64
  crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkbIN_059jUyCG8Z6XAb_FBLClIKw7z-aDvjon2I,10834
@@ -137,7 +137,7 @@ crawlee/request_loaders/_request_list.py,sha256=SIalHBMuFanE5GLnFocI0QCppWUiJQjr
137
137
  crawlee/request_loaders/_request_loader.py,sha256=2Bg-AWWkIV1W-Dwjqo91dPY8nmc7H3teQy7d6OSgliQ,3620
138
138
  crawlee/request_loaders/_request_manager.py,sha256=qFizyJuV2meIb9iiPfuii7ciuERMrp4SldAufiH46dc,3000
139
139
  crawlee/request_loaders/_request_manager_tandem.py,sha256=lv-s94KPsoQAqx1KaXFch96ejhO147uOflF3UK5ORTk,4058
140
- crawlee/request_loaders/_sitemap_request_loader.py,sha256=y5KQs5riT32WMdN3Awk2lKQNkYcXT3FYQ-mmbd31AJc,15201
140
+ crawlee/request_loaders/_sitemap_request_loader.py,sha256=s65D_N0mZxeIrGJEjqUYfu1uYj2AXSOkmErSnfAHv2A,15554
141
141
  crawlee/sessions/__init__.py,sha256=dJdelbL-6MK5sW4SMU4QrjFbb9kRZ9uRnN-VS3R5-8Y,190
142
142
  crawlee/sessions/_cookies.py,sha256=ihYbmpXfCzClzXDT7M2wefB_3KVzcMUdIzTZo6uGk6Y,9356
143
143
  crawlee/sessions/_models.py,sha256=JMRQgDUP30XUdZ32isncHowOsXvK9jC_m9QYegbBI1E,2916
@@ -187,8 +187,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
187
187
  crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
188
188
  crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
189
189
  crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
190
- crawlee-1.0.4b8.dist-info/METADATA,sha256=9KTRoSTrAE5-FOL5i2dkpOw94-Jjb3wrPJTtpCRosrY,29314
191
- crawlee-1.0.4b8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
192
- crawlee-1.0.4b8.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
193
- crawlee-1.0.4b8.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
194
- crawlee-1.0.4b8.dist-info/RECORD,,
190
+ crawlee-1.0.5b1.dist-info/METADATA,sha256=_yVzpir-oFvR6kVrhdjuxf1a9eUAJvuZu52sdRIV0_s,29314
191
+ crawlee-1.0.5b1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
192
+ crawlee-1.0.5b1.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
193
+ crawlee-1.0.5b1.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
194
+ crawlee-1.0.5b1.dist-info/RECORD,,