crawlee 1.0.4b8__py3-none-any.whl → 1.0.5b2__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -118,7 +118,10 @@ class BrowserPool:
118
118
  """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
119
119
 
120
120
  Args:
121
- browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
121
+ browser_type: The type of browser to launch:
122
+ - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
123
+ - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
124
+ the system.
122
125
  user_data_dir: Path to a user data directory, which stores browser session data like cookies
123
126
  and local storage.
124
127
  browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -216,7 +216,7 @@ class PlaywrightBrowserController(BrowserController):
216
216
  browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
217
217
  if proxy_info:
218
218
  if browser_new_context_options.get('proxy'):
219
- logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.")
219
+ logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")
220
220
 
221
221
  browser_new_context_options['proxy'] = ProxySettings(
222
222
  server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
34
34
 
35
35
  It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
36
36
  for creating new browser instances and provides a unified interface for interacting with different browser types
37
- (chromium, firefox, and webkit). This class integrates configuration options for browser launches (headless mode,
38
- executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
37
+ (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
38
+ mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
39
39
  browser instance, ensuring that resource limits are respected.
40
40
  """
41
41
 
@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
55
55
  """Initialize a new instance.
56
56
 
57
57
  Args:
58
- browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
58
+ browser_type: The type of browser to launch:
59
+ - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
60
+ - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
61
+ the system.
59
62
  user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
60
63
  storage.
61
64
  browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
80
83
  'chromium_sandbox': not config.disable_browser_sandbox,
81
84
  }
82
85
 
86
+ if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
87
+ raise ValueError(
88
+ 'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
89
+ )
90
+
91
+ # Map 'chrome' to 'chromium' with the 'chrome' channel.
92
+ if browser_type == 'chrome':
93
+ browser_type = 'chromium'
94
+ # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
95
+ default_launch_browser_options['channel'] = 'chrome'
96
+
83
97
  self._browser_type: BrowserType = browser_type
84
98
  self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
85
99
  self._browser_new_context_options = browser_new_context_options or {}
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Literal
6
6
  if TYPE_CHECKING:
7
7
  from playwright.async_api import Page
8
8
 
9
- BrowserType = Literal['chromium', 'firefox', 'webkit']
9
+ BrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome']
10
10
 
11
11
 
12
12
  @dataclass
@@ -315,7 +315,7 @@ class AdaptivePlaywrightCrawler(
315
315
  ),
316
316
  logger=self._logger,
317
317
  )
318
- return SubCrawlerRun(result=result)
318
+ return SubCrawlerRun(result=result, run_context=context_linked_to_result)
319
319
  except Exception as e:
320
320
  return SubCrawlerRun(exception=e)
321
321
 
@@ -371,7 +371,8 @@ class AdaptivePlaywrightCrawler(
371
371
  self.track_http_only_request_handler_runs()
372
372
 
373
373
  static_run = await self._crawl_one(rendering_type='static', context=context)
374
- if static_run.result and self.result_checker(static_run.result):
374
+ if static_run.result and static_run.run_context and self.result_checker(static_run.result):
375
+ self._update_context_from_copy(context, static_run.run_context)
375
376
  self._context_result_map[context] = static_run.result
376
377
  return
377
378
  if static_run.exception:
@@ -402,13 +403,10 @@ class AdaptivePlaywrightCrawler(
402
403
  if pw_run.exception is not None:
403
404
  raise pw_run.exception
404
405
 
405
- if pw_run.result:
406
- self._context_result_map[context] = pw_run.result
407
-
406
+ if pw_run.result and pw_run.run_context:
408
407
  if should_detect_rendering_type:
409
408
  detection_result: RenderingType
410
409
  static_run = await self._crawl_one('static', context=context, state=old_state_copy)
411
-
412
410
  if static_run.result and self.result_comparator(static_run.result, pw_run.result):
413
411
  detection_result = 'static'
414
412
  else:
@@ -417,6 +415,9 @@ class AdaptivePlaywrightCrawler(
417
415
  context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
418
416
  self.rendering_type_predictor.store_result(context.request, detection_result)
419
417
 
418
+ self._update_context_from_copy(context, pw_run.run_context)
419
+ self._context_result_map[context] = pw_run.result
420
+
420
421
  def pre_navigation_hook(
421
422
  self,
422
423
  hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -451,8 +452,32 @@ class AdaptivePlaywrightCrawler(
451
452
  def track_rendering_type_mispredictions(self) -> None:
452
453
  self.statistics.state.rendering_type_mispredictions += 1
453
454
 
455
+ def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
456
+ """Update mutable fields of `context` from `context_copy`.
457
+
458
+ Uses object.__setattr__ to bypass frozen dataclass restrictions,
459
+ allowing state synchronization after isolated crawler execution.
460
+ """
461
+ updating_attributes = {
462
+ 'request': ('headers', 'user_data'),
463
+ 'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
464
+ }
465
+
466
+ for attr, sub_attrs in updating_attributes.items():
467
+ original_sub_obj = getattr(context, attr)
468
+ copy_sub_obj = getattr(context_copy, attr)
469
+
470
+ # Check that both sub objects are not None
471
+ if original_sub_obj is None or copy_sub_obj is None:
472
+ continue
473
+
474
+ for sub_attr in sub_attrs:
475
+ new_value = getattr(copy_sub_obj, sub_attr)
476
+ object.__setattr__(original_sub_obj, sub_attr, new_value)
477
+
454
478
 
455
479
  @dataclass(frozen=True)
456
480
  class SubCrawlerRun:
457
481
  result: RequestHandlerRunResult | None = None
458
482
  exception: Exception | None = None
483
+ run_context: BasicCrawlingContext | None = None
@@ -114,7 +114,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
114
114
  browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
115
115
  user_data_dir: Path to a user data directory, which stores browser session data like cookies
116
116
  and local storage.
117
- browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
117
+ browser_type: The type of browser to launch:
118
+ - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
119
+ - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
120
+ the system.
118
121
  This option should not be used if `browser_pool` is provided.
119
122
  browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
120
123
  directly to Playwright's `browser_type.launch` method. For more details, refer to the
@@ -153,7 +156,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
153
156
  ):
154
157
  raise ValueError(
155
158
  'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
156
- '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or'
159
+ '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
157
160
  '`fingerprint_generator` arguments when `browser_pool` is provided.'
158
161
  )
159
162
 
@@ -496,7 +499,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
496
499
  """A `BrowserPool` instance to be used for launching the browsers and getting pages."""
497
500
 
498
501
  browser_type: NotRequired[BrowserType]
499
- """The type of browser to launch ('chromium', 'firefox', or 'webkit').
502
+ """The type of browser to launch:
503
+ - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
504
+ - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
500
505
  This option should not be used if `browser_pool` is provided."""
501
506
 
502
507
  browser_launch_options: NotRequired[Mapping[str, Any]]
@@ -11,9 +11,9 @@ if TYPE_CHECKING:
11
11
 
12
12
 
13
13
  def fingerprint_browser_type_from_playwright_browser_type(
14
- playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
14
+ playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
15
15
  ) -> SupportedBrowserType:
16
- if playwright_browser_type == 'chromium':
16
+ if playwright_browser_type in {'chromium', 'chrome'}:
17
17
  return 'chrome'
18
18
  if playwright_browser_type == 'firefox':
19
19
  return 'firefox'
@@ -90,6 +90,11 @@ class SitemapRequestLoaderState(BaseModel):
90
90
  class SitemapRequestLoader(RequestLoader):
91
91
  """A request loader that reads URLs from sitemap(s).
92
92
 
93
+ The loader is designed to handle sitemaps that follow the format described in the Sitemaps protocol
94
+ (https://www.sitemaps.org/protocol.html). It supports both XML and plain text sitemap formats.
95
+ Note that HTML pages containing links are not supported - those should be handled by regular crawlers
96
+ and the `enqueue_links` functionality.
97
+
93
98
  The loader fetches and parses sitemaps in the background, allowing crawling to start
94
99
  before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
95
100
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: crawlee
3
- Version: 1.0.4b8
3
+ Version: 1.0.5b2
4
4
  Summary: Crawlee for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
@@ -43,11 +43,11 @@ crawlee/_utils/web.py,sha256=nnKhg8pUSWz0RY64Qd-_GPNBX1fWI2hXS-gzcfQ-rig,364
43
43
  crawlee/browsers/__init__.py,sha256=TghkrNSbI_k87UgVBlgNNcEm8Ot05pSLEAPRSv6YsUs,1064
44
44
  crawlee/browsers/_browser_controller.py,sha256=-g0pB5Nx5q67eMZVka49x-HMfQqJYoI6kcV-g3AXE0I,3068
45
45
  crawlee/browsers/_browser_plugin.py,sha256=Wuojop___8ZO9eDoMs4JFmwMAFe5mZaTl0-Vz1PjkD8,3057
46
- crawlee/browsers/_browser_pool.py,sha256=2pT4m_g0DfopjTHYXb-piN6GqxvkayOeb4gmOtn1QNM,15634
46
+ crawlee/browsers/_browser_pool.py,sha256=n1GTVS220yxo-aMaKDVfQO571_AqEV5pMawWbr0zUHk,15832
47
47
  crawlee/browsers/_playwright_browser.py,sha256=1yXD6cXuLefZZGUG1m0CT38xXYSwIC7n95bJBdMOxbo,3820
48
- crawlee/browsers/_playwright_browser_controller.py,sha256=YaY19slRj8gIKrZy0M8rzF_zy2Z1Ym6d0S_vXcMX108,10215
49
- crawlee/browsers/_playwright_browser_plugin.py,sha256=axZa_yZNCPHyM3Ijx9jW4CzzRXQTVzYAswcGAZHP3Hk,8106
50
- crawlee/browsers/_types.py,sha256=eWgpoLMWu103hMQQTObkA01sVc_7hdPESl-TCyDMMV0,426
48
+ crawlee/browsers/_playwright_browser_controller.py,sha256=W6G5MjZpg9IcZoHts6lTML5VxSEpBTgzx5qeQ8XDigY,10216
49
+ crawlee/browsers/_playwright_browser_plugin.py,sha256=A1qa1nJhTSKNP9uOiO-oGzR7VGlnOMo0A0YNedccb2A,8869
50
+ crawlee/browsers/_types.py,sha256=ZnDgJHeQNSd_s_mXhgQnAf09c2smuiXC31VbawHHXUM,436
51
51
  crawlee/browsers/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
52
52
  crawlee/crawlers/__init__.py,sha256=9VmFahav3rjE-2Bxa5PAhBgkYXP0k5SSAEpdG2xMZ7c,2340
53
53
  crawlee/crawlers/_types.py,sha256=xbGTJQirgz5wUbfr12afMR4q-_5AWP7ngF2e8K5P8l0,355
@@ -58,7 +58,7 @@ crawlee/crawlers/_abstract_http/_abstract_http_parser.py,sha256=Y5o_hiW_0mQAte5G
58
58
  crawlee/crawlers/_abstract_http/_http_crawling_context.py,sha256=Rno_uJ8ivmyRxFQv2MyY_z9B5WPHSEd5MAPz31_1ZIo,2179
59
59
  crawlee/crawlers/_abstract_http/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
60
  crawlee/crawlers/_adaptive_playwright/__init__.py,sha256=LREq9WR9BKsE8S8lSsEhlCoNjQaLhlJ9yo8y_6a8o4c,1072
61
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=bMKlIv-yS_vkMIJSQoVt5v730fKF_spKfW01kq2sZFs,21593
61
+ crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py,sha256=ME90JLkScWj_ynUymA59f832vEvvVpkP01cYfEc8m-Y,22895
62
62
  crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py,sha256=_At8T8S3JLGPA-1AeCFGrpE-FuCDW9sazrXt9U0tK6U,1048
63
63
  crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py,sha256=9FlHIUC05IzUhJsVldQvpnDnj1jk8GJpqC98mPLN_fw,10431
64
64
  crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkbIN_059jUyCG8Z6XAb_FBLClIKw7z-aDvjon2I,10834
@@ -85,7 +85,7 @@ crawlee/crawlers/_parsel/_parsel_crawling_context.py,sha256=sZB26RcRLjSoD15myEOM
85
85
  crawlee/crawlers/_parsel/_parsel_parser.py,sha256=yWBfuXUHMriK4DRnyrXTQoGeqX5WV9bOEkBp_g0YCvQ,1540
86
86
  crawlee/crawlers/_parsel/_utils.py,sha256=MbRwx-cdjlq1zLzFYf64M3spOGQ6yxum4FvP0sdqA_Q,2693
87
87
  crawlee/crawlers/_playwright/__init__.py,sha256=6Cahe6VEF82o8CYiP8Cmp58Cmb6Rb8uMeyy7wnwe5ms,837
88
- crawlee/crawlers/_playwright/_playwright_crawler.py,sha256=QfZVWj6A0H1idC0yQT-WAxlWTk7janB4TtKDtf8htt8,23806
88
+ crawlee/crawlers/_playwright/_playwright_crawler.py,sha256=53iytj5LZHw19QOYqYNlZL4ApPlhbWn9Ds-DPTKANhQ,24158
89
89
  crawlee/crawlers/_playwright/_playwright_crawling_context.py,sha256=Oi0tMBXHaEDlFjqG01DzgB7Ck52bjVjz-X__eMioxas,1249
90
90
  crawlee/crawlers/_playwright/_playwright_http_client.py,sha256=Nfm69dqX85k68jN1p3ljZWbn8egqDWPIPRykXyXsoQs,3977
91
91
  crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py,sha256=fEI2laWhmJdWiGoMF5JBLBsim9NtENfagZt6FFd2Rgo,1387
@@ -100,7 +100,7 @@ crawlee/fingerprint_suite/__init__.py,sha256=noY9qw80B0seZGj_B3bBvCDIDk2YWOSN-ll
100
100
  crawlee/fingerprint_suite/_browserforge_adapter.py,sha256=bsGebBjjHawM-FiINgqkZW5I9a9Fnv3SGwdKgaVWiRI,11934
101
101
  crawlee/fingerprint_suite/_consts.py,sha256=SgykWfxD-pYvOpRp_ooQ4ZTPS0sQ2b3wDyyCjwU_8-w,258
102
102
  crawlee/fingerprint_suite/_fingerprint_generator.py,sha256=Di4sDk1qioiFGx4ZcoVyHhtFHF8JXDhxQt8ZPug99k8,730
103
- crawlee/fingerprint_suite/_header_generator.py,sha256=GMveKjVRsfyt_iq0Cvif98D7pFiPryPD0-zQ7efE8j8,3491
103
+ crawlee/fingerprint_suite/_header_generator.py,sha256=9X9FbStehXdw-FZc_D0y-nLk1BUHXVYFxs7fv4dl9zU,3513
104
104
  crawlee/fingerprint_suite/_types.py,sha256=7n2LJTiL2XvL-H4G-Y26Uoq5-ZXzH07Dq4o50uhMa-w,2423
105
105
  crawlee/fingerprint_suite/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
106
106
  crawlee/http_clients/__init__.py,sha256=OQFhR9F8BrdlIaS5aRS7hvgQ0tKJPQ8FiyYPualyQcU,890
@@ -137,7 +137,7 @@ crawlee/request_loaders/_request_list.py,sha256=SIalHBMuFanE5GLnFocI0QCppWUiJQjr
137
137
  crawlee/request_loaders/_request_loader.py,sha256=2Bg-AWWkIV1W-Dwjqo91dPY8nmc7H3teQy7d6OSgliQ,3620
138
138
  crawlee/request_loaders/_request_manager.py,sha256=qFizyJuV2meIb9iiPfuii7ciuERMrp4SldAufiH46dc,3000
139
139
  crawlee/request_loaders/_request_manager_tandem.py,sha256=lv-s94KPsoQAqx1KaXFch96ejhO147uOflF3UK5ORTk,4058
140
- crawlee/request_loaders/_sitemap_request_loader.py,sha256=y5KQs5riT32WMdN3Awk2lKQNkYcXT3FYQ-mmbd31AJc,15201
140
+ crawlee/request_loaders/_sitemap_request_loader.py,sha256=s65D_N0mZxeIrGJEjqUYfu1uYj2AXSOkmErSnfAHv2A,15554
141
141
  crawlee/sessions/__init__.py,sha256=dJdelbL-6MK5sW4SMU4QrjFbb9kRZ9uRnN-VS3R5-8Y,190
142
142
  crawlee/sessions/_cookies.py,sha256=ihYbmpXfCzClzXDT7M2wefB_3KVzcMUdIzTZo6uGk6Y,9356
143
143
  crawlee/sessions/_models.py,sha256=JMRQgDUP30XUdZ32isncHowOsXvK9jC_m9QYegbBI1E,2916
@@ -187,8 +187,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
187
187
  crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
188
188
  crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
189
189
  crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
190
- crawlee-1.0.4b8.dist-info/METADATA,sha256=9KTRoSTrAE5-FOL5i2dkpOw94-Jjb3wrPJTtpCRosrY,29314
191
- crawlee-1.0.4b8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
192
- crawlee-1.0.4b8.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
193
- crawlee-1.0.4b8.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
194
- crawlee-1.0.4b8.dist-info/RECORD,,
190
+ crawlee-1.0.5b2.dist-info/METADATA,sha256=NUxv7PyqJSRwtpv5-OWm7DGiVJXjeDKQxF8O6WLfUW8,29314
191
+ crawlee-1.0.5b2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
192
+ crawlee-1.0.5b2.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
193
+ crawlee-1.0.5b2.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
194
+ crawlee-1.0.5b2.dist-info/RECORD,,