crawlee 1.0.5b1__py3-none-any.whl → 1.0.5b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +8 -3
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- {crawlee-1.0.5b1.dist-info → crawlee-1.0.5b2.dist-info}/METADATA +1 -1
- {crawlee-1.0.5b1.dist-info → crawlee-1.0.5b2.dist-info}/RECORD +11 -11
- {crawlee-1.0.5b1.dist-info → crawlee-1.0.5b2.dist-info}/WHEEL +0 -0
- {crawlee-1.0.5b1.dist-info → crawlee-1.0.5b2.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.5b1.dist-info → crawlee-1.0.5b2.dist-info}/licenses/LICENSE +0 -0
|
@@ -118,7 +118,10 @@ class BrowserPool:
|
|
|
118
118
|
"""Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
|
|
119
119
|
|
|
120
120
|
Args:
|
|
121
|
-
browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
|
|
121
|
+
browser_type: The type of browser to launch:
|
|
122
|
+
- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
|
|
123
|
+
- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
|
|
124
|
+
the system.
|
|
122
125
|
user_data_dir: Path to a user data directory, which stores browser session data like cookies
|
|
123
126
|
and local storage.
|
|
124
127
|
browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
|
|
@@ -216,7 +216,7 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
216
216
|
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
|
|
217
217
|
if proxy_info:
|
|
218
218
|
if browser_new_context_options.get('proxy'):
|
|
219
|
-
logger.warning("browser_new_context_options['proxy']
|
|
219
|
+
logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")
|
|
220
220
|
|
|
221
221
|
browser_new_context_options['proxy'] = ProxySettings(
|
|
222
222
|
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
|
|
@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
|
|
|
34
34
|
|
|
35
35
|
It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
|
|
36
36
|
for creating new browser instances and provides a unified interface for interacting with different browser types
|
|
37
|
-
(chromium, firefox, and webkit). This class integrates configuration options for browser launches (headless mode,
|
|
38
|
-
executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
|
|
37
|
+
(chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
|
|
38
|
+
mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
|
|
39
39
|
browser instance, ensuring that resource limits are respected.
|
|
40
40
|
"""
|
|
41
41
|
|
|
@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
|
|
|
55
55
|
"""Initialize a new instance.
|
|
56
56
|
|
|
57
57
|
Args:
|
|
58
|
-
browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
|
|
58
|
+
browser_type: The type of browser to launch:
|
|
59
|
+
- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
|
|
60
|
+
- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
|
|
61
|
+
the system.
|
|
59
62
|
user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
|
|
60
63
|
storage.
|
|
61
64
|
browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
|
|
@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
|
|
|
80
83
|
'chromium_sandbox': not config.disable_browser_sandbox,
|
|
81
84
|
}
|
|
82
85
|
|
|
86
|
+
if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
|
|
87
|
+
raise ValueError(
|
|
88
|
+
'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
# Map 'chrome' to 'chromium' with the 'chrome' channel.
|
|
92
|
+
if browser_type == 'chrome':
|
|
93
|
+
browser_type = 'chromium'
|
|
94
|
+
# Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
|
|
95
|
+
default_launch_browser_options['channel'] = 'chrome'
|
|
96
|
+
|
|
83
97
|
self._browser_type: BrowserType = browser_type
|
|
84
98
|
self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
|
|
85
99
|
self._browser_new_context_options = browser_new_context_options or {}
|
crawlee/browsers/_types.py
CHANGED
|
@@ -114,7 +114,10 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
|
|
|
114
114
|
browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
|
|
115
115
|
user_data_dir: Path to a user data directory, which stores browser session data like cookies
|
|
116
116
|
and local storage.
|
|
117
|
-
browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
|
|
117
|
+
browser_type: The type of browser to launch:
|
|
118
|
+
- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
|
|
119
|
+
- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
|
|
120
|
+
the system.
|
|
118
121
|
This option should not be used if `browser_pool` is provided.
|
|
119
122
|
browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
|
|
120
123
|
directly to Playwright's `browser_type.launch` method. For more details, refer to the
|
|
@@ -153,7 +156,7 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext, StatisticsState]
|
|
|
153
156
|
):
|
|
154
157
|
raise ValueError(
|
|
155
158
|
'You cannot provide `headless`, `browser_type`, `browser_launch_options`, '
|
|
156
|
-
'`browser_new_context_options`, `use_incognito_pages`, `user_data_dir`
|
|
159
|
+
'`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or '
|
|
157
160
|
'`fingerprint_generator` arguments when `browser_pool` is provided.'
|
|
158
161
|
)
|
|
159
162
|
|
|
@@ -496,7 +499,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict):
|
|
|
496
499
|
"""A `BrowserPool` instance to be used for launching the browsers and getting pages."""
|
|
497
500
|
|
|
498
501
|
browser_type: NotRequired[BrowserType]
|
|
499
|
-
"""The type of browser to launch
|
|
502
|
+
"""The type of browser to launch:
|
|
503
|
+
- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
|
|
504
|
+
- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system.
|
|
500
505
|
This option should not be used if `browser_pool` is provided."""
|
|
501
506
|
|
|
502
507
|
browser_launch_options: NotRequired[Mapping[str, Any]]
|
|
@@ -11,9 +11,9 @@ if TYPE_CHECKING:
|
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def fingerprint_browser_type_from_playwright_browser_type(
|
|
14
|
-
playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
|
|
14
|
+
playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'],
|
|
15
15
|
) -> SupportedBrowserType:
|
|
16
|
-
if playwright_browser_type == 'chromium':
|
|
16
|
+
if playwright_browser_type in {'chromium', 'chrome'}:
|
|
17
17
|
return 'chrome'
|
|
18
18
|
if playwright_browser_type == 'firefox':
|
|
19
19
|
return 'firefox'
|
|
@@ -43,11 +43,11 @@ crawlee/_utils/web.py,sha256=nnKhg8pUSWz0RY64Qd-_GPNBX1fWI2hXS-gzcfQ-rig,364
|
|
|
43
43
|
crawlee/browsers/__init__.py,sha256=TghkrNSbI_k87UgVBlgNNcEm8Ot05pSLEAPRSv6YsUs,1064
|
|
44
44
|
crawlee/browsers/_browser_controller.py,sha256=-g0pB5Nx5q67eMZVka49x-HMfQqJYoI6kcV-g3AXE0I,3068
|
|
45
45
|
crawlee/browsers/_browser_plugin.py,sha256=Wuojop___8ZO9eDoMs4JFmwMAFe5mZaTl0-Vz1PjkD8,3057
|
|
46
|
-
crawlee/browsers/_browser_pool.py,sha256=
|
|
46
|
+
crawlee/browsers/_browser_pool.py,sha256=n1GTVS220yxo-aMaKDVfQO571_AqEV5pMawWbr0zUHk,15832
|
|
47
47
|
crawlee/browsers/_playwright_browser.py,sha256=1yXD6cXuLefZZGUG1m0CT38xXYSwIC7n95bJBdMOxbo,3820
|
|
48
|
-
crawlee/browsers/_playwright_browser_controller.py,sha256=
|
|
49
|
-
crawlee/browsers/_playwright_browser_plugin.py,sha256=
|
|
50
|
-
crawlee/browsers/_types.py,sha256=
|
|
48
|
+
crawlee/browsers/_playwright_browser_controller.py,sha256=W6G5MjZpg9IcZoHts6lTML5VxSEpBTgzx5qeQ8XDigY,10216
|
|
49
|
+
crawlee/browsers/_playwright_browser_plugin.py,sha256=A1qa1nJhTSKNP9uOiO-oGzR7VGlnOMo0A0YNedccb2A,8869
|
|
50
|
+
crawlee/browsers/_types.py,sha256=ZnDgJHeQNSd_s_mXhgQnAf09c2smuiXC31VbawHHXUM,436
|
|
51
51
|
crawlee/browsers/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
52
52
|
crawlee/crawlers/__init__.py,sha256=9VmFahav3rjE-2Bxa5PAhBgkYXP0k5SSAEpdG2xMZ7c,2340
|
|
53
53
|
crawlee/crawlers/_types.py,sha256=xbGTJQirgz5wUbfr12afMR4q-_5AWP7ngF2e8K5P8l0,355
|
|
@@ -85,7 +85,7 @@ crawlee/crawlers/_parsel/_parsel_crawling_context.py,sha256=sZB26RcRLjSoD15myEOM
|
|
|
85
85
|
crawlee/crawlers/_parsel/_parsel_parser.py,sha256=yWBfuXUHMriK4DRnyrXTQoGeqX5WV9bOEkBp_g0YCvQ,1540
|
|
86
86
|
crawlee/crawlers/_parsel/_utils.py,sha256=MbRwx-cdjlq1zLzFYf64M3spOGQ6yxum4FvP0sdqA_Q,2693
|
|
87
87
|
crawlee/crawlers/_playwright/__init__.py,sha256=6Cahe6VEF82o8CYiP8Cmp58Cmb6Rb8uMeyy7wnwe5ms,837
|
|
88
|
-
crawlee/crawlers/_playwright/_playwright_crawler.py,sha256=
|
|
88
|
+
crawlee/crawlers/_playwright/_playwright_crawler.py,sha256=53iytj5LZHw19QOYqYNlZL4ApPlhbWn9Ds-DPTKANhQ,24158
|
|
89
89
|
crawlee/crawlers/_playwright/_playwright_crawling_context.py,sha256=Oi0tMBXHaEDlFjqG01DzgB7Ck52bjVjz-X__eMioxas,1249
|
|
90
90
|
crawlee/crawlers/_playwright/_playwright_http_client.py,sha256=Nfm69dqX85k68jN1p3ljZWbn8egqDWPIPRykXyXsoQs,3977
|
|
91
91
|
crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py,sha256=fEI2laWhmJdWiGoMF5JBLBsim9NtENfagZt6FFd2Rgo,1387
|
|
@@ -100,7 +100,7 @@ crawlee/fingerprint_suite/__init__.py,sha256=noY9qw80B0seZGj_B3bBvCDIDk2YWOSN-ll
|
|
|
100
100
|
crawlee/fingerprint_suite/_browserforge_adapter.py,sha256=bsGebBjjHawM-FiINgqkZW5I9a9Fnv3SGwdKgaVWiRI,11934
|
|
101
101
|
crawlee/fingerprint_suite/_consts.py,sha256=SgykWfxD-pYvOpRp_ooQ4ZTPS0sQ2b3wDyyCjwU_8-w,258
|
|
102
102
|
crawlee/fingerprint_suite/_fingerprint_generator.py,sha256=Di4sDk1qioiFGx4ZcoVyHhtFHF8JXDhxQt8ZPug99k8,730
|
|
103
|
-
crawlee/fingerprint_suite/_header_generator.py,sha256=
|
|
103
|
+
crawlee/fingerprint_suite/_header_generator.py,sha256=9X9FbStehXdw-FZc_D0y-nLk1BUHXVYFxs7fv4dl9zU,3513
|
|
104
104
|
crawlee/fingerprint_suite/_types.py,sha256=7n2LJTiL2XvL-H4G-Y26Uoq5-ZXzH07Dq4o50uhMa-w,2423
|
|
105
105
|
crawlee/fingerprint_suite/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
106
106
|
crawlee/http_clients/__init__.py,sha256=OQFhR9F8BrdlIaS5aRS7hvgQ0tKJPQ8FiyYPualyQcU,890
|
|
@@ -187,8 +187,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
|
|
|
187
187
|
crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
|
|
188
188
|
crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
|
|
189
189
|
crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
190
|
-
crawlee-1.0.
|
|
191
|
-
crawlee-1.0.5b1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
192
|
-
crawlee-1.0.5b1.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
|
|
193
|
-
crawlee-1.0.5b1.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
194
|
-
crawlee-1.0.5b1.dist-info/RECORD,,
|
|
190
|
+
crawlee-1.0.5b2.dist-info/METADATA,sha256=NUxv7PyqJSRwtpv5-OWm7DGiVJXjeDKQxF8O6WLfUW8,29314
|
|
191
|
+
crawlee-1.0.5b2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
192
|
+
crawlee-1.0.5b2.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
|
|
193
|
+
crawlee-1.0.5b2.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
194
|
+
crawlee-1.0.5b2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|