scrapling-0.3.5-py3-none-any.whl → scrapling-0.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +29 -19
  2. scrapling/cli.py +21 -4
  3. scrapling/core/_types.py +3 -2
  4. scrapling/core/ai.py +24 -15
  5. scrapling/core/custom_types.py +20 -27
  6. scrapling/core/mixins.py +15 -9
  7. scrapling/core/shell.py +6 -4
  8. scrapling/core/storage.py +7 -6
  9. scrapling/core/translator.py +13 -8
  10. scrapling/core/utils/__init__.py +0 -1
  11. scrapling/engines/_browsers/__init__.py +0 -2
  12. scrapling/engines/_browsers/_base.py +45 -21
  13. scrapling/engines/_browsers/_camoufox.py +98 -43
  14. scrapling/engines/_browsers/_config_tools.py +1 -1
  15. scrapling/engines/_browsers/_controllers.py +34 -13
  16. scrapling/engines/_browsers/_validators.py +31 -10
  17. scrapling/engines/constants.py +0 -15
  18. scrapling/engines/static.py +749 -336
  19. scrapling/engines/toolbelt/convertor.py +13 -15
  20. scrapling/engines/toolbelt/custom.py +6 -9
  21. scrapling/engines/toolbelt/fingerprints.py +17 -10
  22. scrapling/engines/toolbelt/navigation.py +11 -3
  23. scrapling/fetchers/__init__.py +46 -0
  24. scrapling/fetchers/chrome.py +210 -0
  25. scrapling/fetchers/firefox.py +212 -0
  26. scrapling/fetchers/requests.py +28 -0
  27. scrapling/parser.py +109 -84
  28. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA +17 -16
  29. scrapling-0.3.7.dist-info/RECORD +47 -0
  30. scrapling/fetchers.py +0 -444
  31. scrapling-0.3.5.dist-info/RECORD +0 -44
  32. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
@@ -10,6 +10,7 @@ from playwright.async_api import (
10
10
  BrowserContext as AsyncBrowserContext,
11
11
  Playwright as AsyncPlaywright,
12
12
  Locator as AsyncLocator,
13
+ Page as async_Page,
13
14
  )
14
15
  from patchright.sync_api import sync_playwright as sync_patchright
15
16
  from patchright.async_api import async_playwright as async_patchright
@@ -18,10 +19,12 @@ from scrapling.core.utils import log
18
19
  from ._base import SyncSession, AsyncSession, DynamicSessionMixin
19
20
  from ._validators import validate_fetch as _validate
20
21
  from scrapling.core._types import (
22
+ Any,
21
23
  Dict,
22
24
  List,
23
25
  Optional,
24
26
  Callable,
27
+ TYPE_CHECKING,
25
28
  SelectorWaitStates,
26
29
  )
27
30
  from scrapling.engines.toolbelt.convertor import (
@@ -30,7 +33,7 @@ from scrapling.engines.toolbelt.convertor import (
30
33
  )
31
34
  from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer
32
35
 
33
- _UNSET = object()
36
+ _UNSET: Any = object()
34
37
 
35
38
 
36
39
  class DynamicSession(DynamicSessionMixin, SyncSession):
@@ -94,7 +97,9 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
94
97
  network_idle: bool = False,
95
98
  load_dom: bool = True,
96
99
  wait_selector_state: SelectorWaitStates = "attached",
100
+ user_data_dir: str = "",
97
101
  selector_config: Optional[Dict] = None,
102
+ additional_args: Optional[Dict] = None,
98
103
  ):
99
104
  """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.
100
105
 
@@ -117,11 +122,13 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
117
122
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
118
123
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
119
124
  :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
120
- :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
125
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
121
126
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
122
127
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
123
128
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
129
+ :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
124
130
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
131
+ :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
125
132
  """
126
133
  self.__validate__(
127
134
  wait=wait,
@@ -140,11 +147,13 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
140
147
  hide_canvas=hide_canvas,
141
148
  init_script=init_script,
142
149
  network_idle=network_idle,
150
+ user_data_dir=user_data_dir,
143
151
  google_search=google_search,
144
152
  extra_headers=extra_headers,
145
153
  wait_selector=wait_selector,
146
154
  disable_webgl=disable_webgl,
147
155
  selector_config=selector_config,
156
+ additional_args=additional_args,
148
157
  disable_resources=disable_resources,
149
158
  wait_selector_state=wait_selector_state,
150
159
  )
@@ -154,14 +163,14 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
154
163
  """Create a browser for this instance and context."""
155
164
  sync_context = sync_patchright if self.stealth else sync_playwright
156
165
 
157
- self.playwright: Playwright = sync_context().start()
166
+ self.playwright: Playwright = sync_context().start() # pyright: ignore [reportAttributeAccessIssue]
158
167
 
159
168
  if self.cdp_url: # pragma: no cover
160
169
  self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
161
170
  **self.context_options
162
171
  )
163
172
  else:
164
- self.context = self.playwright.chromium.launch_persistent_context(user_data_dir="", **self.launch_options)
173
+ self.context = self.playwright.chromium.launch_persistent_context(**self.launch_options)
165
174
 
166
175
  if self.init_script: # pragma: no cover
167
176
  self.context.add_init_script(path=self.init_script)
@@ -187,7 +196,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
187
196
 
188
197
  if self.playwright:
189
198
  self.playwright.stop()
190
- self.playwright = None
199
+ self.playwright = None # pyright: ignore
191
200
 
192
201
  self._closed = True
193
202
 
@@ -254,6 +263,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
254
263
  if (
255
264
  finished_response.request.resource_type == "document"
256
265
  and finished_response.request.is_navigation_request()
266
+ and finished_response.request.frame == page_info.page.main_frame
257
267
  ):
258
268
  final_response = finished_response
259
269
 
@@ -299,7 +309,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
299
309
  page_info.page, first_response, final_response, params.selector_config
300
310
  )
301
311
 
302
- # Close the page, to free up resources
312
+ # Close the page to free up resources
303
313
  page_info.page.close()
304
314
  self.page_pool.pages.remove(page_info)
305
315
 
@@ -337,7 +347,9 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
337
347
  network_idle: bool = False,
338
348
  load_dom: bool = True,
339
349
  wait_selector_state: SelectorWaitStates = "attached",
350
+ user_data_dir: str = "",
340
351
  selector_config: Optional[Dict] = None,
352
+ additional_args: Optional[Dict] = None,
341
353
  ):
342
354
  """A Browser session manager with page pooling
343
355
 
@@ -360,12 +372,14 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
360
372
  :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
361
373
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
362
374
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
363
- :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
375
+ :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
364
376
  :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
365
377
  :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
366
378
  :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
367
379
  :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
380
+ :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
368
381
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
382
+ :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
369
383
  """
370
384
 
371
385
  self.__validate__(
@@ -385,11 +399,13 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
385
399
  hide_canvas=hide_canvas,
386
400
  init_script=init_script,
387
401
  network_idle=network_idle,
402
+ user_data_dir=user_data_dir,
388
403
  google_search=google_search,
389
404
  extra_headers=extra_headers,
390
405
  wait_selector=wait_selector,
391
406
  disable_webgl=disable_webgl,
392
407
  selector_config=selector_config,
408
+ additional_args=additional_args,
393
409
  disable_resources=disable_resources,
394
410
  wait_selector_state=wait_selector_state,
395
411
  )
@@ -399,21 +415,21 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
399
415
  """Create a browser for this instance and context."""
400
416
  async_context = async_patchright if self.stealth else async_playwright
401
417
 
402
- self.playwright: AsyncPlaywright = await async_context().start()
418
+ self.playwright: AsyncPlaywright = await async_context().start() # pyright: ignore [reportAttributeAccessIssue]
403
419
 
404
420
  if self.cdp_url:
405
421
  browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
406
422
  self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
407
423
  else:
408
424
  self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
409
- user_data_dir="", **self.launch_options
425
+ **self.launch_options
410
426
  )
411
427
 
412
428
  if self.init_script: # pragma: no cover
413
429
  await self.context.add_init_script(path=self.init_script)
414
430
 
415
431
  if self.cookies:
416
- await self.context.add_cookies(self.cookies)
432
+ await self.context.add_cookies(self.cookies) # pyright: ignore
417
433
 
418
434
  async def __aenter__(self):
419
435
  await self.__create__()
@@ -429,11 +445,11 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
429
445
 
430
446
  if self.context:
431
447
  await self.context.close()
432
- self.context = None
448
+ self.context = None # pyright: ignore
433
449
 
434
450
  if self.playwright:
435
451
  await self.playwright.stop()
436
- self.playwright = None
452
+ self.playwright = None # pyright: ignore
437
453
 
438
454
  self._closed = True
439
455
 
@@ -500,12 +516,17 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
500
516
  if (
501
517
  finished_response.request.resource_type == "document"
502
518
  and finished_response.request.is_navigation_request()
519
+ and finished_response.request.frame == page_info.page.main_frame
503
520
  ):
504
521
  final_response = finished_response
505
522
 
506
523
  page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
507
524
  page_info.mark_busy(url=url)
508
525
 
526
+ if TYPE_CHECKING:
527
+ if not isinstance(page_info.page, async_Page):
528
+ raise TypeError
529
+
509
530
  try:
510
531
  # Navigate to URL and wait for a specified state
511
532
  page_info.page.on("response", handle_response)
@@ -545,7 +566,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
545
566
  page_info.page, first_response, final_response, params.selector_config
546
567
  )
547
568
 
548
- # Close the page, to free up resources
569
+ # Close the page to free up resources
549
570
  await page_info.page.close()
550
571
  self.page_pool.pages.remove(page_info)
551
572
  return response
@@ -11,7 +11,9 @@ from scrapling.core._types import (
11
11
  Tuple,
12
12
  Optional,
13
13
  Callable,
14
+ Iterable,
14
15
  SelectorWaitStates,
16
+ overload,
15
17
  )
16
18
  from scrapling.engines.toolbelt.navigation import construct_proxy_dict
17
19
 
@@ -73,7 +75,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
73
75
  stealth: bool = False
74
76
  wait: Seconds = 0
75
77
  page_action: Optional[Callable] = None
76
- proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
78
+ proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
77
79
  locale: str = "en-US"
78
80
  extra_headers: Optional[Dict[str, str]] = None
79
81
  useragent: Optional[str] = None
@@ -81,11 +83,13 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
81
83
  init_script: Optional[str] = None
82
84
  disable_resources: bool = False
83
85
  wait_selector: Optional[str] = None
84
- cookies: Optional[List[Dict]] = None
86
+ cookies: Optional[Iterable[Dict]] = None
85
87
  network_idle: bool = False
86
88
  load_dom: bool = True
87
89
  wait_selector_state: SelectorWaitStates = "attached"
88
- selector_config: Optional[Dict] = None
90
+ user_data_dir: str = ""
91
+ selector_config: Optional[Dict] = {}
92
+ additional_args: Optional[Dict] = {}
89
93
 
90
94
  def __post_init__(self):
91
95
  """Custom validation after msgspec validation"""
@@ -100,6 +104,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
100
104
  self.cookies = []
101
105
  if not self.selector_config:
102
106
  self.selector_config = {}
107
+ if not self.additional_args:
108
+ self.additional_args = {}
103
109
 
104
110
  if self.init_script is not None:
105
111
  _validate_file_path(self.init_script)
@@ -125,15 +131,16 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
125
131
  wait_selector: Optional[str] = None
126
132
  addons: Optional[List[str]] = None
127
133
  wait_selector_state: SelectorWaitStates = "attached"
128
- cookies: Optional[List[Dict]] = None
134
+ cookies: Optional[Iterable[Dict]] = None
129
135
  google_search: bool = True
130
136
  extra_headers: Optional[Dict[str, str]] = None
131
- proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
137
+ proxy: Optional[str | Dict[str, str] | Tuple] = None # The default value for proxy in Playwright's source is `None`
132
138
  os_randomize: bool = False
133
139
  disable_ads: bool = False
134
140
  geoip: bool = False
135
- selector_config: Optional[Dict] = None
136
- additional_args: Optional[Dict] = None
141
+ user_data_dir: str = ""
142
+ selector_config: Optional[Dict] = {}
143
+ additional_args: Optional[Dict] = {}
137
144
 
138
145
  def __post_init__(self):
139
146
  """Custom validation after msgspec validation"""
@@ -177,7 +184,7 @@ class FetchConfig(Struct, kw_only=True):
177
184
  network_idle: bool = False
178
185
  load_dom: bool = True
179
186
  solve_cloudflare: bool = False
180
- selector_config: Optional[Dict] = {}
187
+ selector_config: Dict = {}
181
188
 
182
189
  def to_dict(self):
183
190
  return {f: getattr(self, f) for f in self.__struct_fields__}
@@ -198,7 +205,7 @@ class _fetch_params:
198
205
  network_idle: bool
199
206
  load_dom: bool
200
207
  solve_cloudflare: bool
201
- selector_config: Optional[Dict]
208
+ selector_config: Dict
202
209
 
203
210
 
204
211
  def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
@@ -222,7 +229,21 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
222
229
  return _fetch_params(**result)
223
230
 
224
231
 
225
- def validate(params: Dict, model) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
232
+ @overload
233
+ def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
234
+
235
+
236
+ @overload
237
+ def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...
238
+
239
+
240
+ @overload
241
+ def validate(params: Dict, model: type[FetchConfig]) -> FetchConfig: ...
242
+
243
+
244
+ def validate(
245
+ params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig] | type[FetchConfig]
246
+ ) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
226
247
  try:
227
248
  return convert(params, model)
228
249
  except ValidationError as e:
@@ -101,18 +101,3 @@ DEFAULT_STEALTH_FLAGS = (
101
101
  "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
102
102
  "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees",
103
103
  )
104
-
105
- # Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
106
- NSTBROWSER_DEFAULT_QUERY = {
107
- "once": True,
108
- "headless": True,
109
- "autoClose": True,
110
- "fingerprint": {
111
- "flags": {"timezone": "BasedOnIp", "screen": "Custom"},
112
- "platform": "linux", # support: windows, mac, linux
113
- "kernel": "chromium", # only support: chromium
114
- "kernelMilestone": "128",
115
- "hardwareConcurrency": 8,
116
- "deviceMemory": 8,
117
- },
118
- }