scrapling 0.3-py3-none-any.whl → 0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +51 -129
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +238 -293
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +220 -278
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +29 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +41 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/METADATA +57 -47
  30. scrapling-0.3.2.dist-info/RECORD +44 -0
  31. scrapling-0.3.dist-info/RECORD +0 -41
  32. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
@@ -6,7 +6,7 @@ from playwright.async_api import Page as AsyncPage
 
 from scrapling.core._types import Optional, List, Literal
 
-PageState = Literal["ready", "busy", "error"]  # States that a page can be in
+PageState = Literal["finished", "ready", "busy", "error"]  # States that a page can be in
 
 
 @dataclass
@@ -23,9 +23,9 @@ class PageInfo:
         self.state = "busy"
         self.url = url
 
-    def mark_ready(self):
-        """Mark the page as ready for new requests"""
-        self.state = "ready"
+    def mark_finished(self):
+        """Mark the page as finished for new requests"""
+        self.state = "finished"
         self.url = ""
 
     def mark_error(self):
@@ -62,24 +62,16 @@ class PagePool:
         self.pages.append(page_info)
         return page_info
 
-    def get_ready_page(self) -> Optional[PageInfo]:
-        """Get a page that's ready for use"""
-        with self._lock:
-            for page_info in self.pages:
-                if page_info.state == "ready":
-                    return page_info
-            return None
-
     @property
     def pages_count(self) -> int:
         """Get the total number of pages"""
         return len(self.pages)
 
     @property
-    def ready_count(self) -> int:
-        """Get the number of ready pages"""
+    def finished_count(self) -> int:
+        """Get the number of finished pages"""
         with self._lock:
-            return sum(1 for p in self.pages if p.state == "ready")
+            return sum(1 for p in self.pages if p.state == "finished")
 
     @property
     def busy_count(self) -> int:
@@ -91,3 +83,33 @@ class PagePool:
         """Remove pages in error state"""
         with self._lock:
             self.pages = [p for p in self.pages if p.state != "error"]
+
+    def close_all_finished_pages(self):
+        """Close all pages in finished state and remove them from the pool"""
+        with self._lock:
+            pages_to_remove = []
+            for page_info in self.pages:
+                if page_info.state == "finished":
+                    try:
+                        page_info.page.close()
+                    except Exception:
+                        pass
+                    pages_to_remove.append(page_info)
+
+            for page_info in pages_to_remove:
+                self.pages.remove(page_info)
+
+    async def aclose_all_finished_pages(self):
+        """Async version: Close all pages in finished state and remove them from the pool"""
+        with self._lock:
+            pages_to_remove = []
+            for page_info in self.pages:
+                if page_info.state == "finished":
+                    try:
+                        await page_info.page.close()
+                    except Exception:
+                        pass
+                    pages_to_remove.append(page_info)
+
+            for page_info in pages_to_remove:
+                self.pages.remove(page_info)
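Note: the hunks above replace the old reusable "ready" page flow with a "finished" lifecycle: a page is marked busy while it serves a request, marked finished when the request completes, and finished pages are later counted and swept in bulk. Below is a minimal standalone sketch of that state machine; DemoPage and DemoPool are hypothetical stand-ins for the library's PageInfo and PagePool, not the actual classes.

    # Hypothetical stand-ins illustrating the finished-page lifecycle added in 0.3.2.
    # The real classes are PageInfo/PagePool in scrapling/engines/_browsers/_page.py.
    from dataclasses import dataclass
    from threading import RLock
    from typing import List, Literal

    PageState = Literal["finished", "ready", "busy", "error"]


    @dataclass
    class DemoPage:
        state: PageState = "ready"
        url: str = ""

        def mark_busy(self, url: str) -> None:
            self.state, self.url = "busy", url

        def mark_finished(self) -> None:
            # Mirrors PageInfo.mark_finished(): the request is done, the page awaits cleanup.
            self.state, self.url = "finished", ""


    class DemoPool:
        def __init__(self) -> None:
            self.pages: List[DemoPage] = []
            self._lock = RLock()

        @property
        def finished_count(self) -> int:
            with self._lock:
                return sum(1 for p in self.pages if p.state == "finished")

        def close_all_finished_pages(self) -> None:
            # Mirrors the new sweep: drop finished pages from the pool
            # (the real method also calls page.close() on each one first).
            with self._lock:
                self.pages = [p for p in self.pages if p.state != "finished"]


    pool = DemoPool()
    page = DemoPage()
    pool.pages.append(page)
    page.mark_busy("https://example.com")
    page.mark_finished()
    assert pool.finished_count == 1
    pool.close_all_finished_pages()
    assert pool.finished_count == 0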
@@ -9,7 +9,7 @@ from scrapling.core._types import (
     List,
     SelectorWaitStates,
 )
-from scrapling.engines.toolbelt import construct_proxy_dict
+from scrapling.engines.toolbelt.navigation import construct_proxy_dict
 
 
 class PlaywrightConfig(Struct, kw_only=True, frozen=False):
@@ -25,17 +25,17 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     stealth: bool = False
     wait: int | float = 0
     page_action: Optional[Callable] = None
-    proxy: Optional[str | Dict[str, str]] = (
-        None  # The default value for proxy in Playwright's source is `None`
-    )
+    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
     locale: str = "en-US"
     extra_headers: Optional[Dict[str, str]] = None
     useragent: Optional[str] = None
     timeout: int | float = 30000
+    init_script: Optional[str] = None
     disable_resources: bool = False
     wait_selector: Optional[str] = None
     cookies: Optional[List[Dict]] = None
     network_idle: bool = False
+    load_dom: bool = True
     wait_selector_state: SelectorWaitStates = "attached"
     selector_config: Optional[Dict] = None
 
@@ -45,10 +45,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
             raise ValueError("max_pages must be between 1 and 50")
         if self.timeout < 0:
             raise ValueError("timeout must be >= 0")
-        if self.page_action is not None and not callable(self.page_action):
-            raise TypeError(
-                f"page_action must be callable, got {type(self.page_action).__name__}"
-            )
+        if self.page_action and not callable(self.page_action):
+            raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
         if self.proxy:
             self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
         if self.cdp_url:
@@ -58,6 +56,15 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
         if not self.selector_config:
             self.selector_config = {}
 
+        if self.init_script is not None:
+            script_path = Path(self.init_script)
+            if not script_path.exists():
+                raise ValueError("Init script path not found")
+            elif not script_path.is_file():
+                raise ValueError("Init script is not a file")
+            elif not script_path.is_absolute():
+                raise ValueError("Init script is not a absolute path")
+
     @staticmethod
     def __validate_cdp(cdp_url):
         try:
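Note: PlaywrightConfig (and CamoufoxConfig further down) gains an init_script option whose __post_init__ validation requires an absolute path to an existing file. Below is a standalone sketch of the rule using the same error messages as the diff; check_init_script is an illustrative helper, not part of scrapling's API.

    # Illustrative re-statement of the init_script validation added in 0.3.2.
    from pathlib import Path


    def check_init_script(init_script: str) -> str:
        script_path = Path(init_script)
        if not script_path.exists():
            raise ValueError("Init script path not found")
        elif not script_path.is_file():
            raise ValueError("Init script is not a file")
        elif not script_path.is_absolute():
            raise ValueError("Init script is not a absolute path")
        return init_script


    # check_init_script("/home/user/scripts/init.js")  # accepted if that file exists
    # check_init_script("init.js")                     # rejected: not an absolute path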
@@ -86,10 +93,12 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
     block_webrtc: bool = False
     allow_webgl: bool = True
     network_idle: bool = False
+    load_dom: bool = True
     humanize: bool | float = True
     solve_cloudflare: bool = False
     wait: int | float = 0
     timeout: int | float = 30000
+    init_script: Optional[str] = None
     page_action: Optional[Callable] = None
     wait_selector: Optional[str] = None
     addons: Optional[List[str]] = None
@@ -97,9 +106,7 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
     cookies: Optional[List[Dict]] = None
     google_search: bool = True
     extra_headers: Optional[Dict[str, str]] = None
-    proxy: Optional[str | Dict[str, str]] = (
-        None  # The default value for proxy in Playwright's source is `None`
-    )
+    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
     os_randomize: bool = False
     disable_ads: bool = False
     geoip: bool = False
@@ -112,10 +119,8 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
             raise ValueError("max_pages must be between 1 and 50")
         if self.timeout < 0:
             raise ValueError("timeout must be >= 0")
-        if self.page_action is not None and not callable(self.page_action):
-            raise TypeError(
-                f"page_action must be callable, got {type(self.page_action).__name__}"
-            )
+        if self.page_action and not callable(self.page_action):
+            raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
         if self.proxy:
             self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
 
@@ -131,6 +136,15 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
                     f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon}"
                 )
 
+        if self.init_script is not None:
+            script_path = Path(self.init_script)
+            if not script_path.exists():
+                raise ValueError("Init script path not found")
+            elif not script_path.is_file():
+                raise ValueError("Init script is not a file")
+            elif not script_path.is_absolute():
+                raise ValueError("Init script is not a absolute path")
+
         if not self.cookies:
             self.cookies = []
         if self.solve_cloudflare and self.timeout < 60_000:
@@ -16,9 +16,9 @@ HARMFUL_DEFAULT_ARGS = (
     # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
     "--enable-automation",
     "--disable-popup-blocking",
-    # '--disable-component-update',
-    # '--disable-default-apps',
-    # '--disable-extensions',
+    "--disable-component-update",
+    "--disable-default-apps",
+    "--disable-extensions",
 )
 
 DEFAULT_FLAGS = (
@@ -50,7 +50,6 @@ DEFAULT_STEALTH_FLAGS = (
     "--accept-lang=en-US",
     "--use-mock-keychain",
     "--disable-translate",
-    "--disable-extensions",
     "--disable-voice-input",
     "--window-position=0,0",
     "--disable-wake-on-wifi",
@@ -59,7 +58,6 @@ DEFAULT_STEALTH_FLAGS = (
     "--enable-web-bluetooth",
     "--disable-hang-monitor",
     "--disable-cloud-import",
-    "--disable-default-apps",
     "--disable-print-preview",
     "--disable-dev-shm-usage",
     # '--disable-popup-blocking',
@@ -72,7 +70,6 @@ DEFAULT_STEALTH_FLAGS = (
     "--force-color-profile=srgb",
     "--font-render-hinting=none",
     "--aggressive-cache-discard",
-    "--disable-component-update",
     "--disable-cookie-encryption",
     "--disable-domain-reliability",
     "--disable-threaded-animation",
@@ -24,13 +24,9 @@ from scrapling.core._types import (
     Any,
 )
 
-from .toolbelt import (
-    Response,
-    generate_convincing_referer,
-    generate_headers,
-    ResponseFactory,
-    __default_useragent__,
-)
+from .toolbelt.custom import Response
+from .toolbelt.convertor import ResponseFactory
+from .toolbelt.fingerprints import generate_convincing_referer, generate_headers, __default_useragent__
 
 _UNSET = object()
 
@@ -108,13 +104,9 @@ class FetcherSession:
 
         headers = self.get_with_precedence(kwargs, "headers", self.default_headers)
         stealth = self.get_with_precedence(kwargs, "stealth", self.stealth)
-        impersonate = self.get_with_precedence(
-            kwargs, "impersonate", self.default_impersonate
-        )
+        impersonate = self.get_with_precedence(kwargs, "impersonate", self.default_impersonate)
 
-        if self.get_with_precedence(
-            kwargs, "http3", self.default_http3
-        ):  # pragma: no cover
+        if self.get_with_precedence(kwargs, "http3", self.default_http3):  # pragma: no cover
             request_args["http_version"] = CurlHttpVersion.V3ONLY
         if impersonate:
             log.warning(
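Note: the reflowed lines above all funnel through FetcherSession.get_with_precedence(kwargs, name, session_default), which lets a per-request keyword override the session-wide default. Its body is not shown in this diff; a typical sentinel-based implementation consistent with the _UNSET = object() marker above might look like the following (illustrative only).

    # Assumed shape of the precedence helper; _UNSET marks "caller did not pass this".
    _UNSET = object()


    def get_with_precedence(kwargs: dict, key: str, session_default):
        """Return the per-request value when supplied, otherwise the session default."""
        value = kwargs.pop(key, _UNSET)
        return session_default if value is _UNSET else value


    print(get_with_precedence({"timeout": 5000}, "timeout", 30000))  # -> 5000
    print(get_with_precedence({}, "timeout", 30000))                 # -> 30000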
@@ -126,25 +118,13 @@ class FetcherSession:
             "url": url,
             # Curl automatically generates the suitable browser headers when you use `impersonate`
             "headers": self._headers_job(url, headers, stealth, bool(impersonate)),
-            "proxies": self.get_with_precedence(
-                kwargs, "proxies", self.default_proxies
-            ),
+            "proxies": self.get_with_precedence(kwargs, "proxies", self.default_proxies),
             "proxy": self.get_with_precedence(kwargs, "proxy", self.default_proxy),
-            "proxy_auth": self.get_with_precedence(
-                kwargs, "proxy_auth", self.default_proxy_auth
-            ),
-            "timeout": self.get_with_precedence(
-                kwargs, "timeout", self.default_timeout
-            ),
-            "allow_redirects": self.get_with_precedence(
-                kwargs, "allow_redirects", self.default_follow_redirects
-            ),
-            "max_redirects": self.get_with_precedence(
-                kwargs, "max_redirects", self.default_max_redirects
-            ),
-            "verify": self.get_with_precedence(
-                kwargs, "verify", self.default_verify
-            ),
+            "proxy_auth": self.get_with_precedence(kwargs, "proxy_auth", self.default_proxy_auth),
+            "timeout": self.get_with_precedence(kwargs, "timeout", self.default_timeout),
+            "allow_redirects": self.get_with_precedence(kwargs, "allow_redirects", self.default_follow_redirects),
+            "max_redirects": self.get_with_precedence(kwargs, "max_redirects", self.default_max_redirects),
+            "verify": self.get_with_precedence(kwargs, "verify", self.default_verify),
             "cert": self.get_with_precedence(kwargs, "cert", self.default_cert),
             "impersonate": impersonate,
             **{
@@ -192,18 +172,12 @@ class FetcherSession:
 
             extra_headers = generate_headers(browser_mode=False)
             # Don't overwrite user-supplied headers
-            extra_headers = {
-                key: value
-                for key, value in extra_headers.items()
-                if key.lower() not in headers_keys
-            }
+            extra_headers = {key: value for key, value in extra_headers.items() if key.lower() not in headers_keys}
             headers.update(extra_headers)
 
         elif "user-agent" not in headers_keys and not impersonate_enabled:
             headers["User-Agent"] = __default_useragent__
-            log.debug(
-                f"Can't find useragent in headers so '{headers['User-Agent']}' was used."
-            )
+            log.debug(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
 
         return headers
 
@@ -215,9 +189,7 @@ class FetcherSession:
                 "Create a new FetcherSession instance for a new independent session, "
                 "or use the current instance sequentially after the previous context has exited."
             )
-        if (
-            self._async_curl_session
-        ):  # Prevent mixing if async is active from this instance
+        if self._async_curl_session:  # Prevent mixing if async is active from this instance
             raise RuntimeError(
                 "This FetcherSession instance has an active asynchronous session. "
                 "Cannot enter a synchronous context simultaneously with the same manager instance."
@@ -275,9 +247,7 @@ class FetcherSession:
         :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
         """
         session = self._curl_session
-        if session is True and not any(
-            (self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
-        ):
+        if session is True and not any((self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)):
             # For usage inside FetcherClient
             # It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
             session = CurlSession()
@@ -290,9 +260,7 @@ class FetcherSession:
                     return ResponseFactory.from_http_request(response, selector_config)
             except CurlError as e:  # pragma: no cover
                 if attempt < max_retries - 1:
-                    log.error(
-                        f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds..."
-                    )
+                    log.error(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
                     time_sleep(retry_delay)
                 else:
                     log.error(f"Failed after {max_retries} attempts: {e}")
@@ -320,9 +288,7 @@ class FetcherSession:
         :return: A `Response` object for synchronous requests or an awaitable for asynchronous.
         """
         session = self._async_curl_session
-        if session is True and not any(
-            (self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)
-        ):
+        if session is True and not any((self.__enter__, self.__exit__, self.__aenter__, self.__aexit__)):
             # For usage inside the ` AsyncFetcherClient ` class, and that's for several reasons
             # 1. It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.
             # 2. `curl_cffi` doesn't support making async requests without sessions
@@ -337,9 +303,7 @@ class FetcherSession:
                     return ResponseFactory.from_http_request(response, selector_config)
             except CurlError as e:  # pragma: no cover
                 if attempt < max_retries - 1:
-                    log.error(
-                        f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds..."
-                    )
+                    log.error(f"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...")
                     await asyncio_sleep(retry_delay)
                 else:
                     log.error(f"Failed after {max_retries} attempts: {e}")
@@ -372,19 +336,13 @@ class FetcherSession:
 
         selector_config = kwargs.pop("selector_config", {}) or self.selector_config
         max_retries = self.get_with_precedence(kwargs, "retries", self.default_retries)
-        retry_delay = self.get_with_precedence(
-            kwargs, "retry_delay", self.default_retry_delay
-        )
+        retry_delay = self.get_with_precedence(kwargs, "retry_delay", self.default_retry_delay)
         request_args = self._merge_request_args(stealth=stealth, **kwargs)
         if self._curl_session:
-            return self.__make_request(
-                method, request_args, max_retries, retry_delay, selector_config
-            )
+            return self.__make_request(method, request_args, max_retries, retry_delay, selector_config)
         elif self._async_curl_session:
             # The returned value is a Coroutine
-            return self.__make_async_request(
-                method, request_args, max_retries, retry_delay, selector_config
-            )
+            return self.__make_async_request(method, request_args, max_retries, retry_delay, selector_config)
 
         raise RuntimeError("No active session available.")
 
@@ -455,9 +413,7 @@ class FetcherSession:
             "http3": http3,
             **kwargs,
         }
-        return self.__prepare_and_dispatch(
-            "GET", stealth=stealthy_headers, **request_args
-        )
+        return self.__prepare_and_dispatch("GET", stealth=stealthy_headers, **request_args)
 
     def post(
         self,
@@ -532,9 +488,7 @@ class FetcherSession:
             "http3": http3,
             **kwargs,
         }
-        return self.__prepare_and_dispatch(
-            "POST", stealth=stealthy_headers, **request_args
-        )
+        return self.__prepare_and_dispatch("POST", stealth=stealthy_headers, **request_args)
 
     def put(
         self,
@@ -609,9 +563,7 @@ class FetcherSession:
             "http3": http3,
             **kwargs,
         }
-        return self.__prepare_and_dispatch(
-            "PUT", stealth=stealthy_headers, **request_args
-        )
+        return self.__prepare_and_dispatch("PUT", stealth=stealthy_headers, **request_args)
 
     def delete(
         self,
@@ -688,9 +640,7 @@ class FetcherSession:
             "http3": http3,
             **kwargs,
         }
-        return self.__prepare_and_dispatch(
-            "DELETE", stealth=stealthy_headers, **request_args
-        )
+        return self.__prepare_and_dispatch("DELETE", stealth=stealthy_headers, **request_args)
 
 
 class FetcherClient(FetcherSession):
@@ -1,20 +1 @@
-from .custom import (
-    BaseFetcher,
-    Response,
-    StatusText,
-    get_variable_name,
-)
-from .fingerprints import (
-    generate_convincing_referer,
-    generate_headers,
-    get_os_name,
-    __default_useragent__,
-)
-from .navigation import (
-    async_intercept_route,
-    construct_cdp_url,
-    construct_proxy_dict,
-    intercept_route,
-    js_bypass_path,
-)
-from .convertor import ResponseFactory
+
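Note: this last hunk empties scrapling/engines/toolbelt/__init__.py, so the 0.3 convenience re-exports are gone in 0.3.2. Code that imported those helpers from the toolbelt package root now has to target the submodules directly, exactly as the _validators.py and static.py hunks above already do:

    # scrapling 0.3
    from scrapling.engines.toolbelt import construct_proxy_dict, ResponseFactory

    # scrapling 0.3.2
    from scrapling.engines.toolbelt.navigation import construct_proxy_dict
    from scrapling.engines.toolbelt.convertor import ResponseFactory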