scrapling 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ from functools import lru_cache
 from re import compile as re_compile

 from curl_cffi.requests import Response as CurlResponse
+from playwright._impl._errors import Error as PlaywrightError
 from playwright.sync_api import Page as SyncPage, Response as SyncResponse
 from playwright.async_api import Page as AsyncPage, Response as AsyncResponse

@@ -24,15 +25,15 @@ class ResponseFactory:

     @classmethod
     @lru_cache(maxsize=16)
-    def __extract_browser_encoding(cls, content_type: str | None) -> Optional[str]:
+    def __extract_browser_encoding(cls, content_type: str | None, default: str = "utf-8") -> str:
         """Extract browser encoding from headers.

         Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8"
         """
         if content_type:
             # Because Playwright can't do that by themselves like all libraries for some reason :3
             match = __CHARSET_RE__.search(content_type)
-            return match.group(1) if match else None
-        return None
+            return match.group(1) if match else default
+        return default

     @classmethod
     def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
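Note: with the new `default` parameter, `__extract_browser_encoding` always returns a string, so call sites no longer need to append `or "utf-8"`. A minimal standalone sketch of the same pattern (the regex and names here are illustrative stand-ins, not scrapling's actual `__CHARSET_RE__`):

    from functools import lru_cache
    from re import compile as re_compile

    CHARSET_RE = re_compile(r"charset=([\w-]+)")  # stand-in pattern

    @lru_cache(maxsize=16)
    def extract_encoding(content_type: str | None, default: str = "utf-8") -> str:
        """Return the charset from a Content-Type header, or `default`."""
        if content_type:
            match = CHARSET_RE.search(content_type)
            return match.group(1) if match else default
        return default

    assert extract_encoding("text/html; charset=ISO-8859-1") == "ISO-8859-1"
    assert extract_encoding("text/html") == "utf-8"  # no charset -> default
    assert extract_encoding(None) == "utf-8"         # missing header -> default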
@@ -58,7 +59,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": current_response.all_headers() if current_response else {},
                 "request_headers": current_request.all_headers(),
@@ -83,6 +85,7 @@ class ResponseFactory:
         first_response: SyncResponse,
         final_response: Optional[SyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -98,6 +101,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, the `page_action` argument was used, so content is retrieved through Playwright's page instead of the final response.

         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -107,15 +111,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")

-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)

         history = cls._process_response_history(first_response, parser_arguments)
         try:
-            page_content = page.content()
+            page_content = final_response.text() if not automated_page else cls._get_page_content(page)
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content: {e}")
             page_content = ""
@@ -161,7 +163,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": await current_response.all_headers() if current_response else {},
                 "request_headers": await current_request.all_headers(),
@@ -179,6 +182,36 @@ class ResponseFactory:

         return history

+    @classmethod
+    def _get_page_content(cls, page: SyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return page.content() or ""
+            except PlaywrightError:
+                page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
+    @classmethod
+    async def _get_async_page_content(cls, page: AsyncPage) -> str:
+        """
+        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return (await page.content()) or ""
+            except PlaywrightError:
+                await page.wait_for_timeout(500)
+                continue
+        return ""  # pyright: ignore
+
     @classmethod
     async def from_async_playwright_response(
         cls,
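Note that these helpers retry indefinitely: if `page.content()` keeps raising (for example, the page never stops navigating), the loop never exits, which is also why the trailing `return ""` is unreachable and carries the `# pyright: ignore`. A bounded variant of the same workaround is easy to sketch; `max_attempts` is an illustrative addition, not part of scrapling's API:

    from playwright.sync_api import Page
    from playwright._impl._errors import Error as PlaywrightError

    def get_page_content(page: Page, max_attempts: int = 10) -> str:
        """Retry page.content() around transient 'page is navigating'
        errors (see microsoft/playwright#16108)."""
        for _ in range(max_attempts):
            try:
                return page.content() or ""
            except PlaywrightError:
                page.wait_for_timeout(500)  # let the in-flight navigation settle
        return ""  # give up after max_attempts instead of looping forever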
@@ -186,6 +219,7 @@ class ResponseFactory:
         first_response: AsyncResponse,
         final_response: Optional[AsyncResponse],
         parser_arguments: Dict,
+        automated_page: bool = False,
     ) -> Response:
         """
         Transforms a Playwright response into an internal `Response` object, encapsulating
@@ -201,6 +235,7 @@ class ResponseFactory:
         :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.
         :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. These arguments are dynamically unpacked into
             the `Response` object.
+        :param automated_page: If True, the `page_action` argument was used, so content is retrieved through Playwright's page instead of the final response.

         :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.
         :rtype: Response
@@ -210,15 +245,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")

-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        )  # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)

         history = await cls._async_process_response_history(first_response, parser_arguments)
         try:
-            page_content = await page.content()
+            page_content = await (final_response.text() if not automated_page else cls._get_async_page_content(page))
         except Exception as e:  # pragma: no cover
             log.error(f"Error getting page content in async: {e}")
             page_content = ""
@@ -255,8 +288,8 @@ class ResponseFactory:
             "encoding": response.encoding or "utf-8",
             "cookies": dict(response.cookies),
             "headers": dict(response.headers),
-            "request_headers": dict(response.request.headers),
-            "method": response.request.method,
+            "request_headers": dict(response.request.headers) if response.request else {},
+            "method": response.request.method if response.request else "GET",
             "history": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82
             **parser_arguments,
         }
@@ -8,6 +8,7 @@ from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
+    cast,
     List,
     Optional,
     Tuple,
@@ -30,10 +31,10 @@ class Response(Selector):
         request_headers: Dict,
         encoding: str = "utf-8",
         method: str = "GET",
-        history: List = None,
-        **selector_config: Dict,
+        history: List | None = None,
+        **selector_config: Any,
     ):
-        adaptive_domain = selector_config.pop("adaptive_domain", None)
+        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
         self.status = status
         self.reason = reason
         self.cookies = cookies
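`cast` performs no runtime conversion or check; it only tells the type checker what to assume about the value popped from the now-untyped `**selector_config`, and the default becomes `""` instead of `None` to match the narrowed `str` type. A minimal sketch of the pattern (names are illustrative):

    from typing import Any, cast

    def build(**config: Any) -> None:
        # pop() on **config returns Any; cast() narrows it for the type
        # checker without any runtime effect.
        domain: str = cast(str, config.pop("adaptive_domain", ""))
        print(domain.upper())  # str methods now type-check

    build(adaptive_domain="example.com", other_option=True)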
@@ -58,7 +59,7 @@ class BaseFetcher:
     keep_cdata: Optional[bool] = False
     storage_args: Optional[Dict] = None
     keep_comments: Optional[bool] = False
-    adaptive_domain: Optional[str] = None
+    adaptive_domain: str = ""
     parser_keywords: Tuple = (
         "huge_tree",
         "adaptive",
@@ -124,12 +125,8 @@ class BaseFetcher:
             adaptive=cls.adaptive,
             storage=cls.storage,
             storage_args=cls.storage_args,
+            adaptive_domain=cls.adaptive_domain,
         )
-        if cls.adaptive_domain:
-            if not isinstance(cls.adaptive_domain, str):
-                log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
-            else:
-                parser_arguments.update({"adaptive_domain": cls.adaptive_domain})

         return parser_arguments

@@ -212,15 +209,3 @@ class StatusText:
     def get(cls, status_code: int) -> str:
         """Get the phrase for a given HTTP status code."""
         return cls._phrases.get(status_code, "Unknown Status Code")
-
-
-def get_variable_name(var: Any) -> Optional[str]:
-    """Get the name of a variable using global and local scopes.
-    :param var: The variable to find the name for
-    :return: The name of the variable if found, None otherwise
-    """
-    for scope in [globals(), locals()]:
-        for name, value in scope.items():
-            if value is var:
-                return name
-    return None
@@ -7,10 +7,12 @@ from platform import system as platform_system

 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
+from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS

-from scrapling.core._types import Dict, Optional
+from scrapling.core._types import Dict, Literal, Tuple

 __OS_NAME__ = platform_system()
+OSName = Literal["linux", "macos", "windows"]


 @lru_cache(10, typed=True)
@@ -28,16 +30,20 @@ def generate_convincing_referer(url: str) -> str:


 @lru_cache(1, typed=True)
-def get_os_name() -> Optional[str]:
-    """Get the current OS name in the same format needed for browserforge
+def get_os_name() -> OSName | Tuple:
+    """Get the current OS name in the format browserforge needs; if the OS is unknown, return all supported OSes so browserforge samples from all of them.

     :return: Current OS name or `None` otherwise
     """
-    return {
-        "Linux": "linux",
-        "Darwin": "macos",
-        "Windows": "windows",
-    }.get(__OS_NAME__)
+    match __OS_NAME__:  # pragma: no cover
+        case "Linux":
+            return "linux"
+        case "Darwin":
+            return "macos"
+        case "Windows":
+            return "windows"
+        case _:
+            return SUPPORTED_OPERATING_SYSTEMS


 def generate_headers(browser_mode: bool = False) -> Dict:
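The fallback change matters on platforms `platform_system()` reports as something unexpected (for example, "Java" on Jython, or an empty string in some containers): the old code handed browserforge `None`, while the new code hands it the full `SUPPORTED_OPERATING_SYSTEMS` tuple so the fingerprint OS is sampled from all of them. A hedged usage sketch with browserforge's public API:

    from browserforge.headers import HeaderGenerator
    from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS

    # A known platform pins the fingerprint to one OS...
    print(HeaderGenerator(os="linux", device="desktop").generate().get("User-Agent"))

    # ...while the unknown-platform fallback passes every supported OS,
    # letting browserforge pick one at random per generation.
    print(HeaderGenerator(os=SUPPORTED_OPERATING_SYSTEMS, device="desktop").generate().get("User-Agent"))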
@@ -58,7 +64,6 @@ def generate_headers(browser_mode: bool = False) -> Dict:
             Browser(name="edge", min_version=130),
         ]
     )
-
     return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()

@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route

 from scrapling.core.utils import log
-from scrapling.core._types import Dict, Optional, Tuple
+from scrapling.core._types import Dict, Tuple, overload, Literal

 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES

 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -49,7 +49,15 @@ async def async_intercept_route(route: async_Route):
         await route.continue_()


-def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Optional[Dict | Tuple]:
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[True]) -> Tuple: ...
+
+
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[False] = False) -> Dict: ...
+
+
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: bool = False) -> Dict | Tuple:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy

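The two `@overload` stubs let a type checker resolve the return type from the literal value of `as_tuple` at each call site, so callers no longer have to narrow `Dict | Tuple` (or the old `Optional`) themselves. A self-contained sketch of the technique with a toy parser, not scrapling's implementation:

    from typing import Dict, Literal, Tuple, overload

    @overload
    def parse(raw: str, as_tuple: Literal[True]) -> Tuple: ...
    @overload
    def parse(raw: str, as_tuple: Literal[False] = False) -> Dict: ...

    def parse(raw: str, as_tuple: bool = False) -> Dict | Tuple:
        # One runtime implementation; the overloads above only describe
        # its behavior more precisely to the type checker.
        pairs = dict(item.split("=", 1) for item in raw.split(";") if "=" in item)
        return tuple(pairs.items()) if as_tuple else pairs

    d = parse("server=http://1.2.3.4:8080")                 # inferred as Dict
    t = parse("server=http://1.2.3.4:8080", as_tuple=True)  # inferred as Tuple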
@@ -83,7 +91,7 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
         except ValidationError as e:
             raise TypeError(f"Invalid proxy dictionary: {e}")

-    return None
+    raise TypeError(f"Invalid proxy string: {proxy_string}")


 @lru_cache(10, typed=True)
@@ -19,7 +19,17 @@ _LAZY_IMPORTS = {
     "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
 }

-__all__ = ["Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
+__all__ = [
+    "Fetcher",
+    "AsyncFetcher",
+    "FetcherSession",
+    "DynamicFetcher",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthyFetcher",
+    "StealthySession",
+    "AsyncStealthySession",
+]


 def __getattr__(name: str) -> Any:
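The widened `__all__` now matches the keys of `_LAZY_IMPORTS`, which together with the module-level `__getattr__` implements PEP 562 lazy imports: heavy submodules load on first attribute access, not at `import scrapling.fetchers`. A minimal sketch of how such an `__init__.py` typically works (the module path is illustrative; scrapling's actual body may differ):

    from importlib import import_module
    from typing import Any

    _LAZY_IMPORTS = {
        "Fetcher": ("scrapling.fetchers.requests", "Fetcher"),  # illustrative path
    }
    __all__ = list(_LAZY_IMPORTS)

    def __getattr__(name: str) -> Any:
        # Called only when normal lookup fails, so the submodule is
        # imported on first access instead of at package import time.
        if name in _LAZY_IMPORTS:
            module_path, attr = _LAZY_IMPORTS[name]
            return getattr(import_module(module_path), attr)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")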
@@ -1,10 +1,9 @@
 from scrapling.core._types import (
     Callable,
-    Dict,
     List,
+    Dict,
     Optional,
     SelectorWaitStates,
-    Iterable,
 )
 from scrapling.engines.toolbelt.custom import BaseFetcher, Response
 from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
@@ -47,10 +46,12 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        extra_flags: Optional[List[str]] = None,
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -79,7 +80,9 @@ class DynamicFetcher(BaseFetcher):
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional settings to pass to Playwright's context; they take priority over Scrapling's own settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -107,6 +110,8 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
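Together, the two new parameters let a caller tune both the browser launch and the Playwright context per request. A hedged usage sketch (flag and option values are examples only, and `DynamicFetcher.fetch` is assumed as the public entry point):

    from scrapling.fetchers import DynamicFetcher

    page = DynamicFetcher.fetch(
        "https://example.com",
        extra_flags=["--disable-gpu"],        # passed to the browser on launch
        additional_args={"locale": "de-DE"},  # merged into Playwright's context,
                                              # overriding Scrapling's own settings
    )
    print(page.status)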
@@ -134,10 +139,12 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        extra_flags: Optional[List[str]] = None,
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -166,7 +173,9 @@ class DynamicFetcher(BaseFetcher):
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param extra_flags: A list of additional browser flags to pass to the browser on launch.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional settings to pass to Playwright's context; they take priority over Scrapling's own settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -195,6 +204,8 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            extra_flags=extra_flags,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
@@ -83,8 +83,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

         with StealthySession(
             wait=wait,
@@ -182,8 +180,6 @@
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")

         async with AsyncStealthySession(
             wait=wait,
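Worth noting about the removed branches: they were dead code, since `ValueError(...)` constructed the exception without `raise` (a silent no-op), and the message interpolated `cls.__class__` (the metaclass) rather than the offending value's type. Had the validation been kept, a working form would look like this sketch:

    if not custom_config:
        custom_config = {}
    elif not isinstance(custom_config, dict):
        # `raise` was missing in the removed code; type(custom_config)
        # names the actual bad type, unlike cls.__class__.
        raise TypeError(
            f"The custom parser config must be a dictionary, got {type(custom_config)}"
        )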