scrapling 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,15 +24,15 @@ class ResponseFactory:
 
     @classmethod
     @lru_cache(maxsize=16)
-    def __extract_browser_encoding(cls, content_type: str | None) -> Optional[str]:
+    def __extract_browser_encoding(cls, content_type: str | None, default: str = "utf-8") -> str:
         """Extract browser encoding from headers.
         Ex: from header "content-type: text/html; charset=utf-8" -> "utf-8
         """
         if content_type:
             # Because Playwright can't do that by themselves like all libraries for some reason :3
             match = __CHARSET_RE__.search(content_type)
-            return match.group(1) if match else None
-        return None
+            return match.group(1) if match else default
+        return default
 
     @classmethod
     def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:
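
Note: `__extract_browser_encoding` now always returns a string, so callers can drop their `or "utf-8"` fallbacks. A minimal standalone sketch of the same pattern (the regex and names here are illustrative stand-ins, not scrapling's internals):

    import re
    from functools import lru_cache

    # Illustrative stand-in for scrapling's private __CHARSET_RE__ pattern
    _CHARSET_RE = re.compile(r"charset=([\w-]+)", re.I)

    @lru_cache(maxsize=16)
    def extract_encoding(content_type: str | None, default: str = "utf-8") -> str:
        # Always returns a string: the charset if present, otherwise the default
        if content_type:
            match = _CHARSET_RE.search(content_type)
            return match.group(1) if match else default
        return default

    assert extract_encoding("text/html; charset=ISO-8859-1") == "ISO-8859-1"
    assert extract_encoding(None) == "utf-8"
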
@@ -58,7 +58,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": current_response.all_headers() if current_response else {},
                 "request_headers": current_request.all_headers(),
@@ -107,15 +108,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        ) # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = cls._process_response_history(first_response, parser_arguments)
         try:
-            page_content = page.content()
+            page_content = final_response.text()
         except Exception as e: # pragma: no cover
             log.error(f"Error getting page content: {e}")
            page_content = ""
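
Note: switching from `page.content()` to `final_response.text()` changes what "content" means here: `page.content()` serializes the live DOM after scripts have run, while `Response.text()` returns the body the server actually sent. A hedged Playwright sketch of the difference (URL is illustrative); the async hunk below makes the same change:

    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        response = page.goto("https://example.com")  # illustrative URL
        raw_body = response.text()  # body as served over the network
        live_dom = page.content()   # DOM serialized after JavaScript ran
        browser.close()
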
@@ -161,7 +160,8 @@ class ResponseFactory:
                 "encoding": cls.__extract_browser_encoding(
                     current_response.headers.get("content-type", "")
                 )
-                or "utf-8",
+                if current_response
+                else "utf-8",
                 "cookies": tuple(),
                 "headers": await current_response.all_headers() if current_response else {},
                 "request_headers": await current_request.all_headers(),
@@ -210,15 +210,13 @@ class ResponseFactory:
         if not final_response:
             raise ValueError("Failed to get a response from the page")
 
-        encoding = (
-            cls.__extract_browser_encoding(final_response.headers.get("content-type", "")) or "utf-8"
-        ) # default encoding
+        encoding = cls.__extract_browser_encoding(final_response.headers.get("content-type", ""))
         # PlayWright API sometimes give empty status text for some reason!
         status_text = final_response.status_text or StatusText.get(final_response.status)
 
         history = await cls._async_process_response_history(first_response, parser_arguments)
         try:
-            page_content = await page.content()
+            page_content = await final_response.text()
         except Exception as e: # pragma: no cover
             log.error(f"Error getting page content in async: {e}")
             page_content = ""
@@ -255,8 +253,8 @@ class ResponseFactory:
             "encoding": response.encoding or "utf-8",
             "cookies": dict(response.cookies),
             "headers": dict(response.headers),
-            "request_headers": dict(response.request.headers),
-            "method": response.request.method,
+            "request_headers": dict(response.request.headers) if response.request else {},
+            "method": response.request.method if response.request else "GET",
             "history": response.history, # https://github.com/lexiforest/curl_cffi/issues/82
             **parser_arguments,
         }
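
Note: the curl_cffi mapping now tolerates a missing `response.request`. A trivial sketch of the guard (the `None` stand-in is illustrative):

    request = None  # stand-in for a curl_cffi response.request that was never set

    request_headers = dict(request.headers) if request else {}
    method = request.method if request else "GET"  # assume GET when unknown

    assert (request_headers, method) == ({}, "GET")
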
@@ -8,6 +8,7 @@ from scrapling.core.utils import log
 from scrapling.core._types import (
     Any,
     Dict,
+    cast,
     List,
     Optional,
     Tuple,
@@ -30,10 +31,10 @@ class Response(Selector):
         request_headers: Dict,
         encoding: str = "utf-8",
         method: str = "GET",
-        history: List = None,
-        **selector_config: Dict,
+        history: List | None = None,
+        **selector_config: Any,
     ):
-        adaptive_domain = selector_config.pop("adaptive_domain", None)
+        adaptive_domain: str = cast(str, selector_config.pop("adaptive_domain", ""))
         self.status = status
         self.reason = reason
         self.cookies = cookies
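
Note: `cast` only narrows the type for static checkers; at runtime it returns its argument unchanged. A self-contained sketch of the new `adaptive_domain` handling (function name is hypothetical):

    from typing import Any, cast

    def pop_adaptive_domain(**selector_config: Any) -> str:
        # cast() is a no-op at runtime; it just tells the type checker
        # that the popped value is a str (default is now "" instead of None)
        return cast(str, selector_config.pop("adaptive_domain", ""))

    assert pop_adaptive_domain(adaptive_domain="example.com") == "example.com"
    assert pop_adaptive_domain() == ""
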
@@ -58,7 +59,7 @@ class BaseFetcher:
     keep_cdata: Optional[bool] = False
     storage_args: Optional[Dict] = None
     keep_comments: Optional[bool] = False
-    adaptive_domain: Optional[str] = None
+    adaptive_domain: str = ""
     parser_keywords: Tuple = (
         "huge_tree",
         "adaptive",
@@ -124,12 +125,8 @@ class BaseFetcher:
             adaptive=cls.adaptive,
             storage=cls.storage,
             storage_args=cls.storage_args,
+            adaptive_domain=cls.adaptive_domain,
         )
-        if cls.adaptive_domain:
-            if not isinstance(cls.adaptive_domain, str):
-                log.warning('[Ignored] The argument "adaptive_domain" must be of string type')
-            else:
-                parser_arguments.update({"adaptive_domain": cls.adaptive_domain})
 
         return parser_arguments
 
@@ -8,9 +8,10 @@ from platform import system as platform_system
 from tldextract import extract
 from browserforge.headers import Browser, HeaderGenerator
 
-from scrapling.core._types import Dict, Optional
+from scrapling.core._types import Dict, Literal
 
 __OS_NAME__ = platform_system()
+OSName = Literal["linux", "macos", "windows"]
 
 
 @lru_cache(10, typed=True)
@@ -28,16 +29,20 @@ def generate_convincing_referer(url: str) -> str:
 
 
 @lru_cache(1, typed=True)
-def get_os_name() -> Optional[str]:
-    """Get the current OS name in the same format needed for browserforge
+def get_os_name() -> OSName | None:
+    """Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.
 
     :return: Current OS name or `None` otherwise
     """
-    return {
-        "Linux": "linux",
-        "Darwin": "macos",
-        "Windows": "windows",
-    }.get(__OS_NAME__)
+    match __OS_NAME__:
+        case "Linux":
+            return "linux"
+        case "Darwin":
+            return "macos"
+        case "Windows":
+            return "windows"
+        case _:
+            return None
 
 
 def generate_headers(browser_mode: bool = False) -> Dict:
@@ -58,8 +63,10 @@ def generate_headers(browser_mode: bool = False) -> Dict:
             Browser(name="edge", min_version=130),
         ]
     )
-
-    return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
+    if os_name:
+        return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
+    else:
+        return HeaderGenerator(browser=browsers, device="desktop").generate()
 
 
 __default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
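
Note: the new branch avoids passing `os=None` to browserforge; omitting `os` lets `HeaderGenerator` pick among all supported operating systems. A hedged usage sketch (assuming browserforge is installed):

    from browserforge.headers import HeaderGenerator

    # With an explicit OS the headers are pinned to that platform; without
    # one, browserforge samples from all supported operating systems.
    pinned = HeaderGenerator(os="linux", device="desktop").generate()
    anywhere = HeaderGenerator(device="desktop").generate()
    print(pinned.get("User-Agent"))
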
@@ -11,7 +11,7 @@ from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route
 
 from scrapling.core.utils import log
-from scrapling.core._types import Dict, Optional, Tuple
+from scrapling.core._types import Dict, Tuple, overload, Literal
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
 __BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
@@ -49,7 +49,15 @@ async def async_intercept_route(route: async_Route):
     await route.continue_()
 
 
-def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) -> Optional[Dict | Tuple]:
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[True]) -> Tuple: ...
+
+
+@overload
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: Literal[False] = False) -> Dict: ...
+
+
+def construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple, as_tuple: bool = False) -> Dict | Tuple:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy
 
@@ -83,7 +91,7 @@ def construct_proxy_dict(proxy_string: str | Dict[str, str], as_tuple=False) ->
         except ValidationError as e:
             raise TypeError(f"Invalid proxy dictionary: {e}")
 
-    return None
+    raise TypeError(f"Invalid proxy string: {proxy_string}")
 
 
 @lru_cache(10, typed=True)
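
Note: the `@overload` pair ties the return type to the `as_tuple` flag, and invalid input now raises `TypeError` instead of silently returning `None`, so callers need `try/except` rather than a `None` check. A self-contained sketch of both ideas (the proxy logic is simplified and hypothetical, not scrapling's actual validation):

    from typing import Dict, Literal, Tuple, overload

    @overload
    def proxy_dict(server: str, as_tuple: Literal[True]) -> Tuple: ...
    @overload
    def proxy_dict(server: str, as_tuple: Literal[False] = False) -> Dict: ...
    def proxy_dict(server: str, as_tuple: bool = False) -> Dict | Tuple:
        # Simplified check; scrapling's real validation is stricter
        if not server.startswith(("http://", "https://", "socks5://")):
            raise TypeError(f"Invalid proxy string: {server}")
        result = {"server": server}
        return tuple(result.items()) if as_tuple else result

    d = proxy_dict("http://127.0.0.1:8080")        # type checkers infer Dict
    t = proxy_dict("http://127.0.0.1:8080", True)  # type checkers infer Tuple

    try:
        proxy_dict("not-a-proxy")
    except TypeError as exc:
        print(f"rejected: {exc}")
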
@@ -19,7 +19,17 @@ _LAZY_IMPORTS = {
     "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
 }
 
-__all__ = ["Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
+__all__ = [
+    "Fetcher",
+    "AsyncFetcher",
+    "FetcherSession",
+    "DynamicFetcher",
+    "DynamicSession",
+    "AsyncDynamicSession",
+    "StealthyFetcher",
+    "StealthySession",
+    "AsyncStealthySession",
+]
 
 
 def __getattr__(name: str) -> Any:
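
Note: `__all__` now advertises the session classes that were already reachable through `_LAZY_IMPORTS`. The lazy loading itself relies on a module-level `__getattr__` (PEP 562); a minimal sketch of the pattern, using the one mapping visible in this hunk:

    from importlib import import_module

    # One real entry from the hunk above; the full table has more
    _LAZY_IMPORTS = {
        "AsyncStealthySession": ("scrapling.fetchers.firefox", "AsyncStealthySession"),
    }

    def __getattr__(name: str):
        # Called only when `name` isn't found normally (PEP 562),
        # so the heavy submodule is imported on first attribute access
        if name in _LAZY_IMPORTS:
            module_path, attr = _LAZY_IMPORTS[name]
            return getattr(import_module(module_path), attr)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
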
@@ -1,10 +1,9 @@
 from scrapling.core._types import (
     Callable,
-    Dict,
     List,
+    Dict,
     Optional,
     SelectorWaitStates,
-    Iterable,
 )
 from scrapling.engines.toolbelt.custom import BaseFetcher, Response
 from scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession
@@ -47,10 +46,11 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -80,6 +80,7 @@ class DynamicFetcher(BaseFetcher):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -107,6 +108,7 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
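
Note: per the new docstring, `additional_args` is forwarded to Playwright's browser context and takes priority over Scrapling's own settings. A hedged usage sketch (the context options shown are standard Playwright kwargs; the URL is illustrative); the async hunks below wire up the same parameter:

    from scrapling.fetchers import DynamicFetcher

    # locale/timezone_id are ordinary Playwright context options
    page = DynamicFetcher.fetch(
        "https://example.com",
        additional_args={"locale": "de-DE", "timezone_id": "Europe/Berlin"},
    )
    print(page.status)
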
@@ -134,10 +136,11 @@ class DynamicFetcher(BaseFetcher):
         disable_resources: bool = False,
         wait_selector: Optional[str] = None,
         init_script: Optional[str] = None,
-        cookies: Optional[Iterable[Dict]] = None,
+        cookies: Optional[List[Dict]] = None,
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        additional_args: Optional[Dict] = None,
         custom_config: Optional[Dict] = None,
     ) -> Response:
         """Opens up a browser and do your request based on your chosen options below.
@@ -167,6 +170,7 @@ class DynamicFetcher(BaseFetcher):
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param custom_config: A dictionary of custom parser arguments to use with this request. Any argument passed will override any class parameters values.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         :return: A `Response` object.
         """
         if not custom_config:
@@ -195,6 +199,7 @@ class DynamicFetcher(BaseFetcher):
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
             selector_config={**cls._generate_parser_arguments(), **custom_config},
@@ -83,8 +83,6 @@ class StealthyFetcher(BaseFetcher):
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
         with StealthySession(
             wait=wait,
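
Note: the deleted branch constructed a `ValueError` without `raise`, so it never actually rejected anything; removing it is dead-code cleanup rather than a behavior change. For illustration (the next hunk removes the same branch from the async path):

    # The removed line built an exception object and discarded it — a no-op:
    ValueError("created but never raised")

    # A working check would have raised it:
    # raise ValueError("The custom parser config must be a dictionary")
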
@@ -182,8 +180,6 @@
         """
         if not custom_config:
             custom_config = {}
-        elif not isinstance(custom_config, dict):
-            ValueError(f"The custom parser config must be of type dictionary, got {cls.__class__}")
 
         async with AsyncStealthySession(
             wait=wait,