scrapling-0.2.9-py3-none-any.whl → scrapling-0.2.92-py3-none-any.whl

scrapling/__init__.py CHANGED
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
  from scrapling.parser import Adaptor, Adaptors

  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.9"
+ __version__ = "0.2.92"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"


scrapling/cli.py ADDED
@@ -0,0 +1,37 @@
+ import os
+ import subprocess
+ import sys
+ from pathlib import Path
+
+ import click
+
+
+ def get_package_dir():
+     return Path(os.path.dirname(__file__))
+
+
+ def run_command(command, line):
+     print(f"Installing {line}...")
+     _ = subprocess.check_call(command, shell=True)
+     # I meant to not use try except here
+
+
+ @click.command(help="Install all Scrapling's Fetchers dependencies")
+ def install():
+     if not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
+         run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
+         run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
+         run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
+         # if no errors raised by above commands, then we add below file
+         get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
+     else:
+         print('The dependencies are already installed')
+
+
+ @click.group()
+ def main():
+     pass
+
+
+ # Adding commands
+ main.add_command(install)
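The new command is exposed as the `scrapling = scrapling.cli:main` console script (see entry_points.txt at the end of this diff), so after installation `scrapling install` works from the shell. A minimal sketch of exercising the same click group programmatically with click's test runner (illustrative only; `--help` is used so the subprocess commands don't actually run):

```python
from click.testing import CliRunner

from scrapling.cli import main

# Lists the registered commands (including `install`) without running them.
runner = CliRunner()
result = runner.invoke(main, ["--help"])
print(result.output)
```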
scrapling/core/_types.py CHANGED
@@ -5,6 +5,8 @@ Type definitions for type checking purposes.
  from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
                      List, Literal, Optional, Pattern, Tuple, Type, Union)

+ SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
+
  try:
      from typing import Protocol
  except ImportError:
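This alias replaces the loose `str` annotations used for `wait_selector_state` throughout the codebase (see the engine and fetcher diffs below), so type checkers can reject invalid wait states at the call site. A quick illustration with a hypothetical `wait_for` helper:

```python
from typing import Literal

SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]

def wait_for(state: SelectorWaitStates) -> None:
    # Hypothetical helper: only the four literal strings type-check.
    print(f"waiting until the selector is {state}")

wait_for("visible")  # OK
wait_for("loaded")   # runs at runtime, but mypy/pyright flag the invalid literal
```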
scrapling/engines/camo.py CHANGED
@@ -3,7 +3,7 @@ from camoufox.async_api import AsyncCamoufox
  from camoufox.sync_api import Camoufox

  from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                    Union)
+                                    SelectorWaitStates, Union)
  from scrapling.core.utils import log
  from scrapling.engines.toolbelt import (Response, StatusText,
                                          async_intercept_route,
@@ -18,7 +18,7 @@ class CamoufoxEngine:
          self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
          block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
          timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-         wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+         wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
          proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
          geoip: Optional[bool] = False,
          adaptor_arguments: Dict = None,
@@ -84,6 +84,14 @@ class CamoufoxEngine:
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          addons = [] if self.disable_ads else [DefaultAddons.UBO]
+         # Store the final response
+         final_response = None
+
+         def handle_response(finished_response):
+             nonlocal final_response
+             if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                 final_response = finished_response
+
          with Camoufox(
              geoip=self.geoip,
              proxy=self.proxy,
@@ -100,13 +108,15 @@ class CamoufoxEngine:
              page = browser.new_page()
              page.set_default_navigation_timeout(self.timeout)
              page.set_default_timeout(self.timeout)
+             # Listen for all responses
+             page.on("response", handle_response)
              if self.disable_resources:
                  page.route("**/*", intercept_route)

              if self.extra_headers:
                  page.set_extra_http_headers(self.extra_headers)

-             res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
              page.wait_for_load_state(state="domcontentloaded")
              if self.network_idle:
                  page.wait_for_load_state('networkidle')
@@ -123,21 +133,23 @@ class CamoufoxEngine:
              if self.network_idle:
                  page.wait_for_load_state('networkidle')

+             # In case we didn't catch a document type somehow
+             final_response = final_response if final_response else first_response
              # This will be parsed inside `Response`
-             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+             encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
              # PlayWright API sometimes give empty status text for some reason!
-             status_text = res.status_text or StatusText.get(res.status)
+             status_text = final_response.status_text or StatusText.get(final_response.status)

              response = Response(
-                 url=res.url,
+                 url=page.url,
                  text=page.content(),
                  body=page.content().encode('utf-8'),
-                 status=res.status,
+                 status=final_response.status,
                  reason=status_text,
                  encoding=encoding,
                  cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                 headers=res.all_headers(),
-                 request_headers=res.request.all_headers(),
+                 headers=first_response.all_headers(),
+                 request_headers=first_response.request.all_headers(),
                  **self.adaptor_arguments
              )
              page.close()
@@ -151,6 +163,14 @@ class CamoufoxEngine:
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          addons = [] if self.disable_ads else [DefaultAddons.UBO]
+         # Store the final response
+         final_response = None
+
+         async def handle_response(finished_response):
+             nonlocal final_response
+             if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                 final_response = finished_response
+
          async with AsyncCamoufox(
              geoip=self.geoip,
              proxy=self.proxy,
@@ -167,13 +187,15 @@ class CamoufoxEngine:
              page = await browser.new_page()
              page.set_default_navigation_timeout(self.timeout)
              page.set_default_timeout(self.timeout)
+             # Listen for all responses
+             page.on("response", handle_response)
              if self.disable_resources:
                  await page.route("**/*", async_intercept_route)

              if self.extra_headers:
                  await page.set_extra_http_headers(self.extra_headers)

-             res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
              await page.wait_for_load_state(state="domcontentloaded")
              if self.network_idle:
                  await page.wait_for_load_state('networkidle')
@@ -190,21 +212,23 @@ class CamoufoxEngine:
              if self.network_idle:
                  await page.wait_for_load_state('networkidle')

+             # In case we didn't catch a document type somehow
+             final_response = final_response if final_response else first_response
              # This will be parsed inside `Response`
-             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+             encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
              # PlayWright API sometimes give empty status text for some reason!
-             status_text = res.status_text or StatusText.get(res.status)
+             status_text = final_response.status_text or StatusText.get(final_response.status)

              response = Response(
-                 url=res.url,
+                 url=page.url,
                  text=await page.content(),
                  body=(await page.content()).encode('utf-8'),
-                 status=res.status,
+                 status=final_response.status,
                  reason=status_text,
                  encoding=encoding,
                  cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                 headers=await res.all_headers(),
-                 request_headers=await res.request.all_headers(),
+                 headers=await first_response.all_headers(),
+                 request_headers=await first_response.request.all_headers(),
                  **self.adaptor_arguments
              )
              await page.close()
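The recurring change in both engines is the `handle_response` listener: it records the last main-frame document response, so redirect chains report the final hop's status instead of the first. The same pattern can be reproduced in plain Playwright; a minimal self-contained sketch, assuming Playwright's Chromium is installed (the URL is illustrative):

```python
from playwright.sync_api import sync_playwright

final_response = None

def handle_response(response):
    # Keep only main-frame document responses; the last one seen
    # corresponds to the final hop of any redirect chain.
    global final_response
    if response.request.resource_type == "document" and response.request.is_navigation_request():
        final_response = response

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.on("response", handle_response)
    first_response = page.goto("https://example.com")
    page.wait_for_load_state("domcontentloaded")
    # Fall back to `page.goto()`'s return value if the listener saw nothing.
    final_response = final_response or first_response
    print(final_response.status, page.url)
    browser.close()
```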
scrapling/engines/pw.py CHANGED
@@ -1,6 +1,7 @@
  import json

- from scrapling.core._types import Callable, Dict, Optional, Union
+ from scrapling.core._types import (Callable, Dict, Optional,
+                                    SelectorWaitStates, Union)
  from scrapling.core.utils import log, lru_cache
  from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                           NSTBROWSER_DEFAULT_QUERY)
@@ -23,7 +24,7 @@ class PlaywrightEngine:
          page_action: Callable = None,
          wait_selector: Optional[str] = None,
          locale: Optional[str] = 'en-US',
-         wait_selector_state: Optional[str] = 'attached',
+         wait_selector_state: SelectorWaitStates = 'attached',
          stealth: Optional[bool] = False,
          real_chrome: Optional[bool] = False,
          hide_canvas: Optional[bool] = False,
@@ -193,12 +194,21 @@ class PlaywrightEngine:
          :param url: Target url.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
+         from playwright.sync_api import Response as PlaywrightResponse
          if not self.stealth or self.real_chrome:
              # Because rebrowser_playwright doesn't play well with real browsers
              from playwright.sync_api import sync_playwright
          else:
              from rebrowser_playwright.sync_api import sync_playwright

+         # Store the final response
+         final_response = None
+
+         def handle_response(finished_response: PlaywrightResponse):
+             nonlocal final_response
+             if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                 final_response = finished_response
+
          with sync_playwright() as p:
              # Creating the browser
              if self.cdp_url:
@@ -212,6 +222,8 @@ class PlaywrightEngine:
              page = context.new_page()
              page.set_default_navigation_timeout(self.timeout)
              page.set_default_timeout(self.timeout)
+             # Listen for all responses
+             page.on("response", handle_response)

              if self.extra_headers:
                  page.set_extra_http_headers(self.extra_headers)
@@ -223,7 +235,7 @@ class PlaywrightEngine:
                  for script in self.__stealth_scripts():
                      page.add_init_script(path=script)

-             res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
              page.wait_for_load_state(state="domcontentloaded")
              if self.network_idle:
                  page.wait_for_load_state('networkidle')
@@ -240,21 +252,23 @@ class PlaywrightEngine:
              if self.network_idle:
                  page.wait_for_load_state('networkidle')

+             # In case we didn't catch a document type somehow
+             final_response = final_response if final_response else first_response
              # This will be parsed inside `Response`
-             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+             encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
              # PlayWright API sometimes give empty status text for some reason!
-             status_text = res.status_text or StatusText.get(res.status)
+             status_text = final_response.status_text or StatusText.get(final_response.status)

              response = Response(
-                 url=res.url,
+                 url=page.url,
                  text=page.content(),
                  body=page.content().encode('utf-8'),
-                 status=res.status,
+                 status=final_response.status,
                  reason=status_text,
                  encoding=encoding,
                  cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                 headers=res.all_headers(),
-                 request_headers=res.request.all_headers(),
+                 headers=first_response.all_headers(),
+                 request_headers=first_response.request.all_headers(),
                  **self.adaptor_arguments
              )
              page.close()
@@ -266,12 +280,21 @@ class PlaywrightEngine:
          :param url: Target url.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
+         from playwright.async_api import Response as PlaywrightResponse
          if not self.stealth or self.real_chrome:
              # Because rebrowser_playwright doesn't play well with real browsers
              from playwright.async_api import async_playwright
          else:
              from rebrowser_playwright.async_api import async_playwright

+         # Store the final response
+         final_response = None
+
+         async def handle_response(finished_response: PlaywrightResponse):
+             nonlocal final_response
+             if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                 final_response = finished_response
+
          async with async_playwright() as p:
              # Creating the browser
              if self.cdp_url:
@@ -285,6 +308,8 @@ class PlaywrightEngine:
              page = await context.new_page()
              page.set_default_navigation_timeout(self.timeout)
              page.set_default_timeout(self.timeout)
+             # Listen for all responses
+             page.on("response", handle_response)

              if self.extra_headers:
                  await page.set_extra_http_headers(self.extra_headers)
@@ -296,7 +321,7 @@ class PlaywrightEngine:
                  for script in self.__stealth_scripts():
                      await page.add_init_script(path=script)

-             res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
              await page.wait_for_load_state(state="domcontentloaded")
              if self.network_idle:
                  await page.wait_for_load_state('networkidle')
@@ -313,21 +338,23 @@ class PlaywrightEngine:
              if self.network_idle:
                  await page.wait_for_load_state('networkidle')

+             # In case we didn't catch a document type somehow
+             final_response = final_response if final_response else first_response
              # This will be parsed inside `Response`
-             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+             encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
              # PlayWright API sometimes give empty status text for some reason!
-             status_text = res.status_text or StatusText.get(res.status)
+             status_text = final_response.status_text or StatusText.get(final_response.status)

              response = Response(
-                 url=res.url,
+                 url=page.url,
                  text=await page.content(),
                  body=(await page.content()).encode('utf-8'),
-                 status=res.status,
+                 status=final_response.status,
                  reason=status_text,
                  encoding=encoding,
                  cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                 headers=await res.all_headers(),
-                 request_headers=await res.request.all_headers(),
+                 headers=await first_response.all_headers(),
+                 request_headers=await first_response.request.all_headers(),
                  **self.adaptor_arguments
              )
              await page.close()
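A side detail worth noting in this engine is the deferred import: the patched `rebrowser_playwright` fork (which mirrors Playwright's API) is only imported when stealth mode is on and no real browser is requested. A sketch of the same pattern in isolation (the `open_page` helper and its flag are illustrative, not part of the wheel):

```python
from typing import Tuple

def open_page(url: str, stealth: bool = False) -> Tuple[int, str]:
    # Deferred import: pick the patched fork only when stealth is requested,
    # since rebrowser_playwright mirrors playwright's sync API.
    if stealth:
        from rebrowser_playwright.sync_api import sync_playwright
    else:
        from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        response = page.goto(url)
        html = page.content()
        browser.close()
        return response.status, html
```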
scrapling/engines/toolbelt/custom.py CHANGED
@@ -84,8 +84,6 @@ class ResponseEncoding:
  class Response(Adaptor):
      """This class is returned by all engines as a way to unify response type between different libraries."""

-     _is_response_result_logged = False  # Class-level flag, initialized to False
-
      def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
                   encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
          automatch_domain = adaptor_arguments.pop('automatch_domain', None)
@@ -99,9 +97,7 @@ class Response(Adaptor):
          # For back-ward compatibility
          self.adaptor = self
          # For easier debugging while working from a Python shell
-         if not Response._is_response_result_logged:
-             log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
-             Response._is_response_result_logged = True
+         log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')

      # def __repr__(self):
      #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
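With the one-shot class flag removed, every constructed `Response` now emits one INFO line rather than only the first. If that is too chatty, it can be filtered with the standard `logging` module; a sketch, assuming the library logs under a logger named "scrapling":

```python
import logging

# Silence the per-request "Fetched (...)" lines; raise the threshold to WARNING.
logging.getLogger("scrapling").setLevel(logging.WARNING)
```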
scrapling/fetchers.py CHANGED
@@ -1,5 +1,5 @@
  from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                    Union)
+                                    SelectorWaitStates, Union)
  from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
                                 check_if_engine_usable)
  from scrapling.engines.toolbelt import BaseFetcher, Response
@@ -176,8 +176,8 @@ class StealthyFetcher(BaseFetcher):
          self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
          block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
          timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-         wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-         os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+         wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+         proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
      ) -> Response:
          """
          Opens up a browser and do your request based on your chosen options below.
@@ -234,8 +234,8 @@ class StealthyFetcher(BaseFetcher):
          self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
          block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
          timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-         wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-         os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+         wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+         proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
      ) -> Response:
          """
          Opens up a browser and do your request based on your chosen options below.
@@ -308,7 +308,7 @@ class PlayWrightFetcher(BaseFetcher):
      def fetch(
          self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
          useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-         page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+         page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
          hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
          proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
          stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
@@ -368,7 +368,7 @@ class PlayWrightFetcher(BaseFetcher):
      async def async_fetch(
          self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
          useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-         page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+         page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
          hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
          proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
          stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
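For context, a minimal call-site sketch of the stricter signature (the URL and selector are illustrative, and default `Adaptor` arguments are assumed):

```python
from scrapling.fetchers import StealthyFetcher

page = StealthyFetcher().fetch(
    'https://example.com',          # illustrative URL
    wait_selector='h1',
    wait_selector_state='visible',  # must be one of the SelectorWaitStates literals
)
print(page.status, page.reason)
```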
scrapling/parser.py CHANGED
@@ -155,7 +155,7 @@ class Adaptor(SelectorsGeneration):
          else:
              if issubclass(type(element), html.HtmlMixin):

-                 return self.__class__(
+                 return Adaptor(
                      root=element,
                      text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                      url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
@@ -474,7 +474,7 @@ class Adaptor(SelectorsGeneration):

      def css(self, selector: str, identifier: str = '',
              auto_match: bool = False, auto_save: bool = False, percentage: int = 0
-             ) -> Union['Adaptors[Adaptor]', List]:
+             ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
          """Search current tree with CSS3 selectors

          **Important:
@@ -517,7 +517,7 @@ class Adaptor(SelectorsGeneration):

      def xpath(self, selector: str, identifier: str = '',
                auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
-               ) -> Union['Adaptors[Adaptor]', List]:
+               ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
          """Search current tree with XPath selectors

          **Important:
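The widened return annotations reflect that text-selecting queries (e.g. the `::text` pseudo-element) yield `TextHandler` results rather than elements. An illustrative sketch (the inline HTML and the `auto_match` keyword are assumptions for a self-contained example):

```python
from scrapling.parser import Adaptor

page = Adaptor('<html><body><h1>Hello</h1></body></html>', auto_match=False)

elements = page.css('h1')       # element results -> Adaptors[Adaptor]
texts = page.css('h1::text')    # text results -> TextHandlers[TextHandler]
print(elements, texts)
```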
scrapling-0.2.9.dist-info/METADATA → scrapling-0.2.92.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.9
+ Version: 0.2.92
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
@@ -35,10 +34,11 @@ License-File: LICENSE
  Requires-Dist: requests>=2.3
  Requires-Dist: lxml>=4.5
  Requires-Dist: cssselect>=1.2
+ Requires-Dist: click
  Requires-Dist: w3lib
  Requires-Dist: orjson>=3
  Requires-Dist: tldextract
- Requires-Dist: httpx[brotli,zstd]
+ Requires-Dist: httpx[brotli,socks,zstd]
  Requires-Dist: playwright>=1.49.1
  Requires-Dist: rebrowser-playwright>=1.49.1
  Requires-Dist: camoufox[geoip]>=0.4.9
@@ -212,52 +212,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

  ## Installation
- Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
+ Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
  ```bash
  pip3 install scrapling
  ```
- - For using the `StealthyFetcher`, go to the command line and download the browser with
- <details><summary>Windows OS</summary>
-
- ```bash
- camoufox fetch --browserforge
- ```
- </details>
- <details><summary>MacOS</summary>
-
- ```bash
- python3 -m camoufox fetch --browserforge
- ```
- </details>
- <details><summary>Linux</summary>
-
+ Then run this command to install browsers' dependencies needed to use Fetcher classes
  ```bash
- python -m camoufox fetch --browserforge
- ```
- On a fresh installation of Linux, you may also need the following Firefox dependencies:
- - Debian-based distros
- ```bash
- sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
- ```
- - Arch-based distros
- ```bash
- sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
- ```
- </details>
-
- <small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
- - If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
- ```commandline
- playwright install chromium
- ```
- - If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
- ```commandline
- python -m browserforge update
+ scrapling install
  ```
+ If you have any installation issues, please open an issue.

  ## Fetching Websites
- Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+ Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

  ### Features
  You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
scrapling-0.2.9.dist-info/RECORD → scrapling-0.2.92.dist-info/RECORD RENAMED
@@ -1,22 +1,23 @@
- scrapling/__init__.py,sha256=4adit4xM1Io6mBz-VnnSHcPCQxIYhvDmDVMhbXu8VF4,499
+ scrapling/__init__.py,sha256=0iEOX168f4gLFpReEUemMOhTske8AS2o0UQHJWXn-4o,500
+ scrapling/cli.py,sha256=njPdJKmbLFHeWjtSiGEm9ALBdSyfUp0IaJvxQL5C31Q,1125
  scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
- scrapling/fetchers.py,sha256=I_N32DMjCzNCMmrkGYoX480x1Eh5Lka6cMJ-EcSfszk,35342
- scrapling/parser.py,sha256=NKwOsGR6TB7XC9lMkA418_DRWE6pyUqK0XtmTAA51ic,55215
+ scrapling/fetchers.py,sha256=K3MKBqKDOXItJNwxFY2fe1C21Vz6QSd91fFtN98Mpg4,35402
+ scrapling/parser.py,sha256=sT1gh5pnbjpUzFt8K9DGD6x60zKQcAtzmyf8DgiNDCI,55266
  scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
  scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scrapling/core/_types.py,sha256=__HJ2JTk5vx5eg_7HAJmDjaHrMDIaoxNG8fadLLyKV8,566
+ scrapling/core/_types.py,sha256=OcsP1WeQEOlEVo9OzTrLQfgZZfXuJ0civVs31SynwGA,641
  scrapling/core/custom_types.py,sha256=ZRzpoT6qQ4vU_ejhLXa7WYuYLGl5HwAjLPe01xdhuvM,10808
  scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
  scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
  scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
  scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
  scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
- scrapling/engines/camo.py,sha256=L5jRNUgJSAY5hE8KCD-tz4SFrx7ZjowJoWpHrl7havI,12359
+ scrapling/engines/camo.py,sha256=wJRfaIU0w_hDSlrP2AdpjBU6NNEKw0wSnVbqUoxt1Gk,13682
  scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
- scrapling/engines/pw.py,sha256=0vCDaodve_WcOdbGqBdyRwMECPZmQ0eGLQikh4WHKFc,17011
+ scrapling/engines/pw.py,sha256=MCYE5rDx55D2VOIeUNLl44ROXnyFRfku_u2FOcXjqEQ,18534
  scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
  scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
- scrapling/engines/toolbelt/custom.py,sha256=FbWTUC0Z8NTmTLFDiiCchs4W0_Q40lz2ONnhInRNuvA,12947
+ scrapling/engines/toolbelt/custom.py,sha256=d3qyeCg_qHm1RRE7yv5hyU9b17Y7YDPGBOVhEH1CAT0,12754
  scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
  scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
  scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
@@ -40,8 +41,9 @@ tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxn
  tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
  tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
- scrapling-0.2.9.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
- scrapling-0.2.9.dist-info/METADATA,sha256=Wg6lcRo_5LcyotrB1ZXagT5-gToAyRmtNKsq6TJoNk4,68382
- scrapling-0.2.9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- scrapling-0.2.9.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
- scrapling-0.2.9.dist-info/RECORD,,
+ scrapling-0.2.92.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+ scrapling-0.2.92.dist-info/METADATA,sha256=2I-HK-xEkVFFyQBio8NAKR0eQEBB-dLHFuvb5eluCEQ,67415
+ scrapling-0.2.92.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ scrapling-0.2.92.dist-info/entry_points.txt,sha256=DHyt2Blxy0P5OE2HRcP95Wz9_xo2ERCDcNqrJjYS3o8,49
+ scrapling-0.2.92.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+ scrapling-0.2.92.dist-info/RECORD,,
scrapling-0.2.92.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ scrapling = scrapling.cli:main