scrapling-0.2.9.tar.gz → scrapling-0.2.92.tar.gz

Files changed (56)
  1. {scrapling-0.2.9 → scrapling-0.2.92}/MANIFEST.in +3 -0
  2. {scrapling-0.2.9/scrapling.egg-info → scrapling-0.2.92}/PKG-INFO +8 -42
  3. {scrapling-0.2.9 → scrapling-0.2.92}/README.md +5 -39
  4. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/__init__.py +1 -1
  5. scrapling-0.2.92/scrapling/cli.py +37 -0
  6. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/core/_types.py +2 -0
  7. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/camo.py +40 -16
  8. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/pw.py +43 -16
  9. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/toolbelt/custom.py +1 -5
  10. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/fetchers.py +7 -7
  11. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/parser.py +3 -3
  12. {scrapling-0.2.9 → scrapling-0.2.92/scrapling.egg-info}/PKG-INFO +8 -42
  13. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling.egg-info/SOURCES.txt +2 -0
  14. scrapling-0.2.92/scrapling.egg-info/entry_points.txt +2 -0
  15. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling.egg-info/requires.txt +2 -1
  16. {scrapling-0.2.9 → scrapling-0.2.92}/setup.cfg +1 -1
  17. {scrapling-0.2.9 → scrapling-0.2.92}/setup.py +8 -3
  18. {scrapling-0.2.9 → scrapling-0.2.92}/LICENSE +0 -0
  19. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/core/__init__.py +0 -0
  20. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/core/custom_types.py +0 -0
  21. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/core/mixins.py +0 -0
  22. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/core/storage_adaptors.py +0 -0
  23. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/core/translator.py +0 -0
  24. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/core/utils.py +0 -0
  25. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/defaults.py +0 -0
  26. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/__init__.py +0 -0
  27. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/constants.py +0 -0
  28. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/static.py +0 -0
  29. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/toolbelt/__init__.py +0 -0
  30. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  31. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  32. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
  33. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  34. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  35. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  36. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  37. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/toolbelt/fingerprints.py +0 -0
  38. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/engines/toolbelt/navigation.py +0 -0
  39. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling/py.typed +0 -0
  40. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling.egg-info/dependency_links.txt +0 -0
  41. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling.egg-info/not-zip-safe +0 -0
  42. {scrapling-0.2.9 → scrapling-0.2.92}/scrapling.egg-info/top_level.txt +0 -0
  43. {scrapling-0.2.9 → scrapling-0.2.92}/tests/__init__.py +0 -0
  44. {scrapling-0.2.9 → scrapling-0.2.92}/tests/fetchers/__init__.py +0 -0
  45. {scrapling-0.2.9 → scrapling-0.2.92}/tests/fetchers/async/__init__.py +0 -0
  46. {scrapling-0.2.9 → scrapling-0.2.92}/tests/fetchers/async/test_camoufox.py +0 -0
  47. {scrapling-0.2.9 → scrapling-0.2.92}/tests/fetchers/async/test_httpx.py +0 -0
  48. {scrapling-0.2.9 → scrapling-0.2.92}/tests/fetchers/async/test_playwright.py +0 -0
  49. {scrapling-0.2.9 → scrapling-0.2.92}/tests/fetchers/sync/__init__.py +0 -0
  50. {scrapling-0.2.9 → scrapling-0.2.92}/tests/fetchers/sync/test_camoufox.py +0 -0
  51. {scrapling-0.2.9 → scrapling-0.2.92}/tests/fetchers/sync/test_httpx.py +0 -0
  52. {scrapling-0.2.9 → scrapling-0.2.92}/tests/fetchers/sync/test_playwright.py +0 -0
  53. {scrapling-0.2.9 → scrapling-0.2.92}/tests/fetchers/test_utils.py +0 -0
  54. {scrapling-0.2.9 → scrapling-0.2.92}/tests/parser/__init__.py +0 -0
  55. {scrapling-0.2.9 → scrapling-0.2.92}/tests/parser/test_automatch.py +0 -0
  56. {scrapling-0.2.9 → scrapling-0.2.92}/tests/parser/test_general.py +0 -0
@@ -4,7 +4,10 @@ include *.js
  include scrapling/engines/toolbelt/bypasses/*.js
  include scrapling/*.db
  include scrapling/*.db*
+ include scrapling/*.db-*
  include scrapling/py.typed
+ include scrapling/.scrapling_dependencies_installed
+ include .scrapling_dependencies_installed

  recursive-exclude * __pycache__
  recursive-exclude * *.py[co]
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.9
+ Version: 0.2.92
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
@@ -35,10 +34,11 @@ License-File: LICENSE
  Requires-Dist: requests>=2.3
  Requires-Dist: lxml>=4.5
  Requires-Dist: cssselect>=1.2
+ Requires-Dist: click
  Requires-Dist: w3lib
  Requires-Dist: orjson>=3
  Requires-Dist: tldextract
- Requires-Dist: httpx[brotli,zstd]
+ Requires-Dist: httpx[brotli,socks,zstd]
  Requires-Dist: playwright>=1.49.1
  Requires-Dist: rebrowser-playwright>=1.49.1
  Requires-Dist: camoufox[geoip]>=0.4.9
@@ -212,52 +212,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

  ## Installation
- Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
+ Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
  ```bash
  pip3 install scrapling
  ```
- - For using the `StealthyFetcher`, go to the command line and download the browser with
- <details><summary>Windows OS</summary>
-
- ```bash
- camoufox fetch --browserforge
- ```
- </details>
- <details><summary>MacOS</summary>
-
- ```bash
- python3 -m camoufox fetch --browserforge
- ```
- </details>
- <details><summary>Linux</summary>
-
+ Then run this command to install browsers' dependencies needed to use Fetcher classes
  ```bash
- python -m camoufox fetch --browserforge
- ```
- On a fresh installation of Linux, you may also need the following Firefox dependencies:
- - Debian-based distros
- ```bash
- sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
- ```
- - Arch-based distros
- ```bash
- sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
- ```
- </details>
-
- <small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
- - If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
- ```commandline
- playwright install chromium
- ```
- - If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
- ```commandline
- python -m browserforge update
+ scrapling install
  ```
+ If you have any installation issues, please open an issue.

  ## Fetching Websites
- Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+ Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

  ### Features
  You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
@@ -167,52 +167,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

  ## Installation
- Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
+ Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
  ```bash
  pip3 install scrapling
  ```
- - For using the `StealthyFetcher`, go to the command line and download the browser with
- <details><summary>Windows OS</summary>
-
- ```bash
- camoufox fetch --browserforge
- ```
- </details>
- <details><summary>MacOS</summary>
-
- ```bash
- python3 -m camoufox fetch --browserforge
- ```
- </details>
- <details><summary>Linux</summary>
-
+ Then run this command to install browsers' dependencies needed to use Fetcher classes
  ```bash
- python -m camoufox fetch --browserforge
- ```
- On a fresh installation of Linux, you may also need the following Firefox dependencies:
- - Debian-based distros
- ```bash
- sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
- ```
- - Arch-based distros
- ```bash
- sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
- ```
- </details>
-
- <small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
- - If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
- ```commandline
- playwright install chromium
- ```
- - If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
- ```commandline
- python -m browserforge update
+ scrapling install
  ```
+ If you have any installation issues, please open an issue.

  ## Fetching Websites
- Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+ Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

  ### Features
  You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
  from scrapling.parser import Adaptor, Adaptors

  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.9"
+ __version__ = "0.2.92"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"


@@ -0,0 +1,37 @@
+ import os
+ import subprocess
+ import sys
+ from pathlib import Path
+
+ import click
+
+
+ def get_package_dir():
+     return Path(os.path.dirname(__file__))
+
+
+ def run_command(command, line):
+     print(f"Installing {line}...")
+     _ = subprocess.check_call(command, shell=True)
+     # I meant to not use try except here
+
+
+ @click.command(help="Install all Scrapling's Fetchers dependencies")
+ def install():
+     if not get_package_dir().joinpath(".scrapling_dependencies_installed").exists():
+         run_command([sys.executable, "-m", "playwright", "install", 'chromium'], 'Playwright browsers')
+         run_command([sys.executable, "-m", "playwright", "install-deps", 'chromium', 'firefox'], 'Playwright dependencies')
+         run_command([sys.executable, "-m", "camoufox", "fetch", '--browserforge'], 'Camoufox browser and databases')
+         # if no errors raised by above commands, then we add below file
+         get_package_dir().joinpath(".scrapling_dependencies_installed").touch()
+     else:
+         print('The dependencies are already installed')
+
+
+ @click.group()
+ def main():
+     pass
+
+
+ # Adding commands
+ main.add_command(install)
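
This new module is what backs the `scrapling install` console script registered in `entry_points.txt` below. A minimal sketch, assuming scrapling 0.2.92 is installed, of exercising the command programmatically with click's test runner (note: this really shells out and runs the installs, exactly like typing `scrapling install`):

```python
from click.testing import CliRunner

from scrapling.cli import main

runner = CliRunner()
result = runner.invoke(main, ["install"])  # same as: scrapling install
print(result.exit_code, result.output)
```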
@@ -5,6 +5,8 @@ Type definitions for type checking purposes.
  from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
                      List, Literal, Optional, Pattern, Tuple, Type, Union)

+ SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
+
  try:
      from typing import Protocol
  except ImportError:
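
`SelectorWaitStates` is a plain `Literal` alias, so it costs nothing at runtime but lets static type checkers reject invalid wait states. A small sketch of the effect (the `wait_for` helper is hypothetical, for illustration only):

```python
from typing import Literal

SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]

def wait_for(state: SelectorWaitStates) -> None:  # hypothetical helper
    print(f"waiting until the selector is {state!r}")

wait_for("visible")    # accepted
# wait_for("loaded")   # mypy/pyright reject this: not one of the four literals
```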
@@ -3,7 +3,7 @@ from camoufox.async_api import AsyncCamoufox
  from camoufox.sync_api import Camoufox

  from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                    Union)
+                                    SelectorWaitStates, Union)
  from scrapling.core.utils import log
  from scrapling.engines.toolbelt import (Response, StatusText,
                                          async_intercept_route,
@@ -18,7 +18,7 @@ class CamoufoxEngine:
          self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
          block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
          timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-         wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+         wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
          proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
          geoip: Optional[bool] = False,
          adaptor_arguments: Dict = None,
@@ -84,6 +84,14 @@ class CamoufoxEngine:
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          addons = [] if self.disable_ads else [DefaultAddons.UBO]
+         # Store the final response
+         final_response = None
+
+         def handle_response(finished_response):
+             nonlocal final_response
+             if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                 final_response = finished_response
+
          with Camoufox(
              geoip=self.geoip,
              proxy=self.proxy,
@@ -100,13 +108,15 @@
              page = browser.new_page()
              page.set_default_navigation_timeout(self.timeout)
              page.set_default_timeout(self.timeout)
+             # Listen for all responses
+             page.on("response", handle_response)
              if self.disable_resources:
                  page.route("**/*", intercept_route)

              if self.extra_headers:
                  page.set_extra_http_headers(self.extra_headers)

-             res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
              page.wait_for_load_state(state="domcontentloaded")
              if self.network_idle:
                  page.wait_for_load_state('networkidle')
@@ -123,21 +133,23 @@
              if self.network_idle:
                  page.wait_for_load_state('networkidle')

+             # In case we didn't catch a document type somehow
+             final_response = final_response if final_response else first_response
              # This will be parsed inside `Response`
-             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+             encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
              # PlayWright API sometimes give empty status text for some reason!
-             status_text = res.status_text or StatusText.get(res.status)
+             status_text = final_response.status_text or StatusText.get(final_response.status)

              response = Response(
-                 url=res.url,
+                 url=page.url,
                  text=page.content(),
                  body=page.content().encode('utf-8'),
-                 status=res.status,
+                 status=final_response.status,
                  reason=status_text,
                  encoding=encoding,
                  cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                 headers=res.all_headers(),
-                 request_headers=res.request.all_headers(),
+                 headers=first_response.all_headers(),
+                 request_headers=first_response.request.all_headers(),
                  **self.adaptor_arguments
              )
              page.close()
@@ -151,6 +163,14 @@ class CamoufoxEngine:
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          addons = [] if self.disable_ads else [DefaultAddons.UBO]
+         # Store the final response
+         final_response = None
+
+         async def handle_response(finished_response):
+             nonlocal final_response
+             if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                 final_response = finished_response
+
          async with AsyncCamoufox(
              geoip=self.geoip,
              proxy=self.proxy,
@@ -167,13 +187,15 @@
              page = await browser.new_page()
              page.set_default_navigation_timeout(self.timeout)
              page.set_default_timeout(self.timeout)
+             # Listen for all responses
+             page.on("response", handle_response)
              if self.disable_resources:
                  await page.route("**/*", async_intercept_route)

              if self.extra_headers:
                  await page.set_extra_http_headers(self.extra_headers)

-             res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
              await page.wait_for_load_state(state="domcontentloaded")
              if self.network_idle:
                  await page.wait_for_load_state('networkidle')
@@ -190,21 +212,23 @@
              if self.network_idle:
                  await page.wait_for_load_state('networkidle')

+             # In case we didn't catch a document type somehow
+             final_response = final_response if final_response else first_response
              # This will be parsed inside `Response`
-             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+             encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
              # PlayWright API sometimes give empty status text for some reason!
-             status_text = res.status_text or StatusText.get(res.status)
+             status_text = final_response.status_text or StatusText.get(final_response.status)

              response = Response(
-                 url=res.url,
+                 url=page.url,
                  text=await page.content(),
                  body=(await page.content()).encode('utf-8'),
-                 status=res.status,
+                 status=final_response.status,
                  reason=status_text,
                  encoding=encoding,
                  cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                 headers=await res.all_headers(),
-                 request_headers=await res.request.all_headers(),
+                 headers=await first_response.all_headers(),
+                 request_headers=await first_response.request.all_headers(),
                  **self.adaptor_arguments
              )
              await page.close()
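
The recurring change in these hunks is a `page.on("response", ...)` listener that remembers the last top-level document response, so after redirects the reported status and headers belong to the page that was actually rendered rather than the first hop. A standalone sketch of that pattern with plain sync Playwright (example URL; not Scrapling's actual wrapper):

```python
from playwright.sync_api import sync_playwright

final_response = None

def handle_response(finished_response):
    global final_response
    # Keep only top-level document responses, so that after redirects we
    # end up holding the response that produced the final page
    if (finished_response.request.resource_type == "document"
            and finished_response.request.is_navigation_request()):
        final_response = finished_response

with sync_playwright() as p:
    browser = p.firefox.launch(headless=True)
    page = browser.new_page()
    page.on("response", handle_response)
    first_response = page.goto("https://example.com")
    page.wait_for_load_state("domcontentloaded")
    # Fall back to goto()'s return value if no document response was caught
    final_response = final_response or first_response
    print(final_response.status, page.url)
    browser.close()
```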
@@ -1,6 +1,7 @@
  import json

- from scrapling.core._types import Callable, Dict, Optional, Union
+ from scrapling.core._types import (Callable, Dict, Optional,
+                                    SelectorWaitStates, Union)
  from scrapling.core.utils import log, lru_cache
  from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                           NSTBROWSER_DEFAULT_QUERY)
@@ -23,7 +24,7 @@ class PlaywrightEngine:
              page_action: Callable = None,
              wait_selector: Optional[str] = None,
              locale: Optional[str] = 'en-US',
-             wait_selector_state: Optional[str] = 'attached',
+             wait_selector_state: SelectorWaitStates = 'attached',
              stealth: Optional[bool] = False,
              real_chrome: Optional[bool] = False,
              hide_canvas: Optional[bool] = False,
@@ -193,12 +194,21 @@
          :param url: Target url.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
+         from playwright.sync_api import Response as PlaywrightResponse
          if not self.stealth or self.real_chrome:
              # Because rebrowser_playwright doesn't play well with real browsers
              from playwright.sync_api import sync_playwright
          else:
              from rebrowser_playwright.sync_api import sync_playwright

+         # Store the final response
+         final_response = None
+
+         def handle_response(finished_response: PlaywrightResponse):
+             nonlocal final_response
+             if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                 final_response = finished_response
+
          with sync_playwright() as p:
              # Creating the browser
              if self.cdp_url:
@@ -212,6 +222,8 @@
              page = context.new_page()
              page.set_default_navigation_timeout(self.timeout)
              page.set_default_timeout(self.timeout)
+             # Listen for all responses
+             page.on("response", handle_response)

              if self.extra_headers:
                  page.set_extra_http_headers(self.extra_headers)
@@ -223,7 +235,7 @@
              for script in self.__stealth_scripts():
                  page.add_init_script(path=script)

-             res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
              page.wait_for_load_state(state="domcontentloaded")
              if self.network_idle:
                  page.wait_for_load_state('networkidle')
@@ -240,21 +252,23 @@
              if self.network_idle:
                  page.wait_for_load_state('networkidle')

+             # In case we didn't catch a document type somehow
+             final_response = final_response if final_response else first_response
              # This will be parsed inside `Response`
-             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+             encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
              # PlayWright API sometimes give empty status text for some reason!
-             status_text = res.status_text or StatusText.get(res.status)
+             status_text = final_response.status_text or StatusText.get(final_response.status)

              response = Response(
-                 url=res.url,
+                 url=page.url,
                  text=page.content(),
                  body=page.content().encode('utf-8'),
-                 status=res.status,
+                 status=final_response.status,
                  reason=status_text,
                  encoding=encoding,
                  cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                 headers=res.all_headers(),
-                 request_headers=res.request.all_headers(),
+                 headers=first_response.all_headers(),
+                 request_headers=first_response.request.all_headers(),
                  **self.adaptor_arguments
              )
              page.close()
@@ -266,12 +280,21 @@
          :param url: Target url.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
+         from playwright.async_api import Response as PlaywrightResponse
          if not self.stealth or self.real_chrome:
              # Because rebrowser_playwright doesn't play well with real browsers
              from playwright.async_api import async_playwright
          else:
              from rebrowser_playwright.async_api import async_playwright

+         # Store the final response
+         final_response = None
+
+         async def handle_response(finished_response: PlaywrightResponse):
+             nonlocal final_response
+             if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
+                 final_response = finished_response
+
          async with async_playwright() as p:
              # Creating the browser
              if self.cdp_url:
@@ -285,6 +308,8 @@
              page = await context.new_page()
              page.set_default_navigation_timeout(self.timeout)
              page.set_default_timeout(self.timeout)
+             # Listen for all responses
+             page.on("response", handle_response)

              if self.extra_headers:
                  await page.set_extra_http_headers(self.extra_headers)
@@ -296,7 +321,7 @@
              for script in self.__stealth_scripts():
                  await page.add_init_script(path=script)

-             res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
              await page.wait_for_load_state(state="domcontentloaded")
              if self.network_idle:
                  await page.wait_for_load_state('networkidle')
@@ -313,21 +338,23 @@
              if self.network_idle:
                  await page.wait_for_load_state('networkidle')

+             # In case we didn't catch a document type somehow
+             final_response = final_response if final_response else first_response
              # This will be parsed inside `Response`
-             encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+             encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
              # PlayWright API sometimes give empty status text for some reason!
-             status_text = res.status_text or StatusText.get(res.status)
+             status_text = final_response.status_text or StatusText.get(final_response.status)

              response = Response(
-                 url=res.url,
+                 url=page.url,
                  text=await page.content(),
                  body=(await page.content()).encode('utf-8'),
-                 status=res.status,
+                 status=final_response.status,
                  reason=status_text,
                  encoding=encoding,
                  cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                 headers=await res.all_headers(),
-                 request_headers=await res.request.all_headers(),
+                 headers=await first_response.all_headers(),
+                 request_headers=await first_response.request.all_headers(),
                  **self.adaptor_arguments
              )
              await page.close()
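
Besides the same response-tracking change as `camo.py`, these hunks import `Response` for the handler's annotation from stock `playwright` even when the runtime driver is `rebrowser-playwright`. A condensed sketch of that driver-selection logic, factored into a hypothetical helper purely for illustration:

```python
def get_sync_playwright(stealth: bool, real_chrome: bool):
    """Pick the sync driver the way the diff does; the helper itself is illustrative."""
    if not stealth or real_chrome:
        # Because rebrowser_playwright doesn't play well with real browsers (per the diff)
        from playwright.sync_api import sync_playwright
    else:
        from rebrowser_playwright.sync_api import sync_playwright
    return sync_playwright
```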
@@ -84,8 +84,6 @@ class ResponseEncoding:
  class Response(Adaptor):
      """This class is returned by all engines as a way to unify response type between different libraries."""

-     _is_response_result_logged = False  # Class-level flag, initialized to False
-
      def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
                   encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
          automatch_domain = adaptor_arguments.pop('automatch_domain', None)
@@ -99,9 +97,7 @@ class Response(Adaptor):
          # For back-ward compatibility
          self.adaptor = self
          # For easier debugging while working from a Python shell
-         if not Response._is_response_result_logged:
-             log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
-             Response._is_response_result_logged = True
+         log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')

      # def __repr__(self):
      #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
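
With the class-level flag gone, every `Response` now emits its "Fetched" line instead of only the first one per process. A hedged note on seeing it, assuming the standard stdlib logging setup:

```python
import logging

# Scrapling logs through the stdlib logging module; at INFO level each fetch
# prints a line like:  Fetched (200) <GET https://example.com> (referer: None)
logging.basicConfig(level=logging.INFO)
```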
@@ -1,5 +1,5 @@
  from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                    Union)
+                                    SelectorWaitStates, Union)
  from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
                                 check_if_engine_usable)
  from scrapling.engines.toolbelt import BaseFetcher, Response
@@ -176,8 +176,8 @@ class StealthyFetcher(BaseFetcher):
          self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
          block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
          timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-         wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-         os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+         wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+         proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
      ) -> Response:
          """
          Opens up a browser and do your request based on your chosen options below.
@@ -234,8 +234,8 @@ class StealthyFetcher(BaseFetcher):
          self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
          block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
          timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-         wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-         os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+         wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+         proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
      ) -> Response:
          """
          Opens up a browser and do your request based on your chosen options below.
@@ -308,7 +308,7 @@ class PlayWrightFetcher(BaseFetcher):
      def fetch(
          self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
          useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-         page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+         page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
          hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
          proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
          stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
@@ -368,7 +368,7 @@ class PlayWrightFetcher(BaseFetcher):
      async def async_fetch(
          self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
          useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-         page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+         page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
          hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
          proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
          stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
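
The public fetcher signatures now advertise the same `SelectorWaitStates` literal, so IDEs and type checkers can complete and validate the four states. An illustrative call (usage details hedged; the URL is a placeholder):

```python
from scrapling.fetchers import StealthyFetcher

page = StealthyFetcher().fetch(
    "https://example.com",
    wait_selector="h1",
    wait_selector_state="visible",  # attached | detached | hidden | visible
)
print(page.status)
```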
@@ -155,7 +155,7 @@ class Adaptor(SelectorsGeneration):
          else:
              if issubclass(type(element), html.HtmlMixin):

-                 return self.__class__(
+                 return Adaptor(
                      root=element,
                      text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                      url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
@@ -474,7 +474,7 @@ class Adaptor(SelectorsGeneration):

      def css(self, selector: str, identifier: str = '',
              auto_match: bool = False, auto_save: bool = False, percentage: int = 0
-             ) -> Union['Adaptors[Adaptor]', List]:
+             ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
          """Search current tree with CSS3 selectors

          **Important:
@@ -517,7 +517,7 @@ class Adaptor(SelectorsGeneration):

      def xpath(self, selector: str, identifier: str = '',
                auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
-               ) -> Union['Adaptors[Adaptor]', List]:
+               ) -> Union['Adaptors[Adaptor]', List, 'TextHandlers[TextHandler]']:
          """Search current tree with XPath selectors

          **Important:
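
The widened return annotations reflect that selecting text nodes yields `TextHandlers` rather than element `Adaptors`. A hedged sketch (exact constructor arguments may vary):

```python
from scrapling import Adaptor

page = Adaptor(text="<html><body><h1>Hi</h1></body></html>")
print(page.css("h1"))        # Adaptors[Adaptor]: element nodes
print(page.css("h1::text"))  # TextHandlers[TextHandler]: text nodes, per this change
```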
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.9
+ Version: 0.2.92
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
@@ -35,10 +34,11 @@ License-File: LICENSE
  Requires-Dist: requests>=2.3
  Requires-Dist: lxml>=4.5
  Requires-Dist: cssselect>=1.2
+ Requires-Dist: click
  Requires-Dist: w3lib
  Requires-Dist: orjson>=3
  Requires-Dist: tldextract
- Requires-Dist: httpx[brotli,zstd]
+ Requires-Dist: httpx[brotli,socks,zstd]
  Requires-Dist: playwright>=1.49.1
  Requires-Dist: rebrowser-playwright>=1.49.1
  Requires-Dist: camoufox[geoip]>=0.4.9
@@ -212,52 +212,18 @@ Scrapling can find elements with more methods and it returns full element `Adapt
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

  ## Installation
- Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
+ Scrapling is a breeze to get started with; Starting from version 0.2.9, we require at least Python 3.9 to work.
  ```bash
  pip3 install scrapling
  ```
- - For using the `StealthyFetcher`, go to the command line and download the browser with
- <details><summary>Windows OS</summary>
-
- ```bash
- camoufox fetch --browserforge
- ```
- </details>
- <details><summary>MacOS</summary>
-
- ```bash
- python3 -m camoufox fetch --browserforge
- ```
- </details>
- <details><summary>Linux</summary>
-
+ Then run this command to install browsers' dependencies needed to use Fetcher classes
  ```bash
- python -m camoufox fetch --browserforge
- ```
- On a fresh installation of Linux, you may also need the following Firefox dependencies:
- - Debian-based distros
- ```bash
- sudo apt install -y libgtk-3-0 libx11-xcb1 libasound2
- ```
- - Arch-based distros
- ```bash
- sudo pacman -S gtk3 libx11 libxcb cairo libasound alsa-lib
- ```
- </details>
-
- <small> See the official <a href="https://camoufox.com/python/installation/#download-the-browser">Camoufox documentation</a> for more info on installation</small>
-
- - If you are going to use the `PlayWrightFetcher` options, then install Playwright's Chromium browser with:
- ```commandline
- playwright install chromium
- ```
- - If you are going to use normal requests only with the `Fetcher` class then update the fingerprints files with:
- ```commandline
- python -m browserforge update
+ scrapling install
  ```
+ If you have any installation issues, please open an issue.

  ## Fetching Websites
- Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+ Fetchers are interfaces built on top of other libraries with added features that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

  ### Features
  You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
@@ -4,6 +4,7 @@ README.md
  setup.cfg
  setup.py
  scrapling/__init__.py
+ scrapling/cli.py
  scrapling/defaults.py
  scrapling/fetchers.py
  scrapling/parser.py
@@ -11,6 +12,7 @@ scrapling/py.typed
  scrapling.egg-info/PKG-INFO
  scrapling.egg-info/SOURCES.txt
  scrapling.egg-info/dependency_links.txt
+ scrapling.egg-info/entry_points.txt
  scrapling.egg-info/not-zip-safe
  scrapling.egg-info/requires.txt
  scrapling.egg-info/top_level.txt
@@ -0,0 +1,2 @@
+ [console_scripts]
+ scrapling = scrapling.cli:main
@@ -1,10 +1,11 @@
  requests>=2.3
  lxml>=4.5
  cssselect>=1.2
+ click
  w3lib
  orjson>=3
  tldextract
- httpx[brotli,zstd]
+ httpx[brotli,socks,zstd]
  playwright>=1.49.1
  rebrowser-playwright>=1.49.1
  camoufox[geoip]>=0.4.9
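
The new `socks` extra is what lets `httpx` (the library behind Scrapling's plain HTTP fetching) accept SOCKS proxy URLs. A hedged sketch; the proxy address is a placeholder, and on httpx versions older than 0.26 the argument is `proxies=` rather than `proxy=`:

```python
import httpx

# Requires the httpx[socks] extra (pulls in socksio); proxy below is a placeholder
client = httpx.Client(proxy="socks5://127.0.0.1:9050")
resp = client.get("https://example.com")
print(resp.status_code)
```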
@@ -1,6 +1,6 @@
  [metadata]
  name = scrapling
- version = 0.2.9
+ version = 0.2.92
  author = Karim Shoair
  author_email = karim.shoair@pm.me
  description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

  setup(
      name="scrapling",
-     version="0.2.9",
+     version="0.2.92",
      description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
  impressive speed improvements over many popular scraping tools.""",
@@ -20,6 +20,11 @@ setup(
      package_dir={
          "scrapling": "scrapling",
      },
+     entry_points={
+         'console_scripts': [
+             'scrapling=scrapling.cli:main'
+         ],
+     },
      include_package_data=True,
      classifiers=[
          "Operating System :: OS Independent",
@@ -37,7 +42,6 @@ setup(
          "Topic :: Software Development :: Libraries :: Python Modules",
          "Programming Language :: Python :: 3",
          "Programming Language :: Python :: 3 :: Only",
-         "Programming Language :: Python :: 3.8",
          "Programming Language :: Python :: 3.9",
          "Programming Language :: Python :: 3.10",
          "Programming Language :: Python :: 3.11",
@@ -51,10 +55,11 @@ setup(
          "requests>=2.3",
          "lxml>=4.5",
          "cssselect>=1.2",
+         'click',
          "w3lib",
          "orjson>=3",
          "tldextract",
-         'httpx[brotli,zstd]',
+         'httpx[brotli,zstd, socks]',
          'playwright>=1.49.1',
          'rebrowser-playwright>=1.49.1',
          'camoufox[geoip]>=0.4.9'