scrapling 0.2.1.tar.gz → 0.2.3.tar.gz

Files changed (48)
  1. {scrapling-0.2.1 → scrapling-0.2.3}/MANIFEST.in +1 -0
  2. {scrapling-0.2.1/scrapling.egg-info → scrapling-0.2.3}/PKG-INFO +16 -5
  3. {scrapling-0.2.1 → scrapling-0.2.3}/README.md +14 -3
  4. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/__init__.py +1 -1
  5. scrapling-0.2.3/scrapling/defaults.py +6 -0
  6. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/camo.py +2 -2
  7. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/pw.py +2 -2
  8. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/static.py +2 -2
  9. scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +40 -0
  10. scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/notification_permission.js +5 -0
  11. scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +5 -0
  12. scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -0
  13. scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/screen_props.js +27 -0
  14. scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +27 -0
  15. scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/window_chrome.js +213 -0
  16. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/toolbelt/custom.py +3 -4
  17. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/parser.py +11 -2
  18. {scrapling-0.2.1 → scrapling-0.2.3/scrapling.egg-info}/PKG-INFO +16 -5
  19. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/SOURCES.txt +8 -0
  20. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/requires.txt +1 -1
  21. {scrapling-0.2.1 → scrapling-0.2.3}/setup.cfg +1 -1
  22. {scrapling-0.2.1 → scrapling-0.2.3}/setup.py +2 -2
  23. {scrapling-0.2.1 → scrapling-0.2.3}/LICENSE +0 -0
  24. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/__init__.py +0 -0
  25. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/_types.py +0 -0
  26. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/custom_types.py +0 -0
  27. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/mixins.py +0 -0
  28. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/storage_adaptors.py +0 -0
  29. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/translator.py +0 -0
  30. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/utils.py +0 -0
  31. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/__init__.py +0 -0
  32. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/constants.py +0 -0
  33. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/toolbelt/__init__.py +0 -0
  34. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/toolbelt/fingerprints.py +0 -0
  35. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/toolbelt/navigation.py +0 -0
  36. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/fetchers.py +0 -0
  37. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/py.typed +0 -0
  38. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/dependency_links.txt +0 -0
  39. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/not-zip-safe +0 -0
  40. {scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/top_level.txt +0 -0
  41. {scrapling-0.2.1 → scrapling-0.2.3}/tests/__init__.py +0 -0
  42. {scrapling-0.2.1 → scrapling-0.2.3}/tests/fetchers/__init__.py +0 -0
  43. {scrapling-0.2.1 → scrapling-0.2.3}/tests/fetchers/test_camoufox.py +0 -0
  44. {scrapling-0.2.1 → scrapling-0.2.3}/tests/fetchers/test_httpx.py +0 -0
  45. {scrapling-0.2.1 → scrapling-0.2.3}/tests/fetchers/test_playwright.py +0 -0
  46. {scrapling-0.2.1 → scrapling-0.2.3}/tests/parser/__init__.py +0 -0
  47. {scrapling-0.2.1 → scrapling-0.2.3}/tests/parser/test_automatch.py +0 -0
  48. {scrapling-0.2.1 → scrapling-0.2.3}/tests/parser/test_general.py +0 -0
{scrapling-0.2.1 → scrapling-0.2.3}/MANIFEST.in
@@ -1,6 +1,7 @@
 include LICENSE
 include *.db
 include *.js
+include scrapling/engines/toolbelt/bypasses/*.js
 include scrapling/*.db
 include scrapling/*.db*
 include scrapling/py.typed
{scrapling-0.2.1/scrapling.egg-info → scrapling-0.2.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.1
+Version: 0.2.3
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
 Requires-Dist: playwright
 Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox>=0.3.9
+Requires-Dist: camoufox>=0.3.10
 Requires-Dist: browserforge
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -52,9 +52,9 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+>> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
->> page = StealthyFetcher().fetch('https://example.com', headless=True, network_idle=True)
+>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
 200
 >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
@@ -257,12 +257,21 @@ python -m browserforge update
 ```
 
 ## Fetching Websites Features
-All fetcher-type classes are imported in the same way
+You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
 
+If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
+```python
+from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+```
+then use it right away without initializing like:
+```python
+page = StealthyFetcher.fetch('https://example.com')
+```
+
 Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
@@ -803,6 +812,8 @@ Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its st
 
 ## More Sponsors!
 [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+
 
 ## Contributing
 Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
{scrapling-0.2.1 → scrapling-0.2.3}/README.md
@@ -6,9 +6,9 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+>> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
->> page = StealthyFetcher().fetch('https://example.com', headless=True, network_idle=True)
+>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
 200
 >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
@@ -211,12 +211,21 @@ python -m browserforge update
 ```
 
 ## Fetching Websites Features
-All fetcher-type classes are imported in the same way
+You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
 
+If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
+```python
+from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+```
+then use it right away without initializing like:
+```python
+page = StealthyFetcher.fetch('https://example.com')
+```
+
 Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
@@ -757,6 +766,8 @@ Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its st
 
 ## More Sponsors!
 [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+
 
 ## Contributing
 Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
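
For context, the initialization arguments listed in the README hunk above map directly onto `Adaptor`'s options. A minimal sketch, assuming `Fetcher` exposes an HTTP `get` method as the `StaticEngine` hunk further down suggests; the argument values and URL are placeholders, not recommendations:

```python
# A minimal sketch, assuming Fetcher exposes `get` (see StaticEngine's `get`
# signature below); values here are illustrative only.
from scrapling import Fetcher

fetcher = Fetcher(auto_match=True, keep_comments=False, debug=False)
page = fetcher.get('https://example.com')
print(page.status, page.reason)  # extra attributes the Response object carries
```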
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/__init__.py
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.1"
+__version__ = "0.2.3"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling-0.2.3/scrapling/defaults.py
@@ -0,0 +1,6 @@
+from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
+
+# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
+Fetcher = Fetcher()
+StealthyFetcher = StealthyFetcher()
+PlayWrightFetcher = PlayWrightFetcher()
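
Since `Fetcher`, `StealthyFetcher`, and `PlayWrightFetcher` are rebound to instances here, the class names become ready-made objects. A minimal usage sketch; note the module added above is `defaults.py`, so the working import path is `scrapling.defaults`, even though the README snippets in this release write `scrapling.default`:

```python
# Minimal sketch of the new convenience module: the imported names are already
# instances (see defaults.py above), so no `StealthyFetcher()` call is needed.
from scrapling.defaults import Fetcher, StealthyFetcher

page = StealthyFetcher.fetch('https://example.com', headless=True)
print(page.status)
```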
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/camo.py
@@ -114,14 +114,14 @@ class CamoufoxEngine:
     response = Response(
         url=res.url,
         text=page.content(),
-        content=res.body(),
+        body=res.body(),
         status=res.status,
         reason=res.status_text,
         encoding=encoding,
         cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
         headers=res.all_headers(),
         request_headers=res.request.all_headers(),
-        adaptor_arguments=self.adaptor_arguments
+        **self.adaptor_arguments
     )
     page.close()
 
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/pw.py
@@ -224,14 +224,14 @@ class PlaywrightEngine:
     response = Response(
         url=res.url,
         text=page.content(),
-        content=res.body(),
+        body=res.body(),
         status=res.status,
         reason=res.status_text,
         encoding=encoding,
         cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
         headers=res.all_headers(),
         request_headers=res.request.all_headers(),
-        adaptor_arguments=self.adaptor_arguments
+        **self.adaptor_arguments
     )
     page.close()
     return response
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/static.py
@@ -53,14 +53,14 @@ class StaticEngine:
     return Response(
         url=str(response.url),
         text=response.text,
-        content=response.content,
+        body=response.content,
         status=response.status_code,
         reason=response.reason_phrase,
         encoding=response.encoding or 'utf-8',
         cookies=dict(response.cookies),
         headers=dict(response.headers),
         request_headers=dict(response.request.headers),
-        adaptor_arguments=self.adaptor_arguments
+        **self.adaptor_arguments
     )
 
 def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
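
All three engines change identically: the raw bytes are now passed as `body` instead of `content`, and the stored adaptor arguments are spread as keyword arguments, matching the reworked `Response.__init__` in `toolbelt/custom.py` below. A hedged sketch of the new call shape, with placeholder values:

```python
# Placeholder values only; this mirrors the call shape the engines now use.
from scrapling.engines.toolbelt.custom import Response

adaptor_arguments = {'huge_tree': True, 'keep_comments': False}
response = Response(
    url='https://example.com',
    text='<html>...</html>',
    body=b'<html>...</html>',  # renamed from `content=` in 0.2.1
    status=200, reason='OK', encoding='utf-8',
    cookies={}, headers={}, request_headers={},
    **adaptor_arguments,       # spread instead of `adaptor_arguments=...`
)
```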
scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/navigator_plugins.js
@@ -0,0 +1,40 @@
+if(navigator.plugins.length == 0){
+    Object.defineProperty(navigator, 'plugins', {
+        get: () => {
+            const PDFViewerPlugin = Object.create(Plugin.prototype, {
+                description: { value: 'Portable Document Format', enumerable: false },
+                filename: { value: 'internal-pdf-viewer', enumerable: false },
+                name: { value: 'PDF Viewer', enumerable: false },
+            });
+            const ChromePDFViewer = Object.create(Plugin.prototype, {
+                description: { value: 'Portable Document Format', enumerable: false },
+                filename: { value: 'internal-pdf-viewer', enumerable: false },
+                name: { value: 'Chrome PDF Viewer', enumerable: false },
+            });
+            const ChromiumPDFViewer = Object.create(Plugin.prototype, {
+                description: { value: 'Portable Document Format', enumerable: false },
+                filename: { value: 'internal-pdf-viewer', enumerable: false },
+                name: { value: 'Chromium PDF Viewer', enumerable: false },
+            });
+            const EdgePDFViewer = Object.create(Plugin.prototype, {
+                description: { value: 'Portable Document Format', enumerable: false },
+                filename: { value: 'internal-pdf-viewer', enumerable: false },
+                name: { value: 'Microsoft Edge PDF Viewer', enumerable: false },
+            });
+            const WebKitPDFPlugin = Object.create(Plugin.prototype, {
+                description: { value: 'Portable Document Format', enumerable: false },
+                filename: { value: 'internal-pdf-viewer', enumerable: false },
+                name: { value: 'WebKit built-in PDF', enumerable: false },
+            });
+
+            return Object.create(PluginArray.prototype, {
+                length: { value: 5 },
+                0: { value: PDFViewerPlugin },
+                1: { value: ChromePDFViewer },
+                2: { value: ChromiumPDFViewer },
+                3: { value: EdgePDFViewer },
+                4: { value: WebKitPDFPlugin },
+            });
+        },
+    });
+}
scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/notification_permission.js
@@ -0,0 +1,5 @@
+// Bypasses the `notificationIsDenied` test in creepjs's 'Like Headless' section
+const isSecure = document.location.protocol.startsWith('https')
+if (isSecure){
+    Object.defineProperty(Notification, 'permission', {get: () => 'default'})
+}
scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/pdf_viewer.js
@@ -0,0 +1,5 @@
+// PDF viewer enabled
+// Bypasses the `pdfIsDisabled` test in creepjs's 'Like Headless' section
+Object.defineProperty(navigator, 'pdfViewerEnabled', {
+    get: () => true,
+});
scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js
@@ -0,0 +1,2 @@
+// Remove playwright fingerprint => https://github.com/microsoft/playwright/commit/c9e673c6dca746384338ab6bb0cf63c7e7caa9b2#diff-087773eea292da9db5a3f27de8f1a2940cdb895383ad750c3cd8e01772a35b40R915
+delete __pwInitScripts;
scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/screen_props.js
@@ -0,0 +1,27 @@
+const windowScreenProps = {
+    // Dimensions
+    innerHeight: 0,
+    innerWidth: 0,
+    outerHeight: 754,
+    outerWidth: 1313,
+
+    // Position
+    screenX: 19,
+    pageXOffset: 0,
+    pageYOffset: 0,
+
+    // Display
+    devicePixelRatio: 2
+};
+
+try {
+    for (const [prop, value] of Object.entries(windowScreenProps)) {
+        if (value > 0) {
+            // The 0 values are introduced by collecting in the hidden iframe.
+            // They are document sizes anyway so no need to test them or inject them.
+            window[prop] = value;
+        }
+    }
+} catch (e) {
+    console.warn(e);
+};
scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/webdriver_fully.js
@@ -0,0 +1,27 @@
+// Create a function that looks like a native getter
+const nativeGetter = function () {  // plain function; `name`/`toString` below make it mimic `get webdriver`
+    return false;
+};
+
+// Copy over native function properties
+Object.defineProperties(nativeGetter, {
+    name: { value: 'get webdriver', configurable: true },
+    length: { value: 0, configurable: true },
+    toString: {
+        value: function() {
+            return `function get webdriver() { [native code] }`;
+        },
+        configurable: true
+    }
+});
+
+// Make it look native
+Object.setPrototypeOf(nativeGetter, Function.prototype);
+
+// Apply the modified descriptor
+Object.defineProperty(Navigator.prototype, 'webdriver', {
+    get: nativeGetter,
+    set: undefined,
+    enumerable: true,
+    configurable: true
+});
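
These bypass files are classic Playwright init scripts, i.e. JavaScript that runs before any page script. The sketch below shows only the standard injection pattern; Scrapling's actual wiring lives in its engine classes and is not part of this diff:

```python
# Illustrative only: injecting one of the new bypass files the standard
# Playwright way; Scrapling loads these files internally via its engines.
from pathlib import Path
from playwright.sync_api import sync_playwright

script = Path('scrapling/engines/toolbelt/bypasses/webdriver_fully.js')

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    context = browser.new_context()
    context.add_init_script(path=str(script))  # runs before each page's own JS
    page = context.new_page()
    page.goto('https://example.com')
    print(page.evaluate('navigator.webdriver'))  # expected: False after the patch
    browser.close()
```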
scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/window_chrome.js
@@ -0,0 +1,213 @@
+// To escape `HEADCHR_CHROME_OBJ` test in headless mode => https://github.com/antoinevastel/fp-collect/blob/master/src/fpCollect.js#L322
+// Faking window.chrome fully
+
+if (!window.chrome) {
+    // First, save all existing properties
+    const originalKeys = Object.getOwnPropertyNames(window);
+    const tempObj = {};
+
+    // Recreate all properties in original order
+    for (const key of originalKeys) {
+        const descriptor = Object.getOwnPropertyDescriptor(window, key);
+        const value = window[key];
+        // delete window[key];
+        Object.defineProperty(tempObj, key, descriptor);
+    }
+
+    // Use the exact property descriptor found in headful Chrome
+    // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`
+    const mockChrome = {
+        loadTimes: {},
+        csi: {},
+        app: {
+            isInstalled: false
+        },
+        // Add other Chrome-specific properties
+    };
+
+    Object.defineProperty(tempObj, 'chrome', {
+        writable: true,
+        enumerable: true,
+        configurable: false,
+        value: mockChrome
+    });
+    for (const key of Object.getOwnPropertyNames(tempObj)) {
+        try {
+            Object.defineProperty(window, key,
+                Object.getOwnPropertyDescriptor(tempObj, key));
+        } catch (e) {}
+    };
+    // todo: solve this
+    // Using line below bypasses the hasHighChromeIndex test in creepjs ==> https://github.com/abrahamjuliot/creepjs/blob/master/src/headless/index.ts#L121
+    // Chrome object have to be in the end of the window properties
+    // Object.assign(window, tempObj);
+    // But makes window.chrome unreadable on 'https://bot.sannysoft.com/'
+}
+
+// That means we're running headful and don't need to mock anything
+if ('app' in window.chrome) {
+    return; // Nothing to do here
+}
+const makeError = {
+    ErrorInInvocation: fn => {
+        const err = new TypeError(`Error in invocation of app.${fn}()`);
+        return utils.stripErrorWithAnchor(
+            err,
+            `at ${fn} (eval at <anonymous>`,
+        );
+    },
+};
+// check with: `JSON.stringify(window.chrome['app'])`
+const STATIC_DATA = JSON.parse(
+    `
+{
+    "isInstalled": false,
+    "InstallState": {
+        "DISABLED": "disabled",
+        "INSTALLED": "installed",
+        "NOT_INSTALLED": "not_installed"
+    },
+    "RunningState": {
+        "CANNOT_RUN": "cannot_run",
+        "READY_TO_RUN": "ready_to_run",
+        "RUNNING": "running"
+    }
+}
+    `.trim(),
+);
+window.chrome.app = {
+    ...STATIC_DATA,
+
+    get isInstalled() {
+        return false;
+    },
+
+    getDetails: function getDetails() {
+        if (arguments.length) {
+            throw makeError.ErrorInInvocation(`getDetails`);
+        }
+        return null;
+    },
+    getIsInstalled: function getDetails() {
+        if (arguments.length) {
+            throw makeError.ErrorInInvocation(`getIsInstalled`);
+        }
+        return false;
+    },
+    runningState: function getDetails() {
+        if (arguments.length) {
+            throw makeError.ErrorInInvocation(`runningState`);
+        }
+        return 'cannot_run';
+    },
+};
+// Check that the Navigation Timing API v1 is available, we need that
+if (!window.performance || !window.performance.timing) {
+    return;
+}
+const {timing} = window.performance;
+window.chrome.csi = function () {
+    return {
+        onloadT: timing.domContentLoadedEventEnd,
+        startE: timing.navigationStart,
+        pageT: Date.now() - timing.navigationStart,
+        tran: 15, // Transition type or something
+    };
+};
+if (!window.PerformancePaintTiming){
+    return;
+}
+const {performance} = window;
+// Some stuff is not available on about:blank as it requires a navigation to occur,
+// let's harden the code to not fail then:
+const ntEntryFallback = {
+    nextHopProtocol: 'h2',
+    type: 'other',
+};
+
+// The API exposes some funky info regarding the connection
+const protocolInfo = {
+    get connectionInfo() {
+        const ntEntry =
+            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
+        return ntEntry.nextHopProtocol;
+    },
+    get npnNegotiatedProtocol() {
+        // NPN is deprecated in favor of ALPN, but this implementation returns the
+        // HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
+        const ntEntry =
+            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
+        return ['h2', 'hq'].includes(ntEntry.nextHopProtocol)
+            ? ntEntry.nextHopProtocol
+            : 'unknown';
+    },
+    get navigationType() {
+        const ntEntry =
+            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
+        return ntEntry.type;
+    },
+    get wasAlternateProtocolAvailable() {
+        // The Alternate-Protocol header is deprecated in favor of Alt-Svc
+        // (https://www.mnot.net/blog/2016/03/09/alt-svc), so technically this
+        // should always return false.
+        return false;
+    },
+    get wasFetchedViaSpdy() {
+        // SPDY is deprecated in favor of HTTP/2, but this implementation returns
+        // true for HTTP/2 or HTTP2+QUIC/39 as well.
+        const ntEntry =
+            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
+        return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
+    },
+    get wasNpnNegotiated() {
+        // NPN is deprecated in favor of ALPN, but this implementation returns true
+        // for HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
+        const ntEntry =
+            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
+        return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
+    },
+};
+
+// Truncate number to specific number of decimals, most of the `loadTimes` stuff has 3
+function toFixed(num, fixed) {
+    var re = new RegExp('^-?\\d+(?:.\\d{0,' + (fixed || -1) + '})?');
+    return num.toString().match(re)[0];
+}
+
+const timingInfo = {
+    get firstPaintAfterLoadTime() {
+        // This was never actually implemented and always returns 0.
+        return 0;
+    },
+    get requestTime() {
+        return timing.navigationStart / 1000;
+    },
+    get startLoadTime() {
+        return timing.navigationStart / 1000;
+    },
+    get commitLoadTime() {
+        return timing.responseStart / 1000;
+    },
+    get finishDocumentLoadTime() {
+        return timing.domContentLoadedEventEnd / 1000;
+    },
+    get finishLoadTime() {
+        return timing.loadEventEnd / 1000;
+    },
+    get firstPaintTime() {
+        const fpEntry = performance.getEntriesByType('paint')[0] || {
+            startTime: timing.loadEventEnd / 1000, // Fallback if no navigation occurred (`about:blank`)
+        };
+        return toFixed(
+            (fpEntry.startTime + performance.timeOrigin) / 1000,
+            3,
+        );
+    },
+};
+
+window.chrome.loadTimes = function () {
+    return {
+        ...protocolInfo,
+        ...timingInfo,
+    };
+};
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/toolbelt/custom.py
@@ -12,15 +12,14 @@ from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callab
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-    def __init__(self, url: str, text: str, content: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, adaptor_arguments: Dict, encoding: str = 'utf-8'):
+    def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, encoding: str = 'utf-8', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
-        super().__init__(text=text, body=content, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
-
         self.status = status
         self.reason = reason
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
+        super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
         self.adaptor = self
 
@@ -31,7 +30,7 @@ class Response(Adaptor):
 class BaseFetcher:
     def __init__(
             self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
-            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
+            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
             automatch_domain: Optional[str] = None,
     ):
         """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/parser.py
@@ -32,6 +32,7 @@ class Adaptor(SelectorsGeneration):
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
             debug: Optional[bool] = True,
+            **kwargs
     ):
         """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
         with expressions in CSS, XPath, or with simply text. Check the docs for more info.
@@ -117,6 +118,10 @@ class Adaptor(SelectorsGeneration):
         self.__attributes = None
         self.__tag = None
         self.__debug = debug
+        # No need to check if all response attributes exist or not because if `status` exists, then the rest exist (Save some CPU cycles for speed)
+        self.__response_data = {
+            key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
+        } if hasattr(self, 'status') else {}
 
     # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
     @staticmethod
@@ -138,10 +143,14 @@ class Adaptor(SelectorsGeneration):
             return TextHandler(str(element))
         else:
             if issubclass(type(element), html.HtmlMixin):
+
                 return self.__class__(
-                    root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
+                    root=element,
+                    text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
+                    url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
                     keep_comments=True,  # if the comments are already removed in initialization, no need to try to delete them in sub-elements
-                    huge_tree=self.__huge_tree_enabled, debug=self.__debug
+                    huge_tree=self.__huge_tree_enabled, debug=self.__debug,
+                    **self.__response_data
                 )
             return element
 
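The practical effect of `__response_data` is that response attributes now follow sub-elements created during traversal. A hedged sketch of what that should look like, assuming selection helpers such as `css_first` rebuild elements through `self.__class__(...)` as shown above:

```python
# Hedged sketch: response attributes should now survive element traversal,
# since sub-elements are rebuilt with **self.__response_data (hunk above).
from scrapling.defaults import StealthyFetcher

page = StealthyFetcher.fetch('https://example.com')  # placeholder URL
product = page.css_first('.product')                 # placeholder selector
print(product.status, product.headers)               # carried over from `page`
```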
{scrapling-0.2.1 → scrapling-0.2.3/scrapling.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.1
+Version: 0.2.3
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
 Requires-Dist: playwright
 Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox>=0.3.9
+Requires-Dist: camoufox>=0.3.10
 Requires-Dist: browserforge
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -52,9 +52,9 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+>> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
->> page = StealthyFetcher().fetch('https://example.com', headless=True, network_idle=True)
+>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
 200
 >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
@@ -257,12 +257,21 @@ python -m browserforge update
 ```
 
 ## Fetching Websites Features
-All fetcher-type classes are imported in the same way
+You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
 
+If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
+```python
+from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+```
+then use it right away without initializing like:
+```python
+page = StealthyFetcher.fetch('https://example.com')
+```
+
 Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
@@ -803,6 +812,8 @@ Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its st
 
 ## More Sponsors!
 [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+
 
 ## Contributing
 Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/SOURCES.txt
@@ -4,6 +4,7 @@ README.md
 setup.cfg
 setup.py
 scrapling/__init__.py
+scrapling/defaults.py
 scrapling/fetchers.py
 scrapling/parser.py
 scrapling/py.typed
@@ -29,6 +30,13 @@ scrapling/engines/toolbelt/__init__.py
 scrapling/engines/toolbelt/custom.py
 scrapling/engines/toolbelt/fingerprints.py
 scrapling/engines/toolbelt/navigation.py
+scrapling/engines/toolbelt/bypasses/navigator_plugins.js
+scrapling/engines/toolbelt/bypasses/notification_permission.js
+scrapling/engines/toolbelt/bypasses/pdf_viewer.js
+scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js
+scrapling/engines/toolbelt/bypasses/screen_props.js
+scrapling/engines/toolbelt/bypasses/webdriver_fully.js
+scrapling/engines/toolbelt/bypasses/window_chrome.js
 tests/__init__.py
 tests/fetchers/__init__.py
 tests/fetchers/test_camoufox.py
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/requires.txt
@@ -7,5 +7,5 @@ tldextract
 httpx[brotli,zstd]
 playwright
 rebrowser-playwright
-camoufox>=0.3.9
+camoufox>=0.3.10
 browserforge
{scrapling-0.2.1 → scrapling-0.2.3}/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = scrapling
-version = 0.2.1
+version = 0.2.3
 author = Karim Shoair
 author_email = karim.shoair@pm.me
 description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
{scrapling-0.2.1 → scrapling-0.2.3}/setup.py
@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="scrapling",
-    version="0.2.1",
+    version="0.2.3",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
 impressive speed improvements over many popular scraping tools.""",
@@ -57,7 +57,7 @@ setup(
         'httpx[brotli,zstd]',
         'playwright',
         'rebrowser-playwright',
-        'camoufox>=0.3.9',
+        'camoufox>=0.3.10',
         'browserforge',
     ],
     python_requires=">=3.8",