scrapling 0.2.1.tar.gz → 0.2.3.tar.gz
- {scrapling-0.2.1 → scrapling-0.2.3}/MANIFEST.in +1 -0
- {scrapling-0.2.1/scrapling.egg-info → scrapling-0.2.3}/PKG-INFO +16 -5
- {scrapling-0.2.1 → scrapling-0.2.3}/README.md +14 -3
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/__init__.py +1 -1
- scrapling-0.2.3/scrapling/defaults.py +6 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/camo.py +2 -2
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/pw.py +2 -2
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/static.py +2 -2
- scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +40 -0
- scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/notification_permission.js +5 -0
- scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +5 -0
- scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -0
- scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/screen_props.js +27 -0
- scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +27 -0
- scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/window_chrome.js +213 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/toolbelt/custom.py +3 -4
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/parser.py +11 -2
- {scrapling-0.2.1 → scrapling-0.2.3/scrapling.egg-info}/PKG-INFO +16 -5
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/SOURCES.txt +8 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/requires.txt +1 -1
- {scrapling-0.2.1 → scrapling-0.2.3}/setup.cfg +1 -1
- {scrapling-0.2.1 → scrapling-0.2.3}/setup.py +2 -2
- {scrapling-0.2.1 → scrapling-0.2.3}/LICENSE +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/_types.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/custom_types.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/translator.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/core/utils.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/toolbelt/__init__.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/fetchers.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling/py.typed +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/tests/__init__.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/tests/fetchers/test_camoufox.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/tests/fetchers/test_httpx.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/tests/fetchers/test_playwright.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2.1 → scrapling-0.2.3}/tests/parser/test_general.py +0 -0
{scrapling-0.2.1/scrapling.egg-info → scrapling-0.2.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.1
+Version: 0.2.3
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
 Requires-Dist: playwright
 Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox>=0.3.
+Requires-Dist: camoufox>=0.3.10
 Requires-Dist: browserforge
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -52,9 +52,9 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+>> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
->> page = StealthyFetcher
+>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
 200
 >> products = page.css('.product', auto_save=True)  # Scrape data that survives website design changes!
@@ -257,12 +257,21 @@ python -m browserforge update
 ```
 
 ## Fetching Websites Features
-All fetcher-type classes are imported in the same way
+You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
 
+If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
+```python
+from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+```
+then use it right away without initializing like:
+```python
+page = StealthyFetcher.fetch('https://example.com')
+```
+
 Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
@@ -803,6 +812,8 @@ Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its st
 
 ## More Sponsors!
 [](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+
 
 ## Contributing
 Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
{scrapling-0.2.1 → scrapling-0.2.3}/README.md

@@ -6,9 +6,9 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+>> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
->> page = StealthyFetcher
+>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
 200
 >> products = page.css('.product', auto_save=True)  # Scrape data that survives website design changes!
@@ -211,12 +211,21 @@ python -m browserforge update
 ```
 
 ## Fetching Websites Features
-All fetcher-type classes are imported in the same way
+You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
 
+If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
+```python
+from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+```
+then use it right away without initializing like:
+```python
+page = StealthyFetcher.fetch('https://example.com')
+```
+
 Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
@@ -757,6 +766,8 @@ Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its st
 
 ## More Sponsors!
 [](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+
 
 ## Contributing
 Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
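The README hunks above show the two import styles side by side. A minimal sketch of both, with `https://example.com` as a placeholder target and only the init arguments the README itself lists:

```python
# Style 1: import the classes and configure the generated Adaptor at init time.
from scrapling import StealthyFetcher

fetcher = StealthyFetcher(auto_match=True, keep_comments=False)
page = fetcher.fetch('https://example.com', headless=True, network_idle=True)

# Style 2, new in this release: pre-initialized instances with default settings.
# (The README hunks spell the module `scrapling.default`; the file added in
# this release is `scrapling/defaults.py`.)
# from scrapling.default import StealthyFetcher
# page = StealthyFetcher.fetch('https://example.com')

print(page.status, page.reason)  # response attributes layered on top of Adaptor
products = page.css('.product')  # then query it like any Adaptor
```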
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/__init__.py

@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.1"
+__version__ = "0.2.3"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling-0.2.3/scrapling/defaults.py

@@ -0,0 +1,6 @@
+from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
+
+# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
+Fetcher = Fetcher()
+StealthyFetcher = StealthyFetcher()
+PlayWrightFetcher = PlayWrightFetcher()
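What the new defaults module does is plain name shadowing: after the import, each class name is rebound to an instance built with default arguments, so importers receive ready-to-use objects. A stripped-down sketch of the pattern with a toy stand-in class, not Scrapling's real one:

```python
class Fetcher:
    def __init__(self, auto_match: bool = True):
        self.auto_match = auto_match

    def fetch(self, url: str) -> str:
        return f'fetched {url} (auto_match={self.auto_match})'

# Rebinding the name shadows the class with a configured default instance,
# so `from this_module import Fetcher` hands callers the instance directly.
Fetcher = Fetcher()

print(Fetcher.fetch('https://example.com'))
```

The trade-off is that the class itself is no longer reachable through this module, which is why the original class-based import path stays available.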
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/camo.py

@@ -114,14 +114,14 @@ class CamoufoxEngine:
            response = Response(
                url=res.url,
                text=page.content(),
-
+                body=res.body(),
                status=res.status,
                reason=res.status_text,
                encoding=encoding,
                cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                headers=res.all_headers(),
                request_headers=res.request.all_headers(),
-
+                **self.adaptor_arguments
            )
            page.close()
 
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/pw.py

@@ -224,14 +224,14 @@ class PlaywrightEngine:
            response = Response(
                url=res.url,
                text=page.content(),
-
+                body=res.body(),
                status=res.status,
                reason=res.status_text,
                encoding=encoding,
                cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                headers=res.all_headers(),
                request_headers=res.request.all_headers(),
-
+                **self.adaptor_arguments
            )
            page.close()
            return response
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/static.py

@@ -53,14 +53,14 @@ class StaticEngine:
        return Response(
            url=str(response.url),
            text=response.text,
-
+            body=response.content,
            status=response.status_code,
            reason=response.reason_phrase,
            encoding=response.encoding or 'utf-8',
            cookies=dict(response.cookies),
            headers=dict(response.headers),
            request_headers=dict(response.request.headers),
-
+            **self.adaptor_arguments
        )
 
    def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
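The camo.py, pw.py, and static.py hunks make the same two changes in all three engines: the raw response bytes now reach `Response` as `body`, and the adaptor arguments stored at fetcher init are splatted into the call. A rough sketch of what the static engine now assembles, using httpx directly; the URL is a placeholder and `adaptor_arguments` stands in for whatever was captured at init:

```python
import httpx

adaptor_arguments = {'auto_match': True}  # captured when the fetcher was created
response = httpx.get('https://example.com')

response_kwargs = dict(
    url=str(response.url),
    text=response.text,
    body=response.content,                 # raw bytes, newly forwarded in 0.2.3
    status=response.status_code,
    reason=response.reason_phrase,
    encoding=response.encoding or 'utf-8',
    cookies=dict(response.cookies),
    headers=dict(response.headers),
    request_headers=dict(response.request.headers),
    **adaptor_arguments,                   # newly splatted through in 0.2.3
)
# Response(**response_kwargs) — the matching signature appears in the
# custom.py hunk further down.
```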
scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/navigator_plugins.js

@@ -0,0 +1,40 @@
+if(navigator.plugins.length == 0){
+    Object.defineProperty(navigator, 'plugins', {
+        get: () => {
+            const PDFViewerPlugin = Object.create(Plugin.prototype, {
+                description: { value: 'Portable Document Format', enumerable: false },
+                filename: { value: 'internal-pdf-viewer', enumerable: false },
+                name: { value: 'PDF Viewer', enumerable: false },
+            });
+            const ChromePDFViewer = Object.create(Plugin.prototype, {
+                description: { value: 'Portable Document Format', enumerable: false },
+                filename: { value: 'internal-pdf-viewer', enumerable: false },
+                name: { value: 'Chrome PDF Viewer', enumerable: false },
+            });
+            const ChromiumPDFViewer = Object.create(Plugin.prototype, {
+                description: { value: 'Portable Document Format', enumerable: false },
+                filename: { value: 'internal-pdf-viewer', enumerable: false },
+                name: { value: 'Chromium PDF Viewer', enumerable: false },
+            });
+            const EdgePDFViewer = Object.create(Plugin.prototype, {
+                description: { value: 'Portable Document Format', enumerable: false },
+                filename: { value: 'internal-pdf-viewer', enumerable: false },
+                name: { value: 'Microsoft Edge PDF Viewer', enumerable: false },
+            });
+            const WebKitPDFPlugin = Object.create(Plugin.prototype, {
+                description: { value: 'Portable Document Format', enumerable: false },
+                filename: { value: 'internal-pdf-viewer', enumerable: false },
+                name: { value: 'WebKit built-in PDF', enumerable: false },
+            });
+
+            return Object.create(PluginArray.prototype, {
+                length: { value: 5 },
+                0: { value: PDFViewerPlugin },
+                1: { value: ChromePDFViewer },
+                2: { value: ChromiumPDFViewer },
+                3: { value: EdgePDFViewer },
+                4: { value: WebKitPDFPlugin },
+            });
+        },
+    });
+}
scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/screen_props.js

@@ -0,0 +1,27 @@
+const windowScreenProps = {
+    // Dimensions
+    innerHeight: 0,
+    innerWidth: 0,
+    outerHeight: 754,
+    outerWidth: 1313,
+
+    // Position
+    screenX: 19,
+    pageXOffset: 0,
+    pageYOffset: 0,
+
+    // Display
+    devicePixelRatio: 2
+};
+
+try {
+    for (const [prop, value] of Object.entries(windowScreenProps)) {
+        if (value > 0) {
+            // The 0 values are introduced by collecting in the hidden iframe.
+            // They are document sizes anyway so no need to test them or inject them.
+            window[prop] = value;
+        }
+    }
+} catch (e) {
+    console.warn(e);
+};
scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/webdriver_fully.js

@@ -0,0 +1,27 @@
+// Create a function that looks like a native getter
+const nativeGetter = function get webdriver() {
+    return false;
+};
+
+// Copy over native function properties
+Object.defineProperties(nativeGetter, {
+    name: { value: 'get webdriver', configurable: true },
+    length: { value: 0, configurable: true },
+    toString: {
+        value: function() {
+            return `function get webdriver() { [native code] }`;
+        },
+        configurable: true
+    }
+});
+
+// Make it look native
+Object.setPrototypeOf(nativeGetter, Function.prototype);
+
+// Apply the modified descriptor
+Object.defineProperty(Navigator.prototype, 'webdriver', {
+    get: nativeGetter,
+    set: undefined,
+    enumerable: true,
+    configurable: true
+});
scrapling-0.2.3/scrapling/engines/toolbelt/bypasses/window_chrome.js

@@ -0,0 +1,213 @@
+// To escape `HEADCHR_CHROME_OBJ` test in headless mode => https://github.com/antoinevastel/fp-collect/blob/master/src/fpCollect.js#L322
+// Faking window.chrome fully
+
+if (!window.chrome) {
+    // First, save all existing properties
+    const originalKeys = Object.getOwnPropertyNames(window);
+    const tempObj = {};
+
+    // Recreate all properties in original order
+    for (const key of originalKeys) {
+        const descriptor = Object.getOwnPropertyDescriptor(window, key);
+        const value = window[key];
+        // delete window[key];
+        Object.defineProperty(tempObj, key, descriptor);
+    }
+
+    // Use the exact property descriptor found in headful Chrome
+    // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`
+    const mockChrome = {
+        loadTimes: {},
+        csi: {},
+        app: {
+            isInstalled: false
+        },
+        // Add other Chrome-specific properties
+    };
+
+    Object.defineProperty(tempObj, 'chrome', {
+        writable: true,
+        enumerable: true,
+        configurable: false,
+        value: mockChrome
+    });
+    for (const key of Object.getOwnPropertyNames(tempObj)) {
+        try {
+            Object.defineProperty(window, key,
+                Object.getOwnPropertyDescriptor(tempObj, key));
+        } catch (e) {}
+    };
+    // todo: solve this
+    // Using line below bypasses the hasHighChromeIndex test in creepjs ==> https://github.com/abrahamjuliot/creepjs/blob/master/src/headless/index.ts#L121
+    // Chrome object have to be in the end of the window properties
+    // Object.assign(window, tempObj);
+    // But makes window.chrome unreadable on 'https://bot.sannysoft.com/'
+}
+
+// That means we're running headful and don't need to mock anything
+if ('app' in window.chrome) {
+    return; // Nothing to do here
+}
+const makeError = {
+    ErrorInInvocation: fn => {
+        const err = new TypeError(`Error in invocation of app.${fn}()`);
+        return utils.stripErrorWithAnchor(
+            err,
+            `at ${fn} (eval at <anonymous>`,
+        );
+    },
+};
+// check with: `JSON.stringify(window.chrome['app'])`
+const STATIC_DATA = JSON.parse(
+    `
+{
+    "isInstalled": false,
+    "InstallState": {
+        "DISABLED": "disabled",
+        "INSTALLED": "installed",
+        "NOT_INSTALLED": "not_installed"
+    },
+    "RunningState": {
+        "CANNOT_RUN": "cannot_run",
+        "READY_TO_RUN": "ready_to_run",
+        "RUNNING": "running"
+    }
+}
+    `.trim(),
+);
+window.chrome.app = {
+    ...STATIC_DATA,
+
+    get isInstalled() {
+        return false;
+    },
+
+    getDetails: function getDetails() {
+        if (arguments.length) {
+            throw makeError.ErrorInInvocation(`getDetails`);
+        }
+        return null;
+    },
+    getIsInstalled: function getDetails() {
+        if (arguments.length) {
+            throw makeError.ErrorInInvocation(`getIsInstalled`);
+        }
+        return false;
+    },
+    runningState: function getDetails() {
+        if (arguments.length) {
+            throw makeError.ErrorInInvocation(`runningState`);
+        }
+        return 'cannot_run';
+    },
+};
+// Check that the Navigation Timing API v1 is available, we need that
+if (!window.performance || !window.performance.timing) {
+    return;
+}
+const {timing} = window.performance;
+window.chrome.csi = function () {
+    return {
+        onloadT: timing.domContentLoadedEventEnd,
+        startE: timing.navigationStart,
+        pageT: Date.now() - timing.navigationStart,
+        tran: 15, // Transition type or something
+    };
+};
+if (!window.PerformancePaintTiming){
+    return;
+}
+const {performance} = window;
+// Some stuff is not available on about:blank as it requires a navigation to occur,
+// let's harden the code to not fail then:
+const ntEntryFallback = {
+    nextHopProtocol: 'h2',
+    type: 'other',
+};
+
+// The API exposes some funky info regarding the connection
+const protocolInfo = {
+    get connectionInfo() {
+        const ntEntry =
+            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
+        return ntEntry.nextHopProtocol;
+    },
+    get npnNegotiatedProtocol() {
+        // NPN is deprecated in favor of ALPN, but this implementation returns the
+        // HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
+        const ntEntry =
+            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
+        return ['h2', 'hq'].includes(ntEntry.nextHopProtocol)
+            ? ntEntry.nextHopProtocol
+            : 'unknown';
+    },
+    get navigationType() {
+        const ntEntry =
+            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
+        return ntEntry.type;
+    },
+    get wasAlternateProtocolAvailable() {
+        // The Alternate-Protocol header is deprecated in favor of Alt-Svc
+        // (https://www.mnot.net/blog/2016/03/09/alt-svc), so technically this
+        // should always return false.
+        return false;
+    },
+    get wasFetchedViaSpdy() {
+        // SPDY is deprecated in favor of HTTP/2, but this implementation returns
+        // true for HTTP/2 or HTTP2+QUIC/39 as well.
+        const ntEntry =
+            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
+        return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
+    },
+    get wasNpnNegotiated() {
+        // NPN is deprecated in favor of ALPN, but this implementation returns true
+        // for HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
+        const ntEntry =
+            performance.getEntriesByType('navigation')[0] || ntEntryFallback;
+        return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
+    },
+};
+
+// Truncate number to specific number of decimals, most of the `loadTimes` stuff has 3
+function toFixed(num, fixed) {
+    var re = new RegExp('^-?\\d+(?:.\\d{0,' + (fixed || -1) + '})?');
+    return num.toString().match(re)[0];
+}
+
+const timingInfo = {
+    get firstPaintAfterLoadTime() {
+        // This was never actually implemented and always returns 0.
+        return 0;
+    },
+    get requestTime() {
+        return timing.navigationStart / 1000;
+    },
+    get startLoadTime() {
+        return timing.navigationStart / 1000;
+    },
+    get commitLoadTime() {
+        return timing.responseStart / 1000;
+    },
+    get finishDocumentLoadTime() {
+        return timing.domContentLoadedEventEnd / 1000;
+    },
+    get finishLoadTime() {
+        return timing.loadEventEnd / 1000;
+    },
+    get firstPaintTime() {
+        const fpEntry = performance.getEntriesByType('paint')[0] || {
+            startTime: timing.loadEventEnd / 1000, // Fallback if no navigation occured (`about:blank`)
+        };
+        return toFixed(
+            (fpEntry.startTime + performance.timeOrigin) / 1000,
+            3,
+        );
+    },
+};
+
+window.chrome.loadTimes = function () {
+    return {
+        ...protocolInfo,
+        ...timingInfo,
+    };
+};
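The seven bypass scripts are shipped as plain .js files inside the package. The usual Playwright mechanism for running such files is `add_init_script`, which executes them in every new document before any page script; the sketch below shows that mechanism only and is an assumption, not a copy of Scrapling's actual wiring (the path is illustrative):

```python
from pathlib import Path
from playwright.sync_api import sync_playwright

BYPASSES = Path('scrapling/engines/toolbelt/bypasses')  # illustrative location

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    for script in sorted(BYPASSES.glob('*.js')):
        # Runs the file in every new document, before the page's own scripts
        page.add_init_script(path=str(script))
    page.goto('https://example.com')
    browser.close()
```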
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/engines/toolbelt/custom.py

@@ -12,15 +12,14 @@ from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callab
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-    def __init__(self, url: str, text: str,
+    def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, encoding: str = 'utf-8', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
-        super().__init__(text=text, body=content, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
-
         self.status = status
         self.reason = reason
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
+        super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
         self.adaptor = self
 
@@ -31,7 +30,7 @@ class Response(Adaptor):
 class BaseFetcher:
     def __init__(
             self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
-            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] =
+            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
            automatch_domain: Optional[str] = None,
    ):
        """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
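The reordering in the `Response.__init__` hunk is load-bearing: `Adaptor.__init__` (parser.py hunk below) snapshots response attributes with a `hasattr(self, 'status')` check, so those attributes must exist before `super().__init__()` runs. A stripped-down illustration with toy classes, not the real ones:

```python
class Adaptor:
    def __init__(self):
        # Snapshot response data if the subclass set it before calling up
        self.response_data = {'status': self.status} if hasattr(self, 'status') else {}


class Response(Adaptor):
    def __init__(self, status: int):
        self.status = status  # set BEFORE super().__init__ ...
        super().__init__()    # ... so the hasattr check sees it


print(Response(200).response_data)  # -> {'status': 200}
```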
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling/parser.py

@@ -32,6 +32,7 @@ class Adaptor(SelectorsGeneration):
            storage: Any = SQLiteStorageSystem,
            storage_args: Optional[Dict] = None,
            debug: Optional[bool] = True,
+            **kwargs
    ):
        """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
        with expressions in CSS, XPath, or with simply text. Check the docs for more info.
@@ -117,6 +118,10 @@ class Adaptor(SelectorsGeneration):
         self.__attributes = None
         self.__tag = None
         self.__debug = debug
+        # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
+        self.__response_data = {
+            key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
+        } if hasattr(self, 'status') else {}
 
     # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
     @staticmethod
@@ -138,10 +143,14 @@ class Adaptor(SelectorsGeneration):
             return TextHandler(str(element))
         else:
             if issubclass(type(element), html.HtmlMixin):
+
                 return self.__class__(
-                    root=element,
+                    root=element,
+                    text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
+                    url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
                     keep_comments=True,  # if the comments are already removed in initialization, no need to try to delete them in sub-elements
-                    huge_tree=self.__huge_tree_enabled, debug=self.__debug
+                    huge_tree=self.__huge_tree_enabled, debug=self.__debug,
+                    **self.__response_data
                 )
             return element
 
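Net effect of the parser.py hunks: since `__response_data` is forwarded whenever a sub-`Adaptor` is built, elements selected from a fetched page should carry the response attributes as well. A hedged sketch (placeholder URL, module spelling per the README hunks, and it assumes the page actually has a `.product` element):

```python
from scrapling.default import Fetcher

page = Fetcher.get('https://example.com')
product = page.css_first('.product')

# The sub-element is built via self.__class__(..., **__response_data),
# so it exposes the same response attributes as the page itself:
print(product.status, product.reason)
```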
{scrapling-0.2.1 → scrapling-0.2.3/scrapling.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.1
+Version: 0.2.3
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
 Requires-Dist: playwright
 Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox>=0.3.
+Requires-Dist: camoufox>=0.3.10
 Requires-Dist: browserforge
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -52,9 +52,9 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+>> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
->> page = StealthyFetcher
+>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
 200
 >> products = page.css('.product', auto_save=True)  # Scrape data that survives website design changes!
@@ -257,12 +257,21 @@ python -m browserforge update
 ```
 
 ## Fetching Websites Features
-All fetcher-type classes are imported in the same way
+You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
 
+If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
+```python
+from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+```
+then use it right away without initializing like:
+```python
+page = StealthyFetcher.fetch('https://example.com')
+```
+
 Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
@@ -803,6 +812,8 @@ Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its st
 
 ## More Sponsors!
 [](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+
 
 ## Contributing
 Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
{scrapling-0.2.1 → scrapling-0.2.3}/scrapling.egg-info/SOURCES.txt

@@ -4,6 +4,7 @@ README.md
 setup.cfg
 setup.py
 scrapling/__init__.py
+scrapling/defaults.py
 scrapling/fetchers.py
 scrapling/parser.py
 scrapling/py.typed
@@ -29,6 +30,13 @@ scrapling/engines/toolbelt/__init__.py
 scrapling/engines/toolbelt/custom.py
 scrapling/engines/toolbelt/fingerprints.py
 scrapling/engines/toolbelt/navigation.py
+scrapling/engines/toolbelt/bypasses/navigator_plugins.js
+scrapling/engines/toolbelt/bypasses/notification_permission.js
+scrapling/engines/toolbelt/bypasses/pdf_viewer.js
+scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js
+scrapling/engines/toolbelt/bypasses/screen_props.js
+scrapling/engines/toolbelt/bypasses/webdriver_fully.js
+scrapling/engines/toolbelt/bypasses/window_chrome.js
 tests/__init__.py
 tests/fetchers/__init__.py
 tests/fetchers/test_camoufox.py
{scrapling-0.2.1 → scrapling-0.2.3}/setup.py

@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="scrapling",
-    version="0.2.1",
+    version="0.2.3",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
 impressive speed improvements over many popular scraping tools.""",
@@ -57,7 +57,7 @@ setup(
         'httpx[brotli,zstd]',
         'playwright',
         'rebrowser-playwright',
-        'camoufox>=0.3.
+        'camoufox>=0.3.10',
         'browserforge',
     ],
     python_requires=">=3.8",