scrapling 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/engines/camo.py +8 -2
- scrapling/engines/pw.py +8 -2
- scrapling/engines/toolbelt/__init__.py +1 -0
- scrapling/engines/toolbelt/bypasses/navigator_plugins.js +40 -0
- scrapling/engines/toolbelt/bypasses/notification_permission.js +5 -0
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +5 -0
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -0
- scrapling/engines/toolbelt/bypasses/screen_props.js +27 -0
- scrapling/engines/toolbelt/bypasses/webdriver_fully.js +27 -0
- scrapling/engines/toolbelt/bypasses/window_chrome.js +213 -0
- scrapling/engines/toolbelt/custom.py +79 -1
- {scrapling-0.2.2.dist-info → scrapling-0.2.4.dist-info}/METADATA +2 -2
- {scrapling-0.2.2.dist-info → scrapling-0.2.4.dist-info}/RECORD +17 -10
- {scrapling-0.2.2.dist-info → scrapling-0.2.4.dist-info}/LICENSE +0 -0
- {scrapling-0.2.2.dist-info → scrapling-0.2.4.dist-info}/WHEEL +0 -0
- {scrapling-0.2.2.dist-info → scrapling-0.2.4.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
|
|
4
4
|
from scrapling.core.custom_types import TextHandler, AttributesHandler
|
5
5
|
|
6
6
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
7
|
-
__version__ = "0.2.2"
|
7
|
+
__version__ = "0.2.4"
|
8
8
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
9
9
|
|
10
10
|
|
scrapling/engines/camo.py
CHANGED
@@ -4,6 +4,7 @@ from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
|
|
4
4
|
from scrapling.engines.toolbelt import (
|
5
5
|
Response,
|
6
6
|
do_nothing,
|
7
|
+
StatusText,
|
7
8
|
get_os_name,
|
8
9
|
intercept_route,
|
9
10
|
check_type_validity,
|
@@ -111,12 +112,17 @@ class CamoufoxEngine:
|
|
111
112
|
if 'charset=' in content_type.lower():
|
112
113
|
encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
|
113
114
|
|
115
|
+
status_text = res.status_text
|
116
|
+
# PlayWright API sometimes give empty status text for some reason!
|
117
|
+
if not status_text:
|
118
|
+
status_text = StatusText.get(res.status)
|
119
|
+
|
114
120
|
response = Response(
|
115
121
|
url=res.url,
|
116
122
|
text=page.content(),
|
117
|
-
body=
|
123
|
+
body=page.content().encode('utf-8'),
|
118
124
|
status=res.status,
|
119
|
-
reason=
|
125
|
+
reason=status_text,
|
120
126
|
encoding=encoding,
|
121
127
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
122
128
|
headers=res.all_headers(),
|
scrapling/engines/pw.py
CHANGED
@@ -6,6 +6,7 @@ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAUL
|
|
6
6
|
from scrapling.engines.toolbelt import (
|
7
7
|
Response,
|
8
8
|
do_nothing,
|
9
|
+
StatusText,
|
9
10
|
js_bypass_path,
|
10
11
|
intercept_route,
|
11
12
|
generate_headers,
|
@@ -221,12 +222,17 @@ class PlaywrightEngine:
|
|
221
222
|
if 'charset=' in content_type.lower():
|
222
223
|
encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
|
223
224
|
|
225
|
+
status_text = res.status_text
|
226
|
+
# PlayWright API sometimes give empty status text for some reason!
|
227
|
+
if not status_text:
|
228
|
+
status_text = StatusText.get(res.status)
|
229
|
+
|
224
230
|
response = Response(
|
225
231
|
url=res.url,
|
226
232
|
text=page.content(),
|
227
|
-
body=
|
233
|
+
body=page.content().encode('utf-8'),
|
228
234
|
status=res.status,
|
229
|
-
reason=
|
235
|
+
reason=status_text,
|
230
236
|
encoding=encoding,
|
231
237
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
232
238
|
headers=res.all_headers(),
|
@@ -0,0 +1,40 @@
|
|
1
|
+
if(navigator.plugins.length == 0){
|
2
|
+
Object.defineProperty(navigator, 'plugins', {
|
3
|
+
get: () => {
|
4
|
+
const PDFViewerPlugin = Object.create(Plugin.prototype, {
|
5
|
+
description: { value: 'Portable Document Format', enumerable: false },
|
6
|
+
filename: { value: 'internal-pdf-viewer', enumerable: false },
|
7
|
+
name: { value: 'PDF Viewer', enumerable: false },
|
8
|
+
});
|
9
|
+
const ChromePDFViewer = Object.create(Plugin.prototype, {
|
10
|
+
description: { value: 'Portable Document Format', enumerable: false },
|
11
|
+
filename: { value: 'internal-pdf-viewer', enumerable: false },
|
12
|
+
name: { value: 'Chrome PDF Viewer', enumerable: false },
|
13
|
+
});
|
14
|
+
const ChromiumPDFViewer = Object.create(Plugin.prototype, {
|
15
|
+
description: { value: 'Portable Document Format', enumerable: false },
|
16
|
+
filename: { value: 'internal-pdf-viewer', enumerable: false },
|
17
|
+
name: { value: 'Chromium PDF Viewer', enumerable: false },
|
18
|
+
});
|
19
|
+
const EdgePDFViewer = Object.create(Plugin.prototype, {
|
20
|
+
description: { value: 'Portable Document Format', enumerable: false },
|
21
|
+
filename: { value: 'internal-pdf-viewer', enumerable: false },
|
22
|
+
name: { value: 'Microsoft Edge PDF Viewer', enumerable: false },
|
23
|
+
});
|
24
|
+
const WebKitPDFPlugin = Object.create(Plugin.prototype, {
|
25
|
+
description: { value: 'Portable Document Format', enumerable: false },
|
26
|
+
filename: { value: 'internal-pdf-viewer', enumerable: false },
|
27
|
+
name: { value: 'WebKit built-in PDF', enumerable: false },
|
28
|
+
});
|
29
|
+
|
30
|
+
return Object.create(PluginArray.prototype, {
|
31
|
+
length: { value: 5 },
|
32
|
+
0: { value: PDFViewerPlugin },
|
33
|
+
1: { value: ChromePDFViewer },
|
34
|
+
2: { value: ChromiumPDFViewer },
|
35
|
+
3: { value: EdgePDFViewer },
|
36
|
+
4: { value: WebKitPDFPlugin },
|
37
|
+
});
|
38
|
+
},
|
39
|
+
});
|
40
|
+
}
|
@@ -0,0 +1,27 @@
|
|
1
|
+
const windowScreenProps = {
|
2
|
+
// Dimensions
|
3
|
+
innerHeight: 0,
|
4
|
+
innerWidth: 0,
|
5
|
+
outerHeight: 754,
|
6
|
+
outerWidth: 1313,
|
7
|
+
|
8
|
+
// Position
|
9
|
+
screenX: 19,
|
10
|
+
pageXOffset: 0,
|
11
|
+
pageYOffset: 0,
|
12
|
+
|
13
|
+
// Display
|
14
|
+
devicePixelRatio: 2
|
15
|
+
};
|
16
|
+
|
17
|
+
try {
|
18
|
+
for (const [prop, value] of Object.entries(windowScreenProps)) {
|
19
|
+
if (value > 0) {
|
20
|
+
// The 0 values are introduced by collecting in the hidden iframe.
|
21
|
+
// They are document sizes anyway so no need to test them or inject them.
|
22
|
+
window[prop] = value;
|
23
|
+
}
|
24
|
+
}
|
25
|
+
} catch (e) {
|
26
|
+
console.warn(e);
|
27
|
+
};
|
@@ -0,0 +1,27 @@
|
|
1
|
+
// Create a function that looks like a native getter
|
2
|
+
const nativeGetter = function get webdriver() {
|
3
|
+
return false;
|
4
|
+
};
|
5
|
+
|
6
|
+
// Copy over native function properties
|
7
|
+
Object.defineProperties(nativeGetter, {
|
8
|
+
name: { value: 'get webdriver', configurable: true },
|
9
|
+
length: { value: 0, configurable: true },
|
10
|
+
toString: {
|
11
|
+
value: function() {
|
12
|
+
return `function get webdriver() { [native code] }`;
|
13
|
+
},
|
14
|
+
configurable: true
|
15
|
+
}
|
16
|
+
});
|
17
|
+
|
18
|
+
// Make it look native
|
19
|
+
Object.setPrototypeOf(nativeGetter, Function.prototype);
|
20
|
+
|
21
|
+
// Apply the modified descriptor
|
22
|
+
Object.defineProperty(Navigator.prototype, 'webdriver', {
|
23
|
+
get: nativeGetter,
|
24
|
+
set: undefined,
|
25
|
+
enumerable: true,
|
26
|
+
configurable: true
|
27
|
+
});
|
@@ -0,0 +1,213 @@
|
|
1
|
+
// To escape `HEADCHR_CHROME_OBJ` test in headless mode => https://github.com/antoinevastel/fp-collect/blob/master/src/fpCollect.js#L322
|
2
|
+
// Faking window.chrome fully
|
3
|
+
|
4
|
+
if (!window.chrome) {
|
5
|
+
// First, save all existing properties
|
6
|
+
const originalKeys = Object.getOwnPropertyNames(window);
|
7
|
+
const tempObj = {};
|
8
|
+
|
9
|
+
// Recreate all properties in original order
|
10
|
+
for (const key of originalKeys) {
|
11
|
+
const descriptor = Object.getOwnPropertyDescriptor(window, key);
|
12
|
+
const value = window[key];
|
13
|
+
// delete window[key];
|
14
|
+
Object.defineProperty(tempObj, key, descriptor);
|
15
|
+
}
|
16
|
+
|
17
|
+
// Use the exact property descriptor found in headful Chrome
|
18
|
+
// fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`
|
19
|
+
const mockChrome = {
|
20
|
+
loadTimes: {},
|
21
|
+
csi: {},
|
22
|
+
app: {
|
23
|
+
isInstalled: false
|
24
|
+
},
|
25
|
+
// Add other Chrome-specific properties
|
26
|
+
};
|
27
|
+
|
28
|
+
Object.defineProperty(tempObj, 'chrome', {
|
29
|
+
writable: true,
|
30
|
+
enumerable: true,
|
31
|
+
configurable: false,
|
32
|
+
value: mockChrome
|
33
|
+
});
|
34
|
+
for (const key of Object.getOwnPropertyNames(tempObj)) {
|
35
|
+
try {
|
36
|
+
Object.defineProperty(window, key,
|
37
|
+
Object.getOwnPropertyDescriptor(tempObj, key));
|
38
|
+
} catch (e) {}
|
39
|
+
};
|
40
|
+
// todo: solve this
|
41
|
+
// Using line below bypasses the hasHighChromeIndex test in creepjs ==> https://github.com/abrahamjuliot/creepjs/blob/master/src/headless/index.ts#L121
|
42
|
+
// Chrome object have to be in the end of the window properties
|
43
|
+
// Object.assign(window, tempObj);
|
44
|
+
// But makes window.chrome unreadable on 'https://bot.sannysoft.com/'
|
45
|
+
}
|
46
|
+
|
47
|
+
// That means we're running headful and don't need to mock anything
|
48
|
+
if ('app' in window.chrome) {
|
49
|
+
return; // Nothing to do here
|
50
|
+
}
|
51
|
+
const makeError = {
|
52
|
+
ErrorInInvocation: fn => {
|
53
|
+
const err = new TypeError(`Error in invocation of app.${fn}()`);
|
54
|
+
return utils.stripErrorWithAnchor(
|
55
|
+
err,
|
56
|
+
`at ${fn} (eval at <anonymous>`,
|
57
|
+
);
|
58
|
+
},
|
59
|
+
};
|
60
|
+
// check with: `JSON.stringify(window.chrome['app'])`
|
61
|
+
const STATIC_DATA = JSON.parse(
|
62
|
+
`
|
63
|
+
{
|
64
|
+
"isInstalled": false,
|
65
|
+
"InstallState": {
|
66
|
+
"DISABLED": "disabled",
|
67
|
+
"INSTALLED": "installed",
|
68
|
+
"NOT_INSTALLED": "not_installed"
|
69
|
+
},
|
70
|
+
"RunningState": {
|
71
|
+
"CANNOT_RUN": "cannot_run",
|
72
|
+
"READY_TO_RUN": "ready_to_run",
|
73
|
+
"RUNNING": "running"
|
74
|
+
}
|
75
|
+
}
|
76
|
+
`.trim(),
|
77
|
+
);
|
78
|
+
window.chrome.app = {
|
79
|
+
...STATIC_DATA,
|
80
|
+
|
81
|
+
get isInstalled() {
|
82
|
+
return false;
|
83
|
+
},
|
84
|
+
|
85
|
+
getDetails: function getDetails() {
|
86
|
+
if (arguments.length) {
|
87
|
+
throw makeError.ErrorInInvocation(`getDetails`);
|
88
|
+
}
|
89
|
+
return null;
|
90
|
+
},
|
91
|
+
getIsInstalled: function getDetails() {
|
92
|
+
if (arguments.length) {
|
93
|
+
throw makeError.ErrorInInvocation(`getIsInstalled`);
|
94
|
+
}
|
95
|
+
return false;
|
96
|
+
},
|
97
|
+
runningState: function getDetails() {
|
98
|
+
if (arguments.length) {
|
99
|
+
throw makeError.ErrorInInvocation(`runningState`);
|
100
|
+
}
|
101
|
+
return 'cannot_run';
|
102
|
+
},
|
103
|
+
};
|
104
|
+
// Check that the Navigation Timing API v1 is available, we need that
|
105
|
+
if (!window.performance || !window.performance.timing) {
|
106
|
+
return;
|
107
|
+
}
|
108
|
+
const {timing} = window.performance;
|
109
|
+
window.chrome.csi = function () {
|
110
|
+
return {
|
111
|
+
onloadT: timing.domContentLoadedEventEnd,
|
112
|
+
startE: timing.navigationStart,
|
113
|
+
pageT: Date.now() - timing.navigationStart,
|
114
|
+
tran: 15, // Transition type or something
|
115
|
+
};
|
116
|
+
};
|
117
|
+
if (!window.PerformancePaintTiming){
|
118
|
+
return;
|
119
|
+
}
|
120
|
+
const {performance} = window;
|
121
|
+
// Some stuff is not available on about:blank as it requires a navigation to occur,
|
122
|
+
// let's harden the code to not fail then:
|
123
|
+
const ntEntryFallback = {
|
124
|
+
nextHopProtocol: 'h2',
|
125
|
+
type: 'other',
|
126
|
+
};
|
127
|
+
|
128
|
+
// The API exposes some funky info regarding the connection
|
129
|
+
const protocolInfo = {
|
130
|
+
get connectionInfo() {
|
131
|
+
const ntEntry =
|
132
|
+
performance.getEntriesByType('navigation')[0] || ntEntryFallback;
|
133
|
+
return ntEntry.nextHopProtocol;
|
134
|
+
},
|
135
|
+
get npnNegotiatedProtocol() {
|
136
|
+
// NPN is deprecated in favor of ALPN, but this implementation returns the
|
137
|
+
// HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
|
138
|
+
const ntEntry =
|
139
|
+
performance.getEntriesByType('navigation')[0] || ntEntryFallback;
|
140
|
+
return ['h2', 'hq'].includes(ntEntry.nextHopProtocol)
|
141
|
+
? ntEntry.nextHopProtocol
|
142
|
+
: 'unknown';
|
143
|
+
},
|
144
|
+
get navigationType() {
|
145
|
+
const ntEntry =
|
146
|
+
performance.getEntriesByType('navigation')[0] || ntEntryFallback;
|
147
|
+
return ntEntry.type;
|
148
|
+
},
|
149
|
+
get wasAlternateProtocolAvailable() {
|
150
|
+
// The Alternate-Protocol header is deprecated in favor of Alt-Svc
|
151
|
+
// (https://www.mnot.net/blog/2016/03/09/alt-svc), so technically this
|
152
|
+
// should always return false.
|
153
|
+
return false;
|
154
|
+
},
|
155
|
+
get wasFetchedViaSpdy() {
|
156
|
+
// SPDY is deprecated in favor of HTTP/2, but this implementation returns
|
157
|
+
// true for HTTP/2 or HTTP2+QUIC/39 as well.
|
158
|
+
const ntEntry =
|
159
|
+
performance.getEntriesByType('navigation')[0] || ntEntryFallback;
|
160
|
+
return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
|
161
|
+
},
|
162
|
+
get wasNpnNegotiated() {
|
163
|
+
// NPN is deprecated in favor of ALPN, but this implementation returns true
|
164
|
+
// for HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.
|
165
|
+
const ntEntry =
|
166
|
+
performance.getEntriesByType('navigation')[0] || ntEntryFallback;
|
167
|
+
return ['h2', 'hq'].includes(ntEntry.nextHopProtocol);
|
168
|
+
},
|
169
|
+
};
|
170
|
+
|
171
|
+
// Truncate number to specific number of decimals, most of the `loadTimes` stuff has 3
|
172
|
+
function toFixed(num, fixed) {
|
173
|
+
var re = new RegExp('^-?\\d+(?:.\\d{0,' + (fixed || -1) + '})?');
|
174
|
+
return num.toString().match(re)[0];
|
175
|
+
}
|
176
|
+
|
177
|
+
const timingInfo = {
|
178
|
+
get firstPaintAfterLoadTime() {
|
179
|
+
// This was never actually implemented and always returns 0.
|
180
|
+
return 0;
|
181
|
+
},
|
182
|
+
get requestTime() {
|
183
|
+
return timing.navigationStart / 1000;
|
184
|
+
},
|
185
|
+
get startLoadTime() {
|
186
|
+
return timing.navigationStart / 1000;
|
187
|
+
},
|
188
|
+
get commitLoadTime() {
|
189
|
+
return timing.responseStart / 1000;
|
190
|
+
},
|
191
|
+
get finishDocumentLoadTime() {
|
192
|
+
return timing.domContentLoadedEventEnd / 1000;
|
193
|
+
},
|
194
|
+
get finishLoadTime() {
|
195
|
+
return timing.loadEventEnd / 1000;
|
196
|
+
},
|
197
|
+
get firstPaintTime() {
|
198
|
+
const fpEntry = performance.getEntriesByType('paint')[0] || {
|
199
|
+
startTime: timing.loadEventEnd / 1000, // Fallback if no navigation occured (`about:blank`)
|
200
|
+
};
|
201
|
+
return toFixed(
|
202
|
+
(fpEntry.startTime + performance.timeOrigin) / 1000,
|
203
|
+
3,
|
204
|
+
);
|
205
|
+
},
|
206
|
+
};
|
207
|
+
|
208
|
+
window.chrome.loadTimes = function () {
|
209
|
+
return {
|
210
|
+
...protocolInfo,
|
211
|
+
...timingInfo,
|
212
|
+
};
|
213
|
+
};
|
@@ -4,8 +4,9 @@ Functions related to custom types or type checking
|
|
4
4
|
import inspect
|
5
5
|
import logging
|
6
6
|
|
7
|
-
from scrapling.core.
|
7
|
+
from scrapling.core.custom_types import MappingProxyType
|
8
8
|
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
9
|
+
from scrapling.core.utils import setup_basic_logging, cache
|
9
10
|
from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
|
10
11
|
|
11
12
|
|
@@ -67,6 +68,83 @@ class BaseFetcher:
|
|
67
68
|
self.adaptor_arguments.update({'automatch_domain': automatch_domain})
|
68
69
|
|
69
70
|
|
71
|
+
class StatusText:
|
72
|
+
"""A class that gets the status text of response status code.
|
73
|
+
|
74
|
+
Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
|
75
|
+
"""
|
76
|
+
_phrases = MappingProxyType({
|
77
|
+
100: "Continue",
|
78
|
+
101: "Switching Protocols",
|
79
|
+
102: "Processing",
|
80
|
+
103: "Early Hints",
|
81
|
+
200: "OK",
|
82
|
+
201: "Created",
|
83
|
+
202: "Accepted",
|
84
|
+
203: "Non-Authoritative Information",
|
85
|
+
204: "No Content",
|
86
|
+
205: "Reset Content",
|
87
|
+
206: "Partial Content",
|
88
|
+
207: "Multi-Status",
|
89
|
+
208: "Already Reported",
|
90
|
+
226: "IM Used",
|
91
|
+
300: "Multiple Choices",
|
92
|
+
301: "Moved Permanently",
|
93
|
+
302: "Found",
|
94
|
+
303: "See Other",
|
95
|
+
304: "Not Modified",
|
96
|
+
305: "Use Proxy",
|
97
|
+
307: "Temporary Redirect",
|
98
|
+
308: "Permanent Redirect",
|
99
|
+
400: "Bad Request",
|
100
|
+
401: "Unauthorized",
|
101
|
+
402: "Payment Required",
|
102
|
+
403: "Forbidden",
|
103
|
+
404: "Not Found",
|
104
|
+
405: "Method Not Allowed",
|
105
|
+
406: "Not Acceptable",
|
106
|
+
407: "Proxy Authentication Required",
|
107
|
+
408: "Request Timeout",
|
108
|
+
409: "Conflict",
|
109
|
+
410: "Gone",
|
110
|
+
411: "Length Required",
|
111
|
+
412: "Precondition Failed",
|
112
|
+
413: "Payload Too Large",
|
113
|
+
414: "URI Too Long",
|
114
|
+
415: "Unsupported Media Type",
|
115
|
+
416: "Range Not Satisfiable",
|
116
|
+
417: "Expectation Failed",
|
117
|
+
418: "I'm a teapot",
|
118
|
+
421: "Misdirected Request",
|
119
|
+
422: "Unprocessable Entity",
|
120
|
+
423: "Locked",
|
121
|
+
424: "Failed Dependency",
|
122
|
+
425: "Too Early",
|
123
|
+
426: "Upgrade Required",
|
124
|
+
428: "Precondition Required",
|
125
|
+
429: "Too Many Requests",
|
126
|
+
431: "Request Header Fields Too Large",
|
127
|
+
451: "Unavailable For Legal Reasons",
|
128
|
+
500: "Internal Server Error",
|
129
|
+
501: "Not Implemented",
|
130
|
+
502: "Bad Gateway",
|
131
|
+
503: "Service Unavailable",
|
132
|
+
504: "Gateway Timeout",
|
133
|
+
505: "HTTP Version Not Supported",
|
134
|
+
506: "Variant Also Negotiates",
|
135
|
+
507: "Insufficient Storage",
|
136
|
+
508: "Loop Detected",
|
137
|
+
510: "Not Extended",
|
138
|
+
511: "Network Authentication Required"
|
139
|
+
})
|
140
|
+
|
141
|
+
@classmethod
|
142
|
+
@cache(maxsize=128)
|
143
|
+
def get(cls, status_code: int) -> str:
|
144
|
+
"""Get the phrase for a given HTTP status code."""
|
145
|
+
return cls._phrases.get(status_code, "Unknown Status Code")
|
146
|
+
|
147
|
+
|
70
148
|
def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
|
71
149
|
"""This function check if the passed engine can be used by a Fetcher-type class or not.
|
72
150
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.2.2
|
3
|
+
Version: 0.2.4
|
4
4
|
Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
|
|
41
41
|
Requires-Dist: httpx[brotli,zstd]
|
42
42
|
Requires-Dist: playwright
|
43
43
|
Requires-Dist: rebrowser-playwright
|
44
|
-
Requires-Dist: camoufox >=0.3.
|
44
|
+
Requires-Dist: camoufox >=0.3.10
|
45
45
|
Requires-Dist: browserforge
|
46
46
|
|
47
47
|
# 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
|
@@ -1,4 +1,4 @@
|
|
1
|
-
scrapling/__init__.py,sha256=
|
1
|
+
scrapling/__init__.py,sha256=Tj_pDeN1yhufhlxQ0bY7Qnuxntq_JaqBUCQZrz01EFA,435
|
2
2
|
scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
|
3
3
|
scrapling/fetchers.py,sha256=_6mL7XSTZE1fHXBqbxE2bBHnlQP1lH-4MCiQHQd5hQs,16017
|
4
4
|
scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
|
@@ -11,14 +11,21 @@ scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHA
|
|
11
11
|
scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
|
12
12
|
scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
|
13
13
|
scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
|
14
|
-
scrapling/engines/camo.py,sha256=
|
14
|
+
scrapling/engines/camo.py,sha256=WJNDR3OY5LLqNHRMD4YbwuqUdnEZ8U-Et_1YUn6vDiw,7773
|
15
15
|
scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
|
16
|
-
scrapling/engines/pw.py,sha256=
|
16
|
+
scrapling/engines/pw.py,sha256=6iNdnNF9M45FJkazeCvFRicyTFD2EkxSISJJP__uOug,12345
|
17
17
|
scrapling/engines/static.py,sha256=Wsp6_-soZUQJT6kHoKPkLOdHU9J50chLdYxDmQjO4FQ,7101
|
18
|
-
scrapling/engines/toolbelt/__init__.py,sha256=
|
19
|
-
scrapling/engines/toolbelt/custom.py,sha256=
|
18
|
+
scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
|
19
|
+
scrapling/engines/toolbelt/custom.py,sha256=6Ip-9t2G8TaXLGLARQAEcbnFqvAN7AmgN1ah0glRiMs,9953
|
20
20
|
scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
|
21
21
|
scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
|
22
|
+
scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
|
23
|
+
scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
|
24
|
+
scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
|
25
|
+
scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
|
26
|
+
scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
|
27
|
+
scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
|
28
|
+
scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
|
22
29
|
tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
|
23
30
|
tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
|
24
31
|
tests/fetchers/test_camoufox.py,sha256=XPTCDZ9sj_GpCzXyvzKF_uZWhEYX6J_jh_BLeMEl8yY,2874
|
@@ -27,8 +34,8 @@ tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-T
|
|
27
34
|
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
35
|
tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
|
29
36
|
tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
|
30
|
-
scrapling-0.2.
|
31
|
-
scrapling-0.2.
|
32
|
-
scrapling-0.2.
|
33
|
-
scrapling-0.2.
|
34
|
-
scrapling-0.2.
|
37
|
+
scrapling-0.2.4.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
38
|
+
scrapling-0.2.4.dist-info/METADATA,sha256=uOp98w2qzOGqE4ofFFG_TgWgZGrscQHWhmP49pfIV3s,64785
|
39
|
+
scrapling-0.2.4.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
40
|
+
scrapling-0.2.4.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
|
41
|
+
scrapling-0.2.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|