scrapling 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +1 -1
- scrapling/engines/camo.py +8 -2
- scrapling/engines/pw.py +8 -2
- scrapling/engines/toolbelt/__init__.py +1 -0
- scrapling/engines/toolbelt/custom.py +79 -1
- {scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/METADATA +1 -1
- {scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/RECORD +10 -10
- {scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/LICENSE +0 -0
- {scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/WHEEL +0 -0
- {scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
|
|
4
4
|
from scrapling.core.custom_types import TextHandler, AttributesHandler
|
5
5
|
|
6
6
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
7
|
-
__version__ = "0.2.3"
|
7
|
+
__version__ = "0.2.4"
|
8
8
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
9
9
|
|
10
10
|
|
scrapling/engines/camo.py
CHANGED
@@ -4,6 +4,7 @@ from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
|
|
4
4
|
from scrapling.engines.toolbelt import (
|
5
5
|
Response,
|
6
6
|
do_nothing,
|
7
|
+
StatusText,
|
7
8
|
get_os_name,
|
8
9
|
intercept_route,
|
9
10
|
check_type_validity,
|
@@ -111,12 +112,17 @@ class CamoufoxEngine:
|
|
111
112
|
if 'charset=' in content_type.lower():
|
112
113
|
encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
|
113
114
|
|
115
|
+
status_text = res.status_text
|
116
|
+
# PlayWright API sometimes give empty status text for some reason!
|
117
|
+
if not status_text:
|
118
|
+
status_text = StatusText.get(res.status)
|
119
|
+
|
114
120
|
response = Response(
|
115
121
|
url=res.url,
|
116
122
|
text=page.content(),
|
117
|
-
body=
|
123
|
+
body=page.content().encode('utf-8'),
|
118
124
|
status=res.status,
|
119
|
-
reason=
|
125
|
+
reason=status_text,
|
120
126
|
encoding=encoding,
|
121
127
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
122
128
|
headers=res.all_headers(),
|
scrapling/engines/pw.py
CHANGED
@@ -6,6 +6,7 @@ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAUL
|
|
6
6
|
from scrapling.engines.toolbelt import (
|
7
7
|
Response,
|
8
8
|
do_nothing,
|
9
|
+
StatusText,
|
9
10
|
js_bypass_path,
|
10
11
|
intercept_route,
|
11
12
|
generate_headers,
|
@@ -221,12 +222,17 @@ class PlaywrightEngine:
|
|
221
222
|
if 'charset=' in content_type.lower():
|
222
223
|
encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
|
223
224
|
|
225
|
+
status_text = res.status_text
|
226
|
+
# PlayWright API sometimes give empty status text for some reason!
|
227
|
+
if not status_text:
|
228
|
+
status_text = StatusText.get(res.status)
|
229
|
+
|
224
230
|
response = Response(
|
225
231
|
url=res.url,
|
226
232
|
text=page.content(),
|
227
|
-
body=
|
233
|
+
body=page.content().encode('utf-8'),
|
228
234
|
status=res.status,
|
229
|
-
reason=
|
235
|
+
reason=status_text,
|
230
236
|
encoding=encoding,
|
231
237
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
232
238
|
headers=res.all_headers(),
|
@@ -4,8 +4,9 @@ Functions related to custom types or type checking
|
|
4
4
|
import inspect
|
5
5
|
import logging
|
6
6
|
|
7
|
-
from scrapling.core.
|
7
|
+
from scrapling.core.custom_types import MappingProxyType
|
8
8
|
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
9
|
+
from scrapling.core.utils import setup_basic_logging, cache
|
9
10
|
from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
|
10
11
|
|
11
12
|
|
@@ -67,6 +68,83 @@ class BaseFetcher:
|
|
67
68
|
self.adaptor_arguments.update({'automatch_domain': automatch_domain})
|
68
69
|
|
69
70
|
|
71
|
+
class StatusText:
|
72
|
+
"""A class that gets the status text of response status code.
|
73
|
+
|
74
|
+
Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
|
75
|
+
"""
|
76
|
+
_phrases = MappingProxyType({
|
77
|
+
100: "Continue",
|
78
|
+
101: "Switching Protocols",
|
79
|
+
102: "Processing",
|
80
|
+
103: "Early Hints",
|
81
|
+
200: "OK",
|
82
|
+
201: "Created",
|
83
|
+
202: "Accepted",
|
84
|
+
203: "Non-Authoritative Information",
|
85
|
+
204: "No Content",
|
86
|
+
205: "Reset Content",
|
87
|
+
206: "Partial Content",
|
88
|
+
207: "Multi-Status",
|
89
|
+
208: "Already Reported",
|
90
|
+
226: "IM Used",
|
91
|
+
300: "Multiple Choices",
|
92
|
+
301: "Moved Permanently",
|
93
|
+
302: "Found",
|
94
|
+
303: "See Other",
|
95
|
+
304: "Not Modified",
|
96
|
+
305: "Use Proxy",
|
97
|
+
307: "Temporary Redirect",
|
98
|
+
308: "Permanent Redirect",
|
99
|
+
400: "Bad Request",
|
100
|
+
401: "Unauthorized",
|
101
|
+
402: "Payment Required",
|
102
|
+
403: "Forbidden",
|
103
|
+
404: "Not Found",
|
104
|
+
405: "Method Not Allowed",
|
105
|
+
406: "Not Acceptable",
|
106
|
+
407: "Proxy Authentication Required",
|
107
|
+
408: "Request Timeout",
|
108
|
+
409: "Conflict",
|
109
|
+
410: "Gone",
|
110
|
+
411: "Length Required",
|
111
|
+
412: "Precondition Failed",
|
112
|
+
413: "Payload Too Large",
|
113
|
+
414: "URI Too Long",
|
114
|
+
415: "Unsupported Media Type",
|
115
|
+
416: "Range Not Satisfiable",
|
116
|
+
417: "Expectation Failed",
|
117
|
+
418: "I'm a teapot",
|
118
|
+
421: "Misdirected Request",
|
119
|
+
422: "Unprocessable Entity",
|
120
|
+
423: "Locked",
|
121
|
+
424: "Failed Dependency",
|
122
|
+
425: "Too Early",
|
123
|
+
426: "Upgrade Required",
|
124
|
+
428: "Precondition Required",
|
125
|
+
429: "Too Many Requests",
|
126
|
+
431: "Request Header Fields Too Large",
|
127
|
+
451: "Unavailable For Legal Reasons",
|
128
|
+
500: "Internal Server Error",
|
129
|
+
501: "Not Implemented",
|
130
|
+
502: "Bad Gateway",
|
131
|
+
503: "Service Unavailable",
|
132
|
+
504: "Gateway Timeout",
|
133
|
+
505: "HTTP Version Not Supported",
|
134
|
+
506: "Variant Also Negotiates",
|
135
|
+
507: "Insufficient Storage",
|
136
|
+
508: "Loop Detected",
|
137
|
+
510: "Not Extended",
|
138
|
+
511: "Network Authentication Required"
|
139
|
+
})
|
140
|
+
|
141
|
+
@classmethod
|
142
|
+
@cache(maxsize=128)
|
143
|
+
def get(cls, status_code: int) -> str:
|
144
|
+
"""Get the phrase for a given HTTP status code."""
|
145
|
+
return cls._phrases.get(status_code, "Unknown Status Code")
|
146
|
+
|
147
|
+
|
70
148
|
def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
|
71
149
|
"""This function check if the passed engine can be used by a Fetcher-type class or not.
|
72
150
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
scrapling/__init__.py,sha256=
|
1
|
+
scrapling/__init__.py,sha256=Tj_pDeN1yhufhlxQ0bY7Qnuxntq_JaqBUCQZrz01EFA,435
|
2
2
|
scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
|
3
3
|
scrapling/fetchers.py,sha256=_6mL7XSTZE1fHXBqbxE2bBHnlQP1lH-4MCiQHQd5hQs,16017
|
4
4
|
scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
|
@@ -11,12 +11,12 @@ scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHA
|
|
11
11
|
scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
|
12
12
|
scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
|
13
13
|
scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
|
14
|
-
scrapling/engines/camo.py,sha256=
|
14
|
+
scrapling/engines/camo.py,sha256=WJNDR3OY5LLqNHRMD4YbwuqUdnEZ8U-Et_1YUn6vDiw,7773
|
15
15
|
scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
|
16
|
-
scrapling/engines/pw.py,sha256=
|
16
|
+
scrapling/engines/pw.py,sha256=6iNdnNF9M45FJkazeCvFRicyTFD2EkxSISJJP__uOug,12345
|
17
17
|
scrapling/engines/static.py,sha256=Wsp6_-soZUQJT6kHoKPkLOdHU9J50chLdYxDmQjO4FQ,7101
|
18
|
-
scrapling/engines/toolbelt/__init__.py,sha256=
|
19
|
-
scrapling/engines/toolbelt/custom.py,sha256=
|
18
|
+
scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
|
19
|
+
scrapling/engines/toolbelt/custom.py,sha256=6Ip-9t2G8TaXLGLARQAEcbnFqvAN7AmgN1ah0glRiMs,9953
|
20
20
|
scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
|
21
21
|
scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
|
22
22
|
scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
|
@@ -34,8 +34,8 @@ tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-T
|
|
34
34
|
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
35
|
tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
|
36
36
|
tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
|
37
|
-
scrapling-0.2.
|
38
|
-
scrapling-0.2.
|
39
|
-
scrapling-0.2.
|
40
|
-
scrapling-0.2.
|
41
|
-
scrapling-0.2.
|
37
|
+
scrapling-0.2.4.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
38
|
+
scrapling-0.2.4.dist-info/METADATA,sha256=uOp98w2qzOGqE4ofFFG_TgWgZGrscQHWhmP49pfIV3s,64785
|
39
|
+
scrapling-0.2.4.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
40
|
+
scrapling-0.2.4.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
|
41
|
+
scrapling-0.2.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|