scrapling 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/engines/camo.py +8 -2
- scrapling/engines/pw.py +8 -2
- scrapling/engines/toolbelt/__init__.py +1 -0
- scrapling/engines/toolbelt/custom.py +79 -1
- {scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/METADATA +1 -1
- {scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/RECORD +10 -10
- {scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/LICENSE +0 -0
- {scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/WHEEL +0 -0
- {scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler

 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.3"
+__version__ = "0.2.4"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
scrapling/engines/camo.py
CHANGED
@@ -4,6 +4,7 @@ from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
 from scrapling.engines.toolbelt import (
     Response,
     do_nothing,
+    StatusText,
     get_os_name,
     intercept_route,
     check_type_validity,
@@ -111,12 +112,17 @@ class CamoufoxEngine:
         if 'charset=' in content_type.lower():
             encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()

+        status_text = res.status_text
+        # PlayWright API sometimes give empty status text for some reason!
+        if not status_text:
+            status_text = StatusText.get(res.status)
+
         response = Response(
             url=res.url,
             text=page.content(),
-            body=
+            body=page.content().encode('utf-8'),
             status=res.status,
-            reason=
+            reason=status_text,
             encoding=encoding,
             cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
             headers=res.all_headers(),
scrapling/engines/pw.py
CHANGED
@@ -6,6 +6,7 @@ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAUL
 from scrapling.engines.toolbelt import (
     Response,
     do_nothing,
+    StatusText,
     js_bypass_path,
     intercept_route,
     generate_headers,
@@ -221,12 +222,17 @@ class PlaywrightEngine:
         if 'charset=' in content_type.lower():
             encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()

+        status_text = res.status_text
+        # PlayWright API sometimes give empty status text for some reason!
+        if not status_text:
+            status_text = StatusText.get(res.status)
+
         response = Response(
             url=res.url,
             text=page.content(),
-            body=
+            body=page.content().encode('utf-8'),
             status=res.status,
-            reason=
+            reason=status_text,
             encoding=encoding,
             cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
             headers=res.all_headers(),
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -4,8 +4,9 @@ Functions related to custom types or type checking
 import inspect
 import logging

-from scrapling.core.
+from scrapling.core.custom_types import MappingProxyType
 from scrapling.parser import Adaptor, SQLiteStorageSystem
+from scrapling.core.utils import setup_basic_logging, cache
 from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable


@@ -67,6 +68,83 @@ class BaseFetcher:
         self.adaptor_arguments.update({'automatch_domain': automatch_domain})


+class StatusText:
+    """A class that gets the status text of response status code.
+
+    Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
+    """
+    _phrases = MappingProxyType({
+        100: "Continue",
+        101: "Switching Protocols",
+        102: "Processing",
+        103: "Early Hints",
+        200: "OK",
+        201: "Created",
+        202: "Accepted",
+        203: "Non-Authoritative Information",
+        204: "No Content",
+        205: "Reset Content",
+        206: "Partial Content",
+        207: "Multi-Status",
+        208: "Already Reported",
+        226: "IM Used",
+        300: "Multiple Choices",
+        301: "Moved Permanently",
+        302: "Found",
+        303: "See Other",
+        304: "Not Modified",
+        305: "Use Proxy",
+        307: "Temporary Redirect",
+        308: "Permanent Redirect",
+        400: "Bad Request",
+        401: "Unauthorized",
+        402: "Payment Required",
+        403: "Forbidden",
+        404: "Not Found",
+        405: "Method Not Allowed",
+        406: "Not Acceptable",
+        407: "Proxy Authentication Required",
+        408: "Request Timeout",
+        409: "Conflict",
+        410: "Gone",
+        411: "Length Required",
+        412: "Precondition Failed",
+        413: "Payload Too Large",
+        414: "URI Too Long",
+        415: "Unsupported Media Type",
+        416: "Range Not Satisfiable",
+        417: "Expectation Failed",
+        418: "I'm a teapot",
+        421: "Misdirected Request",
+        422: "Unprocessable Entity",
+        423: "Locked",
+        424: "Failed Dependency",
+        425: "Too Early",
+        426: "Upgrade Required",
+        428: "Precondition Required",
+        429: "Too Many Requests",
+        431: "Request Header Fields Too Large",
+        451: "Unavailable For Legal Reasons",
+        500: "Internal Server Error",
+        501: "Not Implemented",
+        502: "Bad Gateway",
+        503: "Service Unavailable",
+        504: "Gateway Timeout",
+        505: "HTTP Version Not Supported",
+        506: "Variant Also Negotiates",
+        507: "Insufficient Storage",
+        508: "Loop Detected",
+        510: "Not Extended",
+        511: "Network Authentication Required"
+    })
+
+    @classmethod
+    @cache(maxsize=128)
+    def get(cls, status_code: int) -> str:
+        """Get the phrase for a given HTTP status code."""
+        return cls._phrases.get(status_code, "Unknown Status Code")
+
+
 def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
     """This function check if the passed engine can be used by a Fetcher-type class or not.
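The new class is a read-only lookup table: MappingProxyType keeps the phrase map immutable, and the @cache(maxsize=128) wrapper from scrapling.core.utils memoizes repeated lookups. Typical use, assuming it is imported the same way the engines above do:

from scrapling.engines.toolbelt import StatusText

print(StatusText.get(200))  # "OK"
print(StatusText.get(418))  # "I'm a teapot"
print(StatusText.get(599))  # "Unknown Status Code" (not in the table)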
{scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=Tj_pDeN1yhufhlxQ0bY7Qnuxntq_JaqBUCQZrz01EFA,435
 scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
 scrapling/fetchers.py,sha256=_6mL7XSTZE1fHXBqbxE2bBHnlQP1lH-4MCiQHQd5hQs,16017
 scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
@@ -11,12 +11,12 @@ scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHA
 scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
 scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
 scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=WJNDR3OY5LLqNHRMD4YbwuqUdnEZ8U-Et_1YUn6vDiw,7773
 scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
-scrapling/engines/pw.py,sha256=
+scrapling/engines/pw.py,sha256=6iNdnNF9M45FJkazeCvFRicyTFD2EkxSISJJP__uOug,12345
 scrapling/engines/static.py,sha256=Wsp6_-soZUQJT6kHoKPkLOdHU9J50chLdYxDmQjO4FQ,7101
-scrapling/engines/toolbelt/__init__.py,sha256=
-scrapling/engines/toolbelt/custom.py,sha256=
+scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
+scrapling/engines/toolbelt/custom.py,sha256=6Ip-9t2G8TaXLGLARQAEcbnFqvAN7AmgN1ah0glRiMs,9953
 scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
 scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
 scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
@@ -34,8 +34,8 @@ tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-T
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
 tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
+scrapling-0.2.4.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.4.dist-info/METADATA,sha256=uOp98w2qzOGqE4ofFFG_TgWgZGrscQHWhmP49pfIV3s,64785
+scrapling-0.2.4.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+scrapling-0.2.4.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.4.dist-info/RECORD,,
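For reference, RECORD entries follow the wheel format path,hash,size, where the hash is the unpadded URL-safe base64 encoding of the file's SHA-256 digest. A rough sketch of how one of the lines above could be reproduced from an unpacked copy of the wheel (the path passed in is whatever local copy you have):

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # Build a "path,sha256=<urlsafe-b64 digest>,<size>" line as found in RECORD.
    data = Path(path).read_bytes()
    digest = hashlib.sha256(data).digest()
    b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={b64},{len(data)}"

print(record_entry("scrapling/__init__.py"))  # should match the 0.2.4 RECORD line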
{scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/LICENSE
File without changes
{scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/WHEEL
File without changes
{scrapling-0.2.3.dist-info → scrapling-0.2.4.dist-info}/top_level.txt
File without changes