scrapling 0.2.3__tar.gz → 0.2.4__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {scrapling-0.2.3/scrapling.egg-info → scrapling-0.2.4}/PKG-INFO +1 -1
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/__init__.py +1 -1
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/camo.py +8 -2
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/pw.py +8 -2
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/__init__.py +1 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/custom.py +79 -1
- {scrapling-0.2.3 → scrapling-0.2.4/scrapling.egg-info}/PKG-INFO +1 -1
- {scrapling-0.2.3 → scrapling-0.2.4}/setup.cfg +1 -1
- {scrapling-0.2.3 → scrapling-0.2.4}/setup.py +1 -1
- {scrapling-0.2.3 → scrapling-0.2.4}/LICENSE +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/MANIFEST.in +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/README.md +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/core/__init__.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/core/_types.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/core/custom_types.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/core/mixins.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/core/storage_adaptors.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/core/translator.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/core/utils.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/defaults.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/__init__.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/constants.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/static.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/fingerprints.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/navigation.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/fetchers.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/parser.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling/py.typed +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling.egg-info/SOURCES.txt +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling.egg-info/dependency_links.txt +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling.egg-info/not-zip-safe +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling.egg-info/requires.txt +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/scrapling.egg-info/top_level.txt +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/tests/__init__.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/tests/fetchers/__init__.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/tests/fetchers/test_camoufox.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/tests/fetchers/test_httpx.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/tests/fetchers/test_playwright.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/tests/parser/__init__.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/tests/parser/test_automatch.py +0 -0
- {scrapling-0.2.3 → scrapling-0.2.4}/tests/parser/test_general.py +0 -0
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
|
|
4
4
|
from scrapling.core.custom_types import TextHandler, AttributesHandler
|
5
5
|
|
6
6
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
7
|
-
__version__ = "0.2.3"
|
7
|
+
__version__ = "0.2.4"
|
8
8
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
9
9
|
|
10
10
|
|
@@ -4,6 +4,7 @@ from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
|
|
4
4
|
from scrapling.engines.toolbelt import (
|
5
5
|
Response,
|
6
6
|
do_nothing,
|
7
|
+
StatusText,
|
7
8
|
get_os_name,
|
8
9
|
intercept_route,
|
9
10
|
check_type_validity,
|
@@ -111,12 +112,17 @@ class CamoufoxEngine:
|
|
111
112
|
if 'charset=' in content_type.lower():
|
112
113
|
encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
|
113
114
|
|
115
|
+
status_text = res.status_text
|
116
|
+
# PlayWright API sometimes give empty status text for some reason!
|
117
|
+
if not status_text:
|
118
|
+
status_text = StatusText.get(res.status)
|
119
|
+
|
114
120
|
response = Response(
|
115
121
|
url=res.url,
|
116
122
|
text=page.content(),
|
117
|
-
body=
|
123
|
+
body=page.content().encode('utf-8'),
|
118
124
|
status=res.status,
|
119
|
-
reason=
|
125
|
+
reason=status_text,
|
120
126
|
encoding=encoding,
|
121
127
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
122
128
|
headers=res.all_headers(),
|
@@ -6,6 +6,7 @@ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAUL
|
|
6
6
|
from scrapling.engines.toolbelt import (
|
7
7
|
Response,
|
8
8
|
do_nothing,
|
9
|
+
StatusText,
|
9
10
|
js_bypass_path,
|
10
11
|
intercept_route,
|
11
12
|
generate_headers,
|
@@ -221,12 +222,17 @@ class PlaywrightEngine:
|
|
221
222
|
if 'charset=' in content_type.lower():
|
222
223
|
encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
|
223
224
|
|
225
|
+
status_text = res.status_text
|
226
|
+
# PlayWright API sometimes give empty status text for some reason!
|
227
|
+
if not status_text:
|
228
|
+
status_text = StatusText.get(res.status)
|
229
|
+
|
224
230
|
response = Response(
|
225
231
|
url=res.url,
|
226
232
|
text=page.content(),
|
227
|
-
body=
|
233
|
+
body=page.content().encode('utf-8'),
|
228
234
|
status=res.status,
|
229
|
-
reason=
|
235
|
+
reason=status_text,
|
230
236
|
encoding=encoding,
|
231
237
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
232
238
|
headers=res.all_headers(),
|
@@ -4,8 +4,9 @@ Functions related to custom types or type checking
|
|
4
4
|
import inspect
|
5
5
|
import logging
|
6
6
|
|
7
|
-
from scrapling.core.
|
7
|
+
from scrapling.core.custom_types import MappingProxyType
|
8
8
|
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
9
|
+
from scrapling.core.utils import setup_basic_logging, cache
|
9
10
|
from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
|
10
11
|
|
11
12
|
|
@@ -67,6 +68,83 @@ class BaseFetcher:
|
|
67
68
|
self.adaptor_arguments.update({'automatch_domain': automatch_domain})
|
68
69
|
|
69
70
|
|
71
|
+
class StatusText:
|
72
|
+
"""A class that gets the status text of response status code.
|
73
|
+
|
74
|
+
Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
|
75
|
+
"""
|
76
|
+
_phrases = MappingProxyType({
|
77
|
+
100: "Continue",
|
78
|
+
101: "Switching Protocols",
|
79
|
+
102: "Processing",
|
80
|
+
103: "Early Hints",
|
81
|
+
200: "OK",
|
82
|
+
201: "Created",
|
83
|
+
202: "Accepted",
|
84
|
+
203: "Non-Authoritative Information",
|
85
|
+
204: "No Content",
|
86
|
+
205: "Reset Content",
|
87
|
+
206: "Partial Content",
|
88
|
+
207: "Multi-Status",
|
89
|
+
208: "Already Reported",
|
90
|
+
226: "IM Used",
|
91
|
+
300: "Multiple Choices",
|
92
|
+
301: "Moved Permanently",
|
93
|
+
302: "Found",
|
94
|
+
303: "See Other",
|
95
|
+
304: "Not Modified",
|
96
|
+
305: "Use Proxy",
|
97
|
+
307: "Temporary Redirect",
|
98
|
+
308: "Permanent Redirect",
|
99
|
+
400: "Bad Request",
|
100
|
+
401: "Unauthorized",
|
101
|
+
402: "Payment Required",
|
102
|
+
403: "Forbidden",
|
103
|
+
404: "Not Found",
|
104
|
+
405: "Method Not Allowed",
|
105
|
+
406: "Not Acceptable",
|
106
|
+
407: "Proxy Authentication Required",
|
107
|
+
408: "Request Timeout",
|
108
|
+
409: "Conflict",
|
109
|
+
410: "Gone",
|
110
|
+
411: "Length Required",
|
111
|
+
412: "Precondition Failed",
|
112
|
+
413: "Payload Too Large",
|
113
|
+
414: "URI Too Long",
|
114
|
+
415: "Unsupported Media Type",
|
115
|
+
416: "Range Not Satisfiable",
|
116
|
+
417: "Expectation Failed",
|
117
|
+
418: "I'm a teapot",
|
118
|
+
421: "Misdirected Request",
|
119
|
+
422: "Unprocessable Entity",
|
120
|
+
423: "Locked",
|
121
|
+
424: "Failed Dependency",
|
122
|
+
425: "Too Early",
|
123
|
+
426: "Upgrade Required",
|
124
|
+
428: "Precondition Required",
|
125
|
+
429: "Too Many Requests",
|
126
|
+
431: "Request Header Fields Too Large",
|
127
|
+
451: "Unavailable For Legal Reasons",
|
128
|
+
500: "Internal Server Error",
|
129
|
+
501: "Not Implemented",
|
130
|
+
502: "Bad Gateway",
|
131
|
+
503: "Service Unavailable",
|
132
|
+
504: "Gateway Timeout",
|
133
|
+
505: "HTTP Version Not Supported",
|
134
|
+
506: "Variant Also Negotiates",
|
135
|
+
507: "Insufficient Storage",
|
136
|
+
508: "Loop Detected",
|
137
|
+
510: "Not Extended",
|
138
|
+
511: "Network Authentication Required"
|
139
|
+
})
|
140
|
+
|
141
|
+
@classmethod
|
142
|
+
@cache(maxsize=128)
|
143
|
+
def get(cls, status_code: int) -> str:
|
144
|
+
"""Get the phrase for a given HTTP status code."""
|
145
|
+
return cls._phrases.get(status_code, "Unknown Status Code")
|
146
|
+
|
147
|
+
|
70
148
|
def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
|
71
149
|
"""This function check if the passed engine can be used by a Fetcher-type class or not.
|
72
150
|
|
@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|
6
6
|
|
7
7
|
setup(
|
8
8
|
name="scrapling",
|
9
|
-
version="0.2.3",
|
9
|
+
version="0.2.4",
|
10
10
|
description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
11
11
|
simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
|
12
12
|
impressive speed improvements over many popular scraping tools.""",
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js
RENAMED
File without changes
|
{scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/bypasses/notification_permission.js
RENAMED
File without changes
|
File without changes
|
{scrapling-0.2.3 → scrapling-0.2.4}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|