scrapling 0.2.96__py3-none-any.whl → 0.2.98__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +35 -6
- scrapling/core/custom_types.py +1 -3
- scrapling/core/storage_adaptors.py +3 -3
- scrapling/core/translator.py +4 -1
- scrapling/core/utils.py +1 -1
- scrapling/defaults.py +18 -9
- scrapling/engines/camo.py +123 -104
- scrapling/engines/pw.py +100 -75
- scrapling/engines/static.py +22 -42
- scrapling/engines/toolbelt/custom.py +2 -2
- scrapling/engines/toolbelt/fingerprints.py +2 -2
- scrapling/engines/toolbelt/navigation.py +1 -1
- scrapling/fetchers.py +24 -24
- scrapling/parser.py +6 -12
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/METADATA +23 -22
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/RECORD +20 -20
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/WHEEL +1 -1
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/LICENSE +0 -0
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.96.dist-info → scrapling-0.2.98.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -1,12 +1,41 @@
|
|
1
|
-
# Declare top-level shortcuts
|
2
|
-
from scrapling.core.custom_types import AttributesHandler, TextHandler
|
3
|
-
from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
|
4
|
-
PlayWrightFetcher, StealthyFetcher)
|
5
|
-
from scrapling.parser import Adaptor, Adaptors
|
6
1
|
|
7
2
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
8
|
-
__version__ = "0.2.
|
3
|
+
__version__ = "0.2.98"
|
9
4
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
10
5
|
|
11
6
|
|
7
|
+
# A lightweight approach to create a lazy loader for each import, kept for backward compatibility
|
8
|
+
# This reduces the initial memory footprint significantly (only loads what's used)
|
9
|
+
def __getattr__(name):
|
10
|
+
if name == 'Fetcher':
|
11
|
+
from scrapling.fetchers import Fetcher as cls
|
12
|
+
return cls
|
13
|
+
elif name == 'Adaptor':
|
14
|
+
from scrapling.parser import Adaptor as cls
|
15
|
+
return cls
|
16
|
+
elif name == 'Adaptors':
|
17
|
+
from scrapling.parser import Adaptors as cls
|
18
|
+
return cls
|
19
|
+
elif name == 'AttributesHandler':
|
20
|
+
from scrapling.core.custom_types import AttributesHandler as cls
|
21
|
+
return cls
|
22
|
+
elif name == 'TextHandler':
|
23
|
+
from scrapling.core.custom_types import TextHandler as cls
|
24
|
+
return cls
|
25
|
+
elif name == 'AsyncFetcher':
|
26
|
+
from scrapling.fetchers import AsyncFetcher as cls
|
27
|
+
return cls
|
28
|
+
elif name == 'StealthyFetcher':
|
29
|
+
from scrapling.fetchers import StealthyFetcher as cls
|
30
|
+
return cls
|
31
|
+
elif name == 'PlayWrightFetcher':
|
32
|
+
from scrapling.fetchers import PlayWrightFetcher as cls
|
33
|
+
return cls
|
34
|
+
elif name == 'CustomFetcher':
|
35
|
+
from scrapling.fetchers import CustomFetcher as cls
|
36
|
+
return cls
|
37
|
+
else:
|
38
|
+
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
39
|
+
|
40
|
+
|
12
41
|
__all__ = ['Adaptor', 'Fetcher', 'AsyncFetcher', 'StealthyFetcher', 'PlayWrightFetcher']
|
scrapling/core/custom_types.py
CHANGED
@@ -19,9 +19,7 @@ class TextHandler(str):
|
|
19
19
|
__slots__ = ()
|
20
20
|
|
21
21
|
def __new__(cls, string):
|
22
|
-
|
23
|
-
return super().__new__(cls, string)
|
24
|
-
return super().__new__(cls, '')
|
22
|
+
return super().__new__(cls, str(string))
|
25
23
|
|
26
24
|
def __getitem__(self, key: Union[SupportsIndex, slice]) -> "TextHandler":
|
27
25
|
lst = super().__getitem__(key)
|
@@ -19,7 +19,7 @@ class StorageSystemMixin(ABC):
|
|
19
19
|
"""
|
20
20
|
self.url = url
|
21
21
|
|
22
|
-
@lru_cache(
|
22
|
+
@lru_cache(64, typed=True)
|
23
23
|
def _get_base_url(self, default_value: str = 'default') -> str:
|
24
24
|
if not self.url or type(self.url) is not str:
|
25
25
|
return default_value
|
@@ -51,7 +51,7 @@ class StorageSystemMixin(ABC):
|
|
51
51
|
raise NotImplementedError('Storage system must implement `save` method')
|
52
52
|
|
53
53
|
@staticmethod
|
54
|
-
@lru_cache(
|
54
|
+
@lru_cache(128, typed=True)
|
55
55
|
def _get_hash(identifier: str) -> str:
|
56
56
|
"""If you want to hash identifier in your storage system, use this safer"""
|
57
57
|
identifier = identifier.lower().strip()
|
@@ -63,7 +63,7 @@ class StorageSystemMixin(ABC):
|
|
63
63
|
return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
|
64
64
|
|
65
65
|
|
66
|
-
@lru_cache(
|
66
|
+
@lru_cache(1, typed=True)
|
67
67
|
class SQLiteStorageSystem(StorageSystemMixin):
|
68
68
|
"""The recommended system to use, it's race condition safe and thread safe.
|
69
69
|
Mainly built so the library can run in threaded frameworks like scrapy or threaded tools
|
scrapling/core/translator.py
CHANGED
@@ -139,6 +139,9 @@ class TranslatorMixin:
|
|
139
139
|
|
140
140
|
|
141
141
|
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
|
142
|
-
@lru_cache(maxsize=
|
142
|
+
@lru_cache(maxsize=256)
|
143
143
|
def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
|
144
144
|
return super().css_to_xpath(css, prefix)
|
145
|
+
|
146
|
+
|
147
|
+
translator_instance = HTMLTranslator()
|
scrapling/core/utils.py
CHANGED
scrapling/defaults.py
CHANGED
@@ -1,10 +1,19 @@
|
|
1
|
-
from .fetchers import AsyncFetcher as _AsyncFetcher
|
2
|
-
from .fetchers import Fetcher as _Fetcher
|
3
|
-
from .fetchers import PlayWrightFetcher as _PlayWrightFetcher
|
4
|
-
from .fetchers import StealthyFetcher as _StealthyFetcher
|
5
|
-
|
6
1
|
# If you are going to use Fetchers with the default settings, import them from this file instead for cleaner-looking code
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
2
|
+
|
3
|
+
# A lightweight approach to create a lazy loader for each import, kept for backward compatibility
|
4
|
+
# This reduces the initial memory footprint significantly (only loads what's used)
|
5
|
+
def __getattr__(name):
|
6
|
+
if name == 'Fetcher':
|
7
|
+
from scrapling.fetchers import Fetcher as cls
|
8
|
+
return cls()
|
9
|
+
elif name == 'AsyncFetcher':
|
10
|
+
from scrapling.fetchers import AsyncFetcher as cls
|
11
|
+
return cls()
|
12
|
+
elif name == 'StealthyFetcher':
|
13
|
+
from scrapling.fetchers import StealthyFetcher as cls
|
14
|
+
return cls()
|
15
|
+
elif name == 'PlayWrightFetcher':
|
16
|
+
from scrapling.fetchers import PlayWrightFetcher as cls
|
17
|
+
return cls()
|
18
|
+
else:
|
19
|
+
raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
|
scrapling/engines/camo.py
CHANGED
@@ -15,12 +15,12 @@ from scrapling.engines.toolbelt import (Response, StatusText,
|
|
15
15
|
|
16
16
|
class CamoufoxEngine:
|
17
17
|
def __init__(
|
18
|
-
self, headless:
|
19
|
-
block_webrtc:
|
18
|
+
self, headless: Union[bool, Literal['virtual']] = True, block_images: bool = False, disable_resources: bool = False,
|
19
|
+
block_webrtc: bool = False, allow_webgl: bool = True, network_idle: bool = False, humanize: Union[bool, float] = True,
|
20
20
|
timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
|
21
|
-
wait_selector_state:
|
22
|
-
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize:
|
23
|
-
geoip:
|
21
|
+
wait_selector_state: SelectorWaitStates = 'attached', google_search: bool = True, extra_headers: Optional[Dict[str, str]] = None,
|
22
|
+
proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: bool = False, disable_ads: bool = False,
|
23
|
+
geoip: bool = False,
|
24
24
|
adaptor_arguments: Dict = None,
|
25
25
|
):
|
26
26
|
"""An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
|
@@ -64,107 +64,140 @@ class CamoufoxEngine:
|
|
64
64
|
self.addons = addons or []
|
65
65
|
self.humanize = humanize
|
66
66
|
self.timeout = check_type_validity(timeout, [int, float], 30000)
|
67
|
+
|
68
|
+
# Page action callable validation
|
69
|
+
self.page_action = None
|
67
70
|
if page_action is not None:
|
68
71
|
if callable(page_action):
|
69
72
|
self.page_action = page_action
|
70
73
|
else:
|
71
|
-
self.page_action = None
|
72
74
|
log.error('[Ignored] Argument "page_action" must be callable')
|
73
|
-
else:
|
74
|
-
self.page_action = None
|
75
75
|
|
76
76
|
self.wait_selector = wait_selector
|
77
77
|
self.wait_selector_state = wait_selector_state
|
78
78
|
self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
|
79
79
|
|
80
|
+
def _get_camoufox_options(self):
|
81
|
+
"""Return consistent browser options dictionary for both sync and async methods"""
|
82
|
+
return {
|
83
|
+
"geoip": self.geoip,
|
84
|
+
"proxy": self.proxy,
|
85
|
+
"enable_cache": True,
|
86
|
+
"addons": self.addons,
|
87
|
+
"exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
|
88
|
+
"headless": self.headless,
|
89
|
+
"humanize": self.humanize,
|
90
|
+
"i_know_what_im_doing": True, # To turn warnings off with the user configurations
|
91
|
+
"allow_webgl": self.allow_webgl,
|
92
|
+
"block_webrtc": self.block_webrtc,
|
93
|
+
"block_images": self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
|
94
|
+
"os": None if self.os_randomize else get_os_name(),
|
95
|
+
}
|
96
|
+
|
97
|
+
def _process_response_history(self, first_response):
|
98
|
+
"""Process response history to build a list of Response objects"""
|
99
|
+
history = []
|
100
|
+
current_request = first_response.request.redirected_from
|
101
|
+
|
102
|
+
try:
|
103
|
+
while current_request:
|
104
|
+
try:
|
105
|
+
current_response = current_request.response()
|
106
|
+
history.insert(0, Response(
|
107
|
+
url=current_request.url,
|
108
|
+
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
109
|
+
text='',
|
110
|
+
body=b'',
|
111
|
+
status=current_response.status if current_response else 301,
|
112
|
+
reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
|
113
|
+
encoding=current_response.headers.get('content-type', '') or 'utf-8',
|
114
|
+
cookies={},
|
115
|
+
headers=current_response.all_headers() if current_response else {},
|
116
|
+
request_headers=current_request.all_headers(),
|
117
|
+
**self.adaptor_arguments
|
118
|
+
))
|
119
|
+
except Exception as e:
|
120
|
+
log.error(f"Error processing redirect: {e}")
|
121
|
+
break
|
122
|
+
|
123
|
+
current_request = current_request.redirected_from
|
124
|
+
except Exception as e:
|
125
|
+
log.error(f"Error processing response history: {e}")
|
126
|
+
|
127
|
+
return history
|
128
|
+
|
80
129
|
def fetch(self, url: str) -> Response:
|
81
130
|
"""Opens up the browser and performs your request based on your chosen options.
|
82
131
|
|
83
132
|
:param url: Target url.
|
84
133
|
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
85
134
|
"""
|
86
|
-
addons = [] if self.disable_ads else [DefaultAddons.UBO]
|
87
|
-
# Store the final response
|
88
135
|
final_response = None
|
136
|
+
referer = generate_convincing_referer(url) if self.google_search else None
|
89
137
|
|
90
138
|
def handle_response(finished_response):
|
91
139
|
nonlocal final_response
|
92
140
|
if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
|
93
141
|
final_response = finished_response
|
94
142
|
|
95
|
-
with Camoufox(
|
96
|
-
|
97
|
-
|
98
|
-
enable_cache=True,
|
99
|
-
addons=self.addons,
|
100
|
-
exclude_addons=addons,
|
101
|
-
headless=self.headless,
|
102
|
-
humanize=self.humanize,
|
103
|
-
i_know_what_im_doing=True, # To turn warnings off with the user configurations
|
104
|
-
allow_webgl=self.allow_webgl,
|
105
|
-
block_webrtc=self.block_webrtc,
|
106
|
-
block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
|
107
|
-
os=None if self.os_randomize else get_os_name(),
|
108
|
-
) as browser:
|
109
|
-
page = browser.new_page()
|
143
|
+
with Camoufox(**self._get_camoufox_options()) as browser:
|
144
|
+
context = browser.new_context()
|
145
|
+
page = context.new_page()
|
110
146
|
page.set_default_navigation_timeout(self.timeout)
|
111
147
|
page.set_default_timeout(self.timeout)
|
112
|
-
# Listen for all responses
|
113
148
|
page.on("response", handle_response)
|
149
|
+
|
114
150
|
if self.disable_resources:
|
115
151
|
page.route("**/*", intercept_route)
|
116
152
|
|
117
153
|
if self.extra_headers:
|
118
154
|
page.set_extra_http_headers(self.extra_headers)
|
119
155
|
|
120
|
-
first_response = page.goto(url, referer=
|
156
|
+
first_response = page.goto(url, referer=referer)
|
121
157
|
page.wait_for_load_state(state="domcontentloaded")
|
158
|
+
|
122
159
|
if self.network_idle:
|
123
160
|
page.wait_for_load_state('networkidle')
|
124
161
|
|
125
162
|
if self.page_action is not None:
|
126
|
-
|
163
|
+
try:
|
164
|
+
page = self.page_action(page)
|
165
|
+
except Exception as e:
|
166
|
+
log.error(f"Error executing page_action: {e}")
|
127
167
|
|
128
168
|
if self.wait_selector and type(self.wait_selector) is str:
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
169
|
+
try:
|
170
|
+
waiter = page.locator(self.wait_selector)
|
171
|
+
waiter.first.wait_for(state=self.wait_selector_state)
|
172
|
+
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
173
|
+
page.wait_for_load_state(state="load")
|
174
|
+
page.wait_for_load_state(state="domcontentloaded")
|
175
|
+
if self.network_idle:
|
176
|
+
page.wait_for_load_state('networkidle')
|
177
|
+
except Exception as e:
|
178
|
+
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
136
179
|
|
137
180
|
# In case we didn't catch a document type somehow
|
138
181
|
final_response = final_response if final_response else first_response
|
182
|
+
if not final_response:
|
183
|
+
raise ValueError("Failed to get a response from the page")
|
184
|
+
|
139
185
|
# This will be parsed inside `Response`
|
140
186
|
encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
|
141
187
|
# The PlayWright API sometimes gives an empty status text for some reason!
|
142
188
|
status_text = final_response.status_text or StatusText.get(final_response.status)
|
143
189
|
|
144
|
-
history =
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
url=current_request.url,
|
151
|
-
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
152
|
-
text='',
|
153
|
-
body=b'',
|
154
|
-
status=current_response.status if current_response else 301,
|
155
|
-
reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
|
156
|
-
encoding=current_response.headers.get('content-type', '') or 'utf-8',
|
157
|
-
cookies={},
|
158
|
-
headers=current_response.all_headers() if current_response else {},
|
159
|
-
request_headers=current_request.all_headers(),
|
160
|
-
**self.adaptor_arguments
|
161
|
-
))
|
162
|
-
current_request = current_request.redirected_from
|
190
|
+
history = self._process_response_history(first_response)
|
191
|
+
try:
|
192
|
+
page_content = page.content()
|
193
|
+
except Exception as e:
|
194
|
+
log.error(f"Error getting page content: {e}")
|
195
|
+
page_content = ""
|
163
196
|
|
164
197
|
response = Response(
|
165
198
|
url=page.url,
|
166
|
-
text=
|
167
|
-
body=
|
199
|
+
text=page_content,
|
200
|
+
body=page_content.encode('utf-8'),
|
168
201
|
status=final_response.status,
|
169
202
|
reason=status_text,
|
170
203
|
encoding=encoding,
|
@@ -175,6 +208,7 @@ class CamoufoxEngine:
|
|
175
208
|
**self.adaptor_arguments
|
176
209
|
)
|
177
210
|
page.close()
|
211
|
+
context.close()
|
178
212
|
|
179
213
|
return response
|
180
214
|
|
@@ -184,88 +218,72 @@ class CamoufoxEngine:
|
|
184
218
|
:param url: Target url.
|
185
219
|
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
186
220
|
"""
|
187
|
-
addons = [] if self.disable_ads else [DefaultAddons.UBO]
|
188
|
-
# Store the final response
|
189
221
|
final_response = None
|
222
|
+
referer = generate_convincing_referer(url) if self.google_search else None
|
190
223
|
|
191
224
|
async def handle_response(finished_response):
|
192
225
|
nonlocal final_response
|
193
226
|
if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request():
|
194
227
|
final_response = finished_response
|
195
228
|
|
196
|
-
async with AsyncCamoufox(
|
197
|
-
|
198
|
-
|
199
|
-
enable_cache=True,
|
200
|
-
addons=self.addons,
|
201
|
-
exclude_addons=addons,
|
202
|
-
headless=self.headless,
|
203
|
-
humanize=self.humanize,
|
204
|
-
i_know_what_im_doing=True, # To turn warnings off with the user configurations
|
205
|
-
allow_webgl=self.allow_webgl,
|
206
|
-
block_webrtc=self.block_webrtc,
|
207
|
-
block_images=self.block_images, # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
|
208
|
-
os=None if self.os_randomize else get_os_name(),
|
209
|
-
) as browser:
|
210
|
-
page = await browser.new_page()
|
229
|
+
async with AsyncCamoufox(**self._get_camoufox_options()) as browser:
|
230
|
+
context = await browser.new_context()
|
231
|
+
page = await context.new_page()
|
211
232
|
page.set_default_navigation_timeout(self.timeout)
|
212
233
|
page.set_default_timeout(self.timeout)
|
213
|
-
# Listen for all responses
|
214
234
|
page.on("response", handle_response)
|
235
|
+
|
215
236
|
if self.disable_resources:
|
216
237
|
await page.route("**/*", async_intercept_route)
|
217
238
|
|
218
239
|
if self.extra_headers:
|
219
240
|
await page.set_extra_http_headers(self.extra_headers)
|
220
241
|
|
221
|
-
first_response = await page.goto(url, referer=
|
242
|
+
first_response = await page.goto(url, referer=referer)
|
222
243
|
await page.wait_for_load_state(state="domcontentloaded")
|
244
|
+
|
223
245
|
if self.network_idle:
|
224
246
|
await page.wait_for_load_state('networkidle')
|
225
247
|
|
226
248
|
if self.page_action is not None:
|
227
|
-
|
249
|
+
try:
|
250
|
+
page = await self.page_action(page)
|
251
|
+
except Exception as e:
|
252
|
+
log.error(f"Error executing async page_action: {e}")
|
228
253
|
|
229
254
|
if self.wait_selector and type(self.wait_selector) is str:
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
255
|
+
try:
|
256
|
+
waiter = page.locator(self.wait_selector)
|
257
|
+
await waiter.first.wait_for(state=self.wait_selector_state)
|
258
|
+
# Wait again after waiting for the selector, helpful with protections like Cloudflare
|
259
|
+
await page.wait_for_load_state(state="load")
|
260
|
+
await page.wait_for_load_state(state="domcontentloaded")
|
261
|
+
if self.network_idle:
|
262
|
+
await page.wait_for_load_state('networkidle')
|
263
|
+
except Exception as e:
|
264
|
+
log.error(f"Error waiting for selector {self.wait_selector}: {e}")
|
237
265
|
|
238
266
|
# In case we didn't catch a document type somehow
|
239
267
|
final_response = final_response if final_response else first_response
|
268
|
+
if not final_response:
|
269
|
+
raise ValueError("Failed to get a response from the page")
|
270
|
+
|
240
271
|
# This will be parsed inside `Response`
|
241
272
|
encoding = final_response.headers.get('content-type', '') or 'utf-8' # default encoding
|
242
273
|
# The PlayWright API sometimes gives an empty status text for some reason!
|
243
274
|
status_text = final_response.status_text or StatusText.get(final_response.status)
|
244
275
|
|
245
|
-
history =
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
url=current_request.url,
|
252
|
-
# using current_response.text() will trigger "Error: Response.text: Response body is unavailable for redirect responses"
|
253
|
-
text='',
|
254
|
-
body=b'',
|
255
|
-
status=current_response.status if current_response else 301,
|
256
|
-
reason=(current_response.status_text or StatusText.get(current_response.status)) if current_response else StatusText.get(301),
|
257
|
-
encoding=current_response.headers.get('content-type', '') or 'utf-8',
|
258
|
-
cookies={},
|
259
|
-
headers=await current_response.all_headers() if current_response else {},
|
260
|
-
request_headers=await current_request.all_headers(),
|
261
|
-
**self.adaptor_arguments
|
262
|
-
))
|
263
|
-
current_request = current_request.redirected_from
|
276
|
+
history = self._process_response_history(first_response)
|
277
|
+
try:
|
278
|
+
page_content = await page.content()
|
279
|
+
except Exception as e:
|
280
|
+
log.error(f"Error getting page content in async: {e}")
|
281
|
+
page_content = ""
|
264
282
|
|
265
283
|
response = Response(
|
266
284
|
url=page.url,
|
267
|
-
text=
|
268
|
-
body=
|
285
|
+
text=page_content,
|
286
|
+
body=page_content.encode('utf-8'),
|
269
287
|
status=final_response.status,
|
270
288
|
reason=status_text,
|
271
289
|
encoding=encoding,
|
@@ -276,5 +294,6 @@ class CamoufoxEngine:
|
|
276
294
|
**self.adaptor_arguments
|
277
295
|
)
|
278
296
|
await page.close()
|
297
|
+
await context.close()
|
279
298
|
|
280
299
|
return response
|