symbolicai 0.21.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- symai/__init__.py +269 -173
- symai/backend/base.py +123 -110
- symai/backend/engines/drawing/engine_bfl.py +45 -44
- symai/backend/engines/drawing/engine_gpt_image.py +112 -97
- symai/backend/engines/embedding/engine_llama_cpp.py +63 -52
- symai/backend/engines/embedding/engine_openai.py +25 -21
- symai/backend/engines/execute/engine_python.py +19 -18
- symai/backend/engines/files/engine_io.py +104 -95
- symai/backend/engines/imagecaptioning/engine_blip2.py +28 -24
- symai/backend/engines/imagecaptioning/engine_llavacpp_client.py +102 -79
- symai/backend/engines/index/engine_pinecone.py +124 -97
- symai/backend/engines/index/engine_qdrant.py +1011 -0
- symai/backend/engines/index/engine_vectordb.py +84 -56
- symai/backend/engines/lean/engine_lean4.py +96 -52
- symai/backend/engines/neurosymbolic/__init__.py +41 -13
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_chat.py +330 -248
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_reasoning.py +329 -264
- symai/backend/engines/neurosymbolic/engine_cerebras.py +328 -0
- symai/backend/engines/neurosymbolic/engine_deepseekX_reasoning.py +118 -88
- symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +344 -299
- symai/backend/engines/neurosymbolic/engine_groq.py +173 -115
- symai/backend/engines/neurosymbolic/engine_huggingface.py +114 -84
- symai/backend/engines/neurosymbolic/engine_llama_cpp.py +144 -118
- symai/backend/engines/neurosymbolic/engine_openai_gptX_chat.py +415 -307
- symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py +394 -231
- symai/backend/engines/ocr/engine_apilayer.py +23 -27
- symai/backend/engines/output/engine_stdout.py +10 -13
- symai/backend/engines/{webscraping → scrape}/engine_requests.py +101 -54
- symai/backend/engines/search/engine_openai.py +100 -88
- symai/backend/engines/search/engine_parallel.py +665 -0
- symai/backend/engines/search/engine_perplexity.py +44 -45
- symai/backend/engines/search/engine_serpapi.py +37 -34
- symai/backend/engines/speech_to_text/engine_local_whisper.py +54 -51
- symai/backend/engines/symbolic/engine_wolframalpha.py +15 -9
- symai/backend/engines/text_to_speech/engine_openai.py +20 -26
- symai/backend/engines/text_vision/engine_clip.py +39 -37
- symai/backend/engines/userinput/engine_console.py +5 -6
- symai/backend/mixin/__init__.py +13 -0
- symai/backend/mixin/anthropic.py +48 -38
- symai/backend/mixin/deepseek.py +6 -5
- symai/backend/mixin/google.py +7 -4
- symai/backend/mixin/groq.py +2 -4
- symai/backend/mixin/openai.py +140 -110
- symai/backend/settings.py +87 -20
- symai/chat.py +216 -123
- symai/collect/__init__.py +7 -1
- symai/collect/dynamic.py +80 -70
- symai/collect/pipeline.py +67 -51
- symai/collect/stats.py +161 -109
- symai/components.py +707 -360
- symai/constraints.py +24 -12
- symai/core.py +1857 -1233
- symai/core_ext.py +83 -80
- symai/endpoints/api.py +166 -104
- symai/extended/.DS_Store +0 -0
- symai/extended/__init__.py +46 -12
- symai/extended/api_builder.py +29 -21
- symai/extended/arxiv_pdf_parser.py +23 -14
- symai/extended/bibtex_parser.py +9 -6
- symai/extended/conversation.py +156 -126
- symai/extended/document.py +50 -30
- symai/extended/file_merger.py +57 -14
- symai/extended/graph.py +51 -32
- symai/extended/html_style_template.py +18 -14
- symai/extended/interfaces/blip_2.py +2 -3
- symai/extended/interfaces/clip.py +4 -3
- symai/extended/interfaces/console.py +9 -1
- symai/extended/interfaces/dall_e.py +4 -2
- symai/extended/interfaces/file.py +2 -0
- symai/extended/interfaces/flux.py +4 -2
- symai/extended/interfaces/gpt_image.py +16 -7
- symai/extended/interfaces/input.py +2 -1
- symai/extended/interfaces/llava.py +1 -2
- symai/extended/interfaces/{naive_webscraping.py → naive_scrape.py} +4 -3
- symai/extended/interfaces/naive_vectordb.py +9 -10
- symai/extended/interfaces/ocr.py +5 -3
- symai/extended/interfaces/openai_search.py +2 -0
- symai/extended/interfaces/parallel.py +30 -0
- symai/extended/interfaces/perplexity.py +2 -0
- symai/extended/interfaces/pinecone.py +12 -9
- symai/extended/interfaces/python.py +2 -0
- symai/extended/interfaces/serpapi.py +3 -1
- symai/extended/interfaces/terminal.py +2 -4
- symai/extended/interfaces/tts.py +3 -2
- symai/extended/interfaces/whisper.py +3 -2
- symai/extended/interfaces/wolframalpha.py +2 -1
- symai/extended/metrics/__init__.py +11 -1
- symai/extended/metrics/similarity.py +14 -13
- symai/extended/os_command.py +39 -29
- symai/extended/packages/__init__.py +29 -3
- symai/extended/packages/symdev.py +51 -43
- symai/extended/packages/sympkg.py +41 -35
- symai/extended/packages/symrun.py +63 -50
- symai/extended/repo_cloner.py +14 -12
- symai/extended/seo_query_optimizer.py +15 -13
- symai/extended/solver.py +116 -91
- symai/extended/summarizer.py +12 -10
- symai/extended/taypan_interpreter.py +17 -18
- symai/extended/vectordb.py +122 -92
- symai/formatter/__init__.py +9 -1
- symai/formatter/formatter.py +51 -47
- symai/formatter/regex.py +70 -69
- symai/functional.py +325 -176
- symai/imports.py +190 -147
- symai/interfaces.py +57 -28
- symai/memory.py +45 -35
- symai/menu/screen.py +28 -19
- symai/misc/console.py +66 -56
- symai/misc/loader.py +8 -5
- symai/models/__init__.py +17 -1
- symai/models/base.py +395 -236
- symai/models/errors.py +1 -2
- symai/ops/__init__.py +32 -22
- symai/ops/measures.py +24 -25
- symai/ops/primitives.py +1149 -731
- symai/post_processors.py +58 -50
- symai/pre_processors.py +86 -82
- symai/processor.py +21 -13
- symai/prompts.py +764 -685
- symai/server/huggingface_server.py +135 -49
- symai/server/llama_cpp_server.py +21 -11
- symai/server/qdrant_server.py +206 -0
- symai/shell.py +100 -42
- symai/shellsv.py +700 -492
- symai/strategy.py +630 -346
- symai/symbol.py +368 -322
- symai/utils.py +100 -78
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/METADATA +22 -10
- symbolicai-1.1.0.dist-info/RECORD +168 -0
- symbolicai-0.21.0.dist-info/RECORD +0 -162
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/WHEEL +0 -0
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/entry_points.txt +0 -0
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,10 @@
|
|
|
1
|
-
import requests
|
|
2
1
|
from pathlib import Path
|
|
3
2
|
|
|
4
|
-
|
|
3
|
+
import requests
|
|
5
4
|
|
|
5
|
+
from ....symbol import Result
|
|
6
6
|
from ...base import Engine
|
|
7
7
|
from ...settings import SYMAI_CONFIG
|
|
8
|
-
from ....symbol import Result
|
|
9
8
|
|
|
10
9
|
|
|
11
10
|
class ApiLayerResult(Result):
|
|
@@ -14,56 +13,53 @@ class ApiLayerResult(Result):
|
|
|
14
13
|
self.raw = text
|
|
15
14
|
try:
|
|
16
15
|
dict_ = self._to_symbol(text).ast()
|
|
17
|
-
self._value = dict_
|
|
18
|
-
|
|
19
|
-
|
|
16
|
+
self._value = dict_.get(
|
|
17
|
+
"all_text", f"OCR Engine Error: {text} - status code {status_code}"
|
|
18
|
+
)
|
|
19
|
+
except Exception:
|
|
20
|
+
self._value = f"OCR Engine Error: {text} - status code {status_code}"
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
class OCREngine(Engine):
|
|
23
|
-
def __init__(self, api_key:
|
|
24
|
+
def __init__(self, api_key: str | None = None):
|
|
24
25
|
super().__init__()
|
|
25
26
|
# Opening JSON file
|
|
26
27
|
self.config = SYMAI_CONFIG
|
|
27
|
-
self.headers = {
|
|
28
|
-
"apikey": self.config['OCR_ENGINE_API_KEY'] if api_key is None else api_key
|
|
29
|
-
}
|
|
28
|
+
self.headers = {"apikey": self.config["OCR_ENGINE_API_KEY"] if api_key is None else api_key}
|
|
30
29
|
self.name = self.__class__.__name__
|
|
31
30
|
|
|
32
31
|
def id(self) -> str:
|
|
33
|
-
if self.config[
|
|
34
|
-
return
|
|
35
|
-
return super().id()
|
|
32
|
+
if self.config["OCR_ENGINE_API_KEY"]:
|
|
33
|
+
return "ocr"
|
|
34
|
+
return super().id() # default to unregistered
|
|
36
35
|
|
|
37
36
|
def command(self, *args, **kwargs):
|
|
38
37
|
super().command(*args, **kwargs)
|
|
39
|
-
if
|
|
40
|
-
self.headers = {
|
|
41
|
-
"apikey": kwargs['OCR_ENGINE_API_KEY']
|
|
42
|
-
}
|
|
38
|
+
if "OCR_ENGINE_API_KEY" in kwargs:
|
|
39
|
+
self.headers = {"apikey": kwargs["OCR_ENGINE_API_KEY"]}
|
|
43
40
|
|
|
44
41
|
def forward(self, argument):
|
|
45
|
-
kwargs = argument.kwargs
|
|
46
42
|
image_url = argument.prop.image
|
|
47
43
|
|
|
48
44
|
if image_url.startswith("file://"):
|
|
49
45
|
file_path = Path(image_url[7:]).resolve()
|
|
50
|
-
with open(
|
|
46
|
+
with file_path.open("rb") as file:
|
|
51
47
|
payload = file.read()
|
|
52
|
-
url
|
|
48
|
+
url = "https://api.apilayer.com/image_to_text/upload"
|
|
53
49
|
response = requests.request("POST", url, headers=self.headers, data=payload)
|
|
54
50
|
else:
|
|
55
|
-
payload
|
|
56
|
-
url
|
|
57
|
-
response = requests.request("GET", url, headers=self.headers, data
|
|
51
|
+
payload = {}
|
|
52
|
+
url = f"https://api.apilayer.com/image_to_text/url?url={image_url}"
|
|
53
|
+
response = requests.request("GET", url, headers=self.headers, data=payload)
|
|
58
54
|
|
|
59
55
|
status_code = response.status_code
|
|
60
|
-
rsp
|
|
61
|
-
rsp
|
|
62
|
-
metadata
|
|
56
|
+
rsp = response.text
|
|
57
|
+
rsp = ApiLayerResult(response.text, status_code)
|
|
58
|
+
metadata = {}
|
|
63
59
|
|
|
64
60
|
return [rsp], metadata
|
|
65
61
|
|
|
66
62
|
def prepare(self, argument):
|
|
67
63
|
assert not argument.prop.processed_input, "OCREngine does not support processed_input."
|
|
68
|
-
image
|
|
64
|
+
image = str(argument.prop.image)
|
|
69
65
|
argument.prop.prepared_input = image
|
|
@@ -7,28 +7,25 @@ class OutputEngine(Engine):
|
|
|
7
7
|
self.name = self.__class__.__name__
|
|
8
8
|
|
|
9
9
|
def id(self) -> str:
|
|
10
|
-
return
|
|
10
|
+
return "output"
|
|
11
11
|
|
|
12
12
|
def forward(self, argument):
|
|
13
|
-
expr, processed, args, kwargs
|
|
13
|
+
expr, processed, args, kwargs = argument.prop.prepared_input
|
|
14
14
|
res = None
|
|
15
15
|
args = [] if args is None else args
|
|
16
16
|
kwargs = {} if kwargs is None else kwargs
|
|
17
17
|
if expr:
|
|
18
|
-
if processed
|
|
19
|
-
res = expr(processed, *args, **kwargs)
|
|
20
|
-
else:
|
|
21
|
-
res = expr(*args, **kwargs)
|
|
18
|
+
res = expr(processed, *args, **kwargs) if processed else expr(*args, **kwargs)
|
|
22
19
|
|
|
23
20
|
metadata = {}
|
|
24
|
-
result
|
|
25
|
-
'result': res,
|
|
26
|
-
'processed': processed,
|
|
27
|
-
'args': args,
|
|
28
|
-
'kwargs': kwargs
|
|
29
|
-
}
|
|
21
|
+
result = {"result": res, "processed": processed, "args": args, "kwargs": kwargs}
|
|
30
22
|
|
|
31
23
|
return [result], metadata
|
|
32
24
|
|
|
33
25
|
def prepare(self, argument):
|
|
34
|
-
argument.prop.prepared_input =
|
|
26
|
+
argument.prop.prepared_input = (
|
|
27
|
+
argument.prop.expr,
|
|
28
|
+
argument.prop.processed_input,
|
|
29
|
+
argument.prop.args,
|
|
30
|
+
argument.prop.kwargs,
|
|
31
|
+
)
|
|
@@ -10,6 +10,7 @@ service disruption.
|
|
|
10
10
|
import io
|
|
11
11
|
import logging
|
|
12
12
|
import re
|
|
13
|
+
from typing import Any, ClassVar
|
|
13
14
|
from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
|
|
14
15
|
|
|
15
16
|
import requests
|
|
@@ -19,6 +20,7 @@ from pdfminer.high_level import extract_text
|
|
|
19
20
|
from requests.structures import CaseInsensitiveDict
|
|
20
21
|
|
|
21
22
|
from ....symbol import Result
|
|
23
|
+
from ....utils import UserMessage
|
|
22
24
|
from ...base import Engine
|
|
23
25
|
|
|
24
26
|
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
|
@@ -56,23 +58,23 @@ class RequestsEngine(Engine):
|
|
|
56
58
|
the requests session stay aligned.
|
|
57
59
|
"""
|
|
58
60
|
|
|
59
|
-
COMMON_BYPASS_COOKIES = {
|
|
61
|
+
COMMON_BYPASS_COOKIES: ClassVar[dict[str, str]] = {
|
|
60
62
|
# Some forums display consent or age gates once if a friendly cookie is set.
|
|
61
63
|
"cookieconsent_status": "allow",
|
|
62
64
|
"accepted_cookies": "yes",
|
|
63
65
|
"age_verified": "1",
|
|
64
66
|
}
|
|
65
67
|
|
|
66
|
-
DEFAULT_HEADERS = {
|
|
68
|
+
DEFAULT_HEADERS: ClassVar[dict[str, str]] = {
|
|
67
69
|
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
|
68
|
-
|
|
69
|
-
|
|
70
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
71
|
+
"Chrome/120.0.0.0 Safari/537.36",
|
|
70
72
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
71
73
|
"Accept-Language": "en-US,en;q=0.9",
|
|
72
74
|
"DNT": "1",
|
|
73
75
|
}
|
|
74
76
|
|
|
75
|
-
_SAMESITE_CANONICAL = {
|
|
77
|
+
_SAMESITE_CANONICAL: ClassVar[dict[str, str]] = {
|
|
76
78
|
"strict": "Strict",
|
|
77
79
|
"lax": "Lax",
|
|
78
80
|
"none": "None",
|
|
@@ -156,6 +158,58 @@ class RequestsEngine(Engine):
|
|
|
156
158
|
payload["sameSite"] = same_site
|
|
157
159
|
return payload
|
|
158
160
|
|
|
161
|
+
def _collect_playwright_cookies(self, hostname: str) -> list[dict[str, Any]]:
|
|
162
|
+
if not hostname:
|
|
163
|
+
return []
|
|
164
|
+
cookie_payload = []
|
|
165
|
+
for cookie in self.session.cookies:
|
|
166
|
+
payload = self._playwright_cookie_payload(cookie, hostname)
|
|
167
|
+
if payload:
|
|
168
|
+
cookie_payload.append(payload)
|
|
169
|
+
return cookie_payload
|
|
170
|
+
|
|
171
|
+
@staticmethod
|
|
172
|
+
def _add_cookies_to_context(context, cookie_payload: list[dict[str, Any]]) -> None:
|
|
173
|
+
if cookie_payload:
|
|
174
|
+
context.add_cookies(cookie_payload)
|
|
175
|
+
|
|
176
|
+
@staticmethod
|
|
177
|
+
def _navigate_playwright_page(
|
|
178
|
+
page, url: str, wait_selector: str | None, wait_until: str, timeout_ms: int, timeout_error
|
|
179
|
+
):
|
|
180
|
+
try:
|
|
181
|
+
response = page.goto(url, wait_until=wait_until, timeout=timeout_ms)
|
|
182
|
+
if wait_selector:
|
|
183
|
+
page.wait_for_selector(wait_selector, timeout=timeout_ms)
|
|
184
|
+
return response, None
|
|
185
|
+
except timeout_error as exc:
|
|
186
|
+
return None, exc
|
|
187
|
+
|
|
188
|
+
@staticmethod
|
|
189
|
+
def _safe_page_content(page) -> str:
|
|
190
|
+
try:
|
|
191
|
+
return page.content()
|
|
192
|
+
except Exception:
|
|
193
|
+
return ""
|
|
194
|
+
|
|
195
|
+
def _sync_cookies_from_context(self, context) -> None:
|
|
196
|
+
for cookie in context.cookies():
|
|
197
|
+
self.session.cookies.set(
|
|
198
|
+
cookie["name"],
|
|
199
|
+
cookie["value"],
|
|
200
|
+
domain=cookie.get("domain"),
|
|
201
|
+
path=cookie.get("path", "/"),
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
@staticmethod
|
|
205
|
+
def _rendered_response_metadata(page, response):
|
|
206
|
+
final_url = page.url
|
|
207
|
+
status = response.status if response is not None else 200
|
|
208
|
+
headers = CaseInsensitiveDict(response.headers if response is not None else {})
|
|
209
|
+
if "content-type" not in headers:
|
|
210
|
+
headers["Content-Type"] = "text/html; charset=utf-8"
|
|
211
|
+
return final_url, status, headers
|
|
212
|
+
|
|
159
213
|
def _follow_meta_refresh(self, resp, timeout=15):
|
|
160
214
|
"""
|
|
161
215
|
Some old forums use <meta http-equiv="refresh" content="0;url=...">
|
|
@@ -180,19 +234,28 @@ class RequestsEngine(Engine):
|
|
|
180
234
|
return resp
|
|
181
235
|
return self.session.get(target, timeout=timeout, allow_redirects=True)
|
|
182
236
|
|
|
183
|
-
def _fetch_with_playwright(
|
|
237
|
+
def _fetch_with_playwright(
|
|
238
|
+
self,
|
|
239
|
+
url: str,
|
|
240
|
+
wait_selector: str | None = None,
|
|
241
|
+
wait_until: str = "networkidle",
|
|
242
|
+
timeout: float | None = None,
|
|
243
|
+
):
|
|
184
244
|
"""
|
|
185
245
|
Render the target URL in a headless browser to execute JavaScript and
|
|
186
246
|
return a synthetic ``requests.Response`` object to keep downstream
|
|
187
247
|
processing consistent with the non-JS path.
|
|
188
248
|
"""
|
|
189
249
|
try:
|
|
190
|
-
|
|
250
|
+
# Playwright is optional; import only when JS rendering is requested.
|
|
251
|
+
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError # noqa
|
|
252
|
+
from playwright.sync_api import sync_playwright # noqa
|
|
253
|
+
|
|
191
254
|
logging.getLogger("playwright").setLevel(logging.WARNING)
|
|
192
255
|
except ImportError as exc:
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
) from exc
|
|
256
|
+
msg = "Playwright is not installed. Install symbolicai[scrape] with Playwright extras to enable render_js."
|
|
257
|
+
UserMessage(msg)
|
|
258
|
+
raise RuntimeError(msg) from exc
|
|
196
259
|
|
|
197
260
|
timeout_seconds = timeout if timeout is not None else self.timeout
|
|
198
261
|
timeout_ms = max(int(timeout_seconds * 1000), 0)
|
|
@@ -200,12 +263,7 @@ class RequestsEngine(Engine):
|
|
|
200
263
|
|
|
201
264
|
parsed = urlparse(url)
|
|
202
265
|
hostname = parsed.hostname or ""
|
|
203
|
-
cookie_payload =
|
|
204
|
-
if hostname:
|
|
205
|
-
for cookie in self.session.cookies:
|
|
206
|
-
payload = self._playwright_cookie_payload(cookie, hostname)
|
|
207
|
-
if payload:
|
|
208
|
-
cookie_payload.append(payload)
|
|
266
|
+
cookie_payload = self._collect_playwright_cookies(hostname)
|
|
209
267
|
|
|
210
268
|
content = ""
|
|
211
269
|
final_url = url
|
|
@@ -219,42 +277,26 @@ class RequestsEngine(Engine):
|
|
|
219
277
|
java_script_enabled=True,
|
|
220
278
|
ignore_https_errors=not self.verify_ssl,
|
|
221
279
|
)
|
|
222
|
-
if cookie_payload:
|
|
223
|
-
context.add_cookies(cookie_payload)
|
|
224
|
-
page = context.new_page()
|
|
225
|
-
|
|
226
|
-
navigation_error = None
|
|
227
|
-
response = None
|
|
228
280
|
try:
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
cookie["name"],
|
|
245
|
-
cookie["value"],
|
|
246
|
-
domain=cookie.get("domain"),
|
|
247
|
-
path=cookie.get("path", "/"),
|
|
248
|
-
)
|
|
249
|
-
|
|
250
|
-
final_url = page.url
|
|
251
|
-
status = response.status if response is not None else 200
|
|
252
|
-
headers = CaseInsensitiveDict(response.headers if response is not None else {})
|
|
253
|
-
if "content-type" not in headers:
|
|
254
|
-
headers["Content-Type"] = "text/html; charset=utf-8"
|
|
255
|
-
|
|
281
|
+
self._add_cookies_to_context(context, cookie_payload)
|
|
282
|
+
page = context.new_page()
|
|
283
|
+
|
|
284
|
+
response, navigation_error = self._navigate_playwright_page(
|
|
285
|
+
page,
|
|
286
|
+
url,
|
|
287
|
+
wait_selector,
|
|
288
|
+
wait_until,
|
|
289
|
+
timeout_ms,
|
|
290
|
+
PlaywrightTimeoutError,
|
|
291
|
+
)
|
|
292
|
+
content = self._safe_page_content(page)
|
|
293
|
+
self._sync_cookies_from_context(context)
|
|
294
|
+
|
|
295
|
+
final_url, status, headers = self._rendered_response_metadata(page, response)
|
|
256
296
|
if navigation_error and not content:
|
|
257
|
-
|
|
297
|
+
msg = f"Playwright timed out while rendering {url}"
|
|
298
|
+
UserMessage(msg)
|
|
299
|
+
raise requests.exceptions.Timeout(msg) from navigation_error
|
|
258
300
|
finally:
|
|
259
301
|
context.close()
|
|
260
302
|
browser.close()
|
|
@@ -268,7 +310,7 @@ class RequestsEngine(Engine):
|
|
|
268
310
|
return rendered_response
|
|
269
311
|
|
|
270
312
|
def id(self) -> str:
|
|
271
|
-
return
|
|
313
|
+
return "scrape"
|
|
272
314
|
|
|
273
315
|
def forward(self, argument):
|
|
274
316
|
"""
|
|
@@ -284,8 +326,11 @@ class RequestsEngine(Engine):
|
|
|
284
326
|
self._maybe_set_bypass_cookies(url)
|
|
285
327
|
|
|
286
328
|
parsed = urlparse(url)
|
|
287
|
-
qs = [
|
|
288
|
-
|
|
329
|
+
qs = [
|
|
330
|
+
(k, v)
|
|
331
|
+
for k, v in parse_qsl(parsed.query, keep_blank_values=True)
|
|
332
|
+
if k.lower() not in {"utm_source", "utm_medium", "utm_campaign"}
|
|
333
|
+
]
|
|
289
334
|
clean_url = urlunparse(parsed._replace(query=urlencode(qs)))
|
|
290
335
|
|
|
291
336
|
render_js = kwargs.get("render_js")
|
|
@@ -302,7 +347,9 @@ class RequestsEngine(Engine):
|
|
|
302
347
|
timeout=render_timeout,
|
|
303
348
|
)
|
|
304
349
|
else:
|
|
305
|
-
resp = self.session.get(
|
|
350
|
+
resp = self.session.get(
|
|
351
|
+
clean_url, timeout=self.timeout, allow_redirects=True, verify=self.verify_ssl
|
|
352
|
+
)
|
|
306
353
|
resp.raise_for_status()
|
|
307
354
|
|
|
308
355
|
# Follow a legacy meta refresh once (do AFTER normal HTTP redirects)
|