symbolicai-1.0.0-py3-none-any.whl → symbolicai-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- symai/__init__.py +198 -134
- symai/backend/base.py +51 -51
- symai/backend/engines/drawing/engine_bfl.py +33 -33
- symai/backend/engines/drawing/engine_gpt_image.py +4 -10
- symai/backend/engines/embedding/engine_llama_cpp.py +50 -35
- symai/backend/engines/embedding/engine_openai.py +22 -16
- symai/backend/engines/execute/engine_python.py +16 -16
- symai/backend/engines/files/engine_io.py +51 -49
- symai/backend/engines/imagecaptioning/engine_blip2.py +27 -23
- symai/backend/engines/imagecaptioning/engine_llavacpp_client.py +53 -46
- symai/backend/engines/index/engine_pinecone.py +116 -88
- symai/backend/engines/index/engine_qdrant.py +1011 -0
- symai/backend/engines/index/engine_vectordb.py +78 -52
- symai/backend/engines/lean/engine_lean4.py +65 -25
- symai/backend/engines/neurosymbolic/__init__.py +28 -28
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_chat.py +137 -135
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_reasoning.py +145 -152
- symai/backend/engines/neurosymbolic/engine_cerebras.py +328 -0
- symai/backend/engines/neurosymbolic/engine_deepseekX_reasoning.py +75 -49
- symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +199 -155
- symai/backend/engines/neurosymbolic/engine_groq.py +106 -72
- symai/backend/engines/neurosymbolic/engine_huggingface.py +100 -67
- symai/backend/engines/neurosymbolic/engine_llama_cpp.py +121 -93
- symai/backend/engines/neurosymbolic/engine_openai_gptX_chat.py +213 -132
- symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py +180 -137
- symai/backend/engines/ocr/engine_apilayer.py +18 -20
- symai/backend/engines/output/engine_stdout.py +9 -9
- symai/backend/engines/{webscraping → scrape}/engine_requests.py +25 -11
- symai/backend/engines/search/engine_openai.py +95 -83
- symai/backend/engines/search/engine_parallel.py +665 -0
- symai/backend/engines/search/engine_perplexity.py +40 -41
- symai/backend/engines/search/engine_serpapi.py +33 -28
- symai/backend/engines/speech_to_text/engine_local_whisper.py +37 -27
- symai/backend/engines/symbolic/engine_wolframalpha.py +14 -8
- symai/backend/engines/text_to_speech/engine_openai.py +15 -19
- symai/backend/engines/text_vision/engine_clip.py +34 -28
- symai/backend/engines/userinput/engine_console.py +3 -4
- symai/backend/mixin/anthropic.py +48 -40
- symai/backend/mixin/deepseek.py +4 -5
- symai/backend/mixin/google.py +5 -4
- symai/backend/mixin/groq.py +2 -4
- symai/backend/mixin/openai.py +132 -110
- symai/backend/settings.py +14 -14
- symai/chat.py +164 -94
- symai/collect/dynamic.py +13 -11
- symai/collect/pipeline.py +39 -31
- symai/collect/stats.py +109 -69
- symai/components.py +556 -238
- symai/constraints.py +14 -5
- symai/core.py +1495 -1210
- symai/core_ext.py +55 -50
- symai/endpoints/api.py +113 -58
- symai/extended/api_builder.py +22 -17
- symai/extended/arxiv_pdf_parser.py +13 -5
- symai/extended/bibtex_parser.py +8 -4
- symai/extended/conversation.py +88 -69
- symai/extended/document.py +40 -27
- symai/extended/file_merger.py +45 -7
- symai/extended/graph.py +38 -24
- symai/extended/html_style_template.py +17 -11
- symai/extended/interfaces/blip_2.py +1 -1
- symai/extended/interfaces/clip.py +4 -2
- symai/extended/interfaces/console.py +5 -3
- symai/extended/interfaces/dall_e.py +3 -1
- symai/extended/interfaces/file.py +2 -0
- symai/extended/interfaces/flux.py +3 -1
- symai/extended/interfaces/gpt_image.py +15 -6
- symai/extended/interfaces/input.py +2 -1
- symai/extended/interfaces/llava.py +1 -1
- symai/extended/interfaces/{naive_webscraping.py → naive_scrape.py} +3 -2
- symai/extended/interfaces/naive_vectordb.py +2 -2
- symai/extended/interfaces/ocr.py +4 -2
- symai/extended/interfaces/openai_search.py +2 -0
- symai/extended/interfaces/parallel.py +30 -0
- symai/extended/interfaces/perplexity.py +2 -0
- symai/extended/interfaces/pinecone.py +6 -4
- symai/extended/interfaces/python.py +2 -0
- symai/extended/interfaces/serpapi.py +2 -0
- symai/extended/interfaces/terminal.py +0 -1
- symai/extended/interfaces/tts.py +2 -1
- symai/extended/interfaces/whisper.py +2 -1
- symai/extended/interfaces/wolframalpha.py +1 -0
- symai/extended/metrics/__init__.py +1 -1
- symai/extended/metrics/similarity.py +5 -2
- symai/extended/os_command.py +31 -22
- symai/extended/packages/symdev.py +39 -34
- symai/extended/packages/sympkg.py +30 -27
- symai/extended/packages/symrun.py +46 -35
- symai/extended/repo_cloner.py +10 -9
- symai/extended/seo_query_optimizer.py +15 -12
- symai/extended/solver.py +104 -76
- symai/extended/summarizer.py +8 -7
- symai/extended/taypan_interpreter.py +10 -9
- symai/extended/vectordb.py +28 -15
- symai/formatter/formatter.py +39 -31
- symai/formatter/regex.py +46 -44
- symai/functional.py +184 -86
- symai/imports.py +85 -51
- symai/interfaces.py +1 -1
- symai/memory.py +33 -24
- symai/menu/screen.py +28 -19
- symai/misc/console.py +27 -27
- symai/misc/loader.py +4 -3
- symai/models/base.py +147 -76
- symai/models/errors.py +1 -1
- symai/ops/__init__.py +1 -1
- symai/ops/measures.py +17 -14
- symai/ops/primitives.py +933 -635
- symai/post_processors.py +28 -24
- symai/pre_processors.py +58 -52
- symai/processor.py +15 -9
- symai/prompts.py +714 -649
- symai/server/huggingface_server.py +115 -32
- symai/server/llama_cpp_server.py +14 -6
- symai/server/qdrant_server.py +206 -0
- symai/shell.py +98 -39
- symai/shellsv.py +307 -223
- symai/strategy.py +135 -81
- symai/symbol.py +276 -225
- symai/utils.py +62 -46
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/METADATA +19 -9
- symbolicai-1.1.0.dist-info/RECORD +168 -0
- symbolicai-1.0.0.dist-info/RECORD +0 -163
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/WHEEL +0 -0
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/entry_points.txt +0 -0
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/top_level.txt +0 -0
symai/backend/engines/ocr/engine_apilayer.py

```diff
@@ -13,9 +13,11 @@ class ApiLayerResult(Result):
         self.raw = text
         try:
             dict_ = self._to_symbol(text).ast()
-            self._value = dict_.get(
+            self._value = dict_.get(
+                "all_text", f"OCR Engine Error: {text} - status code {status_code}"
+            )
         except Exception:
-            self._value = f
+            self._value = f"OCR Engine Error: {text} - status code {status_code}"
 
 
 class OCREngine(Engine):
@@ -23,22 +25,18 @@ class OCREngine(Engine):
         super().__init__()
         # Opening JSON file
         self.config = SYMAI_CONFIG
-        self.headers = {
-            "apikey": self.config['OCR_ENGINE_API_KEY'] if api_key is None else api_key
-        }
+        self.headers = {"apikey": self.config["OCR_ENGINE_API_KEY"] if api_key is None else api_key}
         self.name = self.__class__.__name__
 
     def id(self) -> str:
-        if self.config[
-            return
-        return super().id()
+        if self.config["OCR_ENGINE_API_KEY"]:
+            return "ocr"
+        return super().id()  # default to unregistered
 
     def command(self, *args, **kwargs):
         super().command(*args, **kwargs)
-        if
-            self.headers = {
-                "apikey": kwargs['OCR_ENGINE_API_KEY']
-            }
+        if "OCR_ENGINE_API_KEY" in kwargs:
+            self.headers = {"apikey": kwargs["OCR_ENGINE_API_KEY"]}
 
     def forward(self, argument):
         image_url = argument.prop.image
@@ -47,21 +45,21 @@ class OCREngine(Engine):
             file_path = Path(image_url[7:]).resolve()
             with file_path.open("rb") as file:
                 payload = file.read()
-            url
+            url = "https://api.apilayer.com/image_to_text/upload"
             response = requests.request("POST", url, headers=self.headers, data=payload)
         else:
-            payload
-            url
-            response = requests.request("GET", url, headers=self.headers, data
+            payload = {}
+            url = f"https://api.apilayer.com/image_to_text/url?url={image_url}"
+            response = requests.request("GET", url, headers=self.headers, data=payload)
 
         status_code = response.status_code
-        rsp
-        rsp
-        metadata
+        rsp = response.text
+        rsp = ApiLayerResult(response.text, status_code)
+        metadata = {}
 
         return [rsp], metadata
 
     def prepare(self, argument):
         assert not argument.prop.processed_input, "OCREngine does not support processed_input."
-        image
+        image = str(argument.prop.image)
         argument.prop.prepared_input = image
```
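Both branches above use the same apilayer call pattern: a POST of the raw bytes for `file://` inputs, a GET with the image URL otherwise, authenticated via an `apikey` header. A minimal standalone sketch of that pattern (the endpoint and header come from the diff; the key is a placeholder):

```python
from pathlib import Path

import requests

API_KEY = "YOUR_APILAYER_KEY"  # placeholder
HEADERS = {"apikey": API_KEY}


def ocr(image_url: str) -> str:
    if image_url.startswith("file://"):
        # Local file: upload the raw bytes.
        payload = Path(image_url[7:]).resolve().read_bytes()
        url = "https://api.apilayer.com/image_to_text/upload"
        response = requests.request("POST", url, headers=HEADERS, data=payload)
    else:
        # Remote image: pass the URL as a query parameter.
        url = f"https://api.apilayer.com/image_to_text/url?url={image_url}"
        response = requests.request("GET", url, headers=HEADERS, data={})
    return response.text  # JSON string; "all_text" holds the recognized text
```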
symai/backend/engines/output/engine_stdout.py

```diff
@@ -7,10 +7,10 @@ class OutputEngine(Engine):
         self.name = self.__class__.__name__
 
     def id(self) -> str:
-        return
+        return "output"
 
     def forward(self, argument):
-        expr, processed, args, kwargs
+        expr, processed, args, kwargs = argument.prop.prepared_input
         res = None
         args = [] if args is None else args
         kwargs = {} if kwargs is None else kwargs
@@ -18,14 +18,14 @@ class OutputEngine(Engine):
             res = expr(processed, *args, **kwargs) if processed else expr(*args, **kwargs)
 
         metadata = {}
-        result
-            'result': res,
-            'processed': processed,
-            'args': args,
-            'kwargs': kwargs
-        }
+        result = {"result": res, "processed": processed, "args": args, "kwargs": kwargs}
 
         return [result], metadata
 
     def prepare(self, argument):
-        argument.prop.prepared_input =
+        argument.prop.prepared_input = (
+            argument.prop.expr,
+            argument.prop.processed_input,
+            argument.prop.args,
+            argument.prop.kwargs,
+        )
```
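For reference, the dispatch that `forward` performs on the prepared tuple reduces to the pattern below; this is a self-contained sketch with a toy callback, not part of the engine API:

```python
def dispatch(expr, processed, args=None, kwargs=None):
    # Mirrors the forward logic in the hunk: call the expression with the
    # processed input when one exists, otherwise with the plain args/kwargs.
    args = [] if args is None else args
    kwargs = {} if kwargs is None else kwargs
    res = expr(processed, *args, **kwargs) if processed else expr(*args, **kwargs)
    return {"result": res, "processed": processed, "args": args, "kwargs": kwargs}


# Example: a callback that just echoes what it receives.
print(dispatch(lambda *a, **kw: (a, kw), processed="hello", args=[1], kwargs={"x": 2}))
```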
symai/backend/engines/scrape/engine_requests.py

```diff
@@ -67,8 +67,8 @@ class RequestsEngine(Engine):
 
     DEFAULT_HEADERS: ClassVar[dict[str, str]] = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
-
-
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/120.0.0.0 Safari/537.36",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
         "Accept-Language": "en-US,en;q=0.9",
         "DNT": "1",
@@ -174,7 +174,9 @@ class RequestsEngine(Engine):
         context.add_cookies(cookie_payload)
 
     @staticmethod
-    def _navigate_playwright_page(
+    def _navigate_playwright_page(
+        page, url: str, wait_selector: str | None, wait_until: str, timeout_ms: int, timeout_error
+    ):
         try:
             response = page.goto(url, wait_until=wait_until, timeout=timeout_ms)
             if wait_selector:
@@ -232,7 +234,13 @@ class RequestsEngine(Engine):
                 return resp
         return self.session.get(target, timeout=timeout, allow_redirects=True)
 
-    def _fetch_with_playwright(
+    def _fetch_with_playwright(
+        self,
+        url: str,
+        wait_selector: str | None = None,
+        wait_until: str = "networkidle",
+        timeout: float | None = None,
+    ):
         """
         Render the target URL in a headless browser to execute JavaScript and
         return a synthetic ``requests.Response`` object to keep downstream
@@ -240,11 +248,12 @@ class RequestsEngine(Engine):
         """
         try:
             # Playwright is optional; import only when JS rendering is requested.
-            from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
-            from playwright.sync_api import sync_playwright
+            from playwright.sync_api import TimeoutError as PlaywrightTimeoutError  # noqa
+            from playwright.sync_api import sync_playwright  # noqa
+
             logging.getLogger("playwright").setLevel(logging.WARNING)
         except ImportError as exc:
-            msg = "Playwright is not installed. Install symbolicai[
+            msg = "Playwright is not installed. Install symbolicai[scrape] with Playwright extras to enable render_js."
            UserMessage(msg)
             raise RuntimeError(msg) from exc
 
```
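The JS-rendering path added here is only partially visible in the hunks; a minimal, self-contained version of what `_navigate_playwright_page`/`_fetch_with_playwright` do is sketched below, assuming Playwright is installed (`pip install playwright && playwright install chromium`). The function name and defaults are illustrative, not the engine's API:

```python
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright


def render_html(url: str, wait_selector: str | None = None,
                wait_until: str = "networkidle", timeout_ms: int = 30_000) -> str:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        try:
            page.goto(url, wait_until=wait_until, timeout=timeout_ms)
            if wait_selector:
                page.wait_for_selector(wait_selector, timeout=timeout_ms)
            return page.content()  # fully rendered HTML
        except PlaywrightTimeoutError:
            return page.content()  # fall back to whatever rendered before the timeout
        finally:
            browser.close()
```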
symai/backend/engines/scrape/engine_requests.py (continued)

```diff
@@ -301,7 +310,7 @@ class RequestsEngine(Engine):
         return rendered_response
 
     def id(self) -> str:
-        return
+        return "scrape"
 
     def forward(self, argument):
         """
@@ -317,8 +326,11 @@ class RequestsEngine(Engine):
         self._maybe_set_bypass_cookies(url)
 
         parsed = urlparse(url)
-        qs = [
-
+        qs = [
+            (k, v)
+            for k, v in parse_qsl(parsed.query, keep_blank_values=True)
+            if k.lower() not in {"utm_source", "utm_medium", "utm_campaign"}
+        ]
         clean_url = urlunparse(parsed._replace(query=urlencode(qs)))
 
         render_js = kwargs.get("render_js")
@@ -335,7 +347,9 @@ class RequestsEngine(Engine):
                 timeout=render_timeout,
             )
         else:
-            resp = self.session.get(
+            resp = self.session.get(
+                clean_url, timeout=self.timeout, allow_redirects=True, verify=self.verify_ssl
+            )
         resp.raise_for_status()
 
         # Follow a legacy meta refresh once (do AFTER normal HTTP redirects)
```
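The query clean-up in the hunk above is plain standard-library work; the same idea in isolation (dropping the common `utm_*` tracking parameters before fetching) looks like this:

```python
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse


def strip_tracking_params(url: str) -> str:
    # Drop common tracking parameters while keeping everything else intact.
    parsed = urlparse(url)
    qs = [
        (k, v)
        for k, v in parse_qsl(parsed.query, keep_blank_values=True)
        if k.lower() not in {"utm_source", "utm_medium", "utm_campaign"}
    ]
    return urlunparse(parsed._replace(query=urlencode(qs)))


print(strip_tracking_params("https://example.com/a?x=1&utm_source=mail&utm_campaign=spring"))
# -> https://example.com/a?x=1
```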
symai/backend/engines/search/engine_openai.py

```diff
@@ -21,9 +21,10 @@ logging.getLogger("httpcore").setLevel(logging.ERROR)
 
 
 TRACKING_KEYS = {
-    "utm_source"
+    "utm_source"  # so far I've only seen this one
 }
 
+
 @dataclass
 class Citation:
     id: int
@@ -33,14 +34,14 @@ class Citation:
     end: int
 
     def __hash__(self):
-        return hash((self.url,
+        return hash((self.url,))
 
 
 class SearchResult(Result):
     def __init__(self, value, **kwargs) -> None:
         super().__init__(value, **kwargs)
-        if value.get(
-            UserMessage(value[
+        if value.get("error"):
+            UserMessage(value["error"], raise_with=ValueError)
         try:
             text, annotations = self._extract_text_and_annotations(value)
             if text is None:
@@ -50,7 +51,9 @@ class SearchResult(Result):
             replaced_text, ordered, starts_ends = self._insert_citation_markers(text, annotations)
             self._value = replaced_text
             self._citations = [
-                Citation(
+                Citation(
+                    id=cid, title=title, url=url, start=starts_ends[cid][0], end=starts_ends[cid][1]
+                )
                 for cid, title, url in ordered
             ]
 
@@ -59,70 +62,72 @@ class SearchResult(Result):
             UserMessage(f"Failed to parse response: {e}", raise_with=ValueError)
 
     def _extract_text(self, value) -> str | None:
-        if isinstance(value.get(
-            return value.get(
+        if isinstance(value.get("output_text"), str) and value.get("output_text"):
+            return value.get("output_text")
         text = None
-        for output in value.get(
-            if output.get(
-                content0 = output[
-                if content0.get(
-                    text = content0[
+        for output in value.get("output", []):
+            if output.get("type") == "message" and output.get("content"):
+                content0 = output["content"][0]
+                if content0.get("text"):
+                    text = content0["text"]
         return text
 
     def _extract_text_and_annotations(self, value):
         segments = []
         global_annotations = []
         pos = 0
-        for output in value.get(
-            if output.get(
+        for output in value.get("output", []) or []:
+            if output.get("type") != "message" or not output.get("content"):
                 continue
-            for content in output.get(
-                seg_text = content.get(
+            for content in output.get("content", []) or []:
+                seg_text = content.get("text") or ""
                 if not isinstance(seg_text, str):
                     continue
-                for ann in
-                    if ann.get(
-                        start = ann.get(
-                        end = ann.get(
-                        global_annotations.append(
-
-
-
-
-
-
+                for ann in content.get("annotations") or []:
+                    if ann.get("type") == "url_citation" and ann.get("url"):
+                        start = ann.get("start_index", 0)
+                        end = ann.get("end_index", 0)
+                        global_annotations.append(
+                            {
+                                "type": "url_citation",
+                                "url": ann.get("url"),
+                                "title": (ann.get("title") or "").strip(),
+                                "start_index": pos + int(start),
+                                "end_index": pos + int(end),
+                            }
+                        )
                 segments.append(seg_text)
                 pos += len(seg_text)
 
-        built_text =
+        built_text = "".join(segments) if segments else None
         # Prefer top-level output_text if present AND segments are empty (no way to compute indices)
-        if not built_text and isinstance(value.get(
-            return value.get(
+        if not built_text and isinstance(value.get("output_text"), str):
+            return value.get("output_text"), []
         return built_text, global_annotations
 
     def _normalize_url(self, u: str) -> str:
         parts = urlsplit(u)
         scheme = parts.scheme.lower()
         netloc = parts.netloc.lower()
-        path = parts.path.rstrip(
+        path = parts.path.rstrip("/") or "/"
         q = []
         for k, v in parse_qsl(parts.query, keep_blank_values=True):
             kl = k.lower()
-            if kl in TRACKING_KEYS or kl.startswith(
+            if kl in TRACKING_KEYS or kl.startswith("utm_"):
                 continue
             q.append((k, v))
         query = urlencode(q, doseq=True)
-        fragment =
+        fragment = ""
         return urlunsplit((scheme, netloc, path, query, fragment))
 
     def _make_title_map(self, annotations):
         m = {}
         for a in annotations or []:
-            url = a.get(
+            url = a.get("url")
             if not url:
                 continue
             nu = self._normalize_url(url)
-            title = (a.get(
+            title = (a.get("title") or "").strip()
             if nu not in m and title:
                 m[nu] = title
         return m
@@ -131,7 +136,7 @@ class SearchResult(Result):
         return urlsplit(u).netloc
 
     def _short_hash_id(self, nu: str, length=6) -> str:
-        return hashlib.sha1(nu.encode(
+        return hashlib.sha1(nu.encode("utf-8")).hexdigest()[:length]
 
     def _insert_citation_markers(self, text: str, annotations):
         title_map = self._make_title_map(annotations)
@@ -140,8 +145,10 @@ class SearchResult(Result):
         ordered: list[tuple[int, str, str]] = []  # (id, title, normalized_url)
         next_id = 1
 
-        url_anns = [
-
+        url_anns = [
+            a for a in annotations or [] if a.get("type") == "url_citation" and a.get("url")
+        ]
+        url_anns.sort(key=lambda a: int(a.get("start_index", 0)))
 
         pieces: list[str] = []
         cursor = 0
@@ -158,11 +165,11 @@ class SearchResult(Result):
             return id_map[nu]
 
         for ann in url_anns:
-            start = int(ann.get(
-            end = int(ann.get(
+            start = int(ann.get("start_index", 0))
+            end = int(ann.get("end_index", 0))
             if end <= cursor:
                 continue  # skip overlapping or backwards spans
-            url = ann.get(
+            url = ann.get("url")
             nu = self._normalize_url(url)
             cid = _get_id(nu)
             title = title_map.get(nu) or self._hostname(nu)
@@ -189,7 +196,7 @@ class SearchResult(Result):
 
         tail_clean = self._strip_markdown_links(text[cursor:])
         pieces.append(tail_clean)
-        replaced =
+        replaced = "".join(pieces)
 
         starts_ends = {cid: first_span.get(cid, (0, 0)) for cid, _, _ in ordered}
         return replaced, ordered, starts_ends
@@ -197,15 +204,15 @@ class SearchResult(Result):
     def _strip_markdown_links(self, text: str) -> str:
         # Remove ([text](http...)) including surrounding parentheses
         pattern_paren = re.compile(r"\(\s*\[[^\]]+\]\(https?://[^)]+\)\s*\)")
-        text = pattern_paren.sub(
+        text = pattern_paren.sub("", text)
         # Remove bare [text](http...)
         pattern_bare = re.compile(r"\[[^\]]+\]\(https?://[^)]+\)")
-        text = pattern_bare.sub(
+        text = pattern_bare.sub("", text)
         # Remove parentheses that became empty or contain only commas/whitespace like (, , )
         pattern_empty_paren = re.compile(r"\(\s*\)")
-        text = pattern_empty_paren.sub(
+        text = pattern_empty_paren.sub("", text)
         pattern_commas_only = re.compile(r"\(\s*(,\s*)+\)")
-        text = pattern_commas_only.sub(
+        text = pattern_commas_only.sub("", text)
         # Collapse potential double spaces resulting from removals
         return re.sub(r"\s{2,}", " ", text).strip()
 
```
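The `_strip_markdown_links` regexes are shown in full in the context lines above, so they can be exercised on their own; a quick check of their behavior:

```python
import re


def strip_markdown_links(text: str) -> str:
    # Same patterns as SearchResult._strip_markdown_links in the diff above.
    text = re.sub(r"\(\s*\[[^\]]+\]\(https?://[^)]+\)\s*\)", "", text)  # ([label](url))
    text = re.sub(r"\[[^\]]+\]\(https?://[^)]+\)", "", text)            # [label](url)
    text = re.sub(r"\(\s*\)", "", text)                                 # leftover empty ()
    text = re.sub(r"\(\s*(,\s*)+\)", "", text)                          # leftover (, , )
    return re.sub(r"\s{2,}", " ", text).strip()


print(strip_markdown_links("Rust 1.80 released ([announcement](https://blog.rust-lang.org/x)) today."))
# -> "Rust 1.80 released today."
```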
symai/backend/engines/search/engine_openai.py (continued)

```diff
@@ -236,10 +243,12 @@ class GPTXSearchEngine(Engine):
         super().__init__()
         self.config = deepcopy(SYMAI_CONFIG)
         if api_key is not None and model is not None:
-            self.config[
-            self.config[
-        self.api_key = self.config.get(
-        self.model = self.config.get(
+            self.config["SEARCH_ENGINE_API_KEY"] = api_key
+            self.config["SEARCH_ENGINE_MODEL"] = model
+        self.api_key = self.config.get("SEARCH_ENGINE_API_KEY")
+        self.model = self.config.get(
+            "SEARCH_ENGINE_MODEL", "gpt-4.1"
+        )  # Default to gpt-4.1 as per docs
         self.name = self.__class__.__name__
         try:
             self.client = OpenAI(api_key=self.api_key)
@@ -247,9 +256,12 @@ class GPTXSearchEngine(Engine):
             UserMessage(f"Failed to initialize OpenAI client: {e}", raise_with=ValueError)
 
     def id(self) -> str:
-        if
-            self.config.get(
-
+        if (
+            self.config.get("SEARCH_ENGINE_API_KEY")
+            and self.config.get("SEARCH_ENGINE_MODEL")
+            in OPENAI_CHAT_MODELS + OPENAI_REASONING_MODELS
+        ):
+            return "search"
         return super().id()  # default to unregistered
 
     def _extract_netloc(self, raw_domain: str | None) -> str | None:
@@ -258,15 +270,15 @@ class GPTXSearchEngine(Engine):
         candidate = raw_domain.strip()
         if not candidate:
             return None
-        parsed = urlsplit(candidate if
+        parsed = urlsplit(candidate if "://" in candidate else f"//{candidate}")
         netloc = parsed.netloc or parsed.path
         if not netloc:
             return None
-        if
-            netloc = netloc.split(
-        if
-            netloc = netloc.split(
-        netloc = netloc.strip(
+        if "@" in netloc:
+            netloc = netloc.split("@", 1)[1]
+        if ":" in netloc:
+            netloc = netloc.split(":", 1)[0]
+        netloc = netloc.strip(".").strip()
         if not netloc:
             return None
         return netloc.lower()
@@ -313,38 +325,40 @@ class GPTXSearchEngine(Engine):
 
     def command(self, *args, **kwargs):
         super().command(*args, **kwargs)
-        if
-            self.api_key = kwargs[
-        if
-            self.model = kwargs[
+        if "SEARCH_ENGINE_API_KEY" in kwargs:
+            self.api_key = kwargs["SEARCH_ENGINE_API_KEY"]
+        if "SEARCH_ENGINE_MODEL" in kwargs:
+            self.model = kwargs["SEARCH_ENGINE_MODEL"]
 
     def forward(self, argument):
         messages = argument.prop.prepared_input
         kwargs = argument.kwargs
 
         tool_definition = {"type": "web_search"}
-        user_location = kwargs.get(
+        user_location = kwargs.get("user_location")
         if user_location:
-            tool_definition[
+            tool_definition["user_location"] = user_location
 
-        allowed_domains = self._normalize_allowed_domains(kwargs.get(
+        allowed_domains = self._normalize_allowed_domains(kwargs.get("allowed_domains"))
         if allowed_domains:
-            tool_definition[
-                'allowed_domains': allowed_domains
-            }
+            tool_definition["filters"] = {"allowed_domains": allowed_domains}
 
-        self.model = kwargs.get(
+        self.model = kwargs.get(
+            "model", self.model
+        )  # Important for MetadataTracker to work correctly
 
         payload = {
             "model": self.model,
             "input": messages,
             "tools": [tool_definition],
-            "tool_choice": {"type": "web_search"}
+            "tool_choice": {"type": "web_search"}
+            if self.model not in OPENAI_REASONING_MODELS
+            else "auto",  # force the use of web search tool for non-reasoning models
         }
 
         if self.model in OPENAI_REASONING_MODELS:
-            reasoning = kwargs.get(
-            payload[
+            reasoning = kwargs.get("reasoning", {"effort": "low", "summary": "auto"})
+            payload["reasoning"] = reasoning
 
         try:
             res = self.client.responses.create(**payload)
@@ -353,21 +367,19 @@ class GPTXSearchEngine(Engine):
             UserMessage(f"Failed to make request: {e}", raise_with=ValueError)
 
         metadata = {"raw_output": res.raw}
-        output
+        output = [res]
 
         return output, metadata
 
     def prepare(self, argument):
-        system_message =
+        system_message = (
+            "You are a helpful AI assistant. Be precise and informative."
+            if argument.kwargs.get("system_message") is None
+            else argument.kwargs.get("system_message")
+        )
 
         res = [
-            {
-
-                "content": system_message
-            },
-            {
-                "role": "user",
-                "content": f"{argument.prop.query}"
-            }
+            {"role": "system", "content": system_message},
+            {"role": "user", "content": f"{argument.prop.query}"},
         ]
         argument.prop.prepared_input = res
```
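Stripped of the engine plumbing, the request that `forward` builds is a single Responses API call with the web_search tool. A minimal sketch that mirrors the payload assembled above (the model name and messages are placeholders, and `OPENAI_API_KEY` is assumed to be set in the environment):

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

payload = {
    "model": "gpt-4.1",
    "input": [
        {"role": "system", "content": "You are a helpful AI assistant. Be precise and informative."},
        {"role": "user", "content": "What changed in symbolicai 1.1.0?"},
    ],
    "tools": [{"type": "web_search"}],
    "tool_choice": {"type": "web_search"},  # force the tool, as the engine does for non-reasoning models
}

res = client.responses.create(**payload)
print(res.output_text)  # aggregated answer text; url_citation annotations live in res.output
```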