symbolicai-1.0.0-py3-none-any.whl → symbolicai-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- symai/__init__.py +198 -134
- symai/backend/base.py +51 -51
- symai/backend/engines/drawing/engine_bfl.py +33 -33
- symai/backend/engines/drawing/engine_gpt_image.py +4 -10
- symai/backend/engines/embedding/engine_llama_cpp.py +50 -35
- symai/backend/engines/embedding/engine_openai.py +22 -16
- symai/backend/engines/execute/engine_python.py +16 -16
- symai/backend/engines/files/engine_io.py +51 -49
- symai/backend/engines/imagecaptioning/engine_blip2.py +27 -23
- symai/backend/engines/imagecaptioning/engine_llavacpp_client.py +53 -46
- symai/backend/engines/index/engine_pinecone.py +116 -88
- symai/backend/engines/index/engine_qdrant.py +1011 -0
- symai/backend/engines/index/engine_vectordb.py +78 -52
- symai/backend/engines/lean/engine_lean4.py +65 -25
- symai/backend/engines/neurosymbolic/__init__.py +28 -28
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_chat.py +137 -135
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_reasoning.py +145 -152
- symai/backend/engines/neurosymbolic/engine_cerebras.py +328 -0
- symai/backend/engines/neurosymbolic/engine_deepseekX_reasoning.py +75 -49
- symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +199 -155
- symai/backend/engines/neurosymbolic/engine_groq.py +106 -72
- symai/backend/engines/neurosymbolic/engine_huggingface.py +100 -67
- symai/backend/engines/neurosymbolic/engine_llama_cpp.py +121 -93
- symai/backend/engines/neurosymbolic/engine_openai_gptX_chat.py +213 -132
- symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py +180 -137
- symai/backend/engines/ocr/engine_apilayer.py +18 -20
- symai/backend/engines/output/engine_stdout.py +9 -9
- symai/backend/engines/{webscraping → scrape}/engine_requests.py +25 -11
- symai/backend/engines/search/engine_openai.py +95 -83
- symai/backend/engines/search/engine_parallel.py +665 -0
- symai/backend/engines/search/engine_perplexity.py +40 -41
- symai/backend/engines/search/engine_serpapi.py +33 -28
- symai/backend/engines/speech_to_text/engine_local_whisper.py +37 -27
- symai/backend/engines/symbolic/engine_wolframalpha.py +14 -8
- symai/backend/engines/text_to_speech/engine_openai.py +15 -19
- symai/backend/engines/text_vision/engine_clip.py +34 -28
- symai/backend/engines/userinput/engine_console.py +3 -4
- symai/backend/mixin/anthropic.py +48 -40
- symai/backend/mixin/deepseek.py +4 -5
- symai/backend/mixin/google.py +5 -4
- symai/backend/mixin/groq.py +2 -4
- symai/backend/mixin/openai.py +132 -110
- symai/backend/settings.py +14 -14
- symai/chat.py +164 -94
- symai/collect/dynamic.py +13 -11
- symai/collect/pipeline.py +39 -31
- symai/collect/stats.py +109 -69
- symai/components.py +556 -238
- symai/constraints.py +14 -5
- symai/core.py +1495 -1210
- symai/core_ext.py +55 -50
- symai/endpoints/api.py +113 -58
- symai/extended/api_builder.py +22 -17
- symai/extended/arxiv_pdf_parser.py +13 -5
- symai/extended/bibtex_parser.py +8 -4
- symai/extended/conversation.py +88 -69
- symai/extended/document.py +40 -27
- symai/extended/file_merger.py +45 -7
- symai/extended/graph.py +38 -24
- symai/extended/html_style_template.py +17 -11
- symai/extended/interfaces/blip_2.py +1 -1
- symai/extended/interfaces/clip.py +4 -2
- symai/extended/interfaces/console.py +5 -3
- symai/extended/interfaces/dall_e.py +3 -1
- symai/extended/interfaces/file.py +2 -0
- symai/extended/interfaces/flux.py +3 -1
- symai/extended/interfaces/gpt_image.py +15 -6
- symai/extended/interfaces/input.py +2 -1
- symai/extended/interfaces/llava.py +1 -1
- symai/extended/interfaces/{naive_webscraping.py → naive_scrape.py} +3 -2
- symai/extended/interfaces/naive_vectordb.py +2 -2
- symai/extended/interfaces/ocr.py +4 -2
- symai/extended/interfaces/openai_search.py +2 -0
- symai/extended/interfaces/parallel.py +30 -0
- symai/extended/interfaces/perplexity.py +2 -0
- symai/extended/interfaces/pinecone.py +6 -4
- symai/extended/interfaces/python.py +2 -0
- symai/extended/interfaces/serpapi.py +2 -0
- symai/extended/interfaces/terminal.py +0 -1
- symai/extended/interfaces/tts.py +2 -1
- symai/extended/interfaces/whisper.py +2 -1
- symai/extended/interfaces/wolframalpha.py +1 -0
- symai/extended/metrics/__init__.py +1 -1
- symai/extended/metrics/similarity.py +5 -2
- symai/extended/os_command.py +31 -22
- symai/extended/packages/symdev.py +39 -34
- symai/extended/packages/sympkg.py +30 -27
- symai/extended/packages/symrun.py +46 -35
- symai/extended/repo_cloner.py +10 -9
- symai/extended/seo_query_optimizer.py +15 -12
- symai/extended/solver.py +104 -76
- symai/extended/summarizer.py +8 -7
- symai/extended/taypan_interpreter.py +10 -9
- symai/extended/vectordb.py +28 -15
- symai/formatter/formatter.py +39 -31
- symai/formatter/regex.py +46 -44
- symai/functional.py +184 -86
- symai/imports.py +85 -51
- symai/interfaces.py +1 -1
- symai/memory.py +33 -24
- symai/menu/screen.py +28 -19
- symai/misc/console.py +27 -27
- symai/misc/loader.py +4 -3
- symai/models/base.py +147 -76
- symai/models/errors.py +1 -1
- symai/ops/__init__.py +1 -1
- symai/ops/measures.py +17 -14
- symai/ops/primitives.py +933 -635
- symai/post_processors.py +28 -24
- symai/pre_processors.py +58 -52
- symai/processor.py +15 -9
- symai/prompts.py +714 -649
- symai/server/huggingface_server.py +115 -32
- symai/server/llama_cpp_server.py +14 -6
- symai/server/qdrant_server.py +206 -0
- symai/shell.py +98 -39
- symai/shellsv.py +307 -223
- symai/strategy.py +135 -81
- symai/symbol.py +276 -225
- symai/utils.py +62 -46
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/METADATA +19 -9
- symbolicai-1.1.0.dist-info/RECORD +168 -0
- symbolicai-1.0.0.dist-info/RECORD +0 -163
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/WHEEL +0 -0
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/entry_points.txt +0 -0
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,665 @@
+import json
+import logging
+import re
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Any
+from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+
+from ....symbol import Result
+from ....utils import UserMessage
+from ...base import Engine
+from ...settings import SYMAI_CONFIG
+
+logging.getLogger("requests").setLevel(logging.ERROR)
+logging.getLogger("urllib3").setLevel(logging.ERROR)
+logging.getLogger("httpx").setLevel(logging.ERROR)
+logging.getLogger("httpcore").setLevel(logging.ERROR)
+
+try:
+    from parallel import Parallel
+    from parallel.resources.task_run import build_task_spec_param
+
+    logging.getLogger("parallel").setLevel(logging.ERROR)
+except ImportError as exc:
+    msg = (
+        "parallel-web SDK is not installed. Install with 'pip install parallel-web' "
+        "or add it to your environment."
+    )
+    UserMessage(msg)
+    raise RuntimeError(msg) from exc
+
+
+TRACKING_KEYS = {
+    "utm_source",
+    "utm_medium",
+    "utm_campaign",
+    "utm_term",
+    "utm_content",
+}
+
+
+def _item_to_mapping(item: Any) -> dict[str, Any]:
+    if isinstance(item, dict):
+        return item
+    if hasattr(item, "model_dump"):
+        try:
+            return dict(item.model_dump())
+        except TypeError:
+            return dict(item.model_dump(mode="python"))
+    if hasattr(item, "dict"):
+        return dict(item.dict())
+    if hasattr(item, "__dict__"):
+        return deepcopy({k: v for k, v in item.__dict__.items() if not k.startswith("_")})
+    return {}
+
+
+@dataclass
+class Citation:
+    id: int
+    title: str
+    url: str
+    start: int
+    end: int
+
+    def __hash__(self):
+        return hash((self.url,))
+
+
+class SearchResult(Result):
+    def __init__(self, value: dict[str, Any] | Any, **kwargs) -> None:
+        super().__init__(value, **kwargs)
+        if isinstance(value, dict) and value.get("error"):
+            UserMessage(value["error"], raise_with=ValueError)
+        self._citations: list[Citation] = []
+        try:
+            results = self._coerce_results(value)
+            text, citations = self._build_text_and_citations(results)
+            self._value = text
+            self._citations = citations
+        except Exception as e:
+            self._value = None
+            UserMessage(f"Failed to parse Parallel search response: {e}", raise_with=ValueError)
+
+    def _coerce_results(self, raw: Any) -> list[dict[str, Any]]:
+        if raw is None:
+            return []
+        results = raw.get("results", []) if isinstance(raw, dict) else getattr(raw, "results", None)
+        if not results:
+            return []
+        coerced: list[dict[str, Any]] = []
+        for item in results:
+            if item is None:
+                continue
+            coerced.append(_item_to_mapping(item))
+        return coerced
+
+    def _normalize_url(self, url: str) -> str:
+        parts = urlsplit(url)
+        scheme = parts.scheme.lower() if parts.scheme else "https"
+        netloc = parts.netloc.lower()
+        path = parts.path.rstrip("/") or "/"
+        filtered_query = [
+            (k, v)
+            for k, v in parse_qsl(parts.query, keep_blank_values=True)
+            if k not in TRACKING_KEYS and not k.lower().startswith("utm_")
+        ]
+        query = urlencode(filtered_query, doseq=True)
+        return urlunsplit((scheme, netloc, path, query, ""))
+
+    def _strip_markdown_links(self, text: str) -> str:
+        # Matches Markdown links like "[label](https://example.com "title")" and captures only the label.
+        pattern = re.compile(
+            r"\[(?P<label>[^\]]+)\]\((?P<url>https?://[^)\s]+)(?:\s+\"[^\"]*\")?\)"
+        )
+
+        def _replacement(match: re.Match) -> str:
+            label = match.group("label") or ""
+            return label.strip()
+
+        cleaned = pattern.sub(_replacement, text)
+        # Remove lingering empty parentheses that previously wrapped the stripped links.
+        cleaned = re.sub(r"\(\s*\)", "", cleaned)
+        # Remove parentheses that contain only commas or whitespace remnants.
+        return re.sub(r"\(\s*(,\s*)+\)", "", cleaned)
+
+    def _strip_square_brackets(self, text: str) -> str:
+        def _replacement(match: re.Match) -> str:
+            return match.group(1) or ""
+
+        # Replace bracketed fragments with their inner text so literal '[' or ']' do not leak into the output.
+        return re.sub(r"\[([^\]]*)\]", _replacement, text).replace("[", "").replace("]", "")
+
+    def _sanitize_excerpt(self, text: str) -> str:
+        cleaned = self._strip_markdown_links(text)
+        cleaned = self._strip_square_brackets(cleaned)
+        # Collapse consecutive spaces/tabs down to a single space for readability.
+        cleaned = re.sub(r"[ \t]{2,}", " ", cleaned)
+        # Shrink runs of three or more blank lines to a double newline spacer.
+        cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
+        return cleaned.strip()
+
+    def _build_text_and_citations(self, results: list[dict[str, Any]]):
+        pieces: list[str] = []
+        citations: list[Citation] = []
+        cursor = 0
+        seen_urls: set[str] = set()
+        cid = 1
+        separator = "\n\n---\n\n"
+
+        for item in results:
+            url = str(item.get("url") or "")
+            if not url:
+                continue
+            normalized_url = self._normalize_url(url)
+            if normalized_url in seen_urls:
+                continue
+            seen_urls.add(normalized_url)
+
+            title = str(item.get("title") or "") or urlsplit(normalized_url).netloc
+            excerpts = item.get("excerpts") or []
+            excerpt_parts: list[str] = []
+            for ex in excerpts:
+                if not isinstance(ex, str):
+                    continue
+                sanitized = self._sanitize_excerpt(ex)
+                if sanitized:
+                    excerpt_parts.append(sanitized)
+            if not excerpt_parts:
+                continue
+
+            combined_excerpt = "\n\n".join(excerpt_parts)
+            source_id = self._coerce_source_identifier(
+                item, url=normalized_url, fallback=f"source-{cid}"
+            )
+            block_body = combined_excerpt
+            if source_id:
+                block_body = f"{source_id}\n\n{combined_excerpt}"
+
+            if pieces:
+                pieces.append(separator)
+                cursor += len(separator)
+
+            opening_tag = "<source>\n"
+            pieces.append(opening_tag)
+            cursor += len(opening_tag)
+
+            pieces.append(block_body)
+            cursor += len(block_body)
+
+            closing_tag = "\n</source>"
+            pieces.append(closing_tag)
+            cursor += len(closing_tag)
+
+            marker = f"[{cid}]"
+            start = cursor
+            pieces.append(marker)
+            cursor += len(marker)
+
+            citations.append(
+                Citation(id=cid, title=title, url=normalized_url, start=start, end=cursor)
+            )
+            cid += 1
+
+        text = "".join(pieces)
+        return text, citations
+
+    def _coerce_source_identifier(self, item: dict[str, Any], *, url: str, fallback: str) -> str:
+        for key in ("source_id", "sourceId", "sourceID", "id"):
+            candidate = self._sanitize_source_identifier(item.get(key))
+            if candidate:
+                return candidate
+
+        split_url = urlsplit(url)
+        derived = split_url.netloc or split_url.path or url
+        candidate = self._sanitize_source_identifier(derived)
+        if candidate:
+            return candidate
+        return fallback
+
+    def _sanitize_source_identifier(self, raw: Any) -> str:
+        if raw is None:
+            return ""
+        text = str(raw).strip()
+        if not text:
+            return ""
+        # Replace any character outside [A-Za-z0-9._:-] with hyphens so IDs are safe for tag embedding.
+        sanitized = re.sub(r"[^A-Za-z0-9._:-]+", "-", text)
+        sanitized = sanitized.strip("-")
+        return sanitized or ""
+
+    def __str__(self) -> str:
+        if isinstance(self._value, str) and self._value:
+            return self._value
+        try:
+            return json.dumps(self.raw, indent=2)
+        except TypeError:
+            return str(self.raw)
+
+    def _repr_html_(self) -> str:
+        if isinstance(self._value, str) and self._value:
+            return f"<pre>{self._value}</pre>"
+        try:
+            return f"<pre>{json.dumps(self.raw, indent=2)}</pre>"
+        except Exception:
+            return f"<pre>{self.raw!s}</pre>"
+
+    def get_citations(self) -> list[Citation]:
+        return self._citations
+
+
+class ExtractResult(Result):
+    """Result wrapper for Parallel Extract API responses."""
+
+    def __init__(self, value: dict[str, Any] | Any, **kwargs) -> None:
+        super().__init__(value, **kwargs)
+        try:
+            results = self._coerce_results(value)
+            content_parts: list[str] = []
+            for r in results:
+                excerpts = r.get("excerpts") or []
+                full = r.get("full_content")
+                if isinstance(full, str):
+                    content_parts.append(full)
+                elif full is not None:
+                    content_parts.append(str(full))
+                elif excerpts:
+                    content_parts.extend([s for s in excerpts if isinstance(s, str)])
+            self._value = "\n\n".join(content_parts)
+        except Exception as e:
+            self._value = None
+            UserMessage(f"Failed to parse Parallel extract response: {e}", raise_with=ValueError)
+
+    def _coerce_results(self, raw: Any) -> list[dict[str, Any]]:
+        if raw is None:
+            return []
+        results = raw.get("results", []) if isinstance(raw, dict) else getattr(raw, "results", None)
+        if not results:
+            return []
+        coerced: list[dict[str, Any]] = []
+        for item in results:
+            if item is None:
+                continue
+            coerced.append(_item_to_mapping(item))
+        return coerced
+
+    def __str__(self) -> str:
+        try:
+            return str(self._value or "")
+        except Exception:
+            return ""
+
+    def _repr_html_(self) -> str:
+        try:
+            return f"<pre>{self._value or ''}</pre>"
+        except Exception:
+            return "<pre></pre>"
+
+
+class ParallelEngine(Engine):
+    MAX_INCLUDE_DOMAINS = 10
+
+    def __init__(self, api_key: str | None = None):
+        super().__init__()
+        self.config = deepcopy(SYMAI_CONFIG)
+        self.api_key = api_key or self.config.get("SEARCH_ENGINE_API_KEY")
+        self.model = self.config.get("SEARCH_ENGINE_MODEL")
+        self.name = self.__class__.__name__
+
+        try:
+            self.client = Parallel(api_key=self.api_key)
+        except Exception as e:
+            UserMessage(f"Failed to initialize Parallel client: {e}", raise_with=ValueError)
+
+    def id(self) -> str:
+        # Register as a search engine when configured with the 'parallel' model token
+        if (
+            self.config.get("SEARCH_ENGINE_API_KEY")
+            and str(self.config.get("SEARCH_ENGINE_MODEL", "")).lower() == "parallel"
+        ):
+            return "search"
+        return super().id()
+
+    def command(self, *args, **kwargs):
+        super().command(*args, **kwargs)
+        if "SEARCH_ENGINE_API_KEY" in kwargs:
+            self.api_key = kwargs["SEARCH_ENGINE_API_KEY"]
+        if "SEARCH_ENGINE_MODEL" in kwargs:
+            self.model = kwargs["SEARCH_ENGINE_MODEL"]
+
+    def _extract_netloc(self, raw: str | None) -> str | None:
+        if not isinstance(raw, str):
+            return None
+        s = raw.strip()
+        if not s:
+            return None
+        parts = urlsplit(s if "://" in s else f"//{s}")
+        netloc = parts.netloc or parts.path
+        netloc = netloc.split("@", 1)[-1]
+        netloc = netloc.split(":", 1)[0]
+        netloc = netloc.strip(".").strip().lower()
+        return netloc or None
+
+    def _normalize_include_domains(self, domains: list[str] | None) -> list[str]:
+        if not isinstance(domains, list):
+            return []
+        seen: set[str] = set()
+        out: list[str] = []
+        for d in domains:
+            netloc = self._extract_netloc(d)
+            if not netloc or netloc in seen:
+                continue
+            if not self._is_valid_domain(netloc):
+                # Skip strings that are not apex domains or bare TLD patterns
+                continue
+            seen.add(netloc)
+            out.append(netloc)
+            if len(out) >= self.MAX_INCLUDE_DOMAINS:
+                break
+        return out
+
+    def _coerce_search_queries(self, value: Any) -> list[str]:
+        if value is None:
+            return []
+        if isinstance(value, str):
+            text = value.strip()
+            return [text] if text else []
+        if isinstance(value, list):
+            cleaned: list[str] = []
+            for item in value:
+                if item is None:
+                    continue
+                text = str(item).strip()
+                if text:
+                    cleaned.append(text)
+            return cleaned
+        text = str(value).strip()
+        return [text] if text else []
+
+    def _is_valid_domain(self, s: str) -> bool:
+        """Validate apex domains or bare extension filters.
+
+        Accepts:
+        - Apex/sub domains like "example.com", "www.arstechnica.com"
+        - Bare extension patterns like ".gov", ".co.uk"
+        Rejects:
+        - Values without a dot (e.g., "tomshardware")
+        - Schemes, paths, or ports (filtered earlier by _extract_netloc)
+        """
+        if not s:
+            return False
+        if s.startswith("."):
+            # Allow bare domain extensions like .gov or .co.uk
+            remainder = s[1:]
+            return bool((remainder and "." in remainder) or remainder.isalpha())
+        # Require at least one dot and valid label characters
+        # Matches a single DNS label: 1-63 chars, alphanumeric at both ends, hyphens allowed internally.
+        label_re = re.compile(r"^[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?$")
+        parts = s.split(".")
+        if len(parts) < 2:
+            return False
+        return all(label_re.fullmatch(p or "") for p in parts)
+
+    def _search(self, queries: list[str], kwargs: dict[str, Any]):
+        if not queries:
+            UserMessage(
+                "ParallelEngine._search requires at least one query.", raise_with=ValueError
+            )
+
+        mode = kwargs.get("mode") or "one-shot"
+        max_results = kwargs.get("max_results", 10)
+        max_chars_per_result = kwargs.get("max_chars_per_result", 15000)
+        excerpts = {"max_chars_per_result": max_chars_per_result}
+        include = self._normalize_include_domains(kwargs.get("allowed_domains"))
+        source_policy = {"include_domains": include} if include else None
+        objective = kwargs.get("objective")
+
+        try:
+            result = self.client.beta.search(
+                objective=objective,
+                search_queries=queries,
+                max_results=max_results,
+                excerpts=excerpts,
+                mode=mode,
+                source_policy=source_policy,
+            )
+        except Exception as e:
+            UserMessage(f"Failed to call Parallel Search API: {e}", raise_with=ValueError)
+        return [SearchResult(result)], {"raw_output": result}
+
+    def _task(self, queries: list[str], kwargs: dict[str, Any]):
+        processor_name = self._coerce_processor(kwargs.get("processor"))
+        task_input = self._compose_task_input(queries)
+
+        include = self._normalize_include_domains(kwargs.get("allowed_domains"))
+        source_policy = {"include_domains": include} if include else None
+        metadata = self._coerce_metadata(kwargs.get("metadata"))
+
+        output_schema = (
+            kwargs.get("task_output_schema")
+            or kwargs.get("task_output")
+            or kwargs.get("output_schema")
+            or kwargs.get("output")
+        )
+        task_spec_param = self._build_task_spec(output_schema, task_input)
+        timeout, api_timeout = self._collect_task_timeouts(kwargs)
+
+        run = self._create_task_run(
+            task_input=task_input,
+            processor=processor_name,
+            metadata=metadata,
+            source_policy=source_policy,
+            task_spec=task_spec_param,
+        )
+        result = self._fetch_task_result(run.run_id, timeout=timeout, api_timeout=api_timeout)
+
+        payload = self._task_result_to_search_payload(result)
+        return [SearchResult(payload)], {
+            "raw_output": result,
+            "task_output": payload.get("task_output"),
+            "task_output_type": payload.get("task_output_type"),
+        }
+
+    def _coerce_processor(self, processor: Any) -> str:
+        if processor is None:
+            UserMessage("ParallelEngine.task requires a processor.", raise_with=ValueError)
+        value = processor.strip() if isinstance(processor, str) else str(processor).strip()
+        if not value:
+            UserMessage(
+                "ParallelEngine.task requires a non-empty processor.", raise_with=ValueError
+            )
+        return value
+
+    def _compose_task_input(self, queries: list[str]) -> str:
+        if not queries:
+            UserMessage(
+                "ParallelEngine.task requires at least one query input.", raise_with=ValueError
+            )
+        if len(queries) == 1:
+            return queries[0]
+        return "\n\n".join(f"{idx}. {q}" for idx, q in enumerate(queries, start=1))
+
+    def _coerce_metadata(self, metadata: Any) -> dict[str, Any] | None:
+        if metadata is None or isinstance(metadata, dict):
+            return metadata
+        return None
+
+    def _build_task_spec(self, output_schema: Any, task_input: str):
+        if output_schema is None:
+            return None
+        try:
+            return build_task_spec_param(output_schema, task_input)
+        except Exception as exc:
+            UserMessage(f"Invalid task output schema: {exc}", raise_with=ValueError)
+
+    def _collect_task_timeouts(self, kwargs: dict[str, Any]) -> tuple[Any, int | None]:
+        timeout = kwargs.get("task_timeout") or kwargs.get("timeout")
+        api_timeout = kwargs.get("task_api_timeout") or kwargs.get("api_timeout")
+        if api_timeout is None:
+            return timeout, None
+        try:
+            return timeout, int(api_timeout)
+        except (TypeError, ValueError) as exc:
+            UserMessage(f"api_timeout must be numeric: {exc}", raise_with=ValueError)
+
+    def _create_task_run(
+        self,
+        *,
+        task_input: str,
+        processor: str,
+        metadata: dict[str, Any] | None,
+        source_policy: dict[str, Any] | None,
+        task_spec: Any,
+    ):
+        task_kwargs: dict[str, Any] = {
+            "input": task_input,
+            "processor": processor,
+        }
+        if metadata is not None:
+            task_kwargs["metadata"] = metadata
+        if source_policy is not None:
+            task_kwargs["source_policy"] = source_policy
+        if task_spec is not None:
+            task_kwargs["task_spec"] = task_spec
+
+        try:
+            return self.client.task_run.create(**task_kwargs)
+        except Exception as e:
+            UserMessage(f"Failed to create Parallel task: {e}", raise_with=ValueError)
+
+    def _fetch_task_result(self, run_id: str, *, timeout: Any, api_timeout: int | None):
+        result_kwargs: dict[str, Any] = {}
+        if api_timeout is not None:
+            result_kwargs["api_timeout"] = api_timeout
+        if timeout is not None:
+            result_kwargs["timeout"] = timeout
+        try:
+            return self.client.task_run.result(run_id, **result_kwargs)
+        except Exception as e:
+            UserMessage(f"Failed to fetch Parallel task result: {e}", raise_with=ValueError)
+
+    def _task_result_to_search_payload(self, task_result: Any) -> dict[str, Any]:
+        payload: dict[str, Any] = {"results": []}
+        output = getattr(task_result, "output", None)
+        if output is None:
+            return payload
+
+        basis_items = getattr(output, "basis", None) or []
+        for idx, basis in enumerate(basis_items):
+            payload["results"].extend(self._basis_to_results(basis, basis_index=idx))
+
+        if not payload["results"]:
+            payload["results"].append(self._task_fallback_result(output, basis_items))
+
+        payload["task_output"] = getattr(output, "content", None)
+        payload["task_output_type"] = getattr(output, "type", None)
+        return payload
+
+    def _basis_to_results(self, basis: Any, *, basis_index: int) -> list[dict[str, Any]]:
+        raw_reasoning = getattr(basis, "reasoning", "") or ""
+        reasoning = raw_reasoning if isinstance(raw_reasoning, str) else str(raw_reasoning)
+        raw_field = getattr(basis, "field", "") or ""
+        field_title = raw_field if isinstance(raw_field, str) else str(raw_field)
+        if not field_title.strip():
+            field_title = "Parallel Task Output"
+        citations = getattr(basis, "citations", None) or []
+        if not citations:
+            if not reasoning:
+                return []
+            citations = [None]
+
+        results: list[dict[str, Any]] = []
+        # Convert field titles to lowercase slugs by swapping non-alphanumerics for hyphens.
+        slug = re.sub(r"[^a-z0-9]+", "-", field_title.lower()).strip("-") or "field"
+        basis_url = f"parallel://task-output/{basis_index:04d}-{slug}"
+        for citation in citations:
+            if citation is None:
+                url = basis_url
+                title = field_title
+                excerpts = [reasoning]
+            else:
+                url = str(getattr(citation, "url", "") or "")
+                title = str(getattr(citation, "title", "") or field_title)
+                raw_excerpts = getattr(citation, "excerpts", None) or []
+                excerpts = [snippet for snippet in raw_excerpts if isinstance(snippet, str)]
+                if not excerpts and reasoning:
+                    excerpts = [reasoning]
+            results.append(
+                {
+                    "url": url or basis_url,
+                    "title": title or field_title,
+                    "excerpts": excerpts or ([reasoning] if reasoning else []),
+                }
+            )
+        return results
+
+    def _task_fallback_result(self, output: Any, basis_items: list[Any]) -> dict[str, Any]:
+        content = getattr(output, "content", None)
+        if isinstance(content, str):
+            snippet = content
+        elif isinstance(content, (dict, list)):
+            snippet = json.dumps(content, ensure_ascii=False)
+        else:
+            snippet = str(content or "")
+        if not snippet:
+            extra_reasoning: list[str] = []
+            for basis in basis_items:
+                raw_value = getattr(basis, "reasoning", "") or ""
+                if isinstance(raw_value, str):
+                    extra_reasoning.append(raw_value)
+                else:
+                    extra_reasoning.append(str(raw_value))
+            snippet = " ".join(r for r in extra_reasoning if r) or "Parallel task output"
+        return {
+            "url": "parallel://task-output",
+            "title": "Parallel Task Output",
+            "excerpts": [snippet],
+        }
+
+    def _extract(self, url: str, kwargs: dict[str, Any]):
+        excerpts = kwargs.get("excerpts", True)
+        full_content = kwargs.get("full_content", False)
+        objective = kwargs.get("objective")
+        try:
+            result = self.client.beta.extract(
+                urls=[url],
+                objective=objective,
+                excerpts=excerpts,
+                full_content=full_content,
+            )
+        except Exception as e:
+            UserMessage(f"Failed to call Parallel Extract API: {e}", raise_with=ValueError)
+        return [ExtractResult(result)], {"raw_output": result, "final_url": url}
+
+    def forward(self, argument):
+        kwargs = argument.kwargs
+        # Route based on presence of URL vs Query
+        url = getattr(argument.prop, "url", None) or kwargs.get("url")
+        if url:
+            return self._extract(str(url), kwargs)
+
+        raw_query = getattr(argument.prop, "prepared_input", None)
+        if raw_query is None:
+            raw_query = getattr(argument.prop, "query", None)
+        search_queries = self._coerce_search_queries(raw_query)
+        if not search_queries:
+            UserMessage(
+                "ParallelEngine.forward requires at least one non-empty query or url.",
+                raise_with=ValueError,
+            )
+        processor = kwargs.get("processor")
+        if processor is not None:
+            return self._task(search_queries, kwargs)
+        return self._search(search_queries, kwargs)
+
+    def prepare(self, argument):
+        # For scraping: store URL directly. For search: pass through query string.
+        url = argument.kwargs.get("url") or getattr(argument.prop, "url", None)
+        if url:
+            argument.prop.prepared_input = str(url)
+            return
+        query = getattr(argument.prop, "query", None)
+        if isinstance(query, list):
+            argument.prop.prepared_input = self._coerce_search_queries(query)
+            return
+        argument.prop.prepared_input = str(query or "").strip()
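
For orientation, a minimal usage sketch of the new engine follows; it is not part of the diff. It assumes that the new symai/extended/interfaces/parallel.py module registers the engine under the interface name "parallel", that symai's Interface loader forwards keyword arguments through to ParallelEngine.prepare()/forward(), that SYMAI_CONFIG is set with SEARCH_ENGINE_MODEL="parallel" and a valid SEARCH_ENGINE_API_KEY, and that "base" is an available Parallel Task API processor name; none of these assumptions are verified against the release.

# Usage sketch (not part of the diff); see the assumptions stated above.
from symai.interfaces import Interface

parallel = Interface("parallel")

# Query-only call: forward() routes this to _search() on the Parallel Search API.
# allowed_domains is de-duplicated, validated, and capped at MAX_INCLUDE_DOMAINS
# by _normalize_include_domains().
res = parallel(
    "state of neuro-symbolic programming",
    max_results=5,
    allowed_domains=["arxiv.org", ".edu"],
)
print(str(res))             # "<source>...</source>" blocks with [n] citation markers
print(res.get_citations())  # Citation objects, if the SearchResult is returned as-is

# Supplying a processor routes the same call to _task(), which creates a task run
# via task_run.create() and collects the output with task_run.result().
report = parallel("Compare Qdrant and Pinecone for RAG workloads", processor="base")

# Passing url= routes to _extract(), returning page content as an ExtractResult.
page = parallel(url="https://example.com/article", full_content=True)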