symbolicai 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- symai/__init__.py +1 -1
- symai/backend/engines/scrape/engine_requests.py +39 -10
- symai/backend/engines/search/__init__.py +13 -0
- symai/backend/engines/search/engine_firecrawl.py +333 -0
- symai/backend/engines/search/engine_parallel.py +5 -5
- symai/components.py +9 -3
- symai/extended/interfaces/firecrawl.py +30 -0
- symai/extended/interfaces/parallel.py +5 -5
- symai/functional.py +3 -4
- {symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/METADATA +2 -1
- {symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/RECORD +15 -12
- {symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/WHEEL +0 -0
- {symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/entry_points.txt +0 -0
- {symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/top_level.txt +0 -0
symai/__init__.py
CHANGED
symai/backend/engines/scrape/engine_requests.py
CHANGED
@@ -9,6 +9,7 @@ service disruption.
 
 import io
 import logging
+import random
 import re
 from typing import Any, ClassVar
 from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
@@ -17,7 +18,9 @@ import requests
 import trafilatura
 from bs4 import BeautifulSoup
 from pdfminer.high_level import extract_text
+from requests.adapters import HTTPAdapter
 from requests.structures import CaseInsensitiveDict
+from urllib3.util.retry import Retry
 
 from ....symbol import Result
 from ....utils import UserMessage
@@ -80,24 +83,49 @@ class RequestsEngine(Engine):
         "none": "None",
     }
 
-
+    USER_AGENT_POOL: ClassVar[list[str]] = [
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
+    ]
+
+    def __init__(self, timeout=15, verify_ssl=True, user_agent=None, retries=3, backoff_factor=0.5, retry_status_codes=(500, 502, 503, 504)):
         """
         Args:
             timeout: Seconds to wait for network operations before aborting.
             verify_ssl: Toggle for TLS certificate verification.
-            user_agent: Optional override for
+            user_agent: Optional override for user agent rotation.
+            retries: Number of retries for failed requests (default: 3).
+            backoff_factor: Multiplier for exponential backoff (default: 0.5).
+            retry_status_codes: HTTP status codes to retry on (default: 500, 502, 503, 504).
        """
        super().__init__()
        self.timeout = timeout
        self.verify_ssl = verify_ssl
        self.name = self.__class__.__name__
-
-        headers = dict(self.DEFAULT_HEADERS)
-        if user_agent:
-            headers["User-Agent"] = user_agent
+        self._user_agent_override = user_agent
 
         self.session = requests.Session()
-        self.session.headers.update(
+        self.session.headers.update({k: v for k, v in self.DEFAULT_HEADERS.items() if k != "User-Agent"})
+
+        retry_strategy = Retry(
+            total=retries,
+            backoff_factor=backoff_factor,
+            status_forcelist=retry_status_codes,
+            allowed_methods=["GET", "HEAD"],
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        self.session.mount("http://", adapter)
+        self.session.mount("https://", adapter)
+
+    def _get_user_agent(self) -> str:
+        """Return user agent: override if set, otherwise random from pool."""
+        return self._user_agent_override or random.choice(self.USER_AGENT_POOL)
 
     def _maybe_set_bypass_cookies(self, url: str):
         netloc = urlparse(url).hostname
@@ -232,7 +260,7 @@ class RequestsEngine(Engine):
         # Avoid loops
         if target == resp.url:
             return resp
-        return self.session.get(target, timeout=timeout, allow_redirects=True)
+        return self.session.get(target, timeout=timeout, allow_redirects=True, headers={"User-Agent": self._get_user_agent()})
 
     def _fetch_with_playwright(
         self,
@@ -259,7 +287,7 @@ class RequestsEngine(Engine):
 
         timeout_seconds = timeout if timeout is not None else self.timeout
         timeout_ms = max(int(timeout_seconds * 1000), 0)
-        user_agent = self.
+        user_agent = self._get_user_agent()
 
         parsed = urlparse(url)
         hostname = parsed.hostname or ""
@@ -348,7 +376,8 @@ class RequestsEngine(Engine):
             )
         else:
            resp = self.session.get(
-                clean_url, timeout=self.timeout, allow_redirects=True, verify=self.verify_ssl
+                clean_url, timeout=self.timeout, allow_redirects=True, verify=self.verify_ssl,
+                headers={"User-Agent": self._get_user_agent()}
            )
            resp.raise_for_status()
 
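The constructor change above replaces a fixed User-Agent header with per-request rotation and mounts a retrying adapter on the session. A minimal standalone sketch of the same retry-plus-rotation pattern, independent of the engine class (the pool entries and target URL are illustrative):

import random
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

UA_POOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
]

session = requests.Session()
# Retry idempotent GET/HEAD requests on transient 5xx responses with exponential backoff.
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=(500, 502, 503, 504), allowed_methods=["GET", "HEAD"])
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)

# Pick a fresh User-Agent per request instead of pinning one on the session headers.
resp = session.get("https://example.com", timeout=15, headers={"User-Agent": random.choice(UA_POOL)})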
symai/backend/engines/search/__init__.py
ADDED
@@ -0,0 +1,13 @@
+from .engine_firecrawl import FirecrawlEngine
+from .engine_parallel import ParallelEngine
+
+SEARCH_ENGINE_MAPPING = {
+    "firecrawl": FirecrawlEngine,
+    "parallel": ParallelEngine,
+}
+
+__all__ = [
+    "SEARCH_ENGINE_MAPPING",
+    "FirecrawlEngine",
+    "ParallelEngine",
+]
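The new mapping gives callers one place to resolve a search backend from a configured model name. A short sketch, assuming the package is installed with the search extra (the API key value is a placeholder):

from symai.backend.engines.search import SEARCH_ENGINE_MAPPING

engine_cls = SEARCH_ENGINE_MAPPING.get("firecrawl")  # FirecrawlEngine
if engine_cls is not None:
    engine = engine_cls(api_key="fc-...")  # placeholder key; both mapped engines accept api_key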
symai/backend/engines/search/engine_firecrawl.py
ADDED
@@ -0,0 +1,333 @@
+import json
+import logging
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Any
+from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+
+from firecrawl import Firecrawl
+from firecrawl.v2.types import ScrapeOptions
+
+from ....symbol import Result
+from ....utils import UserMessage
+from ...base import Engine
+from ...settings import SYMAI_CONFIG
+
+logging.getLogger("requests").setLevel(logging.ERROR)
+logging.getLogger("urllib3").setLevel(logging.ERROR)
+logging.getLogger("httpx").setLevel(logging.ERROR)
+
+TRACKING_KEYS = {
+    "utm_source",
+    "utm_medium",
+    "utm_campaign",
+    "utm_term",
+    "utm_content",
+}
+
+
+@dataclass
+class Citation:
+    id: int
+    title: str
+    url: str
+    start: int
+    end: int
+
+    def __hash__(self):
+        return hash((self.url,))
+
+
+class FirecrawlSearchResult(Result):
+    def __init__(
+        self, value: dict[str, Any] | Any, max_chars_per_result: int | None = None, **kwargs
+    ) -> None:
+        raw_dict = value.model_dump() if hasattr(value, "model_dump") else value
+        super().__init__(raw_dict, **kwargs)
+        self._citations: list[Citation] = []
+        self._max_chars_per_result = max_chars_per_result
+        try:
+            text, citations = self._build_text_and_citations(raw_dict)
+            self._value = text
+            self._citations = citations
+        except Exception as e:
+            self._value = None
+            UserMessage(f"Failed to parse Firecrawl search response: {e}", raise_with=ValueError)
+
+    def _build_text_and_citations(self, data: dict[str, Any]) -> tuple[str, list[Citation]]:
+        results = []
+        for source in ["web", "news", "images"]:
+            source_data = data.get(source) or []
+            results.extend(source_data)
+
+        if not results:
+            return "", []
+
+        parts = []
+        citations = []
+        cursor = 0
+
+        for idx, item in enumerate(results, 1):
+            # Handle both SearchResultWeb (url/title at top level) and Document (url/title in metadata)
+            metadata = item.get("metadata") or {}
+            url = item.get("url") or metadata.get("url") or metadata.get("source_url") or ""
+            title = item.get("title") or metadata.get("title") or ""
+
+            if not url:
+                continue
+
+            # Check if this is a scraped result (has markdown content)
+            markdown = item.get("markdown", "")
+            if markdown:
+                content = markdown
+                if self._max_chars_per_result and len(content) > self._max_chars_per_result:
+                    content = content[: self._max_chars_per_result] + "..."
+                result_text = f"{title}\n{url}\n{content}"
+            else:
+                description = (
+                    item.get("description")
+                    or item.get("snippet")
+                    or metadata.get("description")
+                    or ""
+                )
+                result_text = f"{title}\n{url}"
+                if description:
+                    if self._max_chars_per_result and len(description) > self._max_chars_per_result:
+                        description = description[: self._max_chars_per_result] + "..."
+                    result_text += f"\n{description}"
+
+            if parts:
+                parts.append("\n\n")
+                cursor += 2
+
+            parts.append(result_text)
+            cursor += len(result_text)
+
+            marker = f"[{idx}]"
+            start = cursor
+            parts.append(marker)
+            cursor += len(marker)
+
+            citations.append(Citation(id=idx, title=title, url=url, start=start, end=cursor))
+
+        text = "".join(parts)
+        return text, citations
+
+    def __str__(self) -> str:
+        if isinstance(self._value, str) and self._value:
+            return self._value
+        try:
+            return json.dumps(self.raw, indent=2)
+        except TypeError:
+            return str(self.raw)
+
+    def _repr_html_(self) -> str:
+        if isinstance(self._value, str) and self._value:
+            return f"<pre>{self._value}</pre>"
+        try:
+            return f"<pre>{json.dumps(self.raw, indent=2)}</pre>"
+        except Exception:
+            return f"<pre>{self.raw!s}</pre>"
+
+    def get_citations(self) -> list[Citation]:
+        return self._citations
+
+
+class FirecrawlExtractResult(Result):
+    """Result wrapper for Firecrawl scrape API responses."""
+
+    def __init__(self, value: Any, **kwargs) -> None:
+        raw_dict = value.model_dump() if hasattr(value, "model_dump") else value
+        super().__init__(raw_dict, **kwargs)
+        try:
+            self._value = self._extract_content(raw_dict)
+        except Exception as e:
+            self._value = None
+            UserMessage(f"Failed to parse Firecrawl scrape response: {e}", raise_with=ValueError)
+
+    def _extract_content(self, data: dict[str, Any]) -> str:
+        content = data.get("markdown") or data.get("html") or data.get("raw_html")
+        if content:
+            return str(content)
+        json_data = data.get("json")
+        if json_data:
+            return json.dumps(json_data, indent=2)
+        return ""
+
+    def __str__(self) -> str:
+        try:
+            return str(self._value or "")
+        except Exception:
+            return ""
+
+    def _repr_html_(self) -> str:
+        try:
+            return f"<pre>{self._value or ''}</pre>"
+        except Exception:
+            return "<pre></pre>"
+
+
+class FirecrawlEngine(Engine):
+    def __init__(self, api_key: str | None = None):
+        super().__init__()
+        self.config = deepcopy(SYMAI_CONFIG)
+        self.api_key = api_key or self.config.get("SEARCH_ENGINE_API_KEY")
+        self.model = self.config.get("SEARCH_ENGINE_MODEL")
+        self.name = self.__class__.__name__
+
+        if not self.api_key:
+            UserMessage(
+                "Firecrawl API key not found. Set SEARCH_ENGINE_API_KEY in config or environment.",
+                raise_with=ValueError,
+            )
+
+        try:
+            self.client = Firecrawl(api_key=self.api_key)
+        except Exception as e:
+            UserMessage(f"Failed to initialize Firecrawl client: {e}", raise_with=ValueError)
+
+    def id(self) -> str:
+        if (
+            self.config.get("SEARCH_ENGINE_API_KEY")
+            and str(self.config.get("SEARCH_ENGINE_MODEL", "")).lower() == "firecrawl"
+        ):
+            return "search"
+        return super().id()
+
+    def command(self, *args, **kwargs):
+        super().command(*args, **kwargs)
+        if "SEARCH_ENGINE_API_KEY" in kwargs:
+            self.api_key = kwargs["SEARCH_ENGINE_API_KEY"]
+        if "SEARCH_ENGINE_MODEL" in kwargs:
+            self.model = kwargs["SEARCH_ENGINE_MODEL"]
+
+    def _normalize_url(self, url: str) -> str:
+        parts = urlsplit(url)
+        filtered_query = [
+            (k, v)
+            for k, v in parse_qsl(parts.query, keep_blank_values=True)
+            if k not in TRACKING_KEYS and not k.lower().startswith("utm_")
+        ]
+        query = urlencode(filtered_query, doseq=True)
+        return urlunsplit((parts.scheme, parts.netloc, parts.path, query, parts.fragment))
+
+    def _search(self, query: str, kwargs: dict[str, Any]):
+        if not query:
+            UserMessage(
+                "FirecrawlEngine._search requires a non-empty query.", raise_with=ValueError
+            )
+
+        max_chars_per_result = kwargs.get("max_chars_per_result")
+
+        # Build search kwargs
+        search_kwargs = {}
+        if "limit" in kwargs:
+            search_kwargs["limit"] = kwargs["limit"]
+        if "location" in kwargs:
+            search_kwargs["location"] = kwargs["location"]
+        if "tbs" in kwargs:
+            search_kwargs["tbs"] = kwargs["tbs"]
+        if "sources" in kwargs:
+            search_kwargs["sources"] = kwargs["sources"]
+        if "categories" in kwargs:
+            search_kwargs["categories"] = kwargs["categories"]
+        if "timeout" in kwargs:
+            search_kwargs["timeout"] = kwargs["timeout"]
+
+        # Build scrape options for search results content
+        scrape_opts = {}
+        if "formats" in kwargs:
+            scrape_opts["formats"] = kwargs["formats"]
+        if "proxy" in kwargs:
+            scrape_opts["proxy"] = kwargs["proxy"]
+        if "only_main_content" in kwargs:
+            scrape_opts["only_main_content"] = kwargs["only_main_content"]
+        if "scrape_location" in kwargs:
+            scrape_opts["location"] = kwargs["scrape_location"]
+        if "include_tags" in kwargs:
+            scrape_opts["include_tags"] = kwargs["include_tags"]
+        if "exclude_tags" in kwargs:
+            scrape_opts["exclude_tags"] = kwargs["exclude_tags"]
+
+        if scrape_opts:
+            search_kwargs["scrape_options"] = ScrapeOptions(**scrape_opts)
+
+        try:
+            result = self.client.search(query, **search_kwargs)
+        except Exception as e:
+            UserMessage(f"Failed to call Firecrawl Search API: {e}", raise_with=ValueError)
+
+        raw = result.model_dump() if hasattr(result, "model_dump") else result
+        return [FirecrawlSearchResult(result, max_chars_per_result=max_chars_per_result)], {
+            "raw_output": raw
+        }
+
+    def _extract(self, url: str, kwargs: dict[str, Any]):
+        normalized_url = self._normalize_url(url)
+
+        # Build scrape kwargs
+        scrape_kwargs = {"formats": kwargs.get("formats", ["markdown"])}
+        if "only_main_content" in kwargs:
+            scrape_kwargs["only_main_content"] = kwargs["only_main_content"]
+        if "timeout" in kwargs:
+            scrape_kwargs["timeout"] = kwargs["timeout"]
+        if "proxy" in kwargs:
+            scrape_kwargs["proxy"] = kwargs["proxy"]
+        if "location" in kwargs:
+            scrape_kwargs["location"] = kwargs["location"]
+        if "max_age" in kwargs:
+            scrape_kwargs["max_age"] = kwargs["max_age"]
+        if "store_in_cache" in kwargs:
+            scrape_kwargs["store_in_cache"] = kwargs["store_in_cache"]
+        if "actions" in kwargs:
+            scrape_kwargs["actions"] = kwargs["actions"]
+        if "headers" in kwargs:
+            scrape_kwargs["headers"] = kwargs["headers"]
+        if "include_tags" in kwargs:
+            scrape_kwargs["include_tags"] = kwargs["include_tags"]
+        if "exclude_tags" in kwargs:
+            scrape_kwargs["exclude_tags"] = kwargs["exclude_tags"]
+        if "wait_for" in kwargs:
+            scrape_kwargs["wait_for"] = kwargs["wait_for"]
+        if "mobile" in kwargs:
+            scrape_kwargs["mobile"] = kwargs["mobile"]
+
+        try:
+            result = self.client.scrape(normalized_url, **scrape_kwargs)
+        except Exception as e:
+            UserMessage(f"Failed to call Firecrawl Scrape API: {e}", raise_with=ValueError)
+
+        raw = result.model_dump() if hasattr(result, "model_dump") else result
+        return [FirecrawlExtractResult(result)], {"raw_output": raw, "final_url": normalized_url}
+
+    def forward(self, argument):
+        kwargs = argument.kwargs
+        url = argument.prop.url or kwargs.get("url")
+        if url:
+            return self._extract(str(url), kwargs)
+
+        raw_query = argument.prop.prepared_input
+        if raw_query is None:
+            raw_query = argument.prop.query
+
+        query = str(raw_query or "").strip() if raw_query else ""
+        if not query:
+            UserMessage(
+                "FirecrawlEngine.forward requires at least one non-empty query or url.",
+                raise_with=ValueError,
+            )
+
+        return self._search(query, kwargs)
+
+    def prepare(self, argument):
+        url = argument.kwargs.get("url") or argument.prop.url
+        if url:
+            argument.prop.prepared_input = str(url)
+            return
+
+        query = argument.prop.query
+        if isinstance(query, list):
+            argument.prop.prepared_input = " ".join(str(q) for q in query if q)
+            return
+
+        argument.prop.prepared_input = str(query or "").strip()
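FirecrawlSearchResult flattens web, news, and image hits into one string and appends a "[n]" marker per hit, recording each marker's character span in a Citation. A toy illustration of that offset bookkeeping with a single made-up hit (not a real API response):

parts, citations, cursor = [], [], 0
hits = [("Example Domain", "https://example.com", "A reserved example page.")]  # fabricated hit
for idx, (title, url, description) in enumerate(hits, 1):
    if parts:                      # separate consecutive results
        parts.append("\n\n")
        cursor += 2
    body = f"{title}\n{url}\n{description}"
    parts.append(body)
    cursor += len(body)
    marker = f"[{idx}]"            # citation marker appended right after the result text
    start = cursor
    parts.append(marker)
    cursor += len(marker)
    citations.append((idx, url, start, cursor))  # Citation(id=..., url=..., start=..., end=...)
text = "".join(parts)
assert text[citations[0][2]:citations[0][3]] == "[1]"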
symai/backend/engines/search/engine_parallel.py
CHANGED
@@ -66,7 +66,7 @@ class Citation:
         return hash((self.url,))
 
 
-class
+class ParallelSearchResult(Result):
     def __init__(self, value: dict[str, Any] | Any, **kwargs) -> None:
         super().__init__(value, **kwargs)
         if isinstance(value, dict) and value.get("error"):
@@ -286,7 +286,7 @@ class SearchResult(Result):
         return self._citations
 
 
-class
+class ParallelExtractResult(Result):
     """Result wrapper for Parallel Extract API responses."""
 
     def __init__(self, value: dict[str, Any] | Any, **kwargs) -> None:
@@ -485,7 +485,7 @@ class ParallelEngine(Engine):
             )
         except Exception as e:
             UserMessage(f"Failed to call Parallel Search API: {e}", raise_with=ValueError)
-        return [
+        return [ParallelSearchResult(result)], {"raw_output": result}
 
     def _task(self, queries: list[str], kwargs: dict[str, Any]):
         processor_name = self._coerce_processor(kwargs.get("processor"))
@@ -521,7 +521,7 @@ class ParallelEngine(Engine):
         result = self._fetch_task_result(run.run_id, timeout=timeout, api_timeout=api_timeout)
 
         payload = self._task_result_to_search_payload(result)
-        return [
+        return [ParallelSearchResult(payload)], {
             "raw_output": result,
             "task_output": payload.get("task_output"),
             "task_output_type": payload.get("task_output_type"),
@@ -699,7 +699,7 @@ class ParallelEngine(Engine):
             )
         except Exception as e:
             UserMessage(f"Failed to call Parallel Extract API: {e}", raise_with=ValueError)
-        return [
+        return [ParallelExtractResult(result)], {"raw_output": result, "final_url": url}
 
     def forward(self, argument):
         kwargs = argument.kwargs
symai/components.py
CHANGED
@@ -1508,12 +1508,18 @@ class DynamicEngine(Expression):
         """Create an engine instance based on the model name."""
         # Deferred to avoid components <-> neurosymbolic engine circular imports.
         from .backend.engines.neurosymbolic import ENGINE_MAPPING  # noqa
-        from .backend.engines.
+        from .backend.engines.search import SEARCH_ENGINE_MAPPING  # noqa
 
         try:
+            # Check neurosymbolic engines first
             engine_class = ENGINE_MAPPING.get(self.model)
-
-
+
+            # Check search engines
+            if engine_class is None:
+                engine_class = SEARCH_ENGINE_MAPPING.get(self.model)
+                if engine_class is not None:
+                    return engine_class(api_key=self.api_key)
+
             if engine_class is None:
                 UserMessage(f"Unsupported model '{self.model}'", raise_with=ValueError)
             return engine_class(api_key=self.api_key, model=self.model)
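The DynamicEngine change above consults the neurosymbolic mapping first and falls back to the search mapping, and only the neurosymbolic constructors receive the model name. A condensed sketch of that resolution order (the helper function name is illustrative, not part of the package):

from symai.backend.engines.neurosymbolic import ENGINE_MAPPING
from symai.backend.engines.search import SEARCH_ENGINE_MAPPING

def resolve_engine(model: str, api_key: str):
    engine_class = ENGINE_MAPPING.get(model)          # neurosymbolic models first
    if engine_class is not None:
        return engine_class(api_key=api_key, model=model)
    engine_class = SEARCH_ENGINE_MAPPING.get(model)   # then "firecrawl" / "parallel"
    if engine_class is not None:
        return engine_class(api_key=api_key)          # search engines take only the key
    raise ValueError(f"Unsupported model '{model}'")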
symai/extended/interfaces/firecrawl.py
ADDED
@@ -0,0 +1,30 @@
+from ... import core
+from ...backend.engines.search.engine_firecrawl import FirecrawlExtractResult, FirecrawlSearchResult
+from ...symbol import Expression, Symbol
+
+
+class firecrawl(Expression):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.name = self.__class__.__name__
+
+    def search(self, query: Symbol, **kwargs) -> FirecrawlSearchResult:
+        query = self._to_symbol(query)
+
+        @core.search(query=query.value, **kwargs)
+        def _func(_) -> FirecrawlSearchResult:
+            pass
+
+        return _func(self)
+
+    def scrape(self, url: str, **kwargs) -> FirecrawlExtractResult:
+        symbol = self._to_symbol(url)
+        options = dict(kwargs)
+        options.pop("query", None)
+        options["url"] = symbol.value
+
+        @core.search(query="", **options)
+        def _func(_, *_args, **_inner_kwargs) -> FirecrawlExtractResult:
+            return None
+
+        return _func(self)
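A hedged usage sketch of the new interface, assuming SEARCH_ENGINE_MODEL is set to "firecrawl" and SEARCH_ENGINE_API_KEY is configured; the query, URL, and keyword values are illustrative:

from symai.extended.interfaces.firecrawl import firecrawl

fc = firecrawl()
hits = fc.search("neurosymbolic programming", limit=5)   # FirecrawlSearchResult
print(hits)                    # concatenated results with [n] citation markers
print(hits.get_citations())    # list of Citation(id, title, url, start, end)

page = fc.scrape("https://example.com", formats=["markdown"])  # FirecrawlExtractResult
print(page)                    # markdown (or html/json) content of the page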
symai/extended/interfaces/parallel.py
CHANGED
@@ -1,5 +1,5 @@
 from ... import core
-from ...backend.engines.search.engine_parallel import
+from ...backend.engines.search.engine_parallel import ParallelExtractResult, ParallelSearchResult
 from ...symbol import Expression, Symbol
 
 
@@ -8,23 +8,23 @@ class parallel(Expression):
         super().__init__(*args, **kwargs)
         self.name = self.__class__.__name__
 
-    def search(self, query: Symbol, **kwargs) ->
+    def search(self, query: Symbol, **kwargs) -> ParallelSearchResult:
         query = self._to_symbol(query)
 
         @core.search(query=query.value, **kwargs)
-        def _func(_) ->
+        def _func(_) -> ParallelSearchResult:
             pass
 
         return _func(self)
 
-    def scrape(self, url: str, **kwargs) ->
+    def scrape(self, url: str, **kwargs) -> ParallelExtractResult:
         symbol = self._to_symbol(url)
         options = dict(kwargs)
         options.pop("query", None)
         options["url"] = symbol.value
 
         @core.search(query="", **options)
-        def _func(_, *_args, **_inner_kwargs) ->
+        def _func(_, *_args, **_inner_kwargs) -> ParallelExtractResult:
             return None
 
         return _func(self)
symai/functional.py
CHANGED
@@ -498,10 +498,9 @@ class EngineRepository:
     def get(engine_name: str, *_args, **_kwargs):
         self = EngineRepository()
         # First check if we're in the context manager that dynamically changes models
-
-
-
-            return engine
+        dynamic_engine = self.get_dynamic_engine_instance()
+        if dynamic_engine is not None and engine_name in ("neurosymbolic", "search"):
+            return dynamic_engine
 
         # Otherwise, fallback to normal lookup:
         if engine_name not in self._engines:
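With the change above, an actively selected dynamic engine also answers "search" lookups, not just "neurosymbolic" ones; a rough sketch of the effect (names taken from the diff):

# While a dynamic engine is active, both lookups below return that same instance;
# otherwise the repository falls back to its registered engines.
search_engine = EngineRepository.get("search")
neuro_engine = EngineRepository.get("neurosymbolic")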
{symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: symbolicai
-Version: 1.4.0
+Version: 1.5.0
 Summary: A Neurosymbolic Perspective on Large Language Models
 Author-email: Marius-Constantin Dinu <marius@extensity.ai>, Leoveanu-Condrei Claudiu <leo@extensity.ai>
 License: BSD 3-Clause License
@@ -113,6 +113,7 @@ Requires-Dist: openai-whisper>=20240930; extra == "whisper"
 Requires-Dist: numba>=0.62.1; extra == "whisper"
 Requires-Dist: llvmlite>=0.45.1; extra == "whisper"
 Provides-Extra: search
+Requires-Dist: firecrawl-py>=4.12.0; extra == "search"
 Requires-Dist: parallel-web>=0.3.3; extra == "search"
 Provides-Extra: serpapi
 Requires-Dist: google_search_results>=2.4.2; extra == "serpapi"
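The "search" extra now also pulls in firecrawl-py, which provides the client imported by the new engine. A quick availability check, purely illustrative:

try:
    from firecrawl import Firecrawl  # provided by firecrawl-py>=4.12.0 (the "search" extra)
except ImportError:
    Firecrawl = None  # engine_firecrawl.py cannot be used without the extra installed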
{symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/RECORD
CHANGED
@@ -1,13 +1,13 @@
 symai/TERMS_OF_SERVICE.md,sha256=HN42UXVI_wAVDHjMShzy_k7xAsbjXaATNeMKcIte_eg,91409
-symai/__init__.py,sha256=
+symai/__init__.py,sha256=qlqkm2OjRqXtKhIBltfB9zx0kBf4V4ygckH1RHVPAVE,18530
 symai/chat.py,sha256=DCEbmZ96wv-eitAVt6-oF6PT3JM3cT59Iy3r2Hucd_M,14100
-symai/components.py,sha256=
+symai/components.py,sha256=XL1whwdZd6HCl0viUuXca_7d8no_xxfTGZsqE1hhwqI,64845
 symai/constraints.py,sha256=ljjB9p0qK4DrDl_u5G_Y-Y6WAH5ZHANIqLLxRtwcORs,1980
 symai/context.py,sha256=4M69MJOeWSdPTr2Y9teoNTs-nEvpzcAcr7900UgORXA,189
 symai/core.py,sha256=gI9qvTT0Skq2D0izdhAoN3RdwBtWei59KO52mKN1Sos,70420
 symai/core_ext.py,sha256=lS_BZNeUGmNhhXR-F3dFLF26_nZHq3NVaAwa4vAbkTQ,8937
 symai/exceptions.py,sha256=BxpxI8q3-7Uh_Kg9Xi2PhF6RR6CofxV1h8R07j4v47U,165
-symai/functional.py,sha256=
+symai/functional.py,sha256=GqBs5FZPVZ3iVJ-MlO0Zvkf7cNSDgVhkt3tsL82kFrM,21457
 symai/imports.py,sha256=P5WsamkfKxsK3fs8vlrFpC6CIv5WVpMIMNue9DKJGnE,16126
 symai/interfaces.py,sha256=Z8CDdarnOVa67GCLljKjxQojDH9MhhPKBQFb0pi2WfY,3458
 symai/memory.py,sha256=Cd60UyeJk7SHNBWEYOLrmUXQy54GzQsu3Mjh0lfNQOY,3716
@@ -59,9 +59,11 @@ symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py,sha256=yWiCT
 symai/backend/engines/neurosymbolic/engine_openai_responses.py,sha256=J3P7WcQhxWSPK99uZuLClpIDlLRqLJFWYwDJHrBKox4,17830
 symai/backend/engines/ocr/engine_apilayer.py,sha256=UpC3oHBdSM6wlPVqxwMkemBd-Y0ReVwc270O_EVbRD0,2267
 symai/backend/engines/output/engine_stdout.py,sha256=BWNXACl5U-WYIJnT1pZNwZsTRMzP1XzA0A7o693mmyQ,899
-symai/backend/engines/scrape/engine_requests.py,sha256=
+symai/backend/engines/scrape/engine_requests.py,sha256=uXQ8PGeRN2OyM0_ioEI61rkv5PqSBE0wayAJNS7s8ZA,15819
+symai/backend/engines/search/__init__.py,sha256=iW6kEBOZ-gUiPYfcIWupNgewiqLrFOBGJ643kqwQFoM,274
+symai/backend/engines/search/engine_firecrawl.py,sha256=M_nxXBtvudNqRR4gTC5dXoJzf_9ofrMScYXzaGVTmaM,11990
 symai/backend/engines/search/engine_openai.py,sha256=hAEu3vPZzLTvgmNc4BSZDTcNb4ek4xYeOf8xgti2zRs,14248
-symai/backend/engines/search/engine_parallel.py,sha256=
+symai/backend/engines/search/engine_parallel.py,sha256=voMmeJZ5bf1x3pt7uxMJu84z6VLLG0-ZfgFUWvhM-vI,27048
 symai/backend/engines/search/engine_perplexity.py,sha256=rXnZjMCSiIRuJcNSchE58-f9zWJmYpkKMHONF_XwGnk,4100
 symai/backend/engines/search/engine_serpapi.py,sha256=ZJJBnEDoLjkpxWt_o4vFZanwqojH8ZFBWmWNnEaIbww,3618
 symai/backend/engines/speech_to_text/engine_local_whisper.py,sha256=EOUh2GCeEhZ2Av72i_AZ4NSj9e46Pl7Ft6sIErFy6FI,8387
@@ -106,6 +108,7 @@ symai/extended/interfaces/clip.py,sha256=l6vjEq3cF-wDX9cRPulyiKpDFQB8QI2609GcGtv
 symai/extended/interfaces/console.py,sha256=qeAnG80f95ArADjfpk57AaDA1cHUQSkaUrau2zGNSKs,637
 symai/extended/interfaces/dall_e.py,sha256=SSF1K17SzA-lpdHVtsfHbwRCP6XJxWqsNdXoWwcBYjw,551
 symai/extended/interfaces/file.py,sha256=1_BXHKsHm78MmBeRolA_fFWFTLuA6on7Le-ZF4S_1ds,457
+symai/extended/interfaces/firecrawl.py,sha256=hGA5WxiW6EN5LNsfBSlsYzASgvz9e515TWrHGHcE21s,955
 symai/extended/interfaces/flux.py,sha256=LTY_I9UtIxnh3Nc4cBPQhQ6upB6CVZIhc1uOnFpxEIo,532
 symai/extended/interfaces/gpt_image.py,sha256=Jk5-9og440eZeRAhKmjdyhwP22wX58q0NcFuVhIFWZQ,718
 symai/extended/interfaces/input.py,sha256=CFMLf2j_a-rZ1ApaEwfgqZmWVS7_1yj_u6iiqtiOGPs,456
@@ -115,7 +118,7 @@ symai/extended/interfaces/naive_scrape.py,sha256=KPjTSBXSCr5zwHwIPgF-VwLSTD2OjVc
 symai/extended/interfaces/naive_vectordb.py,sha256=fm7DBMYYnSx7Ma7eNnCmuOVyQwNGnkiDR31oV-qNrJA,1348
 symai/extended/interfaces/ocr.py,sha256=MMxgp8ZKoM44doJPZzzrBVh2VxChs6faFu2uFYnbzfU,563
 symai/extended/interfaces/openai_search.py,sha256=UvnSihdfIwybrLDz2A-yt92aklHEHIvh0pt0hp1Dpis,528
-symai/extended/interfaces/parallel.py,sha256=
+symai/extended/interfaces/parallel.py,sha256=kWRcrs_vTPvZDDhKjl1Hp94ltZeiYH7K8l9zOy5jd-I,947
 symai/extended/interfaces/perplexity.py,sha256=vSUl8CfBsFhFrzxws9Lf8WgfhsoPatJf7eYRfihKRG4,529
 symai/extended/interfaces/pinecone.py,sha256=NA2t1pNQf-G-HSeewEO8jqGnitD3huBV5bucIM9vgi4,1075
 symai/extended/interfaces/python.py,sha256=EcxXQwrlhjGOS5SkRoa_cVt069vu_INDD9DIfbnUses,418
@@ -163,9 +166,9 @@ symai/server/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 symai/server/huggingface_server.py,sha256=wSAVqFiKQsCu5UB2YYVpxJBhJ7GgQBBfePxNi265yP8,9039
 symai/server/llama_cpp_server.py,sha256=-WPTNB2cbnwtnpES4AtPM__MCasDKl83jr94JGS9tmI,2144
 symai/server/qdrant_server.py,sha256=l4r4rz29c7cO1dapXO0LQ4sHW4WF44keuz7j8v5azMc,9854
-symbolicai-1.
-symbolicai-1.
-symbolicai-1.
-symbolicai-1.
-symbolicai-1.
-symbolicai-1.
+symbolicai-1.5.0.dist-info/licenses/LICENSE,sha256=9vRFudlJ1ghVfra5lcCUIYQCqnZSYcBLjLHbGRsrQCs,1505
+symbolicai-1.5.0.dist-info/METADATA,sha256=gQLPEUb1pW2VPNqCtgN-WcXeSQnfUJAWx0KTAN3vnJw,23731
+symbolicai-1.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+symbolicai-1.5.0.dist-info/entry_points.txt,sha256=JV5sdydIfUZdDF6QBEQHiZHod6XNPjCjpWQrXh7gTAw,261
+symbolicai-1.5.0.dist-info/top_level.txt,sha256=bOoIDfpDIvCQtQgXcwVKJvxAKwsxpxo2IL4z92rNJjw,6
+symbolicai-1.5.0.dist-info/RECORD,,

{symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/WHEEL
File without changes
{symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/entry_points.txt
File without changes
{symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/licenses/LICENSE
File without changes
{symbolicai-1.4.0.dist-info → symbolicai-1.5.0.dist-info}/top_level.txt
File without changes