symbolicai 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- symai/__init__.py +1 -1
- symai/backend/engines/index/engine_qdrant.py +222 -10
- symai/backend/engines/scrape/engine_requests.py +39 -10
- symai/backend/engines/search/__init__.py +13 -0
- symai/backend/engines/search/engine_firecrawl.py +333 -0
- symai/backend/engines/search/engine_parallel.py +5 -5
- symai/components.py +9 -3
- symai/extended/interfaces/__init__.py +1 -0
- symai/extended/interfaces/firecrawl.py +30 -0
- symai/extended/interfaces/local_search.py +57 -0
- symai/extended/interfaces/parallel.py +5 -5
- symai/functional.py +3 -4
- {symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/METADATA +4 -1
- {symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/RECORD +18 -14
- {symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/WHEEL +0 -0
- {symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/entry_points.txt +0 -0
- {symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/top_level.txt +0 -0
symai/__init__.py
CHANGED
symai/backend/engines/index/engine_qdrant.py
CHANGED

@@ -4,8 +4,10 @@ import tempfile
 import urllib.request
 import uuid
 import warnings
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
+from urllib.parse import urlparse

 import numpy as np

@@ -148,6 +150,108 @@ Matches:
         return f"<ul>{doc_str}</ul>"


+@dataclass
+class Citation:
+    id: int
+    title: str
+    url: str
+    start: int
+    end: int
+
+    def __hash__(self):
+        return hash((self.url,))
+
+
+class SearchResult(Result):
+    def __init__(self, value: dict[str, Any] | Any, **kwargs) -> None:
+        super().__init__(value, **kwargs)
+        if isinstance(value, dict) and value.get("error"):
+            UserMessage(value["error"], raise_with=ValueError)
+        results = self._coerce_results(value)
+        text, citations = self._build_text_and_citations(results)
+        self._value = text
+        self._citations = citations
+
+    def _coerce_results(self, raw: Any) -> list[dict[str, Any]]:
+        if raw is None:
+            return []
+        results = raw.get("results", []) if isinstance(raw, dict) else getattr(raw, "results", None)
+        if not results:
+            return []
+        return [item for item in results if isinstance(item, dict)]
+
+    def _source_identifier(self, item: dict[str, Any], url: str) -> str:
+        for key in ("source_id", "sourceId", "sourceID", "id"):
+            raw = item.get(key)
+            if raw is None:
+                continue
+            text = str(raw).strip()
+            if text:
+                return text
+        path = Path(urlparse(url).path)
+        return path.name or path.as_posix() or url
+
+    def _build_text_and_citations(self, results: list[dict[str, Any]]):
+        pieces = []
+        citations = []
+        cursor = 0
+        cid = 1
+        separator = "\n\n---\n\n"
+
+        for item in results:
+            url = str(item.get("url") or "")
+            if not url:
+                continue
+
+            title = str(item.get("title") or "")
+            if not title:
+                path = Path(urlparse(url).path)
+                title = path.name or url
+
+            excerpts = item.get("excerpts") or []
+            excerpt_parts = [ex.strip() for ex in excerpts if isinstance(ex, str) and ex.strip()]
+            if not excerpt_parts:
+                continue
+
+            combined_excerpt = "\n\n".join(excerpt_parts)
+            source_id = self._source_identifier(item, url)
+            block_body = combined_excerpt if not source_id else f"{source_id}\n\n{combined_excerpt}"
+
+            if pieces:
+                pieces.append(separator)
+                cursor += len(separator)
+
+            opening_tag = "<source>\n"
+            pieces.append(opening_tag)
+            cursor += len(opening_tag)
+
+            pieces.append(block_body)
+            cursor += len(block_body)
+
+            closing_tag = "\n</source>"
+            pieces.append(closing_tag)
+            cursor += len(closing_tag)
+
+            marker = f"[{cid}]"
+            start = cursor
+            pieces.append(marker)
+            cursor += len(marker)
+
+            citations.append(Citation(id=cid, title=title or url, url=url, start=start, end=cursor))
+            cid += 1
+
+        return "".join(pieces), citations
+
+    def __str__(self) -> str:
+        return str(self._value or "")
+
+    def _repr_html_(self) -> str:
+        return f"<pre>{self._value or ''}</pre>"
+
+    def get_citations(self) -> list[Citation]:
+        return self._citations
+
+
 class QdrantIndexEngine(Engine):
     _default_url = "http://localhost:6333"
     _default_api_key = SYMAI_CONFIG.get("INDEXING_ENGINE_API_KEY", None)
@@ -421,15 +525,18 @@ class QdrantIndexEngine(Engine):
            kwargs["index_get"] = True
            self._configure_collection(**kwargs)

+        treat_as_search_engine = False
        if operation == "search":
            # Ensure collection exists - fail fast if it doesn't
            self._ensure_collection_exists(collection_name)
-
+            search_kwargs = dict(kwargs)
+            index_top_k = search_kwargs.pop("index_top_k", self.index_top_k)
            # Optional search parameters
-            score_threshold =
+            score_threshold = search_kwargs.pop("score_threshold", None)
            # Accept both `query_filter` and `filter` for convenience
-            raw_filter =
+            raw_filter = search_kwargs.pop("query_filter", search_kwargs.pop("filter", None))
            query_filter = self._build_query_filter(raw_filter)
+            treat_as_search_engine = bool(search_kwargs.pop("treat_as_search_engine", False))

            # Use shared search helper that already handles retries and normalization
            rsp = self._search_sync(
@@ -438,6 +545,7 @@ class QdrantIndexEngine(Engine):
                limit=index_top_k,
                score_threshold=score_threshold,
                query_filter=query_filter,
+                **search_kwargs,
            )
        elif operation == "add":
            # Create collection if it doesn't exist (only for write operations)
@@ -462,7 +570,10 @@ class QdrantIndexEngine(Engine):

        metadata = {}

-
+        if operation == "search" and treat_as_search_engine:
+            rsp = self._format_search_results(rsp, collection_name)
+        else:
+            rsp = QdrantResult(rsp, query, embedding)
        return [rsp], metadata

    def prepare(self, argument):
@@ -513,7 +624,33 @@ class QdrantIndexEngine(Engine):
            jitter=self.jitter,
        )
        def _func():
+            qdrant_kwargs = dict(kwargs)
            query_vector_normalized = self._normalize_vector(query_vector)
+            with_payload = qdrant_kwargs.pop("with_payload", True)
+            with_vectors = qdrant_kwargs.pop("with_vectors", self.index_values)
+            # qdrant-client `query_points` is strict about extra kwargs and will assert if any
+            # unknown arguments are provided. Because our engine `forward()` passes decorator
+            # kwargs through the stack, we must drop engine-internal fields here.
+            #
+            # Keep only kwargs that `qdrant_client.QdrantClient.query_points` accepts (besides
+            # those we pass explicitly).
+            if "filter" in qdrant_kwargs and "query_filter" not in qdrant_kwargs:
+                # Convenience alias supported by our public API
+                qdrant_kwargs["query_filter"] = qdrant_kwargs.pop("filter")
+
+            allowed_qdrant_kwargs = {
+                "using",
+                "prefetch",
+                "query_filter",
+                "search_params",
+                "offset",
+                "score_threshold",
+                "lookup_from",
+                "consistency",
+                "shard_key_selector",
+                "timeout",
+            }
+            qdrant_kwargs = {k: v for k, v in qdrant_kwargs.items() if k in allowed_qdrant_kwargs}
            # For single vector collections, pass vector directly to query parameter
            # For named vector collections, use Query(near_vector=NamedVector(name="vector_name", vector=...))
            # query_points API uses query_filter (not filter) for filtering
@@ -521,9 +658,9 @@ class QdrantIndexEngine(Engine):
                collection_name=collection_name,
                query=query_vector_normalized,
                limit=top_k,
-                with_payload=
-                with_vectors=
-                **
+                with_payload=with_payload,
+                with_vectors=with_vectors,
+                **qdrant_kwargs,
            )
            # query_points returns QueryResponse with .points attribute, extract it
            return response.points
@@ -860,6 +997,82 @@ class QdrantIndexEngine(Engine):
        # Use _query which handles retry logic and vector normalization
        return self._query(collection_name, query_vector, limit, **search_kwargs)

+    def _resolve_payload_url(
+        self, payload: dict[str, Any], collection_name: str, point_id: Any
+    ) -> str:
+        source = (
+            payload.get("source")
+            or payload.get("url")
+            or payload.get("file_path")
+            or payload.get("path")
+        )
+        if isinstance(source, str) and source:
+            if source.startswith(("http://", "https://", "file://")):
+                return source
+
+            source_path = Path(source).expanduser()
+            try:
+                resolved = source_path.resolve()
+                if resolved.exists() or source_path.is_absolute():
+                    return resolved.as_uri()
+            except Exception:
+                return str(source_path)
+            return str(source_path)
+
+        return f"qdrant://{collection_name}/{point_id}"
+
+    def _resolve_payload_title(self, payload: dict[str, Any], url: str, page: Any) -> str:
+        raw_title = payload.get("title")
+        if isinstance(raw_title, str) and raw_title.strip():
+            base = raw_title.strip()
+        else:
+            parsed = urlparse(url)
+            path_part = parsed.path or url
+            base = Path(path_part).stem or url
+
+        try:
+            page_int = int(page) if page is not None else None
+        except (TypeError, ValueError):
+            page_int = None
+
+        if Path(urlparse(url).path).suffix.lower() == ".pdf" and page_int is not None:
+            base = f"{base}#p{page_int}"
+
+        return base
+
+    def _format_search_results(self, points: list[ScoredPoint] | None, collection_name: str):
+        results: list[dict[str, Any]] = []
+
+        for point in points or []:
+            payload = getattr(point, "payload", {}) or {}
+            text = payload.get("text") or payload.get("content")
+            if isinstance(text, list):
+                text = " ".join([t for t in text if isinstance(t, str)])
+            if not isinstance(text, str):
+                continue
+            excerpt = text.strip()
+            if not excerpt:
+                continue
+
+            page = payload.get("page") or payload.get("page_number") or payload.get("pageIndex")
+            url = self._resolve_payload_url(payload, collection_name, getattr(point, "id", ""))
+            title = self._resolve_payload_title(payload, url, page)
+
+            results.append(
+                {
+                    "url": url,
+                    "title": title,
+                    "excerpts": [excerpt],
+                    "source_id": payload.get("source_id")
+                    or payload.get("sourceId")
+                    or payload.get("chunk_id")
+                    or payload.get("chunkId")
+                    or getattr(point, "id", None),
+                }
+            )
+
+        return SearchResult({"results": results})
+
    async def search(
        self,
        collection_name: str,
@@ -923,7 +1136,7 @@ class QdrantIndexEngine(Engine):
        if tmp_path.exists():
            tmp_path.unlink()

-    async def chunk_and_upsert(
+    async def chunk_and_upsert(
        self,
        collection_name: str,
        text: str | Symbol | None = None,
@@ -1001,8 +1214,7 @@ class QdrantIndexEngine(Engine):
        # Add source to metadata if not already present
        if metadata is None:
            metadata = {}
-
-        metadata["source"] = doc_path.name
+        metadata["source"] = str(doc_path.resolve())

        # Handle document_url: download and read file using FileReader
        elif document_url is not None:
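The new _format_search_results helper turns raw Qdrant points into the same citation-bearing SearchResult shape the web search engines emit. A minimal sketch of the low-level flow, assuming a running Qdrant instance, an existing collection (the name "my_docs" is illustrative), and an embedding helper of your own (embed below is hypothetical):

from symai.backend.engines.index.engine_qdrant import QdrantIndexEngine

engine = QdrantIndexEngine(index_name="my_docs")   # placeholder collection name
query_vector = embed("how are sources cited?")     # hypothetical embedding function

# _search_sync already handles retries and vector normalization.
points = engine._search_sync(
    collection_name="my_docs",
    query_vector=query_vector,
    limit=engine.index_top_k,
)

# Wrap the points into <source> blocks with [n] markers and Citation objects.
result = engine._format_search_results(points, "my_docs")
print(result)
for citation in result.get_citations():
    print(citation.id, citation.title, citation.url)

In the engine's own forward() path the same formatting is triggered by passing treat_as_search_engine=True together with a search operation.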
symai/backend/engines/scrape/engine_requests.py
CHANGED

@@ -9,6 +9,7 @@ service disruption.

 import io
 import logging
+import random
 import re
 from typing import Any, ClassVar
 from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse
@@ -17,7 +18,9 @@ import requests
 import trafilatura
 from bs4 import BeautifulSoup
 from pdfminer.high_level import extract_text
+from requests.adapters import HTTPAdapter
 from requests.structures import CaseInsensitiveDict
+from urllib3.util.retry import Retry

 from ....symbol import Result
 from ....utils import UserMessage
@@ -80,24 +83,49 @@ class RequestsEngine(Engine):
        "none": "None",
    }

-
+    USER_AGENT_POOL: ClassVar[list[str]] = [
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
+    ]
+
+    def __init__(self, timeout=15, verify_ssl=True, user_agent=None, retries=3, backoff_factor=0.5, retry_status_codes=(500, 502, 503, 504)):
        """
        Args:
            timeout: Seconds to wait for network operations before aborting.
            verify_ssl: Toggle for TLS certificate verification.
-            user_agent: Optional override for
+            user_agent: Optional override for user agent rotation.
+            retries: Number of retries for failed requests (default: 3).
+            backoff_factor: Multiplier for exponential backoff (default: 0.5).
+            retry_status_codes: HTTP status codes to retry on (default: 500, 502, 503, 504).
        """
        super().__init__()
        self.timeout = timeout
        self.verify_ssl = verify_ssl
        self.name = self.__class__.__name__
-
-        headers = dict(self.DEFAULT_HEADERS)
-        if user_agent:
-            headers["User-Agent"] = user_agent
+        self._user_agent_override = user_agent

        self.session = requests.Session()
-        self.session.headers.update(
+        self.session.headers.update({k: v for k, v in self.DEFAULT_HEADERS.items() if k != "User-Agent"})
+
+        retry_strategy = Retry(
+            total=retries,
+            backoff_factor=backoff_factor,
+            status_forcelist=retry_status_codes,
+            allowed_methods=["GET", "HEAD"],
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        self.session.mount("http://", adapter)
+        self.session.mount("https://", adapter)
+
+    def _get_user_agent(self) -> str:
+        """Return user agent: override if set, otherwise random from pool."""
+        return self._user_agent_override or random.choice(self.USER_AGENT_POOL)

    def _maybe_set_bypass_cookies(self, url: str):
        netloc = urlparse(url).hostname
@@ -232,7 +260,7 @@ class RequestsEngine(Engine):
            # Avoid loops
            if target == resp.url:
                return resp
-            return self.session.get(target, timeout=timeout, allow_redirects=True)
+            return self.session.get(target, timeout=timeout, allow_redirects=True, headers={"User-Agent": self._get_user_agent()})

    def _fetch_with_playwright(
        self,
@@ -259,7 +287,7 @@ class RequestsEngine(Engine):

        timeout_seconds = timeout if timeout is not None else self.timeout
        timeout_ms = max(int(timeout_seconds * 1000), 0)
-        user_agent = self.
+        user_agent = self._get_user_agent()

        parsed = urlparse(url)
        hostname = parsed.hostname or ""
@@ -348,7 +376,8 @@ class RequestsEngine(Engine):
            )
        else:
            resp = self.session.get(
-                clean_url, timeout=self.timeout, allow_redirects=True, verify=self.verify_ssl
+                clean_url, timeout=self.timeout, allow_redirects=True, verify=self.verify_ssl,
+                headers={"User-Agent": self._get_user_agent()}
            )
        resp.raise_for_status()

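The retry behaviour now lives on the shared requests.Session rather than in ad-hoc loops. For reference, a standalone sketch of the same pattern with the new defaults (the target URL is illustrative):

import random
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry_strategy = Retry(
    total=3,                                # engine default `retries`
    backoff_factor=0.5,                     # exponential backoff between attempts
    status_forcelist=(500, 502, 503, 504),  # retry only transient server errors
    allowed_methods=["GET", "HEAD"],        # never retry non-idempotent verbs
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
session.mount("https://", adapter)

# Per-request user agent, mirroring _get_user_agent(): a fixed override or a random pick.
user_agent_pool = ["Mozilla/5.0 (X11; Linux x86_64) ..."]  # trimmed; see USER_AGENT_POOL above
resp = session.get(
    "https://example.com",
    timeout=15,
    headers={"User-Agent": random.choice(user_agent_pool)},
)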
symai/backend/engines/search/__init__.py
ADDED

@@ -0,0 +1,13 @@
+from .engine_firecrawl import FirecrawlEngine
+from .engine_parallel import ParallelEngine
+
+SEARCH_ENGINE_MAPPING = {
+    "firecrawl": FirecrawlEngine,
+    "parallel": ParallelEngine,
+}
+
+__all__ = [
+    "SEARCH_ENGINE_MAPPING",
+    "FirecrawlEngine",
+    "ParallelEngine",
+]
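The mapping gives callers one lookup from a provider name to an engine class. A small sketch of how a resolver can use it (the wrapper function is hypothetical; DynamicEngine in components.py performs the equivalent lookup):

from symai.backend.engines.search import SEARCH_ENGINE_MAPPING

def resolve_search_engine(model: str, api_key: str):
    # Hypothetical helper for illustration only.
    engine_class = SEARCH_ENGINE_MAPPING.get(model)
    if engine_class is None:
        raise ValueError(f"Unsupported search model '{model}'")
    return engine_class(api_key=api_key)

engine = resolve_search_engine("firecrawl", api_key="fc-...")  # placeholder key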
symai/backend/engines/search/engine_firecrawl.py
ADDED

@@ -0,0 +1,333 @@
+import json
+import logging
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Any
+from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
+
+from firecrawl import Firecrawl
+from firecrawl.v2.types import ScrapeOptions
+
+from ....symbol import Result
+from ....utils import UserMessage
+from ...base import Engine
+from ...settings import SYMAI_CONFIG
+
+logging.getLogger("requests").setLevel(logging.ERROR)
+logging.getLogger("urllib3").setLevel(logging.ERROR)
+logging.getLogger("httpx").setLevel(logging.ERROR)
+
+TRACKING_KEYS = {
+    "utm_source",
+    "utm_medium",
+    "utm_campaign",
+    "utm_term",
+    "utm_content",
+}
+
+
+@dataclass
+class Citation:
+    id: int
+    title: str
+    url: str
+    start: int
+    end: int
+
+    def __hash__(self):
+        return hash((self.url,))
+
+
+class FirecrawlSearchResult(Result):
+    def __init__(
+        self, value: dict[str, Any] | Any, max_chars_per_result: int | None = None, **kwargs
+    ) -> None:
+        raw_dict = value.model_dump() if hasattr(value, "model_dump") else value
+        super().__init__(raw_dict, **kwargs)
+        self._citations: list[Citation] = []
+        self._max_chars_per_result = max_chars_per_result
+        try:
+            text, citations = self._build_text_and_citations(raw_dict)
+            self._value = text
+            self._citations = citations
+        except Exception as e:
+            self._value = None
+            UserMessage(f"Failed to parse Firecrawl search response: {e}", raise_with=ValueError)
+
+    def _build_text_and_citations(self, data: dict[str, Any]) -> tuple[str, list[Citation]]:
+        results = []
+        for source in ["web", "news", "images"]:
+            source_data = data.get(source) or []
+            results.extend(source_data)
+
+        if not results:
+            return "", []
+
+        parts = []
+        citations = []
+        cursor = 0
+
+        for idx, item in enumerate(results, 1):
+            # Handle both SearchResultWeb (url/title at top level) and Document (url/title in metadata)
+            metadata = item.get("metadata") or {}
+            url = item.get("url") or metadata.get("url") or metadata.get("source_url") or ""
+            title = item.get("title") or metadata.get("title") or ""
+
+            if not url:
+                continue
+
+            # Check if this is a scraped result (has markdown content)
+            markdown = item.get("markdown", "")
+            if markdown:
+                content = markdown
+                if self._max_chars_per_result and len(content) > self._max_chars_per_result:
+                    content = content[: self._max_chars_per_result] + "..."
+                result_text = f"{title}\n{url}\n{content}"
+            else:
+                description = (
+                    item.get("description")
+                    or item.get("snippet")
+                    or metadata.get("description")
+                    or ""
+                )
+                result_text = f"{title}\n{url}"
+                if description:
+                    if self._max_chars_per_result and len(description) > self._max_chars_per_result:
+                        description = description[: self._max_chars_per_result] + "..."
+                    result_text += f"\n{description}"
+
+            if parts:
+                parts.append("\n\n")
+                cursor += 2
+
+            parts.append(result_text)
+            cursor += len(result_text)
+
+            marker = f"[{idx}]"
+            start = cursor
+            parts.append(marker)
+            cursor += len(marker)
+
+            citations.append(Citation(id=idx, title=title, url=url, start=start, end=cursor))
+
+        text = "".join(parts)
+        return text, citations
+
+    def __str__(self) -> str:
+        if isinstance(self._value, str) and self._value:
+            return self._value
+        try:
+            return json.dumps(self.raw, indent=2)
+        except TypeError:
+            return str(self.raw)
+
+    def _repr_html_(self) -> str:
+        if isinstance(self._value, str) and self._value:
+            return f"<pre>{self._value}</pre>"
+        try:
+            return f"<pre>{json.dumps(self.raw, indent=2)}</pre>"
+        except Exception:
+            return f"<pre>{self.raw!s}</pre>"
+
+    def get_citations(self) -> list[Citation]:
+        return self._citations
+
+
+class FirecrawlExtractResult(Result):
+    """Result wrapper for Firecrawl scrape API responses."""
+
+    def __init__(self, value: Any, **kwargs) -> None:
+        raw_dict = value.model_dump() if hasattr(value, "model_dump") else value
+        super().__init__(raw_dict, **kwargs)
+        try:
+            self._value = self._extract_content(raw_dict)
+        except Exception as e:
+            self._value = None
+            UserMessage(f"Failed to parse Firecrawl scrape response: {e}", raise_with=ValueError)
+
+    def _extract_content(self, data: dict[str, Any]) -> str:
+        content = data.get("markdown") or data.get("html") or data.get("raw_html")
+        if content:
+            return str(content)
+        json_data = data.get("json")
+        if json_data:
+            return json.dumps(json_data, indent=2)
+        return ""
+
+    def __str__(self) -> str:
+        try:
+            return str(self._value or "")
+        except Exception:
+            return ""
+
+    def _repr_html_(self) -> str:
+        try:
+            return f"<pre>{self._value or ''}</pre>"
+        except Exception:
+            return "<pre></pre>"
+
+
+class FirecrawlEngine(Engine):
+    def __init__(self, api_key: str | None = None):
+        super().__init__()
+        self.config = deepcopy(SYMAI_CONFIG)
+        self.api_key = api_key or self.config.get("SEARCH_ENGINE_API_KEY")
+        self.model = self.config.get("SEARCH_ENGINE_MODEL")
+        self.name = self.__class__.__name__
+
+        if not self.api_key:
+            UserMessage(
+                "Firecrawl API key not found. Set SEARCH_ENGINE_API_KEY in config or environment.",
+                raise_with=ValueError,
+            )
+
+        try:
+            self.client = Firecrawl(api_key=self.api_key)
+        except Exception as e:
+            UserMessage(f"Failed to initialize Firecrawl client: {e}", raise_with=ValueError)
+
+    def id(self) -> str:
+        if (
+            self.config.get("SEARCH_ENGINE_API_KEY")
+            and str(self.config.get("SEARCH_ENGINE_MODEL", "")).lower() == "firecrawl"
+        ):
+            return "search"
+        return super().id()
+
+    def command(self, *args, **kwargs):
+        super().command(*args, **kwargs)
+        if "SEARCH_ENGINE_API_KEY" in kwargs:
+            self.api_key = kwargs["SEARCH_ENGINE_API_KEY"]
+        if "SEARCH_ENGINE_MODEL" in kwargs:
+            self.model = kwargs["SEARCH_ENGINE_MODEL"]
+
+    def _normalize_url(self, url: str) -> str:
+        parts = urlsplit(url)
+        filtered_query = [
+            (k, v)
+            for k, v in parse_qsl(parts.query, keep_blank_values=True)
+            if k not in TRACKING_KEYS and not k.lower().startswith("utm_")
+        ]
+        query = urlencode(filtered_query, doseq=True)
+        return urlunsplit((parts.scheme, parts.netloc, parts.path, query, parts.fragment))
+
+    def _search(self, query: str, kwargs: dict[str, Any]):
+        if not query:
+            UserMessage(
+                "FirecrawlEngine._search requires a non-empty query.", raise_with=ValueError
+            )
+
+        max_chars_per_result = kwargs.get("max_chars_per_result")
+
+        # Build search kwargs
+        search_kwargs = {}
+        if "limit" in kwargs:
+            search_kwargs["limit"] = kwargs["limit"]
+        if "location" in kwargs:
+            search_kwargs["location"] = kwargs["location"]
+        if "tbs" in kwargs:
+            search_kwargs["tbs"] = kwargs["tbs"]
+        if "sources" in kwargs:
+            search_kwargs["sources"] = kwargs["sources"]
+        if "categories" in kwargs:
+            search_kwargs["categories"] = kwargs["categories"]
+        if "timeout" in kwargs:
+            search_kwargs["timeout"] = kwargs["timeout"]
+
+        # Build scrape options for search results content
+        scrape_opts = {}
+        if "formats" in kwargs:
+            scrape_opts["formats"] = kwargs["formats"]
+        if "proxy" in kwargs:
+            scrape_opts["proxy"] = kwargs["proxy"]
+        if "only_main_content" in kwargs:
+            scrape_opts["only_main_content"] = kwargs["only_main_content"]
+        if "scrape_location" in kwargs:
+            scrape_opts["location"] = kwargs["scrape_location"]
+        if "include_tags" in kwargs:
+            scrape_opts["include_tags"] = kwargs["include_tags"]
+        if "exclude_tags" in kwargs:
+            scrape_opts["exclude_tags"] = kwargs["exclude_tags"]
+
+        if scrape_opts:
+            search_kwargs["scrape_options"] = ScrapeOptions(**scrape_opts)
+
+        try:
+            result = self.client.search(query, **search_kwargs)
+        except Exception as e:
+            UserMessage(f"Failed to call Firecrawl Search API: {e}", raise_with=ValueError)
+
+        raw = result.model_dump() if hasattr(result, "model_dump") else result
+        return [FirecrawlSearchResult(result, max_chars_per_result=max_chars_per_result)], {
+            "raw_output": raw
+        }
+
+    def _extract(self, url: str, kwargs: dict[str, Any]):
+        normalized_url = self._normalize_url(url)
+
+        # Build scrape kwargs
+        scrape_kwargs = {"formats": kwargs.get("formats", ["markdown"])}
+        if "only_main_content" in kwargs:
+            scrape_kwargs["only_main_content"] = kwargs["only_main_content"]
+        if "timeout" in kwargs:
+            scrape_kwargs["timeout"] = kwargs["timeout"]
+        if "proxy" in kwargs:
+            scrape_kwargs["proxy"] = kwargs["proxy"]
+        if "location" in kwargs:
+            scrape_kwargs["location"] = kwargs["location"]
+        if "max_age" in kwargs:
+            scrape_kwargs["max_age"] = kwargs["max_age"]
+        if "store_in_cache" in kwargs:
+            scrape_kwargs["store_in_cache"] = kwargs["store_in_cache"]
+        if "actions" in kwargs:
+            scrape_kwargs["actions"] = kwargs["actions"]
+        if "headers" in kwargs:
+            scrape_kwargs["headers"] = kwargs["headers"]
+        if "include_tags" in kwargs:
+            scrape_kwargs["include_tags"] = kwargs["include_tags"]
+        if "exclude_tags" in kwargs:
+            scrape_kwargs["exclude_tags"] = kwargs["exclude_tags"]
+        if "wait_for" in kwargs:
+            scrape_kwargs["wait_for"] = kwargs["wait_for"]
+        if "mobile" in kwargs:
+            scrape_kwargs["mobile"] = kwargs["mobile"]
+
+        try:
+            result = self.client.scrape(normalized_url, **scrape_kwargs)
+        except Exception as e:
+            UserMessage(f"Failed to call Firecrawl Scrape API: {e}", raise_with=ValueError)
+
+        raw = result.model_dump() if hasattr(result, "model_dump") else result
+        return [FirecrawlExtractResult(result)], {"raw_output": raw, "final_url": normalized_url}
+
+    def forward(self, argument):
+        kwargs = argument.kwargs
+        url = argument.prop.url or kwargs.get("url")
+        if url:
+            return self._extract(str(url), kwargs)
+
+        raw_query = argument.prop.prepared_input
+        if raw_query is None:
+            raw_query = argument.prop.query
+
+        query = str(raw_query or "").strip() if raw_query else ""
+        if not query:
+            UserMessage(
+                "FirecrawlEngine.forward requires at least one non-empty query or url.",
+                raise_with=ValueError,
+            )
+
+        return self._search(query, kwargs)
+
+    def prepare(self, argument):
+        url = argument.kwargs.get("url") or argument.prop.url
+        if url:
+            argument.prop.prepared_input = str(url)
+            return
+
+        query = argument.prop.query
+        if isinstance(query, list):
+            argument.prop.prepared_input = " ".join(str(q) for q in query if q)
+            return
+
+        argument.prop.prepared_input = str(query or "").strip()
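Seen end to end, the engine wraps the Firecrawl SDK's search and scrape calls and normalizes scrape URLs before fetching. A hedged usage sketch, assuming a valid Firecrawl API key and the optional search extra installed (query, URL, and key are placeholders):

from symai.backend.engines.search.engine_firecrawl import FirecrawlEngine

engine = FirecrawlEngine(api_key="fc-...")  # or set SEARCH_ENGINE_API_KEY in the config
results, metadata = engine._search(
    "neuro-symbolic programming",
    {"limit": 3, "formats": ["markdown"], "max_chars_per_result": 2000},
)
search_result = results[0]                  # FirecrawlSearchResult
print(search_result)                        # title/url/content blocks with [n] markers
print(search_result.get_citations())        # Citation(id, title, url, start, end)

page_results, page_meta = engine._extract("https://example.com/post?utm_source=x", {})
print(page_meta["final_url"])               # tracking parameters stripped by _normalize_url
print(page_results[0])                      # markdown (or JSON) content of the page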
symai/backend/engines/search/engine_parallel.py
CHANGED

@@ -66,7 +66,7 @@ class Citation:
        return hash((self.url,))


-class
+class ParallelSearchResult(Result):
    def __init__(self, value: dict[str, Any] | Any, **kwargs) -> None:
        super().__init__(value, **kwargs)
        if isinstance(value, dict) and value.get("error"):
@@ -286,7 +286,7 @@ class SearchResult(Result):
        return self._citations


-class
+class ParallelExtractResult(Result):
    """Result wrapper for Parallel Extract API responses."""

    def __init__(self, value: dict[str, Any] | Any, **kwargs) -> None:
@@ -485,7 +485,7 @@ class ParallelEngine(Engine):
            )
        except Exception as e:
            UserMessage(f"Failed to call Parallel Search API: {e}", raise_with=ValueError)
-        return [
+        return [ParallelSearchResult(result)], {"raw_output": result}

    def _task(self, queries: list[str], kwargs: dict[str, Any]):
        processor_name = self._coerce_processor(kwargs.get("processor"))
@@ -521,7 +521,7 @@ class ParallelEngine(Engine):
        result = self._fetch_task_result(run.run_id, timeout=timeout, api_timeout=api_timeout)

        payload = self._task_result_to_search_payload(result)
-        return [
+        return [ParallelSearchResult(payload)], {
            "raw_output": result,
            "task_output": payload.get("task_output"),
            "task_output_type": payload.get("task_output_type"),
@@ -699,7 +699,7 @@ class ParallelEngine(Engine):
            )
        except Exception as e:
            UserMessage(f"Failed to call Parallel Extract API: {e}", raise_with=ValueError)
-        return [
+        return [ParallelExtractResult(result)], {"raw_output": result, "final_url": url}

    def forward(self, argument):
        kwargs = argument.kwargs
symai/components.py
CHANGED
@@ -1508,12 +1508,18 @@ class DynamicEngine(Expression):
        """Create an engine instance based on the model name."""
        # Deferred to avoid components <-> neurosymbolic engine circular imports.
        from .backend.engines.neurosymbolic import ENGINE_MAPPING  # noqa
-        from .backend.engines.
+        from .backend.engines.search import SEARCH_ENGINE_MAPPING  # noqa

        try:
+            # Check neurosymbolic engines first
            engine_class = ENGINE_MAPPING.get(self.model)
-
-
+
+            # Check search engines
+            if engine_class is None:
+                engine_class = SEARCH_ENGINE_MAPPING.get(self.model)
+                if engine_class is not None:
+                    return engine_class(api_key=self.api_key)
+
            if engine_class is None:
                UserMessage(f"Unsupported model '{self.model}'", raise_with=ValueError)
            return engine_class(api_key=self.api_key, model=self.model)
symai/extended/interfaces/__init__.py
CHANGED

@@ -0,0 +1 @@
+
symai/extended/interfaces/firecrawl.py
ADDED

@@ -0,0 +1,30 @@
+from ... import core
+from ...backend.engines.search.engine_firecrawl import FirecrawlExtractResult, FirecrawlSearchResult
+from ...symbol import Expression, Symbol
+
+
+class firecrawl(Expression):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.name = self.__class__.__name__
+
+    def search(self, query: Symbol, **kwargs) -> FirecrawlSearchResult:
+        query = self._to_symbol(query)
+
+        @core.search(query=query.value, **kwargs)
+        def _func(_) -> FirecrawlSearchResult:
+            pass
+
+        return _func(self)
+
+    def scrape(self, url: str, **kwargs) -> FirecrawlExtractResult:
+        symbol = self._to_symbol(url)
+        options = dict(kwargs)
+        options.pop("query", None)
+        options["url"] = symbol.value
+
+        @core.search(query="", **options)
+        def _func(_, *_args, **_inner_kwargs) -> FirecrawlExtractResult:
+            return None
+
+        return _func(self)
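The interface mirrors the existing parallel wrapper: search routes a query through the search engine decorator, while scrape passes a url kwarg so the engine takes the extract path. A short usage sketch, assuming the search engine is configured for Firecrawl (query and URL are illustrative):

from symai.extended.interfaces.firecrawl import firecrawl

fc = firecrawl()
hits = fc.search("neuro-symbolic programming", limit=3)   # FirecrawlSearchResult
print(hits)
print(hits.get_citations())

page = fc.scrape("https://example.com/article?utm_source=newsletter")  # FirecrawlExtractResult
print(page)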
symai/extended/interfaces/local_search.py
ADDED

@@ -0,0 +1,57 @@
+from typing import TYPE_CHECKING
+
+from ...backend.engines.index.engine_qdrant import QdrantIndexEngine
+from ...symbol import Expression, Symbol
+
+if TYPE_CHECKING:
+    from ...backend.engines.index.engine_qdrant import SearchResult
+
+
+class local_search(Expression):
+    def __init__(self, index_name: str = QdrantIndexEngine._default_index_name, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.index_name = index_name
+        self.name = self.__class__.__name__
+
+    def search(self, query: Symbol, **kwargs) -> "SearchResult":
+        symbol = self._to_symbol(query)
+        options = dict(kwargs)
+
+        index_name = options.pop("collection_name", options.pop("index_name", self.index_name))
+
+        # Normalize limit/top_k/index_top_k
+        index_top_k = options.pop("index_top_k", None)
+        if index_top_k is None:
+            top_k = options.pop("top_k", None)
+            limit = options.pop("limit", None)
+            index_top_k = top_k if top_k is not None else limit
+        if index_top_k is not None:
+            options["index_top_k"] = index_top_k
+
+        # Bypass decorator/EngineRepository pipeline entirely (and thus `forward()`).
+        # We query Qdrant directly and then format results into the same SearchResult
+        # structure used by `parallel.search` (citations, inline markers, etc.).
+        engine = QdrantIndexEngine(index_name=index_name)
+        try:
+            score_threshold = options.pop("score_threshold", None)
+            raw_filter = options.pop("query_filter", options.pop("filter", None))
+            query_filter = engine._build_query_filter(raw_filter)
+
+            # Keep `with_payload` default aligned with engine behavior; let caller override.
+            with_payload = options.pop("with_payload", True)
+            with_vectors = options.pop("with_vectors", False)
+
+            points = engine._search_sync(
+                collection_name=index_name,
+                query_vector=symbol.embedding,
+                limit=options.pop("index_top_k", engine.index_top_k),
+                score_threshold=score_threshold,
+                query_filter=query_filter,
+                with_payload=with_payload,
+                with_vectors=with_vectors,
+                **options,
+            )
+            result = engine._format_search_results(points, index_name)
+        finally:
+            del engine
+        return result
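Because local_search bypasses the engine repository and calls the Qdrant engine directly, it only needs a reachable Qdrant instance, an indexed collection, and a configured embedding backend (the Symbol.embedding property is computed through it). A sketch with illustrative names:

from symai.extended.interfaces.local_search import local_search

idx = local_search(index_name="my_docs")   # placeholder collection name
res = idx.search("What changed in the retry logic?", top_k=5)
print(res)                                 # <source> blocks with [n] citation markers
for citation in res.get_citations():
    print(citation.id, citation.title, citation.url)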
symai/extended/interfaces/parallel.py
CHANGED

@@ -1,5 +1,5 @@
 from ... import core
-from ...backend.engines.search.engine_parallel import
+from ...backend.engines.search.engine_parallel import ParallelExtractResult, ParallelSearchResult
 from ...symbol import Expression, Symbol


@@ -8,23 +8,23 @@ class parallel(Expression):
        super().__init__(*args, **kwargs)
        self.name = self.__class__.__name__

-    def search(self, query: Symbol, **kwargs) ->
+    def search(self, query: Symbol, **kwargs) -> ParallelSearchResult:
        query = self._to_symbol(query)

        @core.search(query=query.value, **kwargs)
-        def _func(_) ->
+        def _func(_) -> ParallelSearchResult:
            pass

        return _func(self)

-    def scrape(self, url: str, **kwargs) ->
+    def scrape(self, url: str, **kwargs) -> ParallelExtractResult:
        symbol = self._to_symbol(url)
        options = dict(kwargs)
        options.pop("query", None)
        options["url"] = symbol.value

        @core.search(query="", **options)
-        def _func(_, *_args, **_inner_kwargs) ->
+        def _func(_, *_args, **_inner_kwargs) -> ParallelExtractResult:
            return None

        return _func(self)
symai/functional.py
CHANGED
@@ -498,10 +498,9 @@ class EngineRepository:
    def get(engine_name: str, *_args, **_kwargs):
        self = EngineRepository()
        # First check if we're in the context manager that dynamically changes models
-
-
-
-            return engine
+        dynamic_engine = self.get_dynamic_engine_instance()
+        if dynamic_engine is not None and engine_name in ("neurosymbolic", "search"):
+            return dynamic_engine

        # Otherwise, fallback to normal lookup:
        if engine_name not in self._engines:
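With this change, an active dynamic-engine context is returned for both the "neurosymbolic" and the "search" engine names, so a dynamically selected search engine (for example Firecrawl) is picked up by the same lookup. A hedged sketch, assuming the repository's get is called as below (outside any dynamic context it simply falls back to the registered engine):

from symai.functional import EngineRepository

engine = EngineRepository.get("search")
# Inside the dynamic-engine context (see DynamicEngine in components.py),
# the same call returns the dynamically selected engine instead.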
{symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: symbolicai
-Version: 1.3.0
+Version: 1.5.0
 Summary: A Neurosymbolic Perspective on Large Language Models
 Author-email: Marius-Constantin Dinu <marius@extensity.ai>, Leoveanu-Condrei Claudiu <leo@extensity.ai>
 License: BSD 3-Clause License
@@ -113,6 +113,7 @@ Requires-Dist: openai-whisper>=20240930; extra == "whisper"
 Requires-Dist: numba>=0.62.1; extra == "whisper"
 Requires-Dist: llvmlite>=0.45.1; extra == "whisper"
 Provides-Extra: search
+Requires-Dist: firecrawl-py>=4.12.0; extra == "search"
 Requires-Dist: parallel-web>=0.3.3; extra == "search"
 Provides-Extra: serpapi
 Requires-Dist: google_search_results>=2.4.2; extra == "serpapi"
@@ -136,6 +137,8 @@ Requires-Dist: symbolicai[serpapi]; extra == "all"
 Requires-Dist: symbolicai[services]; extra == "all"
 Requires-Dist: symbolicai[solver]; extra == "all"
 Requires-Dist: symbolicai[qdrant]; extra == "all"
+Provides-Extra: dev
+Requires-Dist: pytest-asyncio>=1.3.0; extra == "dev"
 Dynamic: license-file

 # **SymbolicAI: A neuro-symbolic perspective on LLMs**
{symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/RECORD
CHANGED

@@ -1,13 +1,13 @@
 symai/TERMS_OF_SERVICE.md,sha256=HN42UXVI_wAVDHjMShzy_k7xAsbjXaATNeMKcIte_eg,91409
-symai/__init__.py,sha256=
+symai/__init__.py,sha256=qlqkm2OjRqXtKhIBltfB9zx0kBf4V4ygckH1RHVPAVE,18530
 symai/chat.py,sha256=DCEbmZ96wv-eitAVt6-oF6PT3JM3cT59Iy3r2Hucd_M,14100
-symai/components.py,sha256=
+symai/components.py,sha256=XL1whwdZd6HCl0viUuXca_7d8no_xxfTGZsqE1hhwqI,64845
 symai/constraints.py,sha256=ljjB9p0qK4DrDl_u5G_Y-Y6WAH5ZHANIqLLxRtwcORs,1980
 symai/context.py,sha256=4M69MJOeWSdPTr2Y9teoNTs-nEvpzcAcr7900UgORXA,189
 symai/core.py,sha256=gI9qvTT0Skq2D0izdhAoN3RdwBtWei59KO52mKN1Sos,70420
 symai/core_ext.py,sha256=lS_BZNeUGmNhhXR-F3dFLF26_nZHq3NVaAwa4vAbkTQ,8937
 symai/exceptions.py,sha256=BxpxI8q3-7Uh_Kg9Xi2PhF6RR6CofxV1h8R07j4v47U,165
-symai/functional.py,sha256=
+symai/functional.py,sha256=GqBs5FZPVZ3iVJ-MlO0Zvkf7cNSDgVhkt3tsL82kFrM,21457
 symai/imports.py,sha256=P5WsamkfKxsK3fs8vlrFpC6CIv5WVpMIMNue9DKJGnE,16126
 symai/interfaces.py,sha256=Z8CDdarnOVa67GCLljKjxQojDH9MhhPKBQFb0pi2WfY,3458
 symai/memory.py,sha256=Cd60UyeJk7SHNBWEYOLrmUXQy54GzQsu3Mjh0lfNQOY,3716
@@ -41,7 +41,7 @@ symai/backend/engines/files/engine_io.py,sha256=4eYBz44rQYWD7VO6Pn7hVF_cOnqNuolo
 symai/backend/engines/imagecaptioning/engine_blip2.py,sha256=8lTzc8sQpuNY4AUb_ZweRKr95v-sFtTykT5ennVf6g0,2915
 symai/backend/engines/imagecaptioning/engine_llavacpp_client.py,sha256=jBsLZv0Laa4tuPyX0VQ7uwyldyO3aYIbbj73WjTbceM,6793
 symai/backend/engines/index/engine_pinecone.py,sha256=fxCew1ldUdjd9UtqnMuWFDiVz5X5BUIKZtq1iSDhj28,9132
-symai/backend/engines/index/engine_qdrant.py,sha256=
+symai/backend/engines/index/engine_qdrant.py,sha256=U9p0kzYvHE4DjFgxnvnG_8xfEoP_W4dpaBGY5gTFMF4,50994
 symai/backend/engines/index/engine_vectordb.py,sha256=xXU8QaC2BX9O4dDjDCVYgWO4PxQMpmNlhtal6UVtV0o,8541
 symai/backend/engines/lean/engine_lean4.py,sha256=ln5nbQn5szq8nRulbREPLCPQ5bwjM_A5XAGMkfzPdT8,10102
 symai/backend/engines/neurosymbolic/__init__.py,sha256=o7HUmxcYSrIkutGYB-6_Qur3adHyrkVeWroDtqEK-YE,2279
@@ -59,9 +59,11 @@ symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py,sha256=yWiCT
 symai/backend/engines/neurosymbolic/engine_openai_responses.py,sha256=J3P7WcQhxWSPK99uZuLClpIDlLRqLJFWYwDJHrBKox4,17830
 symai/backend/engines/ocr/engine_apilayer.py,sha256=UpC3oHBdSM6wlPVqxwMkemBd-Y0ReVwc270O_EVbRD0,2267
 symai/backend/engines/output/engine_stdout.py,sha256=BWNXACl5U-WYIJnT1pZNwZsTRMzP1XzA0A7o693mmyQ,899
-symai/backend/engines/scrape/engine_requests.py,sha256=
+symai/backend/engines/scrape/engine_requests.py,sha256=uXQ8PGeRN2OyM0_ioEI61rkv5PqSBE0wayAJNS7s8ZA,15819
+symai/backend/engines/search/__init__.py,sha256=iW6kEBOZ-gUiPYfcIWupNgewiqLrFOBGJ643kqwQFoM,274
+symai/backend/engines/search/engine_firecrawl.py,sha256=M_nxXBtvudNqRR4gTC5dXoJzf_9ofrMScYXzaGVTmaM,11990
 symai/backend/engines/search/engine_openai.py,sha256=hAEu3vPZzLTvgmNc4BSZDTcNb4ek4xYeOf8xgti2zRs,14248
-symai/backend/engines/search/engine_parallel.py,sha256=
+symai/backend/engines/search/engine_parallel.py,sha256=voMmeJZ5bf1x3pt7uxMJu84z6VLLG0-ZfgFUWvhM-vI,27048
 symai/backend/engines/search/engine_perplexity.py,sha256=rXnZjMCSiIRuJcNSchE58-f9zWJmYpkKMHONF_XwGnk,4100
 symai/backend/engines/search/engine_serpapi.py,sha256=ZJJBnEDoLjkpxWt_o4vFZanwqojH8ZFBWmWNnEaIbww,3618
 symai/backend/engines/speech_to_text/engine_local_whisper.py,sha256=EOUh2GCeEhZ2Av72i_AZ4NSj9e46Pl7Ft6sIErFy6FI,8387
@@ -100,21 +102,23 @@ symai/extended/solver.py,sha256=Men8FcGlUdUHJCw0lb1rKAwLOGp5-d5Rnuf2sx5Q6PM,1173
 symai/extended/summarizer.py,sha256=x7yKOU-tXmvHZxmyKrPoy5_Dy9-Zet1oAcDK8uvQSRI,1052
 symai/extended/taypan_interpreter.py,sha256=yPIcI-NcpNpfDb3r3KiclP9XwzvFo_enoZOgK1JM3NI,4832
 symai/extended/vectordb.py,sha256=npCR9WBfV6RN3OQZuJAELpwz1sM6q1btKqrVaW5jPvs,13546
-symai/extended/interfaces/__init__.py,sha256=
+symai/extended/interfaces/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 symai/extended/interfaces/blip_2.py,sha256=wZYVzql6w_OJMUZc1c2BKx3LHrlapRprx-Q6p99_qxE,463
 symai/extended/interfaces/clip.py,sha256=l6vjEq3cF-wDX9cRPulyiKpDFQB8QI2609GcGtvqt7U,514
 symai/extended/interfaces/console.py,sha256=qeAnG80f95ArADjfpk57AaDA1cHUQSkaUrau2zGNSKs,637
 symai/extended/interfaces/dall_e.py,sha256=SSF1K17SzA-lpdHVtsfHbwRCP6XJxWqsNdXoWwcBYjw,551
 symai/extended/interfaces/file.py,sha256=1_BXHKsHm78MmBeRolA_fFWFTLuA6on7Le-ZF4S_1ds,457
+symai/extended/interfaces/firecrawl.py,sha256=hGA5WxiW6EN5LNsfBSlsYzASgvz9e515TWrHGHcE21s,955
 symai/extended/interfaces/flux.py,sha256=LTY_I9UtIxnh3Nc4cBPQhQ6upB6CVZIhc1uOnFpxEIo,532
 symai/extended/interfaces/gpt_image.py,sha256=Jk5-9og440eZeRAhKmjdyhwP22wX58q0NcFuVhIFWZQ,718
 symai/extended/interfaces/input.py,sha256=CFMLf2j_a-rZ1ApaEwfgqZmWVS7_1yj_u6iiqtiOGPs,456
 symai/extended/interfaces/llava.py,sha256=yCItfGYSk35RazhEfHR4R324h-R6W5DjZYeJBonDkRU,433
+symai/extended/interfaces/local_search.py,sha256=AHHRsYCUm4VttGSl_HAk5kpH34e0x_uzvgy1OXSubSs,2408
 symai/extended/interfaces/naive_scrape.py,sha256=KPjTSBXSCr5zwHwIPgF-VwLSTD2OjVcL4xALNX4l9-4,682
 symai/extended/interfaces/naive_vectordb.py,sha256=fm7DBMYYnSx7Ma7eNnCmuOVyQwNGnkiDR31oV-qNrJA,1348
 symai/extended/interfaces/ocr.py,sha256=MMxgp8ZKoM44doJPZzzrBVh2VxChs6faFu2uFYnbzfU,563
 symai/extended/interfaces/openai_search.py,sha256=UvnSihdfIwybrLDz2A-yt92aklHEHIvh0pt0hp1Dpis,528
-symai/extended/interfaces/parallel.py,sha256=
+symai/extended/interfaces/parallel.py,sha256=kWRcrs_vTPvZDDhKjl1Hp94ltZeiYH7K8l9zOy5jd-I,947
 symai/extended/interfaces/perplexity.py,sha256=vSUl8CfBsFhFrzxws9Lf8WgfhsoPatJf7eYRfihKRG4,529
 symai/extended/interfaces/pinecone.py,sha256=NA2t1pNQf-G-HSeewEO8jqGnitD3huBV5bucIM9vgi4,1075
 symai/extended/interfaces/python.py,sha256=EcxXQwrlhjGOS5SkRoa_cVt069vu_INDD9DIfbnUses,418
@@ -162,9 +166,9 @@ symai/server/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 symai/server/huggingface_server.py,sha256=wSAVqFiKQsCu5UB2YYVpxJBhJ7GgQBBfePxNi265yP8,9039
 symai/server/llama_cpp_server.py,sha256=-WPTNB2cbnwtnpES4AtPM__MCasDKl83jr94JGS9tmI,2144
 symai/server/qdrant_server.py,sha256=l4r4rz29c7cO1dapXO0LQ4sHW4WF44keuz7j8v5azMc,9854
-symbolicai-1.
-symbolicai-1.
-symbolicai-1.
-symbolicai-1.
-symbolicai-1.
-symbolicai-1.
+symbolicai-1.5.0.dist-info/licenses/LICENSE,sha256=9vRFudlJ1ghVfra5lcCUIYQCqnZSYcBLjLHbGRsrQCs,1505
+symbolicai-1.5.0.dist-info/METADATA,sha256=gQLPEUb1pW2VPNqCtgN-WcXeSQnfUJAWx0KTAN3vnJw,23731
+symbolicai-1.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+symbolicai-1.5.0.dist-info/entry_points.txt,sha256=JV5sdydIfUZdDF6QBEQHiZHod6XNPjCjpWQrXh7gTAw,261
+symbolicai-1.5.0.dist-info/top_level.txt,sha256=bOoIDfpDIvCQtQgXcwVKJvxAKwsxpxo2IL4z92rNJjw,6
+symbolicai-1.5.0.dist-info/RECORD,,
{symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/WHEEL
File without changes
{symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/entry_points.txt
File without changes
{symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/licenses/LICENSE
File without changes
{symbolicai-1.3.0.dist-info → symbolicai-1.5.0.dist-info}/top_level.txt
File without changes