symbolicai 0.21.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- symai/__init__.py +269 -173
- symai/backend/base.py +123 -110
- symai/backend/engines/drawing/engine_bfl.py +45 -44
- symai/backend/engines/drawing/engine_gpt_image.py +112 -97
- symai/backend/engines/embedding/engine_llama_cpp.py +63 -52
- symai/backend/engines/embedding/engine_openai.py +25 -21
- symai/backend/engines/execute/engine_python.py +19 -18
- symai/backend/engines/files/engine_io.py +104 -95
- symai/backend/engines/imagecaptioning/engine_blip2.py +28 -24
- symai/backend/engines/imagecaptioning/engine_llavacpp_client.py +102 -79
- symai/backend/engines/index/engine_pinecone.py +124 -97
- symai/backend/engines/index/engine_qdrant.py +1011 -0
- symai/backend/engines/index/engine_vectordb.py +84 -56
- symai/backend/engines/lean/engine_lean4.py +96 -52
- symai/backend/engines/neurosymbolic/__init__.py +41 -13
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_chat.py +330 -248
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_reasoning.py +329 -264
- symai/backend/engines/neurosymbolic/engine_cerebras.py +328 -0
- symai/backend/engines/neurosymbolic/engine_deepseekX_reasoning.py +118 -88
- symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +344 -299
- symai/backend/engines/neurosymbolic/engine_groq.py +173 -115
- symai/backend/engines/neurosymbolic/engine_huggingface.py +114 -84
- symai/backend/engines/neurosymbolic/engine_llama_cpp.py +144 -118
- symai/backend/engines/neurosymbolic/engine_openai_gptX_chat.py +415 -307
- symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py +394 -231
- symai/backend/engines/ocr/engine_apilayer.py +23 -27
- symai/backend/engines/output/engine_stdout.py +10 -13
- symai/backend/engines/{webscraping → scrape}/engine_requests.py +101 -54
- symai/backend/engines/search/engine_openai.py +100 -88
- symai/backend/engines/search/engine_parallel.py +665 -0
- symai/backend/engines/search/engine_perplexity.py +44 -45
- symai/backend/engines/search/engine_serpapi.py +37 -34
- symai/backend/engines/speech_to_text/engine_local_whisper.py +54 -51
- symai/backend/engines/symbolic/engine_wolframalpha.py +15 -9
- symai/backend/engines/text_to_speech/engine_openai.py +20 -26
- symai/backend/engines/text_vision/engine_clip.py +39 -37
- symai/backend/engines/userinput/engine_console.py +5 -6
- symai/backend/mixin/__init__.py +13 -0
- symai/backend/mixin/anthropic.py +48 -38
- symai/backend/mixin/deepseek.py +6 -5
- symai/backend/mixin/google.py +7 -4
- symai/backend/mixin/groq.py +2 -4
- symai/backend/mixin/openai.py +140 -110
- symai/backend/settings.py +87 -20
- symai/chat.py +216 -123
- symai/collect/__init__.py +7 -1
- symai/collect/dynamic.py +80 -70
- symai/collect/pipeline.py +67 -51
- symai/collect/stats.py +161 -109
- symai/components.py +707 -360
- symai/constraints.py +24 -12
- symai/core.py +1857 -1233
- symai/core_ext.py +83 -80
- symai/endpoints/api.py +166 -104
- symai/extended/.DS_Store +0 -0
- symai/extended/__init__.py +46 -12
- symai/extended/api_builder.py +29 -21
- symai/extended/arxiv_pdf_parser.py +23 -14
- symai/extended/bibtex_parser.py +9 -6
- symai/extended/conversation.py +156 -126
- symai/extended/document.py +50 -30
- symai/extended/file_merger.py +57 -14
- symai/extended/graph.py +51 -32
- symai/extended/html_style_template.py +18 -14
- symai/extended/interfaces/blip_2.py +2 -3
- symai/extended/interfaces/clip.py +4 -3
- symai/extended/interfaces/console.py +9 -1
- symai/extended/interfaces/dall_e.py +4 -2
- symai/extended/interfaces/file.py +2 -0
- symai/extended/interfaces/flux.py +4 -2
- symai/extended/interfaces/gpt_image.py +16 -7
- symai/extended/interfaces/input.py +2 -1
- symai/extended/interfaces/llava.py +1 -2
- symai/extended/interfaces/{naive_webscraping.py → naive_scrape.py} +4 -3
- symai/extended/interfaces/naive_vectordb.py +9 -10
- symai/extended/interfaces/ocr.py +5 -3
- symai/extended/interfaces/openai_search.py +2 -0
- symai/extended/interfaces/parallel.py +30 -0
- symai/extended/interfaces/perplexity.py +2 -0
- symai/extended/interfaces/pinecone.py +12 -9
- symai/extended/interfaces/python.py +2 -0
- symai/extended/interfaces/serpapi.py +3 -1
- symai/extended/interfaces/terminal.py +2 -4
- symai/extended/interfaces/tts.py +3 -2
- symai/extended/interfaces/whisper.py +3 -2
- symai/extended/interfaces/wolframalpha.py +2 -1
- symai/extended/metrics/__init__.py +11 -1
- symai/extended/metrics/similarity.py +14 -13
- symai/extended/os_command.py +39 -29
- symai/extended/packages/__init__.py +29 -3
- symai/extended/packages/symdev.py +51 -43
- symai/extended/packages/sympkg.py +41 -35
- symai/extended/packages/symrun.py +63 -50
- symai/extended/repo_cloner.py +14 -12
- symai/extended/seo_query_optimizer.py +15 -13
- symai/extended/solver.py +116 -91
- symai/extended/summarizer.py +12 -10
- symai/extended/taypan_interpreter.py +17 -18
- symai/extended/vectordb.py +122 -92
- symai/formatter/__init__.py +9 -1
- symai/formatter/formatter.py +51 -47
- symai/formatter/regex.py +70 -69
- symai/functional.py +325 -176
- symai/imports.py +190 -147
- symai/interfaces.py +57 -28
- symai/memory.py +45 -35
- symai/menu/screen.py +28 -19
- symai/misc/console.py +66 -56
- symai/misc/loader.py +8 -5
- symai/models/__init__.py +17 -1
- symai/models/base.py +395 -236
- symai/models/errors.py +1 -2
- symai/ops/__init__.py +32 -22
- symai/ops/measures.py +24 -25
- symai/ops/primitives.py +1149 -731
- symai/post_processors.py +58 -50
- symai/pre_processors.py +86 -82
- symai/processor.py +21 -13
- symai/prompts.py +764 -685
- symai/server/huggingface_server.py +135 -49
- symai/server/llama_cpp_server.py +21 -11
- symai/server/qdrant_server.py +206 -0
- symai/shell.py +100 -42
- symai/shellsv.py +700 -492
- symai/strategy.py +630 -346
- symai/symbol.py +368 -322
- symai/utils.py +100 -78
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/METADATA +22 -10
- symbolicai-1.1.0.dist-info/RECORD +168 -0
- symbolicai-0.21.0.dist-info/RECORD +0 -162
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/WHEEL +0 -0
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/entry_points.txt +0 -0
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/top_level.txt +0 -0
symai/extended/vectordb.py
CHANGED
@@ -1,34 +1,41 @@
 import gzip
 import logging
-import os
 import pickle
+from collections.abc import Mapping
 from copy import deepcopy
 from pathlib import Path
+from typing import Any, ClassVar
 
 import numpy as np
 
 from ..backend.settings import HOME_PATH, SYMAI_CONFIG
 from ..interfaces import Interface
 from ..symbol import Expression, Symbol
-from ..utils import
-from .metrics import (
-
-
+from ..utils import UserMessage
+from .metrics import (
+    adams_similarity,
+    cosine_similarity,
+    derridaean_similarity,
+    dot_product,
+    euclidean_metric,
+    ranking_algorithm_sort,
+)
 
-logging.getLogger(
-logging.getLogger(
+logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
+logging.getLogger("datasets").setLevel(logging.WARNING)
 
 
 class VectorDB(Expression):
-    _default_documents = []
-    _default_vectors = None
-    _default_batch_size = 2048
-    _default_similarity_metric = "cosine"
-    _default_embedding_function = None
-    _default_index_dims = 768
-    _default_top_k = 5
-    _default_storage_path =
-    _default_index_name = "dataindex"
+    _default_documents: ClassVar[list] = []
+    _default_vectors: ClassVar[np.ndarray | None] = None
+    _default_batch_size: ClassVar[int] = 2048
+    _default_similarity_metric: ClassVar[str] = "cosine"
+    _default_embedding_function: ClassVar[object | None] = None
+    _default_index_dims: ClassVar[int] = 768
+    _default_top_k: ClassVar[int] = 5
+    _default_storage_path: ClassVar[Path] = HOME_PATH / "localdb"
+    _default_index_name: ClassVar[str] = "dataindex"
+
     def __init__(
         self,
         documents=_default_documents,
@@ -40,7 +47,7 @@ class VectorDB(Expression):
         index_dims=_default_index_dims,
         top_k=_default_top_k,
         index_name=_default_index_name,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.config = deepcopy(SYMAI_CONFIG)
@@ -71,22 +78,73 @@ class VectorDB(Expression):
         elif "adams" in similarity_metric:
             self.similarity_metric = adams_similarity
         else:
-
+            UserMessage(
+                "Similarity metric not supported. Please use either 'dot', 'cosine', 'euclidean', 'adams', or 'derrida'.",
+                raise_with=ValueError,
+            )
 
         if load_on_init:
-
-
-            path = os.path.join(load_on_init, f"{self.index_name}.pkl")
+            if isinstance(load_on_init, (str, Path)):
+                path = Path(load_on_init) / f"{self.index_name}.pkl"
             self.load(path)
         else:
             self.load()
 
     def _init_embedding_model(self):
-        if
-        self.
+        if (
+            self.config["EMBEDDING_ENGINE_API_KEY"] is None
+            or self.config["EMBEDDING_ENGINE_API_KEY"] == ""
+        ):
+            self.model = Interface("ExtensityAI/embeddings")  # default to local model
         else:
             self.model = lambda x: Symbol(x).embedding
 
+    def _unwrap_documents(self, documents):
+        if isinstance(documents, Symbol):
+            return documents.value
+        return documents
+
+    def _to_texts(self, documents, key):
+        if not isinstance(documents, list):
+            self._raise_texts_unassigned()
+        if len(documents) == 0:
+            return []
+        first_document = documents[0]
+        if isinstance(first_document, dict):
+            return self._texts_from_dicts(documents, key)
+        if isinstance(first_document, str):
+            return documents
+        return self._raise_texts_unassigned()
+
+    def _texts_from_dicts(self, documents, key):
+        if isinstance(key, str):
+            key_chain = key.split(".") if "." in key else [key]
+            return [self._resolve_key_chain(doc, key_chain).replace("\n", " ") for doc in documents]
+        if key is None:
+            return [
+                ", ".join([f"{dict_key}: {value}" for dict_key, value in doc.items()])
+                for doc in documents
+            ]
+        return self._raise_texts_unassigned()
+
+    def _resolve_key_chain(self, document, key_chain):
+        current_document = document
+        for chain_key in key_chain:
+            current_document = current_document[chain_key]
+        return current_document
+
+    def _embed_batch(self, batch):
+        emb = self.model(batch)
+        if len(emb.shape) == 1:
+            return [emb]
+        if len(emb.shape) == 2:
+            return [emb[index] for index in range(emb.shape[0])]
+        return UserMessage("Embeddings must be a 1D or 2D array.", raise_with=ValueError)
+
+    def _raise_texts_unassigned(self):
+        error_message = "local variable 'texts' referenced before assignment"
+        raise UnboundLocalError(error_message)
+
     def _get_embedding(self, documents, key=None):
         """
         Get embeddings from a list of documents.
@@ -103,48 +161,17 @@ class VectorDB(Expression):
         embeddings : numpy.ndarray
             A numpy array of embeddings.
         """
-
-        if isinstance(documents, Symbol):
-            documents = documents.value
-        # if the documents are a list of Symbols, unwrap them
+        documents = self._unwrap_documents(documents)
         if len(documents) == 0:
             return []
-
-
-
-
-
-        if isinstance(key, str):
-            if "." in key:
-                key_chain = key.split(".")
-            else:
-                key_chain = [key]
-            for doc in documents:
-                for key in key_chain:
-                    doc = doc[key]
-                texts.append(doc.replace("\n", " "))
-        # If no key is specified, extract the text from the dictionary using all keys
-        elif key is None:
-            for doc in documents:
-                text = ", ".join([f"{key}: {value}" for key, value in doc.items()])
-                texts.append(text)
-        # If the documents are a list of strings, use the strings as the documents
-        elif isinstance(documents[0], str):
-            texts = documents
-        # If the documents are a list of lists, use the lists as the documents
-        batches = [texts[i : i + self.batch_size] for i in range(0, len(texts), self.batch_size)]
+        texts = self._to_texts(documents, key)
+        batches = [
+            texts[index : index + self.batch_size]
+            for index in range(0, len(texts), self.batch_size)
+        ]
         embeddings = []
-        # Embed the documents in batches
        for batch in batches:
-
-            emb = self.model(batch)
-            if len(emb.shape) == 1:
-                embeddings.append(emb)
-            elif len(emb.shape) == 2:
-                for i in range(emb.shape[0]):
-                    embeddings.append(emb[i])
-            else:
-                CustomUserWarning("Embeddings must be a 1D or 2D array.", raise_with=ValueError)
+            embeddings.extend(self._embed_batch(batch))
         return embeddings
 
     def dict(self, vectors=False):
@@ -165,12 +192,11 @@ class VectorDB(Expression):
             return [
                 {"document": document, "vector": vector.tolist(), "index": index}
                 for index, (document, vector) in enumerate(
-                    zip(self.documents, self.vectors)
+                    zip(self.documents, self.vectors, strict=False)
                 )
             ]
         return [
-            {"document": document, "index": index}
-            for index, document in enumerate(self.documents)
+            {"document": document, "index": index} for index, document in enumerate(self.documents)
         ]
 
     def add(self, documents, vectors=None):
@@ -191,8 +217,9 @@ class VectorDB(Expression):
         if not isinstance(documents, list):
             return self.add_document(documents, vectors)
         self.add_documents(documents, vectors)
+        return None
 
-    def add_document(self, document:
+    def add_document(self, document: Mapping[str, Any], vector=None):
         """
         Adds a document to the database.
 
@@ -204,13 +231,13 @@ class VectorDB(Expression):
             A vector to add to the database.
 
         """
-        vector =
+        vector = vector if vector is not None else self.embedding_function([document])[0]
         if self.vectors is None:
             self.vectors = np.empty((0, len(vector)), dtype=np.float32)
         elif len(vector) != self.vectors.shape[1]:
-
+            UserMessage("All vectors must have the same length.", raise_with=ValueError)
         # convert the vector to a numpy array if it is not already
-        if
+        if isinstance(vector, list):
             vector = np.array(vector)
         self.vectors = np.vstack([self.vectors, vector]).astype(np.float32)
         self.documents.append(document)
@@ -243,7 +270,7 @@ class VectorDB(Expression):
         if not documents:
             return
         vectors = vectors or np.array(self.embedding_function(documents)).astype(np.float32)
-        for vector, document in zip(vectors, documents):
+        for vector, document in zip(vectors, documents, strict=False):
            self.add_document(document, vector)
 
     def clear(self):
@@ -251,10 +278,10 @@ class VectorDB(Expression):
         Clears the database.
 
         """
-        self.vectors
+        self.vectors = None
         self.documents = []
 
-    def save(self, storage_file: str = None):
+    def save(self, storage_file: str | None = None):
         """
         Saves the database to a file.
 
@@ -265,20 +292,20 @@ class VectorDB(Expression):
 
         """
         if storage_file is None:
-
-
-
-            storage_file =
+            storage_file = HOME_PATH / "localdb" / f"{self.index_name}.pkl"
+            storage_file.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            storage_file = Path(storage_file)
 
         data = {"vectors": self.vectors, "documents": self.documents}
-        if storage_file.
+        if storage_file.suffix == ".gz":
             with gzip.open(storage_file, "wb") as f:
                 pickle.dump(data, f)
         else:
-            with open(
+            with storage_file.open("wb") as f:
                 pickle.dump(data, f)
 
-    def load(self, storage_file
+    def load(self, storage_file: str | None = None):
         """
         Loads the database from a file.
 
@@ -289,27 +316,26 @@ class VectorDB(Expression):
 
         """
         if storage_file is None:
-
-
-
-
-            storage_file = os.path.join(storage_path, f"{self.index_name}.pkl")
+            storage_file = HOME_PATH / "localdb" / f"{self.index_name}.pkl"
+            storage_file.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            storage_file = Path(storage_file)
 
         # return since nothing to load
-        if not
+        if not storage_file.exists():
             return
 
-        if storage_file.
+        if storage_file.suffix == ".gz":
             with gzip.open(storage_file, "rb") as f:
                 data = pickle.load(f)
         else:
-            with open(
+            with storage_file.open("rb") as f:
                 data = pickle.load(f)
 
         self.vectors = data["vectors"].astype(np.float32) if data["vectors"] is not None else None
         self.documents = data["documents"]
 
-    def purge(self, index_name
+    def purge(self, index_name: str):
         """
         Purges the database file from your machine, but does not delete the database from memory.
         Use the `clear` method to clear the database from memory.
@@ -328,11 +354,11 @@ class VectorDB(Expression):
         # use path to home directory by default
         storage_path = symai_folder / "localdb"
         # create dir on first load if never used
-
+        storage_path.mkdir(parents=True, exist_ok=True)
         storage_file = storage_path / f"{index_name}.pkl"
         if storage_file.exists():
             # remove the file
-
+            storage_file.unlink()
         self.clear()
 
     def forward(self, query=None, vector=None, top_k=None, return_similarities=True):
@@ -354,14 +380,18 @@ class VectorDB(Expression):
             A list of results.
 
         """
-        assert self.vectors is not None,
+        assert self.vectors is not None, (
+            "Error: Cannot query the database without prior insertion / initialization."
+        )
         top_k = top_k or self.index_top_k
         query_vector = self.embedding_function([query])[0] if vector is None else vector
-        if
+        if isinstance(query_vector, list):
             query_vector = np.array(query_vector)
         ranked_results, similarities = ranking_algorithm_sort(
             self.vectors, query_vector, top_k=top_k, metric=self.similarity_metric
         )
         if return_similarities:
-            return list(
+            return list(
+                zip([self.documents[index] for index in ranked_results], similarities, strict=False)
+            )
         return [self.documents[index] for index in ranked_results]
symai/formatter/__init__.py
CHANGED
@@ -1,2 +1,10 @@
+from .formatter import ParagraphFormatter, RegexFormatter, SentenceFormatter, TextContainerFormatter
 from .regex import CHUNK_REGEX
-
+
+__all__ = [
+    "CHUNK_REGEX",
+    "ParagraphFormatter",
+    "RegexFormatter",
+    "SentenceFormatter",
+    "TextContainerFormatter",
+]
symai/formatter/formatter.py
CHANGED
@@ -1,12 +1,16 @@
 import re
+from typing import TYPE_CHECKING
 
 from beartype import beartype
 from beartype.typing import Any, Dict, List
 from tqdm import tqdm
 
-from .regex import CHUNK_REGEX
 from .. import core_ext
 from ..symbol import Expression, Symbol
+from .regex import CHUNK_REGEX
+
+if TYPE_CHECKING:
+    from ..backend.engines.files.engine_io import TextContainer
 
 
 class ParagraphFormatter(Expression):
@@ -17,16 +21,16 @@ class ParagraphFormatter(Expression):
 
     def split_files(self, input_text=""):
         input_ = input_text.strip()
-        if input_.startswith(
+        if input_.startswith("# ----[FILE_START]") and "# ----[FILE_END]" in input_:
             self._has_file_start = True
             # split text file-wise and create a map of file names and their contents
             files = {}
-            split_text = input_.split(
-            for
+            split_text = input_.split("# ----[FILE_START]")
+            for _i, file in enumerate(split_text):
                 if not file.strip():
                     continue
-                _, content_file = file.split(
-                content, file_name = content_file.split(
+                _, content_file = file.split("[FILE_CONTENT]:")
+                content, file_name = content_file.split("# ----[FILE_END]")
                 files[file_name.strip()] = content.strip()
         else:
             files = {"": input_}
@@ -36,8 +40,10 @@ class ParagraphFormatter(Expression):
         if file_name and self._has_file_start:
             header = f"# ----[FILE_START]<PART{part}/{total_parts}>{file_name}[FILE_CONTENT]:\n"
             footer = f"\n# ----[FILE_END]{file_name}\n"
-            if
-
+            if (
+                "[FILE_CONTENT]:" in paragraph
+            ):  # TODO: remove this if statement after fixing the bug
+                paragraph = paragraph.split("[FILE_CONTENT]:")[-1].strip()
             paragraph = header + paragraph + footer
         return paragraph
 
@@ -63,7 +69,12 @@ class ParagraphFormatter(Expression):
         input_ = file_content.strip()
         split_text = self.NEWLINES_RE.split(input_)
 
-        par = [
+        par = [
+            self._add_header_footer(p, file_name, part=i + 1, total_parts=len(split_text))
+            + "\n"
+            for i, p in enumerate(split_text)
+            if p.strip()
+        ]
         # p + "\n" ensures that all lines in the paragraph end with a newline
         # p.strip() == True if paragraph has other characters than whitespace
 
@@ -81,14 +92,20 @@ class ParagraphFormatter(Expression):
             # n splits
             total_parts = (len(words) // max_length + 1) * self._get_total_parts(text)
             for p, i in enumerate(range(0, len(words), max_length)):
-                paragraph =
-                paragraphs.append(
+                paragraph = " ".join(words[i : i + max_length])
+                paragraphs.append(
+                    self._add_header_footer(
+                        paragraph, file_name, part=p + 1, total_parts=total_parts
+                    )
+                    + "\n"
+                )
         else:
             paragraphs.append(text)
         return paragraphs
 
-    @core_ext.bind(engine=
-    def _max_tokens(self):
+    @core_ext.bind(engine="embedding", property="max_tokens")
+    def _max_tokens(self):
+        pass
 
     def split_max_tokens_exceeded(self, input_text: List[str], token_ratio=0.5):
         paragraphs = []
@@ -103,13 +120,18 @@ class ParagraphFormatter(Expression):
             text_len_ = len(str(text)) // splits_
             total_parts = (text_len_ + 1) * self._get_total_parts(text)
             for i in range(splits_):
-                paragraph = text[i * text_len_:(i + 1) * text_len_]
-                paragraphs.append(
+                paragraph = text[i * text_len_ : (i + 1) * text_len_]
+                paragraphs.append(
+                    self._add_header_footer(
+                        paragraph, file_name, part=i + 1, total_parts=total_parts
+                    )
+                    + "\n"
+                )
         else:
             paragraphs.append(text)
         return paragraphs
 
-    def forward(self, sym: Symbol, *
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
         sym = self._to_symbol(sym)
         # split text paragraph-wise and index each paragraph separately
         self.elements = self.split_files(sym.value)
@@ -122,19 +144,17 @@ class ParagraphFormatter(Expression):
 class SentenceFormatter(Expression):
     def __init__(self, value=None, **kwargs):
         super().__init__(value, **kwargs)
-        self.SENTENCES_RE = re.compile(
+        self.SENTENCES_RE = re.compile(
+            r"[.!?]\n*|[\n]{1,}"
+        )  # Sentence ending characters followed by newlines
 
     def split_sentences(self, input_text=""):
         input_ = input_text.strip()
         split_text = self.SENTENCES_RE.split(input_)  # regex splitting
 
-
-        # s.strip() + ".\n" ensures that all lines in the sentence end with a period and newline
-        # s.strip() == True if sentence has other characters than whitespace
+        return [s.strip() + ".\n" for s in split_text if s.strip()]
 
-
-
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
         sym = self._to_symbol(sym)
         # split text sentence-wise and index each sentence separately
         self.elements = self.split_sentences(sym.value)
@@ -151,12 +171,9 @@ class RegexFormatter(Expression):
         input_ = input_text.strip()
         split_text = self.SENTENCES_RE.split(input_)  # regex splitting
 
-
-        # s.strip() == True if sentence has other characters than whitespace
-
-        return chunks
+        return [s.strip() for s in split_text if s.strip()]
 
-    def forward(self, sym: Symbol, *
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
         sym = self._to_symbol(sym)
         # split text sentence-wise and index each sentence separately
         self.elements = self.split_sentences(sym.value)
@@ -164,25 +181,19 @@ class RegexFormatter(Expression):
 
 
 class TextContainerFormatter(Expression):
-    def __init__(
-        self,
-        value: Any = None,
-        key: str ="text",
-        text_split: int = 4,
-        **kwargs
-    ):
+    def __init__(self, value: Any = None, key: str = "text", text_split: int = 4, **kwargs):
         super().__init__(value, **kwargs)
         self.key = key
         self.text_split = text_split
 
     @beartype
-    def forward(self, sym: Symbol, *
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
         if isinstance(sym.value, list):
             containers = [container for pdf in sym.value for container in pdf]
         chunks = [text for container in tqdm(containers) for text in self._chunk(container)]
         return self._to_symbol(chunks)
 
-    def _chunk(self, container:
+    def _chunk(self, container: "TextContainer") -> List[str]:
         text = container.text
         step = len(text) // self.text_split
         splits = []
@@ -192,17 +203,10 @@ class TextContainerFormatter(Expression):
                 # Unify the last chunk with the previous one if necessary
                 splits.append(self._as_str(text[i:], container))
                 break
-            splits.append(self._as_str(text[i:i+step], container))
+            splits.append(self._as_str(text[i : i + step], container))
             i += step
             c += 1
         return splits
 
-    def _as_str(self, text: str, container:
-        return
-            '---\n'
-            f"id: {container.id}\n"
-            f"page: {container.page}\n"
-            '---\n'
-            f"{text}"
-        )
-
+    def _as_str(self, text: str, container: "TextContainer") -> str:
+        return f"---\nid: {container.id}\npage: {container.page}\n---\n{text}"