symbolicai 0.20.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- symai/__init__.py +96 -64
- symai/backend/base.py +93 -80
- symai/backend/engines/drawing/engine_bfl.py +12 -11
- symai/backend/engines/drawing/engine_gpt_image.py +108 -87
- symai/backend/engines/embedding/engine_llama_cpp.py +25 -28
- symai/backend/engines/embedding/engine_openai.py +3 -5
- symai/backend/engines/execute/engine_python.py +6 -5
- symai/backend/engines/files/engine_io.py +74 -67
- symai/backend/engines/imagecaptioning/engine_blip2.py +3 -3
- symai/backend/engines/imagecaptioning/engine_llavacpp_client.py +54 -38
- symai/backend/engines/index/engine_pinecone.py +23 -24
- symai/backend/engines/index/engine_vectordb.py +16 -14
- symai/backend/engines/lean/engine_lean4.py +38 -34
- symai/backend/engines/neurosymbolic/__init__.py +41 -13
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_chat.py +262 -182
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_reasoning.py +263 -191
- symai/backend/engines/neurosymbolic/engine_deepseekX_reasoning.py +53 -49
- symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +212 -211
- symai/backend/engines/neurosymbolic/engine_groq.py +87 -63
- symai/backend/engines/neurosymbolic/engine_huggingface.py +21 -24
- symai/backend/engines/neurosymbolic/engine_llama_cpp.py +117 -48
- symai/backend/engines/neurosymbolic/engine_openai_gptX_chat.py +256 -229
- symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py +270 -150
- symai/backend/engines/ocr/engine_apilayer.py +6 -8
- symai/backend/engines/output/engine_stdout.py +1 -4
- symai/backend/engines/search/engine_openai.py +7 -7
- symai/backend/engines/search/engine_perplexity.py +5 -5
- symai/backend/engines/search/engine_serpapi.py +12 -14
- symai/backend/engines/speech_to_text/engine_local_whisper.py +20 -27
- symai/backend/engines/symbolic/engine_wolframalpha.py +3 -3
- symai/backend/engines/text_to_speech/engine_openai.py +5 -7
- symai/backend/engines/text_vision/engine_clip.py +7 -11
- symai/backend/engines/userinput/engine_console.py +3 -3
- symai/backend/engines/webscraping/engine_requests.py +81 -48
- symai/backend/mixin/__init__.py +13 -0
- symai/backend/mixin/anthropic.py +4 -2
- symai/backend/mixin/deepseek.py +2 -0
- symai/backend/mixin/google.py +2 -0
- symai/backend/mixin/openai.py +11 -3
- symai/backend/settings.py +83 -16
- symai/chat.py +101 -78
- symai/collect/__init__.py +7 -1
- symai/collect/dynamic.py +77 -69
- symai/collect/pipeline.py +35 -27
- symai/collect/stats.py +75 -63
- symai/components.py +198 -169
- symai/constraints.py +15 -12
- symai/core.py +698 -359
- symai/core_ext.py +32 -34
- symai/endpoints/api.py +80 -73
- symai/extended/.DS_Store +0 -0
- symai/extended/__init__.py +46 -12
- symai/extended/api_builder.py +11 -8
- symai/extended/arxiv_pdf_parser.py +13 -12
- symai/extended/bibtex_parser.py +2 -3
- symai/extended/conversation.py +101 -90
- symai/extended/document.py +17 -10
- symai/extended/file_merger.py +18 -13
- symai/extended/graph.py +18 -13
- symai/extended/html_style_template.py +2 -4
- symai/extended/interfaces/blip_2.py +1 -2
- symai/extended/interfaces/clip.py +1 -2
- symai/extended/interfaces/console.py +7 -1
- symai/extended/interfaces/dall_e.py +1 -1
- symai/extended/interfaces/flux.py +1 -1
- symai/extended/interfaces/gpt_image.py +1 -1
- symai/extended/interfaces/input.py +1 -1
- symai/extended/interfaces/llava.py +0 -1
- symai/extended/interfaces/naive_vectordb.py +7 -8
- symai/extended/interfaces/naive_webscraping.py +1 -1
- symai/extended/interfaces/ocr.py +1 -1
- symai/extended/interfaces/pinecone.py +6 -5
- symai/extended/interfaces/serpapi.py +1 -1
- symai/extended/interfaces/terminal.py +2 -3
- symai/extended/interfaces/tts.py +1 -1
- symai/extended/interfaces/whisper.py +1 -1
- symai/extended/interfaces/wolframalpha.py +1 -1
- symai/extended/metrics/__init__.py +11 -1
- symai/extended/metrics/similarity.py +11 -13
- symai/extended/os_command.py +17 -16
- symai/extended/packages/__init__.py +29 -3
- symai/extended/packages/symdev.py +19 -16
- symai/extended/packages/sympkg.py +12 -9
- symai/extended/packages/symrun.py +21 -19
- symai/extended/repo_cloner.py +11 -10
- symai/extended/seo_query_optimizer.py +1 -2
- symai/extended/solver.py +20 -23
- symai/extended/summarizer.py +4 -3
- symai/extended/taypan_interpreter.py +10 -12
- symai/extended/vectordb.py +99 -82
- symai/formatter/__init__.py +9 -1
- symai/formatter/formatter.py +12 -16
- symai/formatter/regex.py +62 -63
- symai/functional.py +176 -122
- symai/imports.py +136 -127
- symai/interfaces.py +56 -27
- symai/memory.py +14 -13
- symai/misc/console.py +49 -39
- symai/misc/loader.py +5 -3
- symai/models/__init__.py +17 -1
- symai/models/base.py +269 -181
- symai/models/errors.py +0 -1
- symai/ops/__init__.py +32 -22
- symai/ops/measures.py +11 -15
- symai/ops/primitives.py +348 -228
- symai/post_processors.py +32 -28
- symai/pre_processors.py +39 -41
- symai/processor.py +6 -4
- symai/prompts.py +59 -45
- symai/server/huggingface_server.py +23 -20
- symai/server/llama_cpp_server.py +7 -5
- symai/shell.py +3 -4
- symai/shellsv.py +499 -375
- symai/strategy.py +517 -287
- symai/symbol.py +111 -116
- symai/utils.py +42 -36
- {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/METADATA +4 -2
- symbolicai-1.0.0.dist-info/RECORD +163 -0
- symbolicai-0.20.2.dist-info/RECORD +0 -162
- {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/WHEEL +0 -0
- {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/entry_points.txt +0 -0
- {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/top_level.txt +0 -0
symai/extended/repo_cloner.py
CHANGED

@@ -1,9 +1,10 @@
 from pathlib import Path
-
+
 from git import Repo
 
-from ..symbol import Expression
 from ..backend.settings import HOME_PATH
+from ..symbol import Expression
+from ..utils import UserMessage
 
 
 class RepositoryCloner(Expression):
@@ -16,7 +17,7 @@ class RepositoryCloner(Expression):
        repo_path (Optional[str]): The path where to clone the repository.
        By default it will be at '~/.symai/repos/'.
    """
-    def __init__(self, repo_path:
+    def __init__(self, repo_path: str | None = None, **kwargs):
        super().__init__(**kwargs)
        self.repo_dir = HOME_PATH / 'repos/' if repo_path is None else Path(repo_path)
        if not self.repo_dir.exists():
@@ -35,24 +36,24 @@
        """
        repo_name = url.split('/')[-1].replace('.git', '')
        if (self.repo_dir / repo_name).is_dir():
-
+            UserMessage(f'Repository {repo_name} already exists. Checking for updates...')
            try:
                repo = Repo(self.repo_dir / repo_name)
                current = repo.head.commit
                repo.remotes.origin.pull()
                if current != repo.head.commit:
-
+                    UserMessage(f'Repository {repo_name} updated.')
                else:
-
+                    UserMessage(f'Repository {repo_name} is up-to-date.')
            except Exception as e:
-
+                UserMessage(f'An error occurred: {e}')
                raise e
        else:
-
+            UserMessage(f'Cloning repository {repo_name}...')
            try:
                Repo.clone_from(url, self.repo_dir / repo_name)
-
+                UserMessage(f'Repository {repo_name} cloned successfully.')
            except Exception as e:
-
+                UserMessage(f'Failed to clone the repository. An error occurred: {e}')
                raise e
        return str(self.repo_dir / repo_name)
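Throughout this release, direct print(...) reporting is replaced by a UserMessage helper imported from symai.utils. The helper's own implementation is not part of this diff; a minimal sketch consistent with the two call shapes visible across the touched files (a bare message, and a message with raise_with=SomeError) might look like this, and may differ from the real code:

# Hypothetical sketch of the UserMessage call contract inferred from the
# call sites in this diff; the actual symai/utils.py implementation may differ.
def UserMessage(message: str, raise_with: type | None = None) -> None:
    print(message)  # the real helper presumably routes through symai's console utilities
    if raise_with is not None:
        raise raise_with(message)  # e.g. raise_with=ValueError, as used in vectordb.py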
symai/extended/seo_query_optimizer.py
CHANGED

@@ -4,7 +4,6 @@ from ..pre_processors import PreProcessor
 from ..prompts import Prompt
 from ..symbol import Expression, Symbol
 
-
 SEO_OPTIMIZER_DESCRIPTION = """[Description]
 You are a SEO query optimizer. You are given a list of queries, phrases or sentences and you need to optimize them for search engines.
 Assume your search engines are based on vector databases and contain indices of GitHub repositories, papers and other resources.
@@ -17,7 +16,7 @@ The number of resulting queries should be between 1 and 8 statements separated by
 
 class SEOQueryOptimizerPreProcessor(PreProcessor):
    def __call__(self, argument):
-        return '$> {
+        return f'$> {argument.args[0]!s} =>'
 
 
 class SEOQueryOptimizer(Expression):
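As in the other pre-processors touched by this release, a str.format-style template (truncated in this view) is replaced by an explicit f-string over the engine argument. A standalone illustration of the new pattern, where the argument object is a stand-in built purely for demonstration:

from types import SimpleNamespace

# Stand-in for the `argument` object passed to PreProcessor.__call__.
argument = SimpleNamespace(args=["open source neuro-symbolic frameworks"])

# 1.0 pattern from the diff: !s forces str() conversion inside the f-string.
prompt = f'$> {argument.args[0]!s} =>'
assert prompt == '$> open source neuro-symbolic frameworks =>'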
symai/extended/solver.py
CHANGED

@@ -1,16 +1,18 @@
 import argparse
+
 try:
    import z3
 except ImportError:
    z3 = None
 
-from .conversation import Conversation
 from .. import core
 from ..components import Execute
-from ..post_processors import
+from ..post_processors import CodeExtractPostProcessor, StripPostProcessor
 from ..pre_processors import PreProcessor
 from ..prompts import Prompt
 from ..symbol import Expression, Symbol
+from ..utils import UserMessage
+from .conversation import Conversation
 
 #############################################################################################
 #
@@ -57,15 +59,17 @@ $> Max is 2 years older than his brother. In 5 years, Max will be 3 times as old
 
 class ProblemClassifierPreProcessor(PreProcessor):
    def __call__(self, argument):
-        return '$> {}\n//'
+        return f'$> {argument.prop.instance!s}\n//'
 
 
 class OptionsPreProcessor(PreProcessor):
    def __call__(self, argument):
-        return '$> :{}: == :{
+        return f'$> :{argument.prop.instance!s}: == :{argument.args[0]!s}: =>'
 
 
 class ProblemClassifier(Expression):
+    __hash__ = Expression.__hash__
+
    @property
    def static_context(self):
        return PROBLEM_CATEGORY_CONTEXT
@@ -105,7 +109,7 @@ class ProblemClassifier(Expression):
 
 class FormulaCheckerPreProcessor(PreProcessor):
    def __call__(self, argument):
-        return '$> {
+        return f'$> {argument.prop.instance!s} =>'
 
 
 class FormulaChecker(Expression):
@@ -145,7 +149,7 @@ class FormulaChecker(Expression):
 
 class FormulaWriterPreProcessor(PreProcessor):
    def __call__(self, argument):
-        return '$> {
+        return f'$> {argument.prop.instance!s} =>'
 
 
 class FormulaWriter(Expression):
@@ -212,9 +216,8 @@ class SATSolver(Expression):
            m = S.model()
            # Return the solution
            return m[query]
-
-
-        return None
+        UserMessage("Cannot solve the puzzle. Returned: " + str(r))
+        return None
 
 
 #############################################################################################
@@ -244,32 +247,26 @@ class Solver(Expression):
        classifier = ProblemClassifier(sym)
        problem = classifier(**kwargs)
 
-        if 'Arithmetics formula' ==
-            formula = self.rewrite_formula(sym, **kwargs)
-            print(formula)
-        elif 'Equations' == problem:
+        if problem == 'Arithmetics formula' or problem == 'Equations':
            formula = self.rewrite_formula(sym, **kwargs)
-
-        elif 'Implication and logical expressions'
+            UserMessage(str(formula))
+        elif problem == 'Implication and logical expressions':
            res = self.conv(sym, **kwargs)
            code = self.pp(str(res), None, tag="python")
            formula = self.solver(code, lambda: 'German')
-
-        elif 'Probability and statistics' == problem:
-
-        elif 'Linear algebra' == problem:
-            raise NotImplementedError('This feature is not yet implemented.')
-        elif 'Linguistic problem with relations' == problem:
-            raise NotImplementedError('This feature is not yet implemented.')
+            UserMessage(str(formula))
+        elif problem == 'Probability and statistics' or problem == 'Linear algebra' or problem == 'Linguistic problem with relations':
+            UserMessage('This feature is not yet implemented.', raise_with=NotImplementedError)
        else:
            return "Sorry, something went wrong. Please check if your backend is available and try again or report an issue to the devs. :("
+        return None
 
 
 def process_query(args) -> None:
    query = args.query
    solver = Solver()
    res = solver(query)
-
+    UserMessage(str(res))
 
 
 def run() -> None:
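Beyond folding the duplicated not-implemented branches into a single UserMessage(..., raise_with=NotImplementedError) call, ProblemClassifier now pins __hash__ = Expression.__hash__, presumably because Expression overloads __eq__ for symbolic comparison. In Python, defining __eq__ in a class body without also defining __hash__ implicitly sets __hash__ to None, making instances unhashable; re-assigning the parent's __hash__ restores it. A self-contained demonstration of the mechanism with generic classes (not symai code):

class Base:
    def __hash__(self):
        return id(self)

class Unhashable(Base):
    def __eq__(self, other):  # redefining __eq__ implicitly sets __hash__ = None
        return NotImplemented

class Hashable(Base):
    __hash__ = Base.__hash__  # explicit restore, as ProblemClassifier now does
    def __eq__(self, other):
        return NotImplemented

assert Unhashable.__hash__ is None          # instances raise TypeError in hash()
assert isinstance(hash(Hashable()), int)    # hashability restored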
symai/extended/summarizer.py
CHANGED

@@ -1,13 +1,14 @@
-from typing import List
 
 from ..components import Clean, Outline, Sequence, Stream, Translate
 from ..symbol import Expression, Symbol
 
 
 class Summarizer(Expression):
-    def __init__(self, filters:
+    def __init__(self, filters: list[Expression] | None = None, **kwargs):
+        if filters is None:
+            filters = []
        super().__init__(**kwargs)
-        filters = filters if isinstance(filters,
+        filters = filters if isinstance(filters, (list, tuple)) else [filters]
        self.data_stream = Stream(Sequence(
            Clean(),
            Translate(),
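The new signature swaps what was presumably a mutable default (the old line is truncated in this view) for the None-sentinel idiom, Python's standard guard against a default object being shared across calls:

def bad(filters=[]):   # a single list object is shared by every call
    filters.append("translate")
    return filters

def good(filters=None):  # fresh list per call, mirroring Summarizer.__init__ in 1.0
    if filters is None:
        filters = []
    filters.append("translate")
    return filters

assert bad() == ["translate"] and bad() == ["translate", "translate"]  # state leaks
assert good() == ["translate"] and good() == ["translate"]             # it does not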
symai/extended/taypan_interpreter.py
CHANGED

@@ -1,25 +1,24 @@
-import os
 import pathlib
 
-from ..core import
-from ..pre_processors import PreProcessor
+from ..core import zero_shot
 from ..post_processors import CodeExtractPostProcessor
+from ..pre_processors import PreProcessor
 from ..symbol import Expression, Symbol
 
 
 def create_template():
-    package_path =
+    package_path = pathlib.Path(__file__).parent.absolute()
 
 
-    with
+    with (package_path / 'symbol.py').open() as f:
        SYMBOL_API = f.read()
 
 
-    with
+    with (package_path / 'components.py').open() as f:
        COMPONENTS_API = f.read()
 
 
-
+    return f"""[Description]
 You are a programming language re-writing system from Taypan (high-level general-purpose programming language based on neuro-symbolic virtual machine) to Python interpreter, analogous to the relation between Scala and Java is the relation of Taypan to Python.
 
 All code from Python is valid Taypan code, but not all code from Taypan is valid Python code.
@@ -104,18 +103,17 @@ def create_template():
 [SymboliAI API]
 
 - components `from symai.components import *`:
-{
+{COMPONENTS_API}
 
 - symbol `from symai.symbol import *`:
-{
+{SYMBOL_API}
 
-"""
-    return TAYPAN_DESCRIPTION
+"""
 
 
 class TaypanPreProcessor(PreProcessor):
    def __call__(self, argument):
-        return '```taypan\n{}\n =>'
+        return f'```taypan\n{argument.args[0]!s}\n =>'
 
 
 class TaypanInterpreter(Expression):
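create_template() now builds and opens its paths with pathlib instead of os.path.join plus the open() builtin, and returns the f-string template directly rather than assigning it to a named constant first. The same file-reading pattern, made self-contained with a temporary directory:

import pathlib
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    package_path = pathlib.Path(tmp)
    (package_path / 'symbol.py').write_text('# stub API surface')

    # 1.0 style: Path arithmetic plus Path.open, as in create_template()
    with (package_path / 'symbol.py').open() as f:
        SYMBOL_API = f.read()

assert SYMBOL_API == '# stub API surface'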
symai/extended/vectordb.py
CHANGED

@@ -1,34 +1,40 @@
 import gzip
 import logging
-import os
 import pickle
+from collections.abc import Mapping
 from copy import deepcopy
 from pathlib import Path
+from typing import Any, ClassVar
 
 import numpy as np
 
 from ..backend.settings import HOME_PATH, SYMAI_CONFIG
 from ..interfaces import Interface
 from ..symbol import Expression, Symbol
-from ..utils import
-from .metrics import (
-
-
+from ..utils import UserMessage
+from .metrics import (
+    adams_similarity,
+    cosine_similarity,
+    derridaean_similarity,
+    dot_product,
+    euclidean_metric,
+    ranking_algorithm_sort,
+)
 
 logging.getLogger('sentence_transformers').setLevel(logging.WARNING)
 logging.getLogger('datasets').setLevel(logging.WARNING)
 
 
 class VectorDB(Expression):
-    _default_documents = []
-    _default_vectors = None
-    _default_batch_size = 2048
-    _default_similarity_metric = "cosine"
-    _default_embedding_function = None
-    _default_index_dims = 768
-    _default_top_k = 5
-    _default_storage_path =
-    _default_index_name = "dataindex"
+    _default_documents: ClassVar[list] = []
+    _default_vectors: ClassVar[np.ndarray | None] = None
+    _default_batch_size: ClassVar[int] = 2048
+    _default_similarity_metric: ClassVar[str] = "cosine"
+    _default_embedding_function: ClassVar[object | None] = None
+    _default_index_dims: ClassVar[int] = 768
+    _default_top_k: ClassVar[int] = 5
+    _default_storage_path: ClassVar[Path] = HOME_PATH / "localdb"
+    _default_index_name: ClassVar[str] = "dataindex"
    def __init__(
        self,
        documents=_default_documents,
@@ -71,12 +77,11 @@ class VectorDB(Expression):
        elif "adams" in similarity_metric:
            self.similarity_metric = adams_similarity
        else:
-
+            UserMessage("Similarity metric not supported. Please use either 'dot', 'cosine', 'euclidean', 'adams', or 'derrida'.", raise_with=ValueError)
 
        if load_on_init:
-
-
-            path = os.path.join(load_on_init, f"{self.index_name}.pkl")
+            if isinstance(load_on_init, (str, Path)):
+                path = Path(load_on_init) / f"{self.index_name}.pkl"
                self.load(path)
            else:
                self.load()
@@ -87,6 +92,52 @@
        else:
            self.model = lambda x: Symbol(x).embedding
 
+    def _unwrap_documents(self, documents):
+        if isinstance(documents, Symbol):
+            return documents.value
+        return documents
+
+    def _to_texts(self, documents, key):
+        if not isinstance(documents, list):
+            self._raise_texts_unassigned()
+        if len(documents) == 0:
+            return []
+        first_document = documents[0]
+        if isinstance(first_document, dict):
+            return self._texts_from_dicts(documents, key)
+        if isinstance(first_document, str):
+            return documents
+        return self._raise_texts_unassigned()
+
+    def _texts_from_dicts(self, documents, key):
+        if isinstance(key, str):
+            key_chain = key.split(".") if "." in key else [key]
+            return [self._resolve_key_chain(doc, key_chain).replace("\n", " ") for doc in documents]
+        if key is None:
+            return [
+                ", ".join([f"{dict_key}: {value}" for dict_key, value in doc.items()])
+                for doc in documents
+            ]
+        return self._raise_texts_unassigned()
+
+    def _resolve_key_chain(self, document, key_chain):
+        current_document = document
+        for chain_key in key_chain:
+            current_document = current_document[chain_key]
+        return current_document
+
+    def _embed_batch(self, batch):
+        emb = self.model(batch)
+        if len(emb.shape) == 1:
+            return [emb]
+        if len(emb.shape) == 2:
+            return [emb[index] for index in range(emb.shape[0])]
+        return UserMessage("Embeddings must be a 1D or 2D array.", raise_with=ValueError)
+
+    def _raise_texts_unassigned(self):
+        error_message = "local variable 'texts' referenced before assignment"
+        raise UnboundLocalError(error_message)
+
    def _get_embedding(self, documents, key=None):
        """
        Get embeddings from a list of documents.
@@ -103,48 +154,14 @@ class VectorDB(Expression):
        embeddings : numpy.ndarray
            A numpy array of embeddings.
        """
-
-        if isinstance(documents, Symbol):
-            documents = documents.value
-        # if the documents are a list of Symbols, unwrap them
+        documents = self._unwrap_documents(documents)
        if len(documents) == 0:
            return []
-
-
-        if isinstance(documents[0], dict):
-            texts = []
-            # If a key is specified, extract the text from the dictionary using the key
-            if isinstance(key, str):
-                if "." in key:
-                    key_chain = key.split(".")
-                else:
-                    key_chain = [key]
-                for doc in documents:
-                    for key in key_chain:
-                        doc = doc[key]
-                    texts.append(doc.replace("\n", " "))
-            # If no key is specified, extract the text from the dictionary using all keys
-            elif key is None:
-                for doc in documents:
-                    text = ", ".join([f"{key}: {value}" for key, value in doc.items()])
-                    texts.append(text)
-        # If the documents are a list of strings, use the strings as the documents
-        elif isinstance(documents[0], str):
-            texts = documents
-        # If the documents are a list of lists, use the lists as the documents
-        batches = [texts[i : i + self.batch_size] for i in range(0, len(texts), self.batch_size)]
+        texts = self._to_texts(documents, key)
+        batches = [texts[index : index + self.batch_size] for index in range(0, len(texts), self.batch_size)]
        embeddings = []
-        # Embed the documents in batches
        for batch in batches:
-
-            emb = self.model(batch)
-            if len(emb.shape) == 1:
-                embeddings.append(emb)
-            elif len(emb.shape) == 2:
-                for i in range(emb.shape[0]):
-                    embeddings.append(emb[i])
-            else:
-                CustomUserWarning("Embeddings must be a 1D or 2D array.", raise_with=ValueError)
+            embeddings.extend(self._embed_batch(batch))
        return embeddings
 
    def dict(self, vectors=False):
@@ -165,7 +182,7 @@ class VectorDB(Expression):
        return [
            {"document": document, "vector": vector.tolist(), "index": index}
            for index, (document, vector) in enumerate(
-                zip(self.documents, self.vectors)
+                zip(self.documents, self.vectors, strict=False)
            )
        ]
        return [
@@ -191,8 +208,9 @@ class VectorDB(Expression):
        if not isinstance(documents, list):
            return self.add_document(documents, vectors)
        self.add_documents(documents, vectors)
+        return None
 
-    def add_document(self, document:
+    def add_document(self, document: Mapping[str, Any], vector=None):
        """
        Adds a document to the database.
 
@@ -208,9 +226,9 @@ class VectorDB(Expression):
        if self.vectors is None:
            self.vectors = np.empty((0, len(vector)), dtype=np.float32)
        elif len(vector) != self.vectors.shape[1]:
-
+            UserMessage("All vectors must have the same length.", raise_with=ValueError)
        # convert the vector to a numpy array if it is not already
-        if
+        if isinstance(vector, list):
            vector = np.array(vector)
        self.vectors = np.vstack([self.vectors, vector]).astype(np.float32)
        self.documents.append(document)
@@ -243,7 +261,7 @@ class VectorDB(Expression):
        if not documents:
            return
        vectors = vectors or np.array(self.embedding_function(documents)).astype(np.float32)
-        for vector, document in zip(vectors, documents):
+        for vector, document in zip(vectors, documents, strict=False):
            self.add_document(document, vector)
 
    def clear(self):
@@ -254,7 +272,7 @@ class VectorDB(Expression):
        self.vectors = None
        self.documents = []
 
-    def save(self, storage_file: str = None):
+    def save(self, storage_file: str | None = None):
        """
        Saves the database to a file.
 
@@ -265,20 +283,20 @@ class VectorDB(Expression):
 
        """
        if storage_file is None:
-
-
-
-            storage_file =
+            storage_file = HOME_PATH / "localdb" / f"{self.index_name}.pkl"
+            storage_file.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            storage_file = Path(storage_file)
 
        data = {"vectors": self.vectors, "documents": self.documents}
-        if storage_file.
+        if storage_file.suffix == ".gz":
            with gzip.open(storage_file, "wb") as f:
                pickle.dump(data, f)
        else:
-            with open(
+            with storage_file.open("wb") as f:
                pickle.dump(data, f)
 
-    def load(self, storage_file : str = None):
+    def load(self, storage_file : str | None = None):
        """
        Loads the database from a file.
 
@@ -289,21 +307,20 @@ class VectorDB(Expression):
 
        """
        if storage_file is None:
-
-
-
-
-            storage_file = os.path.join(storage_path, f"{self.index_name}.pkl")
+            storage_file = HOME_PATH / "localdb" / f"{self.index_name}.pkl"
+            storage_file.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            storage_file = Path(storage_file)
 
        # return since nothing to load
-        if not
+        if not storage_file.exists():
            return
 
-        if storage_file.
+        if storage_file.suffix == ".gz":
            with gzip.open(storage_file, "rb") as f:
                data = pickle.load(f)
        else:
-            with open(
+            with storage_file.open("rb") as f:
                data = pickle.load(f)
 
        self.vectors = data["vectors"].astype(np.float32) if data["vectors"] is not None else None
@@ -328,11 +345,11 @@ class VectorDB(Expression):
        # use path to home directory by default
        storage_path = symai_folder / "localdb"
        # create dir on first load if never used
-
+        storage_path.mkdir(parents=True, exist_ok=True)
        storage_file = storage_path / f"{index_name}.pkl"
        if storage_file.exists():
            # remove the file
-
+            storage_file.unlink()
        self.clear()
 
    def forward(self, query=None, vector=None, top_k=None, return_similarities=True):
@@ -354,14 +371,14 @@ class VectorDB(Expression):
            A list of results.
 
        """
-        assert self.vectors is not None,
+        assert self.vectors is not None, "Error: Cannot query the database without prior insertion / initialization."
        top_k = top_k or self.index_top_k
        query_vector = self.embedding_function([query])[0] if vector is None else vector
-        if
+        if isinstance(query_vector, list):
            query_vector = np.array(query_vector)
        ranked_results, similarities = ranking_algorithm_sort(
            self.vectors, query_vector, top_k=top_k, metric=self.similarity_metric
        )
        if return_similarities:
-            return list(zip([self.documents[index] for index in ranked_results], similarities))
+            return list(zip([self.documents[index] for index in ranked_results], similarities, strict=False))
        return [self.documents[index] for index in ranked_results]
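The monolithic _get_embedding body is split into small helpers, and all storage paths move onto pathlib. The dotted-key extraction that the new _texts_from_dicts/_resolve_key_chain helpers implement can be exercised in isolation; the following mirrors the code added in this diff, with a made-up document as input:

def resolve_key_chain(document, key_chain):
    # same traversal as VectorDB._resolve_key_chain in this diff
    current_document = document
    for chain_key in key_chain:
        current_document = current_document[chain_key]
    return current_document

doc = {"meta": {"title": "SymbolicAI\nrelease notes"}}
key = "meta.title"
key_chain = key.split(".") if "." in key else [key]

assert resolve_key_chain(doc, key_chain).replace("\n", " ") == "SymbolicAI release notes"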
symai/formatter/__init__.py
CHANGED

@@ -1,2 +1,10 @@
+from .formatter import ParagraphFormatter, RegexFormatter, SentenceFormatter, TextContainerFormatter
 from .regex import CHUNK_REGEX
-
+
+__all__ = [
+    "CHUNK_REGEX",
+    "ParagraphFormatter",
+    "RegexFormatter",
+    "SentenceFormatter",
+    "TextContainerFormatter",
+]
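The package root now re-exports the formatter classes and pins __all__, so both star-imports and from symai.formatter import ParagraphFormatter resolve without reaching into the submodule. With the 1.0 wheel installed, the exported surface can be checked directly:

import symai.formatter as formatter

assert set(formatter.__all__) == {
    "CHUNK_REGEX",
    "ParagraphFormatter",
    "RegexFormatter",
    "SentenceFormatter",
    "TextContainerFormatter",
}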
symai/formatter/formatter.py
CHANGED

@@ -1,12 +1,16 @@
 import re
+from typing import TYPE_CHECKING
 
 from beartype import beartype
 from beartype.typing import Any, Dict, List
 from tqdm import tqdm
 
-from .regex import CHUNK_REGEX
 from .. import core_ext
 from ..symbol import Expression, Symbol
+from .regex import CHUNK_REGEX
+
+if TYPE_CHECKING:
+    from ..backend.engines.files.engine_io import TextContainer
 
 
 class ParagraphFormatter(Expression):
@@ -22,7 +26,7 @@ class ParagraphFormatter(Expression):
        # split text file-wise and create a map of file names and their contents
        files = {}
        split_text = input_.split('# ----[FILE_START]')
-        for
+        for _i, file in enumerate(split_text):
            if not file.strip():
                continue
            _, content_file = file.split('[FILE_CONTENT]:')
@@ -109,7 +113,7 @@ class ParagraphFormatter(Expression):
            paragraphs.append(text)
        return paragraphs
 
-    def forward(self, sym: Symbol, *
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
        sym = self._to_symbol(sym)
        # split text paragraph-wise and index each paragraph separately
        self.elements = self.split_files(sym.value)
@@ -128,13 +132,9 @@ class SentenceFormatter(Expression):
        input_ = input_text.strip()
        split_text = self.SENTENCES_RE.split(input_) # regex splitting
 
-
-        # s.strip() + ".\n" ensures that all lines in the sentence end with a period and newline
-        # s.strip() == True if sentence has other characters than whitespace
+        return [s.strip() + ".\n" for s in split_text if s.strip()]
 
-
-
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
        sym = self._to_symbol(sym)
        # split text sentence-wise and index each sentence separately
        self.elements = self.split_sentences(sym.value)
@@ -151,12 +151,9 @@ class RegexFormatter(Expression):
        input_ = input_text.strip()
        split_text = self.SENTENCES_RE.split(input_) # regex splitting
 
-
-        # s.strip() == True if sentence has other characters than whitespace
-
-        return chunks
+        return [s.strip() for s in split_text if s.strip()]
 
-    def forward(self, sym: Symbol, *
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
        sym = self._to_symbol(sym)
        # split text sentence-wise and index each sentence separately
        self.elements = self.split_sentences(sym.value)
@@ -176,7 +173,7 @@ class TextContainerFormatter(Expression):
        self.text_split = text_split
 
    @beartype
-    def forward(self, sym: Symbol, *
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
        if isinstance(sym.value, list):
            containers = [container for pdf in sym.value for container in pdf]
            chunks = [text for container in tqdm(containers) for text in self._chunk(container)]
@@ -205,4 +202,3 @@
            '---\n'
            f"{text}"
        )
-
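The forward(...) signatures rename unused *args/**kwargs to *_args/**_kwargs, the conventional marker for intentionally unused parameters, and TextContainer is now pulled in under a TYPE_CHECKING guard: the name stays available to annotations and static checkers without paying for (or risking a cycle on) the runtime import. The general shape of the pattern, with a hypothetical helper function added purely for illustration:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Resolved only by static type checkers; never imported at runtime,
    # so heavy or circular dependencies stay out of module import time.
    from symai.backend.engines.files.engine_io import TextContainer

def first_chunk(container: TextContainer) -> str:
    # hypothetical helper, for illustration only
    return str(container)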