symbolicai 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- symai/__init__.py +198 -134
- symai/backend/base.py +51 -51
- symai/backend/engines/drawing/engine_bfl.py +33 -33
- symai/backend/engines/drawing/engine_gpt_image.py +4 -10
- symai/backend/engines/embedding/engine_llama_cpp.py +50 -35
- symai/backend/engines/embedding/engine_openai.py +22 -16
- symai/backend/engines/execute/engine_python.py +16 -16
- symai/backend/engines/files/engine_io.py +51 -49
- symai/backend/engines/imagecaptioning/engine_blip2.py +27 -23
- symai/backend/engines/imagecaptioning/engine_llavacpp_client.py +53 -46
- symai/backend/engines/index/engine_pinecone.py +116 -88
- symai/backend/engines/index/engine_qdrant.py +1011 -0
- symai/backend/engines/index/engine_vectordb.py +78 -52
- symai/backend/engines/lean/engine_lean4.py +65 -25
- symai/backend/engines/neurosymbolic/__init__.py +28 -28
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_chat.py +137 -135
- symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_reasoning.py +145 -152
- symai/backend/engines/neurosymbolic/engine_cerebras.py +328 -0
- symai/backend/engines/neurosymbolic/engine_deepseekX_reasoning.py +75 -49
- symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +199 -155
- symai/backend/engines/neurosymbolic/engine_groq.py +106 -72
- symai/backend/engines/neurosymbolic/engine_huggingface.py +100 -67
- symai/backend/engines/neurosymbolic/engine_llama_cpp.py +121 -93
- symai/backend/engines/neurosymbolic/engine_openai_gptX_chat.py +213 -132
- symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py +180 -137
- symai/backend/engines/ocr/engine_apilayer.py +18 -20
- symai/backend/engines/output/engine_stdout.py +9 -9
- symai/backend/engines/{webscraping → scrape}/engine_requests.py +25 -11
- symai/backend/engines/search/engine_openai.py +95 -83
- symai/backend/engines/search/engine_parallel.py +665 -0
- symai/backend/engines/search/engine_perplexity.py +40 -41
- symai/backend/engines/search/engine_serpapi.py +33 -28
- symai/backend/engines/speech_to_text/engine_local_whisper.py +37 -27
- symai/backend/engines/symbolic/engine_wolframalpha.py +14 -8
- symai/backend/engines/text_to_speech/engine_openai.py +15 -19
- symai/backend/engines/text_vision/engine_clip.py +34 -28
- symai/backend/engines/userinput/engine_console.py +3 -4
- symai/backend/mixin/anthropic.py +48 -40
- symai/backend/mixin/deepseek.py +4 -5
- symai/backend/mixin/google.py +5 -4
- symai/backend/mixin/groq.py +2 -4
- symai/backend/mixin/openai.py +132 -110
- symai/backend/settings.py +14 -14
- symai/chat.py +164 -94
- symai/collect/dynamic.py +13 -11
- symai/collect/pipeline.py +39 -31
- symai/collect/stats.py +109 -69
- symai/components.py +556 -238
- symai/constraints.py +14 -5
- symai/core.py +1495 -1210
- symai/core_ext.py +55 -50
- symai/endpoints/api.py +113 -58
- symai/extended/api_builder.py +22 -17
- symai/extended/arxiv_pdf_parser.py +13 -5
- symai/extended/bibtex_parser.py +8 -4
- symai/extended/conversation.py +88 -69
- symai/extended/document.py +40 -27
- symai/extended/file_merger.py +45 -7
- symai/extended/graph.py +38 -24
- symai/extended/html_style_template.py +17 -11
- symai/extended/interfaces/blip_2.py +1 -1
- symai/extended/interfaces/clip.py +4 -2
- symai/extended/interfaces/console.py +5 -3
- symai/extended/interfaces/dall_e.py +3 -1
- symai/extended/interfaces/file.py +2 -0
- symai/extended/interfaces/flux.py +3 -1
- symai/extended/interfaces/gpt_image.py +15 -6
- symai/extended/interfaces/input.py +2 -1
- symai/extended/interfaces/llava.py +1 -1
- symai/extended/interfaces/{naive_webscraping.py → naive_scrape.py} +3 -2
- symai/extended/interfaces/naive_vectordb.py +2 -2
- symai/extended/interfaces/ocr.py +4 -2
- symai/extended/interfaces/openai_search.py +2 -0
- symai/extended/interfaces/parallel.py +30 -0
- symai/extended/interfaces/perplexity.py +2 -0
- symai/extended/interfaces/pinecone.py +6 -4
- symai/extended/interfaces/python.py +2 -0
- symai/extended/interfaces/serpapi.py +2 -0
- symai/extended/interfaces/terminal.py +0 -1
- symai/extended/interfaces/tts.py +2 -1
- symai/extended/interfaces/whisper.py +2 -1
- symai/extended/interfaces/wolframalpha.py +1 -0
- symai/extended/metrics/__init__.py +1 -1
- symai/extended/metrics/similarity.py +5 -2
- symai/extended/os_command.py +31 -22
- symai/extended/packages/symdev.py +39 -34
- symai/extended/packages/sympkg.py +30 -27
- symai/extended/packages/symrun.py +46 -35
- symai/extended/repo_cloner.py +10 -9
- symai/extended/seo_query_optimizer.py +15 -12
- symai/extended/solver.py +104 -76
- symai/extended/summarizer.py +8 -7
- symai/extended/taypan_interpreter.py +10 -9
- symai/extended/vectordb.py +28 -15
- symai/formatter/formatter.py +39 -31
- symai/formatter/regex.py +46 -44
- symai/functional.py +184 -86
- symai/imports.py +85 -51
- symai/interfaces.py +1 -1
- symai/memory.py +33 -24
- symai/menu/screen.py +28 -19
- symai/misc/console.py +27 -27
- symai/misc/loader.py +4 -3
- symai/models/base.py +147 -76
- symai/models/errors.py +1 -1
- symai/ops/__init__.py +1 -1
- symai/ops/measures.py +17 -14
- symai/ops/primitives.py +933 -635
- symai/post_processors.py +28 -24
- symai/pre_processors.py +58 -52
- symai/processor.py +15 -9
- symai/prompts.py +714 -649
- symai/server/huggingface_server.py +115 -32
- symai/server/llama_cpp_server.py +14 -6
- symai/server/qdrant_server.py +206 -0
- symai/shell.py +98 -39
- symai/shellsv.py +307 -223
- symai/strategy.py +135 -81
- symai/symbol.py +276 -225
- symai/utils.py +62 -46
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/METADATA +19 -9
- symbolicai-1.1.0.dist-info/RECORD +168 -0
- symbolicai-1.0.0.dist-info/RECORD +0 -163
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/WHEEL +0 -0
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/entry_points.txt +0 -0
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/licenses/LICENSE +0 -0
- {symbolicai-1.0.0.dist-info → symbolicai-1.1.0.dist-info}/top_level.txt +0 -0
symai/extended/solver.py
CHANGED
@@ -57,14 +57,15 @@ $> Max is 2 years older than his brother. In 5 years, Max will be 3 times as old
 --------------
 """

+
 class ProblemClassifierPreProcessor(PreProcessor):
     def __call__(self, argument):
-        return f
+        return f"$> {argument.prop.instance!s}\n//"


 class OptionsPreProcessor(PreProcessor):
     def __call__(self, argument):
-        return f
+        return f"$> :{argument.prop.instance!s}: == :{argument.args[0]!s}: =>"


 class ProblemClassifier(Expression):
@@ -75,32 +76,41 @@ class ProblemClassifier(Expression):
         return PROBLEM_CATEGORY_CONTEXT

     def __eq__(self, other, **kwargs) -> bool:
-        @core.few_shot(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        @core.few_shot(
+            prompt="Verify equality of the following categories. Ignore typos, upper / lower case or singular / plural differences:\n",
+            examples=Prompt(
+                [
+                    "$> :Arithmetic formula: == :Arithmetics formula: =>True EOF",
+                    "$> :arithmetic formula: == :Arithmetic formula: =>True EOF",
+                    "$> :arithmetic formula: == :arithmeticformula: =>True EOF",
+                    "$> :arithmetic formula: == :Implication and logical expressions: =>False EOF",
+                    "$> :Linear algebra: == :Implication and logical expressions: =>False EOF",
+                    "$> :Linear algebra: == :Unknown category: =>False EOF",
+                    "$> :Linear algebra: == :Linear algebra: =>True EOF",
+                    "$> :Probability and statistics: == :Probabilities and statistics: =>True EOF",
+                    "$> :PROBABILITY AND STATISTICS: == :Probability and statistics: =>True EOF",
+                    "$> :PROBABILITY AND STATISTICS: == :UNKNOWN CATEGORY: =>False EOF",
+                ]
+            ),
+            pre_processors=[OptionsPreProcessor()],
+            post_processors=[StripPostProcessor()],
+            stop=["EOF"],
+            **kwargs,
+        )
         def _func(_, other) -> bool:
             pass
+
         return _func(self, other)

     def forward(self, **kwargs) -> str:
-        @core.few_shot(
-
-
-
-
+        @core.few_shot(
+            prompt="Classify the user query to the mathematical classes:\n",
+            examples=[],
+            pre_processors=[ProblemClassifierPreProcessor()],
+            post_processors=[StripPostProcessor()],
+            stop=["EOF"],
+            **kwargs,
+        )
         def _func(_) -> str:
             pass

@@ -109,34 +119,40 @@ class ProblemClassifier(Expression):

 class FormulaCheckerPreProcessor(PreProcessor):
     def __call__(self, argument):
-        return f
+        return f"$> {argument.prop.instance!s} =>"


 class FormulaChecker(Expression):
     def forward(self, **kwargs) -> bool:
-        @core.few_shot(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        @core.few_shot(
+            prompt="Is the following statement in an explicit formula form without natural language text?:\n",
+            examples=Prompt(
+                [
+                    "$> 2 + 2 * 2 =>True EOF",
+                    "$> x + 2 = 3 =>True EOF",
+                    "$> Set of all natural numbers =>False EOF",
+                    "$> Probability of drawing a red ball =>False EOF",
+                    "$> (a + b) * (a - b) =>True EOF",
+                    "$> Add the square root of nine to the square root of x =>False EOF",
+                    "$> Five plus two equals seven =>False EOF",
+                    "$> 5 + 2 = 7 =>True EOF",
+                    "$> x is seven =>False EOF",
+                    "$> x = 7 =>True EOF",
+                    "$> Anna has two apples. She gives one to her brother. How many apples does Anna have now? =>False EOF",
+                    "$> 0.447662 =>True EOF",
+                    "$> Subtract the x from y squared =>False EOF",
+                    "$> The sum of the first n natural numbers =>False EOF",
+                    "$> Sum[x=5, {i=0, n=10}] =>True EOF",
+                ]
+            ),
+            pre_processors=[FormulaCheckerPreProcessor()],
+            post_processors=[StripPostProcessor()],
+            stop=["EOF"],
+            **kwargs,
+        )
         def _func(_) -> bool:
             pass
+
         return _func(self)


@@ -149,27 +165,33 @@ class FormulaChecker(Expression):

 class FormulaWriterPreProcessor(PreProcessor):
     def __call__(self, argument):
-        return f
+        return f"$> {argument.prop.instance!s} =>"


 class FormulaWriter(Expression):
     def forward(self, **kwargs) -> str:
-        @core.few_shot(
-
-
-
-
-
-
-
-
-
-
-
-
-
+        @core.few_shot(
+            prompt="Rewrite the following natural language statement in a mathematical formula or higher-order logic statement to be solved by Mathematica:\n",
+            examples=Prompt(
+                [
+                    "$> Add 5 plus 3 =>5 + 3 EOF",
+                    "$> Seventy plus twenty =>70 + 20 EOF",
+                    "$> Divide 5 by three =>5 / 3 EOF",
+                    "$> The square root of pi plus x. =>Sqrt[Pi + x] EOF",
+                    "$> Eight point five six seven one four two seven =>8.5671427 EOF",
+                    "$> Give a solution for a quadratic equation x^2 + 2x + 1 =>Solve[x^2 + 2x + 1 ==0, x] EOF",
+                    "$> Sum x n times from i equals 0 to n equals 10. x is equals to 5. =>Sum[x=5, {i=0, n=10}] EOF",
+                    "$> Multiply the first statement in brackets a plus b times the second term in brackets c minus d =>(a + b) * (c - d) EOF",
+                ]
+            ),
+            pre_processors=[FormulaWriterPreProcessor()],
+            post_processors=[StripPostProcessor()],
+            stop=["EOF"],
+            **kwargs,
+        )
         def _func(_) -> str:
             pass
+
         return _func(self)


@@ -199,17 +221,19 @@ _value_obj_ = problem_statement

 class SATSolver(Expression):
     def forward(self, code):
-        assert z3 is not None,
+        assert z3 is not None, (
+            "The z3 library is not installed. Please install it using `pip install 'symbolicai[solver]'` and try again."
+        )
         # Create the execution template
-        runner
+        runner = Execute(enclosure=True)
         # Execute the code
         statement = runner(code)
         # Create a new solver instance
-        S
+        S = z3.Solver()
         # Create a new query
-        query
+        query = statement["locals"]["_output_"](S)
         # Check if the query can be solved
-        r
+        r = S.check()
         # Print the solution
         if r == z3.sat:
             # Get the model
@@ -232,8 +256,8 @@ class Solver(Expression):
         super().__init__(**kwargs)
         self.sym_return_type = Solver
         self.solver = SATSolver()
-        self.conv
-        self.pp
+        self.conv = Conversation(init=LOGIC_TEMPLATE)
+        self.pp = CodeExtractPostProcessor()

     def rewrite_formula(self, sym, **kwargs):
         formula = sym
@@ -247,16 +271,20 @@ class Solver(Expression):
         classifier = ProblemClassifier(sym)
         problem = classifier(**kwargs)

-        if problem ==
+        if problem == "Arithmetics formula" or problem == "Equations":
             formula = self.rewrite_formula(sym, **kwargs)
             UserMessage(str(formula))
-        elif problem ==
-            res
-            code
-            formula = self.solver(code, lambda:
+        elif problem == "Implication and logical expressions":
+            res = self.conv(sym, **kwargs)
+            code = self.pp(str(res), None, tag="python")
+            formula = self.solver(code, lambda: "German")
             UserMessage(str(formula))
-        elif
-
+        elif (
+            problem == "Probability and statistics"
+            or problem == "Linear algebra"
+            or problem == "Linguistic problem with relations"
+        ):
+            UserMessage("This feature is not yet implemented.", raise_with=NotImplementedError)
         else:
             return "Sorry, something went wrong. Please check if your backend is available and try again or report an issue to the devs. :("
         return None
@@ -271,8 +299,8 @@ def process_query(args) -> None:

 def run() -> None:
     # All the logic of argparse goes in this function
-    parser = argparse.ArgumentParser(description=
-    parser.add_argument(
+    parser = argparse.ArgumentParser(description="Welcome to the Symbolic<AI/> Shell support tool!")
+    parser.add_argument("query", type=str, help="The prompt for the shell query.")

     args = parser.parse_args()
     process_query(args)
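Most of the solver.py hunks reformat `@core.few_shot(...)` blocks from collapsed one-liners into multi-line keyword arguments; the decorated inner function stays an empty stub whose result is supplied by the engine. A minimal sketch of that pattern follows; the `IsFormula` class and the import paths are assumptions inferred from the package layout, not code from the package.

# Illustrative only: IsFormula is a made-up class; imports are inferred from the
# package layout (symai/core.py, symai/symbol.py, symai/prompts.py, ...).
from symai import core
from symai.post_processors import StripPostProcessor
from symai.prompts import Prompt
from symai.symbol import Expression


class IsFormula(Expression):
    def forward(self, **kwargs) -> str:
        # The decorator wires the prompt, the few-shot examples, and the stop token
        # into a neuro-symbolic call; the stub below never executes its own body.
        @core.few_shot(
            prompt="Is the following statement in an explicit formula form?:\n",
            examples=Prompt(
                [
                    "$> 2 + 2 * 2 =>True EOF",
                    "$> x is seven =>False EOF",
                ]
            ),
            post_processors=[StripPostProcessor()],
            stop=["EOF"],
            **kwargs,
        )
        def _func(_) -> str:
            pass

        return _func(self)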
symai/extended/summarizer.py
CHANGED
@@ -1,4 +1,3 @@
-
 from ..components import Clean, Outline, Sequence, Stream, Translate
 from ..symbol import Expression, Symbol

@@ -9,12 +8,14 @@ class Summarizer(Expression):
             filters = []
         super().__init__(**kwargs)
         filters = filters if isinstance(filters, (list, tuple)) else [filters]
-        self.data_stream = Stream(
-
-
-
-
-
+        self.data_stream = Stream(
+            Sequence(
+                Clean(),
+                Translate(),
+                Outline(),
+                *filters,
+            )
+        )

     def forward(self, sym: Symbol, **kwargs) -> Symbol:
         vals = list(self.data_stream(sym, **kwargs))
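The Summarizer change only re-indents the Stream/Sequence pipeline built in the constructor. A short usage sketch of that pipeline, based on the `self.data_stream(sym, **kwargs)` call visible in `forward`; the sample input is made up and running it requires a configured symai backend.

from symai.components import Clean, Outline, Sequence, Stream, Translate
from symai.symbol import Symbol

# Each streamed chunk of the input is cleaned, translated, and outlined in order,
# mirroring the data_stream the Summarizer constructor assembles (without extra filters).
data_stream = Stream(Sequence(Clean(), Translate(), Outline()))

for chunk in data_stream(Symbol("A long document to be summarized ...")):
    print(chunk)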
symai/extended/taypan_interpreter.py
CHANGED
@@ -9,15 +9,12 @@ from ..symbol import Expression, Symbol
 def create_template():
     package_path = pathlib.Path(__file__).parent.absolute()

-
-    with (package_path / 'symbol.py').open() as f:
+    with (package_path / "symbol.py").open() as f:
         SYMBOL_API = f.read()

-
-    with (package_path / 'components.py').open() as f:
+    with (package_path / "components.py").open() as f:
         COMPONENTS_API = f.read()

-
     return f"""[Description]
 You are a programming language re-writing system from Taypan (high-level general-purpose programming language based on neuro-symbolic virtual machine) to Python interpreter, analogous to the relation between Scala and Java is the relation of Taypan to Python.

@@ -113,7 +110,7 @@ def create_template():

 class TaypanPreProcessor(PreProcessor):
     def __call__(self, argument):
-        return f
+        return f"```taypan\n{argument.args[0]!s}\n =>"


 class TaypanInterpreter(Expression):
@@ -127,9 +124,13 @@ class TaypanInterpreter(Expression):
         self.description = create_template()

     def forward(self, sym: Symbol, **kwargs) -> Symbol:
-        @zero_shot(
-
-
+        @zero_shot(
+            prompt="Translate the Taypan code to Python code:\n",
+            pre_processors=[TaypanPreProcessor()],
+            post_processors=[CodeExtractPostProcessor()],
+            **kwargs,
+        )
         def _func(_, text) -> str:
             pass
+
         return _func(self, sym)
symai/extended/vectordb.py
CHANGED
@@ -21,8 +21,8 @@ from .metrics import (
     ranking_algorithm_sort,
 )

-logging.getLogger(
-logging.getLogger(
+logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
+logging.getLogger("datasets").setLevel(logging.WARNING)


 class VectorDB(Expression):
@@ -35,6 +35,7 @@ class VectorDB(Expression):
     _default_top_k: ClassVar[int] = 5
     _default_storage_path: ClassVar[Path] = HOME_PATH / "localdb"
     _default_index_name: ClassVar[str] = "dataindex"
+
     def __init__(
         self,
         documents=_default_documents,
@@ -46,7 +47,7 @@ class VectorDB(Expression):
         index_dims=_default_index_dims,
         top_k=_default_top_k,
         index_name=_default_index_name,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.config = deepcopy(SYMAI_CONFIG)
@@ -77,7 +78,10 @@ class VectorDB(Expression):
         elif "adams" in similarity_metric:
             self.similarity_metric = adams_similarity
         else:
-            UserMessage(
+            UserMessage(
+                "Similarity metric not supported. Please use either 'dot', 'cosine', 'euclidean', 'adams', or 'derrida'.",
+                raise_with=ValueError,
+            )

         if load_on_init:
             if isinstance(load_on_init, (str, Path)):
@@ -87,8 +91,11 @@ class VectorDB(Expression):
                 self.load()

     def _init_embedding_model(self):
-        if
-            self.
+        if (
+            self.config["EMBEDDING_ENGINE_API_KEY"] is None
+            or self.config["EMBEDDING_ENGINE_API_KEY"] == ""
+        ):
+            self.model = Interface("ExtensityAI/embeddings") # default to local model
         else:
             self.model = lambda x: Symbol(x).embedding

@@ -158,7 +165,10 @@ class VectorDB(Expression):
         if len(documents) == 0:
             return []
         texts = self._to_texts(documents, key)
-        batches = [
+        batches = [
+            texts[index : index + self.batch_size]
+            for index in range(0, len(texts), self.batch_size)
+        ]
         embeddings = []
         for batch in batches:
             embeddings.extend(self._embed_batch(batch))
@@ -186,8 +196,7 @@ class VectorDB(Expression):
             )
         ]
         return [
-            {"document": document, "index": index}
-            for index, document in enumerate(self.documents)
+            {"document": document, "index": index} for index, document in enumerate(self.documents)
         ]

     def add(self, documents, vectors=None):
@@ -222,7 +231,7 @@ class VectorDB(Expression):
             A vector to add to the database.

         """
-        vector =
+        vector = vector if vector is not None else self.embedding_function([document])[0]
         if self.vectors is None:
             self.vectors = np.empty((0, len(vector)), dtype=np.float32)
         elif len(vector) != self.vectors.shape[1]:
@@ -269,7 +278,7 @@ class VectorDB(Expression):
         Clears the database.

         """
-        self.vectors
+        self.vectors = None
         self.documents = []

     def save(self, storage_file: str | None = None):
@@ -296,7 +305,7 @@ class VectorDB(Expression):
         with storage_file.open("wb") as f:
             pickle.dump(data, f)

-    def load(self, storage_file
+    def load(self, storage_file: str | None = None):
         """
         Loads the database from a file.

@@ -326,7 +335,7 @@ class VectorDB(Expression):
         self.vectors = data["vectors"].astype(np.float32) if data["vectors"] is not None else None
         self.documents = data["documents"]

-    def purge(self, index_name
+    def purge(self, index_name: str):
         """
         Purges the database file from your machine, but does not delete the database from memory.
         Use the `clear` method to clear the database from memory.
@@ -371,7 +380,9 @@ class VectorDB(Expression):
            A list of results.

         """
-        assert self.vectors is not None,
+        assert self.vectors is not None, (
+            "Error: Cannot query the database without prior insertion / initialization."
+        )
         top_k = top_k or self.index_top_k
         query_vector = self.embedding_function([query])[0] if vector is None else vector
         if isinstance(query_vector, list):
@@ -380,5 +391,7 @@ class VectorDB(Expression):
             self.vectors, query_vector, top_k=top_k, metric=self.similarity_metric
         )
         if return_similarities:
-            return list(
+            return list(
+                zip([self.documents[index] for index in ranked_results], similarities, strict=False)
+            )
         return [self.documents[index] for index in ranked_results]
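The VectorDB hunks are formatting-only as well (wrapped conditionals, list comprehensions, and restored assignments). A hypothetical usage sketch based on the methods visible above; the call syntax for querying and the `load_on_init` keyword are assumptions for illustration.

# Hypothetical usage: add(), save(), load(), and the top_k / return_similarities
# parameters appear in the diff; the query call below is an assumed entry point.
from symai.extended.vectordb import VectorDB

db = VectorDB(index_name="dataindex", top_k=5)        # mirrors the ClassVar defaults above
db.add(["symbolic programming", "vector databases"])  # embeds and stores the documents
db.save()                                             # pickles vectors and documents to the storage path

db = VectorDB(load_on_init="dataindex")               # assumed: load an existing index at startup
hits = db("vector search", top_k=3, return_similarities=True)  # assumed query entry point
for document, score in hits:                          # (document, similarity) pairs per the zip above
    print(score, document)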
symai/formatter/formatter.py
CHANGED
@@ -21,16 +21,16 @@ class ParagraphFormatter(Expression):

     def split_files(self, input_text=""):
         input_ = input_text.strip()
-        if input_.startswith(
+        if input_.startswith("# ----[FILE_START]") and "# ----[FILE_END]" in input_:
             self._has_file_start = True
             # split text file-wise and create a map of file names and their contents
             files = {}
-            split_text = input_.split(
+            split_text = input_.split("# ----[FILE_START]")
             for _i, file in enumerate(split_text):
                 if not file.strip():
                     continue
-                _, content_file = file.split(
-                content, file_name = content_file.split(
+                _, content_file = file.split("[FILE_CONTENT]:")
+                content, file_name = content_file.split("# ----[FILE_END]")
                 files[file_name.strip()] = content.strip()
         else:
             files = {"": input_}
@@ -40,8 +40,10 @@ class ParagraphFormatter(Expression):
         if file_name and self._has_file_start:
             header = f"# ----[FILE_START]<PART{part}/{total_parts}>{file_name}[FILE_CONTENT]:\n"
             footer = f"\n# ----[FILE_END]{file_name}\n"
-            if
-
+            if (
+                "[FILE_CONTENT]:" in paragraph
+            ): # TODO: remove this if statement after fixing the bug
+                paragraph = paragraph.split("[FILE_CONTENT]:")[-1].strip()
             paragraph = header + paragraph + footer
         return paragraph

@@ -67,7 +69,12 @@ class ParagraphFormatter(Expression):
         input_ = file_content.strip()
         split_text = self.NEWLINES_RE.split(input_)

-        par = [
+        par = [
+            self._add_header_footer(p, file_name, part=i + 1, total_parts=len(split_text))
+            + "\n"
+            for i, p in enumerate(split_text)
+            if p.strip()
+        ]
         # p + "\n" ensures that all lines in the paragraph end with a newline
         # p.strip() == True if paragraph has other characters than whitespace

@@ -85,14 +92,20 @@ class ParagraphFormatter(Expression):
                 # n splits
                 total_parts = (len(words) // max_length + 1) * self._get_total_parts(text)
                 for p, i in enumerate(range(0, len(words), max_length)):
-                    paragraph =
-                    paragraphs.append(
+                    paragraph = " ".join(words[i : i + max_length])
+                    paragraphs.append(
+                        self._add_header_footer(
+                            paragraph, file_name, part=p + 1, total_parts=total_parts
+                        )
+                        + "\n"
+                    )
             else:
                 paragraphs.append(text)
         return paragraphs

-    @core_ext.bind(engine=
-    def _max_tokens(self):
+    @core_ext.bind(engine="embedding", property="max_tokens")
+    def _max_tokens(self):
+        pass

     def split_max_tokens_exceeded(self, input_text: List[str], token_ratio=0.5):
         paragraphs = []
@@ -107,8 +120,13 @@ class ParagraphFormatter(Expression):
                 text_len_ = len(str(text)) // splits_
                 total_parts = (text_len_ + 1) * self._get_total_parts(text)
                 for i in range(splits_):
-                    paragraph = text[i * text_len_:(i + 1) * text_len_]
-                    paragraphs.append(
+                    paragraph = text[i * text_len_ : (i + 1) * text_len_]
+                    paragraphs.append(
+                        self._add_header_footer(
+                            paragraph, file_name, part=i + 1, total_parts=total_parts
+                        )
+                        + "\n"
+                    )
             else:
                 paragraphs.append(text)
         return paragraphs
@@ -126,7 +144,9 @@ class ParagraphFormatter(Expression):
 class SentenceFormatter(Expression):
     def __init__(self, value=None, **kwargs):
         super().__init__(value, **kwargs)
-        self.SENTENCES_RE = re.compile(
+        self.SENTENCES_RE = re.compile(
+            r"[.!?]\n*|[\n]{1,}"
+        ) # Sentence ending characters followed by newlines

     def split_sentences(self, input_text=""):
         input_ = input_text.strip()
@@ -161,13 +181,7 @@ class RegexFormatter(Expression):


 class TextContainerFormatter(Expression):
-    def __init__(
-        self,
-        value: Any = None,
-        key: str ="text",
-        text_split: int = 4,
-        **kwargs
-    ):
+    def __init__(self, value: Any = None, key: str = "text", text_split: int = 4, **kwargs):
         super().__init__(value, **kwargs)
         self.key = key
         self.text_split = text_split
@@ -179,7 +193,7 @@ class TextContainerFormatter(Expression):
         chunks = [text for container in tqdm(containers) for text in self._chunk(container)]
         return self._to_symbol(chunks)

-    def _chunk(self, container:
+    def _chunk(self, container: "TextContainer") -> List[str]:
         text = container.text
         step = len(text) // self.text_split
         splits = []
@@ -189,16 +203,10 @@ class TextContainerFormatter(Expression):
                 # Unify the last chunk with the previous one if necessary
                 splits.append(self._as_str(text[i:], container))
                 break
-            splits.append(self._as_str(text[i:i+step], container))
+            splits.append(self._as_str(text[i : i + step], container))
             i += step
             c += 1
         return splits

-    def _as_str(self, text: str, container:
-        return
-            '---\n'
-            f"id: {container.id}\n"
-            f"page: {container.page}\n"
-            '---\n'
-            f"{text}"
-        )
+    def _as_str(self, text: str, container: "TextContainer") -> str:
+        return f"---\nid: {container.id}\npage: {container.page}\n---\n{text}"