symbolicai 0.20.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. symai/__init__.py +96 -64
  2. symai/backend/base.py +93 -80
  3. symai/backend/engines/drawing/engine_bfl.py +12 -11
  4. symai/backend/engines/drawing/engine_gpt_image.py +108 -87
  5. symai/backend/engines/embedding/engine_llama_cpp.py +25 -28
  6. symai/backend/engines/embedding/engine_openai.py +3 -5
  7. symai/backend/engines/execute/engine_python.py +6 -5
  8. symai/backend/engines/files/engine_io.py +74 -67
  9. symai/backend/engines/imagecaptioning/engine_blip2.py +3 -3
  10. symai/backend/engines/imagecaptioning/engine_llavacpp_client.py +54 -38
  11. symai/backend/engines/index/engine_pinecone.py +23 -24
  12. symai/backend/engines/index/engine_vectordb.py +16 -14
  13. symai/backend/engines/lean/engine_lean4.py +38 -34
  14. symai/backend/engines/neurosymbolic/__init__.py +41 -13
  15. symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_chat.py +262 -182
  16. symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_reasoning.py +263 -191
  17. symai/backend/engines/neurosymbolic/engine_deepseekX_reasoning.py +53 -49
  18. symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +212 -211
  19. symai/backend/engines/neurosymbolic/engine_groq.py +87 -63
  20. symai/backend/engines/neurosymbolic/engine_huggingface.py +21 -24
  21. symai/backend/engines/neurosymbolic/engine_llama_cpp.py +117 -48
  22. symai/backend/engines/neurosymbolic/engine_openai_gptX_chat.py +256 -229
  23. symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py +270 -150
  24. symai/backend/engines/ocr/engine_apilayer.py +6 -8
  25. symai/backend/engines/output/engine_stdout.py +1 -4
  26. symai/backend/engines/search/engine_openai.py +7 -7
  27. symai/backend/engines/search/engine_perplexity.py +5 -5
  28. symai/backend/engines/search/engine_serpapi.py +12 -14
  29. symai/backend/engines/speech_to_text/engine_local_whisper.py +20 -27
  30. symai/backend/engines/symbolic/engine_wolframalpha.py +3 -3
  31. symai/backend/engines/text_to_speech/engine_openai.py +5 -7
  32. symai/backend/engines/text_vision/engine_clip.py +7 -11
  33. symai/backend/engines/userinput/engine_console.py +3 -3
  34. symai/backend/engines/webscraping/engine_requests.py +81 -48
  35. symai/backend/mixin/__init__.py +13 -0
  36. symai/backend/mixin/anthropic.py +4 -2
  37. symai/backend/mixin/deepseek.py +2 -0
  38. symai/backend/mixin/google.py +2 -0
  39. symai/backend/mixin/openai.py +11 -3
  40. symai/backend/settings.py +83 -16
  41. symai/chat.py +101 -78
  42. symai/collect/__init__.py +7 -1
  43. symai/collect/dynamic.py +77 -69
  44. symai/collect/pipeline.py +35 -27
  45. symai/collect/stats.py +75 -63
  46. symai/components.py +198 -169
  47. symai/constraints.py +15 -12
  48. symai/core.py +698 -359
  49. symai/core_ext.py +32 -34
  50. symai/endpoints/api.py +80 -73
  51. symai/extended/.DS_Store +0 -0
  52. symai/extended/__init__.py +46 -12
  53. symai/extended/api_builder.py +11 -8
  54. symai/extended/arxiv_pdf_parser.py +13 -12
  55. symai/extended/bibtex_parser.py +2 -3
  56. symai/extended/conversation.py +101 -90
  57. symai/extended/document.py +17 -10
  58. symai/extended/file_merger.py +18 -13
  59. symai/extended/graph.py +18 -13
  60. symai/extended/html_style_template.py +2 -4
  61. symai/extended/interfaces/blip_2.py +1 -2
  62. symai/extended/interfaces/clip.py +1 -2
  63. symai/extended/interfaces/console.py +7 -1
  64. symai/extended/interfaces/dall_e.py +1 -1
  65. symai/extended/interfaces/flux.py +1 -1
  66. symai/extended/interfaces/gpt_image.py +1 -1
  67. symai/extended/interfaces/input.py +1 -1
  68. symai/extended/interfaces/llava.py +0 -1
  69. symai/extended/interfaces/naive_vectordb.py +7 -8
  70. symai/extended/interfaces/naive_webscraping.py +1 -1
  71. symai/extended/interfaces/ocr.py +1 -1
  72. symai/extended/interfaces/pinecone.py +6 -5
  73. symai/extended/interfaces/serpapi.py +1 -1
  74. symai/extended/interfaces/terminal.py +2 -3
  75. symai/extended/interfaces/tts.py +1 -1
  76. symai/extended/interfaces/whisper.py +1 -1
  77. symai/extended/interfaces/wolframalpha.py +1 -1
  78. symai/extended/metrics/__init__.py +11 -1
  79. symai/extended/metrics/similarity.py +11 -13
  80. symai/extended/os_command.py +17 -16
  81. symai/extended/packages/__init__.py +29 -3
  82. symai/extended/packages/symdev.py +19 -16
  83. symai/extended/packages/sympkg.py +12 -9
  84. symai/extended/packages/symrun.py +21 -19
  85. symai/extended/repo_cloner.py +11 -10
  86. symai/extended/seo_query_optimizer.py +1 -2
  87. symai/extended/solver.py +20 -23
  88. symai/extended/summarizer.py +4 -3
  89. symai/extended/taypan_interpreter.py +10 -12
  90. symai/extended/vectordb.py +99 -82
  91. symai/formatter/__init__.py +9 -1
  92. symai/formatter/formatter.py +12 -16
  93. symai/formatter/regex.py +62 -63
  94. symai/functional.py +176 -122
  95. symai/imports.py +136 -127
  96. symai/interfaces.py +56 -27
  97. symai/memory.py +14 -13
  98. symai/misc/console.py +49 -39
  99. symai/misc/loader.py +5 -3
  100. symai/models/__init__.py +17 -1
  101. symai/models/base.py +269 -181
  102. symai/models/errors.py +0 -1
  103. symai/ops/__init__.py +32 -22
  104. symai/ops/measures.py +11 -15
  105. symai/ops/primitives.py +348 -228
  106. symai/post_processors.py +32 -28
  107. symai/pre_processors.py +39 -41
  108. symai/processor.py +6 -4
  109. symai/prompts.py +59 -45
  110. symai/server/huggingface_server.py +23 -20
  111. symai/server/llama_cpp_server.py +7 -5
  112. symai/shell.py +3 -4
  113. symai/shellsv.py +499 -375
  114. symai/strategy.py +517 -287
  115. symai/symbol.py +111 -116
  116. symai/utils.py +42 -36
  117. {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/METADATA +4 -2
  118. symbolicai-1.0.0.dist-info/RECORD +163 -0
  119. symbolicai-0.20.2.dist-info/RECORD +0 -162
  120. {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/WHEEL +0 -0
  121. {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/entry_points.txt +0 -0
  122. {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/licenses/LICENSE +0 -0
  123. {symbolicai-0.20.2.dist-info → symbolicai-1.0.0.dist-info}/top_level.txt +0 -0
symai/extended/repo_cloner.py CHANGED
@@ -1,9 +1,10 @@
 from pathlib import Path
-from typing import Optional
+
 from git import Repo
 
-from ..symbol import Expression
 from ..backend.settings import HOME_PATH
+from ..symbol import Expression
+from ..utils import UserMessage
 
 
 class RepositoryCloner(Expression):
@@ -16,7 +17,7 @@ class RepositoryCloner(Expression):
         repo_path (Optional[str]): The path where to clone the repository.
         By default it will be at '~/.symai/repos/'.
     """
-    def __init__(self, repo_path: Optional[str] = None, **kwargs):
+    def __init__(self, repo_path: str | None = None, **kwargs):
         super().__init__(**kwargs)
         self.repo_dir = HOME_PATH / 'repos/' if repo_path is None else Path(repo_path)
         if not self.repo_dir.exists():
@@ -35,24 +36,24 @@ class RepositoryCloner(Expression):
         """
         repo_name = url.split('/')[-1].replace('.git', '')
         if (self.repo_dir / repo_name).is_dir():
-            print(f'Repository {repo_name} already exists. Checking for updates...')
+            UserMessage(f'Repository {repo_name} already exists. Checking for updates...')
            try:
                repo = Repo(self.repo_dir / repo_name)
                current = repo.head.commit
                repo.remotes.origin.pull()
                if current != repo.head.commit:
-                    print(f'Repository {repo_name} updated.')
+                    UserMessage(f'Repository {repo_name} updated.')
                else:
-                    print(f'Repository {repo_name} is up-to-date.')
+                    UserMessage(f'Repository {repo_name} is up-to-date.')
            except Exception as e:
-                print(f'An error occurred: {e}')
+                UserMessage(f'An error occurred: {e}')
                raise e
         else:
-            print(f'Cloning repository {repo_name}...')
+            UserMessage(f'Cloning repository {repo_name}...')
            try:
                Repo.clone_from(url, self.repo_dir / repo_name)
-                print(f'Repository {repo_name} cloned successfully.')
+                UserMessage(f'Repository {repo_name} cloned successfully.')
            except Exception as e:
-                print(f'Failed to clone the repository. An error occurred: {e}')
+                UserMessage(f'Failed to clone the repository. An error occurred: {e}')
                raise e
         return str(self.repo_dir / repo_name)
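Across this release, bare `print()` calls give way to `symai.utils.UserMessage`, which also serves as a raising helper via `raise_with` (see the `VectorDB` hunks further down). A minimal sketch of the call pattern; the exact behavior of `UserMessage` beyond what these call sites show is an assumption:

```python
# Call pattern inferred from this diff's call sites (an assumption, not the
# library's documented API): a plain call emits a user-facing message; passing
# raise_with raises that exception type after reporting the message.
from symai.utils import UserMessage

UserMessage('Cloning repository symbolicai...')  # informational
UserMessage('All vectors must have the same length.', raise_with=ValueError)  # report, then raise
```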
symai/extended/seo_query_optimizer.py CHANGED
@@ -4,7 +4,6 @@ from ..pre_processors import PreProcessor
 from ..prompts import Prompt
 from ..symbol import Expression, Symbol
 
-
 SEO_OPTIMIZER_DESCRIPTION = """[Description]
 You are a SEO query optimizer. You are given a list of queries, phrases or sentences and you need to optimize them for search engines.
 Assume your search engines are based on vector databases and contain indices of GitHub repositories, papers and other resources.
@@ -17,7 +16,7 @@ The number of resulting queries should be between 1 and 8 statements separated b
 
 class SEOQueryOptimizerPreProcessor(PreProcessor):
     def __call__(self, argument):
-        return '$> {} =>'.format(str(argument.args[0]))
+        return f'$> {argument.args[0]!s} =>'
 
 
 class SEOQueryOptimizer(Expression):
symai/extended/solver.py CHANGED
@@ -1,16 +1,18 @@
 import argparse
+
 try:
     import z3
 except ImportError:
     z3 = None
 
-from .conversation import Conversation
 from .. import core
 from ..components import Execute
-from ..post_processors import StripPostProcessor, CodeExtractPostProcessor
+from ..post_processors import CodeExtractPostProcessor, StripPostProcessor
 from ..pre_processors import PreProcessor
 from ..prompts import Prompt
 from ..symbol import Expression, Symbol
+from ..utils import UserMessage
+from .conversation import Conversation
 
 #############################################################################################
 #
@@ -57,15 +59,17 @@ $> Max is 2 years older than his brother. In 5 years, Max will be 3 times as old
 
 class ProblemClassifierPreProcessor(PreProcessor):
     def __call__(self, argument):
-        return '$> {}\n//'.format(str(argument.prop.instance))
+        return f'$> {argument.prop.instance!s}\n//'
 
 
 class OptionsPreProcessor(PreProcessor):
     def __call__(self, argument):
-        return '$> :{}: == :{}: =>'.format(str(argument.prop.instance), str(argument.args[0]))
+        return f'$> :{argument.prop.instance!s}: == :{argument.args[0]!s}: =>'
 
 
 class ProblemClassifier(Expression):
+    __hash__ = Expression.__hash__
+
     @property
     def static_context(self):
         return PROBLEM_CATEGORY_CONTEXT
@@ -105,7 +109,7 @@ class ProblemClassifier(Expression):
 
 class FormulaCheckerPreProcessor(PreProcessor):
     def __call__(self, argument):
-        return '$> {} =>'.format(str(argument.prop.instance))
+        return f'$> {argument.prop.instance!s} =>'
 
 
 class FormulaChecker(Expression):
@@ -145,7 +149,7 @@ class FormulaChecker(Expression):
 
 class FormulaWriterPreProcessor(PreProcessor):
     def __call__(self, argument):
-        return '$> {} =>'.format(str(argument.prop.instance))
+        return f'$> {argument.prop.instance!s} =>'
 
 
 class FormulaWriter(Expression):
@@ -212,9 +216,8 @@ class SATSolver(Expression):
             m = S.model()
             # Return the solution
             return m[query]
-        else:
-            print("Cannot solve the puzzle. Returned: " + str(r))
-            return None
+        UserMessage("Cannot solve the puzzle. Returned: " + str(r))
+        return None
 
 
 #############################################################################################
@@ -244,32 +247,26 @@ class Solver(Expression):
         classifier = ProblemClassifier(sym)
         problem = classifier(**kwargs)
 
-        if 'Arithmetics formula' == problem:
-            formula = self.rewrite_formula(sym, **kwargs)
-            print(formula)
-        elif 'Equations' == problem:
+        if problem == 'Arithmetics formula' or problem == 'Equations':
             formula = self.rewrite_formula(sym, **kwargs)
-            print(formula)
-        elif 'Implication and logical expressions' == problem:
+            UserMessage(str(formula))
+        elif problem == 'Implication and logical expressions':
             res = self.conv(sym, **kwargs)
             code = self.pp(str(res), None, tag="python")
             formula = self.solver(code, lambda: 'German')
-            print(formula)
-        elif 'Probability and statistics' == problem:
-            raise NotImplementedError('This feature is not yet implemented.')
-        elif 'Linear algebra' == problem:
-            raise NotImplementedError('This feature is not yet implemented.')
-        elif 'Linguistic problem with relations' == problem:
-            raise NotImplementedError('This feature is not yet implemented.')
+            UserMessage(str(formula))
+        elif problem == 'Probability and statistics' or problem == 'Linear algebra' or problem == 'Linguistic problem with relations':
+            UserMessage('This feature is not yet implemented.', raise_with=NotImplementedError)
         else:
             return "Sorry, something went wrong. Please check if your backend is available and try again or report an issue to the devs. :("
+        return None
 
 
 def process_query(args) -> None:
     query = args.query
     solver = Solver()
     res = solver(query)
-    print(res)
+    UserMessage(str(res))
 
 
 def run() -> None:
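The new `__hash__ = Expression.__hash__` line in `ProblemClassifier` is the standard guard against Python's implicit-unhashability rule: any class that (re)defines `__eq__` without also defining `__hash__` gets `__hash__ = None`. A self-contained illustration with hypothetical class names; whether `ProblemClassifier` actually hits this rule through `Expression`'s comparison overloads is an assumption:

```python
# Defining __eq__ silently sets __hash__ = None unless __hash__ is restored,
# which is what the __hash__ = Expression.__hash__ rebind above accomplishes.
class Base:
    def __hash__(self):
        return id(self)

class Broken(Base):
    def __eq__(self, other):  # __hash__ becomes None here
        return isinstance(other, Broken)

class Fixed(Base):
    __hash__ = Base.__hash__  # re-point to the inherited hash
    def __eq__(self, other):
        return isinstance(other, Fixed)

hash(Fixed())     # works
# hash(Broken())  # TypeError: unhashable type: 'Broken'
```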
symai/extended/summarizer.py CHANGED
@@ -1,13 +1,14 @@
-from typing import List
 
 from ..components import Clean, Outline, Sequence, Stream, Translate
 from ..symbol import Expression, Symbol
 
 
 class Summarizer(Expression):
-    def __init__(self, filters: List[Expression] = [], **kwargs):
+    def __init__(self, filters: list[Expression] | None = None, **kwargs):
+        if filters is None:
+            filters = []
         super().__init__(**kwargs)
-        filters = filters if isinstance(filters, List) or isinstance(filters, tuple) else [filters]
+        filters = filters if isinstance(filters, (list, tuple)) else [filters]
         self.data_stream = Stream(Sequence(
             Clean(),
             Translate(),
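The `Summarizer.__init__` change swaps a shared mutable default (`filters: List[Expression] = []`) for the `None` sentinel idiom. A standalone illustration of the bug class this avoids:

```python
# A mutable default is built once at definition time and shared by all calls;
# the None sentinel gives each call its own fresh list.
def broken(items=[]):
    items.append(1)
    return items

def fixed(items=None):
    if items is None:
        items = []
    items.append(1)
    return items

broken(); assert broken() == [1, 1]  # state leaks between calls
fixed();  assert fixed() == [1]      # each call starts clean
```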
symai/extended/taypan_interpreter.py CHANGED
@@ -1,25 +1,24 @@
-import os
 import pathlib
 
-from ..core import *
-from ..pre_processors import PreProcessor
+from ..core import zero_shot
 from ..post_processors import CodeExtractPostProcessor
+from ..pre_processors import PreProcessor
 from ..symbol import Expression, Symbol
 
 
 def create_template():
-    package_path = os.path.dirname(pathlib.Path(__file__).parent.absolute())
+    package_path = pathlib.Path(__file__).parent.absolute()
 
 
-    with open(os.path.join(package_path, 'symbol.py'), 'r') as f:
+    with (package_path / 'symbol.py').open() as f:
         SYMBOL_API = f.read()
 
 
-    with open(os.path.join(package_path, 'components.py'), 'r') as f:
+    with (package_path / 'components.py').open() as f:
         COMPONENTS_API = f.read()
 
 
-    TAYPAN_DESCRIPTION = """[Description]
+    return f"""[Description]
 You are a programming language re-writing system from Taypan (high-level general-purpose programming language based on neuro-symbolic virtual machine) to Python interpreter, analogous to the relation between Scala and Java is the relation of Taypan to Python.
 
 All code from Python is valid Taypan code, but not all code from Taypan is valid Python code.
@@ -104,18 +103,17 @@ def create_template():
 [SymboliAI API]
 
 - components `from symai.components import *`:
-{0}
+{COMPONENTS_API}
 
 - symbol `from symai.symbol import *`:
-{1}
+{SYMBOL_API}
 
-""".format(COMPONENTS_API, SYMBOL_API)
-    return TAYPAN_DESCRIPTION
+"""
 
 
 class TaypanPreProcessor(PreProcessor):
     def __call__(self, argument):
-        return '```taypan\n{}\n =>'.format(str(argument.args[0]))
+        return f'```taypan\n{argument.args[0]!s}\n =>'
 
 
 class TaypanInterpreter(Expression):
symai/extended/vectordb.py CHANGED
@@ -1,34 +1,40 @@
 import gzip
 import logging
-import os
 import pickle
+from collections.abc import Mapping
 from copy import deepcopy
 from pathlib import Path
+from typing import Any, ClassVar
 
 import numpy as np
 
 from ..backend.settings import HOME_PATH, SYMAI_CONFIG
 from ..interfaces import Interface
 from ..symbol import Expression, Symbol
-from ..utils import CustomUserWarning
-from .metrics import (adams_similarity, cosine_similarity,
-                      derridaean_similarity, dot_product, euclidean_metric,
-                      ranking_algorithm_sort)
+from ..utils import UserMessage
+from .metrics import (
+    adams_similarity,
+    cosine_similarity,
+    derridaean_similarity,
+    dot_product,
+    euclidean_metric,
+    ranking_algorithm_sort,
+)
 
 logging.getLogger('sentence_transformers').setLevel(logging.WARNING)
 logging.getLogger('datasets').setLevel(logging.WARNING)
 
 
 class VectorDB(Expression):
-    _default_documents = []
-    _default_vectors = None
-    _default_batch_size = 2048
-    _default_similarity_metric = "cosine"
-    _default_embedding_function = None
-    _default_index_dims = 768
-    _default_top_k = 5
-    _default_storage_path = os.path.join(HOME_PATH, "localdb")
-    _default_index_name = "dataindex"
+    _default_documents: ClassVar[list] = []
+    _default_vectors: ClassVar[np.ndarray | None] = None
+    _default_batch_size: ClassVar[int] = 2048
+    _default_similarity_metric: ClassVar[str] = "cosine"
+    _default_embedding_function: ClassVar[object | None] = None
+    _default_index_dims: ClassVar[int] = 768
+    _default_top_k: ClassVar[int] = 5
+    _default_storage_path: ClassVar[Path] = HOME_PATH / "localdb"
+    _default_index_name: ClassVar[str] = "dataindex"
     def __init__(
         self,
         documents=_default_documents,
@@ -71,12 +77,11 @@ class VectorDB(Expression):
         elif "adams" in similarity_metric:
             self.similarity_metric = adams_similarity
         else:
-            CustomUserWarning(f"Similarity metric not supported. Please use either 'dot', 'cosine', 'euclidean', 'adams', or 'derrida'.", raise_with=ValueError)
+            UserMessage("Similarity metric not supported. Please use either 'dot', 'cosine', 'euclidean', 'adams', or 'derrida'.", raise_with=ValueError)
 
         if load_on_init:
-            # If load_on_init is a string, use it as the storage file
-            if isinstance(load_on_init, str):
-                path = os.path.join(load_on_init, f"{self.index_name}.pkl")
+            if isinstance(load_on_init, (str, Path)):
+                path = Path(load_on_init) / f"{self.index_name}.pkl"
                 self.load(path)
             else:
                 self.load()
@@ -87,6 +92,52 @@ class VectorDB(Expression):
         else:
             self.model = lambda x: Symbol(x).embedding
 
+    def _unwrap_documents(self, documents):
+        if isinstance(documents, Symbol):
+            return documents.value
+        return documents
+
+    def _to_texts(self, documents, key):
+        if not isinstance(documents, list):
+            self._raise_texts_unassigned()
+        if len(documents) == 0:
+            return []
+        first_document = documents[0]
+        if isinstance(first_document, dict):
+            return self._texts_from_dicts(documents, key)
+        if isinstance(first_document, str):
+            return documents
+        return self._raise_texts_unassigned()
+
+    def _texts_from_dicts(self, documents, key):
+        if isinstance(key, str):
+            key_chain = key.split(".") if "." in key else [key]
+            return [self._resolve_key_chain(doc, key_chain).replace("\n", " ") for doc in documents]
+        if key is None:
+            return [
+                ", ".join([f"{dict_key}: {value}" for dict_key, value in doc.items()])
+                for doc in documents
+            ]
+        return self._raise_texts_unassigned()
+
+    def _resolve_key_chain(self, document, key_chain):
+        current_document = document
+        for chain_key in key_chain:
+            current_document = current_document[chain_key]
+        return current_document
+
+    def _embed_batch(self, batch):
+        emb = self.model(batch)
+        if len(emb.shape) == 1:
+            return [emb]
+        if len(emb.shape) == 2:
+            return [emb[index] for index in range(emb.shape[0])]
+        return UserMessage("Embeddings must be a 1D or 2D array.", raise_with=ValueError)
+
+    def _raise_texts_unassigned(self):
+        error_message = "local variable 'texts' referenced before assignment"
+        raise UnboundLocalError(error_message)
+
     def _get_embedding(self, documents, key=None):
         """
         Get embeddings from a list of documents.
@@ -103,48 +154,14 @@ class VectorDB(Expression):
         embeddings : numpy.ndarray
             A numpy array of embeddings.
         """
-        # unwrap the documents if they are a Symbol
-        if isinstance(documents, Symbol):
-            documents = documents.value
-        # if the documents are a list of Symbols, unwrap them
+        documents = self._unwrap_documents(documents)
         if len(documents) == 0:
             return []
-        if isinstance(documents, list):
-            # If the documents are a list of dictionaries, extract the text from the dictionary
-            if isinstance(documents[0], dict):
-                texts = []
-                # If a key is specified, extract the text from the dictionary using the key
-                if isinstance(key, str):
-                    if "." in key:
-                        key_chain = key.split(".")
-                    else:
-                        key_chain = [key]
-                    for doc in documents:
-                        for key in key_chain:
-                            doc = doc[key]
-                        texts.append(doc.replace("\n", " "))
-                # If no key is specified, extract the text from the dictionary using all keys
-                elif key is None:
-                    for doc in documents:
-                        text = ", ".join([f"{key}: {value}" for key, value in doc.items()])
-                        texts.append(text)
-            # If the documents are a list of strings, use the strings as the documents
-            elif isinstance(documents[0], str):
-                texts = documents
-        # If the documents are a list of lists, use the lists as the documents
-        batches = [texts[i : i + self.batch_size] for i in range(0, len(texts), self.batch_size)]
+        texts = self._to_texts(documents, key)
+        batches = [texts[index : index + self.batch_size] for index in range(0, len(texts), self.batch_size)]
         embeddings = []
-        # Embed the documents in batches
         for batch in batches:
-            # Extend the embeddings list with the embeddings from the batch
-            emb = self.model(batch)
-            if len(emb.shape) == 1:
-                embeddings.append(emb)
-            elif len(emb.shape) == 2:
-                for i in range(emb.shape[0]):
-                    embeddings.append(emb[i])
-            else:
-                CustomUserWarning("Embeddings must be a 1D or 2D array.", raise_with=ValueError)
+            embeddings.extend(self._embed_batch(batch))
         return embeddings
 
     def dict(self, vectors=False):
@@ -165,7 +182,7 @@ class VectorDB(Expression):
             return [
                 {"document": document, "vector": vector.tolist(), "index": index}
                 for index, (document, vector) in enumerate(
-                    zip(self.documents, self.vectors)
+                    zip(self.documents, self.vectors, strict=False)
                 )
             ]
         return [
@@ -191,8 +208,9 @@ class VectorDB(Expression):
        if not isinstance(documents, list):
            return self.add_document(documents, vectors)
        self.add_documents(documents, vectors)
+        return None
 
-    def add_document(self, document: dict, vector=None):
+    def add_document(self, document: Mapping[str, Any], vector=None):
        """
        Adds a document to the database.
 
@@ -208,9 +226,9 @@ class VectorDB(Expression):
        if self.vectors is None:
            self.vectors = np.empty((0, len(vector)), dtype=np.float32)
        elif len(vector) != self.vectors.shape[1]:
-            CustomUserWarning("All vectors must have the same length.", raise_with=ValueError)
+            UserMessage("All vectors must have the same length.", raise_with=ValueError)
        # convert the vector to a numpy array if it is not already
-        if type(vector) == list:
+        if isinstance(vector, list):
            vector = np.array(vector)
        self.vectors = np.vstack([self.vectors, vector]).astype(np.float32)
        self.documents.append(document)
@@ -243,7 +261,7 @@ class VectorDB(Expression):
        if not documents:
            return
        vectors = vectors or np.array(self.embedding_function(documents)).astype(np.float32)
-        for vector, document in zip(vectors, documents):
+        for vector, document in zip(vectors, documents, strict=False):
            self.add_document(document, vector)
 
    def clear(self):
@@ -254,7 +272,7 @@ class VectorDB(Expression):
        self.vectors = None
        self.documents = []
 
-    def save(self, storage_file: str = None):
+    def save(self, storage_file: str | None = None):
        """
        Saves the database to a file.
 
@@ -265,20 +283,20 @@ class VectorDB(Expression):
 
        """
        if storage_file is None:
-            # use path to home directory by default
-            storage_path = os.path.join(HOME_PATH, "localdb")
-            os.makedirs(storage_path, exist_ok=True)
-            storage_file = os.path.join(storage_path, f"{self.index_name}.pkl")
+            storage_file = HOME_PATH / "localdb" / f"{self.index_name}.pkl"
+            storage_file.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            storage_file = Path(storage_file)
 
        data = {"vectors": self.vectors, "documents": self.documents}
-        if storage_file.endswith(".gz"):
+        if storage_file.suffix == ".gz":
            with gzip.open(storage_file, "wb") as f:
                pickle.dump(data, f)
        else:
-            with open(storage_file, "wb") as f:
+            with storage_file.open("wb") as f:
                pickle.dump(data, f)
 
-    def load(self, storage_file : str = None):
+    def load(self, storage_file : str | None = None):
        """
        Loads the database from a file.
 
@@ -289,21 +307,20 @@ class VectorDB(Expression):
 
        """
        if storage_file is None:
-            # use path to home directory by default
-            storage_path = os.path.join(HOME_PATH, "localdb")
-            # create dir on first load if never used
-            os.makedirs(storage_path, exist_ok=True)
-            storage_file = os.path.join(storage_path, f"{self.index_name}.pkl")
+            storage_file = HOME_PATH / "localdb" / f"{self.index_name}.pkl"
+            storage_file.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            storage_file = Path(storage_file)
 
        # return since nothing to load
-        if not os.path.exists(storage_file):
+        if not storage_file.exists():
            return
 
-        if storage_file.endswith(".gz"):
+        if storage_file.suffix == ".gz":
            with gzip.open(storage_file, "rb") as f:
                data = pickle.load(f)
        else:
-            with open(storage_file, "rb") as f:
+            with storage_file.open("rb") as f:
                data = pickle.load(f)
 
        self.vectors = data["vectors"].astype(np.float32) if data["vectors"] is not None else None
@@ -328,11 +345,11 @@ class VectorDB(Expression):
        # use path to home directory by default
        storage_path = symai_folder / "localdb"
        # create dir on first load if never used
-        os.makedirs(storage_path, exist_ok=True)
+        storage_path.mkdir(parents=True, exist_ok=True)
        storage_file = storage_path / f"{index_name}.pkl"
        if storage_file.exists():
            # remove the file
-            os.remove(storage_file)
+            storage_file.unlink()
        self.clear()
 
    def forward(self, query=None, vector=None, top_k=None, return_similarities=True):
@@ -354,14 +371,14 @@ class VectorDB(Expression):
        A list of results.
 
        """
-        assert self.vectors is not None, f"Error: Cannot query the database without prior insertion / initialization."
+        assert self.vectors is not None, "Error: Cannot query the database without prior insertion / initialization."
        top_k = top_k or self.index_top_k
        query_vector = self.embedding_function([query])[0] if vector is None else vector
-        if type(query_vector) == list:
+        if isinstance(query_vector, list):
            query_vector = np.array(query_vector)
        ranked_results, similarities = ranking_algorithm_sort(
            self.vectors, query_vector, top_k=top_k, metric=self.similarity_metric
        )
        if return_similarities:
-            return list(zip([self.documents[index] for index in ranked_results], similarities))
+            return list(zip([self.documents[index] for index in ranked_results], similarities, strict=False))
        return [self.documents[index] for index in ranked_results]
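Taken together, the `VectorDB` hunks keep the public surface (`add_documents`, `save`, `load`, `forward`) intact while routing all path handling through `pathlib`. A minimal usage sketch against the signatures and call sites visible above; anything beyond them (e.g. that a bare constructor works without extra backend configuration) is an assumption:

```python
# Sketch based on what this diff shows: add_documents(documents, vectors=None),
# save/load(storage_file=None), and forward(query, vector, top_k, return_similarities).
from symai.extended.vectordb import VectorDB

db = VectorDB()  # class defaults: cosine metric, index_name "dataindex"
db.add_documents(["first document", "second document"])
db.save()        # defaults to HOME_PATH / "localdb" / "dataindex.pkl"
hits = db(query="first", top_k=1)  # returns (document, similarity) pairs
```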
symai/formatter/__init__.py CHANGED
@@ -1,2 +1,10 @@
+from .formatter import ParagraphFormatter, RegexFormatter, SentenceFormatter, TextContainerFormatter
 from .regex import CHUNK_REGEX
-from .formatter import ParagraphFormatter, SentenceFormatter, RegexFormatter, TextContainerFormatter
+
+__all__ = [
+    "CHUNK_REGEX",
+    "ParagraphFormatter",
+    "RegexFormatter",
+    "SentenceFormatter",
+    "TextContainerFormatter",
+]
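With the explicit `__all__`, a star import from `symai.formatter` now exposes exactly five names. A quick check, assuming the package and its backends import cleanly:

```python
# After the star import, the module-level public names match __all__ exactly.
from symai.formatter import *  # noqa: F403

names = dir()  # snapshot module scope (dir() inside a genexp would see the wrong frame)
public = sorted(n for n in names if not n.startswith("_"))
assert public == ["CHUNK_REGEX", "ParagraphFormatter", "RegexFormatter",
                  "SentenceFormatter", "TextContainerFormatter"]
```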
symai/formatter/formatter.py CHANGED
@@ -1,12 +1,16 @@
 import re
+from typing import TYPE_CHECKING
 
 from beartype import beartype
 from beartype.typing import Any, Dict, List
 from tqdm import tqdm
 
-from .regex import CHUNK_REGEX
 from .. import core_ext
 from ..symbol import Expression, Symbol
+from .regex import CHUNK_REGEX
+
+if TYPE_CHECKING:
+    from ..backend.engines.files.engine_io import TextContainer
 
 
 class ParagraphFormatter(Expression):
@@ -22,7 +26,7 @@ class ParagraphFormatter(Expression):
        # split text file-wise and create a map of file names and their contents
        files = {}
        split_text = input_.split('# ----[FILE_START]')
-        for i, file in enumerate(split_text):
+        for _i, file in enumerate(split_text):
            if not file.strip():
                continue
            _, content_file = file.split('[FILE_CONTENT]:')
@@ -109,7 +113,7 @@ class ParagraphFormatter(Expression):
            paragraphs.append(text)
        return paragraphs
 
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
        sym = self._to_symbol(sym)
        # split text paragraph-wise and index each paragraph separately
        self.elements = self.split_files(sym.value)
@@ -128,13 +132,9 @@ class SentenceFormatter(Expression):
        input_ = input_text.strip()
        split_text = self.SENTENCES_RE.split(input_) # regex splitting
 
-        sentences = [s.strip() + ".\n" for s in split_text if s.strip()]
-        # s.strip() + ".\n" ensures that all lines in the sentence end with a period and newline
-        # s.strip() == True if sentence has other characters than whitespace
+        return [s.strip() + ".\n" for s in split_text if s.strip()]
 
-        return sentences
-
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
        sym = self._to_symbol(sym)
        # split text sentence-wise and index each sentence separately
        self.elements = self.split_sentences(sym.value)
@@ -151,12 +151,9 @@ class RegexFormatter(Expression):
        input_ = input_text.strip()
        split_text = self.SENTENCES_RE.split(input_) # regex splitting
 
-        chunks = [s.strip() for s in split_text if s.strip()]
-        # s.strip() == True if sentence has other characters than whitespace
-
-        return chunks
+        return [s.strip() for s in split_text if s.strip()]
 
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
        sym = self._to_symbol(sym)
        # split text sentence-wise and index each sentence separately
        self.elements = self.split_sentences(sym.value)
@@ -176,7 +173,7 @@ class TextContainerFormatter(Expression):
        self.text_split = text_split
 
    @beartype
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
        if isinstance(sym.value, list):
            containers = [container for pdf in sym.value for container in pdf]
            chunks = [text for container in tqdm(containers) for text in self._chunk(container)]
@@ -205,4 +202,3 @@ class TextContainerFormatter(Expression):
            '---\n'
            f"{text}"
        )
-