symbolicai 0.21.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. symai/__init__.py +269 -173
  2. symai/backend/base.py +123 -110
  3. symai/backend/engines/drawing/engine_bfl.py +45 -44
  4. symai/backend/engines/drawing/engine_gpt_image.py +112 -97
  5. symai/backend/engines/embedding/engine_llama_cpp.py +63 -52
  6. symai/backend/engines/embedding/engine_openai.py +25 -21
  7. symai/backend/engines/execute/engine_python.py +19 -18
  8. symai/backend/engines/files/engine_io.py +104 -95
  9. symai/backend/engines/imagecaptioning/engine_blip2.py +28 -24
  10. symai/backend/engines/imagecaptioning/engine_llavacpp_client.py +102 -79
  11. symai/backend/engines/index/engine_pinecone.py +124 -97
  12. symai/backend/engines/index/engine_qdrant.py +1011 -0
  13. symai/backend/engines/index/engine_vectordb.py +84 -56
  14. symai/backend/engines/lean/engine_lean4.py +96 -52
  15. symai/backend/engines/neurosymbolic/__init__.py +41 -13
  16. symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_chat.py +330 -248
  17. symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_reasoning.py +329 -264
  18. symai/backend/engines/neurosymbolic/engine_cerebras.py +328 -0
  19. symai/backend/engines/neurosymbolic/engine_deepseekX_reasoning.py +118 -88
  20. symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +344 -299
  21. symai/backend/engines/neurosymbolic/engine_groq.py +173 -115
  22. symai/backend/engines/neurosymbolic/engine_huggingface.py +114 -84
  23. symai/backend/engines/neurosymbolic/engine_llama_cpp.py +144 -118
  24. symai/backend/engines/neurosymbolic/engine_openai_gptX_chat.py +415 -307
  25. symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py +394 -231
  26. symai/backend/engines/ocr/engine_apilayer.py +23 -27
  27. symai/backend/engines/output/engine_stdout.py +10 -13
  28. symai/backend/engines/{webscraping → scrape}/engine_requests.py +101 -54
  29. symai/backend/engines/search/engine_openai.py +100 -88
  30. symai/backend/engines/search/engine_parallel.py +665 -0
  31. symai/backend/engines/search/engine_perplexity.py +44 -45
  32. symai/backend/engines/search/engine_serpapi.py +37 -34
  33. symai/backend/engines/speech_to_text/engine_local_whisper.py +54 -51
  34. symai/backend/engines/symbolic/engine_wolframalpha.py +15 -9
  35. symai/backend/engines/text_to_speech/engine_openai.py +20 -26
  36. symai/backend/engines/text_vision/engine_clip.py +39 -37
  37. symai/backend/engines/userinput/engine_console.py +5 -6
  38. symai/backend/mixin/__init__.py +13 -0
  39. symai/backend/mixin/anthropic.py +48 -38
  40. symai/backend/mixin/deepseek.py +6 -5
  41. symai/backend/mixin/google.py +7 -4
  42. symai/backend/mixin/groq.py +2 -4
  43. symai/backend/mixin/openai.py +140 -110
  44. symai/backend/settings.py +87 -20
  45. symai/chat.py +216 -123
  46. symai/collect/__init__.py +7 -1
  47. symai/collect/dynamic.py +80 -70
  48. symai/collect/pipeline.py +67 -51
  49. symai/collect/stats.py +161 -109
  50. symai/components.py +707 -360
  51. symai/constraints.py +24 -12
  52. symai/core.py +1857 -1233
  53. symai/core_ext.py +83 -80
  54. symai/endpoints/api.py +166 -104
  55. symai/extended/.DS_Store +0 -0
  56. symai/extended/__init__.py +46 -12
  57. symai/extended/api_builder.py +29 -21
  58. symai/extended/arxiv_pdf_parser.py +23 -14
  59. symai/extended/bibtex_parser.py +9 -6
  60. symai/extended/conversation.py +156 -126
  61. symai/extended/document.py +50 -30
  62. symai/extended/file_merger.py +57 -14
  63. symai/extended/graph.py +51 -32
  64. symai/extended/html_style_template.py +18 -14
  65. symai/extended/interfaces/blip_2.py +2 -3
  66. symai/extended/interfaces/clip.py +4 -3
  67. symai/extended/interfaces/console.py +9 -1
  68. symai/extended/interfaces/dall_e.py +4 -2
  69. symai/extended/interfaces/file.py +2 -0
  70. symai/extended/interfaces/flux.py +4 -2
  71. symai/extended/interfaces/gpt_image.py +16 -7
  72. symai/extended/interfaces/input.py +2 -1
  73. symai/extended/interfaces/llava.py +1 -2
  74. symai/extended/interfaces/{naive_webscraping.py → naive_scrape.py} +4 -3
  75. symai/extended/interfaces/naive_vectordb.py +9 -10
  76. symai/extended/interfaces/ocr.py +5 -3
  77. symai/extended/interfaces/openai_search.py +2 -0
  78. symai/extended/interfaces/parallel.py +30 -0
  79. symai/extended/interfaces/perplexity.py +2 -0
  80. symai/extended/interfaces/pinecone.py +12 -9
  81. symai/extended/interfaces/python.py +2 -0
  82. symai/extended/interfaces/serpapi.py +3 -1
  83. symai/extended/interfaces/terminal.py +2 -4
  84. symai/extended/interfaces/tts.py +3 -2
  85. symai/extended/interfaces/whisper.py +3 -2
  86. symai/extended/interfaces/wolframalpha.py +2 -1
  87. symai/extended/metrics/__init__.py +11 -1
  88. symai/extended/metrics/similarity.py +14 -13
  89. symai/extended/os_command.py +39 -29
  90. symai/extended/packages/__init__.py +29 -3
  91. symai/extended/packages/symdev.py +51 -43
  92. symai/extended/packages/sympkg.py +41 -35
  93. symai/extended/packages/symrun.py +63 -50
  94. symai/extended/repo_cloner.py +14 -12
  95. symai/extended/seo_query_optimizer.py +15 -13
  96. symai/extended/solver.py +116 -91
  97. symai/extended/summarizer.py +12 -10
  98. symai/extended/taypan_interpreter.py +17 -18
  99. symai/extended/vectordb.py +122 -92
  100. symai/formatter/__init__.py +9 -1
  101. symai/formatter/formatter.py +51 -47
  102. symai/formatter/regex.py +70 -69
  103. symai/functional.py +325 -176
  104. symai/imports.py +190 -147
  105. symai/interfaces.py +57 -28
  106. symai/memory.py +45 -35
  107. symai/menu/screen.py +28 -19
  108. symai/misc/console.py +66 -56
  109. symai/misc/loader.py +8 -5
  110. symai/models/__init__.py +17 -1
  111. symai/models/base.py +395 -236
  112. symai/models/errors.py +1 -2
  113. symai/ops/__init__.py +32 -22
  114. symai/ops/measures.py +24 -25
  115. symai/ops/primitives.py +1149 -731
  116. symai/post_processors.py +58 -50
  117. symai/pre_processors.py +86 -82
  118. symai/processor.py +21 -13
  119. symai/prompts.py +764 -685
  120. symai/server/huggingface_server.py +135 -49
  121. symai/server/llama_cpp_server.py +21 -11
  122. symai/server/qdrant_server.py +206 -0
  123. symai/shell.py +100 -42
  124. symai/shellsv.py +700 -492
  125. symai/strategy.py +630 -346
  126. symai/symbol.py +368 -322
  127. symai/utils.py +100 -78
  128. {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/METADATA +22 -10
  129. symbolicai-1.1.0.dist-info/RECORD +168 -0
  130. symbolicai-0.21.0.dist-info/RECORD +0 -162
  131. {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/WHEEL +0 -0
  132. {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/entry_points.txt +0 -0
  133. {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/licenses/LICENSE +0 -0
  134. {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/top_level.txt +0 -0

symai/extended/vectordb.py

@@ -1,34 +1,41 @@
 import gzip
 import logging
-import os
 import pickle
+from collections.abc import Mapping
 from copy import deepcopy
 from pathlib import Path
+from typing import Any, ClassVar
 
 import numpy as np
 
 from ..backend.settings import HOME_PATH, SYMAI_CONFIG
 from ..interfaces import Interface
 from ..symbol import Expression, Symbol
-from ..utils import CustomUserWarning
-from .metrics import (adams_similarity, cosine_similarity,
-                      derridaean_similarity, dot_product, euclidean_metric,
-                      ranking_algorithm_sort)
+from ..utils import UserMessage
+from .metrics import (
+    adams_similarity,
+    cosine_similarity,
+    derridaean_similarity,
+    dot_product,
+    euclidean_metric,
+    ranking_algorithm_sort,
+)
 
-logging.getLogger('sentence_transformers').setLevel(logging.WARNING)
-logging.getLogger('datasets').setLevel(logging.WARNING)
+logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
+logging.getLogger("datasets").setLevel(logging.WARNING)
 
 
 class VectorDB(Expression):
-    _default_documents = []
-    _default_vectors = None
-    _default_batch_size = 2048
-    _default_similarity_metric = "cosine"
-    _default_embedding_function = None
-    _default_index_dims = 768
-    _default_top_k = 5
-    _default_storage_path = os.path.join(HOME_PATH, "localdb")
-    _default_index_name = "dataindex"
+    _default_documents: ClassVar[list] = []
+    _default_vectors: ClassVar[np.ndarray | None] = None
+    _default_batch_size: ClassVar[int] = 2048
+    _default_similarity_metric: ClassVar[str] = "cosine"
+    _default_embedding_function: ClassVar[object | None] = None
+    _default_index_dims: ClassVar[int] = 768
+    _default_top_k: ClassVar[int] = 5
+    _default_storage_path: ClassVar[Path] = HOME_PATH / "localdb"
+    _default_index_name: ClassVar[str] = "dataindex"
+
     def __init__(
         self,
         documents=_default_documents,
@@ -40,7 +47,7 @@ class VectorDB(Expression):
         index_dims=_default_index_dims,
         top_k=_default_top_k,
         index_name=_default_index_name,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.config = deepcopy(SYMAI_CONFIG)
@@ -71,22 +78,73 @@ class VectorDB(Expression):
         elif "adams" in similarity_metric:
             self.similarity_metric = adams_similarity
         else:
-            CustomUserWarning(f"Similarity metric not supported. Please use either 'dot', 'cosine', 'euclidean', 'adams', or 'derrida'.", raise_with=ValueError)
+            UserMessage(
+                "Similarity metric not supported. Please use either 'dot', 'cosine', 'euclidean', 'adams', or 'derrida'.",
+                raise_with=ValueError,
+            )
 
         if load_on_init:
-            # If load_on_init is a string, use it as the storage file
-            if isinstance(load_on_init, str):
-                path = os.path.join(load_on_init, f"{self.index_name}.pkl")
+            if isinstance(load_on_init, (str, Path)):
+                path = Path(load_on_init) / f"{self.index_name}.pkl"
                 self.load(path)
             else:
                 self.load()
 
     def _init_embedding_model(self):
-        if self.config['EMBEDDING_ENGINE_API_KEY'] is None or self.config['EMBEDDING_ENGINE_API_KEY'] == '':
-            self.model = Interface('ExtensityAI/embeddings') # default to local model
+        if (
+            self.config["EMBEDDING_ENGINE_API_KEY"] is None
+            or self.config["EMBEDDING_ENGINE_API_KEY"] == ""
+        ):
+            self.model = Interface("ExtensityAI/embeddings") # default to local model
         else:
             self.model = lambda x: Symbol(x).embedding
 
+    def _unwrap_documents(self, documents):
+        if isinstance(documents, Symbol):
+            return documents.value
+        return documents
+
+    def _to_texts(self, documents, key):
+        if not isinstance(documents, list):
+            self._raise_texts_unassigned()
+        if len(documents) == 0:
+            return []
+        first_document = documents[0]
+        if isinstance(first_document, dict):
+            return self._texts_from_dicts(documents, key)
+        if isinstance(first_document, str):
+            return documents
+        return self._raise_texts_unassigned()
+
+    def _texts_from_dicts(self, documents, key):
+        if isinstance(key, str):
+            key_chain = key.split(".") if "." in key else [key]
+            return [self._resolve_key_chain(doc, key_chain).replace("\n", " ") for doc in documents]
+        if key is None:
+            return [
+                ", ".join([f"{dict_key}: {value}" for dict_key, value in doc.items()])
+                for doc in documents
+            ]
+        return self._raise_texts_unassigned()
+
+    def _resolve_key_chain(self, document, key_chain):
+        current_document = document
+        for chain_key in key_chain:
+            current_document = current_document[chain_key]
+        return current_document
+
+    def _embed_batch(self, batch):
+        emb = self.model(batch)
+        if len(emb.shape) == 1:
+            return [emb]
+        if len(emb.shape) == 2:
+            return [emb[index] for index in range(emb.shape[0])]
+        return UserMessage("Embeddings must be a 1D or 2D array.", raise_with=ValueError)
+
+    def _raise_texts_unassigned(self):
+        error_message = "local variable 'texts' referenced before assignment"
+        raise UnboundLocalError(error_message)
+
     def _get_embedding(self, documents, key=None):
         """
         Get embeddings from a list of documents.
@@ -103,48 +161,17 @@ class VectorDB(Expression):
         embeddings : numpy.ndarray
             A numpy array of embeddings.
         """
-        # unwrap the documents if they are a Symbol
-        if isinstance(documents, Symbol):
-            documents = documents.value
-        # if the documents are a list of Symbols, unwrap them
+        documents = self._unwrap_documents(documents)
         if len(documents) == 0:
             return []
-        if isinstance(documents, list):
-            # If the documents are a list of dictionaries, extract the text from the dictionary
-            if isinstance(documents[0], dict):
-                texts = []
-                # If a key is specified, extract the text from the dictionary using the key
-                if isinstance(key, str):
-                    if "." in key:
-                        key_chain = key.split(".")
-                    else:
-                        key_chain = [key]
-                    for doc in documents:
-                        for key in key_chain:
-                            doc = doc[key]
-                        texts.append(doc.replace("\n", " "))
-                # If no key is specified, extract the text from the dictionary using all keys
-                elif key is None:
-                    for doc in documents:
-                        text = ", ".join([f"{key}: {value}" for key, value in doc.items()])
-                        texts.append(text)
-            # If the documents are a list of strings, use the strings as the documents
-            elif isinstance(documents[0], str):
-                texts = documents
-        # If the documents are a list of lists, use the lists as the documents
-        batches = [texts[i : i + self.batch_size] for i in range(0, len(texts), self.batch_size)]
+        texts = self._to_texts(documents, key)
+        batches = [
+            texts[index : index + self.batch_size]
+            for index in range(0, len(texts), self.batch_size)
+        ]
         embeddings = []
-        # Embed the documents in batches
         for batch in batches:
-            # Extend the embeddings list with the embeddings from the batch
-            emb = self.model(batch)
-            if len(emb.shape) == 1:
-                embeddings.append(emb)
-            elif len(emb.shape) == 2:
-                for i in range(emb.shape[0]):
-                    embeddings.append(emb[i])
-            else:
-                CustomUserWarning("Embeddings must be a 1D or 2D array.", raise_with=ValueError)
+            embeddings.extend(self._embed_batch(batch))
         return embeddings
 
     def dict(self, vectors=False):
@@ -165,12 +192,11 @@
             return [
                 {"document": document, "vector": vector.tolist(), "index": index}
                 for index, (document, vector) in enumerate(
-                    zip(self.documents, self.vectors)
+                    zip(self.documents, self.vectors, strict=False)
                 )
             ]
         return [
-            {"document": document, "index": index}
-            for index, document in enumerate(self.documents)
+            {"document": document, "index": index} for index, document in enumerate(self.documents)
         ]
 
     def add(self, documents, vectors=None):
@@ -191,8 +217,9 @@
         if not isinstance(documents, list):
             return self.add_document(documents, vectors)
         self.add_documents(documents, vectors)
+        return None
 
-    def add_document(self, document: dict, vector=None):
+    def add_document(self, document: Mapping[str, Any], vector=None):
         """
         Adds a document to the database.
 
@@ -204,13 +231,13 @@
             A vector to add to the database.
 
         """
-        vector = (vector if vector is not None else self.embedding_function([document])[0])
+        vector = vector if vector is not None else self.embedding_function([document])[0]
         if self.vectors is None:
             self.vectors = np.empty((0, len(vector)), dtype=np.float32)
         elif len(vector) != self.vectors.shape[1]:
-            CustomUserWarning("All vectors must have the same length.", raise_with=ValueError)
+            UserMessage("All vectors must have the same length.", raise_with=ValueError)
         # convert the vector to a numpy array if it is not already
-        if type(vector) == list:
+        if isinstance(vector, list):
             vector = np.array(vector)
         self.vectors = np.vstack([self.vectors, vector]).astype(np.float32)
         self.documents.append(document)
@@ -243,7 +270,7 @@
         if not documents:
             return
         vectors = vectors or np.array(self.embedding_function(documents)).astype(np.float32)
-        for vector, document in zip(vectors, documents):
+        for vector, document in zip(vectors, documents, strict=False):
             self.add_document(document, vector)
 
     def clear(self):
@@ -251,10 +278,10 @@
         Clears the database.
 
         """
-        self.vectors   = None
+        self.vectors = None
         self.documents = []
 
-    def save(self, storage_file: str = None):
+    def save(self, storage_file: str | None = None):
         """
         Saves the database to a file.
 
@@ -265,20 +292,20 @@
 
         """
         if storage_file is None:
-            # use path to home directory by default
-            storage_path = os.path.join(HOME_PATH, "localdb")
-            os.makedirs(storage_path, exist_ok=True)
-            storage_file = os.path.join(storage_path, f"{self.index_name}.pkl")
+            storage_file = HOME_PATH / "localdb" / f"{self.index_name}.pkl"
+            storage_file.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            storage_file = Path(storage_file)
 
         data = {"vectors": self.vectors, "documents": self.documents}
-        if storage_file.endswith(".gz"):
+        if storage_file.suffix == ".gz":
             with gzip.open(storage_file, "wb") as f:
                 pickle.dump(data, f)
         else:
-            with open(storage_file, "wb") as f:
+            with storage_file.open("wb") as f:
                 pickle.dump(data, f)
 
-    def load(self, storage_file : str = None):
+    def load(self, storage_file: str | None = None):
         """
         Loads the database from a file.
 
@@ -289,27 +316,26 @@
 
         """
         if storage_file is None:
-            # use path to home directory by default
-            storage_path = os.path.join(HOME_PATH, "localdb")
-            # create dir on first load if never used
-            os.makedirs(storage_path, exist_ok=True)
-            storage_file = os.path.join(storage_path, f"{self.index_name}.pkl")
+            storage_file = HOME_PATH / "localdb" / f"{self.index_name}.pkl"
+            storage_file.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            storage_file = Path(storage_file)
 
         # return since nothing to load
-        if not os.path.exists(storage_file):
+        if not storage_file.exists():
             return
 
-        if storage_file.endswith(".gz"):
+        if storage_file.suffix == ".gz":
             with gzip.open(storage_file, "rb") as f:
                 data = pickle.load(f)
         else:
-            with open(storage_file, "rb") as f:
+            with storage_file.open("rb") as f:
                 data = pickle.load(f)
 
         self.vectors = data["vectors"].astype(np.float32) if data["vectors"] is not None else None
         self.documents = data["documents"]
 
-    def purge(self, index_name : str):
+    def purge(self, index_name: str):
         """
         Purges the database file from your machine, but does not delete the database from memory.
         Use the `clear` method to clear the database from memory.
@@ -328,11 +354,11 @@
         # use path to home directory by default
         storage_path = symai_folder / "localdb"
         # create dir on first load if never used
-        os.makedirs(storage_path, exist_ok=True)
+        storage_path.mkdir(parents=True, exist_ok=True)
         storage_file = storage_path / f"{index_name}.pkl"
         if storage_file.exists():
             # remove the file
-            os.remove(storage_file)
+            storage_file.unlink()
         self.clear()
 
     def forward(self, query=None, vector=None, top_k=None, return_similarities=True):
@@ -354,14 +380,18 @@
             A list of results.
 
         """
-        assert self.vectors is not None, f"Error: Cannot query the database without prior insertion / initialization."
+        assert self.vectors is not None, (
+            "Error: Cannot query the database without prior insertion / initialization."
+        )
         top_k = top_k or self.index_top_k
         query_vector = self.embedding_function([query])[0] if vector is None else vector
-        if type(query_vector) == list:
+        if isinstance(query_vector, list):
             query_vector = np.array(query_vector)
         ranked_results, similarities = ranking_algorithm_sort(
             self.vectors, query_vector, top_k=top_k, metric=self.similarity_metric
         )
         if return_similarities:
-            return list(zip([self.documents[index] for index in ranked_results], similarities))
+            return list(
+                zip([self.documents[index] for index in ranked_results], similarities, strict=False)
+            )
         return [self.documents[index] for index in ranked_results]
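
For orientation, here is a minimal usage sketch of the refactored VectorDB above (symai/extended/vectordb.py). It is not taken from the package documentation: the document strings and the "tmp_index" name are illustrative, and it assumes the constructor wires _get_embedding up as the default embedding function, falling back to the local 'ExtensityAI/embeddings' interface when no EMBEDDING_ENGINE_API_KEY is configured.

from symai.extended.vectordb import VectorDB

# Illustrative documents; any list of strings (or dicts plus a `key`) works.
docs = [
    "SymbolicAI couples LLM calls with classical programming.",
    "VectorDB keeps its vectors in a float32 numpy matrix.",
]

db = VectorDB(index_name="tmp_index")    # hypothetical index name
db.add(docs)                             # embeds and appends each document
db.save()                                # writes <HOME_PATH>/localdb/tmp_index.pkl via pathlib
hits = db.forward("classical programming", top_k=1)   # list of (document, similarity) pairs
print(hits)
db.purge("tmp_index")                    # unlinks the pickle file and clears the in-memory state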

symai/formatter/__init__.py

@@ -1,2 +1,10 @@
+from .formatter import ParagraphFormatter, RegexFormatter, SentenceFormatter, TextContainerFormatter
 from .regex import CHUNK_REGEX
-from .formatter import ParagraphFormatter, SentenceFormatter, RegexFormatter, TextContainerFormatter
+
+__all__ = [
+    "CHUNK_REGEX",
+    "ParagraphFormatter",
+    "RegexFormatter",
+    "SentenceFormatter",
+    "TextContainerFormatter",
+]
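
The new __all__ in symai/formatter/__init__.py pins the package's public surface to exactly the five names imported above; a quick illustrative check (assuming symbolicai is installed):

import symai.formatter as fmt

# Mirrors the __all__ list introduced in this release.
assert set(fmt.__all__) == {
    "CHUNK_REGEX",
    "ParagraphFormatter",
    "RegexFormatter",
    "SentenceFormatter",
    "TextContainerFormatter",
}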

symai/formatter/formatter.py

@@ -1,12 +1,16 @@
 import re
+from typing import TYPE_CHECKING
 
 from beartype import beartype
 from beartype.typing import Any, Dict, List
 from tqdm import tqdm
 
-from .regex import CHUNK_REGEX
 from .. import core_ext
 from ..symbol import Expression, Symbol
+from .regex import CHUNK_REGEX
+
+if TYPE_CHECKING:
+    from ..backend.engines.files.engine_io import TextContainer
 
 
 class ParagraphFormatter(Expression):
@@ -17,16 +21,16 @@ class ParagraphFormatter(Expression):
 
     def split_files(self, input_text=""):
         input_ = input_text.strip()
-        if input_.startswith('# ----[FILE_START]') and '# ----[FILE_END]' in input_:
+        if input_.startswith("# ----[FILE_START]") and "# ----[FILE_END]" in input_:
             self._has_file_start = True
             # split text file-wise and create a map of file names and their contents
             files = {}
-            split_text = input_.split('# ----[FILE_START]')
-            for i, file in enumerate(split_text):
+            split_text = input_.split("# ----[FILE_START]")
+            for _i, file in enumerate(split_text):
                 if not file.strip():
                     continue
-                _, content_file = file.split('[FILE_CONTENT]:')
-                content, file_name = content_file.split('# ----[FILE_END]')
+                _, content_file = file.split("[FILE_CONTENT]:")
+                content, file_name = content_file.split("# ----[FILE_END]")
                 files[file_name.strip()] = content.strip()
         else:
             files = {"": input_}
@@ -36,8 +40,10 @@
         if file_name and self._has_file_start:
             header = f"# ----[FILE_START]<PART{part}/{total_parts}>{file_name}[FILE_CONTENT]:\n"
             footer = f"\n# ----[FILE_END]{file_name}\n"
-            if '[FILE_CONTENT]:' in paragraph: # TODO: remove this if statement after fixing the bug
-                paragraph = paragraph.split('[FILE_CONTENT]:')[-1].strip()
+            if (
+                "[FILE_CONTENT]:" in paragraph
+            ): # TODO: remove this if statement after fixing the bug
+                paragraph = paragraph.split("[FILE_CONTENT]:")[-1].strip()
             paragraph = header + paragraph + footer
         return paragraph
 
@@ -63,7 +69,12 @@
         input_ = file_content.strip()
         split_text = self.NEWLINES_RE.split(input_)
 
-        par = [self._add_header_footer(p, file_name, part=i+1, total_parts=len(split_text)) + "\n" for i, p in enumerate(split_text) if p.strip()]
+        par = [
+            self._add_header_footer(p, file_name, part=i + 1, total_parts=len(split_text))
+            + "\n"
+            for i, p in enumerate(split_text)
+            if p.strip()
+        ]
         # p + "\n" ensures that all lines in the paragraph end with a newline
         # p.strip() == True if paragraph has other characters than whitespace
 
@@ -81,14 +92,20 @@
             # n splits
             total_parts = (len(words) // max_length + 1) * self._get_total_parts(text)
             for p, i in enumerate(range(0, len(words), max_length)):
-                paragraph = ' '.join(words[i:i + max_length])
-                paragraphs.append(self._add_header_footer(paragraph, file_name, part=p+1, total_parts=total_parts) + "\n")
+                paragraph = " ".join(words[i : i + max_length])
+                paragraphs.append(
+                    self._add_header_footer(
+                        paragraph, file_name, part=p + 1, total_parts=total_parts
+                    )
+                    + "\n"
+                )
         else:
             paragraphs.append(text)
         return paragraphs
 
-    @core_ext.bind(engine='embedding', property='max_tokens')
-    def _max_tokens(self): pass
+    @core_ext.bind(engine="embedding", property="max_tokens")
+    def _max_tokens(self):
+        pass
 
     def split_max_tokens_exceeded(self, input_text: List[str], token_ratio=0.5):
         paragraphs = []
@@ -103,13 +120,18 @@
             text_len_ = len(str(text)) // splits_
             total_parts = (text_len_ + 1) * self._get_total_parts(text)
             for i in range(splits_):
-                paragraph = text[i * text_len_:(i + 1) * text_len_]
-                paragraphs.append(self._add_header_footer(paragraph, file_name, part=i+1, total_parts=total_parts) + "\n")
+                paragraph = text[i * text_len_ : (i + 1) * text_len_]
+                paragraphs.append(
+                    self._add_header_footer(
+                        paragraph, file_name, part=i + 1, total_parts=total_parts
+                    )
+                    + "\n"
+                )
         else:
             paragraphs.append(text)
         return paragraphs
 
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
         sym = self._to_symbol(sym)
         # split text paragraph-wise and index each paragraph separately
         self.elements = self.split_files(sym.value)
122
144
  class SentenceFormatter(Expression):
123
145
  def __init__(self, value=None, **kwargs):
124
146
  super().__init__(value, **kwargs)
125
- self.SENTENCES_RE = re.compile(r"[.!?]\n*|[\n]{1,}") # Sentence ending characters followed by newlines
147
+ self.SENTENCES_RE = re.compile(
148
+ r"[.!?]\n*|[\n]{1,}"
149
+ ) # Sentence ending characters followed by newlines
126
150
 
127
151
  def split_sentences(self, input_text=""):
128
152
  input_ = input_text.strip()
129
153
  split_text = self.SENTENCES_RE.split(input_) # regex splitting
130
154
 
131
- sentences = [s.strip() + ".\n" for s in split_text if s.strip()]
132
- # s.strip() + ".\n" ensures that all lines in the sentence end with a period and newline
133
- # s.strip() == True if sentence has other characters than whitespace
155
+ return [s.strip() + ".\n" for s in split_text if s.strip()]
134
156
 
135
- return sentences
136
-
137
- def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
157
+ def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
138
158
  sym = self._to_symbol(sym)
139
159
  # split text sentence-wise and index each sentence separately
140
160
  self.elements = self.split_sentences(sym.value)
@@ -151,12 +171,9 @@ class RegexFormatter(Expression):
         input_ = input_text.strip()
         split_text = self.SENTENCES_RE.split(input_) # regex splitting
 
-        chunks = [s.strip() for s in split_text if s.strip()]
-        # s.strip() == True if sentence has other characters than whitespace
-
-        return chunks
+        return [s.strip() for s in split_text if s.strip()]
 
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
         sym = self._to_symbol(sym)
         # split text sentence-wise and index each sentence separately
         self.elements = self.split_sentences(sym.value)
@@ -164,25 +181,19 @@
 
 
 class TextContainerFormatter(Expression):
-    def __init__(
-        self,
-        value: Any = None,
-        key: str ="text",
-        text_split: int = 4,
-        **kwargs
-    ):
+    def __init__(self, value: Any = None, key: str = "text", text_split: int = 4, **kwargs):
         super().__init__(value, **kwargs)
         self.key = key
         self.text_split = text_split
 
     @beartype
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
         if isinstance(sym.value, list):
             containers = [container for pdf in sym.value for container in pdf]
         chunks = [text for container in tqdm(containers) for text in self._chunk(container)]
         return self._to_symbol(chunks)
 
-    def _chunk(self, container: 'TextContainer') -> List[str]:
+    def _chunk(self, container: "TextContainer") -> List[str]:
         text = container.text
         step = len(text) // self.text_split
         splits = []
@@ -192,17 +203,10 @@ class TextContainerFormatter(Expression):
                 # Unify the last chunk with the previous one if necessary
                 splits.append(self._as_str(text[i:], container))
                 break
-            splits.append(self._as_str(text[i:i+step], container))
+            splits.append(self._as_str(text[i : i + step], container))
             i += step
             c += 1
         return splits
 
-    def _as_str(self, text: str, container: 'TextContainer') -> str:
-        return (
-            '---\n'
-            f"id: {container.id}\n"
-            f"page: {container.page}\n"
-            '---\n'
-            f"{text}"
-        )
-
+    def _as_str(self, text: str, container: "TextContainer") -> str:
+        return f"---\nid: {container.id}\npage: {container.page}\n---\n{text}"