symbolicai 0.21.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. symai/__init__.py +269 -173
  2. symai/backend/base.py +123 -110
  3. symai/backend/engines/drawing/engine_bfl.py +45 -44
  4. symai/backend/engines/drawing/engine_gpt_image.py +112 -97
  5. symai/backend/engines/embedding/engine_llama_cpp.py +63 -52
  6. symai/backend/engines/embedding/engine_openai.py +25 -21
  7. symai/backend/engines/execute/engine_python.py +19 -18
  8. symai/backend/engines/files/engine_io.py +104 -95
  9. symai/backend/engines/imagecaptioning/engine_blip2.py +28 -24
  10. symai/backend/engines/imagecaptioning/engine_llavacpp_client.py +102 -79
  11. symai/backend/engines/index/engine_pinecone.py +124 -97
  12. symai/backend/engines/index/engine_qdrant.py +1011 -0
  13. symai/backend/engines/index/engine_vectordb.py +84 -56
  14. symai/backend/engines/lean/engine_lean4.py +96 -52
  15. symai/backend/engines/neurosymbolic/__init__.py +41 -13
  16. symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_chat.py +330 -248
  17. symai/backend/engines/neurosymbolic/engine_anthropic_claudeX_reasoning.py +329 -264
  18. symai/backend/engines/neurosymbolic/engine_cerebras.py +328 -0
  19. symai/backend/engines/neurosymbolic/engine_deepseekX_reasoning.py +118 -88
  20. symai/backend/engines/neurosymbolic/engine_google_geminiX_reasoning.py +344 -299
  21. symai/backend/engines/neurosymbolic/engine_groq.py +173 -115
  22. symai/backend/engines/neurosymbolic/engine_huggingface.py +114 -84
  23. symai/backend/engines/neurosymbolic/engine_llama_cpp.py +144 -118
  24. symai/backend/engines/neurosymbolic/engine_openai_gptX_chat.py +415 -307
  25. symai/backend/engines/neurosymbolic/engine_openai_gptX_reasoning.py +394 -231
  26. symai/backend/engines/ocr/engine_apilayer.py +23 -27
  27. symai/backend/engines/output/engine_stdout.py +10 -13
  28. symai/backend/engines/{webscraping → scrape}/engine_requests.py +101 -54
  29. symai/backend/engines/search/engine_openai.py +100 -88
  30. symai/backend/engines/search/engine_parallel.py +665 -0
  31. symai/backend/engines/search/engine_perplexity.py +44 -45
  32. symai/backend/engines/search/engine_serpapi.py +37 -34
  33. symai/backend/engines/speech_to_text/engine_local_whisper.py +54 -51
  34. symai/backend/engines/symbolic/engine_wolframalpha.py +15 -9
  35. symai/backend/engines/text_to_speech/engine_openai.py +20 -26
  36. symai/backend/engines/text_vision/engine_clip.py +39 -37
  37. symai/backend/engines/userinput/engine_console.py +5 -6
  38. symai/backend/mixin/__init__.py +13 -0
  39. symai/backend/mixin/anthropic.py +48 -38
  40. symai/backend/mixin/deepseek.py +6 -5
  41. symai/backend/mixin/google.py +7 -4
  42. symai/backend/mixin/groq.py +2 -4
  43. symai/backend/mixin/openai.py +140 -110
  44. symai/backend/settings.py +87 -20
  45. symai/chat.py +216 -123
  46. symai/collect/__init__.py +7 -1
  47. symai/collect/dynamic.py +80 -70
  48. symai/collect/pipeline.py +67 -51
  49. symai/collect/stats.py +161 -109
  50. symai/components.py +707 -360
  51. symai/constraints.py +24 -12
  52. symai/core.py +1857 -1233
  53. symai/core_ext.py +83 -80
  54. symai/endpoints/api.py +166 -104
  55. symai/extended/.DS_Store +0 -0
  56. symai/extended/__init__.py +46 -12
  57. symai/extended/api_builder.py +29 -21
  58. symai/extended/arxiv_pdf_parser.py +23 -14
  59. symai/extended/bibtex_parser.py +9 -6
  60. symai/extended/conversation.py +156 -126
  61. symai/extended/document.py +50 -30
  62. symai/extended/file_merger.py +57 -14
  63. symai/extended/graph.py +51 -32
  64. symai/extended/html_style_template.py +18 -14
  65. symai/extended/interfaces/blip_2.py +2 -3
  66. symai/extended/interfaces/clip.py +4 -3
  67. symai/extended/interfaces/console.py +9 -1
  68. symai/extended/interfaces/dall_e.py +4 -2
  69. symai/extended/interfaces/file.py +2 -0
  70. symai/extended/interfaces/flux.py +4 -2
  71. symai/extended/interfaces/gpt_image.py +16 -7
  72. symai/extended/interfaces/input.py +2 -1
  73. symai/extended/interfaces/llava.py +1 -2
  74. symai/extended/interfaces/{naive_webscraping.py → naive_scrape.py} +4 -3
  75. symai/extended/interfaces/naive_vectordb.py +9 -10
  76. symai/extended/interfaces/ocr.py +5 -3
  77. symai/extended/interfaces/openai_search.py +2 -0
  78. symai/extended/interfaces/parallel.py +30 -0
  79. symai/extended/interfaces/perplexity.py +2 -0
  80. symai/extended/interfaces/pinecone.py +12 -9
  81. symai/extended/interfaces/python.py +2 -0
  82. symai/extended/interfaces/serpapi.py +3 -1
  83. symai/extended/interfaces/terminal.py +2 -4
  84. symai/extended/interfaces/tts.py +3 -2
  85. symai/extended/interfaces/whisper.py +3 -2
  86. symai/extended/interfaces/wolframalpha.py +2 -1
  87. symai/extended/metrics/__init__.py +11 -1
  88. symai/extended/metrics/similarity.py +14 -13
  89. symai/extended/os_command.py +39 -29
  90. symai/extended/packages/__init__.py +29 -3
  91. symai/extended/packages/symdev.py +51 -43
  92. symai/extended/packages/sympkg.py +41 -35
  93. symai/extended/packages/symrun.py +63 -50
  94. symai/extended/repo_cloner.py +14 -12
  95. symai/extended/seo_query_optimizer.py +15 -13
  96. symai/extended/solver.py +116 -91
  97. symai/extended/summarizer.py +12 -10
  98. symai/extended/taypan_interpreter.py +17 -18
  99. symai/extended/vectordb.py +122 -92
  100. symai/formatter/__init__.py +9 -1
  101. symai/formatter/formatter.py +51 -47
  102. symai/formatter/regex.py +70 -69
  103. symai/functional.py +325 -176
  104. symai/imports.py +190 -147
  105. symai/interfaces.py +57 -28
  106. symai/memory.py +45 -35
  107. symai/menu/screen.py +28 -19
  108. symai/misc/console.py +66 -56
  109. symai/misc/loader.py +8 -5
  110. symai/models/__init__.py +17 -1
  111. symai/models/base.py +395 -236
  112. symai/models/errors.py +1 -2
  113. symai/ops/__init__.py +32 -22
  114. symai/ops/measures.py +24 -25
  115. symai/ops/primitives.py +1149 -731
  116. symai/post_processors.py +58 -50
  117. symai/pre_processors.py +86 -82
  118. symai/processor.py +21 -13
  119. symai/prompts.py +764 -685
  120. symai/server/huggingface_server.py +135 -49
  121. symai/server/llama_cpp_server.py +21 -11
  122. symai/server/qdrant_server.py +206 -0
  123. symai/shell.py +100 -42
  124. symai/shellsv.py +700 -492
  125. symai/strategy.py +630 -346
  126. symai/symbol.py +368 -322
  127. symai/utils.py +100 -78
  128. {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/METADATA +22 -10
  129. symbolicai-1.1.0.dist-info/RECORD +168 -0
  130. symbolicai-0.21.0.dist-info/RECORD +0 -162
  131. {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/WHEEL +0 -0
  132. {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/entry_points.txt +0 -0
  133. {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/licenses/LICENSE +0 -0
  134. {symbolicai-0.21.0.dist-info → symbolicai-1.1.0.dist-info}/top_level.txt +0 -0

symai/extended/vectordb.py

@@ -1,34 +1,41 @@
 import gzip
 import logging
-import os
 import pickle
+from collections.abc import Mapping
 from copy import deepcopy
 from pathlib import Path
+from typing import Any, ClassVar
 
 import numpy as np
 
 from ..backend.settings import HOME_PATH, SYMAI_CONFIG
 from ..interfaces import Interface
 from ..symbol import Expression, Symbol
-from ..utils import CustomUserWarning
-from .metrics import (adams_similarity, cosine_similarity,
-                      derridaean_similarity, dot_product, euclidean_metric,
-                      ranking_algorithm_sort)
+from ..utils import UserMessage
+from .metrics import (
+    adams_similarity,
+    cosine_similarity,
+    derridaean_similarity,
+    dot_product,
+    euclidean_metric,
+    ranking_algorithm_sort,
+)
 
-logging.getLogger('sentence_transformers').setLevel(logging.WARNING)
-logging.getLogger('datasets').setLevel(logging.WARNING)
+logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
+logging.getLogger("datasets").setLevel(logging.WARNING)
 
 
 class VectorDB(Expression):
-    _default_documents = []
-    _default_vectors = None
-    _default_batch_size = 2048
-    _default_similarity_metric = "cosine"
-    _default_embedding_function = None
-    _default_index_dims = 768
-    _default_top_k = 5
-    _default_storage_path = os.path.join(HOME_PATH, "localdb")
-    _default_index_name = "dataindex"
+    _default_documents: ClassVar[list] = []
+    _default_vectors: ClassVar[np.ndarray | None] = None
+    _default_batch_size: ClassVar[int] = 2048
+    _default_similarity_metric: ClassVar[str] = "cosine"
+    _default_embedding_function: ClassVar[object | None] = None
+    _default_index_dims: ClassVar[int] = 768
+    _default_top_k: ClassVar[int] = 5
+    _default_storage_path: ClassVar[Path] = HOME_PATH / "localdb"
+    _default_index_name: ClassVar[str] = "dataindex"
+
     def __init__(
         self,
         documents=_default_documents,
@@ -40,7 +47,7 @@ class VectorDB(Expression):
         index_dims=_default_index_dims,
         top_k=_default_top_k,
         index_name=_default_index_name,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.config = deepcopy(SYMAI_CONFIG)
@@ -71,22 +78,73 @@ class VectorDB(Expression):
         elif "adams" in similarity_metric:
             self.similarity_metric = adams_similarity
         else:
-            CustomUserWarning(f"Similarity metric not supported. Please use either 'dot', 'cosine', 'euclidean', 'adams', or 'derrida'.", raise_with=ValueError)
+            UserMessage(
+                "Similarity metric not supported. Please use either 'dot', 'cosine', 'euclidean', 'adams', or 'derrida'.",
+                raise_with=ValueError,
+            )
 
         if load_on_init:
-            # If load_on_init is a string, use it as the storage file
-            if isinstance(load_on_init, str):
-                path = os.path.join(load_on_init, f"{self.index_name}.pkl")
+            if isinstance(load_on_init, (str, Path)):
+                path = Path(load_on_init) / f"{self.index_name}.pkl"
                 self.load(path)
             else:
                 self.load()
 
     def _init_embedding_model(self):
-        if self.config['EMBEDDING_ENGINE_API_KEY'] is None or self.config['EMBEDDING_ENGINE_API_KEY'] == '':
-            self.model = Interface('ExtensityAI/embeddings') # default to local model
+        if (
+            self.config["EMBEDDING_ENGINE_API_KEY"] is None
+            or self.config["EMBEDDING_ENGINE_API_KEY"] == ""
+        ):
+            self.model = Interface("ExtensityAI/embeddings") # default to local model
         else:
             self.model = lambda x: Symbol(x).embedding
 
+    def _unwrap_documents(self, documents):
+        if isinstance(documents, Symbol):
+            return documents.value
+        return documents
+
+    def _to_texts(self, documents, key):
+        if not isinstance(documents, list):
+            self._raise_texts_unassigned()
+        if len(documents) == 0:
+            return []
+        first_document = documents[0]
+        if isinstance(first_document, dict):
+            return self._texts_from_dicts(documents, key)
+        if isinstance(first_document, str):
+            return documents
+        return self._raise_texts_unassigned()
+
+    def _texts_from_dicts(self, documents, key):
+        if isinstance(key, str):
+            key_chain = key.split(".") if "." in key else [key]
+            return [self._resolve_key_chain(doc, key_chain).replace("\n", " ") for doc in documents]
+        if key is None:
+            return [
+                ", ".join([f"{dict_key}: {value}" for dict_key, value in doc.items()])
+                for doc in documents
+            ]
+        return self._raise_texts_unassigned()
+
+    def _resolve_key_chain(self, document, key_chain):
+        current_document = document
+        for chain_key in key_chain:
+            current_document = current_document[chain_key]
+        return current_document
+
+    def _embed_batch(self, batch):
+        emb = self.model(batch)
+        if len(emb.shape) == 1:
+            return [emb]
+        if len(emb.shape) == 2:
+            return [emb[index] for index in range(emb.shape[0])]
+        return UserMessage("Embeddings must be a 1D or 2D array.", raise_with=ValueError)
+
+    def _raise_texts_unassigned(self):
+        error_message = "local variable 'texts' referenced before assignment"
+        raise UnboundLocalError(error_message)
+
     def _get_embedding(self, documents, key=None):
         """
         Get embeddings from a list of documents.
@@ -103,48 +161,17 @@ class VectorDB(Expression):
         embeddings : numpy.ndarray
             A numpy array of embeddings.
         """
-        # unwrap the documents if they are a Symbol
-        if isinstance(documents, Symbol):
-            documents = documents.value
-        # if the documents are a list of Symbols, unwrap them
+        documents = self._unwrap_documents(documents)
         if len(documents) == 0:
             return []
-        if isinstance(documents, list):
-            # If the documents are a list of dictionaries, extract the text from the dictionary
-            if isinstance(documents[0], dict):
-                texts = []
-                # If a key is specified, extract the text from the dictionary using the key
-                if isinstance(key, str):
-                    if "." in key:
-                        key_chain = key.split(".")
-                    else:
-                        key_chain = [key]
-                    for doc in documents:
-                        for key in key_chain:
-                            doc = doc[key]
-                        texts.append(doc.replace("\n", " "))
-                # If no key is specified, extract the text from the dictionary using all keys
-                elif key is None:
-                    for doc in documents:
-                        text = ", ".join([f"{key}: {value}" for key, value in doc.items()])
-                        texts.append(text)
-            # If the documents are a list of strings, use the strings as the documents
-            elif isinstance(documents[0], str):
-                texts = documents
-        # If the documents are a list of lists, use the lists as the documents
-        batches = [texts[i : i + self.batch_size] for i in range(0, len(texts), self.batch_size)]
+        texts = self._to_texts(documents, key)
+        batches = [
+            texts[index : index + self.batch_size]
+            for index in range(0, len(texts), self.batch_size)
+        ]
         embeddings = []
-        # Embed the documents in batches
         for batch in batches:
-            # Extend the embeddings list with the embeddings from the batch
-            emb = self.model(batch)
-            if len(emb.shape) == 1:
-                embeddings.append(emb)
-            elif len(emb.shape) == 2:
-                for i in range(emb.shape[0]):
-                    embeddings.append(emb[i])
-            else:
-                CustomUserWarning("Embeddings must be a 1D or 2D array.", raise_with=ValueError)
+            embeddings.extend(self._embed_batch(batch))
         return embeddings
 
     def dict(self, vectors=False):
@@ -165,12 +192,11 @@
             return [
                 {"document": document, "vector": vector.tolist(), "index": index}
                 for index, (document, vector) in enumerate(
-                    zip(self.documents, self.vectors)
+                    zip(self.documents, self.vectors, strict=False)
                 )
             ]
         return [
-            {"document": document, "index": index}
-            for index, document in enumerate(self.documents)
+            {"document": document, "index": index} for index, document in enumerate(self.documents)
         ]
 
     def add(self, documents, vectors=None):
@@ -191,8 +217,9 @@
         if not isinstance(documents, list):
             return self.add_document(documents, vectors)
         self.add_documents(documents, vectors)
+        return None
 
-    def add_document(self, document: dict, vector=None):
+    def add_document(self, document: Mapping[str, Any], vector=None):
         """
         Adds a document to the database.
 
@@ -204,13 +231,13 @@
             A vector to add to the database.
 
         """
-        vector = (vector if vector is not None else self.embedding_function([document])[0])
+        vector = vector if vector is not None else self.embedding_function([document])[0]
         if self.vectors is None:
             self.vectors = np.empty((0, len(vector)), dtype=np.float32)
         elif len(vector) != self.vectors.shape[1]:
-            CustomUserWarning("All vectors must have the same length.", raise_with=ValueError)
+            UserMessage("All vectors must have the same length.", raise_with=ValueError)
         # convert the vector to a numpy array if it is not already
-        if type(vector) == list:
+        if isinstance(vector, list):
             vector = np.array(vector)
         self.vectors = np.vstack([self.vectors, vector]).astype(np.float32)
         self.documents.append(document)
@@ -243,7 +270,7 @@
         if not documents:
             return
         vectors = vectors or np.array(self.embedding_function(documents)).astype(np.float32)
-        for vector, document in zip(vectors, documents):
+        for vector, document in zip(vectors, documents, strict=False):
             self.add_document(document, vector)
 
     def clear(self):
@@ -251,10 +278,10 @@
         Clears the database.
 
         """
-        self.vectors   = None
+        self.vectors = None
         self.documents = []
 
-    def save(self, storage_file: str = None):
+    def save(self, storage_file: str | None = None):
         """
         Saves the database to a file.
 
@@ -265,20 +292,20 @@
 
         """
         if storage_file is None:
-            # use path to home directory by default
-            storage_path = os.path.join(HOME_PATH, "localdb")
-            os.makedirs(storage_path, exist_ok=True)
-            storage_file = os.path.join(storage_path, f"{self.index_name}.pkl")
+            storage_file = HOME_PATH / "localdb" / f"{self.index_name}.pkl"
+            storage_file.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            storage_file = Path(storage_file)
 
         data = {"vectors": self.vectors, "documents": self.documents}
-        if storage_file.endswith(".gz"):
+        if storage_file.suffix == ".gz":
             with gzip.open(storage_file, "wb") as f:
                 pickle.dump(data, f)
         else:
-            with open(storage_file, "wb") as f:
+            with storage_file.open("wb") as f:
                 pickle.dump(data, f)
 
-    def load(self, storage_file : str = None):
+    def load(self, storage_file: str | None = None):
         """
         Loads the database from a file.
 
@@ -289,27 +316,26 @@
 
         """
         if storage_file is None:
-            # use path to home directory by default
-            storage_path = os.path.join(HOME_PATH, "localdb")
-            # create dir on first load if never used
-            os.makedirs(storage_path, exist_ok=True)
-            storage_file = os.path.join(storage_path, f"{self.index_name}.pkl")
+            storage_file = HOME_PATH / "localdb" / f"{self.index_name}.pkl"
+            storage_file.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            storage_file = Path(storage_file)
 
         # return since nothing to load
-        if not os.path.exists(storage_file):
+        if not storage_file.exists():
             return
 
-        if storage_file.endswith(".gz"):
+        if storage_file.suffix == ".gz":
             with gzip.open(storage_file, "rb") as f:
                 data = pickle.load(f)
         else:
-            with open(storage_file, "rb") as f:
+            with storage_file.open("rb") as f:
                 data = pickle.load(f)
 
         self.vectors = data["vectors"].astype(np.float32) if data["vectors"] is not None else None
         self.documents = data["documents"]
 
-    def purge(self, index_name : str):
+    def purge(self, index_name: str):
         """
         Purges the database file from your machine, but does not delete the database from memory.
         Use the `clear` method to clear the database from memory.
@@ -328,11 +354,11 @@
         # use path to home directory by default
         storage_path = symai_folder / "localdb"
         # create dir on first load if never used
-        os.makedirs(storage_path, exist_ok=True)
+        storage_path.mkdir(parents=True, exist_ok=True)
         storage_file = storage_path / f"{index_name}.pkl"
         if storage_file.exists():
             # remove the file
-            os.remove(storage_file)
+            storage_file.unlink()
         self.clear()
 
     def forward(self, query=None, vector=None, top_k=None, return_similarities=True):
@@ -354,14 +380,18 @@
             A list of results.
 
         """
-        assert self.vectors is not None, f"Error: Cannot query the database without prior insertion / initialization."
+        assert self.vectors is not None, (
+            "Error: Cannot query the database without prior insertion / initialization."
+        )
         top_k = top_k or self.index_top_k
         query_vector = self.embedding_function([query])[0] if vector is None else vector
-        if type(query_vector) == list:
+        if isinstance(query_vector, list):
             query_vector = np.array(query_vector)
         ranked_results, similarities = ranking_algorithm_sort(
             self.vectors, query_vector, top_k=top_k, metric=self.similarity_metric
         )
         if return_similarities:
-            return list(zip([self.documents[index] for index in ranked_results], similarities))
+            return list(
+                zip([self.documents[index] for index in ranked_results], similarities, strict=False)
+            )
         return [self.documents[index] for index in ranked_results]
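
For orientation, here is a minimal usage sketch of the refactored VectorDB above (symai/extended/vectordb.py). It is not taken from the package documentation: the document strings and the "tmp_index" name are illustrative, and it assumes the constructor wires _get_embedding up as the default embedding function, falling back to the local 'ExtensityAI/embeddings' interface when no EMBEDDING_ENGINE_API_KEY is configured.

from symai.extended.vectordb import VectorDB

# Illustrative documents; any list of strings (or dicts plus a `key`) works.
docs = [
    "SymbolicAI couples LLM calls with classical programming.",
    "VectorDB keeps its vectors in a float32 numpy matrix.",
]

db = VectorDB(index_name="tmp_index")    # hypothetical index name
db.add(docs)                             # embeds and appends each document
db.save()                                # writes <HOME_PATH>/localdb/tmp_index.pkl via pathlib
hits = db.forward("classical programming", top_k=1)   # list of (document, similarity) pairs
print(hits)
db.purge("tmp_index")                    # unlinks the pickle file and clears the in-memory state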

symai/formatter/__init__.py

@@ -1,2 +1,10 @@
+from .formatter import ParagraphFormatter, RegexFormatter, SentenceFormatter, TextContainerFormatter
 from .regex import CHUNK_REGEX
-from .formatter import ParagraphFormatter, SentenceFormatter, RegexFormatter, TextContainerFormatter
+
+__all__ = [
+    "CHUNK_REGEX",
+    "ParagraphFormatter",
+    "RegexFormatter",
+    "SentenceFormatter",
+    "TextContainerFormatter",
+]
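
The new __all__ in symai/formatter/__init__.py pins the package's public surface to exactly the five names imported above; a quick illustrative check (assuming symbolicai is installed):

import symai.formatter as fmt

# Mirrors the __all__ list introduced in this release.
assert set(fmt.__all__) == {
    "CHUNK_REGEX",
    "ParagraphFormatter",
    "RegexFormatter",
    "SentenceFormatter",
    "TextContainerFormatter",
}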

symai/formatter/formatter.py

@@ -1,12 +1,16 @@
 import re
+from typing import TYPE_CHECKING
 
 from beartype import beartype
 from beartype.typing import Any, Dict, List
 from tqdm import tqdm
 
-from .regex import CHUNK_REGEX
 from .. import core_ext
 from ..symbol import Expression, Symbol
+from .regex import CHUNK_REGEX
+
+if TYPE_CHECKING:
+    from ..backend.engines.files.engine_io import TextContainer
 
 
 class ParagraphFormatter(Expression):
@@ -17,16 +21,16 @@ class ParagraphFormatter(Expression):
 
     def split_files(self, input_text=""):
         input_ = input_text.strip()
-        if input_.startswith('# ----[FILE_START]') and '# ----[FILE_END]' in input_:
+        if input_.startswith("# ----[FILE_START]") and "# ----[FILE_END]" in input_:
             self._has_file_start = True
             # split text file-wise and create a map of file names and their contents
             files = {}
-            split_text = input_.split('# ----[FILE_START]')
-            for i, file in enumerate(split_text):
+            split_text = input_.split("# ----[FILE_START]")
+            for _i, file in enumerate(split_text):
                 if not file.strip():
                     continue
-                _, content_file = file.split('[FILE_CONTENT]:')
-                content, file_name = content_file.split('# ----[FILE_END]')
+                _, content_file = file.split("[FILE_CONTENT]:")
+                content, file_name = content_file.split("# ----[FILE_END]")
                 files[file_name.strip()] = content.strip()
         else:
             files = {"": input_}
@@ -36,8 +40,10 @@
         if file_name and self._has_file_start:
             header = f"# ----[FILE_START]<PART{part}/{total_parts}>{file_name}[FILE_CONTENT]:\n"
             footer = f"\n# ----[FILE_END]{file_name}\n"
-            if '[FILE_CONTENT]:' in paragraph: # TODO: remove this if statement after fixing the bug
-                paragraph = paragraph.split('[FILE_CONTENT]:')[-1].strip()
+            if (
+                "[FILE_CONTENT]:" in paragraph
+            ): # TODO: remove this if statement after fixing the bug
+                paragraph = paragraph.split("[FILE_CONTENT]:")[-1].strip()
             paragraph = header + paragraph + footer
         return paragraph
 
@@ -63,7 +69,12 @@
         input_ = file_content.strip()
         split_text = self.NEWLINES_RE.split(input_)
 
-        par = [self._add_header_footer(p, file_name, part=i+1, total_parts=len(split_text)) + "\n" for i, p in enumerate(split_text) if p.strip()]
+        par = [
+            self._add_header_footer(p, file_name, part=i + 1, total_parts=len(split_text))
+            + "\n"
+            for i, p in enumerate(split_text)
+            if p.strip()
+        ]
         # p + "\n" ensures that all lines in the paragraph end with a newline
         # p.strip() == True if paragraph has other characters than whitespace
 
@@ -81,14 +92,20 @@
             # n splits
             total_parts = (len(words) // max_length + 1) * self._get_total_parts(text)
             for p, i in enumerate(range(0, len(words), max_length)):
-                paragraph = ' '.join(words[i:i + max_length])
-                paragraphs.append(self._add_header_footer(paragraph, file_name, part=p+1, total_parts=total_parts) + "\n")
+                paragraph = " ".join(words[i : i + max_length])
+                paragraphs.append(
+                    self._add_header_footer(
+                        paragraph, file_name, part=p + 1, total_parts=total_parts
+                    )
+                    + "\n"
+                )
         else:
             paragraphs.append(text)
         return paragraphs
 
-    @core_ext.bind(engine='embedding', property='max_tokens')
-    def _max_tokens(self): pass
+    @core_ext.bind(engine="embedding", property="max_tokens")
+    def _max_tokens(self):
+        pass
 
     def split_max_tokens_exceeded(self, input_text: List[str], token_ratio=0.5):
         paragraphs = []
@@ -103,13 +120,18 @@
             text_len_ = len(str(text)) // splits_
             total_parts = (text_len_ + 1) * self._get_total_parts(text)
             for i in range(splits_):
-                paragraph = text[i * text_len_:(i + 1) * text_len_]
-                paragraphs.append(self._add_header_footer(paragraph, file_name, part=i+1, total_parts=total_parts) + "\n")
+                paragraph = text[i * text_len_ : (i + 1) * text_len_]
+                paragraphs.append(
+                    self._add_header_footer(
+                        paragraph, file_name, part=i + 1, total_parts=total_parts
+                    )
+                    + "\n"
+                )
         else:
             paragraphs.append(text)
         return paragraphs
 
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
         sym = self._to_symbol(sym)
         # split text paragraph-wise and index each paragraph separately
         self.elements = self.split_files(sym.value)
122
144
  class SentenceFormatter(Expression):
123
145
  def __init__(self, value=None, **kwargs):
124
146
  super().__init__(value, **kwargs)
125
- self.SENTENCES_RE = re.compile(r"[.!?]\n*|[\n]{1,}") # Sentence ending characters followed by newlines
147
+ self.SENTENCES_RE = re.compile(
148
+ r"[.!?]\n*|[\n]{1,}"
149
+ ) # Sentence ending characters followed by newlines
126
150
 
127
151
  def split_sentences(self, input_text=""):
128
152
  input_ = input_text.strip()
129
153
  split_text = self.SENTENCES_RE.split(input_) # regex splitting
130
154
 
131
- sentences = [s.strip() + ".\n" for s in split_text if s.strip()]
132
- # s.strip() + ".\n" ensures that all lines in the sentence end with a period and newline
133
- # s.strip() == True if sentence has other characters than whitespace
155
+ return [s.strip() + ".\n" for s in split_text if s.strip()]
134
156
 
135
- return sentences
136
-
137
- def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
157
+ def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
138
158
  sym = self._to_symbol(sym)
139
159
  # split text sentence-wise and index each sentence separately
140
160
  self.elements = self.split_sentences(sym.value)
@@ -151,12 +171,9 @@ class RegexFormatter(Expression):
         input_ = input_text.strip()
         split_text = self.SENTENCES_RE.split(input_) # regex splitting
 
-        chunks = [s.strip() for s in split_text if s.strip()]
-        # s.strip() == True if sentence has other characters than whitespace
-
-        return chunks
+        return [s.strip() for s in split_text if s.strip()]
 
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
         sym = self._to_symbol(sym)
         # split text sentence-wise and index each sentence separately
         self.elements = self.split_sentences(sym.value)
@@ -164,25 +181,19 @@
 
 
 class TextContainerFormatter(Expression):
-    def __init__(
-        self,
-        value: Any = None,
-        key: str ="text",
-        text_split: int = 4,
-        **kwargs
-    ):
+    def __init__(self, value: Any = None, key: str = "text", text_split: int = 4, **kwargs):
         super().__init__(value, **kwargs)
         self.key = key
         self.text_split = text_split
 
     @beartype
-    def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
+    def forward(self, sym: Symbol, *_args, **_kwargs) -> Symbol:
         if isinstance(sym.value, list):
             containers = [container for pdf in sym.value for container in pdf]
         chunks = [text for container in tqdm(containers) for text in self._chunk(container)]
         return self._to_symbol(chunks)
 
-    def _chunk(self, container: 'TextContainer') -> List[str]:
+    def _chunk(self, container: "TextContainer") -> List[str]:
         text = container.text
         step = len(text) // self.text_split
         splits = []
@@ -192,17 +203,10 @@ class TextContainerFormatter(Expression):
                 # Unify the last chunk with the previous one if necessary
                 splits.append(self._as_str(text[i:], container))
                 break
-            splits.append(self._as_str(text[i:i+step], container))
+            splits.append(self._as_str(text[i : i + step], container))
             i += step
             c += 1
         return splits
 
-    def _as_str(self, text: str, container: 'TextContainer') -> str:
-        return (
-            '---\n'
-            f"id: {container.id}\n"
-            f"page: {container.page}\n"
-            '---\n'
-            f"{text}"
-        )
-
+    def _as_str(self, text: str, container: "TextContainer") -> str:
+        return f"---\nid: {container.id}\npage: {container.page}\n---\n{text}"