aisberg 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aisberg/api/async_endpoints.py +138 -20
- aisberg/api/endpoints.py +136 -18
- aisberg/async_client.py +8 -0
- aisberg/client.py +8 -0
- aisberg/config.py +6 -0
- aisberg/models/collections.py +15 -1
- aisberg/models/documents.py +46 -0
- aisberg/models/requests.py +5 -1
- aisberg/modules/__init__.py +5 -0
- aisberg/modules/chat.py +11 -3
- aisberg/modules/collections.py +360 -7
- aisberg/modules/documents.py +168 -0
- aisberg/modules/embeddings.py +11 -3
- aisberg/modules/me.py +1 -1
- aisberg/modules/models.py +3 -3
- aisberg/modules/s3.py +316 -0
- aisberg/modules/workflows.py +3 -3
- {aisberg-0.1.0.dist-info → aisberg-0.2.0.dist-info}/METADATA +16 -3
- {aisberg-0.1.0.dist-info → aisberg-0.2.0.dist-info}/RECORD +24 -21
- tmp/test_collection.py +65 -0
- tmp/test_doc_parse.py +31 -7
- aisberg/modules/document.py +0 -117
- {aisberg-0.1.0.dist-info → aisberg-0.2.0.dist-info}/WHEEL +0 -0
- {aisberg-0.1.0.dist-info → aisberg-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {aisberg-0.1.0.dist-info → aisberg-0.2.0.dist-info}/top_level.txt +0 -0
aisberg/modules/collections.py
CHANGED
@@ -1,11 +1,19 @@
-from typing import List
+from typing import List, Union, Optional
 from abc import ABC
 
-
+import json
+from ..models.collections import (
+    GroupCollections,
+    Collection,
+    CollectionDetails,
+    CollectionDataset,
+)
 
 from abc import abstractmethod
 from ..abstract.modules import SyncModule, AsyncModule
 from ..api import endpoints, async_endpoints
+from ..models.requests import HttpxFileField
+from io import BytesIO
 
 
 class AbstractCollectionsModule(ABC):
@@ -25,7 +33,7 @@ class AbstractCollectionsModule(ABC):
             ValueError: If no collections are found.
             Exception: If there is an error fetching the collections.
         """
-
+        ...
 
     @abstractmethod
     def get_by_group(self, group_id: str) -> List[Collection]:
@@ -42,7 +50,7 @@ class AbstractCollectionsModule(ABC):
             ValueError: If no collections are found for the specified group ID.
             Exception: If there is an error fetching the collections.
         """
-
+        ...
 
     @abstractmethod
     def details(self, collection_id: str, group_id: str) -> CollectionDetails:
@@ -59,7 +67,126 @@ class AbstractCollectionsModule(ABC):
         Raises:
             ValueError: If the specified collection is not found.
         """
-
+        ...
+
+    @abstractmethod
+    def delete(self, name: str, **kwargs) -> bool:
+        """
+        Delete a collection by name and group ID.
+
+        Args:
+            name (str): The name of the collection to delete.
+            **kwargs: Additional keyword arguments, such as group ID.
+
+        Returns:
+            bool: True if the deletion was successful, False otherwise.
+
+        Raises:
+            ValueError: If the collection could not be deleted.
+            Exception: If there is an error during the deletion process.
+        """
+        ...
+
+    @abstractmethod
+    def create(
+        self,
+        name: str,
+        data: Union[dict, CollectionDataset, str],
+        embedding_model: Optional[str] = "BAAI/bge-m3",
+        normalize: bool = False,
+        **kwargs,
+    ) -> CollectionDetails:
+        """
+        Create a new collection.
+
+        Args:
+            name (str): The name of the collection to create.
+            data (Union[dict, CollectionDataset, str]): The data to insert into the collection.
+                Can be a dict, a CollectionDataset object, or a string representing the file path.
+            embedding_model (Optional[str]): The embedding model to use for the collection.
+                Defaults to "BAAI/bge-m3".
+            normalize (bool): Whether to normalize the data before inserting it into the collection. Defaults to False.
+            **kwargs: Additional keyword arguments, such as group ID.
+
+        Returns:
+            CollectionDetails: The details of the created collection.
+
+        Raises:
+            ValueError: If the collection could not be created.
+            Exception: If there is an error during the creation process.
+        """
+        ...
+
+    @abstractmethod
+    def insert_points(
+        self,
+        collection_name: str,
+        data: Union[dict, CollectionDataset, str],
+        normalize: bool = False,
+        **kwargs,
+    ) -> CollectionDetails:
+        """
+        Insert points into an existing collection. Existing points in the collection are not deleted.
+        This method is used to add new data to an existing collection without removing the previous data.
+
+        Args:
+            collection_name (str): The name of the collection to insert points into.
+            data (Union[dict, CollectionDataset, str]): The data to insert into the collection.
+                Can be a dict, a CollectionDataset object, or a string representing the file path.
+            normalize (bool): If the collection already has points, this parameter is ignored. Defaults to False.
+            **kwargs: Additional keyword arguments, such as group ID.
+
+        Returns:
+            CollectionDetails: The details of the updated collection.
+
+        Raises:
+            ValueError: If the points could not be inserted.
+            Exception: If there is an error during the insertion process.
+        """
+        ...
+
+    @abstractmethod
+    def delete_points(
+        self,
+        collection_name: str,
+        points: List[str],
+        **kwargs,
+    ) -> CollectionDetails:
+        """
+        Delete points from an existing collection. Points with the specified IDs will be removed from the collection.
+
+        Args:
+            collection_name (str): The name of the collection to delete points from.
+            points (List[str]): The list of point IDs to delete from the collection.
+            **kwargs: Additional keyword arguments, such as group ID.
+
+        Returns:
+            CollectionDetails: The details of the updated collection.
+
+        Raises:
+            ValueError: If the points could not be deleted.
+            Exception: If there is an error during the deletion process.
+        """
+        ...
+
+    @abstractmethod
+    def clear(
+        self,
+        collection_name: str,
+        **kwargs,
+    ) -> CollectionDetails:
+        """
+        Delete ALL points from an existing collection. All points are removed, but the collection
+        itself is not deleted, so you can still insert new points without creating a new collection.
+
+        Args:
+            collection_name (str): The name of the collection to clear.
+            **kwargs: Additional keyword arguments, such as group ID.
+
+        Returns:
+            CollectionDetails: The details of the cleared collection.
+
+        Raises:
+            ValueError: If the points could not be deleted.
+            Exception: If there is an error during the deletion process.
+        """
+        ...
 
     @staticmethod
     def _get_collections_by_group(
@@ -70,6 +197,43 @@ class AbstractCollectionsModule(ABC):
                 return group.collections
         raise ValueError("No collections found for group ID")
 
+    @staticmethod
+    def _data_to_httpx_file(
+        data: Union[dict, CollectionDataset, str],
+    ) -> HttpxFileField:
+        """
+        Prepare a JSON payload as an HTTPX file field (for multipart upload).
+
+        Args:
+            data (dict | CollectionDataset | str): The dataset as dict/obj or a path to a JSON file.
+
+        Returns:
+            HttpxFileField: List suitable for HTTPX multipart upload.
+        """
+        if isinstance(data, str):
+            with open(data, "r", encoding="utf-8") as f:
+                coll_dict = json.load(f)
+            filename = data.split("/")[-1]
+        elif isinstance(data, CollectionDataset):
+            coll_dict = data.model_dump()
+            filename = "collection.json"
+        elif isinstance(data, dict):
+            if "chunks" in data and "metadata" in data:
+                coll_dict = data
+                filename = "collection.json"
+            else:
+                raise ValueError(
+                    "data must be a dict with 'chunks' and 'metadata' keys"
+                )
+        else:
+            raise ValueError(
+                "data must be a dict, CollectionDataset, or file path string"
+            )
+
+        json_bytes = json.dumps(coll_dict, ensure_ascii=False).encode("utf-8")
+        file_tuple = ("files", (filename, BytesIO(json_bytes), "application/json"))
+        return [file_tuple]
+
 
 class SyncCollectionsModule(SyncModule, AbstractCollectionsModule):
     def __init__(self, parent, client):
@@ -83,7 +247,9 @@ class SyncCollectionsModule(SyncModule, AbstractCollectionsModule):
         collections = self.list()
         return self._get_collections_by_group(collections, group_id)
 
-    def details(self, collection_id: str, group_id: str) -> CollectionDetails:
+    def details(
+        self, collection_id: str, group_id: Optional[str] = None
+    ) -> CollectionDetails:
         points = endpoints.collection(self._client, collection_id, group_id)
         if points is None:
             raise ValueError("No collection found")
@@ -93,6 +259,98 @@ class SyncCollectionsModule(SyncModule, AbstractCollectionsModule):
             points=points,
         )
 
+    def delete(self, name: str, **kwargs) -> bool:
+        response = endpoints.delete_collection(self._client, name, **kwargs)
+        if response is None:
+            raise ValueError("Collection could not be deleted")
+        return True
+
+    def create(
+        self,
+        name: str,
+        data: Union[dict, CollectionDataset, str],
+        embedding_model: Optional[str] = "BAAI/bge-m3",
+        normalize: bool = False,
+        **kwargs,
+    ) -> CollectionDetails:
+        create = endpoints.create_collection(
+            self._client, name, embedding_model, **kwargs
+        )
+        if create.message != "Creation started":
+            raise ValueError("Collection could not be created")
+
+        insert = endpoints.insert_points_in_collection(
+            self._client,
+            name,
+            self._data_to_httpx_file(data),
+            normalize,
+            **kwargs,
+        )
+        if insert.message != f"Documents inserted in {name}.":
+            raise ValueError("Points could not be inserted into the collection")
+
+        return self.details(name, kwargs.get("group", None))
+
+    def insert_points(
+        self,
+        collection_name: str,
+        data: Union[dict, CollectionDataset, str],
+        normalize: bool = False,
+        **kwargs,
+    ) -> CollectionDetails:
+        insert = endpoints.insert_points_in_collection(
+            self._client,
+            collection_name,
+            self._data_to_httpx_file(data),
+            normalize,
+        )
+        if insert.message != f"Documents inserted in {collection_name}.":
+            raise ValueError(
+                f"Points could not be inserted into the collection: {insert.model_dump_json()}"
+            )
+        return self.details(collection_name, kwargs.get("group", None))
+
+    def delete_points(
+        self,
+        collection_name: str,
+        points: List[str],
+        **kwargs,
+    ) -> CollectionDetails:
+        delete = endpoints.delete_points_in_collection(
+            self._client,
+            points,
+            collection_name,
+            **kwargs,
+        )
+        if (
+            f'{len(points)} points deleted from collection "{collection_name}"'
+            not in delete.message
+        ):
+            raise ValueError(
+                f"Points could not be deleted from the collection: {delete.model_dump_json()}"
+            )
+
+        return self.details(collection_name, kwargs.get("group", None))
+
+    def clear(
+        self,
+        collection_name: str,
+        **kwargs,
+    ) -> CollectionDetails:
+        clear = endpoints.delete_all_points_in_collection(
+            self._client,
+            collection_name,
+            **kwargs,
+        )
+        if (
+            f'All points deleted from collection "{collection_name}" for group'
+            not in clear.message
+        ):
+            raise ValueError(
+                f"Points could not be deleted from the collection: {clear.model_dump_json()}"
+            )
+        return self.details(collection_name, kwargs.get("group", None))
+
 
 class AsyncCollectionsModule(AsyncModule, AbstractCollectionsModule):
     def __init__(self, parent, client):
@@ -106,7 +364,9 @@ class AsyncCollectionsModule(AsyncModule, AbstractCollectionsModule):
         collections = await self.list()
        return self._get_collections_by_group(collections, group_id)
 
-    async def details(self, collection_id: str, group_id: str) -> CollectionDetails:
+    async def details(
+        self, collection_id: str, group_id: Optional[str] = None
+    ) -> CollectionDetails:
         points = await async_endpoints.collection(self._client, collection_id, group_id)
         if points is None:
             raise ValueError("No collection found")
@@ -115,3 +375,96 @@ class AsyncCollectionsModule(AsyncModule, AbstractCollectionsModule):
             group=group_id,
             points=points,
         )
+
+    async def delete(self, name: str, **kwargs) -> bool:
+        response = await async_endpoints.delete_collection(self._client, name, **kwargs)
+        if response is None:
+            raise ValueError("Collection could not be deleted")
+        return True
+
+    async def create(
+        self,
+        name: str,
+        data: Union[dict, CollectionDataset, str],
+        embedding_model: Optional[str] = "BAAI/bge-m3",
+        normalize: bool = False,
+        **kwargs,
+    ) -> CollectionDetails:
+        create = await async_endpoints.create_collection(
+            self._client, name, embedding_model, **kwargs
+        )
+        if create.message != "Creation started":
+            raise ValueError("Collection could not be created")
+
+        insert = await async_endpoints.insert_points_in_collection(
+            self._client,
+            name,
+            self._data_to_httpx_file(data),
+            normalize,
+            **kwargs,
+        )
+        if insert.message != f"Documents inserted in {name}.":
+            raise ValueError("Points could not be inserted into the collection")
+
+        return await self.details(name, kwargs.get("group", None))
+
+    async def insert_points(
+        self,
+        collection_name: str,
+        data: Union[dict, CollectionDataset, str],
+        normalize: bool = False,
+        **kwargs,
+    ) -> CollectionDetails:
+        insert = await async_endpoints.insert_points_in_collection(
+            self._client,
+            collection_name,
+            self._data_to_httpx_file(data),
+            normalize,
+        )
+        if insert.message != f"Documents inserted in {collection_name}.":
+            raise ValueError(
+                f"Points could not be inserted into the collection: {insert.model_dump_json()}"
+            )
+
+        return await self.details(collection_name, kwargs.get("group", None))
+
+    async def delete_points(
+        self,
+        collection_name: str,
+        points: List[str],
+        **kwargs,
+    ) -> CollectionDetails:
+        delete = await async_endpoints.delete_points_in_collection(
+            self._client,
+            points,
+            collection_name,
+            **kwargs,
+        )
+        if (
+            f'{len(points)} points deleted from collection "{collection_name}"'
+            not in delete.message
+        ):
+            raise ValueError(
+                f"Points could not be deleted from the collection: {delete.model_dump_json()}"
+            )
+
+        return await self.details(collection_name, kwargs.get("group", None))
+
+    async def clear(
+        self,
+        collection_name: str,
+        **kwargs,
+    ) -> CollectionDetails:
+        clear = await async_endpoints.delete_all_points_in_collection(
+            self._client,
+            collection_name,
+            **kwargs,
+        )
+        if (
+            f'All points deleted from collection "{collection_name}" for group'
+            not in clear.message
+        ):
+            raise ValueError(
+                f"Points could not be deleted from the collection: {clear.model_dump_json()}"
+            )
+        return await self.details(collection_name, kwargs.get("group", None))
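Taken together, the new methods give the collections module a full lifecycle. Below is a minimal usage sketch for the sync API, assuming the client exposes this module as `client.collections` and that `AisbergClient` is the entry-point class (neither name is shown in this diff); the method names and the required "chunks"/"metadata" keys come from the code above:

    from aisberg import AisbergClient  # assumed import path; not shown in this diff

    client = AisbergClient()

    # _data_to_httpx_file requires dict payloads to carry "chunks" and "metadata"
    # keys; their exact schemas are presumably those of CollectionDataset in
    # aisberg/models/collections.py.
    dataset = {
        "chunks": ["Aisberg manages vector collections.", "Collections hold points."],
        "metadata": [{"source": "intro.md"}, {"source": "intro.md"}],
    }

    # Create the collection and upload the initial points in one call.
    details = client.collections.create("docs", dataset, normalize=True)

    # Append more points later; existing points are kept. A JSON file path works too.
    details = client.collections.insert_points("docs", "extra_points.json")

    # Remove specific points by ID, or wipe the collection while keeping it usable.
    client.collections.delete_points("docs", points=["point-id-1"])  # hypothetical ID
    client.collections.clear("docs")

    # Drop the collection entirely.
    client.collections.delete("docs")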
aisberg/modules/documents.py
ADDED
@@ -0,0 +1,168 @@
+from abc import ABC, abstractmethod
+from ..api import endpoints, async_endpoints
+from ..abstract.modules import SyncModule, AsyncModule
+from ..models.documents import (
+    FileObject,
+    DocumentParserFileInput,
+    ParsedDocument,
+)
+from typing import List
+import json
+from io import BytesIO
+import logging
+
+from ..models.requests import HttpxFileField
+
+logger = logging.getLogger(__name__)
+
+
+class AbstractDocumentsModule(ABC):
+    def __init__(self, parent, client):
+        self._parent = parent
+        self._client = client
+
+    @abstractmethod
+    def parse(
+        self, files: DocumentParserFileInput, **kwargs
+    ) -> List[ParsedDocument]: ...
+
+    def _get_parsed_files_from_s3(
+        self, files: List[str], bucket_name: str
+    ) -> List[ParsedDocument]:
+        """
+        Download and parse a list of files from an S3 bucket.
+
+        Args:
+            files (List[str]): List of file names to download from S3.
+            bucket_name (str): Name of the S3 bucket.
+
+        Returns:
+            List[ParsedDocument]: Parsed documents as objects with content and metadata.
+
+        Raises:
+            Exception: If a file cannot be downloaded or parsed.
+        """
+        parsed_documents = []
+        for file_name in files:
+            if not file_name.endswith(".json"):
+                if '"type": "error"' in file_name:
+                    logger.error(f"[DOCUMENT PARSER] Parsing failed => {file_name}.")
+                continue
+
+            logger.debug(f"Downloading file {file_name} from bucket {bucket_name}")
+            # Download the file as a BytesIO
+            doc_bytesio = self._parent._s3.download_file(bucket_name, file_name)
+            try:
+                buffer = doc_bytesio.getvalue()
+                content_str = buffer.decode("utf-8")
+                content_json = json.loads(content_str)
+            finally:
+                doc_bytesio.close()
+            file_object = FileObject(name=file_name, buffer=buffer)
+            parsed_documents.append(
+                ParsedDocument(
+                    content=content_json, metadata={"name": file_object.name}
+                )
+            )
+        return parsed_documents
+
+    @staticmethod
+    def _prepare_files_payload(
+        files: DocumentParserFileInput,
+    ) -> HttpxFileField:
+        """
+        Prepares input files into a format compatible with HTTPX multipart uploads.
+
+        Args:
+            files (DocumentParserFileInput): Files to upload (see type for options).
+
+        Returns:
+            HttpxFileField: HTTPX-style list for multipart upload.
+
+        Raises:
+            TypeError: On unsupported type.
+        """
+
+        def to_file_tuple(item):
+            # FileObject case
+            if "FileObject" in globals() and isinstance(item, FileObject):
+                content = item.buffer
+                filename = item.name
+            # (bytes, filename) tuple
+            elif isinstance(item, tuple) and len(item) == 2:
+                content, filename = item
+            # bytes or BytesIO
+            elif isinstance(item, (bytes, BytesIO)):
+                content = item
+                filename = "file"
+            # str (filepath)
+            elif isinstance(item, str):
+                with open(item, "rb") as f:
+                    content = f.read()
+                filename = item.split("/")[-1]
+            else:
+                raise TypeError(
+                    f"Unsupported file input type: {type(item)}. "
+                    "Expected str, bytes, BytesIO, tuple, or FileObject."
+                )
+            # Normalize to BytesIO for HTTPX
+            if isinstance(content, bytes):
+                content = BytesIO(content)
+            elif isinstance(content, BytesIO):
+                content.seek(0)
+            else:
+                raise TypeError(
+                    f"File content must be bytes or BytesIO, got {type(content)}"
+                )
+            return (filename, content)
+
+        if isinstance(files, list):
+            if len(files) == 0:
+                raise ValueError("File list cannot be empty.")
+            elif len(files) > 10:
+                raise ValueError("Too many files provided. Maximum is 10.")
+
+            normalized = [to_file_tuple(f) for f in files]
+        else:
+            normalized = [to_file_tuple(files)]
+
+        # HTTPX format: [("files", (filename, fileobj, mimetype)), ...]
+        httpx_files = [
+            ("files", (filename, content, "application/octet-stream"))
+            for filename, content in normalized
+        ]
+        return httpx_files
+
+
+class SyncDocumentsModule(SyncModule, AbstractDocumentsModule):
+    def __init__(self, parent, client):
+        SyncModule.__init__(self, parent, client)
+        AbstractDocumentsModule.__init__(self, parent, client)
+
+    def parse(self, files, **kwargs) -> List[ParsedDocument]:
+        output = endpoints.parse_documents(
+            self._client,
+            self._prepare_files_payload(files),
+            **kwargs,
+        )
+        if output.message == "Files parsed successfully":
+            return self._get_parsed_files_from_s3(output.parsedFiles, output.bucketName)
+        else:
+            raise ValueError(f"Error parsing files: {output.message}")
+
+
+class AsyncDocumentsModule(AsyncModule, AbstractDocumentsModule):
+    def __init__(self, parent, client):
+        AsyncModule.__init__(self, parent, client)
+        AbstractDocumentsModule.__init__(self, parent, client)
+
+    async def parse(self, files, **kwargs) -> List[ParsedDocument]:
+        output = await async_endpoints.parse_documents(
+            self._client,
+            self._prepare_files_payload(files),
+            **kwargs,
+        )
+        if output.message == "Files parsed successfully":
+            return self._get_parsed_files_from_s3(output.parsedFiles, output.bucketName)
+        else:
+            raise ValueError(f"Error parsing files: {output.message}")
aisberg/modules/embeddings.py
CHANGED
@@ -50,7 +50,7 @@ class AbstractEmbeddingsModule(ABC):
         Returns:
             EncodingResponse: The response containing the encoded embeddings.
         """
-
+        ...
 
     @abstractmethod
     def retrieve(
@@ -75,7 +75,7 @@ class AbstractEmbeddingsModule(ABC):
         Returns:
             List[ChunkData]: A list of ChunkData objects containing the retrieved texts and their metadata.
         """
-
+        ...
 
     @abstractmethod
     def rerank(
@@ -104,7 +104,7 @@ class AbstractEmbeddingsModule(ABC):
             ValueError: If the documents list is empty or contains invalid document types.
             Exception: If the documents list is not of the expected type.
         """
-
+        ...
 
     @staticmethod
     def _format_collections_names(
@@ -192,6 +192,7 @@ class SyncEmbeddingsModule(AbstractEmbeddingsModule, SyncModule):
         score_threshold: float = 0.0,
         filters: List = None,
         beta: float = 0.7,
+        **kwargs,
     ) -> ChunksDataList:
         if filters is None:
             filters = []
@@ -204,6 +205,7 @@ class SyncEmbeddingsModule(AbstractEmbeddingsModule, SyncModule):
             score_threshold=score_threshold,
             filters=filters,
             beta=beta,
+            **kwargs,
         )
         return ChunksDataList.model_validate(resp)
 
@@ -215,6 +217,7 @@ class SyncEmbeddingsModule(AbstractEmbeddingsModule, SyncModule):
         top_n: int = 10,
         return_documents: bool = True,
         threshold: Optional[float] = None,
+        **kwargs,
     ) -> RerankerResponse:
         resp = endpoints.rerank(
             self._client,
@@ -223,6 +226,7 @@ class SyncEmbeddingsModule(AbstractEmbeddingsModule, SyncModule):
             model,
             top_n,
             return_documents,
+            **kwargs,
         )
         resp = RerankerResponse.model_validate(resp)
 
@@ -269,6 +273,7 @@ class AsyncEmbeddingsModule(AbstractEmbeddingsModule, AsyncModule):
         score_threshold: float = 0.0,
         filters: List = None,
         beta: float = 0.7,
+        **kwargs,
     ) -> ChunksDataList:
         if filters is None:
             filters = []
@@ -281,6 +286,7 @@ class AsyncEmbeddingsModule(AbstractEmbeddingsModule, AsyncModule):
             score_threshold=score_threshold,
             filters=filters,
             beta=beta,
+            **kwargs,
         )
         return ChunksDataList.model_validate(resp)
 
@@ -292,6 +298,7 @@ class AsyncEmbeddingsModule(AbstractEmbeddingsModule, AsyncModule):
         top_n: int = 10,
         return_documents: bool = True,
         threshold: Optional[float] = None,
+        **kwargs,
     ) -> RerankerResponse:
         resp = await async_endpoints.rerank(
             self._client,
@@ -300,6 +307,7 @@ class AsyncEmbeddingsModule(AbstractEmbeddingsModule, AsyncModule):
             model,
             top_n,
             return_documents,
+            **kwargs,
         )
         resp = RerankerResponse.model_validate(resp)
 
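The embeddings changes are purely additive: `retrieve` and `rerank` now forward `**kwargs` to the underlying endpoint calls. A sketch of what that enables, assuming a pass-through keyword such as `group` is understood downstream; the diff does not name the accepted keywords, and the positional parameters shown here are assumptions:

    chunks = client.embeddings.retrieve(
        "what is aisberg?",       # query (assumed positional parameter)
        ["docs"],                 # collection names (assumed positional parameter)
        score_threshold=0.2,
        beta=0.7,
        group="my-team",          # forwarded via the new **kwargs pass-through
    )

    reranked = client.embeddings.rerank(
        "what is aisberg?",           # query (assumed)
        [c.content for c in chunks],  # candidate documents (assumed field name)
        top_n=5,
        return_documents=True,
        group="my-team",              # forwarded via the new **kwargs pass-through
    )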