cognee 0.3.4.dev3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. cognee/api/client.py +16 -7
  2. cognee/api/health.py +5 -9
  3. cognee/api/v1/add/add.py +3 -1
  4. cognee/api/v1/cognify/cognify.py +44 -7
  5. cognee/api/v1/permissions/routers/get_permissions_router.py +8 -4
  6. cognee/api/v1/search/search.py +3 -0
  7. cognee/api/v1/ui/__init__.py +1 -1
  8. cognee/api/v1/ui/ui.py +215 -150
  9. cognee/api/v1/update/__init__.py +1 -0
  10. cognee/api/v1/update/routers/__init__.py +1 -0
  11. cognee/api/v1/update/routers/get_update_router.py +90 -0
  12. cognee/api/v1/update/update.py +100 -0
  13. cognee/base_config.py +5 -2
  14. cognee/cli/_cognee.py +28 -10
  15. cognee/cli/commands/delete_command.py +34 -2
  16. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +2 -2
  17. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +3 -2
  18. cognee/eval_framework/modal_eval_dashboard.py +9 -1
  19. cognee/infrastructure/databases/graph/config.py +9 -9
  20. cognee/infrastructure/databases/graph/get_graph_engine.py +4 -21
  21. cognee/infrastructure/databases/graph/kuzu/adapter.py +60 -9
  22. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +3 -3
  23. cognee/infrastructure/databases/relational/config.py +4 -4
  24. cognee/infrastructure/databases/relational/create_relational_engine.py +11 -3
  25. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +7 -3
  26. cognee/infrastructure/databases/vector/config.py +7 -7
  27. cognee/infrastructure/databases/vector/create_vector_engine.py +7 -15
  28. cognee/infrastructure/databases/vector/embeddings/EmbeddingEngine.py +9 -0
  29. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +11 -0
  30. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +19 -2
  31. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -0
  32. cognee/infrastructure/databases/vector/embeddings/config.py +8 -0
  33. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +5 -0
  34. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +11 -10
  35. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +48 -38
  36. cognee/infrastructure/databases/vector/vector_db_interface.py +8 -4
  37. cognee/infrastructure/files/storage/S3FileStorage.py +15 -5
  38. cognee/infrastructure/files/storage/s3_config.py +1 -0
  39. cognee/infrastructure/files/utils/open_data_file.py +7 -14
  40. cognee/infrastructure/llm/LLMGateway.py +19 -117
  41. cognee/infrastructure/llm/config.py +28 -13
  42. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_categories.py +2 -1
  43. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_event_entities.py +3 -2
  44. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_summary.py +3 -2
  45. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_content_graph.py +2 -1
  46. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_event_graph.py +3 -2
  47. cognee/infrastructure/llm/prompts/read_query_prompt.py +3 -2
  48. cognee/infrastructure/llm/prompts/show_prompt.py +35 -0
  49. cognee/infrastructure/llm/prompts/test.txt +1 -0
  50. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +2 -2
  51. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +50 -397
  52. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +2 -3
  53. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +8 -88
  54. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +78 -0
  55. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +2 -99
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +49 -401
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +19 -882
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +2 -34
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +2 -107
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/acreate_structured_output.baml +26 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/__init__.py +1 -2
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +76 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/create_dynamic_baml_type.py +122 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +3 -3
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +0 -32
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +107 -98
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +5 -6
  68. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +5 -6
  69. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llm_interface.py +0 -26
  70. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +17 -67
  71. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +8 -7
  72. cognee/infrastructure/llm/utils.py +4 -4
  73. cognee/infrastructure/loaders/LoaderEngine.py +5 -2
  74. cognee/infrastructure/loaders/external/__init__.py +7 -0
  75. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +244 -0
  76. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  77. cognee/modules/data/methods/create_authorized_dataset.py +9 -0
  78. cognee/modules/data/methods/get_authorized_dataset.py +1 -1
  79. cognee/modules/data/methods/get_authorized_dataset_by_name.py +11 -0
  80. cognee/modules/data/methods/get_deletion_counts.py +92 -0
  81. cognee/modules/graph/cognee_graph/CogneeGraph.py +1 -1
  82. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +22 -8
  83. cognee/modules/graph/utils/retrieve_existing_edges.py +0 -2
  84. cognee/modules/ingestion/data_types/TextData.py +0 -1
  85. cognee/modules/notebooks/methods/create_notebook.py +3 -1
  86. cognee/modules/notebooks/methods/get_notebooks.py +27 -1
  87. cognee/modules/observability/get_observe.py +14 -0
  88. cognee/modules/observability/observers.py +1 -0
  89. cognee/modules/ontology/base_ontology_resolver.py +42 -0
  90. cognee/modules/ontology/get_default_ontology_resolver.py +41 -0
  91. cognee/modules/ontology/matching_strategies.py +53 -0
  92. cognee/modules/ontology/models.py +20 -0
  93. cognee/modules/ontology/ontology_config.py +24 -0
  94. cognee/modules/ontology/ontology_env_config.py +45 -0
  95. cognee/modules/ontology/rdf_xml/{OntologyResolver.py → RDFLibOntologyResolver.py} +20 -28
  96. cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +21 -24
  97. cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +3 -3
  98. cognee/modules/retrieval/code_retriever.py +2 -1
  99. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -4
  100. cognee/modules/retrieval/graph_completion_cot_retriever.py +6 -5
  101. cognee/modules/retrieval/graph_completion_retriever.py +0 -3
  102. cognee/modules/retrieval/insights_retriever.py +1 -1
  103. cognee/modules/retrieval/jaccard_retrival.py +60 -0
  104. cognee/modules/retrieval/lexical_retriever.py +123 -0
  105. cognee/modules/retrieval/natural_language_retriever.py +2 -1
  106. cognee/modules/retrieval/temporal_retriever.py +3 -2
  107. cognee/modules/retrieval/utils/brute_force_triplet_search.py +2 -12
  108. cognee/modules/retrieval/utils/completion.py +4 -7
  109. cognee/modules/search/methods/get_search_type_tools.py +7 -0
  110. cognee/modules/search/methods/no_access_control_search.py +1 -1
  111. cognee/modules/search/methods/search.py +32 -13
  112. cognee/modules/search/types/SearchType.py +1 -0
  113. cognee/modules/users/methods/create_user.py +0 -2
  114. cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py +12 -0
  115. cognee/modules/users/permissions/methods/check_permission_on_dataset.py +11 -0
  116. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +10 -0
  117. cognee/modules/users/permissions/methods/get_document_ids_for_user.py +10 -0
  118. cognee/modules/users/permissions/methods/get_principal.py +9 -0
  119. cognee/modules/users/permissions/methods/get_principal_datasets.py +11 -0
  120. cognee/modules/users/permissions/methods/get_role.py +10 -0
  121. cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +3 -3
  122. cognee/modules/users/permissions/methods/get_tenant.py +9 -0
  123. cognee/modules/users/permissions/methods/give_default_permission_to_role.py +9 -0
  124. cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py +9 -0
  125. cognee/modules/users/permissions/methods/give_default_permission_to_user.py +9 -0
  126. cognee/modules/users/permissions/methods/give_permission_on_dataset.py +10 -0
  127. cognee/modules/users/roles/methods/add_user_to_role.py +11 -0
  128. cognee/modules/users/roles/methods/create_role.py +12 -1
  129. cognee/modules/users/tenants/methods/add_user_to_tenant.py +12 -0
  130. cognee/modules/users/tenants/methods/create_tenant.py +12 -1
  131. cognee/modules/visualization/cognee_network_visualization.py +13 -9
  132. cognee/shared/data_models.py +0 -1
  133. cognee/shared/utils.py +0 -32
  134. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  135. cognee/tasks/codingagents/coding_rule_associations.py +3 -2
  136. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +3 -2
  137. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +3 -2
  138. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +3 -2
  139. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +3 -2
  140. cognee/tasks/graph/extract_graph_from_code.py +2 -2
  141. cognee/tasks/graph/extract_graph_from_data.py +55 -12
  142. cognee/tasks/graph/extract_graph_from_data_v2.py +16 -4
  143. cognee/tasks/ingestion/migrate_relational_database.py +132 -41
  144. cognee/tasks/ingestion/resolve_data_directories.py +4 -1
  145. cognee/tasks/schema/ingest_database_schema.py +134 -0
  146. cognee/tasks/schema/models.py +40 -0
  147. cognee/tasks/storage/index_data_points.py +1 -1
  148. cognee/tasks/storage/index_graph_edges.py +3 -1
  149. cognee/tasks/summarization/summarize_code.py +2 -2
  150. cognee/tasks/summarization/summarize_text.py +2 -2
  151. cognee/tasks/temporal_graph/enrich_events.py +2 -2
  152. cognee/tasks/temporal_graph/extract_events_and_entities.py +2 -2
  153. cognee/tests/cli_tests/cli_unit_tests/test_cli_commands.py +13 -4
  154. cognee/tests/cli_tests/cli_unit_tests/test_cli_edge_cases.py +13 -3
  155. cognee/tests/test_advanced_pdf_loader.py +141 -0
  156. cognee/tests/test_chromadb.py +40 -0
  157. cognee/tests/test_cognee_server_start.py +6 -1
  158. cognee/tests/test_data/Quantum_computers.txt +9 -0
  159. cognee/tests/test_lancedb.py +211 -0
  160. cognee/tests/test_pgvector.py +40 -0
  161. cognee/tests/test_relational_db_migration.py +76 -0
  162. cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +2 -1
  163. cognee/tests/unit/modules/ontology/test_ontology_adapter.py +330 -13
  164. cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +0 -4
  165. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -4
  166. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +0 -4
  167. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/METADATA +92 -96
  168. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/RECORD +176 -162
  169. distributed/pyproject.toml +0 -1
  170. cognee/infrastructure/data/utils/extract_keywords.py +0 -48
  171. cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py +0 -1227
  172. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +0 -109
  173. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +0 -343
  174. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_categories.py +0 -0
  175. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +0 -89
  176. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/__init__.py +0 -0
  177. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +0 -44
  178. cognee/tasks/graph/infer_data_ontology.py +0 -309
  179. cognee/tests/test_falkordb.py +0 -174
  180. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/__init__.py +0 -0
  181. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/__init__.py +0 -0
  182. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/texts.json +0 -0
  183. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/WHEEL +0 -0
  184. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/entry_points.txt +0 -0
  185. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/licenses/LICENSE +0 -0
  186. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/databases/vector/create_vector_engine.py
@@ -19,8 +19,7 @@ def create_vector_engine(
     for each provider, raising an EnvironmentError if any are missing, or ImportError if the
     ChromaDB package is not installed.
 
-    Supported providers include: pgvector, FalkorDB, ChromaDB, and
-    LanceDB.
+    Supported providers include: pgvector, ChromaDB, and LanceDB.
 
     Parameters:
     -----------
@@ -66,7 +65,12 @@ def create_vector_engine(
             f"postgresql+asyncpg://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
         )
 
-        from .pgvector.PGVectorAdapter import PGVectorAdapter
+        try:
+            from .pgvector.PGVectorAdapter import PGVectorAdapter
+        except ImportError:
+            raise ImportError(
+                "PostgreSQL dependencies are not installed. Please install with 'pip install cognee\"[postgres]\"' or 'pip install cognee\"[postgres-binary]\"' to use PGVector functionality."
+            )
 
         return PGVectorAdapter(
             connection_string,
@@ -74,18 +78,6 @@ def create_vector_engine(
             embedding_engine,
         )
 
-    elif vector_db_provider == "falkordb":
-        if not (vector_db_url and vector_db_port):
-            raise EnvironmentError("Missing requred FalkorDB credentials!")
-
-        from ..hybrid.falkordb.FalkorDBAdapter import FalkorDBAdapter
-
-        return FalkorDBAdapter(
-            database_url=vector_db_url,
-            database_port=vector_db_port,
-            embedding_engine=embedding_engine,
-        )
-
     elif vector_db_provider == "chromadb":
         try:
             import chromadb
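
Note: the guarded import added above is the standard optional-dependency pattern. A minimal generic sketch of the same idea (the helper name and usage are illustrative, not part of cognee's API):

import importlib

def import_optional(module_name: str, extra: str):
    # Import an optional dependency, failing fast with an actionable hint,
    # mirroring the try/except ImportError guard added in create_vector_engine.
    try:
        return importlib.import_module(module_name)
    except ImportError as error:
        raise ImportError(
            f'{module_name} is not installed. Install it with: pip install cognee"[{extra}]"'
        ) from error

# Example: only required when the pgvector backend is selected.
# asyncpg = import_optional("asyncpg", "postgres")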
cognee/infrastructure/databases/vector/embeddings/EmbeddingEngine.py
@@ -34,3 +34,12 @@ class EmbeddingEngine(Protocol):
         - int: An integer representing the number of dimensions in the embedding vector.
         """
         raise NotImplementedError()
+
+    def get_batch_size(self) -> int:
+        """
+        Return the desired batch size for embedding calls
+
+        Returns:
+
+        """
+        raise NotImplementedError()
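
Note: to show how a caller might consume the new get_batch_size() hook, here is a simplified synchronous sketch (cognee's real embed_text is async; the helper below is hypothetical):

from typing import List, Protocol

class SupportsBatchedEmbedding(Protocol):
    def embed_text(self, texts: List[str]) -> List[List[float]]: ...
    def get_batch_size(self) -> int: ...

def embed_in_batches(engine: SupportsBatchedEmbedding, texts: List[str]) -> List[List[float]]:
    # Chunk the inputs by the engine's preferred batch size so a single
    # oversized request never exceeds the provider's per-call limits.
    batch_size = max(1, engine.get_batch_size())
    vectors: List[List[float]] = []
    for start in range(0, len(texts), batch_size):
        vectors.extend(engine.embed_text(texts[start : start + batch_size]))
    return vectors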
cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py
@@ -42,11 +42,13 @@ class FastembedEmbeddingEngine(EmbeddingEngine):
         model: Optional[str] = "openai/text-embedding-3-large",
         dimensions: Optional[int] = 3072,
         max_completion_tokens: int = 512,
+        batch_size: int = 100,
     ):
         self.model = model
         self.dimensions = dimensions
         self.max_completion_tokens = max_completion_tokens
         self.tokenizer = self.get_tokenizer()
+        self.batch_size = batch_size
         # self.retry_count = 0
         self.embedding_model = TextEmbedding(model_name=model)
 
@@ -101,6 +103,15 @@ class FastembedEmbeddingEngine(EmbeddingEngine):
         """
         return self.dimensions
 
+    def get_batch_size(self) -> int:
+        """
+        Return the desired batch size for embedding calls
+
+        Returns:
+
+        """
+        return self.batch_size
+
     def get_tokenizer(self):
         """
         Instantiate and return the tokenizer used for preparing text for embedding.
cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py
@@ -58,6 +58,7 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         endpoint: str = None,
         api_version: str = None,
         max_completion_tokens: int = 512,
+        batch_size: int = 100,
     ):
         self.api_key = api_key
         self.endpoint = endpoint
@@ -68,6 +69,7 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         self.max_completion_tokens = max_completion_tokens
         self.tokenizer = self.get_tokenizer()
         self.retry_count = 0
+        self.batch_size = batch_size
 
         enable_mocking = os.getenv("MOCK_EMBEDDING", "false")
         if isinstance(enable_mocking, bool):
@@ -165,6 +167,15 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
         """
         return self.dimensions
 
+    def get_batch_size(self) -> int:
+        """
+        Return the desired batch size for embedding calls
+
+        Returns:
+
+        """
+        return self.batch_size
+
     def get_tokenizer(self):
         """
         Load and return the appropriate tokenizer for the specified model based on the provider.
@@ -183,9 +194,15 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
                 model=model, max_completion_tokens=self.max_completion_tokens
             )
         elif "gemini" in self.provider.lower():
-            tokenizer = GeminiTokenizer(
-                model=model, max_completion_tokens=self.max_completion_tokens
+            # Since Gemini tokenization needs to send an API request to get the token count we will use TikToken to
+            # count tokens as we calculate tokens word by word
+            tokenizer = TikTokenTokenizer(
+                model=None, max_completion_tokens=self.max_completion_tokens
             )
+            # Note: Gemini Tokenizer expects an LLM model as input and not the embedding model
+            # tokenizer = GeminiTokenizer(
+            #     llm_model=llm_model, max_completion_tokens=self.max_completion_tokens
+            # )
         elif "mistral" in self.provider.lower():
             tokenizer = MistralTokenizer(
                 model=model, max_completion_tokens=self.max_completion_tokens
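
Note: the change above trades exact Gemini token counts for a local approximation. A standalone sketch of local counting with tiktoken (the encoding choice is an assumption; cognee's TikTokenTokenizer wrapper may select differently):

import tiktoken  # pip install tiktoken

def count_tokens(text: str) -> int:
    # cl100k_base is a general-purpose encoding; counts are approximate for
    # non-OpenAI models such as Gemini, but need no API round-trip.
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

print(count_tokens("Quantum computers use qubits."))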
cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py
@@ -54,12 +54,14 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
         max_completion_tokens: int = 512,
         endpoint: Optional[str] = "http://localhost:11434/api/embeddings",
         huggingface_tokenizer: str = "Salesforce/SFR-Embedding-Mistral",
+        batch_size: int = 100,
     ):
         self.model = model
         self.dimensions = dimensions
         self.max_completion_tokens = max_completion_tokens
         self.endpoint = endpoint
         self.huggingface_tokenizer_name = huggingface_tokenizer
+        self.batch_size = batch_size
         self.tokenizer = self.get_tokenizer()
 
         enable_mocking = os.getenv("MOCK_EMBEDDING", "false")
@@ -122,6 +124,15 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
         """
         return self.dimensions
 
+    def get_batch_size(self) -> int:
+        """
+        Return the desired batch size for embedding calls
+
+        Returns:
+
+        """
+        return self.batch_size
+
     def get_tokenizer(self):
         """
         Load and return a HuggingFace tokenizer for the embedding engine.
cognee/infrastructure/databases/vector/embeddings/config.py
@@ -19,9 +19,17 @@ class EmbeddingConfig(BaseSettings):
     embedding_api_key: Optional[str] = None
     embedding_api_version: Optional[str] = None
     embedding_max_completion_tokens: Optional[int] = 8191
+    embedding_batch_size: Optional[int] = None
     huggingface_tokenizer: Optional[str] = None
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
+    def model_post_init(self, __context) -> None:
+        # If embedding batch size is not defined use 2048 as default for OpenAI and 100 for all other embedding models
+        if not self.embedding_batch_size and self.embedding_provider.lower() == "openai":
+            self.embedding_batch_size = 2048
+        elif not self.embedding_batch_size:
+            self.embedding_batch_size = 100
+
     def to_dict(self) -> dict:
         """
         Serialize all embedding configuration settings to a dictionary.
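
Note: the net effect of model_post_init is provider-dependent defaults. A self-contained sketch of the same resolution logic (assuming pydantic-settings, with unrelated fields omitted and no overriding environment variables set):

from typing import Optional
from pydantic_settings import BaseSettings

class EmbeddingConfigSketch(BaseSettings):
    embedding_provider: str = "openai"
    embedding_batch_size: Optional[int] = None  # EMBEDDING_BATCH_SIZE env var overrides

    def model_post_init(self, __context) -> None:
        # OpenAI tolerates large embedding batches; everything else defaults to 100.
        if not self.embedding_batch_size and self.embedding_provider.lower() == "openai":
            self.embedding_batch_size = 2048
        elif not self.embedding_batch_size:
            self.embedding_batch_size = 100

assert EmbeddingConfigSketch().embedding_batch_size == 2048
assert EmbeddingConfigSketch(embedding_provider="ollama").embedding_batch_size == 100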
cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py
@@ -31,6 +31,7 @@ def get_embedding_engine() -> EmbeddingEngine:
         config.embedding_endpoint,
         config.embedding_api_key,
         config.embedding_api_version,
+        config.embedding_batch_size,
         config.huggingface_tokenizer,
         llm_config.llm_api_key,
         llm_config.llm_provider,
@@ -46,6 +47,7 @@ def create_embedding_engine(
     embedding_endpoint,
     embedding_api_key,
     embedding_api_version,
+    embedding_batch_size,
    huggingface_tokenizer,
    llm_api_key,
    llm_provider,
@@ -84,6 +86,7 @@ def create_embedding_engine(
             model=embedding_model,
             dimensions=embedding_dimensions,
             max_completion_tokens=embedding_max_completion_tokens,
+            batch_size=embedding_batch_size,
         )
 
     if embedding_provider == "ollama":
@@ -95,6 +98,7 @@ def create_embedding_engine(
             max_completion_tokens=embedding_max_completion_tokens,
             endpoint=embedding_endpoint,
             huggingface_tokenizer=huggingface_tokenizer,
+            batch_size=embedding_batch_size,
         )
 
     from .LiteLLMEmbeddingEngine import LiteLLMEmbeddingEngine
@@ -108,4 +112,5 @@ def create_embedding_engine(
         model=embedding_model,
         dimensions=embedding_dimensions,
         max_completion_tokens=embedding_max_completion_tokens,
+        batch_size=embedding_batch_size,
     )
cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py
@@ -205,9 +205,12 @@ class LanceDBAdapter(VectorDBInterface):
         collection = await self.get_collection(collection_name)
 
         if len(data_point_ids) == 1:
-            results = await collection.query().where(f"id = '{data_point_ids[0]}'").to_pandas()
+            results = await collection.query().where(f"id = '{data_point_ids[0]}'")
         else:
-            results = await collection.query().where(f"id IN {tuple(data_point_ids)}").to_pandas()
+            results = await collection.query().where(f"id IN {tuple(data_point_ids)}")
+
+        # Convert query results to list format
+        results_list = results.to_list() if hasattr(results, "to_list") else list(results)
 
         return [
             ScoredResult(
@@ -215,7 +218,7 @@ class LanceDBAdapter(VectorDBInterface):
                 payload=result["payload"],
                 score=0,
             )
-            for result in results.to_dict("index").values()
+            for result in results_list
         ]
 
     async def search(
@@ -223,7 +226,7 @@ class LanceDBAdapter(VectorDBInterface):
         collection_name: str,
         query_text: str = None,
         query_vector: List[float] = None,
-        limit: int = 15,
+        limit: Optional[int] = 15,
         with_vector: bool = False,
         normalized: bool = True,
     ):
@@ -235,16 +238,14 @@ class LanceDBAdapter(VectorDBInterface):
 
         collection = await self.get_collection(collection_name)
 
-        if limit == 0:
+        if limit is None:
             limit = await collection.count_rows()
 
         # LanceDB search will break if limit is 0 so we must return
-        if limit == 0:
+        if limit <= 0:
             return []
 
-        results = await collection.vector_search(query_vector).limit(limit).to_pandas()
-
-        result_values = list(results.to_dict("index").values())
+        result_values = await collection.vector_search(query_vector).limit(limit).to_list()
 
         if not result_values:
             return []
@@ -264,7 +265,7 @@ class LanceDBAdapter(VectorDBInterface):
         self,
         collection_name: str,
         query_texts: List[str],
-        limit: int = None,
+        limit: Optional[int] = None,
         with_vectors: bool = False,
     ):
         query_vectors = await self.embedding_engine.embed_text(query_texts)
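
Note: after this change, limit=None means "return every row", while an empty collection short-circuits to an empty list. A hypothetical caller (adapter construction elided):

from typing import Any, List

async def fetch_all_matches(adapter: Any, collection_name: str, query: str) -> List[Any]:
    # limit=None: the adapter counts the rows and uses that count as the limit,
    # so an empty collection yields [] instead of breaking LanceDB.
    return await adapter.search(
        collection_name=collection_name,
        query_text=query,
        limit=None,
    )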
cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py
@@ -3,13 +3,12 @@ from typing import List, Optional, get_type_hints
 from sqlalchemy.inspection import inspect
 from sqlalchemy.orm import Mapped, mapped_column
 from sqlalchemy.dialects.postgresql import insert
-from sqlalchemy import JSON, Column, Table, select, delete, MetaData
+from sqlalchemy import JSON, Column, Table, select, delete, MetaData, func
 from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker
 from sqlalchemy.exc import ProgrammingError
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
 from asyncpg import DeadlockDetectedError, DuplicateTableError, UniqueViolationError
 
-
 from cognee.shared.logging_utils import get_logger
 from cognee.infrastructure.engine import DataPoint
 from cognee.infrastructure.engine.utils import parse_id
@@ -126,41 +125,42 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
         data_point_types = get_type_hints(DataPoint)
         vector_size = self.embedding_engine.get_vector_size()
 
-        async with self.VECTOR_DB_LOCK:
-            if not await self.has_collection(collection_name):
-
-                class PGVectorDataPoint(Base):
-                    """
-                    Represent a point in a vector data space with associated data and vector representation.
-
-                    This class inherits from Base and is associated with a database table defined by
-                    __tablename__. It maintains the following public methods and instance variables:
-
-                    - __init__(self, id, payload, vector): Initializes a new PGVectorDataPoint instance.
-
-                    Instance variables:
-                    - id: Identifier for the data point, defined by data_point_types.
-                    - payload: JSON data associated with the data point.
-                    - vector: Vector representation of the data point, with size defined by vector_size.
-                    """
-
-                    __tablename__ = collection_name
-                    __table_args__ = {"extend_existing": True}
-                    # PGVector requires one column to be the primary key
-                    id: Mapped[data_point_types["id"]] = mapped_column(primary_key=True)
-                    payload = Column(JSON)
-                    vector = Column(self.Vector(vector_size))
-
-                    def __init__(self, id, payload, vector):
-                        self.id = id
-                        self.payload = payload
-                        self.vector = vector
-
-                async with self.engine.begin() as connection:
-                    if len(Base.metadata.tables.keys()) > 0:
-                        await connection.run_sync(
-                            Base.metadata.create_all, tables=[PGVectorDataPoint.__table__]
-                        )
+        if not await self.has_collection(collection_name):
+            async with self.VECTOR_DB_LOCK:
+                if not await self.has_collection(collection_name):
+
+                    class PGVectorDataPoint(Base):
+                        """
+                        Represent a point in a vector data space with associated data and vector representation.
+
+                        This class inherits from Base and is associated with a database table defined by
+                        __tablename__. It maintains the following public methods and instance variables:
+
+                        - __init__(self, id, payload, vector): Initializes a new PGVectorDataPoint instance.
+
+                        Instance variables:
+                        - id: Identifier for the data point, defined by data_point_types.
+                        - payload: JSON data associated with the data point.
+                        - vector: Vector representation of the data point, with size defined by vector_size.
+                        """
+
+                        __tablename__ = collection_name
+                        __table_args__ = {"extend_existing": True}
+                        # PGVector requires one column to be the primary key
+                        id: Mapped[data_point_types["id"]] = mapped_column(primary_key=True)
+                        payload = Column(JSON)
+                        vector = Column(self.Vector(vector_size))
+
+                        def __init__(self, id, payload, vector):
+                            self.id = id
+                            self.payload = payload
+                            self.vector = vector
+
+                    async with self.engine.begin() as connection:
+                        if len(Base.metadata.tables.keys()) > 0:
+                            await connection.run_sync(
+                                Base.metadata.create_all, tables=[PGVectorDataPoint.__table__]
+                            )
 
     @retry(
         retry=retry_if_exception_type(DeadlockDetectedError),
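
Note: the reordering above is classic double-checked locking: the cheap existence check runs before the lock is taken and is repeated after acquisition, so concurrent callers neither serialize on the common path nor race to create the same table. A minimal generic sketch of the pattern:

import asyncio

_lock = asyncio.Lock()
_created: set = set()  # stand-in for the real has_collection() check

async def ensure_collection(name: str) -> None:
    # Fast path: no lock needed when the collection already exists.
    if name not in _created:
        async with _lock:
            # Re-check under the lock: another coroutine may have created it
            # between our first check and lock acquisition.
            if name not in _created:
                _created.add(name)  # real code would create the table here

async def main() -> None:
    await asyncio.gather(*(ensure_collection("chunks") for _ in range(5)))

asyncio.run(main())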
@@ -299,7 +299,7 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
         collection_name: str,
         query_text: Optional[str] = None,
         query_vector: Optional[List[float]] = None,
-        limit: int = 15,
+        limit: Optional[int] = 15,
         with_vector: bool = False,
     ) -> List[ScoredResult]:
         if query_text is None and query_vector is None:
@@ -311,6 +311,16 @@ class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface):
         # Get PGVectorDataPoint Table from database
         PGVectorDataPoint = await self.get_table(collection_name)
 
+        if limit is None:
+            async with self.get_async_session() as session:
+                query = select(func.count()).select_from(PGVectorDataPoint)
+                result = await session.execute(query)
+                limit = result.scalar_one()
+
+        # If limit is still 0, no need to do the search, just return empty results
+        if limit <= 0:
+            return []
+
         # NOTE: This needs to be initialized in case search doesn't return a value
         closest_items = []
 
cognee/infrastructure/databases/vector/vector_db_interface.py
@@ -83,7 +83,7 @@ class VectorDBInterface(Protocol):
         collection_name: str,
         query_text: Optional[str],
         query_vector: Optional[List[float]],
-        limit: int,
+        limit: Optional[int],
         with_vector: bool = False,
     ):
         """
@@ -98,7 +98,7 @@ class VectorDBInterface(Protocol):
           collection.
         - query_vector (Optional[List[float]]): An optional vector representation for
           searching the collection.
-        - limit (int): The maximum number of results to return from the search.
+        - limit (Optional[int]): The maximum number of results to return from the search.
         - with_vector (bool): Whether to return the vector representations with search
           results. (default False)
         """
@@ -106,7 +106,11 @@ class VectorDBInterface(Protocol):
 
     @abstractmethod
     async def batch_search(
-        self, collection_name: str, query_texts: List[str], limit: int, with_vectors: bool = False
+        self,
+        collection_name: str,
+        query_texts: List[str],
+        limit: Optional[int],
+        with_vectors: bool = False,
     ):
         """
         Perform a batch search using multiple text queries against a collection.
@@ -116,7 +120,7 @@ class VectorDBInterface(Protocol):
 
         - collection_name (str): The name of the collection to conduct the batch search in.
         - query_texts (List[str]): A list of text queries to use for the search.
-        - limit (int): The maximum number of results to return for each query.
+        - limit (Optional[int]): The maximum number of results to return for each query.
         - with_vectors (bool): Whether to include vector representations with search
           results. (default False)
         """
cognee/infrastructure/files/storage/S3FileStorage.py
@@ -1,6 +1,5 @@
 import os
-import s3fs
-from typing import BinaryIO, Union
+from typing import BinaryIO, Union, TYPE_CHECKING
 from contextlib import asynccontextmanager
 
 from cognee.infrastructure.files.storage.s3_config import get_s3_config
@@ -8,23 +7,34 @@ from cognee.infrastructure.utils.run_async import run_async
 from cognee.infrastructure.files.storage.FileBufferedReader import FileBufferedReader
 from .storage import Storage
 
+if TYPE_CHECKING:
+    import s3fs
+
 
 class S3FileStorage(Storage):
     """
-    Manage local file storage operations such as storing, retrieving, and managing files on
-    the filesystem.
+    Manage S3 file storage operations such as storing, retrieving, and managing files on
+    S3-compatible storage.
     """
 
     storage_path: str
-    s3: s3fs.S3FileSystem
+    s3: "s3fs.S3FileSystem"
 
     def __init__(self, storage_path: str):
+        try:
+            import s3fs
+        except ImportError:
+            raise ImportError(
+                's3fs is required for S3FileStorage. Install it with: pip install cognee"[aws]"'
+            )
+
         self.storage_path = storage_path
         s3_config = get_s3_config()
         if s3_config.aws_access_key_id is not None and s3_config.aws_secret_access_key is not None:
             self.s3 = s3fs.S3FileSystem(
                 key=s3_config.aws_access_key_id,
                 secret=s3_config.aws_secret_access_key,
+                token=s3_config.aws_session_token,
                 anon=False,
                 endpoint_url=s3_config.aws_endpoint_url,
                 client_kwargs={"region_name": s3_config.aws_region},
cognee/infrastructure/files/storage/s3_config.py
@@ -8,6 +8,7 @@ class S3Config(BaseSettings):
     aws_endpoint_url: Optional[str] = None
     aws_access_key_id: Optional[str] = None
     aws_secret_access_key: Optional[str] = None
+    aws_session_token: Optional[str] = None
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
 
cognee/infrastructure/files/utils/open_data_file.py
@@ -4,7 +4,6 @@ from urllib.parse import urlparse
 from contextlib import asynccontextmanager
 
 from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
-from cognee.infrastructure.files.storage.S3FileStorage import S3FileStorage
 from cognee.infrastructure.files.storage.LocalFileStorage import LocalFileStorage
 
 
@@ -23,23 +22,17 @@ async def open_data_file(file_path: str, mode: str = "rb", encoding: str = None,
             yield file
 
     elif file_path.startswith("s3://"):
+        try:
+            from cognee.infrastructure.files.storage.S3FileStorage import S3FileStorage
+        except ImportError:
+            raise ImportError(
+                "S3 dependencies are not installed. Please install with 'pip install cognee\"[aws]\"' to use S3 functionality."
+            )
+
         normalized_url = get_data_file_path(file_path)
         s3_dir_path = os.path.dirname(normalized_url)
         s3_filename = os.path.basename(normalized_url)
 
-        # if "/" in s3_path:
-        #     s3_dir = "/".join(s3_path.split("/")[:-1])
-        #     s3_filename = s3_path.split("/")[-1]
-        # else:
-        #     s3_dir = ""
-        #     s3_filename = s3_path
-
-        # Extract filesystem path from S3 URL structure
-        # file_dir_path = (
-        #     f"s3://{parsed_url.netloc}/{s3_dir}" if s3_dir else f"s3://{parsed_url.netloc}"
-        # )
-        # file_name = s3_filename
-
         file_storage = S3FileStorage(s3_dir_path)
 
         async with file_storage.open(s3_filename, mode=mode, **kwargs) as file:
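
Note: callers are unchanged; the ImportError now simply surfaces lazily on first S3 access. A hypothetical usage (bucket and key are placeholders; requires the aws extra, and read() is assumed synchronous on the yielded file object):

import asyncio
from cognee.infrastructure.files.utils.open_data_file import open_data_file

async def read_s3_document() -> bytes:
    # Raises the descriptive ImportError above if s3fs is not installed.
    async with open_data_file("s3://example-bucket/docs/report.pdf", mode="rb") as file:
        return file.read()

asyncio.run(read_s3_document())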