cognee 0.5.1__py3-none-any.whl → 0.5.1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/api/v1/add/add.py +2 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +1 -0
- cognee/api/v1/memify/routers/get_memify_router.py +1 -0
- cognee/api/v1/search/search.py +0 -4
- cognee/infrastructure/databases/relational/config.py +16 -1
- cognee/infrastructure/databases/relational/create_relational_engine.py +13 -3
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +24 -2
- cognee/infrastructure/databases/vector/create_vector_engine.py +9 -2
- cognee/infrastructure/llm/LLMGateway.py +0 -13
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +17 -12
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +31 -25
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +132 -7
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +5 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llm_interface.py +2 -6
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +58 -13
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +0 -1
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +25 -131
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/types.py +10 -0
- cognee/modules/data/models/Data.py +2 -1
- cognee/modules/retrieval/triplet_retriever.py +1 -1
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +0 -18
- cognee/modules/search/methods/search.py +18 -25
- cognee/tasks/ingestion/data_item.py +8 -0
- cognee/tasks/ingestion/ingest_data.py +12 -1
- cognee/tasks/ingestion/save_data_item_to_storage.py +5 -0
- cognee/tests/integration/retrieval/test_chunks_retriever.py +252 -0
- cognee/tests/integration/retrieval/test_graph_completion_retriever.py +268 -0
- cognee/tests/integration/retrieval/test_graph_completion_retriever_context_extension.py +226 -0
- cognee/tests/integration/retrieval/test_graph_completion_retriever_cot.py +218 -0
- cognee/tests/integration/retrieval/test_rag_completion_retriever.py +254 -0
- cognee/tests/{unit/modules/retrieval/structured_output_test.py → integration/retrieval/test_structured_output.py} +87 -77
- cognee/tests/integration/retrieval/test_summaries_retriever.py +184 -0
- cognee/tests/integration/retrieval/test_temporal_retriever.py +306 -0
- cognee/tests/integration/retrieval/test_triplet_retriever.py +35 -0
- cognee/tests/test_custom_data_label.py +68 -0
- cognee/tests/test_search_db.py +334 -181
- cognee/tests/unit/eval_framework/benchmark_adapters_test.py +25 -0
- cognee/tests/unit/eval_framework/corpus_builder_test.py +33 -4
- cognee/tests/unit/infrastructure/databases/relational/test_RelationalConfig.py +69 -0
- cognee/tests/unit/modules/retrieval/chunks_retriever_test.py +181 -199
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +338 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +454 -162
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +674 -156
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +625 -200
- cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +319 -203
- cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +189 -155
- cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +539 -58
- cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +218 -9
- cognee/tests/unit/modules/retrieval/test_completion.py +343 -0
- cognee/tests/unit/modules/retrieval/test_graph_summary_completion_retriever.py +157 -0
- cognee/tests/unit/modules/retrieval/test_user_qa_feedback.py +312 -0
- cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +246 -0
- cognee/tests/unit/modules/search/test_search.py +0 -100
- {cognee-0.5.1.dist-info → cognee-0.5.1.dev0.dist-info}/METADATA +1 -1
- {cognee-0.5.1.dist-info → cognee-0.5.1.dev0.dist-info}/RECORD +58 -45
- {cognee-0.5.1.dist-info → cognee-0.5.1.dev0.dist-info}/WHEEL +0 -0
- {cognee-0.5.1.dist-info → cognee-0.5.1.dev0.dist-info}/entry_points.txt +0 -0
- {cognee-0.5.1.dist-info → cognee-0.5.1.dev0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.5.1.dist-info → cognee-0.5.1.dev0.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py
CHANGED

@@ -1,13 +1,13 @@
 import litellm
 import instructor
 from pydantic import BaseModel
-from typing import Type
+from typing import Type, Optional
 from litellm import JSONSchemaValidationError
-
+from cognee.infrastructure.files.utils.open_data_file import open_data_file
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.observability.get_observe import get_observe
-from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
-    LLMInterface,
+from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.generic_llm_api.adapter import (
+    GenericAPIAdapter,
 )
 from cognee.infrastructure.llm.config import get_llm_config
 from cognee.shared.rate_limiting import llm_rate_limiter_context_manager

@@ -20,12 +20,14 @@ from tenacity import (
     retry_if_not_exception_type,
     before_sleep_log,
 )
+from ..types import TranscriptionReturnType
+from mistralai import Mistral

 logger = get_logger()
 observe = get_observe()


-class MistralAdapter(LLMInterface):
+class MistralAdapter(GenericAPIAdapter):
     """
     Adapter for Mistral AI API, for structured output generation and prompt display.

@@ -34,10 +36,6 @@ class MistralAdapter(LLMInterface):
     - show_prompt
     """

-    name = "Mistral"
-    model: str
-    api_key: str
-    max_completion_tokens: int
     default_instructor_mode = "mistral_tools"

     def __init__(

@@ -46,12 +44,19 @@ class MistralAdapter(LLMInterface):
         model: str,
         max_completion_tokens: int,
         endpoint: str = None,
+        transcription_model: str = None,
+        image_transcribe_model: str = None,
         instructor_mode: str = None,
     ):
-
-
-
-
+        super().__init__(
+            api_key=api_key,
+            model=model,
+            max_completion_tokens=max_completion_tokens,
+            name="Mistral",
+            endpoint=endpoint,
+            transcription_model=transcription_model,
+            image_transcribe_model=image_transcribe_model,
+        )

         self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode

@@ -60,7 +65,9 @@ class MistralAdapter(LLMInterface):
             mode=instructor.Mode(self.instructor_mode),
             api_key=get_llm_config().llm_api_key,
         )
+        self.mistral_client = Mistral(api_key=self.api_key)

+    @observe(as_type="generation")
     @retry(
         stop=stop_after_delay(128),
         wait=wait_exponential_jitter(8, 128),

@@ -119,3 +126,41 @@ class MistralAdapter(LLMInterface):
             logger.error(f"Schema validation failed: {str(e)}")
             logger.debug(f"Raw response: {e.raw_response}")
             raise ValueError(f"Response failed schema validation: {str(e)}")
+
+    @observe(as_type="transcription")
+    @retry(
+        stop=stop_after_delay(128),
+        wait=wait_exponential_jitter(2, 128),
+        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
+        before_sleep=before_sleep_log(logger, logging.DEBUG),
+        reraise=True,
+    )
+    async def create_transcript(self, input) -> Optional[TranscriptionReturnType]:
+        """
+        Generate an audio transcript from a user query.
+
+        This method creates a transcript from the specified audio file.
+        The audio file is processed and the transcription is retrieved from the API.
+
+        Parameters:
+        -----------
+        - input: The path to the audio file that needs to be transcribed.
+
+        Returns:
+        --------
+        The generated transcription of the audio file.
+        """
+        transcription_model = self.transcription_model
+        if self.transcription_model.startswith("mistral"):
+            transcription_model = self.transcription_model.split("/")[-1]
+        file_name = input.split("/")[-1]
+        async with open_data_file(input, mode="rb") as f:
+            transcription_response = self.mistral_client.audio.transcriptions.complete(
+                model=transcription_model,
+                file={
+                    "content": f,
+                    "file_name": file_name,
+                },
+            )
+
+        return TranscriptionReturnType(transcription_response.text, transcription_response)
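For context, a minimal usage sketch of the new Mistral transcription path. The API key, model names, and file path below are illustrative placeholders, and the .text attribute on the result is an assumption inferred from the positional TranscriptionReturnType(text, raw_response) call in the diff above:

import asyncio

from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.mistral.adapter import (
    MistralAdapter,
)


async def main():
    # Keyword arguments mirror the new __init__ signature in the diff above.
    # create_transcript strips a leading "mistral/" prefix from the
    # transcription model name before calling the Mistral SDK.
    adapter = MistralAdapter(
        api_key="<MISTRAL_API_KEY>",  # placeholder, not a real credential
        model="mistral/mistral-large-latest",  # illustrative model name
        max_completion_tokens=4096,
        transcription_model="mistral/voxtral-mini-latest",  # illustrative
    )
    result = await adapter.create_transcript("/path/to/audio.mp3")
    # Assumes TranscriptionReturnType exposes the transcript as its first field.
    print(result.text)


asyncio.run(main())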
cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py
CHANGED

@@ -12,7 +12,6 @@ from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.ll
 from cognee.infrastructure.files.utils.open_data_file import open_data_file
 from cognee.shared.logging_utils import get_logger
 from cognee.shared.rate_limiting import llm_rate_limiter_context_manager
-
 from tenacity import (
     retry,
     stop_after_delay,
cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py
CHANGED

@@ -1,4 +1,3 @@
-import base64
 import litellm
 import instructor
 from typing import Type

@@ -16,8 +15,8 @@ from tenacity import (
     before_sleep_log,
 )

-from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.llm_interface import (
-    LLMInterface,
+from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.generic_llm_api.adapter import (
+    GenericAPIAdapter,
 )
 from cognee.infrastructure.llm.exceptions import (
     ContentPolicyFilterError,

@@ -26,13 +25,16 @@ from cognee.shared.rate_limiting import llm_rate_limiter_context_manager
 from cognee.infrastructure.files.utils.open_data_file import open_data_file
 from cognee.modules.observability.get_observe import get_observe
 from cognee.shared.logging_utils import get_logger
+from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.types import (
+    TranscriptionReturnType,
+)

 logger = get_logger()

 observe = get_observe()


-class OpenAIAdapter(LLMInterface):
+class OpenAIAdapter(GenericAPIAdapter):
     """
     Adapter for OpenAI's GPT-3, GPT-4 API.

@@ -53,12 +55,7 @@ class OpenAIAdapter(LLMInterface):
     - MAX_RETRIES
     """

-    name = "OpenAI"
-    model: str
-    api_key: str
-    api_version: str
     default_instructor_mode = "json_schema_mode"
-
     MAX_RETRIES = 5

     """Adapter for OpenAI's GPT-3, GPT=4 API"""

@@ -66,17 +63,29 @@ class OpenAIAdapter(LLMInterface):
     def __init__(
         self,
         api_key: str,
-        endpoint: str,
-        api_version: str,
         model: str,
-        transcription_model: str,
         max_completion_tokens: int,
+        endpoint: str = None,
+        api_version: str = None,
+        transcription_model: str = None,
         instructor_mode: str = None,
         streaming: bool = False,
         fallback_model: str = None,
         fallback_api_key: str = None,
         fallback_endpoint: str = None,
     ):
+        super().__init__(
+            api_key=api_key,
+            model=model,
+            max_completion_tokens=max_completion_tokens,
+            name="OpenAI",
+            endpoint=endpoint,
+            api_version=api_version,
+            transcription_model=transcription_model,
+            fallback_model=fallback_model,
+            fallback_api_key=fallback_api_key,
+            fallback_endpoint=fallback_endpoint,
+        )
         self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode
         # TODO: With gpt5 series models OpenAI expects JSON_SCHEMA as a mode for structured outputs.
         # Make sure all new gpt models will work with this mode as well.

@@ -91,18 +100,8 @@ class OpenAIAdapter(LLMInterface):
         self.aclient = instructor.from_litellm(litellm.acompletion)
         self.client = instructor.from_litellm(litellm.completion)

-        self.transcription_model = transcription_model
-        self.model = model
-        self.api_key = api_key
-        self.endpoint = endpoint
-        self.api_version = api_version
-        self.max_completion_tokens = max_completion_tokens
         self.streaming = streaming

-        self.fallback_model = fallback_model
-        self.fallback_api_key = fallback_api_key
-        self.fallback_endpoint = fallback_endpoint
-
     @observe(as_type="generation")
     @retry(
         stop=stop_after_delay(128),

@@ -198,7 +197,7 @@ class OpenAIAdapter(LLMInterface):
                 f"The provided input contains content that is not aligned with our content policy: {text_input}"
             ) from error

-    @observe
+    @observe(as_type="transcription")
     @retry(
         stop=stop_after_delay(128),
         wait=wait_exponential_jitter(2, 128),

@@ -206,58 +205,7 @@ class OpenAIAdapter(LLMInterface):
         before_sleep=before_sleep_log(logger, logging.DEBUG),
         reraise=True,
     )
-    def create_structured_output(
-        self, text_input: str, system_prompt: str, response_model: Type[BaseModel], **kwargs
-    ) -> BaseModel:
-        """
-        Generate a response from a user query.
-
-        This method creates structured output by sending a synchronous request to the OpenAI API
-        using the provided parameters to generate a completion based on the user input and
-        system prompt.
-
-        Parameters:
-        -----------
-
-        - text_input (str): The input text provided by the user for generating a response.
-        - system_prompt (str): The system's prompt to guide the model's response.
-        - response_model (Type[BaseModel]): The expected model type for the response.
-
-        Returns:
-        --------
-
-        - BaseModel: A structured output generated by the model, returned as an instance of
-          BaseModel.
-        """
-
-        return self.client.chat.completions.create(
-            model=self.model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": f"""{text_input}""",
-                },
-                {
-                    "role": "system",
-                    "content": system_prompt,
-                },
-            ],
-            api_key=self.api_key,
-            api_base=self.endpoint,
-            api_version=self.api_version,
-            response_model=response_model,
-            max_retries=self.MAX_RETRIES,
-            **kwargs,
-        )
-
-    @retry(
-        stop=stop_after_delay(128),
-        wait=wait_exponential_jitter(2, 128),
-        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
-        before_sleep=before_sleep_log(logger, logging.DEBUG),
-        reraise=True,
-    )
-    async def create_transcript(self, input, **kwargs):
+    async def create_transcript(self, input, **kwargs) -> TranscriptionReturnType:
         """
         Generate an audio transcript from a user query.

@@ -286,60 +234,6 @@ class OpenAIAdapter(LLMInterface):
             max_retries=self.MAX_RETRIES,
             **kwargs,
         )
+        return TranscriptionReturnType(transcription.text, transcription)

-
-
-    @retry(
-        stop=stop_after_delay(128),
-        wait=wait_exponential_jitter(2, 128),
-        retry=retry_if_not_exception_type(litellm.exceptions.NotFoundError),
-        before_sleep=before_sleep_log(logger, logging.DEBUG),
-        reraise=True,
-    )
-    async def transcribe_image(self, input, **kwargs) -> BaseModel:
-        """
-        Generate a transcription of an image from a user query.
-
-        This method encodes the image and sends a request to the OpenAI API to obtain a
-        description of the contents of the image.
-
-        Parameters:
-        -----------
-
-        - input: The path to the image file that needs to be transcribed.
-
-        Returns:
-        --------
-
-        - BaseModel: A structured output generated by the model, returned as an instance of
-          BaseModel.
-        """
-        async with open_data_file(input, mode="rb") as image_file:
-            encoded_image = base64.b64encode(image_file.read()).decode("utf-8")
-
-        return litellm.completion(
-            model=self.model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "What's in this image?",
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/jpeg;base64,{encoded_image}",
-                            },
-                        },
-                    ],
-                },
-            ],
-            api_key=self.api_key,
-            api_base=self.endpoint,
-            api_version=self.api_version,
-            max_completion_tokens=300,
-            max_retries=self.MAX_RETRIES,
-            **kwargs,
-        )
+    # transcribe_image is inherited from GenericAPIAdapter
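Both adapters now funnel transcription results through TranscriptionReturnType, added in types.py (+10 lines in the file list above). Its definition is not shown in this diff; a plausible minimal shape, inferred only from the positional TranscriptionReturnType(text, raw_response) call sites above, would be:

from dataclasses import dataclass
from typing import Any


@dataclass
class TranscriptionReturnType:
    # Field order inferred from the call sites in the diff:
    # the transcript text first, then the raw provider response.
    text: str
    raw_response: Any

Normalizing on a single return type lets callers treat OpenAI and Mistral transcription results interchangeably instead of depending on each provider's raw response object.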
cognee/modules/data/models/Data.py
CHANGED

@@ -13,7 +13,7 @@ class Data(Base):
     __tablename__ = "data"

     id = Column(UUID, primary_key=True, default=uuid4)
-
+    label = Column(String, nullable=True)
     name = Column(String)
     extension = Column(String)
     mime_type = Column(String)

@@ -49,6 +49,7 @@ class Data(Base):
         return {
             "id": str(self.id),
             "name": self.name,
+            "label": self.label,
             "extension": self.extension,
             "mimeType": self.mime_type,
             "rawDataLocation": self.raw_data_location,
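With the new nullable column, the serialization method shown above now carries the label next to the file metadata. A sketch of the resulting dict, with keys taken from the diff and values invented for illustration:

# Keys follow the dict built above; every value here is made up.
example_row = {
    "id": "6f1c3e9a-0b2d-4c8e-9f1a-2d3b4c5e6f70",
    "name": "support_ticket_001",
    "label": "support-ticket",  # None when data was ingested without a label
    "extension": "txt",
    "mimeType": "text/plain",
    "rawDataLocation": "/data/raw/support_ticket_001.txt",
}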
cognee/modules/retrieval/triplet_retriever.py
CHANGED

@@ -36,7 +36,7 @@ class TripletRetriever(BaseRetriever):
         """Initialize retriever with optional custom prompt paths."""
         self.user_prompt_path = user_prompt_path
         self.system_prompt_path = system_prompt_path
-        self.top_k = top_k if top_k is not None else
+        self.top_k = top_k if top_k is not None else 5
         self.system_prompt = system_prompt

     async def get_context(self, query: str) -> str:
cognee/modules/retrieval/utils/brute_force_triplet_search.py
CHANGED

@@ -16,24 +16,6 @@ logger = get_logger(level=ERROR)


 def format_triplets(edges):
-    print("\n\n\n")
-
-    def filter_attributes(obj, attributes):
-        """Helper function to filter out non-None properties, including nested dicts."""
-        result = {}
-        for attr in attributes:
-            value = getattr(obj, attr, None)
-            if value is not None:
-                # If the value is a dict, extract relevant keys from it
-                if isinstance(value, dict):
-                    nested_values = {
-                        k: v for k, v in value.items() if k in attributes and v is not None
-                    }
-                    result[attr] = nested_values
-                else:
-                    result[attr] = value
-        return result
-
     triplets = []
     for edge in edges:
         node1 = edge.node1
cognee/modules/search/methods/search.py
CHANGED

@@ -49,7 +49,6 @@ async def search(
     session_id: Optional[str] = None,
     wide_search_top_k: Optional[int] = 100,
     triplet_distance_penalty: Optional[float] = 3.5,
-    verbose: bool = False,
 ) -> Union[CombinedSearchResult, List[SearchResult]]:
     """

@@ -141,7 +140,6 @@ async def search(
     )

     if use_combined_context:
-        # Note: combined context search must always be verbose and return a CombinedSearchResult with graphs info
         prepared_search_results = await prepare_search_result(
             search_results[0] if isinstance(search_results, list) else search_results
         )

@@ -175,30 +173,25 @@ async def search(
         datasets = prepared_search_results["datasets"]

         if only_context:
-
-
-
-
-
-
-
-
-
-
-            return_value.append(search_result_dict)
+            return_value.append(
+                {
+                    "search_result": [context] if context else None,
+                    "dataset_id": datasets[0].id,
+                    "dataset_name": datasets[0].name,
+                    "dataset_tenant_id": datasets[0].tenant_id,
+                    "graphs": graphs,
+                }
+            )
         else:
-
-
-
-
-
-
-
-
-
-            return_value.append(search_result_dict)
-
+            return_value.append(
+                {
+                    "search_result": [result] if result else None,
+                    "dataset_id": datasets[0].id,
+                    "dataset_name": datasets[0].name,
+                    "dataset_tenant_id": datasets[0].tenant_id,
+                    "graphs": graphs,
+                }
+            )
         return return_value
     else:
         return_value = []
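The net effect is that each per-dataset entry now carries graph context unconditionally, where previously the removed verbose flag gated what the result dict contained. A sketch of one element of the returned list, with keys taken from the append calls above and values purely illustrative:

# Shape of one entry in the list returned by search(); all values are illustrative.
entry = {
    "search_result": ["<retriever output>"],  # None when nothing matched
    "dataset_id": "<UUID of datasets[0]>",
    "dataset_name": "my_dataset",
    "dataset_tenant_id": "<tenant UUID or None>",
    "graphs": "<graph context prepared by prepare_search_result>",
}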
cognee/tasks/ingestion/ingest_data.py
CHANGED

@@ -20,6 +20,7 @@ from cognee.modules.data.methods import (

 from .save_data_item_to_storage import save_data_item_to_storage
 from .data_item_to_text_file import data_item_to_text_file
+from .data_item import DataItem


 async def ingest_data(

@@ -78,8 +79,16 @@ async def ingest_data(
     dataset_data_map = {str(data.id): True for data in dataset_data}

     for data_item in data:
+        # Support for DataItem (custom label + data wrapper)
+        current_label = None
+        underlying_data = data_item
+
+        if isinstance(data_item, DataItem):
+            underlying_data = data_item.data
+            current_label = data_item.label
+
         # Get file path of data item or create a file if it doesn't exist
-        original_file_path = await save_data_item_to_storage(data_item)
+        original_file_path = await save_data_item_to_storage(underlying_data)
         # Transform file path to be OS usable
         actual_file_path = get_data_file_path(original_file_path)

@@ -139,6 +148,7 @@ async def ingest_data(
             data_point.external_metadata = ext_metadata
             data_point.node_set = json.dumps(node_set) if node_set else None
             data_point.tenant_id = user.tenant_id if user.tenant_id else None
+            data_point.label = current_label

             # Check if data is already in dataset
             if str(data_point.id) in dataset_data_map:

@@ -169,6 +179,7 @@ async def ingest_data(
                 tenant_id=user.tenant_id if user.tenant_id else None,
                 pipeline_status={},
                 token_count=-1,
+                label=current_label,
             )

             new_datapoints.append(data_point)
cognee/tasks/ingestion/save_data_item_to_storage.py
CHANGED

@@ -9,6 +9,7 @@ from cognee.shared.logging_utils import get_logger
 from pydantic_settings import BaseSettings, SettingsConfigDict

 from cognee.tasks.web_scraper.utils import fetch_page_content
+from cognee.tasks.ingestion.data_item import DataItem


 logger = get_logger()

@@ -95,5 +96,9 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
         # data is text, save it to data storage and return the file path
         return await save_data_to_file(data_item)

+    if isinstance(data_item, DataItem):
+        # If instance is DataItem use the underlying data
+        return await save_data_item_to_storage(data_item.data)
+
     # data is not a supported type
     raise IngestionError(message=f"Data type not supported: {type(data_item)}")