cognee 0.3.4.dev3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- cognee/api/client.py +16 -7
- cognee/api/health.py +5 -9
- cognee/api/v1/add/add.py +3 -1
- cognee/api/v1/cognify/cognify.py +44 -7
- cognee/api/v1/permissions/routers/get_permissions_router.py +8 -4
- cognee/api/v1/search/search.py +3 -0
- cognee/api/v1/ui/__init__.py +1 -1
- cognee/api/v1/ui/ui.py +215 -150
- cognee/api/v1/update/__init__.py +1 -0
- cognee/api/v1/update/routers/__init__.py +1 -0
- cognee/api/v1/update/routers/get_update_router.py +90 -0
- cognee/api/v1/update/update.py +100 -0
- cognee/base_config.py +5 -2
- cognee/cli/_cognee.py +28 -10
- cognee/cli/commands/delete_command.py +34 -2
- cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +2 -2
- cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +3 -2
- cognee/eval_framework/modal_eval_dashboard.py +9 -1
- cognee/infrastructure/databases/graph/config.py +9 -9
- cognee/infrastructure/databases/graph/get_graph_engine.py +4 -21
- cognee/infrastructure/databases/graph/kuzu/adapter.py +60 -9
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +3 -3
- cognee/infrastructure/databases/relational/config.py +4 -4
- cognee/infrastructure/databases/relational/create_relational_engine.py +11 -3
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +7 -3
- cognee/infrastructure/databases/vector/config.py +7 -7
- cognee/infrastructure/databases/vector/create_vector_engine.py +7 -15
- cognee/infrastructure/databases/vector/embeddings/EmbeddingEngine.py +9 -0
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +11 -0
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +19 -2
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -0
- cognee/infrastructure/databases/vector/embeddings/config.py +8 -0
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +5 -0
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +11 -10
- cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +48 -38
- cognee/infrastructure/databases/vector/vector_db_interface.py +8 -4
- cognee/infrastructure/files/storage/S3FileStorage.py +15 -5
- cognee/infrastructure/files/storage/s3_config.py +1 -0
- cognee/infrastructure/files/utils/open_data_file.py +7 -14
- cognee/infrastructure/llm/LLMGateway.py +19 -117
- cognee/infrastructure/llm/config.py +28 -13
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_categories.py +2 -1
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_event_entities.py +3 -2
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_summary.py +3 -2
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_content_graph.py +2 -1
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_event_graph.py +3 -2
- cognee/infrastructure/llm/prompts/read_query_prompt.py +3 -2
- cognee/infrastructure/llm/prompts/show_prompt.py +35 -0
- cognee/infrastructure/llm/prompts/test.txt +1 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +50 -397
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +2 -3
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +8 -88
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +78 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +2 -99
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +49 -401
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +19 -882
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +2 -34
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +2 -107
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/acreate_structured_output.baml +26 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/__init__.py +1 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +76 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/create_dynamic_baml_type.py +122 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +3 -3
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +0 -32
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +107 -98
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +5 -6
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +5 -6
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llm_interface.py +0 -26
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +17 -67
- cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +8 -7
- cognee/infrastructure/llm/utils.py +4 -4
- cognee/infrastructure/loaders/LoaderEngine.py +5 -2
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +244 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/methods/create_authorized_dataset.py +9 -0
- cognee/modules/data/methods/get_authorized_dataset.py +1 -1
- cognee/modules/data/methods/get_authorized_dataset_by_name.py +11 -0
- cognee/modules/data/methods/get_deletion_counts.py +92 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +1 -1
- cognee/modules/graph/utils/expand_with_nodes_and_edges.py +22 -8
- cognee/modules/graph/utils/retrieve_existing_edges.py +0 -2
- cognee/modules/ingestion/data_types/TextData.py +0 -1
- cognee/modules/notebooks/methods/create_notebook.py +3 -1
- cognee/modules/notebooks/methods/get_notebooks.py +27 -1
- cognee/modules/observability/get_observe.py +14 -0
- cognee/modules/observability/observers.py +1 -0
- cognee/modules/ontology/base_ontology_resolver.py +42 -0
- cognee/modules/ontology/get_default_ontology_resolver.py +41 -0
- cognee/modules/ontology/matching_strategies.py +53 -0
- cognee/modules/ontology/models.py +20 -0
- cognee/modules/ontology/ontology_config.py +24 -0
- cognee/modules/ontology/ontology_env_config.py +45 -0
- cognee/modules/ontology/rdf_xml/{OntologyResolver.py → RDFLibOntologyResolver.py} +20 -28
- cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +21 -24
- cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +3 -3
- cognee/modules/retrieval/code_retriever.py +2 -1
- cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -4
- cognee/modules/retrieval/graph_completion_cot_retriever.py +6 -5
- cognee/modules/retrieval/graph_completion_retriever.py +0 -3
- cognee/modules/retrieval/insights_retriever.py +1 -1
- cognee/modules/retrieval/jaccard_retrival.py +60 -0
- cognee/modules/retrieval/lexical_retriever.py +123 -0
- cognee/modules/retrieval/natural_language_retriever.py +2 -1
- cognee/modules/retrieval/temporal_retriever.py +3 -2
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +2 -12
- cognee/modules/retrieval/utils/completion.py +4 -7
- cognee/modules/search/methods/get_search_type_tools.py +7 -0
- cognee/modules/search/methods/no_access_control_search.py +1 -1
- cognee/modules/search/methods/search.py +32 -13
- cognee/modules/search/types/SearchType.py +1 -0
- cognee/modules/users/methods/create_user.py +0 -2
- cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py +12 -0
- cognee/modules/users/permissions/methods/check_permission_on_dataset.py +11 -0
- cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +10 -0
- cognee/modules/users/permissions/methods/get_document_ids_for_user.py +10 -0
- cognee/modules/users/permissions/methods/get_principal.py +9 -0
- cognee/modules/users/permissions/methods/get_principal_datasets.py +11 -0
- cognee/modules/users/permissions/methods/get_role.py +10 -0
- cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +3 -3
- cognee/modules/users/permissions/methods/get_tenant.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_role.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_user.py +9 -0
- cognee/modules/users/permissions/methods/give_permission_on_dataset.py +10 -0
- cognee/modules/users/roles/methods/add_user_to_role.py +11 -0
- cognee/modules/users/roles/methods/create_role.py +12 -1
- cognee/modules/users/tenants/methods/add_user_to_tenant.py +12 -0
- cognee/modules/users/tenants/methods/create_tenant.py +12 -1
- cognee/modules/visualization/cognee_network_visualization.py +13 -9
- cognee/shared/data_models.py +0 -1
- cognee/shared/utils.py +0 -32
- cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
- cognee/tasks/codingagents/coding_rule_associations.py +3 -2
- cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +3 -2
- cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +3 -2
- cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +3 -2
- cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +3 -2
- cognee/tasks/graph/extract_graph_from_code.py +2 -2
- cognee/tasks/graph/extract_graph_from_data.py +55 -12
- cognee/tasks/graph/extract_graph_from_data_v2.py +16 -4
- cognee/tasks/ingestion/migrate_relational_database.py +132 -41
- cognee/tasks/ingestion/resolve_data_directories.py +4 -1
- cognee/tasks/schema/ingest_database_schema.py +134 -0
- cognee/tasks/schema/models.py +40 -0
- cognee/tasks/storage/index_data_points.py +1 -1
- cognee/tasks/storage/index_graph_edges.py +3 -1
- cognee/tasks/summarization/summarize_code.py +2 -2
- cognee/tasks/summarization/summarize_text.py +2 -2
- cognee/tasks/temporal_graph/enrich_events.py +2 -2
- cognee/tasks/temporal_graph/extract_events_and_entities.py +2 -2
- cognee/tests/cli_tests/cli_unit_tests/test_cli_commands.py +13 -4
- cognee/tests/cli_tests/cli_unit_tests/test_cli_edge_cases.py +13 -3
- cognee/tests/test_advanced_pdf_loader.py +141 -0
- cognee/tests/test_chromadb.py +40 -0
- cognee/tests/test_cognee_server_start.py +6 -1
- cognee/tests/test_data/Quantum_computers.txt +9 -0
- cognee/tests/test_lancedb.py +211 -0
- cognee/tests/test_pgvector.py +40 -0
- cognee/tests/test_relational_db_migration.py +76 -0
- cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +2 -1
- cognee/tests/unit/modules/ontology/test_ontology_adapter.py +330 -13
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +0 -4
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -4
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +0 -4
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/METADATA +92 -96
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/RECORD +176 -162
- distributed/pyproject.toml +0 -1
- cognee/infrastructure/data/utils/extract_keywords.py +0 -48
- cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py +0 -1227
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +0 -109
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +0 -343
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_categories.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +0 -89
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +0 -44
- cognee/tasks/graph/infer_data_ontology.py +0 -309
- cognee/tests/test_falkordb.py +0 -174
- /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/__init__.py +0 -0
- /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/__init__.py +0 -0
- /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/texts.json +0 -0
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/WHEEL +0 -0
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/entry_points.txt +0 -0
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/licenses/NOTICE.md +0 -0
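Many of the renames above move the LLM extraction helpers out of `structured_output_framework/litellm_instructor/` and up to `cognee/infrastructure/llm/extraction/`. A minimal before/after import sketch, assuming the helper functions keep the names of their modules (e.g. `extract_summary` in `extract_summary.py`):

```python
# Hedged sketch of the 0.3.5 import-path change for the relocated extraction
# helpers. The function name `extract_summary` is an assumption inferred from
# its module name.

# 0.3.4.dev3:
# from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.extraction.extract_summary import (
#     extract_summary,
# )

# 0.3.5:
from cognee.infrastructure.llm.extraction.extract_summary import extract_summary
```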
cognee/tasks/graph/infer_data_ontology.py
DELETED (309 lines)

```python
# PROPOSED TO BE DEPRECATED

"""This module contains the OntologyEngine class which is responsible for adding graph ontology from a JSON or CSV file."""

import csv
import json
from cognee.shared.logging_utils import get_logger
from datetime import datetime, timezone
from fastapi import status
from typing import Any, Dict, List, Optional, Union, Type

import aiofiles
import pandas as pd
from pydantic import BaseModel

from cognee.modules.graph.exceptions import EntityNotFoundError
from cognee.modules.ingestion.exceptions import IngestionError

from cognee.infrastructure.data.chunking.config import get_chunk_config
from cognee.infrastructure.data.chunking.get_chunking_engine import get_chunk_engine
from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
from cognee.infrastructure.files.utils.extract_text_from_file import extract_text_from_file
from cognee.infrastructure.files.utils.guess_file_type import guess_file_type, FileTypeException
from cognee.modules.data.methods.add_model_class_to_graph import (
    add_model_class_to_graph,
)
from cognee.tasks.graph.models import NodeModel, GraphOntology
from cognee.shared.data_models import KnowledgeGraph
from cognee.modules.engine.utils import generate_node_id, generate_node_name
from cognee.infrastructure.llm.LLMGateway import LLMGateway

logger = get_logger("task:infer_data_ontology")


async def extract_ontology(content: str, response_model: Type[BaseModel]):
    """
    Extracts structured ontology from the provided content using a pre-defined LLM client.

    This asynchronous function retrieves a system prompt from a file and utilizes an LLM
    client to create a structured output based on the input content and specified response
    model.

    Parameters:
    -----------

        - content (str): The content from which to extract the ontology.
        - response_model (Type[BaseModel]): The model that defines the structure of the
          output ontology.

    Returns:
    --------

        The structured ontology extracted from the content.
    """

    system_prompt = LLMGateway.read_query_prompt("extract_ontology.txt")

    ontology = await LLMGateway.acreate_structured_output(content, system_prompt, response_model)

    return ontology


class OntologyEngine:
    """
    Manage ontology data and operations for graph structures, providing methods for data
    loading, flattening models, and adding ontological relationships to a graph database.

    Public methods:

    - flatten_model
    - recursive_flatten
    - load_data
    - add_graph_ontology
    """

    async def flatten_model(
        self, model: NodeModel, parent_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Flatten the model to a dictionary including optional parent ID and relationship
        details if available.

        Parameters:
        -----------

            - model (NodeModel): The NodeModel instance to flatten.
            - parent_id (Optional[str]): An optional ID of the parent node for hierarchical
              purposes. (default None)

        Returns:
        --------

            - Dict[str, Any]: A dictionary representation of the model with flattened
              attributes.
        """
        result = model.dict()
        result["parent_id"] = parent_id
        if model.default_relationship:
            result.update(
                {
                    "relationship_type": model.default_relationship.type,
                    "relationship_source": model.default_relationship.source,
                    "relationship_target": model.default_relationship.target,
                }
            )
        return result

    async def recursive_flatten(
        self, items: Union[List[Dict[str, Any]], Dict[str, Any]], parent_id: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Recursively flatten a hierarchical structure of models into a flat list of
        dictionaries.

        Parameters:
        -----------

            - items (Union[List[Dict[str, Any]], Dict[str, Any]]): A list or dictionary
              containing models to flatten.
            - parent_id (Optional[str]): An optional ID of the parent node to maintain
              hierarchy during flattening. (default None)

        Returns:
        --------

            - List[Dict[str, Any]]: A flat list of dictionaries representing the
              hierarchical model structure.
        """
        flat_list = []

        if isinstance(items, list):
            for item in items:
                flat_list.extend(await self.recursive_flatten(item, parent_id))
        elif isinstance(items, dict):
            model = NodeModel.model_validate(items)
            flat_list.append(await self.flatten_model(model, parent_id))
            for child in model.children:
                flat_list.extend(await self.recursive_flatten(child, model.node_id))
        return flat_list

    async def load_data(self, file_path: str) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
        """
        Load data from a specified JSON or CSV file and return it in a structured format.

        Parameters:
        -----------

            - file_path (str): The path to the file to load data from.

        Returns:
        --------

            - Union[List[Dict[str, Any]], Dict[str, Any]]: Parsed data from the file as
              either a list of dictionaries or a single dictionary depending on content
              type.
        """
        try:
            if file_path.endswith(".json"):
                async with aiofiles.open(file_path, mode="r") as f:
                    data = await f.read()
                    return json.loads(data)
            elif file_path.endswith(".csv"):
                async with aiofiles.open(file_path, mode="r") as f:
                    content = await f.read()
                    reader = csv.DictReader(content.splitlines())
                    return list(reader)
            else:
                raise IngestionError(message="Unsupported file format")
        except Exception as e:
            raise IngestionError(
                message=f"Failed to load data from {file_path}: {e}",
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            )

    async def add_graph_ontology(self, file_path: str = None, documents: list = None):
        """
        Add graph ontology from a JSON or CSV file, or infer relationships from provided
        document content. Raise exceptions for invalid file types or missing entities.

        Parameters:
        -----------

            - file_path (str): Optional path to a file containing data to be loaded.
              (default None)
            - documents (list): Optional list of document objects for content extraction
              if no file path is provided. (default None)
        """
        if file_path is None:
            initial_chunks_and_ids = []

            chunk_config = get_chunk_config()
            chunk_engine = get_chunk_engine()
            chunk_strategy = chunk_config.chunk_strategy

            for base_file in documents:
                with open(base_file.raw_data_location, "rb") as file:
                    try:
                        file_type = guess_file_type(file)
                        text = extract_text_from_file(file, file_type)

                        subchunks, chunks_with_ids = chunk_engine.chunk_data(
                            chunk_strategy,
                            text,
                            chunk_config.chunk_size,
                            chunk_config.chunk_overlap,
                        )

                        if chunks_with_ids[0][0] == 1:
                            initial_chunks_and_ids.append({base_file.id: chunks_with_ids})

                    except FileTypeException:
                        logger.warning(
                            "File (%s) has an unknown file type. We are skipping it.", file["id"]
                        )

            ontology = await extract_ontology(str(initial_chunks_and_ids), GraphOntology)
            graph_client = await get_graph_engine()

            await graph_client.add_nodes(
                [
                    (
                        node.id,
                        dict(
                            uuid=generate_node_id(node.id),
                            name=generate_node_name(node.name),
                            type=generate_node_id(node.id),
                            description=node.description,
                            updated_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
                        ),
                    )
                    for node in ontology.nodes
                ]
            )

            await graph_client.add_edges(
                (
                    generate_node_id(edge.source_id),
                    generate_node_id(edge.target_id),
                    edge.relationship_type,
                    dict(
                        source_node_id=generate_node_id(edge.source_id),
                        target_node_id=generate_node_id(edge.target_id),
                        relationship_name=edge.relationship_type,
                        updated_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
                    ),
                )
                for edge in ontology.edges
            )

        else:
            dataset_level_information = documents[0][1]

            # Extract the list of valid IDs from the explanations
            valid_ids = {item["id"] for item in dataset_level_information}
            try:
                data = await self.load_data(file_path)
                flt_ontology = await self.recursive_flatten(data)
                df = pd.DataFrame(flt_ontology)
                graph_client = await get_graph_engine()

                for _, row in df.iterrows():
                    node_data = row.to_dict()
                    node_id = node_data.pop("node_id", None)
                    if node_id in valid_ids:
                        await graph_client.add_node(node_id, node_data)
                    if node_id not in valid_ids:
                        raise EntityNotFoundError(
                            message=f"Node ID {node_id} not found in the dataset"
                        )
                    if pd.notna(row.get("relationship_source")) and pd.notna(
                        row.get("relationship_target")
                    ):
                        await graph_client.add_edge(
                            row["relationship_source"],
                            row["relationship_target"],
                            relationship_name=row["relationship_type"],
                            edge_properties={
                                "source_node_id": row["relationship_source"],
                                "target_node_id": row["relationship_target"],
                                "relationship_name": row["relationship_type"],
                                "updated_at": datetime.now(timezone.utc).strftime(
                                    "%Y-%m-%d %H:%M:%S"
                                ),
                            },
                        )

                return
            except Exception as e:
                raise RuntimeError(f"Failed to add graph ontology from {file_path}: {e}") from e


async def infer_data_ontology(documents, ontology_model=KnowledgeGraph, root_node_id=None):
    """
    Infer data ontology from provided documents and optionally add it to a graph.

    Parameters:
    -----------

        - documents: The documents from which to infer the ontology.
        - ontology_model: The ontology model to use for the inference, defaults to
          KnowledgeGraph. (default KnowledgeGraph)
        - root_node_id: An optional root node identifier for the ontology. (default None)
    """
    if ontology_model == KnowledgeGraph:
        ontology_engine = OntologyEngine()
        root_node_id = await ontology_engine.add_graph_ontology(documents=documents)
    else:
        graph_engine = await get_graph_engine()
        await add_model_class_to_graph(ontology_model, graph_engine)

    yield (documents, root_node_id)
```
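For reference, the core of the removed flattening logic can be illustrated in isolation. This is a minimal standalone sketch (sync, for brevity) using plain dicts as hypothetical stand-ins for `NodeModel`; only the parent-ID threading mirrors the deleted `recursive_flatten`/`flatten_model` pair.

```python
# Minimal, self-contained sketch of the recursive flattening the deleted
# OntologyEngine performed. The node dicts are hypothetical stand-ins for
# cognee's NodeModel, not the actual class.
from typing import Any, Dict, List, Optional, Union


def recursive_flatten(
    items: Union[List[Dict[str, Any]], Dict[str, Any]],
    parent_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Flatten a nested node hierarchy into a list, tagging each node with its parent."""
    flat: List[Dict[str, Any]] = []
    if isinstance(items, list):
        for item in items:
            flat.extend(recursive_flatten(item, parent_id))
    elif isinstance(items, dict):
        node = {key: value for key, value in items.items() if key != "children"}
        node["parent_id"] = parent_id
        flat.append(node)
        for child in items.get("children", []):
            flat.extend(recursive_flatten(child, items["node_id"]))
    return flat


if __name__ == "__main__":
    tree = {
        "node_id": "root",
        "name": "Ontology",
        "children": [{"node_id": "a", "name": "Concept A", "children": []}],
    }
    for row in recursive_flatten(tree):
        print(row)
    # {'node_id': 'root', 'name': 'Ontology', 'parent_id': None}
    # {'node_id': 'a', 'name': 'Concept A', 'parent_id': 'root'}
```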
cognee/tests/test_falkordb.py
DELETED (174 lines)

```python
import os
import cognee
import pathlib
from cognee.infrastructure.files.storage import get_storage_config
from cognee.modules.search.operations import get_history
from cognee.modules.users.methods import get_default_user
from cognee.shared.logging_utils import get_logger
from cognee.modules.search.types import SearchType

logger = get_logger()


async def check_falkordb_connection():
    """Check if FalkorDB is available at localhost:6379"""
    try:
        from falkordb import FalkorDB

        client = FalkorDB(host="localhost", port=6379)
        # Try to list graphs to check connection
        client.list_graphs()
        return True
    except Exception as e:
        logger.warning(f"FalkorDB not available at localhost:6379: {e}")
        return False


async def main():
    # Check if FalkorDB is available
    if not await check_falkordb_connection():
        print("⚠️ FalkorDB is not available at localhost:6379")
        print("   To run this test, start FalkorDB server:")
        print("   docker run -p 6379:6379 falkordb/falkordb:latest")
        print("   Skipping FalkorDB test...")
        return

    print("✅ FalkorDB connection successful, running test...")

    # Configure FalkorDB as the graph database provider
    cognee.config.set_graph_db_config(
        {
            "graph_database_url": "localhost",  # FalkorDB URL (using Redis protocol)
            "graph_database_port": 6379,
            "graph_database_provider": "falkordb",
        }
    )

    # Configure FalkorDB as the vector database provider too since it's a hybrid adapter
    cognee.config.set_vector_db_config(
        {
            "vector_db_url": "localhost",
            "vector_db_port": 6379,
            "vector_db_provider": "falkordb",
        }
    )

    data_directory_path = str(
        pathlib.Path(
            os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_falkordb")
        ).resolve()
    )
    cognee.config.data_root_directory(data_directory_path)
    cognee_directory_path = str(
        pathlib.Path(
            os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_falkordb")
        ).resolve()
    )
    cognee.config.system_root_directory(cognee_directory_path)

    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    dataset_name = "artificial_intelligence"

    ai_text_file_path = os.path.join(
        pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf"
    )
    await cognee.add([ai_text_file_path], dataset_name)

    text = """A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation and other natural language processing tasks such as classification. LLMs acquire these abilities by learning statistical relationships from text documents during a computationally intensive self-supervised and semi-supervised training process. LLMs can be used for text generation, a form of generative AI, by taking an input text and repeatedly predicting the next token or word.
    LLMs are artificial neural networks. The largest and most capable, as of March 2024, are built with a decoder-only transformer-based architecture while some recent implementations are based on other architectures, such as recurrent neural network variants and Mamba (a state space model).
    Up to 2020, fine tuning was the only way a model could be adapted to be able to accomplish specific tasks. Larger sized models, such as GPT-3, however, can be prompt-engineered to achieve similar results.[6] They are thought to acquire knowledge about syntax, semantics and "ontology" inherent in human language corpora, but also inaccuracies and biases present in the corpora.
    Some notable LLMs are OpenAI's GPT series of models (e.g., GPT-3.5 and GPT-4, used in ChatGPT and Microsoft Copilot), Google's PaLM and Gemini (the latter of which is currently used in the chatbot of the same name), xAI's Grok, Meta's LLaMA family of open-source models, Anthropic's Claude models, Mistral AI's open source models, and Databricks' open source DBRX.
    """

    await cognee.add([text], dataset_name)

    await cognee.cognify([dataset_name])

    from cognee.infrastructure.databases.vector import get_vector_engine

    vector_engine = get_vector_engine()
    random_node = (await vector_engine.search("entity.name", "AI"))[0]
    random_node_name = random_node.payload["text"]

    search_results = await cognee.search(
        query_type=SearchType.INSIGHTS, query_text=random_node_name
    )
    assert len(search_results) != 0, "The search results list is empty."
    print("\n\nExtracted sentences are:\n")
    for result in search_results:
        print(f"{result}\n")

    search_results = await cognee.search(query_type=SearchType.CHUNKS, query_text=random_node_name)
    assert len(search_results) != 0, "The search results list is empty."
    print("\n\nExtracted chunks are:\n")
    for result in search_results:
        print(f"{result}\n")

    search_results = await cognee.search(
        query_type=SearchType.SUMMARIES, query_text=random_node_name
    )
    assert len(search_results) != 0, "Query related summaries don't exist."
    print("\nExtracted summaries are:\n")
    for result in search_results:
        print(f"{result}\n")

    user = await get_default_user()
    history = await get_history(user.id)

    assert len(history) == 6, "Search history is not correct."

    # Assert local data files are cleaned properly
    await cognee.prune.prune_data()
    data_root_directory = get_storage_config()["data_root_directory"]
    assert not os.path.isdir(data_root_directory), "Local data files are not deleted"

    # Assert relational, vector and graph databases have been cleaned properly
    await cognee.prune.prune_system(metadata=True)

    # For FalkorDB vector engine, check if collections are empty
    # Since FalkorDB is a hybrid adapter, we can check if the graph is empty
    # as the vector data is stored in the same graph
    if hasattr(vector_engine, "driver"):
        # This is FalkorDB - check if graphs exist
        collections = vector_engine.driver.list_graphs()
        # The graph should be deleted, so either no graphs or empty graph
        if vector_engine.graph_name in collections:
            # Graph exists but should be empty
            vector_graph_data = await vector_engine.get_graph_data()
            vector_nodes, vector_edges = vector_graph_data
            assert len(vector_nodes) == 0 and len(vector_edges) == 0, (
                "FalkorDB vector database is not empty"
            )
    else:
        # Fallback for other vector engines like LanceDB
        connection = await vector_engine.get_connection()
        collection_names = await connection.table_names()
        assert len(collection_names) == 0, "Vector database is not empty"

    from cognee.infrastructure.databases.relational import get_relational_engine

    assert not os.path.exists(get_relational_engine().db_path), (
        "SQLite relational database is not empty"
    )

    # For FalkorDB, check if the graph database is empty
    from cognee.infrastructure.databases.graph import get_graph_engine

    graph_engine = get_graph_engine()
    graph_data = await graph_engine.get_graph_data()
    nodes, edges = graph_data
    assert len(nodes) == 0 and len(edges) == 0, "FalkorDB graph database is not empty"

    print("🎉 FalkorDB test completed successfully!")
    print("   ✓ Data ingestion worked")
    print("   ✓ Cognify processing worked")
    print("   ✓ Search operations worked")
    print("   ✓ Cleanup worked")


if __name__ == "__main__":
    import asyncio

    asyncio.run(main(), debug=True)
```
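With `FalkorDBAdapter.py` and this test removed, FalkorDB is no longer shipped as a provider in 0.3.5. A hedged migration sketch using the same config API the deleted test exercised, pointed at adapters that remain in this wheel (Kuzu for graph, LanceDB for vectors); the provider strings and the partial config dicts are assumptions inferred from the adapter module names, not documented values:

```python
# Hedged sketch: re-pointing a FalkorDB setup at providers still shipped in
# 0.3.5. "kuzu" and "lancedb" are assumed provider identifiers, inferred from
# cognee/infrastructure/databases/graph/kuzu/ and .../vector/lancedb/.
import cognee

cognee.config.set_graph_db_config({"graph_database_provider": "kuzu"})
cognee.config.set_vector_db_config({"vector_db_provider": "lancedb"})
```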