cognee 0.5.0__py3-none-any.whl → 0.5.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/api/client.py +5 -1
- cognee/api/v1/add/add.py +1 -2
- cognee/api/v1/cognify/code_graph_pipeline.py +119 -0
- cognee/api/v1/cognify/cognify.py +16 -24
- cognee/api/v1/cognify/routers/__init__.py +1 -0
- cognee/api/v1/cognify/routers/get_code_pipeline_router.py +90 -0
- cognee/api/v1/cognify/routers/get_cognify_router.py +1 -3
- cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
- cognee/api/v1/ontologies/ontologies.py +37 -12
- cognee/api/v1/ontologies/routers/get_ontology_router.py +25 -27
- cognee/api/v1/search/search.py +0 -4
- cognee/api/v1/ui/ui.py +68 -38
- cognee/context_global_variables.py +16 -61
- cognee/eval_framework/answer_generation/answer_generation_executor.py +0 -10
- cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
- cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +2 -0
- cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
- cognee/eval_framework/eval_config.py +2 -2
- cognee/eval_framework/modal_run_eval.py +28 -16
- cognee/infrastructure/databases/graph/config.py +0 -3
- cognee/infrastructure/databases/graph/get_graph_engine.py +0 -1
- cognee/infrastructure/databases/graph/graph_db_interface.py +0 -15
- cognee/infrastructure/databases/graph/kuzu/adapter.py +0 -228
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +1 -80
- cognee/infrastructure/databases/utils/__init__.py +0 -3
- cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +48 -62
- cognee/infrastructure/databases/vector/config.py +0 -2
- cognee/infrastructure/databases/vector/create_vector_engine.py +0 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +6 -8
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +7 -9
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +10 -11
- cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +544 -0
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +0 -2
- cognee/infrastructure/databases/vector/vector_db_interface.py +0 -35
- cognee/infrastructure/files/storage/s3_config.py +0 -2
- cognee/infrastructure/llm/LLMGateway.py +2 -5
- cognee/infrastructure/llm/config.py +0 -35
- cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +8 -23
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +16 -17
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +37 -40
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +36 -39
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +1 -19
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +9 -11
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +21 -23
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +34 -42
- cognee/modules/cognify/config.py +0 -2
- cognee/modules/data/deletion/prune_system.py +2 -52
- cognee/modules/data/methods/delete_dataset.py +0 -26
- cognee/modules/engine/models/__init__.py +0 -1
- cognee/modules/graph/cognee_graph/CogneeGraph.py +37 -85
- cognee/modules/graph/cognee_graph/CogneeGraphElements.py +3 -8
- cognee/modules/memify/memify.py +7 -1
- cognee/modules/pipelines/operations/pipeline.py +2 -18
- cognee/modules/retrieval/__init__.py +1 -1
- cognee/modules/retrieval/code_retriever.py +232 -0
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -4
- cognee/modules/retrieval/graph_completion_cot_retriever.py +0 -4
- cognee/modules/retrieval/graph_completion_retriever.py +0 -10
- cognee/modules/retrieval/graph_summary_completion_retriever.py +0 -4
- cognee/modules/retrieval/temporal_retriever.py +0 -4
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +10 -42
- cognee/modules/run_custom_pipeline/run_custom_pipeline.py +1 -8
- cognee/modules/search/methods/get_search_type_tools.py +8 -54
- cognee/modules/search/methods/no_access_control_search.py +0 -4
- cognee/modules/search/methods/search.py +0 -21
- cognee/modules/search/types/SearchType.py +1 -1
- cognee/modules/settings/get_settings.py +0 -19
- cognee/modules/users/methods/get_authenticated_user.py +2 -2
- cognee/modules/users/models/DatasetDatabase.py +3 -15
- cognee/shared/logging_utils.py +0 -4
- cognee/tasks/code/enrich_dependency_graph_checker.py +35 -0
- cognee/tasks/code/get_local_dependencies_checker.py +20 -0
- cognee/tasks/code/get_repo_dependency_graph_checker.py +35 -0
- cognee/tasks/documents/__init__.py +1 -0
- cognee/tasks/documents/check_permissions_on_dataset.py +26 -0
- cognee/tasks/graph/extract_graph_from_data.py +10 -9
- cognee/tasks/repo_processor/__init__.py +2 -0
- cognee/tasks/repo_processor/get_local_dependencies.py +335 -0
- cognee/tasks/repo_processor/get_non_code_files.py +158 -0
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +243 -0
- cognee/tasks/storage/add_data_points.py +2 -142
- cognee/tests/test_cognee_server_start.py +4 -2
- cognee/tests/test_conversation_history.py +1 -23
- cognee/tests/test_delete_bmw_example.py +60 -0
- cognee/tests/test_search_db.py +1 -37
- cognee/tests/unit/api/test_ontology_endpoint.py +89 -77
- cognee/tests/unit/infrastructure/mock_embedding_engine.py +7 -3
- cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -0
- cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
- cognee/tests/unit/modules/graph/cognee_graph_test.py +0 -406
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/METADATA +89 -76
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/RECORD +97 -118
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/WHEEL +1 -1
- cognee/api/v1/ui/node_setup.py +0 -360
- cognee/api/v1/ui/npm_utils.py +0 -50
- cognee/eval_framework/Dockerfile +0 -29
- cognee/infrastructure/databases/dataset_database_handler/__init__.py +0 -3
- cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +0 -80
- cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +0 -18
- cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +0 -10
- cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +0 -81
- cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +0 -168
- cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +0 -10
- cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +0 -10
- cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +0 -30
- cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +0 -50
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +0 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +0 -153
- cognee/memify_pipelines/create_triplet_embeddings.py +0 -53
- cognee/modules/engine/models/Triplet.py +0 -9
- cognee/modules/retrieval/register_retriever.py +0 -10
- cognee/modules/retrieval/registered_community_retrievers.py +0 -1
- cognee/modules/retrieval/triplet_retriever.py +0 -182
- cognee/shared/rate_limiting.py +0 -30
- cognee/tasks/memify/get_triplet_datapoints.py +0 -289
- cognee/tests/integration/retrieval/test_triplet_retriever.py +0 -84
- cognee/tests/integration/tasks/test_add_data_points.py +0 -139
- cognee/tests/integration/tasks/test_get_triplet_datapoints.py +0 -69
- cognee/tests/test_dataset_database_handler.py +0 -137
- cognee/tests/test_dataset_delete.py +0 -76
- cognee/tests/test_edge_centered_payload.py +0 -170
- cognee/tests/test_pipeline_cache.py +0 -164
- cognee/tests/unit/infrastructure/llm/test_llm_config.py +0 -46
- cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +0 -214
- cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +0 -608
- cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +0 -83
- cognee/tests/unit/tasks/storage/test_add_data_points.py +0 -288
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/entry_points.txt +0 -0
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.5.0.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/NOTICE.md +0 -0
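Reading the file list as a whole: the dev build restores the code-graph feature set (code_graph_pipeline.py, the repo_processor tasks, code_retriever.py) and removes the triplet-embedding and dataset-database-handler machinery. For orientation, a minimal sketch of driving the restored pipeline, assuming code_graph_pipeline.py again exposes a run_code_graph_pipeline coroutine generator as in earlier cognee releases; the function name, signature, and yielded values are assumptions, not shown in this diff:

import asyncio

# Assumed entry point: the module path matches the file list above, but the
# function name and its being an async generator are inferred from earlier
# cognee releases, not confirmed by this diff.
from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline


async def main():
    # Build the code graph for a local checkout; each yielded item is assumed
    # to report pipeline run status.
    async for run_status in run_code_graph_pipeline("/path/to/repo"):
        print(run_status)


asyncio.run(main())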
cognee/tasks/repo_processor/get_repo_file_dependencies.py
ADDED
@@ -0,0 +1,243 @@
+import asyncio
+import math
+import os
+from pathlib import Path
+from typing import Set
+from typing import AsyncGenerator, Optional, List
+from uuid import NAMESPACE_OID, uuid5
+
+from cognee.infrastructure.engine import DataPoint
+from cognee.shared.CodeGraphEntities import CodeFile, Repository
+
+# constant, declared only once
+EXCLUDED_DIRS: Set[str] = {
+    ".venv",
+    "venv",
+    "env",
+    ".env",
+    "site-packages",
+    "node_modules",
+    "dist",
+    "build",
+    ".git",
+    "tests",
+    "test",
+}
+
+
+async def get_source_code_files(
+    repo_path,
+    language_config: dict[str, list[str]] | None = None,
+    excluded_paths: Optional[List[str]] = None,
+):
+    """
+    Retrieve Python source code files from the specified repository path.
+
+    This function scans the given repository path for files that have the .py extension
+    while excluding test files and files within a virtual environment. It returns a list of
+    absolute paths to the source code files that are not empty.
+
+    Parameters:
+    -----------
+    - repo_path: Root path of the repository to search
+    - language_config: dict mapping language names to file extensions, e.g.,
+      {'python': ['.py'], 'javascript': ['.js', '.jsx'], ...}
+    - excluded_paths: Optional list of path fragments or glob patterns to exclude
+
+    Returns:
+    --------
+    A list of (absolute_path, language) tuples for source code files.
+    """
+
+    def _get_language_from_extension(file, language_config):
+        for lang, exts in language_config.items():
+            for ext in exts:
+                if file.endswith(ext):
+                    return lang
+        return None
+
+    # Default config if not provided
+    if language_config is None:
+        language_config = {
+            "python": [".py"],
+            "javascript": [".js", ".jsx"],
+            "typescript": [".ts", ".tsx"],
+            "java": [".java"],
+            "csharp": [".cs"],
+            "go": [".go"],
+            "rust": [".rs"],
+            "cpp": [".cpp", ".c", ".h", ".hpp"],
+        }
+
+    if not os.path.exists(repo_path):
+        return []
+
+    source_code_files = set()
+    for root, _, files in os.walk(repo_path):
+        for file in files:
+            lang = _get_language_from_extension(file, language_config)
+            if lang is None:
+                continue
+            # Exclude tests, common build/venv directories and files provided in exclude_paths
+            excluded_dirs = EXCLUDED_DIRS
+            excluded_paths = {Path(p).resolve() for p in (excluded_paths or [])}  # full paths
+
+            root_path = Path(root).resolve()
+            root_parts = set(root_path.parts)  # same as before
+            base_name, _ext = os.path.splitext(file)
+            if (
+                base_name.startswith("test_")
+                or base_name.endswith("_test")
+                or ".test." in file
+                or ".spec." in file
+                or (excluded_dirs & root_parts)  # name match
+                or any(
+                    root_path.is_relative_to(p)  # full-path match
+                    for p in excluded_paths
+                )
+            ):
+                continue
+            file_path = os.path.abspath(os.path.join(root, file))
+            if os.path.getsize(file_path) == 0:
+                continue
+            source_code_files.add((file_path, lang))
+
+    return sorted(list(source_code_files))
+
+
+def run_coroutine(coroutine_func, *args, **kwargs):
+    """
+    Run a coroutine function until it completes.
+
+    This function creates a new asyncio event loop, sets it as the current loop, and
+    executes the given coroutine function with the provided arguments. Once the coroutine
+    completes, the loop is closed. Intended for use in environments where an existing event
+    loop is not available or desirable.
+
+    Parameters:
+    -----------
+
+    - coroutine_func: The coroutine function to be run.
+    - *args: Positional arguments to pass to the coroutine function.
+    - **kwargs: Keyword arguments to pass to the coroutine function.
+
+    Returns:
+    --------
+
+    The result returned by the coroutine after completion.
+    """
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    result = loop.run_until_complete(coroutine_func(*args, **kwargs))
+    loop.close()
+    return result
+
+
+async def get_repo_file_dependencies(
+    repo_path: str,
+    detailed_extraction: bool = False,
+    supported_languages: list = None,
+    excluded_paths: Optional[List[str]] = None,
+) -> AsyncGenerator[DataPoint, None]:
+    """
+    Generate a dependency graph for source files (multi-language) in the given repository path.
+
+    Check the validity of the repository path and yield a repository object followed by the
+    dependencies of source files within that repository. Raise a FileNotFoundError if the
+    provided path does not exist. The extraction of detailed dependencies can be controlled
+    via the `detailed_extraction` argument. Languages considered can be restricted via
+    the `supported_languages` argument.
+
+    Parameters:
+    -----------
+
+    - repo_path (str): The file path to the repository to process.
+    - detailed_extraction (bool): Whether to perform a detailed extraction of code parts.
+    - supported_languages (list | None): Subset of languages to include; if None, use defaults.
+    """
+
+    if isinstance(repo_path, list) and len(repo_path) == 1:
+        repo_path = repo_path[0]
+
+    if not os.path.exists(repo_path):
+        raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
+
+    # Build language config from supported_languages
+    default_language_config = {
+        "python": [".py"],
+        "javascript": [".js", ".jsx"],
+        "typescript": [".ts", ".tsx"],
+        "java": [".java"],
+        "csharp": [".cs"],
+        "go": [".go"],
+        "rust": [".rs"],
+        "cpp": [".cpp", ".c", ".h", ".hpp"],
+        "c": [".c", ".h"],
+    }
+    if supported_languages is not None:
+        language_config = {
+            k: v for k, v in default_language_config.items() if k in supported_languages
+        }
+    else:
+        language_config = default_language_config
+
+    source_code_files = await get_source_code_files(
+        repo_path, language_config=language_config, excluded_paths=excluded_paths
+    )
+
+    repo = Repository(
+        id=uuid5(NAMESPACE_OID, repo_path),
+        path=repo_path,
+    )
+
+    yield repo
+
+    chunk_size = 100
+    number_of_chunks = math.ceil(len(source_code_files) / chunk_size)
+    chunk_ranges = [
+        (
+            chunk_number * chunk_size,
+            min((chunk_number + 1) * chunk_size, len(source_code_files)) - 1,
+        )
+        for chunk_number in range(number_of_chunks)
+    ]
+
+    # Import dependency extractors for each language (Python for now, extend later)
+    from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
+    import aiofiles
+    # TODO: Add other language extractors here
+
+    for start_range, end_range in chunk_ranges:
+        tasks = []
+        for file_path, lang in source_code_files[start_range : end_range + 1]:
+            # For now, only Python is supported; extend with other languages
+            if lang == "python":
+                tasks.append(
+                    get_local_script_dependencies(repo_path, file_path, detailed_extraction)
+                )
+            else:
+                # Placeholder: create a minimal CodeFile for other languages
+                async def make_codefile_stub(file_path=file_path, lang=lang):
+                    async with aiofiles.open(
+                        file_path, "r", encoding="utf-8", errors="replace"
+                    ) as f:
+                        source = await f.read()
+                    return CodeFile(
+                        id=uuid5(NAMESPACE_OID, file_path),
+                        name=os.path.relpath(file_path, repo_path),
+                        file_path=file_path,
+                        language=lang,
+                        source_code=source,
+                    )
+
+                tasks.append(make_codefile_stub())
+
+        results: list[CodeFile] = await asyncio.gather(*tasks)
+
+        for source_code_file in results:
+            source_code_file.part_of = repo
+            if getattr(
+                source_code_file, "language", None
+            ) is None and source_code_file.file_path.endswith(".py"):
+                source_code_file.language = "python"
+            yield source_code_file
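A small usage sketch (not part of the diff) for the generator defined above; the repository path and excluded path are placeholders. Per the code, the Repository node is always yielded first, followed by one CodeFile per non-empty source file:

import asyncio

from cognee.tasks.repo_processor.get_repo_file_dependencies import (
    get_repo_file_dependencies,
)


async def collect(repo_path: str):
    # First yielded item is the Repository; the rest are CodeFile data points,
    # produced in chunks of 100 files gathered concurrently.
    data_points = []
    async for data_point in get_repo_file_dependencies(
        repo_path,
        detailed_extraction=False,
        supported_languages=["python"],
        excluded_paths=["/path/to/repo/examples"],  # placeholder
    ):
        data_points.append(data_point)
    return data_points


# asyncio.run(collect("/path/to/repo"))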
cognee/tasks/storage/add_data_points.py
CHANGED
@@ -1,23 +1,16 @@
 import asyncio
-from typing import List
+from typing import List
 from cognee.infrastructure.engine import DataPoint
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.graph.utils import deduplicate_nodes_and_edges, get_graph_from_model
 from .index_data_points import index_data_points
 from .index_graph_edges import index_graph_edges
-from cognee.modules.engine.models import Triplet
-from cognee.shared.logging_utils import get_logger
 from cognee.tasks.storage.exceptions import (
     InvalidDataPointsInAddDataPointsError,
 )
-from ...modules.engine.utils import generate_node_id
 
-logger = get_logger("add_data_points")
 
-
-async def add_data_points(
-    data_points: List[DataPoint], custom_edges: Optional[List] = None, embed_triplets: bool = False
-) -> List[DataPoint]:
+async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
     """
     Add a batch of data points to the graph database by extracting nodes and edges,
     deduplicating them, and indexing them for retrieval.
@@ -30,10 +23,6 @@ async def add_data_points(
     Args:
         data_points (List[DataPoint]):
            A list of data points to process and insert into the graph.
-        custom_edges (List[tuple]): Custom edges between datapoints.
-        embed_triplets (bool):
-            If True, creates and indexes triplet embeddings from the graph structure.
-            Defaults to False.
 
     Returns:
         List[DataPoint]:
@@ -45,7 +34,6 @@ async def add_data_points(
        - Updates the node index via `index_data_points`.
        - Inserts nodes and edges into the graph engine.
        - Optionally updates the edge index via `index_graph_edges`.
-       - Optionally creates and indexes triplet embeddings if embed_triplets is True.
     """
 
     if not isinstance(data_points, list):
@@ -86,132 +74,4 @@ async def add_data_points(
     await graph_engine.add_edges(edges)
     await index_graph_edges(edges)
 
-    if isinstance(custom_edges, list) and custom_edges:
-        # This must be handled separately from datapoint edges, created a task in linear to dig deeper but (COG-3488)
-        await graph_engine.add_edges(custom_edges)
-        await index_graph_edges(custom_edges)
-        edges.extend(custom_edges)
-
-    if embed_triplets:
-        triplets = _create_triplets_from_graph(nodes, edges)
-        if triplets:
-            await index_data_points(triplets)
-            logger.info(f"Created and indexed {len(triplets)} triplets from graph structure")
-
     return data_points
-
-
-def _extract_embeddable_text_from_datapoint(data_point: DataPoint) -> str:
-    """
-    Extract embeddable text from a DataPoint using its index_fields metadata.
-    Uses the same approach as index_data_points.
-
-    Parameters:
-    -----------
-    - data_point (DataPoint): The data point to extract text from.
-
-    Returns:
-    --------
-    - str: Concatenated string of all embeddable property values, or empty string if none found.
-    """
-    if not data_point or not hasattr(data_point, "metadata"):
-        return ""
-
-    index_fields = data_point.metadata.get("index_fields", [])
-    if not index_fields:
-        return ""
-
-    embeddable_values = []
-    for field_name in index_fields:
-        field_value = getattr(data_point, field_name, None)
-        if field_value is not None:
-            field_value = str(field_value).strip()
-
-            if field_value:
-                embeddable_values.append(field_value)
-
-    return " ".join(embeddable_values) if embeddable_values else ""
-
-
-def _create_triplets_from_graph(nodes: List[DataPoint], edges: List[tuple]) -> List[Triplet]:
-    """
-    Create Triplet objects from graph nodes and edges.
-
-    This function processes graph edges and their corresponding nodes to create
-    triplet datapoints with embeddable text, similar to the triplet embeddings pipeline.
-
-    Parameters:
-    -----------
-    - nodes (List[DataPoint]): List of graph nodes extracted from data points
-    - edges (List[tuple]): List of edge tuples in format
-      (source_node_id, target_node_id, relationship_name, properties_dict)
-      Note: All edges including those from DocumentChunk.contains are already extracted
-      by get_graph_from_model and included in this list.
-
-    Returns:
-    --------
-    - List[Triplet]: List of Triplet objects ready for indexing
-    """
-    node_map: Dict[str, DataPoint] = {}
-    for node in nodes:
-        if hasattr(node, "id"):
-            node_id = str(node.id)
-            if node_id not in node_map:
-                node_map[node_id] = node
-
-    triplets = []
-    skipped_count = 0
-    seen_ids = set()
-
-    for edge_tuple in edges:
-        if len(edge_tuple) < 4:
-            continue
-
-        source_node_id, target_node_id, relationship_name, edge_properties = (
-            edge_tuple[0],
-            edge_tuple[1],
-            edge_tuple[2],
-            edge_tuple[3],
-        )
-
-        source_node = node_map.get(str(source_node_id))
-        target_node = node_map.get(str(target_node_id))
-
-        if not source_node or not target_node or relationship_name is None:
-            skipped_count += 1
-            continue
-
-        source_node_text = _extract_embeddable_text_from_datapoint(source_node)
-        target_node_text = _extract_embeddable_text_from_datapoint(target_node)
-
-        relationship_text = ""
-        if isinstance(edge_properties, dict):
-            edge_text = edge_properties.get("edge_text")
-            if edge_text and isinstance(edge_text, str) and edge_text.strip():
-                relationship_text = edge_text.strip()
-
-        if not relationship_text and relationship_name:
-            relationship_text = relationship_name
-
-        if not source_node_text and not relationship_text and not relationship_name:
-            skipped_count += 1
-            continue
-
-        embeddable_text = f"{source_node_text} -> {relationship_text}->{target_node_text}".strip()
-
-        triplet_id = generate_node_id(str(source_node_id) + relationship_name + str(target_node_id))
-
-        if triplet_id in seen_ids:
-            continue
-        seen_ids.add(triplet_id)
-
-        triplets.append(
-            Triplet(
-                id=triplet_id,
-                from_node_id=str(source_node_id),
-                to_node_id=str(target_node_id),
-                text=embeddable_text,
-            )
-        )
-
-    return triplets
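The net effect of this file's changes: add_data_points now accepts only the data points themselves, and the custom_edges and embed_triplets code paths plus both triplet helpers are gone. A migration sketch for callers; MyPoint is a hypothetical DataPoint subclass introduced purely for illustration:

from typing import List

from cognee.infrastructure.engine import DataPoint
from cognee.tasks.storage.add_data_points import add_data_points


class MyPoint(DataPoint):
    # Hypothetical subclass; index_fields marks the attributes that get
    # embedded, mirroring the metadata convention the removed helper relied on.
    name: str
    metadata: dict = {"index_fields": ["name"]}


async def store(points: List[MyPoint]) -> List[MyPoint]:
    # 0.5.0:       await add_data_points(points, custom_edges=None, embed_triplets=False)
    # 0.5.0.dev0:  only the data points are accepted.
    return await add_data_points(points)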
cognee/tests/test_cognee_server_start.py
CHANGED
@@ -25,6 +25,8 @@ class TestCogneeServerStart(unittest.TestCase):
                 "--port",
                 "8000",
             ],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
             preexec_fn=os.setsid,
         )
         # Give the server some time to start
@@ -148,8 +150,8 @@ class TestCogneeServerStart(unittest.TestCase):
             headers=headers,
             files=[("ontology_file", ("test.owl", ontology_content, "application/xml"))],
             data={
-                "ontology_key": ontology_key,
-                "description": "Test ontology",
+                "ontology_key": json.dumps([ontology_key]),
+                "description": json.dumps(["Test ontology"]),
             },
         )
         self.assertEqual(ontology_response.status_code, 200)
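The second hunk reflects an API contract change: the ontology upload endpoint now takes list-valued form fields serialized as JSON strings (one entry per uploaded file) instead of bare strings. A client-side sketch under that contract; the URL, token, and payload values are placeholders mirroring the test, not confirmed by this diff:

import json

import requests

response = requests.post(
    "http://127.0.0.1:8000/api/v1/ontologies",  # assumed endpoint path
    headers={"Authorization": "Bearer <token>"},
    files=[("ontology_file", ("test.owl", b"<rdf:RDF/>", "application/xml"))],
    data={
        # Each field is a JSON-encoded list, parallel to the uploaded files.
        "ontology_key": json.dumps(["my_ontology"]),
        "description": json.dumps(["Test ontology"]),
    },
)
assert response.status_code == 200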
cognee/tests/test_conversation_history.py
CHANGED
@@ -8,10 +8,10 @@ Tests all retrievers that save conversation history to Redis cache:
 4. GRAPH_COMPLETION_CONTEXT_EXTENSION
 5. GRAPH_SUMMARY_COMPLETION
 6. TEMPORAL
-7. TRIPLET_COMPLETION
 """
 
 import os
+import shutil
 import cognee
 import pathlib
 
@@ -63,10 +63,6 @@ async def main():
 
     user = await get_default_user()
 
-    from cognee.memify_pipelines.create_triplet_embeddings import create_triplet_embeddings
-
-    await create_triplet_embeddings(user=user, dataset=dataset_name)
-
     cache_engine = get_cache_engine()
     assert cache_engine is not None, "Cache engine should be available for testing"
 
@@ -220,24 +216,6 @@ async def main():
     ]
     assert len(our_qa_temporal) == 1, "Should find Temporal question in history"
 
-    session_id_triplet = "test_session_triplet"
-
-    result_triplet = await cognee.search(
-        query_type=SearchType.TRIPLET_COMPLETION,
-        query_text="What companies are mentioned?",
-        session_id=session_id_triplet,
-    )
-
-    assert isinstance(result_triplet, list) and len(result_triplet) > 0, (
-        f"TRIPLET_COMPLETION should return non-empty list, got: {result_triplet!r}"
-    )
-
-    history_triplet = await cache_engine.get_latest_qa(str(user.id), session_id_triplet, last_n=10)
-    our_qa_triplet = [
-        h for h in history_triplet if h["question"] == "What companies are mentioned?"
-    ]
-    assert len(our_qa_triplet) == 1, "Should find Triplet question in history"
-
     from cognee.modules.retrieval.utils.session_cache import (
         get_conversation_history,
     )
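With TRIPLET_COMPLETION removed, the remaining retrievers still follow the session-cache pattern this test exercises. A condensed sketch of that pattern; cache_engine and user are obtained exactly as in the test (get_cache_engine() / get_default_user()), and the session id and question are placeholders:

import cognee
from cognee.modules.search.types import SearchType


async def assert_history_roundtrip(cache_engine, user, session_id: str, question: str):
    # Any of the remaining history-saving retrievers works here;
    # TRIPLET_COMPLETION no longer exists in 0.5.0.dev0.
    results = await cognee.search(
        query_type=SearchType.GRAPH_COMPLETION,
        query_text=question,
        session_id=session_id,
    )
    assert isinstance(results, list) and len(results) > 0

    # The answered question should appear in the per-session QA history.
    history = await cache_engine.get_latest_qa(str(user.id), session_id, last_n=10)
    assert any(h["question"] == question for h in history)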
cognee/tests/test_delete_bmw_example.py
ADDED
@@ -0,0 +1,60 @@
+import os
+import pathlib
+from uuid import UUID
+
+import cognee
+
+from cognee.api.v1.datasets import datasets
+from cognee.api.v1.visualize.visualize import visualize_graph
+from cognee.context_global_variables import set_database_global_context_variables
+from cognee.modules.engine.operations.setup import setup
+from cognee.modules.users.methods import get_default_user
+
+# from cognee.modules.engine.operations.setup import setup
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger()
+
+
+async def main():
+    data_directory_path = os.path.join(
+        pathlib.Path(__file__).parent, ".data_storage/test_delete_bmw_example"
+    )
+    cognee.config.data_root_directory(data_directory_path)
+
+    cognee_directory_path = os.path.join(
+        pathlib.Path(__file__).parent, ".cognee_system/test_delete_bmw_example"
+    )
+    cognee.config.system_root_directory(cognee_directory_path)
+
+    # await cognee.prune.prune_data()
+    # await cognee.prune.prune_system(metadata=True)
+    # await setup()
+
+    # add_result = await cognee.add("Bmw is a german carmanufacturer")
+    # add_result = await cognee.add("Germany is located next to the netherlands")
+    # data_id = add_result.data_ingestion_info[0]["data_id"]
+
+    # cognify_result: dict = await cognee.cognify()
+    # dataset_id = list(cognify_result.keys())[0]
+
+    user = await get_default_user()
+    await set_database_global_context_variables("main_dataset", user.id)
+
+    graph_file_path = os.path.join(data_directory_path, "artifacts/graph-before.html")
+    await visualize_graph(graph_file_path)
+
+    await datasets.delete_data(
+        UUID("b52be2e1-9fdb-5be0-a317-d3a56e9a34c6"),
+        UUID("fdae2cbd-61e1-5e99-93ca-4f3e32ed0d02"),
+        user,
+    )
+
+    graph_file_path = os.path.join(data_directory_path, "artifacts/graph-after.html")
+    await visualize_graph(graph_file_path)
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(main())
cognee/tests/test_search_db.py
CHANGED
@@ -2,7 +2,6 @@ import pathlib
 import os
 import cognee
 from cognee.infrastructure.databases.graph import get_graph_engine
-from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.modules.graph.cognee_graph.CogneeGraphElements import Edge
 from cognee.modules.graph.utils import resolve_edges_to_text
 from cognee.modules.retrieval.graph_completion_retriever import GraphCompletionRetriever
@@ -13,10 +12,8 @@ from cognee.modules.retrieval.graph_completion_cot_retriever import GraphComplet
 from cognee.modules.retrieval.graph_summary_completion_retriever import (
     GraphSummaryCompletionRetriever,
 )
-from cognee.modules.retrieval.triplet_retriever import TripletRetriever
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.search.types import SearchType
-from cognee.modules.users.methods import get_default_user
 from collections import Counter
 
 logger = get_logger()
@@ -40,23 +37,6 @@ async def main():
 
     await cognee.cognify([dataset_name])
 
-    user = await get_default_user()
-    from cognee.memify_pipelines.create_triplet_embeddings import create_triplet_embeddings
-
-    await create_triplet_embeddings(user=user, dataset=dataset_name, triplets_batch_size=5)
-
-    graph_engine = await get_graph_engine()
-    nodes, edges = await graph_engine.get_graph_data()
-
-    vector_engine = get_vector_engine()
-    collection = await vector_engine.search(
-        query_text="Test", limit=None, collection_name="Triplet_text"
-    )
-
-    assert len(edges) == len(collection), (
-        f"Expected {len(edges)} edges but got {len(collection)} in Triplet_text collection"
-    )
-
     context_gk = await GraphCompletionRetriever().get_context(
         query="Next to which country is Germany located?"
     )
@@ -69,9 +49,6 @@ async def main():
     context_gk_sum = await GraphSummaryCompletionRetriever().get_context(
         query="Next to which country is Germany located?"
     )
-    context_triplet = await TripletRetriever().get_context(
-        query="Next to which country is Germany located?"
-    )
 
     for name, context in [
         ("GraphCompletionRetriever", context_gk),
@@ -88,13 +65,6 @@ async def main():
             f"{name}: Context did not contain 'germany' or 'netherlands'; got: {context!r}"
         )
 
-    assert isinstance(context_triplet, str), "TripletRetriever: Context should be a string"
-    assert len(context_triplet) > 0, "TripletRetriever: Context should not be empty"
-    lower_triplet = context_triplet.lower()
-    assert "germany" in lower_triplet or "netherlands" in lower_triplet, (
-        f"TripletRetriever: Context did not contain 'germany' or 'netherlands'; got: {context_triplet!r}"
-    )
-
     triplets_gk = await GraphCompletionRetriever().get_triplets(
         query="Next to which country is Germany located?"
     )
@@ -159,11 +129,6 @@ async def main():
         query_text="Next to which country is Germany located?",
         save_interaction=True,
     )
-    completion_triplet = await cognee.search(
-        query_type=SearchType.TRIPLET_COMPLETION,
-        query_text="Next to which country is Germany located?",
-        save_interaction=True,
-    )
 
     await cognee.search(
         query_type=SearchType.FEEDBACK,
@@ -176,7 +141,6 @@ async def main():
         ("GRAPH_COMPLETION_COT", completion_cot),
         ("GRAPH_COMPLETION_CONTEXT_EXTENSION", completion_ext),
         ("GRAPH_SUMMARY_COMPLETION", completion_sum),
-        ("TRIPLET_COMPLETION", completion_triplet),
     ]:
         assert isinstance(search_results, list), f"{name}: should return a list"
         assert len(search_results) == 1, (
@@ -204,7 +168,7 @@ async def main():
 
     # Assert there are exactly 4 CogneeUserInteraction nodes.
     assert type_counts.get("CogneeUserInteraction", 0) == 4, (
-        f"Expected exactly four
+        f"Expected exactly four DCogneeUserInteraction nodes, but found {type_counts.get('CogneeUserInteraction', 0)}"
     )
 
     # Assert there is exactly two CogneeUserFeedback nodes.