cognee 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (227)
  1. cognee/__init__.py +1 -0
  2. cognee/api/client.py +9 -5
  3. cognee/api/v1/add/add.py +2 -1
  4. cognee/api/v1/add/routers/get_add_router.py +3 -1
  5. cognee/api/v1/cognify/cognify.py +24 -16
  6. cognee/api/v1/cognify/routers/__init__.py +0 -1
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +30 -1
  8. cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
  9. cognee/api/v1/ontologies/__init__.py +4 -0
  10. cognee/api/v1/ontologies/ontologies.py +158 -0
  11. cognee/api/v1/ontologies/routers/__init__.py +0 -0
  12. cognee/api/v1/ontologies/routers/get_ontology_router.py +109 -0
  13. cognee/api/v1/permissions/routers/get_permissions_router.py +41 -1
  14. cognee/api/v1/search/search.py +4 -0
  15. cognee/api/v1/ui/node_setup.py +360 -0
  16. cognee/api/v1/ui/npm_utils.py +50 -0
  17. cognee/api/v1/ui/ui.py +38 -68
  18. cognee/cli/commands/cognify_command.py +8 -1
  19. cognee/cli/config.py +1 -1
  20. cognee/context_global_variables.py +86 -9
  21. cognee/eval_framework/Dockerfile +29 -0
  22. cognee/eval_framework/answer_generation/answer_generation_executor.py +10 -0
  23. cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
  24. cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +0 -2
  25. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
  26. cognee/eval_framework/eval_config.py +2 -2
  27. cognee/eval_framework/modal_run_eval.py +16 -28
  28. cognee/infrastructure/databases/cache/config.py +3 -1
  29. cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +151 -0
  30. cognee/infrastructure/databases/cache/get_cache_engine.py +20 -10
  31. cognee/infrastructure/databases/dataset_database_handler/__init__.py +3 -0
  32. cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +80 -0
  33. cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +18 -0
  34. cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +10 -0
  35. cognee/infrastructure/databases/exceptions/exceptions.py +16 -0
  36. cognee/infrastructure/databases/graph/config.py +7 -0
  37. cognee/infrastructure/databases/graph/get_graph_engine.py +3 -0
  38. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -0
  39. cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +81 -0
  40. cognee/infrastructure/databases/graph/kuzu/adapter.py +228 -0
  41. cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +168 -0
  42. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +80 -1
  43. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +9 -0
  44. cognee/infrastructure/databases/utils/__init__.py +3 -0
  45. cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +10 -0
  46. cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +66 -18
  47. cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +10 -0
  48. cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +30 -0
  49. cognee/infrastructure/databases/vector/config.py +5 -0
  50. cognee/infrastructure/databases/vector/create_vector_engine.py +6 -1
  51. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +8 -6
  52. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +9 -7
  53. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -10
  54. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +2 -0
  55. cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +50 -0
  56. cognee/infrastructure/databases/vector/vector_db_interface.py +35 -0
  57. cognee/infrastructure/engine/models/Edge.py +13 -1
  58. cognee/infrastructure/files/storage/s3_config.py +2 -0
  59. cognee/infrastructure/files/utils/guess_file_type.py +4 -0
  60. cognee/infrastructure/llm/LLMGateway.py +5 -2
  61. cognee/infrastructure/llm/config.py +37 -0
  62. cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +23 -8
  64. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +22 -18
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +5 -0
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +153 -0
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +47 -38
  68. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +46 -37
  69. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +20 -10
  70. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +23 -11
  71. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +36 -23
  72. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +47 -36
  73. cognee/infrastructure/loaders/LoaderEngine.py +1 -0
  74. cognee/infrastructure/loaders/core/__init__.py +2 -1
  75. cognee/infrastructure/loaders/core/csv_loader.py +93 -0
  76. cognee/infrastructure/loaders/core/text_loader.py +1 -2
  77. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +0 -9
  78. cognee/infrastructure/loaders/supported_loaders.py +2 -1
  79. cognee/memify_pipelines/create_triplet_embeddings.py +53 -0
  80. cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +55 -0
  81. cognee/modules/chunking/CsvChunker.py +35 -0
  82. cognee/modules/chunking/models/DocumentChunk.py +2 -1
  83. cognee/modules/chunking/text_chunker_with_overlap.py +124 -0
  84. cognee/modules/cognify/config.py +2 -0
  85. cognee/modules/data/deletion/prune_system.py +52 -2
  86. cognee/modules/data/methods/__init__.py +1 -0
  87. cognee/modules/data/methods/create_dataset.py +4 -2
  88. cognee/modules/data/methods/delete_dataset.py +26 -0
  89. cognee/modules/data/methods/get_dataset_ids.py +5 -1
  90. cognee/modules/data/methods/get_unique_data_id.py +68 -0
  91. cognee/modules/data/methods/get_unique_dataset_id.py +66 -4
  92. cognee/modules/data/models/Dataset.py +2 -0
  93. cognee/modules/data/processing/document_types/CsvDocument.py +33 -0
  94. cognee/modules/data/processing/document_types/__init__.py +1 -0
  95. cognee/modules/engine/models/Triplet.py +9 -0
  96. cognee/modules/engine/models/__init__.py +1 -0
  97. cognee/modules/graph/cognee_graph/CogneeGraph.py +89 -39
  98. cognee/modules/graph/cognee_graph/CogneeGraphElements.py +8 -3
  99. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +19 -2
  100. cognee/modules/graph/utils/resolve_edges_to_text.py +48 -49
  101. cognee/modules/ingestion/identify.py +4 -4
  102. cognee/modules/memify/memify.py +1 -7
  103. cognee/modules/notebooks/operations/run_in_local_sandbox.py +3 -0
  104. cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +55 -23
  105. cognee/modules/pipelines/operations/pipeline.py +18 -2
  106. cognee/modules/pipelines/operations/run_tasks_data_item.py +1 -1
  107. cognee/modules/retrieval/EntityCompletionRetriever.py +10 -3
  108. cognee/modules/retrieval/__init__.py +1 -1
  109. cognee/modules/retrieval/base_graph_retriever.py +7 -3
  110. cognee/modules/retrieval/base_retriever.py +7 -3
  111. cognee/modules/retrieval/completion_retriever.py +11 -4
  112. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +10 -2
  113. cognee/modules/retrieval/graph_completion_cot_retriever.py +18 -51
  114. cognee/modules/retrieval/graph_completion_retriever.py +14 -1
  115. cognee/modules/retrieval/graph_summary_completion_retriever.py +4 -0
  116. cognee/modules/retrieval/register_retriever.py +10 -0
  117. cognee/modules/retrieval/registered_community_retrievers.py +1 -0
  118. cognee/modules/retrieval/temporal_retriever.py +13 -2
  119. cognee/modules/retrieval/triplet_retriever.py +182 -0
  120. cognee/modules/retrieval/utils/brute_force_triplet_search.py +43 -11
  121. cognee/modules/retrieval/utils/completion.py +2 -22
  122. cognee/modules/run_custom_pipeline/__init__.py +1 -0
  123. cognee/modules/run_custom_pipeline/run_custom_pipeline.py +76 -0
  124. cognee/modules/search/methods/get_search_type_tools.py +54 -8
  125. cognee/modules/search/methods/no_access_control_search.py +4 -0
  126. cognee/modules/search/methods/search.py +26 -3
  127. cognee/modules/search/types/SearchType.py +1 -1
  128. cognee/modules/settings/get_settings.py +19 -0
  129. cognee/modules/users/methods/create_user.py +12 -27
  130. cognee/modules/users/methods/get_authenticated_user.py +3 -2
  131. cognee/modules/users/methods/get_default_user.py +4 -2
  132. cognee/modules/users/methods/get_user.py +1 -1
  133. cognee/modules/users/methods/get_user_by_email.py +1 -1
  134. cognee/modules/users/models/DatasetDatabase.py +24 -3
  135. cognee/modules/users/models/Tenant.py +6 -7
  136. cognee/modules/users/models/User.py +6 -5
  137. cognee/modules/users/models/UserTenant.py +12 -0
  138. cognee/modules/users/models/__init__.py +1 -0
  139. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +13 -13
  140. cognee/modules/users/roles/methods/add_user_to_role.py +3 -1
  141. cognee/modules/users/tenants/methods/__init__.py +1 -0
  142. cognee/modules/users/tenants/methods/add_user_to_tenant.py +21 -12
  143. cognee/modules/users/tenants/methods/create_tenant.py +22 -8
  144. cognee/modules/users/tenants/methods/select_tenant.py +62 -0
  145. cognee/shared/logging_utils.py +6 -0
  146. cognee/shared/rate_limiting.py +30 -0
  147. cognee/tasks/chunks/__init__.py +1 -0
  148. cognee/tasks/chunks/chunk_by_row.py +94 -0
  149. cognee/tasks/documents/__init__.py +0 -1
  150. cognee/tasks/documents/classify_documents.py +2 -0
  151. cognee/tasks/feedback/generate_improved_answers.py +3 -3
  152. cognee/tasks/graph/extract_graph_from_data.py +9 -10
  153. cognee/tasks/ingestion/ingest_data.py +1 -1
  154. cognee/tasks/memify/__init__.py +2 -0
  155. cognee/tasks/memify/cognify_session.py +41 -0
  156. cognee/tasks/memify/extract_user_sessions.py +73 -0
  157. cognee/tasks/memify/get_triplet_datapoints.py +289 -0
  158. cognee/tasks/storage/add_data_points.py +142 -2
  159. cognee/tasks/storage/index_data_points.py +33 -22
  160. cognee/tasks/storage/index_graph_edges.py +37 -57
  161. cognee/tests/integration/documents/CsvDocument_test.py +70 -0
  162. cognee/tests/integration/retrieval/test_triplet_retriever.py +84 -0
  163. cognee/tests/integration/tasks/test_add_data_points.py +139 -0
  164. cognee/tests/integration/tasks/test_get_triplet_datapoints.py +69 -0
  165. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +1 -1
  166. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +1 -1
  167. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +13 -27
  168. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +1 -1
  169. cognee/tests/test_add_docling_document.py +2 -2
  170. cognee/tests/test_cognee_server_start.py +84 -3
  171. cognee/tests/test_conversation_history.py +68 -5
  172. cognee/tests/test_data/example_with_header.csv +3 -0
  173. cognee/tests/test_dataset_database_handler.py +137 -0
  174. cognee/tests/test_dataset_delete.py +76 -0
  175. cognee/tests/test_edge_centered_payload.py +170 -0
  176. cognee/tests/test_edge_ingestion.py +27 -0
  177. cognee/tests/test_feedback_enrichment.py +1 -1
  178. cognee/tests/test_library.py +6 -4
  179. cognee/tests/test_load.py +62 -0
  180. cognee/tests/test_multi_tenancy.py +165 -0
  181. cognee/tests/test_parallel_databases.py +2 -0
  182. cognee/tests/test_pipeline_cache.py +164 -0
  183. cognee/tests/test_relational_db_migration.py +54 -2
  184. cognee/tests/test_search_db.py +44 -2
  185. cognee/tests/unit/api/test_conditional_authentication_endpoints.py +12 -3
  186. cognee/tests/unit/api/test_ontology_endpoint.py +252 -0
  187. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +5 -0
  188. cognee/tests/unit/infrastructure/databases/test_index_data_points.py +27 -0
  189. cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +14 -16
  190. cognee/tests/unit/infrastructure/llm/test_llm_config.py +46 -0
  191. cognee/tests/unit/infrastructure/mock_embedding_engine.py +3 -7
  192. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +0 -5
  193. cognee/tests/unit/modules/chunking/test_text_chunker.py +248 -0
  194. cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py +324 -0
  195. cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
  196. cognee/tests/unit/modules/graph/cognee_graph_test.py +406 -0
  197. cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +111 -0
  198. cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py +175 -0
  199. cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +214 -0
  200. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -51
  201. cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +1 -0
  202. cognee/tests/unit/modules/retrieval/structured_output_test.py +204 -0
  203. cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +1 -1
  204. cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +0 -1
  205. cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +608 -0
  206. cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +83 -0
  207. cognee/tests/unit/modules/users/test_conditional_authentication.py +0 -63
  208. cognee/tests/unit/processing/chunks/chunk_by_row_test.py +52 -0
  209. cognee/tests/unit/tasks/storage/test_add_data_points.py +288 -0
  210. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/METADATA +11 -6
  211. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/RECORD +215 -163
  212. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/WHEEL +1 -1
  213. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/entry_points.txt +0 -1
  214. cognee/api/v1/cognify/code_graph_pipeline.py +0 -119
  215. cognee/api/v1/cognify/routers/get_code_pipeline_router.py +0 -90
  216. cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +0 -544
  217. cognee/modules/retrieval/code_retriever.py +0 -232
  218. cognee/tasks/code/enrich_dependency_graph_checker.py +0 -35
  219. cognee/tasks/code/get_local_dependencies_checker.py +0 -20
  220. cognee/tasks/code/get_repo_dependency_graph_checker.py +0 -35
  221. cognee/tasks/documents/check_permissions_on_dataset.py +0 -26
  222. cognee/tasks/repo_processor/__init__.py +0 -2
  223. cognee/tasks/repo_processor/get_local_dependencies.py +0 -335
  224. cognee/tasks/repo_processor/get_non_code_files.py +0 -158
  225. cognee/tasks/repo_processor/get_repo_file_dependencies.py +0 -243
  226. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/LICENSE +0 -0
  227. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/NOTICE.md +0 -0
cognee/tests/test_edge_centered_payload.py
@@ -0,0 +1,170 @@
+ """
+ End-to-end integration test for edge-centered payload and triplet embeddings.
+
+ """
+
+ import os
+ import pathlib
+ import cognee
+ from cognee.infrastructure.databases.graph import get_graph_engine
+ from cognee.infrastructure.databases.vector import get_vector_engine
+ from cognee.modules.search.types import SearchType
+ from cognee.shared.logging_utils import get_logger
+ from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
+ from cognee.modules.ontology.ontology_config import Config
+
+ logger = get_logger()
+
+ text_data = """
+ Apple is a technology company that produces the iPhone, iPad, and Mac computers.
+ The company is known for its innovative products and ecosystem integration.
+
+ Microsoft develops the Windows operating system and Office productivity suite.
+ They are also major players in cloud computing with Azure.
+
+ Google created the Android operating system and provides search engine services.
+ The company is a leader in artificial intelligence and machine learning.
+ """
+
+ ontology_content = """<?xml version="1.0"?>
+ <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:owl="http://www.w3.org/2002/07/owl#"
+ xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
+ xmlns="http://example.org/tech#"
+ xml:base="http://example.org/tech">
+
+ <owl:Ontology rdf:about="http://example.org/tech"/>
+
+ <!-- Classes -->
+ <owl:Class rdf:ID="Company"/>
+ <owl:Class rdf:ID="TechnologyCompany"/>
+ <owl:Class rdf:ID="Product"/>
+ <owl:Class rdf:ID="Software"/>
+ <owl:Class rdf:ID="Hardware"/>
+ <owl:Class rdf:ID="Service"/>
+
+ <rdf:Description rdf:about="#TechnologyCompany">
+ <rdfs:subClassOf rdf:resource="#Company"/>
+ <rdfs:comment>A company operating in the technology sector.</rdfs:comment>
+ </rdf:Description>
+
+ <rdf:Description rdf:about="#Software">
+ <rdfs:subClassOf rdf:resource="#Product"/>
+ <rdfs:comment>Software products and applications.</rdfs:comment>
+ </rdf:Description>
+
+ <rdf:Description rdf:about="#Hardware">
+ <rdfs:subClassOf rdf:resource="#Product"/>
+ <rdfs:comment>Physical hardware products.</rdfs:comment>
+ </rdf:Description>
+
+ <!-- Individuals -->
+ <TechnologyCompany rdf:ID="apple">
+ <rdfs:label>Apple</rdfs:label>
+ </TechnologyCompany>
+
+ <TechnologyCompany rdf:ID="microsoft">
+ <rdfs:label>Microsoft</rdfs:label>
+ </TechnologyCompany>
+
+ <TechnologyCompany rdf:ID="google">
+ <rdfs:label>Google</rdfs:label>
+ </TechnologyCompany>
+
+ <Hardware rdf:ID="iphone">
+ <rdfs:label>iPhone</rdfs:label>
+ </Hardware>
+
+ <Software rdf:ID="windows">
+ <rdfs:label>Windows</rdfs:label>
+ </Software>
+
+ <Software rdf:ID="android">
+ <rdfs:label>Android</rdfs:label>
+ </Software>
+
+ </rdf:RDF>"""
+
+
+ async def main():
+     data_directory_path = str(
+         pathlib.Path(
+             os.path.join(
+                 pathlib.Path(__file__).parent,
+                 ".data_storage/test_edge_centered_payload",
+             )
+         ).resolve()
+     )
+     cognee_directory_path = str(
+         pathlib.Path(
+             os.path.join(
+                 pathlib.Path(__file__).parent,
+                 ".cognee_system/test_edge_centered_payload",
+             )
+         ).resolve()
+     )
+
+     cognee.config.data_root_directory(data_directory_path)
+     cognee.config.system_root_directory(cognee_directory_path)
+
+     dataset_name = "tech_companies"
+
+     await cognee.prune.prune_data()
+     await cognee.prune.prune_system(metadata=True)
+
+     await cognee.add(data=text_data, dataset_name=dataset_name)
+
+     import tempfile
+
+     with tempfile.NamedTemporaryFile(mode="w", suffix=".owl", delete=False) as f:
+         f.write(ontology_content)
+         ontology_file_path = f.name
+
+     try:
+         logger.info(f"Loading ontology from: {ontology_file_path}")
+         config: Config = {
+             "ontology_config": {
+                 "ontology_resolver": RDFLibOntologyResolver(ontology_file=ontology_file_path)
+             }
+         }
+
+         await cognee.cognify(datasets=[dataset_name], config=config)
+         graph_engine = await get_graph_engine()
+         nodes_phase2, edges_phase2 = await graph_engine.get_graph_data()
+
+         vector_engine = get_vector_engine()
+         triplets_phase2 = await vector_engine.search(
+             query_text="technology", limit=None, collection_name="Triplet_text"
+         )
+
+         assert len(triplets_phase2) == len(edges_phase2), (
+             f"Triplet embeddings and number of edges do not match. Vector db contains {len(triplets_phase2)} edge triplets while graph db contains {len(edges_phase2)} edges."
+         )
+
+         search_results_phase2 = await cognee.search(
+             query_type=SearchType.TRIPLET_COMPLETION,
+             query_text="What products does Apple make?",
+         )
+
+         assert search_results_phase2 is not None, (
+             "Search should return results for triplet embeddings in simple ontology use case."
+         )
+
+     finally:
+         if os.path.exists(ontology_file_path):
+             os.unlink(ontology_file_path)
+
+
+ if __name__ == "__main__":
+     import asyncio
+     from cognee.shared.logging_utils import setup_logging
+
+     setup_logging()
+
+     loop = asyncio.new_event_loop()
+     asyncio.set_event_loop(loop)
+     try:
+         loop.run_until_complete(main())
+     finally:
+         loop.run_until_complete(loop.shutdown_asyncgens())
+         loop.close()
cognee/tests/test_edge_ingestion.py
@@ -52,6 +52,33 @@ async def test_edge_ingestion():

      edge_type_counts = Counter(edge_type[2] for edge_type in graph[1])

+     "Tests edge_text presence and format"
+     contains_edges = [edge for edge in graph[1] if edge[2] == "contains"]
+     assert len(contains_edges) > 0, "Expected at least one contains edge for edge_text verification"
+
+     edge_properties = contains_edges[0][3]
+     assert "edge_text" in edge_properties, "Expected edge_text in edge properties"
+
+     edge_text = edge_properties["edge_text"]
+     assert "relationship_name: contains" in edge_text, (
+         f"Expected 'relationship_name: contains' in edge_text, got: {edge_text}"
+     )
+     assert "entity_name:" in edge_text, f"Expected 'entity_name:' in edge_text, got: {edge_text}"
+     assert "entity_description:" in edge_text, (
+         f"Expected 'entity_description:' in edge_text, got: {edge_text}"
+     )
+
+     all_edge_texts = [
+         edge[3].get("edge_text", "") for edge in contains_edges if "edge_text" in edge[3]
+     ]
+     expected_entities = ["dave", "ana", "bob", "dexter", "apples", "cognee"]
+     found_entity = any(
+         any(entity in text.lower() for entity in expected_entities) for text in all_edge_texts
+     )
+     assert found_entity, (
+         f"Expected to find at least one entity name in edge_text: {all_edge_texts[:3]}"
+     )
+
      "Tests the presence of basic nested edges"
      for basic_nested_edge in basic_nested_edges:
          assert edge_type_counts.get(basic_nested_edge, 0) >= 1, (
cognee/tests/test_feedback_enrichment.py
@@ -133,7 +133,7 @@ async def main():
          extraction_tasks=extraction_tasks,
          enrichment_tasks=enrichment_tasks,
          data=[{}],
-         dataset="feedback_enrichment_test_memify",
+         dataset=dataset_name,
      )

      nodes_after, edges_after = await graph_engine.get_graph_data()
cognee/tests/test_library.py
@@ -90,15 +90,17 @@ async def main():
      )

      search_results = await cognee.search(
-         query_type=SearchType.GRAPH_COMPLETION, query_text="What information do you contain?"
+         query_type=SearchType.GRAPH_COMPLETION,
+         query_text="What information do you contain?",
+         dataset_ids=[pipeline_run_obj.dataset_id],
      )
-     assert "Mark" in search_results[0], (
+     assert "Mark" in search_results[0]["search_result"][0], (
          "Failed to update document, no mention of Mark in search results"
      )
-     assert "Cindy" in search_results[0], (
+     assert "Cindy" in search_results[0]["search_result"][0], (
          "Failed to update document, no mention of Cindy in search results"
      )
-     assert "Artificial intelligence" not in search_results[0], (
+     assert "Artificial intelligence" not in search_results[0]["search_result"][0], (
          "Failed to update document, Artificial intelligence still mentioned in search results"
      )

cognee/tests/test_load.py
@@ -0,0 +1,62 @@
+ import os
+ import pathlib
+ import asyncio
+ import time
+
+ import cognee
+ from cognee.modules.search.types import SearchType
+ from cognee.shared.logging_utils import get_logger
+
+ logger = get_logger()
+
+
+ async def process_and_search(num_of_searches):
+     start_time = time.time()
+
+     await cognee.cognify()
+
+     await asyncio.gather(
+         *[
+             cognee.search(
+                 query_text="Tell me about the document", query_type=SearchType.GRAPH_COMPLETION
+             )
+             for _ in range(num_of_searches)
+         ]
+     )
+
+     end_time = time.time()
+
+     return end_time - start_time
+
+
+ async def main():
+     data_directory_path = os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_load")
+     cognee.config.data_root_directory(data_directory_path)
+
+     cognee_directory_path = os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_load")
+     cognee.config.system_root_directory(cognee_directory_path)
+
+     num_of_pdfs = 10
+     num_of_reps = 5
+     upper_boundary_minutes = 10
+     average_minutes = 8
+
+     recorded_times = []
+     for _ in range(num_of_reps):
+         await cognee.prune.prune_data()
+         await cognee.prune.prune_system(metadata=True)
+
+         s3_input = "s3://cognee-test-load-s3-bucket"
+         await cognee.add(s3_input)
+
+         recorded_times.append(await process_and_search(num_of_pdfs))
+
+     average_recorded_time = sum(recorded_times) / len(recorded_times)
+
+     assert average_recorded_time <= average_minutes * 60
+
+     assert all(rec_time <= upper_boundary_minutes * 60 for rec_time in recorded_times)
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
cognee/tests/test_multi_tenancy.py
@@ -0,0 +1,165 @@
+ import cognee
+ import pytest
+
+ from cognee.modules.users.exceptions import PermissionDeniedError
+ from cognee.modules.users.tenants.methods import select_tenant
+ from cognee.modules.users.methods import get_user
+ from cognee.shared.logging_utils import get_logger
+ from cognee.modules.search.types import SearchType
+ from cognee.modules.users.methods import create_user
+ from cognee.modules.users.permissions.methods import authorized_give_permission_on_datasets
+ from cognee.modules.users.roles.methods import add_user_to_role
+ from cognee.modules.users.roles.methods import create_role
+ from cognee.modules.users.tenants.methods import create_tenant
+ from cognee.modules.users.tenants.methods import add_user_to_tenant
+ from cognee.modules.engine.operations.setup import setup
+ from cognee.shared.logging_utils import setup_logging, CRITICAL
+
+ logger = get_logger()
+
+
+ async def main():
+     # Create a clean slate for cognee -- reset data and system state
+     print("Resetting cognee data...")
+     await cognee.prune.prune_data()
+     await cognee.prune.prune_system(metadata=True)
+     print("Data reset complete.\n")
+
+     # Set up the necessary databases and tables for user management.
+     await setup()
+
+     # Add document for user_1, add it under dataset name AI
+     text = """A quantum computer is a computer that takes advantage of quantum mechanical phenomena.
+     At small scales, physical matter exhibits properties of both particles and waves, and quantum computing leverages
+     this behavior, specifically quantum superposition and entanglement, using specialized hardware that supports the
+     preparation and manipulation of quantum state"""
+
+     print("Creating user_1: user_1@example.com")
+     user_1 = await create_user("user_1@example.com", "example")
+     await cognee.add([text], dataset_name="AI", user=user_1)
+
+     print("\nCreating user_2: user_2@example.com")
+     user_2 = await create_user("user_2@example.com", "example")
+
+     # Run cognify for both datasets as the appropriate user/owner
+     print("\nCreating different datasets for user_1 (AI dataset) and user_2 (QUANTUM dataset)")
+     ai_cognify_result = await cognee.cognify(["AI"], user=user_1)
+
+     # Extract dataset_ids from cognify results
+     def extract_dataset_id_from_cognify(cognify_result):
+         """Extract dataset_id from cognify output dictionary"""
+         for dataset_id, pipeline_result in cognify_result.items():
+             return dataset_id # Return the first dataset_id
+         return None
+
+     # Get dataset IDs from cognify results
+     # Note: When we want to work with datasets from other users (search, add, cognify and etc.) we must supply dataset
+     # information through dataset_id using dataset name only looks for datasets owned by current user
+     ai_dataset_id = extract_dataset_id_from_cognify(ai_cognify_result)
+
+     # We can see here that user_1 can read his own dataset (AI dataset)
+     search_results = await cognee.search(
+         query_type=SearchType.GRAPH_COMPLETION,
+         query_text="What is in the document?",
+         user=user_1,
+         datasets=[ai_dataset_id],
+     )
+
+     # Verify that user_2 cannot access user_1's dataset without permission
+     with pytest.raises(PermissionDeniedError):
+         search_results = await cognee.search(
+             query_type=SearchType.GRAPH_COMPLETION,
+             query_text="What is in the document?",
+             user=user_2,
+             datasets=[ai_dataset_id],
+         )
+
+     # Create new tenant and role, add user_2 to tenant and role
+     tenant_id = await create_tenant("CogneeLab", user_1.id)
+     await select_tenant(user_id=user_1.id, tenant_id=tenant_id)
+     role_id = await create_role(role_name="Researcher", owner_id=user_1.id)
+     await add_user_to_tenant(
+         user_id=user_2.id, tenant_id=tenant_id, owner_id=user_1.id, set_as_active_tenant=True
+     )
+     await add_user_to_role(user_id=user_2.id, role_id=role_id, owner_id=user_1.id)
+
+     # Assert that user_1 cannot give permissions on his dataset to role before switching to the correct tenant
+     # AI dataset was made with default tenant and not CogneeLab tenant
+     with pytest.raises(PermissionDeniedError):
+         await authorized_give_permission_on_datasets(
+             role_id,
+             [ai_dataset_id],
+             "read",
+             user_1.id,
+         )
+
+     # We need to refresh the user object with changes made when switching tenants
+     user_1 = await get_user(user_1.id)
+     await cognee.add([text], dataset_name="AI_COGNEE_LAB", user=user_1)
+     ai_cognee_lab_cognify_result = await cognee.cognify(["AI_COGNEE_LAB"], user=user_1)
+
+     ai_cognee_lab_dataset_id = extract_dataset_id_from_cognify(ai_cognee_lab_cognify_result)
+
+     await authorized_give_permission_on_datasets(
+         role_id,
+         [ai_cognee_lab_dataset_id],
+         "read",
+         user_1.id,
+     )
+
+     search_results = await cognee.search(
+         query_type=SearchType.GRAPH_COMPLETION,
+         query_text="What is in the document?",
+         user=user_2,
+         dataset_ids=[ai_cognee_lab_dataset_id],
+     )
+     for result in search_results:
+         print(f"{result}\n")
+
+     # Let's test changing tenants
+     tenant_id = await create_tenant("CogneeLab2", user_1.id)
+     await select_tenant(user_id=user_1.id, tenant_id=tenant_id)
+
+     user_1 = await get_user(user_1.id)
+     await cognee.add([text], dataset_name="AI_COGNEE_LAB", user=user_1)
+     await cognee.cognify(["AI_COGNEE_LAB"], user=user_1)
+
+     search_results = await cognee.search(
+         query_type=SearchType.GRAPH_COMPLETION,
+         query_text="What is in the document?",
+         user=user_1,
+     )
+
+     # Assert only AI_COGNEE_LAB dataset from CogneeLab2 tenant is visible as the currently selected tenant
+     assert len(search_results) == 1, (
+         f"Search results must only contain one dataset from current tenant: {search_results}"
+     )
+     assert search_results[0]["dataset_name"] == "AI_COGNEE_LAB", (
+         f"Dict must contain dataset name 'AI_COGNEE_LAB': {search_results[0]}"
+     )
+     assert search_results[0]["dataset_tenant_id"] == user_1.tenant_id, (
+         f"Dataset tenant_id must be same as user_1 tenant_id: {search_results[0]}"
+     )
+
+     # Switch back to no tenant (default tenant)
+     await select_tenant(user_id=user_1.id, tenant_id=None)
+     # Refresh user_1 object
+     user_1 = await get_user(user_1.id)
+     search_results = await cognee.search(
+         query_type=SearchType.GRAPH_COMPLETION,
+         query_text="What is in the document?",
+         user=user_1,
+     )
+     assert len(search_results) == 1, (
+         f"Search results must only contain one dataset from default tenant: {search_results}"
+     )
+     assert search_results[0]["dataset_name"] == "AI", (
+         f"Dict must contain dataset name 'AI': {search_results[0]}"
+     )
+
+
+ if __name__ == "__main__":
+     import asyncio
+
+     logger = setup_logging(log_level=CRITICAL)
+     asyncio.run(main())
cognee/tests/test_parallel_databases.py
@@ -33,11 +33,13 @@ async def main():
          "vector_db_url": "cognee1.test",
          "vector_db_key": "",
          "vector_db_provider": "lancedb",
+         "vector_db_name": "",
      }
      task_2_config = {
          "vector_db_url": "cognee2.test",
          "vector_db_key": "",
          "vector_db_provider": "lancedb",
+         "vector_db_name": "",
      }

      task_1_graph_config = {
cognee/tests/test_pipeline_cache.py
@@ -0,0 +1,164 @@
+ """
+ Test suite for the pipeline_cache feature in Cognee pipelines.
+
+ This module tests the behavior of the `pipeline_cache` parameter which controls
+ whether a pipeline should skip re-execution when it has already been completed
+ for the same dataset.
+
+ Architecture Overview:
+ ---------------------
+ The pipeline_cache mechanism works at the dataset level:
+ 1. When a pipeline runs, it logs its status (INITIATED -> STARTED -> COMPLETED)
+ 2. Before each run, `check_pipeline_run_qualification()` checks the pipeline status
+ 3. If `use_pipeline_cache=True` and status is COMPLETED/STARTED, the pipeline skips
+ 4. If `use_pipeline_cache=False`, the pipeline always re-executes regardless of status
+ """
+
+ import pytest
+
+ import cognee
+ from cognee.modules.pipelines.tasks.task import Task
+ from cognee.modules.pipelines import run_pipeline
+ from cognee.modules.users.methods import get_default_user
+
+ from cognee.modules.pipelines.layers.reset_dataset_pipeline_run_status import (
+     reset_dataset_pipeline_run_status,
+ )
+ from cognee.infrastructure.databases.relational import create_db_and_tables
+
+
+ class ExecutionCounter:
+     """Helper class to track task execution counts."""
+
+     def __init__(self):
+         self.count = 0
+
+
+ async def create_counting_task(data, counter: ExecutionCounter):
+     """Create a task that increments a counter from the ExecutionCounter instance when executed."""
+     counter.count += 1
+     return counter
+
+
+ class TestPipelineCache:
+     """Tests for basic pipeline_cache on/off behavior."""
+
+     @pytest.mark.asyncio
+     async def test_pipeline_cache_off_allows_reexecution(self):
+         """
+         Test that with use_pipeline_cache=False, the pipeline re-executes
+         even when it has already completed for the dataset.
+
+         Expected behavior:
+         - First run: Pipeline executes fully, task runs once
+         - Second run: Pipeline executes again, task runs again (total: 2 times)
+         """
+         await cognee.prune.prune_data()
+         await cognee.prune.prune_system(metadata=True)
+         await create_db_and_tables()
+
+         counter = ExecutionCounter()
+         user = await get_default_user()
+
+         tasks = [Task(create_counting_task, counter=counter)]
+
+         # First run
+         pipeline_results_1 = []
+         async for result in run_pipeline(
+             tasks=tasks,
+             datasets="test_dataset_cache_off",
+             data=["sample data"], # Data is necessary to trigger processing
+             user=user,
+             pipeline_name="test_cache_off_pipeline",
+             use_pipeline_cache=False,
+         ):
+             pipeline_results_1.append(result)
+
+         first_run_count = counter.count
+         assert first_run_count >= 1, "Task should have executed at least once on first run"
+
+         # Second run with pipeline_cache=False
+         pipeline_results_2 = []
+         async for result in run_pipeline(
+             tasks=tasks,
+             datasets="test_dataset_cache_off",
+             data=["sample data"], # Data is necessary to trigger processing
+             user=user,
+             pipeline_name="test_cache_off_pipeline",
+             use_pipeline_cache=False,
+         ):
+             pipeline_results_2.append(result)
+
+         second_run_count = counter.count
+         assert second_run_count > first_run_count, (
+             f"With pipeline_cache=False, task should re-execute. "
+             f"First run: {first_run_count}, After second run: {second_run_count}"
+         )
+
+     @pytest.mark.asyncio
+     async def test_reset_pipeline_status_allows_reexecution_with_cache(self):
+         """
+         Test that resetting pipeline status allows re-execution even with
+         pipeline_cache=True.
+         """
+         await cognee.prune.prune_data()
+         await cognee.prune.prune_system(metadata=True)
+         await create_db_and_tables()
+
+         counter = ExecutionCounter()
+         user = await get_default_user()
+         dataset_name = "reset_status_test"
+         pipeline_name = "test_reset_pipeline"
+
+         tasks = [Task(create_counting_task, counter=counter)]
+
+         # First run
+         pipeline_result = []
+         async for result in run_pipeline(
+             tasks=tasks,
+             datasets=dataset_name,
+             user=user,
+             data=["sample data"], # Data is necessary to trigger processing
+             pipeline_name=pipeline_name,
+             use_pipeline_cache=True,
+         ):
+             pipeline_result.append(result)
+
+         first_run_count = counter.count
+         assert first_run_count >= 1
+
+         # Second run without reset - should skip
+         async for _ in run_pipeline(
+             tasks=tasks,
+             datasets=dataset_name,
+             user=user,
+             data=["sample data"], # Data is necessary to trigger processing
+             pipeline_name=pipeline_name,
+             use_pipeline_cache=True,
+         ):
+             pass
+
+         after_second_run = counter.count
+         assert after_second_run == first_run_count, "Should have skipped due to cache"
+
+         # Reset the pipeline status
+         await reset_dataset_pipeline_run_status(
+             pipeline_result[0].dataset_id, user, pipeline_names=[pipeline_name]
+         )
+
+         # Third run after reset - should execute
+         async for _ in run_pipeline(
+             tasks=tasks,
+             datasets=dataset_name,
+             user=user,
+             data=["sample data"], # Data is necessary to trigger processing
+             pipeline_name=pipeline_name,
+             use_pipeline_cache=True,
+         ):
+             pass
+
+         after_reset_run = counter.count
+         assert after_reset_run > after_second_run, (
+             f"After reset, pipeline should re-execute. "
+             f"Before reset: {after_second_run}, After reset run: {after_reset_run}"
+         )