graphiti-core 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -96,14 +96,18 @@ async def bfs(node_ids: list[str], driver: AsyncDriver):


 async def edge_similarity_search(
-    search_vector: list[float], driver: AsyncDriver, limit=RELEVANT_SCHEMA_LIMIT
+    driver: AsyncDriver,
+    search_vector: list[float],
+    limit: int = RELEVANT_SCHEMA_LIMIT,
+    source_node_uuid: str = '*',
+    target_node_uuid: str = '*',
 ) -> list[EntityEdge]:
     # vector similarity search over embedded facts
     records, _, _ = await driver.execute_query(
         """
         CALL db.index.vector.queryRelationships("fact_embedding", $limit, $search_vector)
-        YIELD relationship AS r, score
-        MATCH (n)-[r:RELATES_TO]->(m)
+        YIELD relationship AS rel, score
+        MATCH (n:Entity {uuid: $source_uuid})-[r {uuid: rel.uuid}]-(m:Entity {uuid: $target_uuid})
         RETURN
             r.uuid AS uuid,
             n.uuid AS source_node_uuid,
@@ -119,6 +123,8 @@ async def edge_similarity_search(
         ORDER BY score DESC
         """,
         search_vector=search_vector,
+        source_uuid=source_node_uuid,
+        target_uuid=target_node_uuid,
         limit=limit,
     )

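edge_similarity_search (and edge_fulltext_search below) now takes the AsyncDriver as its first argument and accepts optional source_node_uuid / target_node_uuid filters that default to '*'. A hypothetical call site illustrating the new parameter order; the embedding variable and UUID strings are placeholders, not values from the diff:

edges = await edge_similarity_search(
    driver,                         # AsyncDriver is now the first positional argument
    fact_embedding,                 # list[float] embedding of the fact to search for
    RELEVANT_SCHEMA_LIMIT,          # cap on results returned by the vector index
    source_node_uuid='source-entity-uuid',
    target_node_uuid='target-entity-uuid',
)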
@@ -214,7 +220,11 @@ async def entity_fulltext_search(


 async def edge_fulltext_search(
-    query: str, driver: AsyncDriver, limit=RELEVANT_SCHEMA_LIMIT
+    driver: AsyncDriver,
+    query: str,
+    limit=RELEVANT_SCHEMA_LIMIT,
+    source_node_uuid: str = '*',
+    target_node_uuid: str = '*',
 ) -> list[EntityEdge]:
     # fulltext search over facts
     fuzzy_query = re.sub(r'[^\w\s]', '', query) + '~'
@@ -222,8 +232,8 @@ async def edge_fulltext_search(
     records, _, _ = await driver.execute_query(
         """
         CALL db.index.fulltext.queryRelationships("name_and_fact", $query)
-        YIELD relationship AS r, score
-        MATCH (n:Entity)-[r]->(m:Entity)
+        YIELD relationship AS rel, score
+        MATCH (n:Entity {uuid: $source_uuid})-[r {uuid: rel.uuid}]-(m:Entity {uuid: $target_uuid})
         RETURN
             r.uuid AS uuid,
             n.uuid AS source_node_uuid,
@@ -239,6 +249,8 @@ async def edge_fulltext_search(
         ORDER BY score DESC LIMIT $limit
         """,
         query=fuzzy_query,
+        source_uuid=source_node_uuid,
+        target_uuid=target_node_uuid,
         limit=limit,
     )

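edge_fulltext_search builds its Lucene query by stripping punctuation from the input and appending '~' to request fuzzy matching. A quick worked example of that transformation on a placeholder query string:

import re

query = 'Paris is the capital of France!'
fuzzy_query = re.sub(r'[^\w\s]', '', query) + '~'
# fuzzy_query == 'Paris is the capital of France~'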
@@ -268,13 +280,13 @@ async def hybrid_node_search(
     queries: list[str],
     embeddings: list[list[float]],
     driver: AsyncDriver,
-    limit: int | None = None,
+    limit: int = RELEVANT_SCHEMA_LIMIT,
 ) -> list[EntityNode]:
     """
     Perform a hybrid search for nodes using both text queries and embeddings.

     This method combines fulltext search and vector similarity search to find
-    relevant nodes in the graph database.
+    relevant nodes in the graph database. It uses an rrf reranker.

     Parameters
     ----------
@@ -307,27 +319,25 @@ async def hybrid_node_search(
     """

     start = time()
-    relevant_nodes: list[EntityNode] = []
-    relevant_node_uuids = set()

-    results = await asyncio.gather(
-        *[entity_fulltext_search(q, driver, 2 * (limit or RELEVANT_SCHEMA_LIMIT)) for q in queries],
-        *[
-            entity_similarity_search(e, driver, 2 * (limit or RELEVANT_SCHEMA_LIMIT))
-            for e in embeddings
-        ],
+    results: list[list[EntityNode]] = list(
+        await asyncio.gather(
+            *[entity_fulltext_search(q, driver, 2 * limit) for q in queries],
+            *[entity_similarity_search(e, driver, 2 * limit) for e in embeddings],
+        )
     )

-    for result in results:
-        for node in result:
-            if node.uuid in relevant_node_uuids:
-                continue
+    node_uuid_map: dict[str, EntityNode] = {
+        node.uuid: node for result in results for node in result
+    }
+    result_uuids = [[node.uuid for node in result] for result in results]

-            relevant_node_uuids.add(node.uuid)
-            relevant_nodes.append(node)
+    ranked_uuids = rrf(result_uuids)
+
+    relevant_nodes: list[EntityNode] = [node_uuid_map[uuid] for uuid in ranked_uuids]

     end = time()
-    logger.info(f'Found relevant nodes: {relevant_node_uuids} in {(end - start) * 1000} ms')
+    logger.info(f'Found relevant nodes: {ranked_uuids} in {(end - start) * 1000} ms')
     return relevant_nodes


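hybrid_node_search now merges the fulltext and similarity result lists with reciprocal rank fusion rather than simple de-duplication. The rrf helper itself does not appear in this diff; a minimal sketch of reciprocal rank fusion over ranked UUID lists, assuming the conventional 1 / (k + rank) scoring (the helper shipped in the package may differ):

from collections import defaultdict

def rrf(results: list[list[str]], rank_const: int = 1) -> list[str]:
    # Each result list contributes 1 / (rank_const + rank) for every uuid it ranks.
    scores: dict[str, float] = defaultdict(float)
    for result in results:
        for rank, uuid in enumerate(result):
            scores[uuid] += 1 / (rank + rank_const)
    # Return uuids ordered by combined score, best first.
    return sorted(scores, key=lambda uuid: scores[uuid], reverse=True)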
@@ -371,6 +381,9 @@ async def get_relevant_nodes(
 async def get_relevant_edges(
     edges: list[EntityEdge],
     driver: AsyncDriver,
+    limit: int = RELEVANT_SCHEMA_LIMIT,
+    source_node_uuid: str = '*',
+    target_node_uuid: str = '*',
 ) -> list[EntityEdge]:
     start = time()
     relevant_edges: list[EntityEdge] = []
@@ -378,11 +391,16 @@ async def get_relevant_edges(

     results = await asyncio.gather(
         *[
-            edge_similarity_search(edge.fact_embedding, driver)
+            edge_similarity_search(
+                driver, edge.fact_embedding, limit, source_node_uuid, target_node_uuid
+            )
             for edge in edges
             if edge.fact_embedding is not None
         ],
-        *[edge_fulltext_search(edge.fact, driver) for edge in edges],
+        *[
+            edge_fulltext_search(driver, edge.fact, limit, source_node_uuid, target_node_uuid)
+            for edge in edges
+        ],
     )

     for result in results:
@@ -426,7 +444,7 @@ async def node_distance_reranker(
     records, _, _ = await driver.execute_query(
         """
         MATCH (source:Entity)-[r:RELATES_TO {uuid: $edge_uuid}]->(target:Entity)
-        MATCH p = SHORTEST 1 (center:Entity)-[:RELATES_TO]-+(n:Entity)
+        MATCH p = SHORTEST 1 (center:Entity)-[:RELATES_TO*1..10]->(n:Entity)
         WHERE center.uuid = $center_uuid AND n.uuid IN [source.uuid, target.uuid]
         RETURN min(length(p)) AS score, source.uuid AS source_uuid, target.uuid AS target_uuid
         """,
@@ -15,11 +15,13 @@ limitations under the License.
 """

 import asyncio
+import logging
 import typing
 from datetime import datetime
+from math import ceil

 from neo4j import AsyncDriver
-from numpy import dot
+from numpy import dot, sqrt
 from pydantic import BaseModel

 from graphiti_core.edges import Edge, EntityEdge, EpisodicEdge
@@ -39,8 +41,12 @@ from graphiti_core.utils.maintenance.node_operations import (
     dedupe_node_list,
     extract_nodes,
 )
+from graphiti_core.utils.maintenance.temporal_operations import extract_edge_dates
+from graphiti_core.utils.utils import chunk_edges_by_nodes

-CHUNK_SIZE = 15
+logger = logging.getLogger(__name__)
+
+CHUNK_SIZE = 10


 class RawEpisode(BaseModel):
@@ -114,27 +120,58 @@ async def dedupe_nodes_bulk(

     compressed_nodes, compressed_map = await compress_nodes(llm_client, nodes, uuid_map)

-    existing_nodes = await get_relevant_nodes(compressed_nodes, driver)
+    node_chunks = [nodes[i : i + CHUNK_SIZE] for i in range(0, len(nodes), CHUNK_SIZE)]

-    nodes, partial_uuid_map, _ = await dedupe_extracted_nodes(
-        llm_client, compressed_nodes, existing_nodes
+    existing_nodes_chunks: list[list[EntityNode]] = list(
+        await asyncio.gather(
+            *[get_relevant_nodes(node_chunk, driver) for node_chunk in node_chunks]
+        )
     )

-    compressed_map.update(partial_uuid_map)
+    results: list[tuple[list[EntityNode], dict[str, str]]] = list(
+        await asyncio.gather(
+            *[
+                dedupe_extracted_nodes(llm_client, node_chunk, existing_nodes_chunks[i])
+                for i, node_chunk in enumerate(node_chunks)
+            ]
+        )
+    )

-    return nodes, compressed_map
+    final_nodes: list[EntityNode] = []
+    for result in results:
+        final_nodes.extend(result[0])
+        partial_uuid_map = result[1]
+        compressed_map.update(partial_uuid_map)
+
+    return final_nodes, compressed_map


 async def dedupe_edges_bulk(
     driver: AsyncDriver, llm_client: LLMClient, extracted_edges: list[EntityEdge]
 ) -> list[EntityEdge]:
-    # Compress edges
+    # First compress edges
     compressed_edges = await compress_edges(llm_client, extracted_edges)

-    existing_edges = await get_relevant_edges(compressed_edges, driver)
+    edge_chunks = [
+        compressed_edges[i : i + CHUNK_SIZE] for i in range(0, len(compressed_edges), CHUNK_SIZE)
+    ]

-    edges = await dedupe_extracted_edges(llm_client, compressed_edges, existing_edges)
+    relevant_edges_chunks: list[list[EntityEdge]] = list(
+        await asyncio.gather(
+            *[get_relevant_edges(edge_chunk, driver) for edge_chunk in edge_chunks]
+        )
+    )
+
+    resolved_edge_chunks: list[list[EntityEdge]] = list(
+        await asyncio.gather(
+            *[
+                dedupe_extracted_edges(llm_client, edge_chunk, relevant_edges_chunks[i])
+                for i, edge_chunk in enumerate(edge_chunks)
+            ]
+        )
+    )

+    edges = [edge for edge_chunk in resolved_edge_chunks for edge in edge_chunk]
     return edges


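Both bulk dedupe paths now follow the same shape: slice the input into CHUNK_SIZE pieces, run one coroutine per slice with asyncio.gather, and flatten the per-chunk results. A standalone sketch of that pattern, with a hypothetical worker callable standing in for the LLM and database calls:

import asyncio
from typing import Awaitable, Callable, TypeVar

T = TypeVar('T')

async def process_in_chunks(
    items: list[T],
    worker: Callable[[list[T]], Awaitable[list[T]]],
    chunk_size: int = 10,
) -> list[T]:
    # Split the input into fixed-size chunks so each downstream call stays small.
    chunks = [items[i : i + chunk_size] for i in range(0, len(items), chunk_size)]
    # Fan out one coroutine per chunk, then flatten the per-chunk results.
    results = await asyncio.gather(*[worker(chunk) for chunk in chunks])
    return [item for chunk_result in results for item in chunk_result]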
@@ -154,13 +191,58 @@ def node_name_match(nodes: list[EntityNode]) -> tuple[list[EntityNode], dict[str
 async def compress_nodes(
     llm_client: LLMClient, nodes: list[EntityNode], uuid_map: dict[str, str]
 ) -> tuple[list[EntityNode], dict[str, str]]:
+    # We want to first compress the nodes by deduplicating nodes across each of the episodes added in bulk
     if len(nodes) == 0:
         return nodes, uuid_map

-    anchor = nodes[0]
-    nodes.sort(key=lambda node: dot(anchor.name_embedding or [], node.name_embedding or []))
+    # Our approach involves us deduplicating chunks of nodes in parallel.
+    # We want n chunks of size n so that n ** 2 == len(nodes).
+    # We want chunk sizes to be at least 10 for optimizing LLM processing time
+    chunk_size = max(int(sqrt(len(nodes))), CHUNK_SIZE)

-    node_chunks = [nodes[i : i + CHUNK_SIZE] for i in range(0, len(nodes), CHUNK_SIZE)]
+    # First calculate similarity scores between nodes
+    similarity_scores: list[tuple[int, int, float]] = [
+        (i, j, dot(n.name_embedding or [], m.name_embedding or []))
+        for i, n in enumerate(nodes)
+        for j, m in enumerate(nodes[:i])
+    ]
+
+    # We now sort by semantic similarity
+    similarity_scores.sort(key=lambda score_tuple: score_tuple[2])
+
+    # initialize our chunks based on chunk size
+    node_chunks: list[list[EntityNode]] = [[] for _ in range(ceil(len(nodes) / chunk_size))]
+
+    # Draft the most similar nodes into the same chunk
+    while len(similarity_scores) > 0:
+        i, j, _ = similarity_scores.pop()
+        # determine if any of the nodes have already been drafted into a chunk
+        n = nodes[i]
+        m = nodes[j]
+        # make sure the shortest chunks get preference
+        node_chunks.sort(reverse=True, key=lambda chunk: len(chunk))
+
+        n_chunk = max([i if n in chunk else -1 for i, chunk in enumerate(node_chunks)])
+        m_chunk = max([i if m in chunk else -1 for i, chunk in enumerate(node_chunks)])
+
+        # both nodes already in a chunk
+        if n_chunk > -1 and m_chunk > -1:
+            continue
+
+        # n has a chunk and that chunk is not full
+        elif n_chunk > -1 and len(node_chunks[n_chunk]) < chunk_size:
+            # put m in the same chunk as n
+            node_chunks[n_chunk].append(m)
+
+        # m has a chunk and that chunk is not full
+        elif m_chunk > -1 and len(node_chunks[m_chunk]) < chunk_size:
+            # put n in the same chunk as m
+            node_chunks[m_chunk].append(n)
+
+        # neither node has a chunk or the chunk is full
+        else:
+            # add both nodes to the shortest chunk
+            node_chunks[-1].extend([n, m])

     results = await asyncio.gather(*[dedupe_node_list(llm_client, chunk) for chunk in node_chunks])

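The sizing logic above targets roughly sqrt(n) chunks of roughly sqrt(n) nodes each, with CHUNK_SIZE as a floor so each LLM call still sees a reasonably large batch. A quick worked example of how chunk_size and the chunk count fall out for two hypothetical node counts:

from math import ceil, sqrt

CHUNK_SIZE = 10

# 100 nodes: sqrt(100) == 10, so chunk_size = max(10, 10) = 10 and ceil(100 / 10) = 10 chunks.
# 25 nodes:  sqrt(25) == 5, so the CHUNK_SIZE floor wins: chunk_size = 10 and ceil(25 / 10) = 3 chunks.
for n in (100, 25):
    chunk_size = max(int(sqrt(n)), CHUNK_SIZE)
    print(n, chunk_size, ceil(n / chunk_size))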
@@ -181,13 +263,9 @@ async def compress_nodes(
 async def compress_edges(llm_client: LLMClient, edges: list[EntityEdge]) -> list[EntityEdge]:
     if len(edges) == 0:
         return edges
-
-    anchor = edges[0]
-    edges.sort(
-        key=lambda embedding: dot(anchor.fact_embedding or [], embedding.fact_embedding or [])
-    )
-
-    edge_chunks = [edges[i : i + CHUNK_SIZE] for i in range(0, len(edges), CHUNK_SIZE)]
+    # We only want to dedupe edges that are between the same pair of nodes
+    # We build a map of the edges based on their source and target nodes.
+    edge_chunks = chunk_edges_by_nodes(edges)

     results = await asyncio.gather(*[dedupe_edge_list(llm_client, chunk) for chunk in edge_chunks])

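chunk_edges_by_nodes is imported from graphiti_core.utils.utils and does not appear in this diff. Per the comment above it, it groups edges by the pair of nodes they connect so that only directly comparable facts are deduplicated together. A hedged sketch of what such a grouping could look like (the real helper's signature and behavior may differ):

from collections import defaultdict

from graphiti_core.edges import EntityEdge

def chunk_edges_by_nodes(edges: list[EntityEdge]) -> list[list[EntityEdge]]:
    # Bucket edges by their (source, target) node pair; each bucket becomes one dedupe chunk.
    buckets: dict[tuple[str, str], list[EntityEdge]] = defaultdict(list)
    for edge in edges:
        buckets[(edge.source_node_uuid, edge.target_node_uuid)].append(edge)
    return list(buckets.values())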
@@ -225,3 +303,43 @@ def resolve_edge_pointers(edges: list[E], uuid_map: dict[str, str]):
         edge.target_node_uuid = uuid_map.get(target_uuid, target_uuid)

     return edges
+
+
+async def extract_edge_dates_bulk(
+    llm_client: LLMClient,
+    extracted_edges: list[EntityEdge],
+    episode_pairs: list[tuple[EpisodicNode, list[EpisodicNode]]],
+) -> list[EntityEdge]:
+    edges: list[EntityEdge] = []
+    # confirm that all of our edges have at least one episode
+    for edge in extracted_edges:
+        if edge.episodes is not None and len(edge.episodes) > 0:
+            edges.append(edge)
+
+    episode_uuid_map: dict[str, tuple[EpisodicNode, list[EpisodicNode]]] = {
+        episode.uuid: (episode, previous_episodes) for episode, previous_episodes in episode_pairs
+    }
+
+    results = await asyncio.gather(
+        *[
+            extract_edge_dates(
+                llm_client,
+                edge,
+                episode_uuid_map[edge.episodes[0]][0],  # type: ignore
+                episode_uuid_map[edge.episodes[0]][1],  # type: ignore
+            )
+            for edge in edges
+        ]
+    )
+
+    for i, result in enumerate(results):
+        valid_at = result[0]
+        invalid_at = result[1]
+        edge = edges[i]
+
+        edge.valid_at = valid_at
+        edge.invalid_at = invalid_at
+        if edge.invalid_at:
+            edge.expired_at = datetime.now()
+
+    return edges
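extract_edge_dates_bulk looks up each edge's first episode UUID in episode_pairs, so callers need to supply one (episode, previous_episodes) tuple for every episode that produced edges. A hypothetical call shape; the three-episode look-back window and the variable names are assumptions for illustration, not taken from the package:

# `episodes` is assumed to be the ordered list of EpisodicNode objects that were ingested
# and that produced `extracted_edges`.
episode_pairs = [
    (episode, episodes[max(0, i - 3) : i])  # each episode paired with a short window of prior ones
    for i, episode in enumerate(episodes)
]
dated_edges = await extract_edge_dates_bulk(llm_client, extracted_edges, episode_pairs)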
@@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+import asyncio
 import logging
 from datetime import datetime
 from time import time
@@ -70,7 +71,6 @@ async def extract_edges(
     }

     llm_response = await llm_client.generate_response(prompt_library.extract_edges.v2(context))
-    print(llm_response)
     edges_data = llm_response.get('edges', [])

     end = time()
@@ -110,8 +110,8 @@ async def dedupe_extracted_edges(
     existing_edges: list[EntityEdge],
 ) -> list[EntityEdge]:
     # Create edge map
-    edge_map = {}
-    for edge in extracted_edges:
+    edge_map: dict[str, EntityEdge] = {}
+    for edge in existing_edges:
         edge_map[edge.uuid] = edge

     # Prepare context for LLM
@@ -125,18 +125,85 @@ async def dedupe_extracted_edges(
     }

     llm_response = await llm_client.generate_response(prompt_library.dedupe_edges.v1(context))
-    unique_edge_data = llm_response.get('unique_facts', [])
-    logger.info(f'Extracted unique edges: {unique_edge_data}')
+    duplicate_data = llm_response.get('duplicates', [])
+    logger.info(f'Extracted unique edges: {duplicate_data}')
+
+    duplicate_uuid_map: dict[str, str] = {}
+    for duplicate in duplicate_data:
+        uuid_value = duplicate['duplicate_of']
+        duplicate_uuid_map[duplicate['uuid']] = uuid_value

     # Get full edge data
-    edges = []
-    for unique_edge in unique_edge_data:
-        edge = edge_map[unique_edge['uuid']]
-        edges.append(edge)
+    edges: list[EntityEdge] = []
+    for edge in extracted_edges:
+        if edge.uuid in duplicate_uuid_map:
+            existing_uuid = duplicate_uuid_map[edge.uuid]
+            existing_edge = edge_map[existing_uuid]
+            edges.append(existing_edge)
+        else:
+            edges.append(edge)

     return edges


+async def resolve_extracted_edges(
+    llm_client: LLMClient,
+    extracted_edges: list[EntityEdge],
+    existing_edges_lists: list[list[EntityEdge]],
+) -> list[EntityEdge]:
+    resolved_edges: list[EntityEdge] = list(
+        await asyncio.gather(
+            *[
+                resolve_extracted_edge(llm_client, extracted_edge, existing_edges)
+                for extracted_edge, existing_edges in zip(extracted_edges, existing_edges_lists)
+            ]
+        )
+    )
+
+    return resolved_edges
+
+
+async def resolve_extracted_edge(
+    llm_client: LLMClient, extracted_edge: EntityEdge, existing_edges: list[EntityEdge]
+) -> EntityEdge:
+    start = time()
+
+    # Prepare context for LLM
+    existing_edges_context = [
+        {'uuid': edge.uuid, 'name': edge.name, 'fact': edge.fact} for edge in existing_edges
+    ]
+
+    extracted_edge_context = {
+        'uuid': extracted_edge.uuid,
+        'name': extracted_edge.name,
+        'fact': extracted_edge.fact,
+    }
+
+    context = {
+        'existing_edges': existing_edges_context,
+        'extracted_edges': extracted_edge_context,
+    }
+
+    llm_response = await llm_client.generate_response(prompt_library.dedupe_edges.v3(context))
+
+    is_duplicate: bool = llm_response.get('is_duplicate', False)
+    uuid: str | None = llm_response.get('uuid', None)
+
+    edge = extracted_edge
+    if is_duplicate:
+        for existing_edge in existing_edges:
+            if existing_edge.uuid != uuid:
+                continue
+            edge = existing_edge
+
+    end = time()
+    logger.info(
+        f'Resolved node: {extracted_edge.name} is {edge.name}, in {(end - start) * 1000} ms'
+    )
+
+    return edge
+
+
 async def dedupe_edge_list(
     llm_client: LLMClient,
     edges: list[EntityEdge],
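Both the rewritten dedupe path and the new resolve_extracted_edge key their behavior off the shape of the LLM response. Minimal sketches of the response structures the code above reads via .get(...); the field values are placeholders inferred from the code, not taken from the prompt definitions:

# dedupe_edges.v1: a 'duplicates' list mapping extracted edge UUIDs to existing edge UUIDs.
llm_response_v1 = {
    'duplicates': [
        {'uuid': 'extracted-edge-uuid', 'duplicate_of': 'existing-edge-uuid'},
    ]
}

# dedupe_edges.v3: a per-edge verdict consumed by resolve_extracted_edge.
llm_response_v3 = {
    'is_duplicate': True,
    'uuid': 'existing-edge-uuid',  # UUID of the existing edge the extracted edge duplicates
}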