cognee 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. cognee/__init__.py +2 -0
  2. cognee/api/client.py +28 -3
  3. cognee/api/health.py +10 -13
  4. cognee/api/v1/add/add.py +3 -1
  5. cognee/api/v1/add/routers/get_add_router.py +12 -37
  6. cognee/api/v1/cloud/routers/__init__.py +1 -0
  7. cognee/api/v1/cloud/routers/get_checks_router.py +23 -0
  8. cognee/api/v1/cognify/code_graph_pipeline.py +9 -4
  9. cognee/api/v1/cognify/cognify.py +50 -3
  10. cognee/api/v1/cognify/routers/get_cognify_router.py +1 -1
  11. cognee/api/v1/datasets/routers/get_datasets_router.py +15 -4
  12. cognee/api/v1/memify/__init__.py +0 -0
  13. cognee/api/v1/memify/routers/__init__.py +1 -0
  14. cognee/api/v1/memify/routers/get_memify_router.py +100 -0
  15. cognee/api/v1/notebooks/routers/__init__.py +1 -0
  16. cognee/api/v1/notebooks/routers/get_notebooks_router.py +96 -0
  17. cognee/api/v1/search/routers/get_search_router.py +20 -1
  18. cognee/api/v1/search/search.py +11 -4
  19. cognee/api/v1/sync/__init__.py +17 -0
  20. cognee/api/v1/sync/routers/__init__.py +3 -0
  21. cognee/api/v1/sync/routers/get_sync_router.py +241 -0
  22. cognee/api/v1/sync/sync.py +877 -0
  23. cognee/api/v1/ui/__init__.py +1 -0
  24. cognee/api/v1/ui/ui.py +529 -0
  25. cognee/api/v1/users/routers/get_auth_router.py +13 -1
  26. cognee/base_config.py +10 -1
  27. cognee/cli/_cognee.py +93 -0
  28. cognee/infrastructure/databases/graph/config.py +10 -4
  29. cognee/infrastructure/databases/graph/kuzu/adapter.py +135 -0
  30. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +89 -0
  31. cognee/infrastructure/databases/relational/__init__.py +2 -0
  32. cognee/infrastructure/databases/relational/get_async_session.py +15 -0
  33. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +6 -1
  34. cognee/infrastructure/databases/relational/with_async_session.py +25 -0
  35. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +1 -1
  36. cognee/infrastructure/databases/vector/config.py +13 -6
  37. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +1 -1
  38. cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +2 -6
  39. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +4 -1
  40. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -0
  41. cognee/infrastructure/files/storage/S3FileStorage.py +5 -0
  42. cognee/infrastructure/files/storage/StorageManager.py +7 -1
  43. cognee/infrastructure/files/storage/storage.py +16 -0
  44. cognee/infrastructure/llm/LLMGateway.py +18 -0
  45. cognee/infrastructure/llm/config.py +4 -2
  46. cognee/infrastructure/llm/prompts/extract_query_time.txt +15 -0
  47. cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt +25 -0
  48. cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt +30 -0
  49. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +2 -0
  50. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_event_entities.py +44 -0
  51. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/__init__.py +1 -0
  52. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_event_graph.py +46 -0
  53. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +25 -1
  54. cognee/infrastructure/utils/run_sync.py +8 -1
  55. cognee/modules/chunking/models/DocumentChunk.py +4 -3
  56. cognee/modules/cloud/exceptions/CloudApiKeyMissingError.py +15 -0
  57. cognee/modules/cloud/exceptions/CloudConnectionError.py +15 -0
  58. cognee/modules/cloud/exceptions/__init__.py +2 -0
  59. cognee/modules/cloud/operations/__init__.py +1 -0
  60. cognee/modules/cloud/operations/check_api_key.py +25 -0
  61. cognee/modules/data/deletion/prune_system.py +1 -1
  62. cognee/modules/data/methods/check_dataset_name.py +1 -1
  63. cognee/modules/data/methods/get_dataset_data.py +1 -1
  64. cognee/modules/data/methods/load_or_create_datasets.py +1 -1
  65. cognee/modules/engine/models/Event.py +16 -0
  66. cognee/modules/engine/models/Interval.py +8 -0
  67. cognee/modules/engine/models/Timestamp.py +13 -0
  68. cognee/modules/engine/models/__init__.py +3 -0
  69. cognee/modules/engine/utils/__init__.py +2 -0
  70. cognee/modules/engine/utils/generate_event_datapoint.py +46 -0
  71. cognee/modules/engine/utils/generate_timestamp_datapoint.py +51 -0
  72. cognee/modules/graph/cognee_graph/CogneeGraph.py +2 -2
  73. cognee/modules/graph/utils/__init__.py +1 -0
  74. cognee/modules/graph/utils/resolve_edges_to_text.py +71 -0
  75. cognee/modules/memify/__init__.py +1 -0
  76. cognee/modules/memify/memify.py +118 -0
  77. cognee/modules/notebooks/methods/__init__.py +5 -0
  78. cognee/modules/notebooks/methods/create_notebook.py +26 -0
  79. cognee/modules/notebooks/methods/delete_notebook.py +13 -0
  80. cognee/modules/notebooks/methods/get_notebook.py +21 -0
  81. cognee/modules/notebooks/methods/get_notebooks.py +18 -0
  82. cognee/modules/notebooks/methods/update_notebook.py +17 -0
  83. cognee/modules/notebooks/models/Notebook.py +53 -0
  84. cognee/modules/notebooks/models/__init__.py +1 -0
  85. cognee/modules/notebooks/operations/__init__.py +1 -0
  86. cognee/modules/notebooks/operations/run_in_local_sandbox.py +55 -0
  87. cognee/modules/pipelines/layers/reset_dataset_pipeline_run_status.py +19 -3
  88. cognee/modules/pipelines/operations/pipeline.py +1 -0
  89. cognee/modules/pipelines/operations/run_tasks.py +17 -41
  90. cognee/modules/retrieval/base_graph_retriever.py +18 -0
  91. cognee/modules/retrieval/base_retriever.py +1 -1
  92. cognee/modules/retrieval/code_retriever.py +8 -0
  93. cognee/modules/retrieval/coding_rules_retriever.py +31 -0
  94. cognee/modules/retrieval/completion_retriever.py +9 -3
  95. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -0
  96. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +23 -14
  97. cognee/modules/retrieval/graph_completion_cot_retriever.py +21 -11
  98. cognee/modules/retrieval/graph_completion_retriever.py +32 -65
  99. cognee/modules/retrieval/graph_summary_completion_retriever.py +3 -1
  100. cognee/modules/retrieval/insights_retriever.py +14 -3
  101. cognee/modules/retrieval/summaries_retriever.py +1 -1
  102. cognee/modules/retrieval/temporal_retriever.py +152 -0
  103. cognee/modules/retrieval/utils/brute_force_triplet_search.py +7 -32
  104. cognee/modules/retrieval/utils/completion.py +10 -3
  105. cognee/modules/search/methods/get_search_type_tools.py +168 -0
  106. cognee/modules/search/methods/no_access_control_search.py +47 -0
  107. cognee/modules/search/methods/search.py +219 -139
  108. cognee/modules/search/types/SearchResult.py +21 -0
  109. cognee/modules/search/types/SearchType.py +2 -0
  110. cognee/modules/search/types/__init__.py +1 -0
  111. cognee/modules/search/utils/__init__.py +2 -0
  112. cognee/modules/search/utils/prepare_search_result.py +41 -0
  113. cognee/modules/search/utils/transform_context_to_graph.py +38 -0
  114. cognee/modules/sync/__init__.py +1 -0
  115. cognee/modules/sync/methods/__init__.py +23 -0
  116. cognee/modules/sync/methods/create_sync_operation.py +53 -0
  117. cognee/modules/sync/methods/get_sync_operation.py +107 -0
  118. cognee/modules/sync/methods/update_sync_operation.py +248 -0
  119. cognee/modules/sync/models/SyncOperation.py +142 -0
  120. cognee/modules/sync/models/__init__.py +3 -0
  121. cognee/modules/users/__init__.py +0 -1
  122. cognee/modules/users/methods/__init__.py +4 -1
  123. cognee/modules/users/methods/create_user.py +26 -1
  124. cognee/modules/users/methods/get_authenticated_user.py +36 -42
  125. cognee/modules/users/methods/get_default_user.py +3 -1
  126. cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +2 -1
  127. cognee/root_dir.py +19 -0
  128. cognee/shared/logging_utils.py +1 -1
  129. cognee/tasks/codingagents/__init__.py +0 -0
  130. cognee/tasks/codingagents/coding_rule_associations.py +127 -0
  131. cognee/tasks/ingestion/save_data_item_to_storage.py +23 -0
  132. cognee/tasks/memify/__init__.py +2 -0
  133. cognee/tasks/memify/extract_subgraph.py +7 -0
  134. cognee/tasks/memify/extract_subgraph_chunks.py +11 -0
  135. cognee/tasks/repo_processor/get_repo_file_dependencies.py +52 -27
  136. cognee/tasks/temporal_graph/__init__.py +1 -0
  137. cognee/tasks/temporal_graph/add_entities_to_event.py +85 -0
  138. cognee/tasks/temporal_graph/enrich_events.py +34 -0
  139. cognee/tasks/temporal_graph/extract_events_and_entities.py +32 -0
  140. cognee/tasks/temporal_graph/extract_knowledge_graph_from_events.py +41 -0
  141. cognee/tasks/temporal_graph/models.py +49 -0
  142. cognee/tests/test_kuzu.py +4 -4
  143. cognee/tests/test_neo4j.py +4 -4
  144. cognee/tests/test_permissions.py +3 -3
  145. cognee/tests/test_relational_db_migration.py +7 -5
  146. cognee/tests/test_search_db.py +18 -24
  147. cognee/tests/test_temporal_graph.py +167 -0
  148. cognee/tests/unit/api/__init__.py +1 -0
  149. cognee/tests/unit/api/test_conditional_authentication_endpoints.py +246 -0
  150. cognee/tests/unit/modules/retrieval/chunks_retriever_test.py +18 -2
  151. cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +13 -16
  152. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +11 -16
  153. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +5 -4
  154. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +4 -2
  155. cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +18 -2
  156. cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +225 -0
  157. cognee/tests/unit/modules/users/__init__.py +1 -0
  158. cognee/tests/unit/modules/users/test_conditional_authentication.py +277 -0
  159. cognee/tests/unit/processing/utils/utils_test.py +20 -1
  160. {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/METADATA +8 -6
  161. {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/RECORD +165 -90
  162. cognee/tests/unit/modules/search/search_methods_test.py +0 -225
  163. {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/WHEEL +0 -0
  164. {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/entry_points.txt +0 -0
  165. {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/licenses/LICENSE +0 -0
  166. {cognee-0.2.4.dist-info → cognee-0.3.0.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/databases/graph/config.py

@@ -6,6 +6,7 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
  import pydantic
  from pydantic import Field
  from cognee.base_config import get_base_config
+ from cognee.root_dir import ensure_absolute_path
  from cognee.shared.data_models import KnowledgeGraph


@@ -51,15 +52,20 @@ class GraphConfig(BaseSettings):
      @pydantic.model_validator(mode="after")
      def fill_derived(cls, values):
          provider = values.graph_database_provider.lower()
+         base_config = get_base_config()

          # Set default filename if no filename is provided
          if not values.graph_filename:
              values.graph_filename = f"cognee_graph_{provider}"

-         # Set file path based on graph database provider if no file path is provided
-         if not values.graph_file_path:
-             base_config = get_base_config()
-
+         # Handle graph file path
+         if values.graph_file_path:
+             # Check if absolute path is provided
+             values.graph_file_path = ensure_absolute_path(
+                 os.path.join(values.graph_file_path, values.graph_filename)
+             )
+         else:
+             # Default path
              databases_directory_path = os.path.join(base_config.system_root_directory, "databases")
              values.graph_file_path = os.path.join(databases_directory_path, values.graph_filename)

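For reference, `ensure_absolute_path` is a new helper added in `cognee/root_dir.py` (+19, not shown in the hunks below). A minimal sketch of the behavior this validator appears to rely on, assuming the helper simply normalizes relative paths; the real implementation may differ:

```python
import os

def ensure_absolute_path(path: str) -> str:
    # Hypothetical sketch: expand a user-supplied path to an absolute one.
    # The actual helper lives in cognee/root_dir.py and may validate further.
    return os.path.abspath(os.path.expanduser(path))
```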
cognee/infrastructure/databases/graph/kuzu/adapter.py

@@ -21,6 +21,8 @@ from cognee.infrastructure.databases.graph.graph_db_interface import (
  )
  from cognee.infrastructure.engine import DataPoint
  from cognee.modules.storage.utils import JSONEncoder
+ from cognee.modules.engine.utils.generate_timestamp_datapoint import date_to_int
+ from cognee.tasks.temporal_graph.models import Timestamp

  logger = get_logger()

@@ -106,6 +108,18 @@ class KuzuAdapter(GraphDBInterface):

          self.db.init_database()
          self.connection = Connection(self.db)
+
+         try:
+             self.connection.execute("INSTALL JSON;")
+         except Exception as e:
+             logger.info(f"JSON extension already installed or not needed: {e}")
+
+         try:
+             self.connection.execute("LOAD EXTENSION JSON;")
+             logger.info("Loaded JSON extension")
+         except Exception as e:
+             logger.info(f"JSON extension already loaded or unavailable: {e}")
+
          # Create node table with essential fields and timestamp
          self.connection.execute("""
              CREATE NODE TABLE IF NOT EXISTS Node(
@@ -1693,3 +1707,124 @@ class KuzuAdapter(GraphDBInterface):
              SET r.properties = $props
          """
          await self.query(update_query, {"node_id": node_id, "props": new_props})
+
+     async def collect_events(self, ids: List[str]) -> Any:
+         """
+         Collect all Event-type nodes reachable within 1..2 hops
+         from the given node IDs.
+
+         Args:
+             ids: List of node IDs (strings)
+
+         Returns:
+             List of events
+         """
+
+         event_collection_cypher = """UNWIND [{quoted}] AS uid
+         MATCH (start {{id: uid}})
+         MATCH (start)-[*1..2]-(event)
+         WHERE event.type = 'Event'
+         WITH DISTINCT event
+         RETURN collect(event) AS events;
+         """
+
+         query = event_collection_cypher.format(quoted=ids)
+         result = await self.query(query)
+         events = []
+         for node in result[0][0]:
+             props = json.loads(node["properties"])
+
+             event = {
+                 "id": node["id"],
+                 "name": node["name"],
+                 "description": props.get("description"),
+             }
+
+             if props.get("location"):
+                 event["location"] = props["location"]
+
+             events.append(event)
+
+         return [{"events": events}]
+
+     async def collect_time_ids(
+         self,
+         time_from: Optional[Timestamp] = None,
+         time_to: Optional[Timestamp] = None,
+     ) -> str:
+         """
+         Collect IDs of Timestamp nodes between time_from and time_to.
+
+         Args:
+             time_from: Lower bound Timestamp (inclusive), optional
+             time_to: Upper bound Timestamp (inclusive), optional
+
+         Returns:
+             A string of quoted IDs: "'id1', 'id2', 'id3'"
+             (ready for use in a Cypher UNWIND clause).
+         """
+
+         ids: List[str] = []
+
+         if time_from and time_to:
+             time_from = date_to_int(time_from)
+             time_to = date_to_int(time_to)
+
+             cypher = f"""
+             MATCH (n:Node)
+             WHERE n.type = 'Timestamp'
+             // Extract time_at from the JSON string and cast to INT64
+             WITH n, json_extract(n.properties, '$.time_at') AS t_str
+             WITH n,
+                  CASE
+                      WHEN t_str IS NULL OR t_str = '' THEN NULL
+                      ELSE CAST(t_str AS INT64)
+                  END AS t
+             WHERE t >= {time_from}
+               AND t <= {time_to}
+             RETURN n.id AS id
+             """
+
+         elif time_from:
+             time_from = date_to_int(time_from)
+
+             cypher = f"""
+             MATCH (n:Node)
+             WHERE n.type = 'Timestamp'
+             // Extract time_at from the JSON string and cast to INT64
+             WITH n, json_extract(n.properties, '$.time_at') AS t_str
+             WITH n,
+                  CASE
+                      WHEN t_str IS NULL OR t_str = '' THEN NULL
+                      ELSE CAST(t_str AS INT64)
+                  END AS t
+             WHERE t >= {time_from}
+             RETURN n.id AS id
+             """
+
+         elif time_to:
+             time_to = date_to_int(time_to)
+
+             cypher = f"""
+             MATCH (n:Node)
+             WHERE n.type = 'Timestamp'
+             // Extract time_at from the JSON string and cast to INT64
+             WITH n, json_extract(n.properties, '$.time_at') AS t_str
+             WITH n,
+                  CASE
+                      WHEN t_str IS NULL OR t_str = '' THEN NULL
+                      ELSE CAST(t_str AS INT64)
+                  END AS t
+             WHERE t <= {time_to}
+             RETURN n.id AS id
+             """
+
+         else:
+             return ids
+
+         time_nodes = await self.query(cypher)
+         time_ids_list = [item[0] for item in time_nodes]
+
+         return ", ".join(f"'{uid}'" for uid in time_ids_list)
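Both `date_to_int` and `Timestamp` come from new modules whose bodies are not in this diff (`generate_timestamp_datapoint.py` +51, `temporal_graph/models.py` +49). A hypothetical sketch of the contract the queries above assume, namely that `date_to_int` packs a timestamp into an integer whose numeric order matches chronological order:

```python
from pydantic import BaseModel

class Timestamp(BaseModel):
    # Illustrative stand-in for cognee.tasks.temporal_graph.models.Timestamp.
    year: int
    month: int = 1
    day: int = 1
    hour: int = 0
    minute: int = 0
    second: int = 0

def date_to_int(ts: Timestamp) -> int:
    # Pack into a sortable integer, e.g. 2009-07-01 00:00:00 -> 20090701000000,
    # so the t >= / t <= comparisons above respect chronological order.
    return int(
        f"{ts.year:04d}{ts.month:02d}{ts.day:02d}"
        f"{ts.hour:02d}{ts.minute:02d}{ts.second:02d}"
    )
```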
cognee/infrastructure/databases/graph/neo4j_driver/adapter.py

@@ -11,6 +11,8 @@ from contextlib import asynccontextmanager
  from typing import Optional, Any, List, Dict, Type, Tuple

  from cognee.infrastructure.engine import DataPoint
+ from cognee.modules.engine.utils.generate_timestamp_datapoint import date_to_int
+ from cognee.tasks.temporal_graph.models import Timestamp
  from cognee.shared.logging_utils import get_logger, ERROR
  from cognee.infrastructure.databases.graph.graph_db_interface import (
      GraphDBInterface,
@@ -1371,3 +1373,90 @@ class Neo4jAdapter(GraphDBInterface):
              query,
              params={"weight": float(weight), "node_ids": list(node_ids)},
          )
+
+     async def collect_events(self, ids: List[str]) -> Any:
+         """
+         Collect all Event-type nodes reachable within 1..2 hops
+         from the given node IDs.
+
+         Args:
+             ids: List of node IDs (strings)
+
+         Returns:
+             List of events
+         """
+
+         event_collection_cypher = """UNWIND [{quoted}] AS uid
+         MATCH (start {{id: uid}})
+         MATCH (start)-[*1..2]-(event)
+         WHERE event.type = 'Event'
+         WITH DISTINCT event
+         RETURN collect(event) AS events;
+         """
+
+         query = event_collection_cypher.format(quoted=ids)
+         return await self.query(query)
+
+     async def collect_time_ids(
+         self,
+         time_from: Optional[Timestamp] = None,
+         time_to: Optional[Timestamp] = None,
+     ) -> str:
+         """
+         Collect IDs of Timestamp nodes between time_from and time_to.
+
+         Args:
+             time_from: Lower bound Timestamp (inclusive), optional
+             time_to: Upper bound Timestamp (inclusive), optional
+
+         Returns:
+             A string of quoted IDs: "'id1', 'id2', 'id3'"
+             (ready for use in a Cypher UNWIND clause).
+         """
+
+         ids: List[str] = []
+
+         if time_from and time_to:
+             time_from = date_to_int(time_from)
+             time_to = date_to_int(time_to)
+
+             cypher = """
+             MATCH (n)
+             WHERE n.type = 'Timestamp'
+               AND n.time_at >= $time_from
+               AND n.time_at <= $time_to
+             RETURN n.id AS id
+             """
+             params = {"time_from": time_from, "time_to": time_to}
+
+         elif time_from:
+             time_from = date_to_int(time_from)
+
+             cypher = """
+             MATCH (n)
+             WHERE n.type = 'Timestamp'
+               AND n.time_at >= $time_from
+             RETURN n.id AS id
+             """
+             params = {"time_from": time_from}
+
+         elif time_to:
+             time_to = date_to_int(time_to)
+
+             cypher = """
+             MATCH (n)
+             WHERE n.type = 'Timestamp'
+               AND n.time_at <= $time_to
+             RETURN n.id AS id
+             """
+             params = {"time_to": time_to}
+
+         else:
+             return ids
+
+         time_nodes = await self.query(cypher, params)
+         time_ids_list = [item["id"] for item in time_nodes if "id" in item]
+
+         return ", ".join(f"'{uid}'" for uid in time_ids_list)
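Note that `collect_time_ids` returns a pre-quoted string ("'id1', 'id2'") even though `collect_events` annotates its parameter as `List[str]`; the string is spliced directly into the `UNWIND [{quoted}]` list via `str.format`. A hedged sketch of how the two methods appear designed to compose (the actual call site is the new `cognee/modules/retrieval/temporal_retriever.py`, not shown here):

```python
# Sketch under assumptions: graph_engine is a KuzuAdapter or Neo4jAdapter
# instance; time_from/time_to are Timestamp models or None.
async def events_in_window(graph_engine, time_from=None, time_to=None):
    # Resolve Timestamp nodes in the window to the "'id1', 'id2', ..." form.
    quoted_ids = await graph_engine.collect_time_ids(
        time_from=time_from, time_to=time_to
    )
    if not quoted_ids:
        return []
    # Splice the quoted IDs into the UNWIND list and walk 1..2 hops
    # out to the surrounding Event nodes.
    return await graph_engine.collect_events(quoted_ids)
```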
cognee/infrastructure/databases/relational/__init__.py

@@ -1,6 +1,8 @@
  from .ModelBase import Base
  from .config import get_relational_config
  from .config import get_migration_config
+ from .get_async_session import get_async_session
+ from .with_async_session import with_async_session
  from .create_db_and_tables import create_db_and_tables
  from .get_relational_engine import get_relational_engine
  from .get_migration_relational_engine import get_migration_relational_engine
cognee/infrastructure/databases/relational/get_async_session.py (new file)

@@ -0,0 +1,15 @@
+ from typing import AsyncGenerator
+ from contextlib import asynccontextmanager
+ from sqlalchemy.ext.asyncio import AsyncSession
+
+ from .get_relational_engine import get_relational_engine
+
+
+ @asynccontextmanager
+ async def get_async_session(auto_commit=False) -> AsyncGenerator[AsyncSession, None]:
+     db_engine = get_relational_engine()
+     async with db_engine.get_async_session() as session:
+         yield session
+
+         if auto_commit:
+             await session.commit()
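A minimal usage sketch of the new context manager (the `datasets` table name is only illustrative):

```python
from sqlalchemy import text
from cognee.infrastructure.databases.relational import get_async_session

async def count_datasets() -> int:
    # Read-only use; pass auto_commit=True when the block mutates rows.
    async with get_async_session() as session:
        result = await session.execute(text("SELECT COUNT(*) FROM datasets"))
        return result.scalar_one()
```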
cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py

@@ -57,7 +57,12 @@ class SQLAlchemyAdapter:
          )
      else:
          self.engine = create_async_engine(
-             connection_string, pool_size=12, max_overflow=12, poolclass=None
+             connection_string,
+             pool_size=5,
+             max_overflow=10,
+             pool_recycle=280,
+             pool_pre_ping=True,
+             pool_timeout=280,
          )

      self.sessionmaker = async_sessionmaker(bind=self.engine, expire_on_commit=False)
cognee/infrastructure/databases/relational/with_async_session.py (new file)

@@ -0,0 +1,25 @@
+ from typing import Any, Callable, Optional
+ from sqlalchemy.ext.asyncio import AsyncSession
+ from .get_async_session import get_async_session
+
+
+ def get_session_from_args(args):
+     last_arg = args[-1]
+     if isinstance(last_arg, AsyncSession):
+         return last_arg
+     return None
+
+
+ def with_async_session(func: Callable[..., Any]) -> Callable[..., Any]:
+     async def wrapper(*args, **kwargs):
+         session = kwargs.get("session") or get_session_from_args(args)  # type: Optional[AsyncSession]
+
+         if session is None:
+             async with get_async_session() as session:
+                 result = await func(*args, **kwargs, session=session)
+                 await session.commit()
+                 return result
+         else:
+             return await func(*args, **kwargs)
+
+     return wrapper
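A hedged usage sketch of the decorator (the function body and table name are illustrative). Called without a session, the wrapper opens one and commits on success; called with `session=...`, it joins the caller's transaction and leaves committing to the caller:

```python
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
from cognee.infrastructure.databases.relational import with_async_session

@with_async_session
async def rename_dataset(dataset_id: str, name: str, session: AsyncSession = None):
    await session.execute(
        text("UPDATE datasets SET name = :name WHERE id = :id"),
        {"name": name, "id": dataset_id},
    )
```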
cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py

@@ -538,7 +538,7 @@ class ChromaDBAdapter(VectorDBInterface):
          Returns True upon successful deletion of all collections.
          """
          client = await self.get_connection()
-         collections = await self.list_collections()
+         collections = await client.list_collections()
          for collection_name in collections:
              await client.delete_collection(collection_name)
          return True
cognee/infrastructure/databases/vector/config.py

@@ -1,9 +1,11 @@
  import os
  import pydantic
+ from pathlib import Path
  from functools import lru_cache
  from pydantic_settings import BaseSettings, SettingsConfigDict

  from cognee.base_config import get_base_config
+ from cognee.root_dir import ensure_absolute_path


  class VectorConfig(BaseSettings):

@@ -11,11 +13,9 @@ class VectorConfig(BaseSettings):
      Manage the configuration settings for the vector database.

      Public methods:
-
      - to_dict: Convert the configuration to a dictionary.

      Instance variables:
-
      - vector_db_url: The URL of the vector database.
      - vector_db_port: The port for the vector database.
      - vector_db_key: The key for accessing the vector database.

@@ -30,10 +30,17 @@ class VectorConfig(BaseSettings):
      model_config = SettingsConfigDict(env_file=".env", extra="allow")

      @pydantic.model_validator(mode="after")
-     def fill_derived(cls, values):
-         # Set file path based on graph database provider if no file path is provided
-         if not values.vector_db_url:
-             base_config = get_base_config()
+     def validate_paths(cls, values):
+         base_config = get_base_config()
+
+         # If vector_db_url is provided and is not a path, skip the absolute-path check (it can also be a URL)
+         if values.vector_db_url and Path(values.vector_db_url).exists():
+             # Relative path to absolute
+             values.vector_db_url = ensure_absolute_path(
+                 values.vector_db_url,
+             )
+         else:
+             # Default path
              databases_directory_path = os.path.join(base_config.system_root_directory, "databases")
              values.vector_db_url = os.path.join(databases_directory_path, "cognee.lancedb")

cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py

@@ -4,7 +4,7 @@ from fastembed import TextEmbedding
  import litellm
  import os
  from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
- from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
+ from cognee.infrastructure.databases.exceptions import EmbeddingException
  from cognee.infrastructure.llm.tokenizer.TikToken import (
      TikTokenTokenizer,
  )
cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py

@@ -250,9 +250,7 @@ def embedding_rate_limit_sync(func):
          logger.warning(error_msg)

          # Create a custom embedding rate limit exception
-         from cognee.infrastructure.databases.exceptions.EmbeddingException import (
-             EmbeddingException,
-         )
+         from cognee.infrastructure.databases.exceptions import EmbeddingException

          raise EmbeddingException(error_msg)

@@ -307,9 +305,7 @@ def embedding_rate_limit_async(func):
          logger.warning(error_msg)

          # Create a custom embedding rate limit exception
-         from cognee.infrastructure.databases.exceptions.EmbeddingException import (
-             EmbeddingException,
-         )
+         from cognee.infrastructure.databases.exceptions import EmbeddingException

          raise EmbeddingException(error_msg)

cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py

@@ -33,6 +33,7 @@ def get_embedding_engine() -> EmbeddingEngine:
          config.embedding_api_version,
          config.huggingface_tokenizer,
          llm_config.llm_api_key,
+         llm_config.llm_provider,
      )


@@ -47,6 +48,7 @@ def create_embedding_engine(
      embedding_api_version,
      huggingface_tokenizer,
      llm_api_key,
+     llm_provider,
  ):
      """
      Create and return an embedding engine based on the specified provider.

@@ -99,7 +101,8 @@ def create_embedding_engine(

      return LiteLLMEmbeddingEngine(
          provider=embedding_provider,
-         api_key=embedding_api_key or llm_api_key,
+         api_key=embedding_api_key
+         or (embedding_api_key if llm_provider == "custom" else llm_api_key),
          endpoint=embedding_endpoint,
          api_version=embedding_api_version,
          model=embedding_model,
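The new fallback expression is denser than it needs to be; an equivalent restatement (hypothetical helper name) to make the resolution order explicit:

```python
def resolve_embedding_api_key(embedding_api_key, llm_api_key, llm_provider):
    # Equivalent to: embedding_api_key
    #     or (embedding_api_key if llm_provider == "custom" else llm_api_key)
    if embedding_api_key:
        return embedding_api_key  # an explicit embedding key always wins
    if llm_provider == "custom":
        return embedding_api_key  # unset; a custom LLM key is not assumed valid here
    return llm_api_key  # otherwise fall back to the shared LLM key
```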
cognee/infrastructure/files/storage/LocalFileStorage.py

@@ -189,6 +189,15 @@ class LocalFileStorage(Storage):

          return os.path.isfile(os.path.join(parsed_storage_path, file_path))

+     def get_size(self, file_path: str) -> int:
+         parsed_storage_path = get_parsed_path(self.storage_path)
+
+         return (
+             os.path.getsize(os.path.join(parsed_storage_path, file_path))
+             if self.file_exists(file_path)
+             else 0
+         )
+
      def ensure_directory_exists(self, directory_path: str = ""):
          """
          Ensure that the specified directory exists, creating it if necessary.
cognee/infrastructure/files/storage/S3FileStorage.py

@@ -146,6 +146,11 @@ class S3FileStorage(Storage):
              self.s3.isfile, os.path.join(self.storage_path.replace("s3://", ""), file_path)
          )

+     async def get_size(self, file_path: str) -> int:
+         return await run_async(
+             self.s3.size, os.path.join(self.storage_path.replace("s3://", ""), file_path)
+         )
+
      async def ensure_directory_exists(self, directory_path: str = ""):
          """
          Ensure that the specified directory exists, creating it if necessary.
cognee/infrastructure/files/storage/StorageManager.py

@@ -46,6 +46,12 @@ class StorageManager:
          else:
              return self.storage.is_file(file_path)

+     async def get_size(self, file_path: str) -> int:
+         if inspect.iscoroutinefunction(self.storage.get_size):
+             return await self.storage.get_size(file_path)
+         else:
+             return self.storage.get_size(file_path)
+
      async def store(self, file_path: str, data: BinaryIO, overwrite: bool = False) -> str:
          """
          Store data at the specified file path.

@@ -84,7 +90,7 @@ class StorageManager:
          """
          # Check the actual storage type by class name to determine if open() is async or sync

-         if self.storage.__class__.__name__ == "S3FileStorage" and file_path.startswith("s3://"):
+         if self.storage.__class__.__name__ == "S3FileStorage":
              # S3FileStorage.open() is async
              async with self.storage.open(file_path, *args, **kwargs) as file:
                  yield file
cognee/infrastructure/files/storage/storage.py

@@ -40,6 +40,22 @@ class Storage(Protocol):
          """
          pass

+     def get_size(self, file_path: str) -> int:
+         """
+         Get the size of a specified file in bytes.
+
+         Parameters:
+         -----------
+
+         - file_path (str): The path of the file to get the size of.
+
+         Returns:
+         --------
+
+         - int: The size of the file in bytes.
+         """
+         pass
+
      def store(self, file_path: str, data: Union[BinaryIO, str], overwrite: bool):
          """
          Store data at the specified file path.
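With the protocol method in place, callers can stay backend-agnostic; a short sketch (the constructor arguments shown are assumptions, not the documented API):

```python
from cognee.infrastructure.files.storage.LocalFileStorage import LocalFileStorage
from cognee.infrastructure.files.storage.StorageManager import StorageManager

async def report_size(relative_path: str) -> int:
    # LocalFileStorage.get_size is sync and S3FileStorage.get_size is async;
    # StorageManager.get_size bridges both via inspect.iscoroutinefunction.
    manager = StorageManager(LocalFileStorage("/tmp/cognee_storage"))
    return await manager.get_size(relative_path)
```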
cognee/infrastructure/llm/LLMGateway.py

@@ -144,3 +144,21 @@ class LLMGateway:
          )

          return extract_summary(content=content, response_model=response_model)
+
+     @staticmethod
+     def extract_event_graph(content: str, response_model: Type[BaseModel]) -> Coroutine:
+         # TODO: Add BAML version of category and extraction and update function (consulted with Igor)
+         from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.extraction import (
+             extract_event_graph,
+         )
+
+         return extract_event_graph(content=content, response_model=response_model)
+
+     @staticmethod
+     def extract_event_entities(content: str, response_model: Type[BaseModel]) -> Coroutine:
+         # TODO: Add BAML version of category and extraction and update function (consulted with Igor)
+         from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.extraction import (
+             extract_event_entities,
+         )
+
+         return extract_event_entities(content=content, response_model=response_model)
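A hedged sketch of calling the new gateway entry points; the response model below is an illustrative stand-in for the real models in `cognee/tasks/temporal_graph/models.py` (+49, not shown):

```python
from typing import List, Optional
from pydantic import BaseModel
from cognee.infrastructure.llm.LLMGateway import LLMGateway

class Event(BaseModel):
    # Illustrative stand-in; see cognee/tasks/temporal_graph/models.py.
    name: str
    description: Optional[str] = None
    location: Optional[str] = None

class EventList(BaseModel):
    events: List[Event]

async def extract_events(text: str) -> EventList:
    # The gateway returns a coroutine; awaiting it yields the parsed model.
    return await LLMGateway.extract_event_graph(content=text, response_model=EventList)
```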
cognee/infrastructure/llm/config.py

@@ -35,7 +35,7 @@ class LLMConfig(BaseSettings):

      structured_output_framework: str = "instructor"
      llm_provider: str = "openai"
-     llm_model: str = "gpt-5-mini"
+     llm_model: str = "openai/gpt-4o-mini"
      llm_endpoint: str = ""
      llm_api_key: Optional[str] = None
      llm_api_version: Optional[str] = None

@@ -44,7 +44,7 @@ class LLMConfig(BaseSettings):
      llm_max_completion_tokens: int = 16384

      baml_llm_provider: str = "openai"
-     baml_llm_model: str = "gpt-5-mini"
+     baml_llm_model: str = "gpt-4o-mini"
      baml_llm_endpoint: str = ""
      baml_llm_api_key: Optional[str] = None
      baml_llm_temperature: float = 0.0

@@ -52,6 +52,8 @@ class LLMConfig(BaseSettings):

      transcription_model: str = "whisper-1"
      graph_prompt_path: str = "generate_graph_prompt.txt"
+     temporal_graph_prompt_path: str = "generate_event_graph_prompt.txt"
+     event_entity_prompt_path: str = "generate_event_entity_prompt.txt"
      llm_rate_limit_enabled: bool = False
      llm_rate_limit_requests: int = 60
      llm_rate_limit_interval: int = 60  # in seconds (default is 60 requests per minute)
cognee/infrastructure/llm/prompts/extract_query_time.txt (new file)

@@ -0,0 +1,15 @@
+ For the purposes of identifying timestamps in a query, you are tasked with extracting the relevant timestamps from the query.
+ ## Timestamp requirements
+ - If the query contains an interval, extract both starts_at and ends_at properties
+ - If the query contains an instantaneous timestamp, starts_at and ends_at should be the same
+ - If the query is open-ended (before 2009 or after 2009), the corresponding undefined end of the interval should be None
+   - For example: "before 2009" -- starts_at: None, ends_at: 2009; "after 2009" -- starts_at: 2009, ends_at: None
+ - Always put the date that comes first in time as starts_at and the timestamp that comes second in time as ends_at
+ - If starts_at or ends_at cannot be extracted, both of them must be None
+ ## Output Format
+ Your reply should be JSON: a list of dictionaries with the following structure:
+ ```python
+ class QueryInterval(BaseModel):
+     starts_at: Optional[Timestamp] = None
+     ends_at: Optional[Timestamp] = None
+ ```
cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt (new file)

@@ -0,0 +1,25 @@
+ For the purposes of building event-based knowledge graphs, you are tasked with extracting highly granular entities from events text. An entity is any distinct, identifiable thing, person, place, object, organization, concept, or phenomenon that can be named, referenced, or described in the event context. This includes but is not limited to: people, places, objects, organizations, concepts, events, processes, states, conditions, properties, attributes, roles, functions, and any other meaningful referents that contribute to understanding the event.
+ **Temporal Entity Exclusion**: Do not extract timestamp-like entities (dates, times, durations), as these are handled separately. However, do extract named temporal periods, eras, historical epochs, and culturally significant time references.
+ ## Input Format
+ The input will be a list of dictionaries, each containing:
+ - `event_name`: The name of the event
+ - `description`: The description of the event
+ ## Task
+ For each event, extract all entities mentioned in the event description and determine their relationship to the event.
+ ## Output Format
+ Return the same enriched JSON with an additional key in each dictionary: `attributes`.
+ The `attributes` should be a list of dictionaries, each containing:
+ - `entity`: The name of the entity
+ - `entity_type`: The type/category of the entity (person, place, organization, object, concept, etc.)
+ - `relationship`: A concise description of how the entity relates to the event
+ ## Requirements
+ - **Be extremely thorough** - extract EVERY non-temporal entity mentioned, no matter how small, obvious, or seemingly insignificant
+ - **After you are done with obvious entities, every noun, pronoun, proper noun, and named reference = one entity**
+ - We expect rich entity networks from any event, easily reaching dozens of entities per event
+ - Granularity and richness of the entity extraction is key to our success and is of utmost importance
+ - **Do not skip any entities** - if you're unsure whether something is an entity, extract it anyway
+ - Use the event name for context when determining relationships
+ - Relationships should be technical, with one or at most two words; if two words, join them with an underscore (snake_case)
+ - Relationships can convey general meanings like: subject, object, participant, recipient, agent, instrument, tool, source, cause, effect, purpose, manner, resource, etc.
+ - You can combine two words to form a relationship name: subject_role, previous_owner, etc.
+ - Focus on how the entity specifically relates to the event
cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt (new file)

@@ -0,0 +1,30 @@
+ For the purposes of building event-based knowledge graphs, you are tasked with extracting highly granular stream events from a text. The events are defined as follows:
+ ## Event Definition
+ - Anything with a date or a timestamp is an event
+ - Anything that took place in time (even if the time is unknown) is an event
+ - Anything that lasted over a period of time, or happened in an instant, is an event: from historical milestones (wars, presidencies, olympiads) to personal milestones (birth, death, employment, etc.), to mundane actions (a walk, a conversation, etc.)
+ - **ANY action or verb represents an event** - this is the most important rule
+ - Every single verb in the text corresponds to an event that must be extracted
+ - This includes: thinking, feeling, seeing, hearing, moving, speaking, writing, reading, eating, sleeping, working, playing, studying, traveling, meeting, calling, texting, buying, selling, creating, destroying, building, breaking, starting, stopping, beginning, ending, etc.
+ - Even the most mundane or obvious actions are events: "he walked", "she sat", "they talked", "I thought", "we waited"
+ ## Requirements
+ - **Be extremely thorough** - extract EVERY event mentioned, no matter how small or obvious
+ - **Timestamped first** - every timestamp or date should have at least one event
+ - **Verbs/actions = one event** - after you are done with timestamped events, every verb that denotes an action should have a corresponding event
+ - We expect long streams of events from any piece of text, easily reaching a hundred events
+ - Granularity and richness of the stream is key to our success and is of utmost importance
+ - Not all events will have timestamps; add timestamps only to known events
+ - For events that were instantaneous, attach just the time_from or the time_to property; don't create both
+ - **Do not skip any events** - if you're unsure whether something is an event, extract it anyway
+ - **Quantity over filtering** - it's better to extract too many events than to miss any
+ - **Descriptions** - Always include the event description together with entities (who did what; what happened; what is the event?). If you can, include the corresponding excerpt from the text.
+ ## Output Format
+ Your reply should be JSON: a list of dictionaries with the following structure:
+ ```python
+ class Event(BaseModel):
+     name: str  # concise
+     description: Optional[str] = None
+     time_from: Optional[Timestamp] = None
+     time_to: Optional[Timestamp] = None
+     location: Optional[str] = None
+ ```