cognee 0.2.3.dev1__py3-none-any.whl → 0.3.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. cognee/__init__.py +1 -0
  2. cognee/__main__.py +4 -0
  3. cognee/api/client.py +28 -3
  4. cognee/api/health.py +10 -13
  5. cognee/api/v1/add/add.py +20 -6
  6. cognee/api/v1/add/routers/get_add_router.py +12 -37
  7. cognee/api/v1/cloud/routers/__init__.py +1 -0
  8. cognee/api/v1/cloud/routers/get_checks_router.py +23 -0
  9. cognee/api/v1/cognify/code_graph_pipeline.py +14 -3
  10. cognee/api/v1/cognify/cognify.py +67 -105
  11. cognee/api/v1/cognify/routers/get_cognify_router.py +11 -3
  12. cognee/api/v1/datasets/routers/get_datasets_router.py +16 -5
  13. cognee/api/v1/memify/routers/__init__.py +1 -0
  14. cognee/api/v1/memify/routers/get_memify_router.py +100 -0
  15. cognee/api/v1/notebooks/routers/__init__.py +1 -0
  16. cognee/api/v1/notebooks/routers/get_notebooks_router.py +96 -0
  17. cognee/api/v1/responses/default_tools.py +4 -0
  18. cognee/api/v1/responses/dispatch_function.py +6 -1
  19. cognee/api/v1/responses/models.py +1 -1
  20. cognee/api/v1/search/routers/get_search_router.py +20 -1
  21. cognee/api/v1/search/search.py +17 -4
  22. cognee/api/v1/sync/__init__.py +17 -0
  23. cognee/api/v1/sync/routers/__init__.py +3 -0
  24. cognee/api/v1/sync/routers/get_sync_router.py +241 -0
  25. cognee/api/v1/sync/sync.py +877 -0
  26. cognee/api/v1/users/routers/get_auth_router.py +13 -1
  27. cognee/base_config.py +10 -1
  28. cognee/cli/__init__.py +10 -0
  29. cognee/cli/_cognee.py +180 -0
  30. cognee/cli/commands/__init__.py +1 -0
  31. cognee/cli/commands/add_command.py +80 -0
  32. cognee/cli/commands/cognify_command.py +128 -0
  33. cognee/cli/commands/config_command.py +225 -0
  34. cognee/cli/commands/delete_command.py +80 -0
  35. cognee/cli/commands/search_command.py +149 -0
  36. cognee/cli/config.py +33 -0
  37. cognee/cli/debug.py +21 -0
  38. cognee/cli/echo.py +45 -0
  39. cognee/cli/exceptions.py +23 -0
  40. cognee/cli/minimal_cli.py +97 -0
  41. cognee/cli/reference.py +26 -0
  42. cognee/cli/suppress_logging.py +12 -0
  43. cognee/eval_framework/corpus_builder/corpus_builder_executor.py +2 -2
  44. cognee/eval_framework/eval_config.py +1 -1
  45. cognee/infrastructure/databases/graph/config.py +10 -4
  46. cognee/infrastructure/databases/graph/get_graph_engine.py +4 -9
  47. cognee/infrastructure/databases/graph/kuzu/adapter.py +199 -2
  48. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +138 -0
  49. cognee/infrastructure/databases/relational/__init__.py +2 -0
  50. cognee/infrastructure/databases/relational/get_async_session.py +15 -0
  51. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +6 -1
  52. cognee/infrastructure/databases/relational/with_async_session.py +25 -0
  53. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +1 -1
  54. cognee/infrastructure/databases/vector/config.py +13 -6
  55. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +6 -4
  56. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +16 -7
  57. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +5 -5
  58. cognee/infrastructure/databases/vector/embeddings/config.py +2 -2
  59. cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +2 -6
  60. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +10 -7
  61. cognee/infrastructure/files/storage/LocalFileStorage.py +9 -0
  62. cognee/infrastructure/files/storage/S3FileStorage.py +5 -0
  63. cognee/infrastructure/files/storage/StorageManager.py +7 -1
  64. cognee/infrastructure/files/storage/storage.py +16 -0
  65. cognee/infrastructure/files/utils/get_data_file_path.py +14 -9
  66. cognee/infrastructure/files/utils/get_file_metadata.py +2 -1
  67. cognee/infrastructure/llm/LLMGateway.py +32 -5
  68. cognee/infrastructure/llm/config.py +6 -4
  69. cognee/infrastructure/llm/prompts/extract_query_time.txt +15 -0
  70. cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt +25 -0
  71. cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt +30 -0
  72. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +16 -5
  73. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +2 -0
  74. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_event_entities.py +44 -0
  75. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/__init__.py +1 -0
  76. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_content_graph.py +19 -15
  77. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_event_graph.py +46 -0
  78. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +3 -3
  79. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +3 -3
  80. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +2 -2
  81. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +14 -8
  82. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +6 -4
  83. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +28 -4
  84. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +2 -2
  85. cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py +3 -3
  86. cognee/infrastructure/llm/tokenizer/Mistral/adapter.py +3 -3
  87. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +6 -6
  88. cognee/infrastructure/llm/utils.py +7 -7
  89. cognee/infrastructure/utils/run_sync.py +8 -1
  90. cognee/modules/chunking/models/DocumentChunk.py +4 -3
  91. cognee/modules/cloud/exceptions/CloudApiKeyMissingError.py +15 -0
  92. cognee/modules/cloud/exceptions/CloudConnectionError.py +15 -0
  93. cognee/modules/cloud/exceptions/__init__.py +2 -0
  94. cognee/modules/cloud/operations/__init__.py +1 -0
  95. cognee/modules/cloud/operations/check_api_key.py +25 -0
  96. cognee/modules/data/deletion/prune_system.py +1 -1
  97. cognee/modules/data/methods/__init__.py +2 -0
  98. cognee/modules/data/methods/check_dataset_name.py +1 -1
  99. cognee/modules/data/methods/create_authorized_dataset.py +19 -0
  100. cognee/modules/data/methods/get_authorized_dataset.py +11 -5
  101. cognee/modules/data/methods/get_authorized_dataset_by_name.py +16 -0
  102. cognee/modules/data/methods/get_dataset_data.py +1 -1
  103. cognee/modules/data/methods/load_or_create_datasets.py +2 -20
  104. cognee/modules/engine/models/Event.py +16 -0
  105. cognee/modules/engine/models/Interval.py +8 -0
  106. cognee/modules/engine/models/Timestamp.py +13 -0
  107. cognee/modules/engine/models/__init__.py +3 -0
  108. cognee/modules/engine/utils/__init__.py +2 -0
  109. cognee/modules/engine/utils/generate_event_datapoint.py +46 -0
  110. cognee/modules/engine/utils/generate_timestamp_datapoint.py +51 -0
  111. cognee/modules/graph/cognee_graph/CogneeGraph.py +2 -2
  112. cognee/modules/graph/methods/get_formatted_graph_data.py +3 -2
  113. cognee/modules/graph/utils/__init__.py +1 -0
  114. cognee/modules/graph/utils/resolve_edges_to_text.py +71 -0
  115. cognee/modules/memify/__init__.py +1 -0
  116. cognee/modules/memify/memify.py +118 -0
  117. cognee/modules/notebooks/methods/__init__.py +5 -0
  118. cognee/modules/notebooks/methods/create_notebook.py +26 -0
  119. cognee/modules/notebooks/methods/delete_notebook.py +13 -0
  120. cognee/modules/notebooks/methods/get_notebook.py +21 -0
  121. cognee/modules/notebooks/methods/get_notebooks.py +18 -0
  122. cognee/modules/notebooks/methods/update_notebook.py +17 -0
  123. cognee/modules/notebooks/models/Notebook.py +53 -0
  124. cognee/modules/notebooks/models/__init__.py +1 -0
  125. cognee/modules/notebooks/operations/__init__.py +1 -0
  126. cognee/modules/notebooks/operations/run_in_local_sandbox.py +55 -0
  127. cognee/modules/pipelines/__init__.py +1 -1
  128. cognee/modules/pipelines/exceptions/tasks.py +18 -0
  129. cognee/modules/pipelines/layers/__init__.py +1 -0
  130. cognee/modules/pipelines/layers/check_pipeline_run_qualification.py +59 -0
  131. cognee/modules/pipelines/layers/pipeline_execution_mode.py +127 -0
  132. cognee/modules/pipelines/layers/reset_dataset_pipeline_run_status.py +28 -0
  133. cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +34 -0
  134. cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +55 -0
  135. cognee/modules/pipelines/layers/setup_and_check_environment.py +41 -0
  136. cognee/modules/pipelines/layers/validate_pipeline_tasks.py +20 -0
  137. cognee/modules/pipelines/methods/__init__.py +2 -0
  138. cognee/modules/pipelines/methods/get_pipeline_runs_by_dataset.py +34 -0
  139. cognee/modules/pipelines/methods/reset_pipeline_run_status.py +16 -0
  140. cognee/modules/pipelines/operations/__init__.py +0 -1
  141. cognee/modules/pipelines/operations/log_pipeline_run_initiated.py +1 -1
  142. cognee/modules/pipelines/operations/pipeline.py +24 -138
  143. cognee/modules/pipelines/operations/run_tasks.py +17 -41
  144. cognee/modules/retrieval/base_feedback.py +11 -0
  145. cognee/modules/retrieval/base_graph_retriever.py +18 -0
  146. cognee/modules/retrieval/base_retriever.py +1 -1
  147. cognee/modules/retrieval/code_retriever.py +8 -0
  148. cognee/modules/retrieval/coding_rules_retriever.py +31 -0
  149. cognee/modules/retrieval/completion_retriever.py +9 -3
  150. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -0
  151. cognee/modules/retrieval/cypher_search_retriever.py +1 -9
  152. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +29 -13
  153. cognee/modules/retrieval/graph_completion_cot_retriever.py +30 -13
  154. cognee/modules/retrieval/graph_completion_retriever.py +107 -56
  155. cognee/modules/retrieval/graph_summary_completion_retriever.py +5 -1
  156. cognee/modules/retrieval/insights_retriever.py +14 -3
  157. cognee/modules/retrieval/natural_language_retriever.py +0 -4
  158. cognee/modules/retrieval/summaries_retriever.py +1 -1
  159. cognee/modules/retrieval/temporal_retriever.py +152 -0
  160. cognee/modules/retrieval/user_qa_feedback.py +83 -0
  161. cognee/modules/retrieval/utils/brute_force_triplet_search.py +7 -32
  162. cognee/modules/retrieval/utils/completion.py +10 -3
  163. cognee/modules/retrieval/utils/extract_uuid_from_node.py +18 -0
  164. cognee/modules/retrieval/utils/models.py +40 -0
  165. cognee/modules/search/methods/get_search_type_tools.py +168 -0
  166. cognee/modules/search/methods/no_access_control_search.py +47 -0
  167. cognee/modules/search/methods/search.py +239 -118
  168. cognee/modules/search/types/SearchResult.py +21 -0
  169. cognee/modules/search/types/SearchType.py +3 -0
  170. cognee/modules/search/types/__init__.py +1 -0
  171. cognee/modules/search/utils/__init__.py +2 -0
  172. cognee/modules/search/utils/prepare_search_result.py +41 -0
  173. cognee/modules/search/utils/transform_context_to_graph.py +38 -0
  174. cognee/modules/settings/get_settings.py +2 -2
  175. cognee/modules/sync/__init__.py +1 -0
  176. cognee/modules/sync/methods/__init__.py +23 -0
  177. cognee/modules/sync/methods/create_sync_operation.py +53 -0
  178. cognee/modules/sync/methods/get_sync_operation.py +107 -0
  179. cognee/modules/sync/methods/update_sync_operation.py +248 -0
  180. cognee/modules/sync/models/SyncOperation.py +142 -0
  181. cognee/modules/sync/models/__init__.py +3 -0
  182. cognee/modules/users/__init__.py +0 -1
  183. cognee/modules/users/methods/__init__.py +4 -1
  184. cognee/modules/users/methods/create_user.py +26 -1
  185. cognee/modules/users/methods/get_authenticated_user.py +36 -42
  186. cognee/modules/users/methods/get_default_user.py +3 -1
  187. cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +2 -1
  188. cognee/root_dir.py +19 -0
  189. cognee/shared/CodeGraphEntities.py +1 -0
  190. cognee/shared/logging_utils.py +143 -32
  191. cognee/shared/utils.py +0 -1
  192. cognee/tasks/codingagents/coding_rule_associations.py +127 -0
  193. cognee/tasks/graph/extract_graph_from_data.py +6 -2
  194. cognee/tasks/ingestion/save_data_item_to_storage.py +23 -0
  195. cognee/tasks/memify/__init__.py +2 -0
  196. cognee/tasks/memify/extract_subgraph.py +7 -0
  197. cognee/tasks/memify/extract_subgraph_chunks.py +11 -0
  198. cognee/tasks/repo_processor/get_local_dependencies.py +2 -0
  199. cognee/tasks/repo_processor/get_repo_file_dependencies.py +144 -47
  200. cognee/tasks/storage/add_data_points.py +33 -3
  201. cognee/tasks/temporal_graph/__init__.py +1 -0
  202. cognee/tasks/temporal_graph/add_entities_to_event.py +85 -0
  203. cognee/tasks/temporal_graph/enrich_events.py +34 -0
  204. cognee/tasks/temporal_graph/extract_events_and_entities.py +32 -0
  205. cognee/tasks/temporal_graph/extract_knowledge_graph_from_events.py +41 -0
  206. cognee/tasks/temporal_graph/models.py +49 -0
  207. cognee/tests/integration/cli/__init__.py +3 -0
  208. cognee/tests/integration/cli/test_cli_integration.py +331 -0
  209. cognee/tests/integration/documents/PdfDocument_test.py +2 -2
  210. cognee/tests/integration/documents/TextDocument_test.py +2 -4
  211. cognee/tests/integration/documents/UnstructuredDocument_test.py +5 -8
  212. cognee/tests/{test_deletion.py → test_delete_hard.py} +0 -37
  213. cognee/tests/test_delete_soft.py +85 -0
  214. cognee/tests/test_kuzu.py +2 -2
  215. cognee/tests/test_neo4j.py +2 -2
  216. cognee/tests/test_permissions.py +3 -3
  217. cognee/tests/test_relational_db_migration.py +7 -5
  218. cognee/tests/test_search_db.py +136 -23
  219. cognee/tests/test_temporal_graph.py +167 -0
  220. cognee/tests/unit/api/__init__.py +1 -0
  221. cognee/tests/unit/api/test_conditional_authentication_endpoints.py +246 -0
  222. cognee/tests/unit/cli/__init__.py +3 -0
  223. cognee/tests/unit/cli/test_cli_commands.py +483 -0
  224. cognee/tests/unit/cli/test_cli_edge_cases.py +625 -0
  225. cognee/tests/unit/cli/test_cli_main.py +173 -0
  226. cognee/tests/unit/cli/test_cli_runner.py +62 -0
  227. cognee/tests/unit/cli/test_cli_utils.py +127 -0
  228. cognee/tests/unit/modules/retrieval/chunks_retriever_test.py +18 -2
  229. cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +12 -15
  230. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +10 -15
  231. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +4 -3
  232. cognee/tests/unit/modules/retrieval/insights_retriever_test.py +4 -2
  233. cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +18 -2
  234. cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +225 -0
  235. cognee/tests/unit/modules/users/__init__.py +1 -0
  236. cognee/tests/unit/modules/users/test_conditional_authentication.py +277 -0
  237. cognee/tests/unit/processing/utils/utils_test.py +20 -1
  238. {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dev0.dist-info}/METADATA +13 -9
  239. {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dev0.dist-info}/RECORD +245 -135
  240. cognee-0.3.0.dev0.dist-info/entry_points.txt +2 -0
  241. cognee/infrastructure/databases/graph/networkx/adapter.py +0 -1017
  242. cognee/infrastructure/pipeline/models/Operation.py +0 -60
  243. cognee/notebooks/github_analysis_step_by_step.ipynb +0 -37
  244. cognee/tests/tasks/descriptive_metrics/networkx_metrics_test.py +0 -7
  245. cognee/tests/unit/modules/search/search_methods_test.py +0 -223
  246. /cognee/{infrastructure/databases/graph/networkx → api/v1/memify}/__init__.py +0 -0
  247. /cognee/{infrastructure/pipeline/models → tasks/codingagents}/__init__.py +0 -0
  248. {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dev0.dist-info}/WHEEL +0 -0
  249. {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dev0.dist-info}/licenses/LICENSE +0 -0
  250. {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dev0.dist-info}/licenses/NOTICE.md +0 -0
@@ -27,12 +27,13 @@ def get_embedding_engine() -> EmbeddingEngine:
27
27
  config.embedding_provider,
28
28
  config.embedding_model,
29
29
  config.embedding_dimensions,
30
- config.embedding_max_tokens,
30
+ config.embedding_max_completion_tokens,
31
31
  config.embedding_endpoint,
32
32
  config.embedding_api_key,
33
33
  config.embedding_api_version,
34
34
  config.huggingface_tokenizer,
35
35
  llm_config.llm_api_key,
36
+ llm_config.llm_provider,
36
37
  )
37
38
 
38
39
 
@@ -41,12 +42,13 @@ def create_embedding_engine(
41
42
  embedding_provider,
42
43
  embedding_model,
43
44
  embedding_dimensions,
44
- embedding_max_tokens,
45
+ embedding_max_completion_tokens,
45
46
  embedding_endpoint,
46
47
  embedding_api_key,
47
48
  embedding_api_version,
48
49
  huggingface_tokenizer,
49
50
  llm_api_key,
51
+ llm_provider,
50
52
  ):
51
53
  """
52
54
  Create and return an embedding engine based on the specified provider.
@@ -58,7 +60,7 @@ def create_embedding_engine(
58
60
  'ollama', or another supported provider.
59
61
  - embedding_model: The model to be used for the embedding engine.
60
62
  - embedding_dimensions: The number of dimensions for the embeddings.
61
- - embedding_max_tokens: The maximum number of tokens for the embeddings.
63
+ - embedding_max_completion_tokens: The maximum number of tokens for the embeddings.
62
64
  - embedding_endpoint: The endpoint for the embedding service, relevant for certain
63
65
  providers.
64
66
  - embedding_api_key: API key to authenticate with the embedding service, if
@@ -81,7 +83,7 @@ def create_embedding_engine(
81
83
  return FastembedEmbeddingEngine(
82
84
  model=embedding_model,
83
85
  dimensions=embedding_dimensions,
84
- max_tokens=embedding_max_tokens,
86
+ max_completion_tokens=embedding_max_completion_tokens,
85
87
  )
86
88
 
87
89
  if embedding_provider == "ollama":
@@ -90,7 +92,7 @@ def create_embedding_engine(
90
92
  return OllamaEmbeddingEngine(
91
93
  model=embedding_model,
92
94
  dimensions=embedding_dimensions,
93
- max_tokens=embedding_max_tokens,
95
+ max_completion_tokens=embedding_max_completion_tokens,
94
96
  endpoint=embedding_endpoint,
95
97
  huggingface_tokenizer=huggingface_tokenizer,
96
98
  )
@@ -99,10 +101,11 @@ def create_embedding_engine(
99
101
 
100
102
  return LiteLLMEmbeddingEngine(
101
103
  provider=embedding_provider,
102
- api_key=embedding_api_key or llm_api_key,
104
+ api_key=embedding_api_key
105
+ or (embedding_api_key if llm_provider == "custom" else llm_api_key),
103
106
  endpoint=embedding_endpoint,
104
107
  api_version=embedding_api_version,
105
108
  model=embedding_model,
106
109
  dimensions=embedding_dimensions,
107
- max_tokens=embedding_max_tokens,
110
+ max_completion_tokens=embedding_max_completion_tokens,
108
111
  )
@@ -189,6 +189,15 @@ class LocalFileStorage(Storage):
189
189
 
190
190
  return os.path.isfile(os.path.join(parsed_storage_path, file_path))
191
191
 
192
+ def get_size(self, file_path: str) -> int:
193
+ parsed_storage_path = get_parsed_path(self.storage_path)
194
+
195
+ return (
196
+ os.path.getsize(os.path.join(parsed_storage_path, file_path))
197
+ if self.file_exists(file_path)
198
+ else 0
199
+ )
200
+
192
201
  def ensure_directory_exists(self, directory_path: str = ""):
193
202
  """
194
203
  Ensure that the specified directory exists, creating it if necessary.
@@ -146,6 +146,11 @@ class S3FileStorage(Storage):
146
146
  self.s3.isfile, os.path.join(self.storage_path.replace("s3://", ""), file_path)
147
147
  )
148
148
 
149
+ async def get_size(self, file_path: str) -> int:
150
+ return await run_async(
151
+ self.s3.size, os.path.join(self.storage_path.replace("s3://", ""), file_path)
152
+ )
153
+
149
154
  async def ensure_directory_exists(self, directory_path: str = ""):
150
155
  """
151
156
  Ensure that the specified directory exists, creating it if necessary.
@@ -46,6 +46,12 @@ class StorageManager:
46
46
  else:
47
47
  return self.storage.is_file(file_path)
48
48
 
49
+ async def get_size(self, file_path: str) -> int:
50
+ if inspect.iscoroutinefunction(self.storage.get_size):
51
+ return await self.storage.get_size(file_path)
52
+ else:
53
+ return self.storage.get_size(file_path)
54
+
49
55
  async def store(self, file_path: str, data: BinaryIO, overwrite: bool = False) -> str:
50
56
  """
51
57
  Store data at the specified file path.
@@ -84,7 +90,7 @@ class StorageManager:
84
90
  """
85
91
  # Check the actual storage type by class name to determine if open() is async or sync
86
92
 
87
- if self.storage.__class__.__name__ == "S3FileStorage" and file_path.startswith("s3://"):
93
+ if self.storage.__class__.__name__ == "S3FileStorage":
88
94
  # S3FileStorage.open() is async
89
95
  async with self.storage.open(file_path, *args, **kwargs) as file:
90
96
  yield file
@@ -40,6 +40,22 @@ class Storage(Protocol):
40
40
  """
41
41
  pass
42
42
 
43
+ def get_size(self, file_path: str) -> int:
44
+ """
45
+ Get the size of a specified file in bytes.
46
+
47
+ Parameters:
48
+ -----------
49
+
50
+ - file_path (str): The path of the file to get the size of.
51
+
52
+ Returns:
53
+ --------
54
+
55
+ - int: The size of the file in bytes.
56
+ """
57
+ pass
58
+
43
59
  def store(self, file_path: str, data: Union[BinaryIO, str], overwrite: bool):
44
60
  """
45
61
  Store data at the specified file path.
@@ -5,19 +5,24 @@ from urllib.parse import urlparse
5
5
  def get_data_file_path(file_path: str):
6
6
  # Check if this is a file URI BEFORE normalizing (which corrupts URIs)
7
7
  if file_path.startswith("file://"):
8
+ # Remove first occurrence of file:// prefix
9
+ pure_file_path = file_path.replace("file://", "", 1)
8
10
  # Normalize the file URI for Windows - replace backslashes with forward slashes
9
- normalized_file_uri = os.path.normpath(file_path)
11
+ normalized_file_uri = os.path.normpath(pure_file_path)
10
12
 
11
- parsed_url = urlparse(normalized_file_uri)
12
-
13
- # Convert URI path to file system path
13
+ # Convert path to proper file system path
14
14
  if os.name == "nt": # Windows
15
15
  # Handle Windows drive letters correctly
16
- fs_path = parsed_url.path
17
- if fs_path.startswith("/") and len(fs_path) > 1 and fs_path[2] == ":":
18
- fs_path = fs_path[1:] # Remove leading slash for Windows drive paths
19
- else: # Unix-like systems
20
- fs_path = parsed_url.path
16
+ fs_path = normalized_file_uri
17
+ if (
18
+ (fs_path.startswith("/") or fs_path.startswith("\\"))
19
+ and len(fs_path) > 1
20
+ and fs_path[2] == ":"
21
+ ):
22
+ fs_path = fs_path[1:]
23
+ else:
24
+ # Unix - like systems
25
+ fs_path = normalized_file_uri
21
26
 
22
27
  # Now split the actual filesystem path
23
28
  actual_fs_path = os.path.normpath(fs_path)
@@ -1,6 +1,7 @@
1
1
  import io
2
2
  import os.path
3
3
  from typing import BinaryIO, TypedDict
4
+ from pathlib import Path
4
5
 
5
6
  from cognee.shared.logging_utils import get_logger
6
7
  from cognee.infrastructure.files.utils.get_file_content_hash import get_file_content_hash
@@ -55,7 +56,7 @@ async def get_file_metadata(file: BinaryIO) -> FileMetadata:
55
56
  file_type = guess_file_type(file)
56
57
 
57
58
  file_path = getattr(file, "name", None) or getattr(file, "full_name", None)
58
- file_name = str(file_path).split("/")[-1].split(".")[0] if file_path else None
59
+ file_name = Path(file_path).stem if file_path else None
59
60
 
60
61
  # Get file size
61
62
  pos = file.tell() # remember current pointer
@@ -1,6 +1,5 @@
1
- from typing import Type
1
+ from typing import Type, Optional, Coroutine
2
2
  from pydantic import BaseModel
3
- from typing import Coroutine
4
3
  from cognee.infrastructure.llm import get_llm_config
5
4
 
6
5
 
@@ -79,7 +78,10 @@ class LLMGateway:
79
78
 
80
79
  @staticmethod
81
80
  def extract_content_graph(
82
- content: str, response_model: Type[BaseModel], mode: str = "simple"
81
+ content: str,
82
+ response_model: Type[BaseModel],
83
+ mode: str = "simple",
84
+ custom_prompt: Optional[str] = None,
83
85
  ) -> Coroutine:
84
86
  llm_config = get_llm_config()
85
87
  if llm_config.structured_output_framework.upper() == "BAML":
@@ -87,13 +89,20 @@ class LLMGateway:
87
89
  extract_content_graph,
88
90
  )
89
91
 
90
- return extract_content_graph(content=content, response_model=response_model, mode=mode)
92
+ return extract_content_graph(
93
+ content=content,
94
+ response_model=response_model,
95
+ mode=mode,
96
+ custom_prompt=custom_prompt,
97
+ )
91
98
  else:
92
99
  from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.extraction import (
93
100
  extract_content_graph,
94
101
  )
95
102
 
96
- return extract_content_graph(content=content, response_model=response_model)
103
+ return extract_content_graph(
104
+ content=content, response_model=response_model, custom_prompt=custom_prompt
105
+ )
97
106
 
98
107
  @staticmethod
99
108
  def extract_categories(content: str, response_model: Type[BaseModel]) -> Coroutine:
@@ -135,3 +144,21 @@ class LLMGateway:
135
144
  )
136
145
 
137
146
  return extract_summary(content=content, response_model=response_model)
147
+
148
+ @staticmethod
149
+ def extract_event_graph(content: str, response_model: Type[BaseModel]) -> Coroutine:
150
+ # TODO: Add BAML version of category and extraction and update function (consulted with Igor)
151
+ from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.extraction import (
152
+ extract_event_graph,
153
+ )
154
+
155
+ return extract_event_graph(content=content, response_model=response_model)
156
+
157
+ @staticmethod
158
+ def extract_event_entities(content: str, response_model: Type[BaseModel]) -> Coroutine:
159
+ # TODO: Add BAML version of category and extraction and update function (consulted with Igor)
160
+ from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.extraction import (
161
+ extract_event_entities,
162
+ )
163
+
164
+ return extract_event_entities(content=content, response_model=response_model)
@@ -18,7 +18,7 @@ class LLMConfig(BaseSettings):
18
18
  - llm_api_version
19
19
  - llm_temperature
20
20
  - llm_streaming
21
- - llm_max_tokens
21
+ - llm_max_completion_tokens
22
22
  - transcription_model
23
23
  - graph_prompt_path
24
24
  - llm_rate_limit_enabled
@@ -35,13 +35,13 @@ class LLMConfig(BaseSettings):
35
35
 
36
36
  structured_output_framework: str = "instructor"
37
37
  llm_provider: str = "openai"
38
- llm_model: str = "gpt-4o-mini"
38
+ llm_model: str = "openai/gpt-4o-mini"
39
39
  llm_endpoint: str = ""
40
40
  llm_api_key: Optional[str] = None
41
41
  llm_api_version: Optional[str] = None
42
42
  llm_temperature: float = 0.0
43
43
  llm_streaming: bool = False
44
- llm_max_tokens: int = 16384
44
+ llm_max_completion_tokens: int = 16384
45
45
 
46
46
  baml_llm_provider: str = "openai"
47
47
  baml_llm_model: str = "gpt-4o-mini"
@@ -52,6 +52,8 @@ class LLMConfig(BaseSettings):
52
52
 
53
53
  transcription_model: str = "whisper-1"
54
54
  graph_prompt_path: str = "generate_graph_prompt.txt"
55
+ temporal_graph_prompt_path: str = "generate_event_graph_prompt.txt"
56
+ event_entity_prompt_path: str = "generate_event_entity_prompt.txt"
55
57
  llm_rate_limit_enabled: bool = False
56
58
  llm_rate_limit_requests: int = 60
57
59
  llm_rate_limit_interval: int = 60 # in seconds (default is 60 requests per minute)
@@ -171,7 +173,7 @@ class LLMConfig(BaseSettings):
171
173
  "api_version": self.llm_api_version,
172
174
  "temperature": self.llm_temperature,
173
175
  "streaming": self.llm_streaming,
174
- "max_tokens": self.llm_max_tokens,
176
+ "max_completion_tokens": self.llm_max_completion_tokens,
175
177
  "transcription_model": self.transcription_model,
176
178
  "graph_prompt_path": self.graph_prompt_path,
177
179
  "rate_limit_enabled": self.llm_rate_limit_enabled,
@@ -0,0 +1,15 @@
1
+ For the purposes of identifying timestamps in a query, you are tasked with extracting relevant timestamps from the query.
2
+ ## Timestamp requirements
3
+ - If the query contains an interval, extract both starts_at and ends_at properties
4
+ - If the query contains an instantaneous timestamp, starts_at and ends_at should be the same
5
+ - If the query is open-ended (before 2009 or after 2009), the corresponding undefined end of the interval should be None
6
+ - For example: "before 2009" -- starts_at: None, ends_at: 2009 or "after 2009" -- starts_at: 2009, ends_at: None
7
+ - Always put the timestamp that comes first in time as starts_at and the timestamp that comes second in time as ends_at
8
+ - If starts_at or ends_at cannot be extracted, both of them have to be None
9
+ ## Output Format
10
+ Your reply should be a JSON list of dictionaries with the following structure:
11
+ ```python
12
+ class QueryInterval(BaseModel):
13
+ starts_at: Optional[Timestamp] = None
14
+ ends_at: Optional[Timestamp] = None
15
+ ```
@@ -0,0 +1,25 @@
1
+ For the purposes of building event-based knowledge graphs, you are tasked with extracting highly granular entities from events text. An entity is any distinct, identifiable thing, person, place, object, organization, concept, or phenomenon that can be named, referenced, or described in the event context. This includes but is not limited to: people, places, objects, organizations, concepts, events, processes, states, conditions, properties, attributes, roles, functions, and any other meaningful referents that contribute to understanding the event.
2
+ **Temporal Entity Exclusion**: Do not extract timestamp-like entities (dates, times, durations) as these are handled separately. However, extract named temporal periods, eras, historical epochs, and culturally significant time references
3
+ ## Input Format
4
+ The input will be a list of dictionaries, each containing:
5
+ - `event_name`: The name of the event
6
+ - `description`: The description of the event
7
+ ## Task
8
+ For each event, extract all entities mentioned in the event description and determine their relationship to the event.
9
+ ## Output Format
10
+ Return the same enriched JSON with an additional key in each dictionary: `attributes`.
11
+ The `attributes` should be a list of dictionaries, each containing:
12
+ - `entity`: The name of the entity
13
+ - `entity_type`: The type/category of the entity (person, place, organization, object, concept, etc.)
14
+ - `relationship`: A concise description of how the entity relates to the event
15
+ ## Requirements
16
+ - **Be extremely thorough** - extract EVERY non-temporal entity mentioned, no matter how small, obvious, or seemingly insignificant
17
+ - **After you are done with obvious entities, every noun, pronoun, proper noun, and named reference = one entity**
18
+ - We expect rich entity networks from any event, easily reaching dozens of entities per event
19
+ - Granularity and richness of the entity extraction is key to our success and is of utmost importance
20
+ - **Do not skip any entities** - if you're unsure whether something is an entity, extract it anyway
21
+ - Use the event name for context when determining relationships
22
+ - Relationships should be technical with one or at most two words. If two words, use underscore camelcase style
23
+ - Relationships could imply general meaning like: subject, object, participant, recipient, agent, instrument, tool, source, cause, effect, purpose, manner, resource, etc.
24
+ - You can combine two words to form a relationship name: subject_role, previous_owner, etc.
25
+ - Focus on how the entity specifically relates to the event
@@ -0,0 +1,30 @@
1
+ For the purposes of building event-based knowledge graphs, you are tasked with extracting highly granular stream events from a text. The events are defined as follows:
2
+ ## Event Definition
3
+ - Anything with a date or a timestamp is an event
4
+ - Anything that took place in time (even if the time is unknown) is an event
5
+ - Anything that lasted over a period of time, or happened in an instant is an event: from historical milestones (wars, presidencies, olympiads) to personal milestones (birth, death, employment, etc.), to mundane actions (a walk, a conversation, etc.)
6
+ - **ANY action or verb represents an event** - this is the most important rule
7
+ - Every single verb in the text corresponds to an event that must be extracted
8
+ - This includes: thinking, feeling, seeing, hearing, moving, speaking, writing, reading, eating, sleeping, working, playing, studying, traveling, meeting, calling, texting, buying, selling, creating, destroying, building, breaking, starting, stopping, beginning, ending, etc.
9
+ - Even the most mundane or obvious actions are events: "he walked", "she sat", "they talked", "I thought", "we waited"
10
+ ## Requirements
11
+ - **Be extremely thorough** - extract EVERY event mentioned, no matter how small or obvious
12
+ - **Timestamped first** - every timestamp or date should have at least one event
13
+ - **Verbs/actions = one event** - After you are done with timestamped events -- every verb that is an action should have a corresponding event.
14
+ - We expect long streams of events from any piece of text, easily reaching a hundred events
15
+ - Granularity and richness of the stream is key to our success and is of utmost importance
16
+ - Not all events will have timestamps, add timestamps only to known events
17
+ - For events that were instantaneous, just attach the time_from or time_to property; don't create both
18
+ - **Do not skip any events** - if you're unsure whether something is an event, extract it anyway
19
+ - **Quantity over filtering** - it's better to extract too many events than to miss any
20
+ - **Descriptions** - Always include the event description together with entities (Who did what, what happened? What is the event?). If you can, include the corresponding part from the text.
21
+ ## Output Format
22
+ Your reply should be a JSON: list of dictionaries with the following structure:
23
+ ```python
24
+ class Event(BaseModel):
25
+ name: str [concise]
26
+ description: Optional[str] = None
27
+ time_from: Optional[Timestamp] = None
28
+ time_to: Optional[Timestamp] = None
29
+ location: Optional[str] = None
30
+ ```
@@ -1,4 +1,4 @@
1
- from typing import Type
1
+ from typing import Type, Optional
2
2
  from pydantic import BaseModel
3
3
  from cognee.infrastructure.llm.config import get_llm_config
4
4
  from cognee.shared.logging_utils import get_logger, setup_logging
@@ -6,7 +6,10 @@ from cognee.infrastructure.llm.structured_output_framework.baml.baml_client.asyn
6
6
 
7
7
 
8
8
  async def extract_content_graph(
9
- content: str, response_model: Type[BaseModel], mode: str = "simple"
9
+ content: str,
10
+ response_model: Type[BaseModel],
11
+ mode: str = "simple",
12
+ custom_prompt: Optional[str] = None,
10
13
  ):
11
14
  config = get_llm_config()
12
15
  setup_logging()
@@ -26,8 +29,16 @@ async def extract_content_graph(
26
29
  # return graph
27
30
 
28
31
  # else:
29
- graph = await b.ExtractContentGraphGeneric(
30
- content, mode=mode, baml_options={"client_registry": config.baml_registry}
31
- )
32
+ if custom_prompt:
33
+ graph = await b.ExtractContentGraphGeneric(
34
+ content,
35
+ mode="custom",
36
+ custom_prompt_content=custom_prompt,
37
+ baml_options={"client_registry": config.baml_registry},
38
+ )
39
+ else:
40
+ graph = await b.ExtractContentGraphGeneric(
41
+ content, mode=mode, baml_options={"client_registry": config.baml_registry}
42
+ )
32
43
 
33
44
  return graph
@@ -1,3 +1,5 @@
1
1
  from .knowledge_graph.extract_content_graph import extract_content_graph
2
+ from .knowledge_graph.extract_event_graph import extract_event_graph
2
3
  from .extract_categories import extract_categories
3
4
  from .extract_summary import extract_summary, extract_code_summary
5
+ from .extract_event_entities import extract_event_entities
@@ -0,0 +1,44 @@
1
+ import os
2
+ from typing import List, Type
3
+ from pydantic import BaseModel
4
+ from cognee.infrastructure.llm.LLMGateway import LLMGateway
5
+ from cognee.infrastructure.llm.config import (
6
+ get_llm_config,
7
+ )
8
+
9
+
10
+ async def extract_event_entities(content: str, response_model: Type[BaseModel]):
11
+ """
12
+ Extracts event-related entities from the given content using an LLM with structured output.
13
+
14
+ This function loads an event entity extraction prompt from the LLM configuration,
15
+ renders it into a system prompt, and queries the LLM to produce structured entities
16
+ that conform to the specified response model.
17
+
18
+ Args:
19
+ content (str): The input text from which to extract event entities.
20
+ response_model (Type[BaseModel]): A Pydantic model defining the structure of the expected output.
21
+
22
+ Returns:
23
+ BaseModel: An instance of the response_model populated with extracted event entities.
24
+ """
25
+ llm_config = get_llm_config()
26
+
27
+ prompt_path = llm_config.event_entity_prompt_path
28
+
29
+ # Check if the prompt path is an absolute path or just a filename
30
+ if os.path.isabs(prompt_path):
31
+ # directory containing the file
32
+ base_directory = os.path.dirname(prompt_path)
33
+ # just the filename itself
34
+ prompt_path = os.path.basename(prompt_path)
35
+ else:
36
+ base_directory = None
37
+
38
+ system_prompt = LLMGateway.render_prompt(prompt_path, {}, base_directory=base_directory)
39
+
40
+ content_graph = await LLMGateway.acreate_structured_output(
41
+ content, system_prompt, response_model
42
+ )
43
+
44
+ return content_graph
@@ -1 +1,2 @@
1
1
  from .extract_content_graph import extract_content_graph
2
+ from .extract_event_graph import extract_event_graph
@@ -1,5 +1,5 @@
1
1
  import os
2
- from typing import Type
2
+ from typing import Type, Optional
3
3
  from pydantic import BaseModel
4
4
 
5
5
  from cognee.infrastructure.llm.LLMGateway import LLMGateway
@@ -8,21 +8,25 @@ from cognee.infrastructure.llm.config import (
8
8
  )
9
9
 
10
10
 
11
- async def extract_content_graph(content: str, response_model: Type[BaseModel]):
12
- llm_config = get_llm_config()
13
-
14
- prompt_path = llm_config.graph_prompt_path
15
-
16
- # Check if the prompt path is an absolute path or just a filename
17
- if os.path.isabs(prompt_path):
18
- # directory containing the file
19
- base_directory = os.path.dirname(prompt_path)
20
- # just the filename itself
21
- prompt_path = os.path.basename(prompt_path)
11
+ async def extract_content_graph(
12
+ content: str, response_model: Type[BaseModel], custom_prompt: Optional[str] = None
13
+ ):
14
+ if custom_prompt:
15
+ system_prompt = custom_prompt
22
16
  else:
23
- base_directory = None
24
-
25
- system_prompt = LLMGateway.render_prompt(prompt_path, {}, base_directory=base_directory)
17
+ llm_config = get_llm_config()
18
+ prompt_path = llm_config.graph_prompt_path
19
+
20
+ # Check if the prompt path is an absolute path or just a filename
21
+ if os.path.isabs(prompt_path):
22
+ # directory containing the file
23
+ base_directory = os.path.dirname(prompt_path)
24
+ # just the filename itself
25
+ prompt_path = os.path.basename(prompt_path)
26
+ else:
27
+ base_directory = None
28
+
29
+ system_prompt = LLMGateway.render_prompt(prompt_path, {}, base_directory=base_directory)
26
30
 
27
31
  content_graph = await LLMGateway.acreate_structured_output(
28
32
  content, system_prompt, response_model
@@ -0,0 +1,46 @@
1
+ import os
2
+ from pydantic import BaseModel
3
+ from typing import Type
4
+ from cognee.infrastructure.llm.LLMGateway import LLMGateway
5
+
6
+ from cognee.infrastructure.llm.config import (
7
+ get_llm_config,
8
+ )
9
+
10
+
11
+ async def extract_event_graph(content: str, response_model: Type[BaseModel]):
12
+ """
13
+ Extracts an event graph from the given content using an LLM with a structured output format.
14
+
15
+ This function loads a temporal graph extraction prompt from the LLM configuration,
16
+ renders it as a system prompt, and queries the LLM to produce a structured event
17
+ graph matching the specified response model.
18
+
19
+ Args:
20
+ content (str): The input text from which to extract the event graph.
21
+ response_model (Type[BaseModel]): A Pydantic model defining the structure of the expected output.
22
+
23
+ Returns:
24
+ BaseModel: An instance of the response_model populated with the extracted event graph.
25
+ """
26
+
27
+ llm_config = get_llm_config()
28
+
29
+ prompt_path = llm_config.temporal_graph_prompt_path
30
+
31
+ # Check if the prompt path is an absolute path or just a filename
32
+ if os.path.isabs(prompt_path):
33
+ # directory containing the file
34
+ base_directory = os.path.dirname(prompt_path)
35
+ # just the filename itself
36
+ prompt_path = os.path.basename(prompt_path)
37
+ else:
38
+ base_directory = None
39
+
40
+ system_prompt = LLMGateway.render_prompt(prompt_path, {}, base_directory=base_directory)
41
+
42
+ content_graph = await LLMGateway.acreate_structured_output(
43
+ content, system_prompt, response_model
44
+ )
45
+
46
+ return content_graph
@@ -23,7 +23,7 @@ class AnthropicAdapter(LLMInterface):
23
23
  name = "Anthropic"
24
24
  model: str
25
25
 
26
- def __init__(self, max_tokens: int, model: str = None):
26
+ def __init__(self, max_completion_tokens: int, model: str = None):
27
27
  import anthropic
28
28
 
29
29
  self.aclient = instructor.patch(
@@ -31,7 +31,7 @@ class AnthropicAdapter(LLMInterface):
31
31
  )
32
32
 
33
33
  self.model = model
34
- self.max_tokens = max_tokens
34
+ self.max_completion_tokens = max_completion_tokens
35
35
 
36
36
  @sleep_and_retry_async()
37
37
  @rate_limit_async
@@ -57,7 +57,7 @@ class AnthropicAdapter(LLMInterface):
57
57
 
58
58
  return await self.aclient(
59
59
  model=self.model,
60
- max_tokens=4096,
60
+ max_completion_tokens=4096,
61
61
  max_retries=5,
62
62
  messages=[
63
63
  {
@@ -34,7 +34,7 @@ class GeminiAdapter(LLMInterface):
34
34
  self,
35
35
  api_key: str,
36
36
  model: str,
37
- max_tokens: int,
37
+ max_completion_tokens: int,
38
38
  endpoint: Optional[str] = None,
39
39
  api_version: Optional[str] = None,
40
40
  streaming: bool = False,
@@ -44,7 +44,7 @@ class GeminiAdapter(LLMInterface):
44
44
  self.endpoint = endpoint
45
45
  self.api_version = api_version
46
46
  self.streaming = streaming
47
- self.max_tokens = max_tokens
47
+ self.max_completion_tokens = max_completion_tokens
48
48
 
49
49
  @observe(as_type="generation")
50
50
  @sleep_and_retry_async()
@@ -90,7 +90,7 @@ class GeminiAdapter(LLMInterface):
90
90
  model=f"{self.model}",
91
91
  messages=messages,
92
92
  api_key=self.api_key,
93
- max_tokens=self.max_tokens,
93
+ max_completion_tokens=self.max_completion_tokens,
94
94
  temperature=0.1,
95
95
  response_format=response_schema,
96
96
  timeout=100,
@@ -41,7 +41,7 @@ class GenericAPIAdapter(LLMInterface):
41
41
  api_key: str,
42
42
  model: str,
43
43
  name: str,
44
- max_tokens: int,
44
+ max_completion_tokens: int,
45
45
  fallback_model: str = None,
46
46
  fallback_api_key: str = None,
47
47
  fallback_endpoint: str = None,
@@ -50,7 +50,7 @@ class GenericAPIAdapter(LLMInterface):
50
50
  self.model = model
51
51
  self.api_key = api_key
52
52
  self.endpoint = endpoint
53
- self.max_tokens = max_tokens
53
+ self.max_completion_tokens = max_completion_tokens
54
54
 
55
55
  self.fallback_model = fallback_model
56
56
  self.fallback_api_key = fallback_api_key