cognee 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (227)
  1. cognee/__init__.py +1 -0
  2. cognee/api/client.py +9 -5
  3. cognee/api/v1/add/add.py +2 -1
  4. cognee/api/v1/add/routers/get_add_router.py +3 -1
  5. cognee/api/v1/cognify/cognify.py +24 -16
  6. cognee/api/v1/cognify/routers/__init__.py +0 -1
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +30 -1
  8. cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
  9. cognee/api/v1/ontologies/__init__.py +4 -0
  10. cognee/api/v1/ontologies/ontologies.py +158 -0
  11. cognee/api/v1/ontologies/routers/__init__.py +0 -0
  12. cognee/api/v1/ontologies/routers/get_ontology_router.py +109 -0
  13. cognee/api/v1/permissions/routers/get_permissions_router.py +41 -1
  14. cognee/api/v1/search/search.py +4 -0
  15. cognee/api/v1/ui/node_setup.py +360 -0
  16. cognee/api/v1/ui/npm_utils.py +50 -0
  17. cognee/api/v1/ui/ui.py +38 -68
  18. cognee/cli/commands/cognify_command.py +8 -1
  19. cognee/cli/config.py +1 -1
  20. cognee/context_global_variables.py +86 -9
  21. cognee/eval_framework/Dockerfile +29 -0
  22. cognee/eval_framework/answer_generation/answer_generation_executor.py +10 -0
  23. cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
  24. cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +0 -2
  25. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
  26. cognee/eval_framework/eval_config.py +2 -2
  27. cognee/eval_framework/modal_run_eval.py +16 -28
  28. cognee/infrastructure/databases/cache/config.py +3 -1
  29. cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +151 -0
  30. cognee/infrastructure/databases/cache/get_cache_engine.py +20 -10
  31. cognee/infrastructure/databases/dataset_database_handler/__init__.py +3 -0
  32. cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +80 -0
  33. cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +18 -0
  34. cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +10 -0
  35. cognee/infrastructure/databases/exceptions/exceptions.py +16 -0
  36. cognee/infrastructure/databases/graph/config.py +7 -0
  37. cognee/infrastructure/databases/graph/get_graph_engine.py +3 -0
  38. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -0
  39. cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +81 -0
  40. cognee/infrastructure/databases/graph/kuzu/adapter.py +228 -0
  41. cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +168 -0
  42. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +80 -1
  43. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +9 -0
  44. cognee/infrastructure/databases/utils/__init__.py +3 -0
  45. cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +10 -0
  46. cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +66 -18
  47. cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +10 -0
  48. cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +30 -0
  49. cognee/infrastructure/databases/vector/config.py +5 -0
  50. cognee/infrastructure/databases/vector/create_vector_engine.py +6 -1
  51. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +8 -6
  52. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +9 -7
  53. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -10
  54. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +2 -0
  55. cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +50 -0
  56. cognee/infrastructure/databases/vector/vector_db_interface.py +35 -0
  57. cognee/infrastructure/engine/models/Edge.py +13 -1
  58. cognee/infrastructure/files/storage/s3_config.py +2 -0
  59. cognee/infrastructure/files/utils/guess_file_type.py +4 -0
  60. cognee/infrastructure/llm/LLMGateway.py +5 -2
  61. cognee/infrastructure/llm/config.py +37 -0
  62. cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +23 -8
  64. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +22 -18
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +5 -0
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +153 -0
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +47 -38
  68. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +46 -37
  69. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +20 -10
  70. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +23 -11
  71. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +36 -23
  72. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +47 -36
  73. cognee/infrastructure/loaders/LoaderEngine.py +1 -0
  74. cognee/infrastructure/loaders/core/__init__.py +2 -1
  75. cognee/infrastructure/loaders/core/csv_loader.py +93 -0
  76. cognee/infrastructure/loaders/core/text_loader.py +1 -2
  77. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +0 -9
  78. cognee/infrastructure/loaders/supported_loaders.py +2 -1
  79. cognee/memify_pipelines/create_triplet_embeddings.py +53 -0
  80. cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +55 -0
  81. cognee/modules/chunking/CsvChunker.py +35 -0
  82. cognee/modules/chunking/models/DocumentChunk.py +2 -1
  83. cognee/modules/chunking/text_chunker_with_overlap.py +124 -0
  84. cognee/modules/cognify/config.py +2 -0
  85. cognee/modules/data/deletion/prune_system.py +52 -2
  86. cognee/modules/data/methods/__init__.py +1 -0
  87. cognee/modules/data/methods/create_dataset.py +4 -2
  88. cognee/modules/data/methods/delete_dataset.py +26 -0
  89. cognee/modules/data/methods/get_dataset_ids.py +5 -1
  90. cognee/modules/data/methods/get_unique_data_id.py +68 -0
  91. cognee/modules/data/methods/get_unique_dataset_id.py +66 -4
  92. cognee/modules/data/models/Dataset.py +2 -0
  93. cognee/modules/data/processing/document_types/CsvDocument.py +33 -0
  94. cognee/modules/data/processing/document_types/__init__.py +1 -0
  95. cognee/modules/engine/models/Triplet.py +9 -0
  96. cognee/modules/engine/models/__init__.py +1 -0
  97. cognee/modules/graph/cognee_graph/CogneeGraph.py +89 -39
  98. cognee/modules/graph/cognee_graph/CogneeGraphElements.py +8 -3
  99. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +19 -2
  100. cognee/modules/graph/utils/resolve_edges_to_text.py +48 -49
  101. cognee/modules/ingestion/identify.py +4 -4
  102. cognee/modules/memify/memify.py +1 -7
  103. cognee/modules/notebooks/operations/run_in_local_sandbox.py +3 -0
  104. cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +55 -23
  105. cognee/modules/pipelines/operations/pipeline.py +18 -2
  106. cognee/modules/pipelines/operations/run_tasks_data_item.py +1 -1
  107. cognee/modules/retrieval/EntityCompletionRetriever.py +10 -3
  108. cognee/modules/retrieval/__init__.py +1 -1
  109. cognee/modules/retrieval/base_graph_retriever.py +7 -3
  110. cognee/modules/retrieval/base_retriever.py +7 -3
  111. cognee/modules/retrieval/completion_retriever.py +11 -4
  112. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +10 -2
  113. cognee/modules/retrieval/graph_completion_cot_retriever.py +18 -51
  114. cognee/modules/retrieval/graph_completion_retriever.py +14 -1
  115. cognee/modules/retrieval/graph_summary_completion_retriever.py +4 -0
  116. cognee/modules/retrieval/register_retriever.py +10 -0
  117. cognee/modules/retrieval/registered_community_retrievers.py +1 -0
  118. cognee/modules/retrieval/temporal_retriever.py +13 -2
  119. cognee/modules/retrieval/triplet_retriever.py +182 -0
  120. cognee/modules/retrieval/utils/brute_force_triplet_search.py +43 -11
  121. cognee/modules/retrieval/utils/completion.py +2 -22
  122. cognee/modules/run_custom_pipeline/__init__.py +1 -0
  123. cognee/modules/run_custom_pipeline/run_custom_pipeline.py +76 -0
  124. cognee/modules/search/methods/get_search_type_tools.py +54 -8
  125. cognee/modules/search/methods/no_access_control_search.py +4 -0
  126. cognee/modules/search/methods/search.py +26 -3
  127. cognee/modules/search/types/SearchType.py +1 -1
  128. cognee/modules/settings/get_settings.py +19 -0
  129. cognee/modules/users/methods/create_user.py +12 -27
  130. cognee/modules/users/methods/get_authenticated_user.py +3 -2
  131. cognee/modules/users/methods/get_default_user.py +4 -2
  132. cognee/modules/users/methods/get_user.py +1 -1
  133. cognee/modules/users/methods/get_user_by_email.py +1 -1
  134. cognee/modules/users/models/DatasetDatabase.py +24 -3
  135. cognee/modules/users/models/Tenant.py +6 -7
  136. cognee/modules/users/models/User.py +6 -5
  137. cognee/modules/users/models/UserTenant.py +12 -0
  138. cognee/modules/users/models/__init__.py +1 -0
  139. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +13 -13
  140. cognee/modules/users/roles/methods/add_user_to_role.py +3 -1
  141. cognee/modules/users/tenants/methods/__init__.py +1 -0
  142. cognee/modules/users/tenants/methods/add_user_to_tenant.py +21 -12
  143. cognee/modules/users/tenants/methods/create_tenant.py +22 -8
  144. cognee/modules/users/tenants/methods/select_tenant.py +62 -0
  145. cognee/shared/logging_utils.py +6 -0
  146. cognee/shared/rate_limiting.py +30 -0
  147. cognee/tasks/chunks/__init__.py +1 -0
  148. cognee/tasks/chunks/chunk_by_row.py +94 -0
  149. cognee/tasks/documents/__init__.py +0 -1
  150. cognee/tasks/documents/classify_documents.py +2 -0
  151. cognee/tasks/feedback/generate_improved_answers.py +3 -3
  152. cognee/tasks/graph/extract_graph_from_data.py +9 -10
  153. cognee/tasks/ingestion/ingest_data.py +1 -1
  154. cognee/tasks/memify/__init__.py +2 -0
  155. cognee/tasks/memify/cognify_session.py +41 -0
  156. cognee/tasks/memify/extract_user_sessions.py +73 -0
  157. cognee/tasks/memify/get_triplet_datapoints.py +289 -0
  158. cognee/tasks/storage/add_data_points.py +142 -2
  159. cognee/tasks/storage/index_data_points.py +33 -22
  160. cognee/tasks/storage/index_graph_edges.py +37 -57
  161. cognee/tests/integration/documents/CsvDocument_test.py +70 -0
  162. cognee/tests/integration/retrieval/test_triplet_retriever.py +84 -0
  163. cognee/tests/integration/tasks/test_add_data_points.py +139 -0
  164. cognee/tests/integration/tasks/test_get_triplet_datapoints.py +69 -0
  165. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +1 -1
  166. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +1 -1
  167. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +13 -27
  168. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +1 -1
  169. cognee/tests/test_add_docling_document.py +2 -2
  170. cognee/tests/test_cognee_server_start.py +84 -3
  171. cognee/tests/test_conversation_history.py +68 -5
  172. cognee/tests/test_data/example_with_header.csv +3 -0
  173. cognee/tests/test_dataset_database_handler.py +137 -0
  174. cognee/tests/test_dataset_delete.py +76 -0
  175. cognee/tests/test_edge_centered_payload.py +170 -0
  176. cognee/tests/test_edge_ingestion.py +27 -0
  177. cognee/tests/test_feedback_enrichment.py +1 -1
  178. cognee/tests/test_library.py +6 -4
  179. cognee/tests/test_load.py +62 -0
  180. cognee/tests/test_multi_tenancy.py +165 -0
  181. cognee/tests/test_parallel_databases.py +2 -0
  182. cognee/tests/test_pipeline_cache.py +164 -0
  183. cognee/tests/test_relational_db_migration.py +54 -2
  184. cognee/tests/test_search_db.py +44 -2
  185. cognee/tests/unit/api/test_conditional_authentication_endpoints.py +12 -3
  186. cognee/tests/unit/api/test_ontology_endpoint.py +252 -0
  187. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +5 -0
  188. cognee/tests/unit/infrastructure/databases/test_index_data_points.py +27 -0
  189. cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +14 -16
  190. cognee/tests/unit/infrastructure/llm/test_llm_config.py +46 -0
  191. cognee/tests/unit/infrastructure/mock_embedding_engine.py +3 -7
  192. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +0 -5
  193. cognee/tests/unit/modules/chunking/test_text_chunker.py +248 -0
  194. cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py +324 -0
  195. cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
  196. cognee/tests/unit/modules/graph/cognee_graph_test.py +406 -0
  197. cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +111 -0
  198. cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py +175 -0
  199. cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +214 -0
  200. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -51
  201. cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +1 -0
  202. cognee/tests/unit/modules/retrieval/structured_output_test.py +204 -0
  203. cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +1 -1
  204. cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +0 -1
  205. cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +608 -0
  206. cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +83 -0
  207. cognee/tests/unit/modules/users/test_conditional_authentication.py +0 -63
  208. cognee/tests/unit/processing/chunks/chunk_by_row_test.py +52 -0
  209. cognee/tests/unit/tasks/storage/test_add_data_points.py +288 -0
  210. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/METADATA +11 -6
  211. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/RECORD +215 -163
  212. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/WHEEL +1 -1
  213. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/entry_points.txt +0 -1
  214. cognee/api/v1/cognify/code_graph_pipeline.py +0 -119
  215. cognee/api/v1/cognify/routers/get_code_pipeline_router.py +0 -90
  216. cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +0 -544
  217. cognee/modules/retrieval/code_retriever.py +0 -232
  218. cognee/tasks/code/enrich_dependency_graph_checker.py +0 -35
  219. cognee/tasks/code/get_local_dependencies_checker.py +0 -20
  220. cognee/tasks/code/get_repo_dependency_graph_checker.py +0 -35
  221. cognee/tasks/documents/check_permissions_on_dataset.py +0 -26
  222. cognee/tasks/repo_processor/__init__.py +0 -2
  223. cognee/tasks/repo_processor/get_local_dependencies.py +0 -335
  224. cognee/tasks/repo_processor/get_non_code_files.py +0 -158
  225. cognee/tasks/repo_processor/get_repo_file_dependencies.py +0 -243
  226. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/LICENSE +0 -0
  227. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/NOTICE.md +0 -0

cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py
@@ -0,0 +1,55 @@
+ from typing import Optional, List
+
+ from cognee import memify
+ from cognee.context_global_variables import (
+     set_database_global_context_variables,
+     set_session_user_context_variable,
+ )
+ from cognee.exceptions import CogneeValidationError
+ from cognee.modules.data.methods import get_authorized_existing_datasets
+ from cognee.shared.logging_utils import get_logger
+ from cognee.modules.pipelines.tasks.task import Task
+ from cognee.modules.users.models import User
+ from cognee.tasks.memify import extract_user_sessions, cognify_session
+
+
+ logger = get_logger("persist_sessions_in_knowledge_graph")
+
+
+ async def persist_sessions_in_knowledge_graph_pipeline(
+     user: User,
+     session_ids: Optional[List[str]] = None,
+     dataset: str = "main_dataset",
+     run_in_background: bool = False,
+ ):
+     await set_session_user_context_variable(user)
+     dataset_to_write = await get_authorized_existing_datasets(
+         user=user, datasets=[dataset], permission_type="write"
+     )
+
+     if not dataset_to_write:
+         raise CogneeValidationError(
+             message=f"User (id: {str(user.id)}) does not have write access to dataset: {dataset}",
+             log=False,
+         )
+
+     await set_database_global_context_variables(
+         dataset_to_write[0].id, dataset_to_write[0].owner_id
+     )
+
+     extraction_tasks = [Task(extract_user_sessions, session_ids=session_ids)]
+
+     enrichment_tasks = [
+         Task(cognify_session, dataset_id=dataset_to_write[0].id),
+     ]
+
+     result = await memify(
+         extraction_tasks=extraction_tasks,
+         enrichment_tasks=enrichment_tasks,
+         dataset=dataset_to_write[0].id,
+         data=[{}],
+         run_in_background=run_in_background,
+     )
+
+     logger.info("Session persistence pipeline completed")
+     return result
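
A minimal usage sketch for the new pipeline above. The module path follows the file list (cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py); get_default_user is an existing cognee helper, and the argument values are illustrative:

    import asyncio

    from cognee.memify_pipelines.persist_sessions_in_knowledge_graph import (
        persist_sessions_in_knowledge_graph_pipeline,
    )
    from cognee.modules.users.methods import get_default_user


    async def main():
        # Any User with write access to the target dataset works here.
        user = await get_default_user()

        # session_ids=None persists all available sessions into "main_dataset".
        await persist_sessions_in_knowledge_graph_pipeline(
            user=user,
            session_ids=None,
            dataset="main_dataset",
            run_in_background=False,
        )


    asyncio.run(main())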

cognee/modules/chunking/CsvChunker.py
@@ -0,0 +1,35 @@
+ from cognee.shared.logging_utils import get_logger
+
+
+ from cognee.tasks.chunks import chunk_by_row
+ from cognee.modules.chunking.Chunker import Chunker
+ from .models.DocumentChunk import DocumentChunk
+
+ logger = get_logger()
+
+
+ class CsvChunker(Chunker):
+     async def read(self):
+         async for content_text in self.get_text():
+             if content_text is None:
+                 continue
+
+             for chunk_data in chunk_by_row(content_text, self.max_chunk_size):
+                 if chunk_data["chunk_size"] <= self.max_chunk_size:
+                     yield DocumentChunk(
+                         id=chunk_data["chunk_id"],
+                         text=chunk_data["text"],
+                         chunk_size=chunk_data["chunk_size"],
+                         is_part_of=self.document,
+                         chunk_index=self.chunk_index,
+                         cut_type=chunk_data["cut_type"],
+                         contains=[],
+                         metadata={
+                             "index_fields": ["text"],
+                         },
+                     )
+                     self.chunk_index += 1
+                 else:
+                     raise ValueError(
+                         f"Chunk size is larger than the maximum chunk size {self.max_chunk_size}"
+                     )
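
CsvChunker delegates row splitting to the new chunk_by_row helper (cognee/tasks/chunks/chunk_by_row.py, +94 lines, not shown in this excerpt). A stand-in sketch of the contract the chunker consumes — dicts carrying chunk_id, text, chunk_size, and cut_type; the sizing and id logic below are assumptions, not the shipped implementation:

    from uuid import NAMESPACE_OID, uuid5


    def chunk_by_row_stub(content_text: str, max_chunk_size: int):
        """Yield per-row chunk dicts with the keys CsvChunker.read() expects."""
        for index, row in enumerate(content_text.splitlines()):
            yield {
                "chunk_id": uuid5(NAMESPACE_OID, f"{index}-{row}"),  # assumed id scheme
                "text": row,
                "chunk_size": len(row),  # assumption: size measured in characters
                "cut_type": "row_end",  # assumption: placeholder cut type
            }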

cognee/modules/chunking/models/DocumentChunk.py
@@ -1,6 +1,7 @@
  from typing import List, Union

  from cognee.infrastructure.engine import DataPoint
+ from cognee.infrastructure.engine.models.Edge import Edge
  from cognee.modules.data.processing.document_types import Document
  from cognee.modules.engine.models import Entity
  from cognee.tasks.temporal_graph.models import Event
@@ -31,6 +32,6 @@ class DocumentChunk(DataPoint):
      chunk_index: int
      cut_type: str
      is_part_of: Document
-     contains: List[Union[Entity, Event]] = None
+     contains: List[Union[Entity, Event, tuple[Edge, Entity]]] = None

      metadata: dict = {"index_fields": ["text"]}

cognee/modules/chunking/text_chunker_with_overlap.py
@@ -0,0 +1,124 @@
+ from cognee.shared.logging_utils import get_logger
+ from uuid import NAMESPACE_OID, uuid5
+
+ from cognee.tasks.chunks import chunk_by_paragraph
+ from cognee.modules.chunking.Chunker import Chunker
+ from .models.DocumentChunk import DocumentChunk
+
+ logger = get_logger()
+
+
+ class TextChunkerWithOverlap(Chunker):
+     def __init__(
+         self,
+         document,
+         get_text: callable,
+         max_chunk_size: int,
+         chunk_overlap_ratio: float = 0.0,
+         get_chunk_data: callable = None,
+     ):
+         super().__init__(document, get_text, max_chunk_size)
+         self._accumulated_chunk_data = []
+         self._accumulated_size = 0
+         self.chunk_overlap_ratio = chunk_overlap_ratio
+         self.chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)
+
+         if get_chunk_data is not None:
+             self.get_chunk_data = get_chunk_data
+         elif chunk_overlap_ratio > 0:
+             paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)
+             self.get_chunk_data = lambda text: chunk_by_paragraph(
+                 text, paragraph_max_size, batch_paragraphs=True
+             )
+         else:
+             self.get_chunk_data = lambda text: chunk_by_paragraph(
+                 text, self.max_chunk_size, batch_paragraphs=True
+             )
+
+     def _accumulation_overflows(self, chunk_data):
+         """Check if adding chunk_data would exceed max_chunk_size."""
+         return self._accumulated_size + chunk_data["chunk_size"] > self.max_chunk_size
+
+     def _accumulate_chunk_data(self, chunk_data):
+         """Add chunk_data to the current accumulation."""
+         self._accumulated_chunk_data.append(chunk_data)
+         self._accumulated_size += chunk_data["chunk_size"]
+
+     def _clear_accumulation(self):
+         """Reset accumulation, keeping overlap chunk_data based on chunk_overlap_ratio."""
+         if self.chunk_overlap == 0:
+             self._accumulated_chunk_data = []
+             self._accumulated_size = 0
+             return
+
+         # Keep chunk_data from the end that fit in overlap
+         overlap_chunk_data = []
+         overlap_size = 0
+
+         for chunk_data in reversed(self._accumulated_chunk_data):
+             if overlap_size + chunk_data["chunk_size"] <= self.chunk_overlap:
+                 overlap_chunk_data.insert(0, chunk_data)
+                 overlap_size += chunk_data["chunk_size"]
+             else:
+                 break
+
+         self._accumulated_chunk_data = overlap_chunk_data
+         self._accumulated_size = overlap_size
+
+     def _create_chunk(self, text, size, cut_type, chunk_id=None):
+         """Create a DocumentChunk with standard metadata."""
+         try:
+             return DocumentChunk(
+                 id=chunk_id or uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
+                 text=text,
+                 chunk_size=size,
+                 is_part_of=self.document,
+                 chunk_index=self.chunk_index,
+                 cut_type=cut_type,
+                 contains=[],
+                 metadata={"index_fields": ["text"]},
+             )
+         except Exception as e:
+             logger.error(e)
+             raise e
+
+     def _create_chunk_from_accumulation(self):
+         """Create a DocumentChunk from current accumulated chunk_data."""
+         chunk_text = " ".join(chunk["text"] for chunk in self._accumulated_chunk_data)
+         return self._create_chunk(
+             text=chunk_text,
+             size=self._accumulated_size,
+             cut_type=self._accumulated_chunk_data[-1]["cut_type"],
+         )
+
+     def _emit_chunk(self, chunk_data):
+         """Emit a chunk when accumulation overflows."""
+         if len(self._accumulated_chunk_data) > 0:
+             chunk = self._create_chunk_from_accumulation()
+             self._clear_accumulation()
+             self._accumulate_chunk_data(chunk_data)
+         else:
+             # Handle single chunk_data exceeding max_chunk_size
+             chunk = self._create_chunk(
+                 text=chunk_data["text"],
+                 size=chunk_data["chunk_size"],
+                 cut_type=chunk_data["cut_type"],
+                 chunk_id=chunk_data["chunk_id"],
+             )
+
+         self.chunk_index += 1
+         return chunk
+
+     async def read(self):
+         async for content_text in self.get_text():
+             for chunk_data in self.get_chunk_data(content_text):
+                 if not self._accumulation_overflows(chunk_data):
+                     self._accumulate_chunk_data(chunk_data)
+                     continue
+
+                 yield self._emit_chunk(chunk_data)
+
+         if len(self._accumulated_chunk_data) == 0:
+             return
+
+         yield self._create_chunk_from_accumulation()
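
The overlap arithmetic in TextChunkerWithOverlap.__init__ is worth tracing once with concrete numbers (the values below are illustrative, not defaults):

    max_chunk_size = 1024
    chunk_overlap_ratio = 0.2

    # Tail of each emitted chunk that is carried over into the next chunk.
    chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)

    # With overlap enabled and no custom get_chunk_data, paragraphs are
    # pre-split to half the overlap budget so several whole paragraphs
    # can fit in the carried-over tail.
    paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)

    assert chunk_overlap == 204
    assert paragraph_max_size == 102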

cognee/modules/cognify/config.py
@@ -8,12 +8,14 @@ import os
  class CognifyConfig(BaseSettings):
      classification_model: object = DefaultContentPrediction
      summarization_model: object = SummarizedContent
+     triplet_embedding: bool = False
      model_config = SettingsConfigDict(env_file=".env", extra="allow")

      def to_dict(self) -> dict:
          return {
              "classification_model": self.classification_model,
              "summarization_model": self.summarization_model,
+             "triplet_embedding": self.triplet_embedding,
          }

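
Since CognifyConfig is a pydantic BaseSettings subclass (env_file=".env", extra="allow"), the new flag should also be controllable from the environment. A sketch, assuming CognifyConfig can be instantiated directly (cognee typically exposes it via a get_cognify_config helper):

    import os

    # Field names map to environment variables case-insensitively in pydantic-settings.
    os.environ["TRIPLET_EMBEDDING"] = "true"

    from cognee.modules.cognify.config import CognifyConfig

    config = CognifyConfig()
    assert config.triplet_embedding is True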

cognee/modules/data/deletion/prune_system.py
@@ -1,17 +1,67 @@
+ from sqlalchemy.exc import OperationalError
+
+ from cognee.infrastructure.databases.exceptions import EntityNotFoundError
+ from cognee.context_global_variables import backend_access_control_enabled
  from cognee.infrastructure.databases.vector import get_vector_engine
  from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
  from cognee.infrastructure.databases.relational import get_relational_engine
+ from cognee.infrastructure.databases.utils import (
+     get_graph_dataset_database_handler,
+     get_vector_dataset_database_handler,
+ )
  from cognee.shared.cache import delete_cache
+ from cognee.modules.users.models import DatasetDatabase
+ from cognee.shared.logging_utils import get_logger
+
+ logger = get_logger()
+
+
+ async def prune_graph_databases():
+     db_engine = get_relational_engine()
+     try:
+         dataset_databases = await db_engine.get_all_data_from_table("dataset_database")
+         # Go through each dataset database and delete the graph database
+         for dataset_database in dataset_databases:
+             handler = get_graph_dataset_database_handler(dataset_database)
+             await handler["handler_instance"].delete_dataset(dataset_database)
+     except (OperationalError, EntityNotFoundError) as e:
+         logger.debug(
+             "Skipping pruning of graph DB. Error when accessing dataset_database table: %s",
+             e,
+         )
+         return
+
+
+ async def prune_vector_databases():
+     db_engine = get_relational_engine()
+     try:
+         dataset_databases = await db_engine.get_all_data_from_table("dataset_database")
+         # Go through each dataset database and delete the vector database
+         for dataset_database in dataset_databases:
+             handler = get_vector_dataset_database_handler(dataset_database)
+             await handler["handler_instance"].delete_dataset(dataset_database)
+     except (OperationalError, EntityNotFoundError) as e:
+         logger.debug(
+             "Skipping pruning of vector DB. Error when accessing dataset_database table: %s",
+             e,
+         )
+         return


  async def prune_system(graph=True, vector=True, metadata=True, cache=True):
-     if graph:
+     # Note: prune system should not be available through the API, it has no permission checks and will
+     # delete all graph and vector databases if called. It should only be used in development or testing environments.
+     if graph and not backend_access_control_enabled():
          graph_engine = await get_graph_engine()
          await graph_engine.delete_graph()
+     elif graph and backend_access_control_enabled():
+         await prune_graph_databases()

-     if vector:
+     if vector and not backend_access_control_enabled():
          vector_engine = get_vector_engine()
          await vector_engine.prune()
+     elif vector and backend_access_control_enabled():
+         await prune_vector_databases()

      if metadata:
          db_engine = get_relational_engine()
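
The new note in prune_system is worth heeding: with backend access control enabled it now iterates the dataset_database table and deletes every per-dataset graph and vector store. A sketch of the intended development/testing-only usage, assuming the long-standing cognee.prune entry points:

    import asyncio

    import cognee


    async def reset_local_stores():
        # Development and testing only: prune has no permission checks.
        await cognee.prune.prune_data()
        await cognee.prune.prune_system(graph=True, vector=True, metadata=True, cache=True)


    asyncio.run(reset_local_stores())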

cognee/modules/data/methods/__init__.py
@@ -10,6 +10,7 @@ from .get_authorized_dataset import get_authorized_dataset
  from .get_authorized_dataset_by_name import get_authorized_dataset_by_name
  from .get_data import get_data
  from .get_unique_dataset_id import get_unique_dataset_id
+ from .get_unique_data_id import get_unique_data_id
  from .get_authorized_existing_datasets import get_authorized_existing_datasets
  from .get_dataset_ids import get_dataset_ids


cognee/modules/data/methods/create_dataset.py
@@ -16,14 +16,16 @@ async def create_dataset(dataset_name: str, user: User, session: AsyncSession) -
              .options(joinedload(Dataset.data))
              .filter(Dataset.name == dataset_name)
              .filter(Dataset.owner_id == owner_id)
+             .filter(Dataset.tenant_id == user.tenant_id)
          )
      ).first()

      if dataset is None:
          # Dataset id should be generated based on dataset_name and owner_id/user so multiple users can use the same dataset_name
          dataset_id = await get_unique_dataset_id(dataset_name=dataset_name, user=user)
-         dataset = Dataset(id=dataset_id, name=dataset_name, data=[])
-         dataset.owner_id = owner_id
+         dataset = Dataset(
+             id=dataset_id, name=dataset_name, data=[], owner_id=owner_id, tenant_id=user.tenant_id
+         )

          session.add(dataset)

cognee/modules/data/methods/delete_dataset.py
@@ -1,8 +1,34 @@
+ from cognee.modules.users.models import DatasetDatabase
+ from sqlalchemy import select
+
  from cognee.modules.data.models import Dataset
+ from cognee.infrastructure.databases.utils.get_vector_dataset_database_handler import (
+     get_vector_dataset_database_handler,
+ )
+ from cognee.infrastructure.databases.utils.get_graph_dataset_database_handler import (
+     get_graph_dataset_database_handler,
+ )
  from cognee.infrastructure.databases.relational import get_relational_engine


  async def delete_dataset(dataset: Dataset):
      db_engine = get_relational_engine()

+     async with db_engine.get_async_session() as session:
+         stmt = select(DatasetDatabase).where(
+             DatasetDatabase.dataset_id == dataset.id,
+         )
+         dataset_database: DatasetDatabase = await session.scalar(stmt)
+         if dataset_database:
+             graph_dataset_database_handler = get_graph_dataset_database_handler(dataset_database)
+             vector_dataset_database_handler = get_vector_dataset_database_handler(dataset_database)
+             await graph_dataset_database_handler["handler_instance"].delete_dataset(
+                 dataset_database
+             )
+             await vector_dataset_database_handler["handler_instance"].delete_dataset(
+                 dataset_database
+             )
+     # TODO: Remove dataset from pipeline_run_status in Data objects related to dataset as well
+     # This blocks recreation of the dataset with the same name and data after deletion as
+     # it's marked as completed and will be just skipped even though it's empty.
      return await db_engine.delete_entity_by_id(dataset.__tablename__, dataset.id)

cognee/modules/data/methods/get_dataset_ids.py
@@ -27,7 +27,11 @@ async def get_dataset_ids(datasets: Union[list[str], list[UUID]], user):
          # Get all user owned dataset objects (If a user wants to write to a dataset he is not the owner of it must be provided through UUID.)
          user_datasets = await get_datasets(user.id)
          # Filter out non name mentioned datasets
-         dataset_ids = [dataset.id for dataset in user_datasets if dataset.name in datasets]
+         dataset_ids = [dataset for dataset in user_datasets if dataset.name in datasets]
+         # Filter out non current tenant datasets
+         dataset_ids = [
+             dataset.id for dataset in dataset_ids if dataset.tenant_id == user.tenant_id
+         ]
      else:
          raise DatasetTypeError(
              f"One or more of the provided dataset types is not handled: f{datasets}"

cognee/modules/data/methods/get_unique_data_id.py
@@ -0,0 +1,68 @@
+ from uuid import uuid5, NAMESPACE_OID, UUID
+ from sqlalchemy import select
+
+ from cognee.modules.data.models.Data import Data
+ from cognee.infrastructure.databases.relational import get_relational_engine
+ from cognee.modules.users.models import User
+
+
+ async def get_unique_data_id(data_identifier: str, user: User) -> UUID:
+     """
+     Function returns a unique UUID for data based on data identifier, user id and tenant id.
+     If data with legacy ID exists, return that ID to maintain compatibility.
+
+     Args:
+         data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+         user: User object adding the data
+         tenant_id: UUID of the tenant for which data is being added
+
+     Returns:
+         UUID: Unique identifier for the data
+     """
+
+     def _get_deprecated_unique_data_id(data_identifier: str, user: User) -> UUID:
+         """
+         Deprecated function, returns a unique UUID for data based on data identifier and user id.
+         Needed to support legacy data without tenant information.
+         Args:
+             data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+             user: User object adding the data
+
+         Returns:
+             UUID: Unique identifier for the data
+         """
+         # return UUID hash of file contents + owner id + tenant_id
+         return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}")
+
+     def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID:
+         """
+         Function returns a unique UUID for data based on data identifier, user id and tenant id.
+         Args:
+             data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+             user: User object adding the data
+             tenant_id: UUID of the tenant for which data is being added
+
+         Returns:
+             UUID: Unique identifier for the data
+         """
+         # return UUID hash of file contents + owner id + tenant_id
+         return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}")
+
+     # Get all possible data_id values
+     data_id = {
+         "modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user),
+         "legacy_data_id": _get_deprecated_unique_data_id(
+             data_identifier=data_identifier, user=user
+         ),
+     }
+
+     # Check if data item with legacy_data_id exists, if so use that one, else use modern_data_id
+     db_engine = get_relational_engine()
+     async with db_engine.get_async_session() as session:
+         legacy_data_point = (
+             await session.execute(select(Data).filter(Data.id == data_id["legacy_data_id"]))
+         ).scalar_one_or_none()
+
+     if not legacy_data_point:
+         return data_id["modern_data_id"]
+     return data_id["legacy_data_id"]

cognee/modules/data/methods/get_unique_dataset_id.py
@@ -1,9 +1,71 @@
  from uuid import UUID, uuid5, NAMESPACE_OID
- from cognee.modules.users.models import User
  from typing import Union
+ from sqlalchemy import select
+
+ from cognee.modules.data.models.Dataset import Dataset
+ from cognee.modules.users.models import User
+ from cognee.infrastructure.databases.relational import get_relational_engine


  async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
-     if isinstance(dataset_name, UUID):
-         return dataset_name
-     return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
+     """
+     Function returns a unique UUID for dataset based on dataset name, user id and tenant id.
+     If dataset with legacy ID exists, return that ID to maintain compatibility.
+
+     Args:
+         dataset_name: string representing the dataset name
+         user: User object adding the dataset
+         tenant_id: UUID of the tenant for which dataset is being added
+
+     Returns:
+         UUID: Unique identifier for the dataset
+     """
+
+     def _get_legacy_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+         """
+         Legacy function, returns a unique UUID for dataset based on dataset name and user id.
+         Needed to support legacy datasets without tenant information.
+         Args:
+             dataset_name: string representing the dataset name
+             user: Current User object adding the dataset
+
+         Returns:
+             UUID: Unique identifier for the dataset
+         """
+         if isinstance(dataset_name, UUID):
+             return dataset_name
+         return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
+
+     def _get_modern_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+         """
+         Returns a unique UUID for dataset based on dataset name, user id and tenant_id.
+         Args:
+             dataset_name: string representing the dataset name
+             user: Current User object adding the dataset
+             tenant_id: UUID of the tenant for which dataset is being added
+
+         Returns:
+             UUID: Unique identifier for the dataset
+         """
+         if isinstance(dataset_name, UUID):
+             return dataset_name
+         return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}")
+
+     # Get all possible dataset_id values
+     dataset_id = {
+         "modern_dataset_id": _get_modern_unique_dataset_id(dataset_name=dataset_name, user=user),
+         "legacy_dataset_id": _get_legacy_unique_dataset_id(dataset_name=dataset_name, user=user),
+     }
+
+     # Check if dataset with legacy_dataset_id exists, if so use that one, else use modern_dataset_id
+     db_engine = get_relational_engine()
+     async with db_engine.get_async_session() as session:
+         legacy_dataset = (
+             await session.execute(
+                 select(Dataset).filter(Dataset.id == dataset_id["legacy_dataset_id"])
+             )
+         ).scalar_one_or_none()
+
+     if not legacy_dataset:
+         return dataset_id["modern_dataset_id"]
+     return dataset_id["legacy_dataset_id"]
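
Both ID helpers follow the same deterministic uuid5 scheme; the tenant-aware "modern" ID simply mixes user.tenant_id into the hashed name. A self-contained sketch with placeholder IDs:

    from uuid import NAMESPACE_OID, UUID, uuid5

    # Placeholder ids; real values come from the User record.
    user_id = UUID("11111111-1111-1111-1111-111111111111")
    tenant_id = UUID("22222222-2222-2222-2222-222222222222")
    dataset_name = "main_dataset"

    # Legacy id: derived from the dataset name and owner only.
    legacy_id = uuid5(NAMESPACE_OID, f"{dataset_name}{user_id}")

    # Modern id: also mixes in the tenant, so the same dataset name is
    # distinct per tenant while staying deterministic.
    modern_id = uuid5(NAMESPACE_OID, f"{dataset_name}{user_id}{tenant_id}")

    assert legacy_id != modern_id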

cognee/modules/data/models/Dataset.py
@@ -18,6 +18,7 @@ class Dataset(Base):
      updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

      owner_id = Column(UUID, index=True)
+     tenant_id = Column(UUID, index=True, nullable=True)

      acls = relationship("ACL", back_populates="dataset", cascade="all, delete-orphan")

@@ -36,5 +37,6 @@ class Dataset(Base):
          "createdAt": self.created_at.isoformat(),
          "updatedAt": self.updated_at.isoformat() if self.updated_at else None,
          "ownerId": str(self.owner_id),
+         "tenantId": str(self.tenant_id),
          "data": [data.to_json() for data in self.data],
      }

cognee/modules/data/processing/document_types/CsvDocument.py
@@ -0,0 +1,33 @@
+ import io
+ import csv
+ from typing import Type
+
+ from cognee.modules.chunking.Chunker import Chunker
+ from cognee.infrastructure.files.utils.open_data_file import open_data_file
+ from .Document import Document
+
+
+ class CsvDocument(Document):
+     type: str = "csv"
+     mime_type: str = "text/csv"
+
+     async def read(self, chunker_cls: Type[Chunker], max_chunk_size: int):
+         async def get_text():
+             async with open_data_file(
+                 self.raw_data_location, mode="r", encoding="utf-8", newline=""
+             ) as file:
+                 content = file.read()
+                 file_like_obj = io.StringIO(content)
+                 reader = csv.DictReader(file_like_obj)
+
+                 for row in reader:
+                     pairs = [f"{str(k)}: {str(v)}" for k, v in row.items()]
+                     row_text = ", ".join(pairs)
+                     if not row_text.strip():
+                         break
+                     yield row_text
+
+         chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=get_text)
+
+         async for chunk in chunker.read():
+             yield chunk
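
The row-to-text rendering in CsvDocument.read() can be reproduced with the standard library alone; each row becomes one "key: value" line that is then handed to the chunker:

    import csv
    import io

    sample = "name,role\nAda,engineer\nGrace,admiral\n"

    for row in csv.DictReader(io.StringIO(sample)):
        print(", ".join(f"{k}: {v}" for k, v in row.items()))
    # name: Ada, role: engineer
    # name: Grace, role: admiral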

cognee/modules/data/processing/document_types/__init__.py
@@ -4,3 +4,4 @@ from .TextDocument import TextDocument
  from .ImageDocument import ImageDocument
  from .AudioDocument import AudioDocument
  from .UnstructuredDocument import UnstructuredDocument
+ from .CsvDocument import CsvDocument

cognee/modules/engine/models/Triplet.py
@@ -0,0 +1,9 @@
+ from cognee.infrastructure.engine import DataPoint
+
+
+ class Triplet(DataPoint):
+     text: str
+     from_node_id: str
+     to_node_id: str
+
+     metadata: dict = {"index_fields": ["text"]}
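
Constructing the new Triplet data point is straightforward; a sketch, assuming DataPoint supplies defaults for its base fields (id, timestamps) and that the model is re-exported from cognee.modules.engine.models, as the __init__ diff below indicates:

    from cognee.modules.engine.models import Triplet

    # "text" is the embedded/indexed field per the model's metadata; the node
    # ids tie the embedding back to the graph edge it describes.
    triplet = Triplet(
        text="Ada Lovelace -- wrote notes on -- Analytical Engine",
        from_node_id="ada-lovelace",
        to_node_id="analytical-engine",
    )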

cognee/modules/engine/models/__init__.py
@@ -7,3 +7,4 @@ from .ColumnValue import ColumnValue
  from .Timestamp import Timestamp
  from .Interval import Interval
  from .Event import Event
+ from .Triplet import Triplet