cognee 0.4.1__py3-none-any.whl → 0.5.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. cognee/__init__.py +1 -0
  2. cognee/api/client.py +8 -0
  3. cognee/api/v1/add/routers/get_add_router.py +3 -1
  4. cognee/api/v1/cognify/routers/get_cognify_router.py +28 -1
  5. cognee/api/v1/ontologies/__init__.py +4 -0
  6. cognee/api/v1/ontologies/ontologies.py +183 -0
  7. cognee/api/v1/ontologies/routers/__init__.py +0 -0
  8. cognee/api/v1/ontologies/routers/get_ontology_router.py +107 -0
  9. cognee/api/v1/permissions/routers/get_permissions_router.py +41 -1
  10. cognee/cli/commands/cognify_command.py +8 -1
  11. cognee/cli/config.py +1 -1
  12. cognee/context_global_variables.py +41 -9
  13. cognee/infrastructure/databases/cache/config.py +3 -1
  14. cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +151 -0
  15. cognee/infrastructure/databases/cache/get_cache_engine.py +20 -10
  16. cognee/infrastructure/databases/exceptions/exceptions.py +16 -0
  17. cognee/infrastructure/databases/graph/config.py +4 -0
  18. cognee/infrastructure/databases/graph/get_graph_engine.py +2 -0
  19. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +9 -0
  20. cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +37 -3
  21. cognee/infrastructure/databases/vector/config.py +3 -0
  22. cognee/infrastructure/databases/vector/create_vector_engine.py +5 -1
  23. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +1 -4
  24. cognee/infrastructure/engine/models/Edge.py +13 -1
  25. cognee/infrastructure/files/utils/guess_file_type.py +4 -0
  26. cognee/infrastructure/llm/config.py +2 -0
  27. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +5 -2
  28. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +7 -1
  29. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +7 -1
  30. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +8 -16
  31. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +12 -2
  32. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +13 -2
  33. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +5 -2
  34. cognee/infrastructure/loaders/LoaderEngine.py +1 -0
  35. cognee/infrastructure/loaders/core/__init__.py +2 -1
  36. cognee/infrastructure/loaders/core/csv_loader.py +93 -0
  37. cognee/infrastructure/loaders/core/text_loader.py +1 -2
  38. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +0 -9
  39. cognee/infrastructure/loaders/supported_loaders.py +2 -1
  40. cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +55 -0
  41. cognee/modules/chunking/CsvChunker.py +35 -0
  42. cognee/modules/chunking/models/DocumentChunk.py +2 -1
  43. cognee/modules/chunking/text_chunker_with_overlap.py +124 -0
  44. cognee/modules/data/methods/__init__.py +1 -0
  45. cognee/modules/data/methods/create_dataset.py +4 -2
  46. cognee/modules/data/methods/get_dataset_ids.py +5 -1
  47. cognee/modules/data/methods/get_unique_data_id.py +68 -0
  48. cognee/modules/data/methods/get_unique_dataset_id.py +66 -4
  49. cognee/modules/data/models/Dataset.py +2 -0
  50. cognee/modules/data/processing/document_types/CsvDocument.py +33 -0
  51. cognee/modules/data/processing/document_types/__init__.py +1 -0
  52. cognee/modules/graph/cognee_graph/CogneeGraph.py +4 -2
  53. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +19 -2
  54. cognee/modules/graph/utils/resolve_edges_to_text.py +48 -49
  55. cognee/modules/ingestion/identify.py +4 -4
  56. cognee/modules/notebooks/operations/run_in_local_sandbox.py +3 -0
  57. cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +55 -23
  58. cognee/modules/pipelines/operations/run_tasks_data_item.py +1 -1
  59. cognee/modules/retrieval/EntityCompletionRetriever.py +10 -3
  60. cognee/modules/retrieval/base_graph_retriever.py +7 -3
  61. cognee/modules/retrieval/base_retriever.py +7 -3
  62. cognee/modules/retrieval/completion_retriever.py +11 -4
  63. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +6 -2
  64. cognee/modules/retrieval/graph_completion_cot_retriever.py +14 -51
  65. cognee/modules/retrieval/graph_completion_retriever.py +4 -1
  66. cognee/modules/retrieval/temporal_retriever.py +9 -2
  67. cognee/modules/retrieval/utils/brute_force_triplet_search.py +1 -1
  68. cognee/modules/retrieval/utils/completion.py +2 -22
  69. cognee/modules/run_custom_pipeline/__init__.py +1 -0
  70. cognee/modules/run_custom_pipeline/run_custom_pipeline.py +69 -0
  71. cognee/modules/search/methods/search.py +5 -3
  72. cognee/modules/users/methods/create_user.py +12 -27
  73. cognee/modules/users/methods/get_authenticated_user.py +2 -1
  74. cognee/modules/users/methods/get_default_user.py +4 -2
  75. cognee/modules/users/methods/get_user.py +1 -1
  76. cognee/modules/users/methods/get_user_by_email.py +1 -1
  77. cognee/modules/users/models/DatasetDatabase.py +9 -0
  78. cognee/modules/users/models/Tenant.py +6 -7
  79. cognee/modules/users/models/User.py +6 -5
  80. cognee/modules/users/models/UserTenant.py +12 -0
  81. cognee/modules/users/models/__init__.py +1 -0
  82. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +13 -13
  83. cognee/modules/users/roles/methods/add_user_to_role.py +3 -1
  84. cognee/modules/users/tenants/methods/__init__.py +1 -0
  85. cognee/modules/users/tenants/methods/add_user_to_tenant.py +21 -12
  86. cognee/modules/users/tenants/methods/create_tenant.py +22 -8
  87. cognee/modules/users/tenants/methods/select_tenant.py +62 -0
  88. cognee/shared/logging_utils.py +2 -0
  89. cognee/tasks/chunks/__init__.py +1 -0
  90. cognee/tasks/chunks/chunk_by_row.py +94 -0
  91. cognee/tasks/documents/classify_documents.py +2 -0
  92. cognee/tasks/feedback/generate_improved_answers.py +3 -3
  93. cognee/tasks/ingestion/ingest_data.py +1 -1
  94. cognee/tasks/memify/__init__.py +2 -0
  95. cognee/tasks/memify/cognify_session.py +41 -0
  96. cognee/tasks/memify/extract_user_sessions.py +73 -0
  97. cognee/tasks/storage/index_data_points.py +33 -22
  98. cognee/tasks/storage/index_graph_edges.py +37 -57
  99. cognee/tests/integration/documents/CsvDocument_test.py +70 -0
  100. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +1 -1
  101. cognee/tests/test_add_docling_document.py +2 -2
  102. cognee/tests/test_cognee_server_start.py +84 -1
  103. cognee/tests/test_conversation_history.py +45 -4
  104. cognee/tests/test_data/example_with_header.csv +3 -0
  105. cognee/tests/test_delete_bmw_example.py +60 -0
  106. cognee/tests/test_edge_ingestion.py +27 -0
  107. cognee/tests/test_feedback_enrichment.py +1 -1
  108. cognee/tests/test_library.py +6 -4
  109. cognee/tests/test_load.py +62 -0
  110. cognee/tests/test_multi_tenancy.py +165 -0
  111. cognee/tests/test_parallel_databases.py +2 -0
  112. cognee/tests/test_relational_db_migration.py +54 -2
  113. cognee/tests/test_search_db.py +7 -1
  114. cognee/tests/unit/api/test_conditional_authentication_endpoints.py +12 -3
  115. cognee/tests/unit/api/test_ontology_endpoint.py +264 -0
  116. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +5 -0
  117. cognee/tests/unit/infrastructure/databases/test_index_data_points.py +27 -0
  118. cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +14 -16
  119. cognee/tests/unit/modules/chunking/test_text_chunker.py +248 -0
  120. cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py +324 -0
  121. cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +111 -0
  122. cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py +175 -0
  123. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -51
  124. cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +1 -0
  125. cognee/tests/unit/modules/retrieval/structured_output_test.py +204 -0
  126. cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +1 -1
  127. cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +0 -1
  128. cognee/tests/unit/modules/users/test_conditional_authentication.py +0 -63
  129. cognee/tests/unit/processing/chunks/chunk_by_row_test.py +52 -0
  130. {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/METADATA +88 -71
  131. {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/RECORD +135 -104
  132. {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/WHEEL +1 -1
  133. {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/entry_points.txt +0 -1
  134. {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/LICENSE +0 -0
  135. {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/NOTICE.md +0 -0

cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py
@@ -42,8 +42,16 @@ class OllamaAPIAdapter(LLMInterface):
     - aclient
     """

+    default_instructor_mode = "json_mode"
+
     def __init__(
-        self, endpoint: str, api_key: str, model: str, name: str, max_completion_tokens: int
+        self,
+        endpoint: str,
+        api_key: str,
+        model: str,
+        name: str,
+        max_completion_tokens: int,
+        instructor_mode: str = None,
     ):
         self.name = name
         self.model = model
@@ -51,8 +59,11 @@ class OllamaAPIAdapter(LLMInterface):
         self.endpoint = endpoint
         self.max_completion_tokens = max_completion_tokens

+        self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode
+
         self.aclient = instructor.from_openai(
-            OpenAI(base_url=self.endpoint, api_key=self.api_key), mode=instructor.Mode.JSON
+            OpenAI(base_url=self.endpoint, api_key=self.api_key),
+            mode=instructor.Mode(self.instructor_mode),
         )

     @retry(

cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py
@@ -56,6 +56,7 @@ class OpenAIAdapter(LLMInterface):
     model: str
     api_key: str
     api_version: str
+    default_instructor_mode = "json_schema_mode"

     MAX_RETRIES = 5

@@ -69,19 +70,21 @@ class OpenAIAdapter(LLMInterface):
         model: str,
         transcription_model: str,
         max_completion_tokens: int,
+        instructor_mode: str = None,
         streaming: bool = False,
         fallback_model: str = None,
         fallback_api_key: str = None,
         fallback_endpoint: str = None,
     ):
+        self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode
         # TODO: With gpt5 series models OpenAI expects JSON_SCHEMA as a mode for structured outputs.
         # Make sure all new gpt models will work with this mode as well.
         if "gpt-5" in model:
             self.aclient = instructor.from_litellm(
-                litellm.acompletion, mode=instructor.Mode.JSON_SCHEMA
+                litellm.acompletion, mode=instructor.Mode(self.instructor_mode)
             )
             self.client = instructor.from_litellm(
-                litellm.completion, mode=instructor.Mode.JSON_SCHEMA
+                litellm.completion, mode=instructor.Mode(self.instructor_mode)
             )
         else:
             self.aclient = instructor.from_litellm(litellm.acompletion)
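
The two adapter hunks above swap hard-coded `instructor.Mode` enums for a string-valued, configurable `instructor_mode`. A minimal sketch of the mapping this relies on, assuming the standard `instructor` package where the enum members carry these string values (the diff's own defaults suggest the same mapping):

```python
# Sketch: instructor.Mode can be built from its string value, which is what
# the adapters now do with the configurable instructor_mode setting.
import instructor

assert instructor.Mode("json_mode") is instructor.Mode.JSON                # Ollama adapter default
assert instructor.Mode("json_schema_mode") is instructor.Mode.JSON_SCHEMA  # OpenAI adapter default
```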

cognee/infrastructure/loaders/LoaderEngine.py
@@ -31,6 +31,7 @@ class LoaderEngine:
             "pypdf_loader",
             "image_loader",
             "audio_loader",
+            "csv_loader",
             "unstructured_loader",
             "advanced_pdf_loader",
         ]

cognee/infrastructure/loaders/core/__init__.py
@@ -3,5 +3,6 @@
 from .text_loader import TextLoader
 from .audio_loader import AudioLoader
 from .image_loader import ImageLoader
+from .csv_loader import CsvLoader

-__all__ = ["TextLoader", "AudioLoader", "ImageLoader"]
+__all__ = ["TextLoader", "AudioLoader", "ImageLoader", "CsvLoader"]

cognee/infrastructure/loaders/core/csv_loader.py
@@ -0,0 +1,93 @@
+import os
+from typing import List
+import csv
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
+from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+
+
+class CsvLoader(LoaderInterface):
+    """
+    Core CSV file loader that handles basic CSV file formats.
+    """
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        """Supported text file extensions."""
+        return [
+            "csv",
+        ]
+
+    @property
+    def supported_mime_types(self) -> List[str]:
+        """Supported MIME types for text content."""
+        return [
+            "text/csv",
+        ]
+
+    @property
+    def loader_name(self) -> str:
+        """Unique identifier for this loader."""
+        return "csv_loader"
+
+    def can_handle(self, extension: str, mime_type: str) -> bool:
+        """
+        Check if this loader can handle the given file.
+
+        Args:
+            extension: File extension
+            mime_type: Optional MIME type
+
+        Returns:
+            True if file can be handled, False otherwise
+        """
+        if extension in self.supported_extensions and mime_type in self.supported_mime_types:
+            return True
+
+        return False
+
+    async def load(self, file_path: str, encoding: str = "utf-8", **kwargs):
+        """
+        Load and process the csv file.
+
+        Args:
+            file_path: Path to the file to load
+            encoding: Text encoding to use (default: utf-8)
+            **kwargs: Additional configuration (unused)
+
+        Returns:
+            LoaderResult containing the file content and metadata
+
+        Raises:
+            FileNotFoundError: If file doesn't exist
+            UnicodeDecodeError: If file cannot be decoded with specified encoding
+            OSError: If file cannot be read
+        """
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        with open(file_path, "rb") as f:
+            file_metadata = await get_file_metadata(f)
+            # Name ingested file of current loader based on original file content hash
+            storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+        row_texts = []
+        row_index = 1
+
+        with open(file_path, "r", encoding=encoding, newline="") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                pairs = [f"{str(k)}: {str(v)}" for k, v in row.items()]
+                row_text = ", ".join(pairs)
+                row_texts.append(f"Row {row_index}:\n{row_text}\n")
+                row_index += 1
+
+        content = "\n".join(row_texts)
+
+        storage_config = get_storage_config()
+        data_root_directory = storage_config["data_root_directory"]
+        storage = get_file_storage(data_root_directory)
+
+        full_file_path = await storage.store(storage_file_name, content)
+
+        return full_file_path
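
For illustration, a standalone sketch of the row-to-text conversion the new CsvLoader performs before storing the result; it uses only the standard library and an in-memory CSV (illustrative data), not the cognee storage API:

```python
# Standalone sketch of CsvLoader's per-row text format.
import csv
import io

sample = "name,age\nAda,36\nAlan,41\n"
reader = csv.DictReader(io.StringIO(sample))

row_texts = []
for row_index, row in enumerate(reader, start=1):
    pairs = [f"{k}: {v}" for k, v in row.items()]
    row_texts.append(f"Row {row_index}:\n{', '.join(pairs)}\n")

print("\n".join(row_texts))
# Row 1:
# name: Ada, age: 36
#
# Row 2:
# name: Alan, age: 41
```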

cognee/infrastructure/loaders/core/text_loader.py
@@ -16,7 +16,7 @@ class TextLoader(LoaderInterface):
     @property
     def supported_extensions(self) -> List[str]:
         """Supported text file extensions."""
-        return ["txt", "md", "csv", "json", "xml", "yaml", "yml", "log"]
+        return ["txt", "md", "json", "xml", "yaml", "yml", "log"]

     @property
     def supported_mime_types(self) -> List[str]:
@@ -24,7 +24,6 @@ class TextLoader(LoaderInterface):
         return [
             "text/plain",
             "text/markdown",
-            "text/csv",
             "application/json",
             "text/xml",
             "application/xml",

cognee/infrastructure/loaders/external/advanced_pdf_loader.py
@@ -227,12 +227,3 @@ class AdvancedPdfLoader(LoaderInterface):
         if value is None:
             return ""
         return str(value).replace("\xa0", " ").strip()
-
-
-if __name__ == "__main__":
-    loader = AdvancedPdfLoader()
-    asyncio.run(
-        loader.load(
-            "/Users/xiaotao/work/cognee/cognee/infrastructure/loaders/external/attention_is_all_you_need.pdf"
-        )
-    )

cognee/infrastructure/loaders/supported_loaders.py
@@ -1,5 +1,5 @@
 from cognee.infrastructure.loaders.external import PyPdfLoader
-from cognee.infrastructure.loaders.core import TextLoader, AudioLoader, ImageLoader
+from cognee.infrastructure.loaders.core import TextLoader, AudioLoader, ImageLoader, CsvLoader

 # Registry for loader implementations
 supported_loaders = {
@@ -7,6 +7,7 @@ supported_loaders = {
     TextLoader.loader_name: TextLoader,
     ImageLoader.loader_name: ImageLoader,
     AudioLoader.loader_name: AudioLoader,
+    CsvLoader.loader_name: CsvLoader,
 }

 # Try adding optional loaders

cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py
@@ -0,0 +1,55 @@
+from typing import Optional, List
+
+from cognee import memify
+from cognee.context_global_variables import (
+    set_database_global_context_variables,
+    set_session_user_context_variable,
+)
+from cognee.exceptions import CogneeValidationError
+from cognee.modules.data.methods import get_authorized_existing_datasets
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.pipelines.tasks.task import Task
+from cognee.modules.users.models import User
+from cognee.tasks.memify import extract_user_sessions, cognify_session
+
+
+logger = get_logger("persist_sessions_in_knowledge_graph")
+
+
+async def persist_sessions_in_knowledge_graph_pipeline(
+    user: User,
+    session_ids: Optional[List[str]] = None,
+    dataset: str = "main_dataset",
+    run_in_background: bool = False,
+):
+    await set_session_user_context_variable(user)
+    dataset_to_write = await get_authorized_existing_datasets(
+        user=user, datasets=[dataset], permission_type="write"
+    )
+
+    if not dataset_to_write:
+        raise CogneeValidationError(
+            message=f"User (id: {str(user.id)}) does not have write access to dataset: {dataset}",
+            log=False,
+        )
+
+    await set_database_global_context_variables(
+        dataset_to_write[0].id, dataset_to_write[0].owner_id
+    )
+
+    extraction_tasks = [Task(extract_user_sessions, session_ids=session_ids)]
+
+    enrichment_tasks = [
+        Task(cognify_session, dataset_id=dataset_to_write[0].id),
+    ]
+
+    result = await memify(
+        extraction_tasks=extraction_tasks,
+        enrichment_tasks=enrichment_tasks,
+        dataset=dataset_to_write[0].id,
+        data=[{}],
+        run_in_background=run_in_background,
+    )
+
+    logger.info("Session persistence pipeline completed")
+    return result
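
A hedged invocation sketch for the new pipeline; only the module path and signature come from the hunk above, while the user object and session ids are assumptions:

```python
# Hypothetical usage: assumes an existing User object and write access to "main_dataset".
from cognee.memify_pipelines.persist_sessions_in_knowledge_graph import (
    persist_sessions_in_knowledge_graph_pipeline,
)

# result = await persist_sessions_in_knowledge_graph_pipeline(
#     user=current_user,
#     session_ids=["session-1"],
#     dataset="main_dataset",
# )
```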

cognee/modules/chunking/CsvChunker.py
@@ -0,0 +1,35 @@
+from cognee.shared.logging_utils import get_logger
+
+
+from cognee.tasks.chunks import chunk_by_row
+from cognee.modules.chunking.Chunker import Chunker
+from .models.DocumentChunk import DocumentChunk
+
+logger = get_logger()
+
+
+class CsvChunker(Chunker):
+    async def read(self):
+        async for content_text in self.get_text():
+            if content_text is None:
+                continue
+
+            for chunk_data in chunk_by_row(content_text, self.max_chunk_size):
+                if chunk_data["chunk_size"] <= self.max_chunk_size:
+                    yield DocumentChunk(
+                        id=chunk_data["chunk_id"],
+                        text=chunk_data["text"],
+                        chunk_size=chunk_data["chunk_size"],
+                        is_part_of=self.document,
+                        chunk_index=self.chunk_index,
+                        cut_type=chunk_data["cut_type"],
+                        contains=[],
+                        metadata={
+                            "index_fields": ["text"],
+                        },
+                    )
+                    self.chunk_index += 1
+                else:
+                    raise ValueError(
+                        f"Chunk size is larger than the maximum chunk size {self.max_chunk_size}"
+                    )

cognee/modules/chunking/models/DocumentChunk.py
@@ -1,6 +1,7 @@
 from typing import List, Union

 from cognee.infrastructure.engine import DataPoint
+from cognee.infrastructure.engine.models.Edge import Edge
 from cognee.modules.data.processing.document_types import Document
 from cognee.modules.engine.models import Entity
 from cognee.tasks.temporal_graph.models import Event
@@ -31,6 +32,6 @@ class DocumentChunk(DataPoint):
     chunk_index: int
     cut_type: str
     is_part_of: Document
-    contains: List[Union[Entity, Event]] = None
+    contains: List[Union[Entity, Event, tuple[Edge, Entity]]] = None

     metadata: dict = {"index_fields": ["text"]}

cognee/modules/chunking/text_chunker_with_overlap.py
@@ -0,0 +1,124 @@
+from cognee.shared.logging_utils import get_logger
+from uuid import NAMESPACE_OID, uuid5
+
+from cognee.tasks.chunks import chunk_by_paragraph
+from cognee.modules.chunking.Chunker import Chunker
+from .models.DocumentChunk import DocumentChunk
+
+logger = get_logger()
+
+
+class TextChunkerWithOverlap(Chunker):
+    def __init__(
+        self,
+        document,
+        get_text: callable,
+        max_chunk_size: int,
+        chunk_overlap_ratio: float = 0.0,
+        get_chunk_data: callable = None,
+    ):
+        super().__init__(document, get_text, max_chunk_size)
+        self._accumulated_chunk_data = []
+        self._accumulated_size = 0
+        self.chunk_overlap_ratio = chunk_overlap_ratio
+        self.chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)
+
+        if get_chunk_data is not None:
+            self.get_chunk_data = get_chunk_data
+        elif chunk_overlap_ratio > 0:
+            paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)
+            self.get_chunk_data = lambda text: chunk_by_paragraph(
+                text, paragraph_max_size, batch_paragraphs=True
+            )
+        else:
+            self.get_chunk_data = lambda text: chunk_by_paragraph(
+                text, self.max_chunk_size, batch_paragraphs=True
+            )
+
+    def _accumulation_overflows(self, chunk_data):
+        """Check if adding chunk_data would exceed max_chunk_size."""
+        return self._accumulated_size + chunk_data["chunk_size"] > self.max_chunk_size
+
+    def _accumulate_chunk_data(self, chunk_data):
+        """Add chunk_data to the current accumulation."""
+        self._accumulated_chunk_data.append(chunk_data)
+        self._accumulated_size += chunk_data["chunk_size"]
+
+    def _clear_accumulation(self):
+        """Reset accumulation, keeping overlap chunk_data based on chunk_overlap_ratio."""
+        if self.chunk_overlap == 0:
+            self._accumulated_chunk_data = []
+            self._accumulated_size = 0
+            return
+
+        # Keep chunk_data from the end that fit in overlap
+        overlap_chunk_data = []
+        overlap_size = 0
+
+        for chunk_data in reversed(self._accumulated_chunk_data):
+            if overlap_size + chunk_data["chunk_size"] <= self.chunk_overlap:
+                overlap_chunk_data.insert(0, chunk_data)
+                overlap_size += chunk_data["chunk_size"]
+            else:
+                break
+
+        self._accumulated_chunk_data = overlap_chunk_data
+        self._accumulated_size = overlap_size
+
+    def _create_chunk(self, text, size, cut_type, chunk_id=None):
+        """Create a DocumentChunk with standard metadata."""
+        try:
+            return DocumentChunk(
+                id=chunk_id or uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
+                text=text,
+                chunk_size=size,
+                is_part_of=self.document,
+                chunk_index=self.chunk_index,
+                cut_type=cut_type,
+                contains=[],
+                metadata={"index_fields": ["text"]},
+            )
+        except Exception as e:
+            logger.error(e)
+            raise e
+
+    def _create_chunk_from_accumulation(self):
+        """Create a DocumentChunk from current accumulated chunk_data."""
+        chunk_text = " ".join(chunk["text"] for chunk in self._accumulated_chunk_data)
+        return self._create_chunk(
+            text=chunk_text,
+            size=self._accumulated_size,
+            cut_type=self._accumulated_chunk_data[-1]["cut_type"],
+        )
+
+    def _emit_chunk(self, chunk_data):
+        """Emit a chunk when accumulation overflows."""
+        if len(self._accumulated_chunk_data) > 0:
+            chunk = self._create_chunk_from_accumulation()
+            self._clear_accumulation()
+            self._accumulate_chunk_data(chunk_data)
+        else:
+            # Handle single chunk_data exceeding max_chunk_size
+            chunk = self._create_chunk(
+                text=chunk_data["text"],
+                size=chunk_data["chunk_size"],
+                cut_type=chunk_data["cut_type"],
+                chunk_id=chunk_data["chunk_id"],
+            )
+
+        self.chunk_index += 1
+        return chunk
+
+    async def read(self):
+        async for content_text in self.get_text():
+            for chunk_data in self.get_chunk_data(content_text):
+                if not self._accumulation_overflows(chunk_data):
+                    self._accumulate_chunk_data(chunk_data)
+                    continue
+
+                yield self._emit_chunk(chunk_data)
+
+        if len(self._accumulated_chunk_data) == 0:
+            return
+
+        yield self._create_chunk_from_accumulation()
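
A small worked sketch of the overlap bookkeeping from the constructor above, using illustrative numbers rather than cognee defaults:

```python
# Illustrative values only: how chunk_overlap and paragraph_max_size are derived
# from max_chunk_size and chunk_overlap_ratio in TextChunkerWithOverlap.__init__.
max_chunk_size = 1000
chunk_overlap_ratio = 0.2

chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)             # 200 units carried into the next chunk
paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)  # 100-unit paragraphs feed the accumulator

print(chunk_overlap, paragraph_max_size)  # 200 100
```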

cognee/modules/data/methods/__init__.py
@@ -10,6 +10,7 @@ from .get_authorized_dataset import get_authorized_dataset
 from .get_authorized_dataset_by_name import get_authorized_dataset_by_name
 from .get_data import get_data
 from .get_unique_dataset_id import get_unique_dataset_id
+from .get_unique_data_id import get_unique_data_id
 from .get_authorized_existing_datasets import get_authorized_existing_datasets
 from .get_dataset_ids import get_dataset_ids


cognee/modules/data/methods/create_dataset.py
@@ -16,14 +16,16 @@ async def create_dataset(dataset_name: str, user: User, session: AsyncSession) -
             .options(joinedload(Dataset.data))
             .filter(Dataset.name == dataset_name)
             .filter(Dataset.owner_id == owner_id)
+            .filter(Dataset.tenant_id == user.tenant_id)
         )
     ).first()

     if dataset is None:
         # Dataset id should be generated based on dataset_name and owner_id/user so multiple users can use the same dataset_name
         dataset_id = await get_unique_dataset_id(dataset_name=dataset_name, user=user)
-        dataset = Dataset(id=dataset_id, name=dataset_name, data=[])
-        dataset.owner_id = owner_id
+        dataset = Dataset(
+            id=dataset_id, name=dataset_name, data=[], owner_id=owner_id, tenant_id=user.tenant_id
+        )

         session.add(dataset)


cognee/modules/data/methods/get_dataset_ids.py
@@ -27,7 +27,11 @@ async def get_dataset_ids(datasets: Union[list[str], list[UUID]], user):
         # Get all user owned dataset objects (If a user wants to write to a dataset he is not the owner of it must be provided through UUID.)
         user_datasets = await get_datasets(user.id)
         # Filter out non name mentioned datasets
-        dataset_ids = [dataset.id for dataset in user_datasets if dataset.name in datasets]
+        dataset_ids = [dataset for dataset in user_datasets if dataset.name in datasets]
+        # Filter out non current tenant datasets
+        dataset_ids = [
+            dataset.id for dataset in dataset_ids if dataset.tenant_id == user.tenant_id
+        ]
     else:
         raise DatasetTypeError(
             f"One or more of the provided dataset types is not handled: f{datasets}"

cognee/modules/data/methods/get_unique_data_id.py
@@ -0,0 +1,68 @@
+from uuid import uuid5, NAMESPACE_OID, UUID
+from sqlalchemy import select
+
+from cognee.modules.data.models.Data import Data
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.users.models import User
+
+
+async def get_unique_data_id(data_identifier: str, user: User) -> UUID:
+    """
+    Function returns a unique UUID for data based on data identifier, user id and tenant id.
+    If data with legacy ID exists, return that ID to maintain compatibility.
+
+    Args:
+        data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+        user: User object adding the data
+        tenant_id: UUID of the tenant for which data is being added
+
+    Returns:
+        UUID: Unique identifier for the data
+    """
+
+    def _get_deprecated_unique_data_id(data_identifier: str, user: User) -> UUID:
+        """
+        Deprecated function, returns a unique UUID for data based on data identifier and user id.
+        Needed to support legacy data without tenant information.
+        Args:
+            data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+            user: User object adding the data
+
+        Returns:
+            UUID: Unique identifier for the data
+        """
+        # return UUID hash of file contents + owner id + tenant_id
+        return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}")
+
+    def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID:
+        """
+        Function returns a unique UUID for data based on data identifier, user id and tenant id.
+        Args:
+            data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+            user: User object adding the data
+            tenant_id: UUID of the tenant for which data is being added
+
+        Returns:
+            UUID: Unique identifier for the data
+        """
+        # return UUID hash of file contents + owner id + tenant_id
+        return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}")
+
+    # Get all possible data_id values
+    data_id = {
+        "modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user),
+        "legacy_data_id": _get_deprecated_unique_data_id(
+            data_identifier=data_identifier, user=user
+        ),
+    }
+
+    # Check if data item with legacy_data_id exists, if so use that one, else use modern_data_id
+    db_engine = get_relational_engine()
+    async with db_engine.get_async_session() as session:
+        legacy_data_point = (
+            await session.execute(select(Data).filter(Data.id == data_id["legacy_data_id"]))
+        ).scalar_one_or_none()
+
+    if not legacy_data_point:
+        return data_id["modern_data_id"]
+    return data_id["legacy_data_id"]

cognee/modules/data/methods/get_unique_dataset_id.py
@@ -1,9 +1,71 @@
 from uuid import UUID, uuid5, NAMESPACE_OID
-from cognee.modules.users.models import User
 from typing import Union
+from sqlalchemy import select
+
+from cognee.modules.data.models.Dataset import Dataset
+from cognee.modules.users.models import User
+from cognee.infrastructure.databases.relational import get_relational_engine


 async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
-    if isinstance(dataset_name, UUID):
-        return dataset_name
-    return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
+    """
+    Function returns a unique UUID for dataset based on dataset name, user id and tenant id.
+    If dataset with legacy ID exists, return that ID to maintain compatibility.
+
+    Args:
+        dataset_name: string representing the dataset name
+        user: User object adding the dataset
+        tenant_id: UUID of the tenant for which dataset is being added
+
+    Returns:
+        UUID: Unique identifier for the dataset
+    """
+
+    def _get_legacy_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+        """
+        Legacy function, returns a unique UUID for dataset based on dataset name and user id.
+        Needed to support legacy datasets without tenant information.
+        Args:
+            dataset_name: string representing the dataset name
+            user: Current User object adding the dataset
+
+        Returns:
+            UUID: Unique identifier for the dataset
+        """
+        if isinstance(dataset_name, UUID):
+            return dataset_name
+        return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
+
+    def _get_modern_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+        """
+        Returns a unique UUID for dataset based on dataset name, user id and tenant_id.
+        Args:
+            dataset_name: string representing the dataset name
+            user: Current User object adding the dataset
+            tenant_id: UUID of the tenant for which dataset is being added
+
+        Returns:
+            UUID: Unique identifier for the dataset
+        """
+        if isinstance(dataset_name, UUID):
+            return dataset_name
+        return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}")
+
+    # Get all possible dataset_id values
+    dataset_id = {
+        "modern_dataset_id": _get_modern_unique_dataset_id(dataset_name=dataset_name, user=user),
+        "legacy_dataset_id": _get_legacy_unique_dataset_id(dataset_name=dataset_name, user=user),
+    }
+
+    # Check if dataset with legacy_dataset_id exists, if so use that one, else use modern_dataset_id
+    db_engine = get_relational_engine()
+    async with db_engine.get_async_session() as session:
+        legacy_dataset = (
+            await session.execute(
+                select(Dataset).filter(Dataset.id == dataset_id["legacy_dataset_id"])
+            )
+        ).scalar_one_or_none()
+
+    if not legacy_dataset:
+        return dataset_id["modern_dataset_id"]
+    return dataset_id["legacy_dataset_id"]
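
Both get_unique_data_id and get_unique_dataset_id follow the same pattern: derive the legacy (tenant-less) and the modern (tenant-aware) uuid5, then keep the legacy id only if a row with that id already exists. A standalone sketch of the derivation, with placeholder ids:

```python
# Placeholder ids; shows why adding tenant_id to the uuid5 seed changes the derived id.
from uuid import NAMESPACE_OID, UUID, uuid5

user_id = UUID("00000000-0000-0000-0000-000000000001")
tenant_id = UUID("00000000-0000-0000-0000-000000000002")
dataset_name = "main_dataset"

legacy_id = uuid5(NAMESPACE_OID, f"{dataset_name}{user_id}")
modern_id = uuid5(NAMESPACE_OID, f"{dataset_name}{user_id}{tenant_id}")

print(legacy_id != modern_id)  # True: existing rows keep the legacy id, new ones get the tenant-aware id
```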

cognee/modules/data/models/Dataset.py
@@ -18,6 +18,7 @@ class Dataset(Base):
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

     owner_id = Column(UUID, index=True)
+    tenant_id = Column(UUID, index=True, nullable=True)

     acls = relationship("ACL", back_populates="dataset", cascade="all, delete-orphan")

@@ -36,5 +37,6 @@
             "createdAt": self.created_at.isoformat(),
             "updatedAt": self.updated_at.isoformat() if self.updated_at else None,
             "ownerId": str(self.owner_id),
+            "tenantId": str(self.tenant_id),
             "data": [data.to_json() for data in self.data],
         }