cognee 0.4.1__py3-none-any.whl → 0.5.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/client.py +8 -0
- cognee/api/v1/add/routers/get_add_router.py +3 -1
- cognee/api/v1/cognify/routers/get_cognify_router.py +28 -1
- cognee/api/v1/ontologies/__init__.py +4 -0
- cognee/api/v1/ontologies/ontologies.py +183 -0
- cognee/api/v1/ontologies/routers/__init__.py +0 -0
- cognee/api/v1/ontologies/routers/get_ontology_router.py +107 -0
- cognee/api/v1/permissions/routers/get_permissions_router.py +41 -1
- cognee/cli/commands/cognify_command.py +8 -1
- cognee/cli/config.py +1 -1
- cognee/context_global_variables.py +41 -9
- cognee/infrastructure/databases/cache/config.py +3 -1
- cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +151 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +20 -10
- cognee/infrastructure/databases/exceptions/exceptions.py +16 -0
- cognee/infrastructure/databases/graph/config.py +4 -0
- cognee/infrastructure/databases/graph/get_graph_engine.py +2 -0
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +9 -0
- cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +37 -3
- cognee/infrastructure/databases/vector/config.py +3 -0
- cognee/infrastructure/databases/vector/create_vector_engine.py +5 -1
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +1 -4
- cognee/infrastructure/engine/models/Edge.py +13 -1
- cognee/infrastructure/files/utils/guess_file_type.py +4 -0
- cognee/infrastructure/llm/config.py +2 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +5 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +7 -1
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +7 -1
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +8 -16
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +12 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +13 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +5 -2
- cognee/infrastructure/loaders/LoaderEngine.py +1 -0
- cognee/infrastructure/loaders/core/__init__.py +2 -1
- cognee/infrastructure/loaders/core/csv_loader.py +93 -0
- cognee/infrastructure/loaders/core/text_loader.py +1 -2
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +0 -9
- cognee/infrastructure/loaders/supported_loaders.py +2 -1
- cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +55 -0
- cognee/modules/chunking/CsvChunker.py +35 -0
- cognee/modules/chunking/models/DocumentChunk.py +2 -1
- cognee/modules/chunking/text_chunker_with_overlap.py +124 -0
- cognee/modules/data/methods/__init__.py +1 -0
- cognee/modules/data/methods/create_dataset.py +4 -2
- cognee/modules/data/methods/get_dataset_ids.py +5 -1
- cognee/modules/data/methods/get_unique_data_id.py +68 -0
- cognee/modules/data/methods/get_unique_dataset_id.py +66 -4
- cognee/modules/data/models/Dataset.py +2 -0
- cognee/modules/data/processing/document_types/CsvDocument.py +33 -0
- cognee/modules/data/processing/document_types/__init__.py +1 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +4 -2
- cognee/modules/graph/utils/expand_with_nodes_and_edges.py +19 -2
- cognee/modules/graph/utils/resolve_edges_to_text.py +48 -49
- cognee/modules/ingestion/identify.py +4 -4
- cognee/modules/notebooks/operations/run_in_local_sandbox.py +3 -0
- cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +55 -23
- cognee/modules/pipelines/operations/run_tasks_data_item.py +1 -1
- cognee/modules/retrieval/EntityCompletionRetriever.py +10 -3
- cognee/modules/retrieval/base_graph_retriever.py +7 -3
- cognee/modules/retrieval/base_retriever.py +7 -3
- cognee/modules/retrieval/completion_retriever.py +11 -4
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +6 -2
- cognee/modules/retrieval/graph_completion_cot_retriever.py +14 -51
- cognee/modules/retrieval/graph_completion_retriever.py +4 -1
- cognee/modules/retrieval/temporal_retriever.py +9 -2
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +1 -1
- cognee/modules/retrieval/utils/completion.py +2 -22
- cognee/modules/run_custom_pipeline/__init__.py +1 -0
- cognee/modules/run_custom_pipeline/run_custom_pipeline.py +69 -0
- cognee/modules/search/methods/search.py +5 -3
- cognee/modules/users/methods/create_user.py +12 -27
- cognee/modules/users/methods/get_authenticated_user.py +2 -1
- cognee/modules/users/methods/get_default_user.py +4 -2
- cognee/modules/users/methods/get_user.py +1 -1
- cognee/modules/users/methods/get_user_by_email.py +1 -1
- cognee/modules/users/models/DatasetDatabase.py +9 -0
- cognee/modules/users/models/Tenant.py +6 -7
- cognee/modules/users/models/User.py +6 -5
- cognee/modules/users/models/UserTenant.py +12 -0
- cognee/modules/users/models/__init__.py +1 -0
- cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +13 -13
- cognee/modules/users/roles/methods/add_user_to_role.py +3 -1
- cognee/modules/users/tenants/methods/__init__.py +1 -0
- cognee/modules/users/tenants/methods/add_user_to_tenant.py +21 -12
- cognee/modules/users/tenants/methods/create_tenant.py +22 -8
- cognee/modules/users/tenants/methods/select_tenant.py +62 -0
- cognee/shared/logging_utils.py +2 -0
- cognee/tasks/chunks/__init__.py +1 -0
- cognee/tasks/chunks/chunk_by_row.py +94 -0
- cognee/tasks/documents/classify_documents.py +2 -0
- cognee/tasks/feedback/generate_improved_answers.py +3 -3
- cognee/tasks/ingestion/ingest_data.py +1 -1
- cognee/tasks/memify/__init__.py +2 -0
- cognee/tasks/memify/cognify_session.py +41 -0
- cognee/tasks/memify/extract_user_sessions.py +73 -0
- cognee/tasks/storage/index_data_points.py +33 -22
- cognee/tasks/storage/index_graph_edges.py +37 -57
- cognee/tests/integration/documents/CsvDocument_test.py +70 -0
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +1 -1
- cognee/tests/test_add_docling_document.py +2 -2
- cognee/tests/test_cognee_server_start.py +84 -1
- cognee/tests/test_conversation_history.py +45 -4
- cognee/tests/test_data/example_with_header.csv +3 -0
- cognee/tests/test_delete_bmw_example.py +60 -0
- cognee/tests/test_edge_ingestion.py +27 -0
- cognee/tests/test_feedback_enrichment.py +1 -1
- cognee/tests/test_library.py +6 -4
- cognee/tests/test_load.py +62 -0
- cognee/tests/test_multi_tenancy.py +165 -0
- cognee/tests/test_parallel_databases.py +2 -0
- cognee/tests/test_relational_db_migration.py +54 -2
- cognee/tests/test_search_db.py +7 -1
- cognee/tests/unit/api/test_conditional_authentication_endpoints.py +12 -3
- cognee/tests/unit/api/test_ontology_endpoint.py +264 -0
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +5 -0
- cognee/tests/unit/infrastructure/databases/test_index_data_points.py +27 -0
- cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +14 -16
- cognee/tests/unit/modules/chunking/test_text_chunker.py +248 -0
- cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py +324 -0
- cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +111 -0
- cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py +175 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -51
- cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +1 -0
- cognee/tests/unit/modules/retrieval/structured_output_test.py +204 -0
- cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +1 -1
- cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +0 -1
- cognee/tests/unit/modules/users/test_conditional_authentication.py +0 -63
- cognee/tests/unit/processing/chunks/chunk_by_row_test.py +52 -0
- {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/METADATA +88 -71
- {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/RECORD +135 -104
- {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/WHEEL +1 -1
- {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/entry_points.txt +0 -1
- {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.4.1.dist-info → cognee-0.5.0.dev0.dist-info}/licenses/NOTICE.md +0 -0
cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py
CHANGED
@@ -42,8 +42,16 @@ class OllamaAPIAdapter(LLMInterface):
     - aclient
     """

+    default_instructor_mode = "json_mode"
+
     def __init__(
-        self,
+        self,
+        endpoint: str,
+        api_key: str,
+        model: str,
+        name: str,
+        max_completion_tokens: int,
+        instructor_mode: str = None,
     ):
         self.name = name
         self.model = model
@@ -51,8 +59,11 @@ class OllamaAPIAdapter(LLMInterface):
         self.endpoint = endpoint
         self.max_completion_tokens = max_completion_tokens

+        self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode
+
         self.aclient = instructor.from_openai(
-            OpenAI(base_url=self.endpoint, api_key=self.api_key),
+            OpenAI(base_url=self.endpoint, api_key=self.api_key),
+            mode=instructor.Mode(self.instructor_mode),
         )

     @retry(
cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py
CHANGED
@@ -56,6 +56,7 @@ class OpenAIAdapter(LLMInterface):
     model: str
     api_key: str
     api_version: str
+    default_instructor_mode = "json_schema_mode"

     MAX_RETRIES = 5

@@ -69,19 +70,21 @@ class OpenAIAdapter(LLMInterface):
         model: str,
         transcription_model: str,
         max_completion_tokens: int,
+        instructor_mode: str = None,
         streaming: bool = False,
         fallback_model: str = None,
         fallback_api_key: str = None,
         fallback_endpoint: str = None,
     ):
+        self.instructor_mode = instructor_mode if instructor_mode else self.default_instructor_mode
         # TODO: With gpt5 series models OpenAI expects JSON_SCHEMA as a mode for structured outputs.
         # Make sure all new gpt models will work with this mode as well.
         if "gpt-5" in model:
             self.aclient = instructor.from_litellm(
-                litellm.acompletion, mode=instructor.Mode.
+                litellm.acompletion, mode=instructor.Mode(self.instructor_mode)
             )
             self.client = instructor.from_litellm(
-                litellm.completion, mode=instructor.Mode.
+                litellm.completion, mode=instructor.Mode(self.instructor_mode)
             )
         else:
             self.aclient = instructor.from_litellm(litellm.acompletion)
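Both adapters now resolve the structured-output mode from a plain string rather than a hard-coded enum member. A minimal sketch of that resolution, assuming the string-valued Mode enum from the instructor library ("json_mode" maps to Mode.JSON, "json_schema_mode" to Mode.JSON_SCHEMA); the helper name is illustrative, not part of the package:

import instructor

def resolve_instructor_mode(instructor_mode: str | None, default_mode: str) -> instructor.Mode:
    # Mirror the adapters: fall back to the class-level default when no override
    # is given, then look the string up in the instructor Mode enum by value.
    return instructor.Mode(instructor_mode or default_mode)

# e.g. resolve_instructor_mode(None, "json_schema_mode") is instructor.Mode.JSON_SCHEMA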
cognee/infrastructure/loaders/core/__init__.py
CHANGED
@@ -3,5 +3,6 @@
 from .text_loader import TextLoader
 from .audio_loader import AudioLoader
 from .image_loader import ImageLoader
+from .csv_loader import CsvLoader

-__all__ = ["TextLoader", "AudioLoader", "ImageLoader"]
+__all__ = ["TextLoader", "AudioLoader", "ImageLoader", "CsvLoader"]
cognee/infrastructure/loaders/core/csv_loader.py
ADDED
@@ -0,0 +1,93 @@
+import os
+from typing import List
+import csv
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
+from cognee.infrastructure.files.storage import get_file_storage, get_storage_config
+from cognee.infrastructure.files.utils.get_file_metadata import get_file_metadata
+
+
+class CsvLoader(LoaderInterface):
+    """
+    Core CSV file loader that handles basic CSV file formats.
+    """
+
+    @property
+    def supported_extensions(self) -> List[str]:
+        """Supported text file extensions."""
+        return [
+            "csv",
+        ]
+
+    @property
+    def supported_mime_types(self) -> List[str]:
+        """Supported MIME types for text content."""
+        return [
+            "text/csv",
+        ]
+
+    @property
+    def loader_name(self) -> str:
+        """Unique identifier for this loader."""
+        return "csv_loader"
+
+    def can_handle(self, extension: str, mime_type: str) -> bool:
+        """
+        Check if this loader can handle the given file.
+
+        Args:
+            extension: File extension
+            mime_type: Optional MIME type
+
+        Returns:
+            True if file can be handled, False otherwise
+        """
+        if extension in self.supported_extensions and mime_type in self.supported_mime_types:
+            return True
+
+        return False
+
+    async def load(self, file_path: str, encoding: str = "utf-8", **kwargs):
+        """
+        Load and process the csv file.
+
+        Args:
+            file_path: Path to the file to load
+            encoding: Text encoding to use (default: utf-8)
+            **kwargs: Additional configuration (unused)
+
+        Returns:
+            LoaderResult containing the file content and metadata
+
+        Raises:
+            FileNotFoundError: If file doesn't exist
+            UnicodeDecodeError: If file cannot be decoded with specified encoding
+            OSError: If file cannot be read
+        """
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        with open(file_path, "rb") as f:
+            file_metadata = await get_file_metadata(f)
+            # Name ingested file of current loader based on original file content hash
+            storage_file_name = "text_" + file_metadata["content_hash"] + ".txt"
+
+        row_texts = []
+        row_index = 1
+
+        with open(file_path, "r", encoding=encoding, newline="") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                pairs = [f"{str(k)}: {str(v)}" for k, v in row.items()]
+                row_text = ", ".join(pairs)
+                row_texts.append(f"Row {row_index}:\n{row_text}\n")
+                row_index += 1
+
+        content = "\n".join(row_texts)
+
+        storage_config = get_storage_config()
+        data_root_directory = storage_config["data_root_directory"]
+        storage = get_file_storage(data_root_directory)
+
+        full_file_path = await storage.store(storage_file_name, content)
+
+        return full_file_path
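The loader flattens each CSV row into a labelled text block before handing it to the text pipeline. A self-contained sketch of that conversion using only the standard library (the helper name and sample data are illustrative, not part of the package):

import csv
import io

def rows_to_text(csv_text: str) -> str:
    # Mirrors the CsvLoader logic above: each row becomes "Row N:" followed by
    # comma-separated "column: value" pairs, with rows separated by blank lines.
    row_texts = []
    reader = csv.DictReader(io.StringIO(csv_text))
    for index, row in enumerate(reader, start=1):
        pairs = ", ".join(f"{k}: {v}" for k, v in row.items())
        row_texts.append(f"Row {index}:\n{pairs}\n")
    return "\n".join(row_texts)

print(rows_to_text("name,age\nAda,36\nAlan,41\n"))
# Row 1:
# name: Ada, age: 36
#
# Row 2:
# name: Alan, age: 41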
cognee/infrastructure/loaders/core/text_loader.py
CHANGED
@@ -16,7 +16,7 @@ class TextLoader(LoaderInterface):
     @property
     def supported_extensions(self) -> List[str]:
         """Supported text file extensions."""
-        return ["txt", "md", "
+        return ["txt", "md", "json", "xml", "yaml", "yml", "log"]

     @property
     def supported_mime_types(self) -> List[str]:
@@ -24,7 +24,6 @@ class TextLoader(LoaderInterface):
         return [
             "text/plain",
             "text/markdown",
-            "text/csv",
             "application/json",
             "text/xml",
             "application/xml",
cognee/infrastructure/loaders/external/advanced_pdf_loader.py
CHANGED
@@ -227,12 +227,3 @@ class AdvancedPdfLoader(LoaderInterface):
         if value is None:
             return ""
         return str(value).replace("\xa0", " ").strip()
-
-
-if __name__ == "__main__":
-    loader = AdvancedPdfLoader()
-    asyncio.run(
-        loader.load(
-            "/Users/xiaotao/work/cognee/cognee/infrastructure/loaders/external/attention_is_all_you_need.pdf"
-        )
-    )
cognee/infrastructure/loaders/supported_loaders.py
CHANGED
@@ -1,5 +1,5 @@
 from cognee.infrastructure.loaders.external import PyPdfLoader
-from cognee.infrastructure.loaders.core import TextLoader, AudioLoader, ImageLoader
+from cognee.infrastructure.loaders.core import TextLoader, AudioLoader, ImageLoader, CsvLoader

 # Registry for loader implementations
 supported_loaders = {
@@ -7,6 +7,7 @@ supported_loaders = {
     TextLoader.loader_name: TextLoader,
     ImageLoader.loader_name: ImageLoader,
     AudioLoader.loader_name: AudioLoader,
+    CsvLoader.loader_name: CsvLoader,
 }

 # Try adding optional loaders
cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py
ADDED
@@ -0,0 +1,55 @@
+from typing import Optional, List
+
+from cognee import memify
+from cognee.context_global_variables import (
+    set_database_global_context_variables,
+    set_session_user_context_variable,
+)
+from cognee.exceptions import CogneeValidationError
+from cognee.modules.data.methods import get_authorized_existing_datasets
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.pipelines.tasks.task import Task
+from cognee.modules.users.models import User
+from cognee.tasks.memify import extract_user_sessions, cognify_session
+
+
+logger = get_logger("persist_sessions_in_knowledge_graph")
+
+
+async def persist_sessions_in_knowledge_graph_pipeline(
+    user: User,
+    session_ids: Optional[List[str]] = None,
+    dataset: str = "main_dataset",
+    run_in_background: bool = False,
+):
+    await set_session_user_context_variable(user)
+    dataset_to_write = await get_authorized_existing_datasets(
+        user=user, datasets=[dataset], permission_type="write"
+    )
+
+    if not dataset_to_write:
+        raise CogneeValidationError(
+            message=f"User (id: {str(user.id)}) does not have write access to dataset: {dataset}",
+            log=False,
+        )
+
+    await set_database_global_context_variables(
+        dataset_to_write[0].id, dataset_to_write[0].owner_id
+    )
+
+    extraction_tasks = [Task(extract_user_sessions, session_ids=session_ids)]
+
+    enrichment_tasks = [
+        Task(cognify_session, dataset_id=dataset_to_write[0].id),
+    ]
+
+    result = await memify(
+        extraction_tasks=extraction_tasks,
+        enrichment_tasks=enrichment_tasks,
+        dataset=dataset_to_write[0].id,
+        data=[{}],
+        run_in_background=run_in_background,
+    )
+
+    logger.info("Session persistence pipeline completed")
+    return result
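A minimal invocation sketch for the new pipeline. It assumes the default-user helper exposed by cognee.modules.users.methods and an existing, writable "main_dataset"; everything outside the diff above (the script scaffolding, the awaited helper) is illustrative rather than a documented entry point:

import asyncio

from cognee.modules.users.methods import get_default_user
from cognee.memify_pipelines.persist_sessions_in_knowledge_graph import (
    persist_sessions_in_knowledge_graph_pipeline,
)


async def main():
    user = await get_default_user()
    # session_ids=None is the default and is forwarded to extract_user_sessions.
    await persist_sessions_in_knowledge_graph_pipeline(
        user=user,
        session_ids=None,
        dataset="main_dataset",
        run_in_background=False,
    )


asyncio.run(main())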
cognee/modules/chunking/CsvChunker.py
ADDED
@@ -0,0 +1,35 @@
+from cognee.shared.logging_utils import get_logger
+
+
+from cognee.tasks.chunks import chunk_by_row
+from cognee.modules.chunking.Chunker import Chunker
+from .models.DocumentChunk import DocumentChunk
+
+logger = get_logger()
+
+
+class CsvChunker(Chunker):
+    async def read(self):
+        async for content_text in self.get_text():
+            if content_text is None:
+                continue
+
+            for chunk_data in chunk_by_row(content_text, self.max_chunk_size):
+                if chunk_data["chunk_size"] <= self.max_chunk_size:
+                    yield DocumentChunk(
+                        id=chunk_data["chunk_id"],
+                        text=chunk_data["text"],
+                        chunk_size=chunk_data["chunk_size"],
+                        is_part_of=self.document,
+                        chunk_index=self.chunk_index,
+                        cut_type=chunk_data["cut_type"],
+                        contains=[],
+                        metadata={
+                            "index_fields": ["text"],
+                        },
+                    )
+                    self.chunk_index += 1
+                else:
+                    raise ValueError(
+                        f"Chunk size is larger than the maximum chunk size {self.max_chunk_size}"
+                    )
cognee/modules/chunking/models/DocumentChunk.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import List, Union

 from cognee.infrastructure.engine import DataPoint
+from cognee.infrastructure.engine.models.Edge import Edge
 from cognee.modules.data.processing.document_types import Document
 from cognee.modules.engine.models import Entity
 from cognee.tasks.temporal_graph.models import Event
@@ -31,6 +32,6 @@ class DocumentChunk(DataPoint):
     chunk_index: int
     cut_type: str
     is_part_of: Document
-    contains: List[Union[Entity, Event]] = None
+    contains: List[Union[Entity, Event, tuple[Edge, Entity]]] = None

     metadata: dict = {"index_fields": ["text"]}
cognee/modules/chunking/text_chunker_with_overlap.py
ADDED
@@ -0,0 +1,124 @@
+from cognee.shared.logging_utils import get_logger
+from uuid import NAMESPACE_OID, uuid5
+
+from cognee.tasks.chunks import chunk_by_paragraph
+from cognee.modules.chunking.Chunker import Chunker
+from .models.DocumentChunk import DocumentChunk
+
+logger = get_logger()
+
+
+class TextChunkerWithOverlap(Chunker):
+    def __init__(
+        self,
+        document,
+        get_text: callable,
+        max_chunk_size: int,
+        chunk_overlap_ratio: float = 0.0,
+        get_chunk_data: callable = None,
+    ):
+        super().__init__(document, get_text, max_chunk_size)
+        self._accumulated_chunk_data = []
+        self._accumulated_size = 0
+        self.chunk_overlap_ratio = chunk_overlap_ratio
+        self.chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)
+
+        if get_chunk_data is not None:
+            self.get_chunk_data = get_chunk_data
+        elif chunk_overlap_ratio > 0:
+            paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)
+            self.get_chunk_data = lambda text: chunk_by_paragraph(
+                text, paragraph_max_size, batch_paragraphs=True
+            )
+        else:
+            self.get_chunk_data = lambda text: chunk_by_paragraph(
+                text, self.max_chunk_size, batch_paragraphs=True
+            )
+
+    def _accumulation_overflows(self, chunk_data):
+        """Check if adding chunk_data would exceed max_chunk_size."""
+        return self._accumulated_size + chunk_data["chunk_size"] > self.max_chunk_size
+
+    def _accumulate_chunk_data(self, chunk_data):
+        """Add chunk_data to the current accumulation."""
+        self._accumulated_chunk_data.append(chunk_data)
+        self._accumulated_size += chunk_data["chunk_size"]
+
+    def _clear_accumulation(self):
+        """Reset accumulation, keeping overlap chunk_data based on chunk_overlap_ratio."""
+        if self.chunk_overlap == 0:
+            self._accumulated_chunk_data = []
+            self._accumulated_size = 0
+            return
+
+        # Keep chunk_data from the end that fit in overlap
+        overlap_chunk_data = []
+        overlap_size = 0
+
+        for chunk_data in reversed(self._accumulated_chunk_data):
+            if overlap_size + chunk_data["chunk_size"] <= self.chunk_overlap:
+                overlap_chunk_data.insert(0, chunk_data)
+                overlap_size += chunk_data["chunk_size"]
+            else:
+                break
+
+        self._accumulated_chunk_data = overlap_chunk_data
+        self._accumulated_size = overlap_size
+
+    def _create_chunk(self, text, size, cut_type, chunk_id=None):
+        """Create a DocumentChunk with standard metadata."""
+        try:
+            return DocumentChunk(
+                id=chunk_id or uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
+                text=text,
+                chunk_size=size,
+                is_part_of=self.document,
+                chunk_index=self.chunk_index,
+                cut_type=cut_type,
+                contains=[],
+                metadata={"index_fields": ["text"]},
+            )
+        except Exception as e:
+            logger.error(e)
+            raise e
+
+    def _create_chunk_from_accumulation(self):
+        """Create a DocumentChunk from current accumulated chunk_data."""
+        chunk_text = " ".join(chunk["text"] for chunk in self._accumulated_chunk_data)
+        return self._create_chunk(
+            text=chunk_text,
+            size=self._accumulated_size,
+            cut_type=self._accumulated_chunk_data[-1]["cut_type"],
+        )
+
+    def _emit_chunk(self, chunk_data):
+        """Emit a chunk when accumulation overflows."""
+        if len(self._accumulated_chunk_data) > 0:
+            chunk = self._create_chunk_from_accumulation()
+            self._clear_accumulation()
+            self._accumulate_chunk_data(chunk_data)
+        else:
+            # Handle single chunk_data exceeding max_chunk_size
+            chunk = self._create_chunk(
+                text=chunk_data["text"],
+                size=chunk_data["chunk_size"],
+                cut_type=chunk_data["cut_type"],
+                chunk_id=chunk_data["chunk_id"],
+            )

+        self.chunk_index += 1
+        return chunk
+
+    async def read(self):
+        async for content_text in self.get_text():
+            for chunk_data in self.get_chunk_data(content_text):
+                if not self._accumulation_overflows(chunk_data):
+                    self._accumulate_chunk_data(chunk_data)
+                    continue
+
+                yield self._emit_chunk(chunk_data)
+
+        if len(self._accumulated_chunk_data) == 0:
+            return
+
+        yield self._create_chunk_from_accumulation()
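The overlap rule in _clear_accumulation can be isolated: sub-chunks accumulate up to max_chunk_size, and when a chunk is emitted, the trailing sub-chunks whose combined size fits within int(max_chunk_size * chunk_overlap_ratio) seed the next chunk. A self-contained sketch of that carry-over (plain strings stand in for the paragraph chunk_data dicts, character counts for chunk_size):

def carry_over(accumulated: list[str], max_chunk_size: int, chunk_overlap_ratio: float) -> list[str]:
    # Keep sub-chunks from the end whose total size fits within the overlap budget,
    # mirroring TextChunkerWithOverlap._clear_accumulation above.
    overlap_budget = int(max_chunk_size * chunk_overlap_ratio)
    kept, kept_size = [], 0
    for piece in reversed(accumulated):
        if kept_size + len(piece) <= overlap_budget:
            kept.insert(0, piece)
            kept_size += len(piece)
        else:
            break
    return kept

# With max_chunk_size=20 and a 0.5 overlap ratio, the trailing pieces totalling
# at most 10 characters are reused as the start of the next chunk.
print(carry_over(["alpha bravo ", "charlie ", "delta"], 20, 0.5))  # ['delta']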
cognee/modules/data/methods/__init__.py
CHANGED
@@ -10,6 +10,7 @@ from .get_authorized_dataset import get_authorized_dataset
 from .get_authorized_dataset_by_name import get_authorized_dataset_by_name
 from .get_data import get_data
 from .get_unique_dataset_id import get_unique_dataset_id
+from .get_unique_data_id import get_unique_data_id
 from .get_authorized_existing_datasets import get_authorized_existing_datasets
 from .get_dataset_ids import get_dataset_ids

cognee/modules/data/methods/create_dataset.py
CHANGED
@@ -16,14 +16,16 @@ async def create_dataset(dataset_name: str, user: User, session: AsyncSession) -
             .options(joinedload(Dataset.data))
             .filter(Dataset.name == dataset_name)
             .filter(Dataset.owner_id == owner_id)
+            .filter(Dataset.tenant_id == user.tenant_id)
         )
     ).first()

     if dataset is None:
         # Dataset id should be generated based on dataset_name and owner_id/user so multiple users can use the same dataset_name
         dataset_id = await get_unique_dataset_id(dataset_name=dataset_name, user=user)
-        dataset = Dataset(
-
+        dataset = Dataset(
+            id=dataset_id, name=dataset_name, data=[], owner_id=owner_id, tenant_id=user.tenant_id
+        )

         session.add(dataset)

cognee/modules/data/methods/get_dataset_ids.py
CHANGED
@@ -27,7 +27,11 @@ async def get_dataset_ids(datasets: Union[list[str], list[UUID]], user):
         # Get all user owned dataset objects (If a user wants to write to a dataset he is not the owner of it must be provided through UUID.)
         user_datasets = await get_datasets(user.id)
         # Filter out non name mentioned datasets
-        dataset_ids = [dataset
+        dataset_ids = [dataset for dataset in user_datasets if dataset.name in datasets]
+        # Filter out non current tenant datasets
+        dataset_ids = [
+            dataset.id for dataset in dataset_ids if dataset.tenant_id == user.tenant_id
+        ]
     else:
         raise DatasetTypeError(
             f"One or more of the provided dataset types is not handled: f{datasets}"
cognee/modules/data/methods/get_unique_data_id.py
ADDED
@@ -0,0 +1,68 @@
+from uuid import uuid5, NAMESPACE_OID, UUID
+from sqlalchemy import select
+
+from cognee.modules.data.models.Data import Data
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.users.models import User
+
+
+async def get_unique_data_id(data_identifier: str, user: User) -> UUID:
+    """
+    Function returns a unique UUID for data based on data identifier, user id and tenant id.
+    If data with legacy ID exists, return that ID to maintain compatibility.
+
+    Args:
+        data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+        user: User object adding the data
+        tenant_id: UUID of the tenant for which data is being added
+
+    Returns:
+        UUID: Unique identifier for the data
+    """
+
+    def _get_deprecated_unique_data_id(data_identifier: str, user: User) -> UUID:
+        """
+        Deprecated function, returns a unique UUID for data based on data identifier and user id.
+        Needed to support legacy data without tenant information.
+        Args:
+            data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+            user: User object adding the data
+
+        Returns:
+            UUID: Unique identifier for the data
+        """
+        # return UUID hash of file contents + owner id + tenant_id
+        return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}")
+
+    def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID:
+        """
+        Function returns a unique UUID for data based on data identifier, user id and tenant id.
+        Args:
+            data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+            user: User object adding the data
+            tenant_id: UUID of the tenant for which data is being added
+
+        Returns:
+            UUID: Unique identifier for the data
+        """
+        # return UUID hash of file contents + owner id + tenant_id
+        return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}")
+
+    # Get all possible data_id values
+    data_id = {
+        "modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user),
+        "legacy_data_id": _get_deprecated_unique_data_id(
+            data_identifier=data_identifier, user=user
+        ),
+    }
+
+    # Check if data item with legacy_data_id exists, if so use that one, else use modern_data_id
+    db_engine = get_relational_engine()
+    async with db_engine.get_async_session() as session:
+        legacy_data_point = (
+            await session.execute(select(Data).filter(Data.id == data_id["legacy_data_id"]))
+        ).scalar_one_or_none()
+
+        if not legacy_data_point:
+            return data_id["modern_data_id"]
+        return data_id["legacy_data_id"]
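The legacy/modern split comes down to whether the tenant id participates in the deterministic uuid5 hash. A standalone illustration with made-up identifiers:

from uuid import NAMESPACE_OID, uuid5

content_hash = "abc123"                               # e.g. a file content hash
user_id = "8f2c1c0e-0000-0000-0000-000000000001"      # illustrative user id
tenant_id = "5d1a2b3c-0000-0000-0000-000000000002"    # illustrative tenant id

legacy_id = uuid5(NAMESPACE_OID, f"{content_hash}{user_id}")
modern_id = uuid5(NAMESPACE_OID, f"{content_hash}{user_id}{tenant_id}")

# Same inputs always hash to the same UUID, but the two schemes differ,
# which is why the lookup above prefers an existing legacy ID.
print(legacy_id != modern_id)  # True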
cognee/modules/data/methods/get_unique_dataset_id.py
CHANGED
@@ -1,9 +1,71 @@
 from uuid import UUID, uuid5, NAMESPACE_OID
-from cognee.modules.users.models import User
 from typing import Union
+from sqlalchemy import select
+
+from cognee.modules.data.models.Dataset import Dataset
+from cognee.modules.users.models import User
+from cognee.infrastructure.databases.relational import get_relational_engine


 async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
-
-
-
+    """
+    Function returns a unique UUID for dataset based on dataset name, user id and tenant id.
+    If dataset with legacy ID exists, return that ID to maintain compatibility.
+
+    Args:
+        dataset_name: string representing the dataset name
+        user: User object adding the dataset
+        tenant_id: UUID of the tenant for which dataset is being added
+
+    Returns:
+        UUID: Unique identifier for the dataset
+    """
+
+    def _get_legacy_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+        """
+        Legacy function, returns a unique UUID for dataset based on dataset name and user id.
+        Needed to support legacy datasets without tenant information.
+        Args:
+            dataset_name: string representing the dataset name
+            user: Current User object adding the dataset
+
+        Returns:
+            UUID: Unique identifier for the dataset
+        """
+        if isinstance(dataset_name, UUID):
+            return dataset_name
+        return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
+
+    def _get_modern_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+        """
+        Returns a unique UUID for dataset based on dataset name, user id and tenant_id.
+        Args:
+            dataset_name: string representing the dataset name
+            user: Current User object adding the dataset
+            tenant_id: UUID of the tenant for which dataset is being added
+
+        Returns:
+            UUID: Unique identifier for the dataset
+        """
+        if isinstance(dataset_name, UUID):
+            return dataset_name
+        return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}")
+
+    # Get all possible dataset_id values
+    dataset_id = {
+        "modern_dataset_id": _get_modern_unique_dataset_id(dataset_name=dataset_name, user=user),
+        "legacy_dataset_id": _get_legacy_unique_dataset_id(dataset_name=dataset_name, user=user),
+    }
+
+    # Check if dataset with legacy_dataset_id exists, if so use that one, else use modern_dataset_id
+    db_engine = get_relational_engine()
+    async with db_engine.get_async_session() as session:
+        legacy_dataset = (
+            await session.execute(
+                select(Dataset).filter(Dataset.id == dataset_id["legacy_dataset_id"])
+            )
+        ).scalar_one_or_none()
+
+        if not legacy_dataset:
+            return dataset_id["modern_dataset_id"]
+        return dataset_id["legacy_dataset_id"]
cognee/modules/data/models/Dataset.py
CHANGED
@@ -18,6 +18,7 @@ class Dataset(Base):
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

     owner_id = Column(UUID, index=True)
+    tenant_id = Column(UUID, index=True, nullable=True)

     acls = relationship("ACL", back_populates="dataset", cascade="all, delete-orphan")

@@ -36,5 +37,6 @@ class Dataset(Base):
             "createdAt": self.created_at.isoformat(),
             "updatedAt": self.updated_at.isoformat() if self.updated_at else None,
             "ownerId": str(self.owner_id),
+            "tenantId": str(self.tenant_id),
             "data": [data.to_json() for data in self.data],
         }