cognee 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +1 -0
- cognee/api/client.py +9 -5
- cognee/api/v1/add/add.py +2 -1
- cognee/api/v1/add/routers/get_add_router.py +3 -1
- cognee/api/v1/cognify/cognify.py +24 -16
- cognee/api/v1/cognify/routers/__init__.py +0 -1
- cognee/api/v1/cognify/routers/get_cognify_router.py +30 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
- cognee/api/v1/ontologies/__init__.py +4 -0
- cognee/api/v1/ontologies/ontologies.py +158 -0
- cognee/api/v1/ontologies/routers/__init__.py +0 -0
- cognee/api/v1/ontologies/routers/get_ontology_router.py +109 -0
- cognee/api/v1/permissions/routers/get_permissions_router.py +41 -1
- cognee/api/v1/search/search.py +4 -0
- cognee/api/v1/ui/node_setup.py +360 -0
- cognee/api/v1/ui/npm_utils.py +50 -0
- cognee/api/v1/ui/ui.py +38 -68
- cognee/cli/commands/cognify_command.py +8 -1
- cognee/cli/config.py +1 -1
- cognee/context_global_variables.py +86 -9
- cognee/eval_framework/Dockerfile +29 -0
- cognee/eval_framework/answer_generation/answer_generation_executor.py +10 -0
- cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
- cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +0 -2
- cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
- cognee/eval_framework/eval_config.py +2 -2
- cognee/eval_framework/modal_run_eval.py +16 -28
- cognee/infrastructure/databases/cache/config.py +3 -1
- cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +151 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +20 -10
- cognee/infrastructure/databases/dataset_database_handler/__init__.py +3 -0
- cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +80 -0
- cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +18 -0
- cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/exceptions/exceptions.py +16 -0
- cognee/infrastructure/databases/graph/config.py +7 -0
- cognee/infrastructure/databases/graph/get_graph_engine.py +3 -0
- cognee/infrastructure/databases/graph/graph_db_interface.py +15 -0
- cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +81 -0
- cognee/infrastructure/databases/graph/kuzu/adapter.py +228 -0
- cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +168 -0
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +80 -1
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +9 -0
- cognee/infrastructure/databases/utils/__init__.py +3 -0
- cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +66 -18
- cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +10 -0
- cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +30 -0
- cognee/infrastructure/databases/vector/config.py +5 -0
- cognee/infrastructure/databases/vector/create_vector_engine.py +6 -1
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +8 -6
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +9 -7
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -13
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +2 -0
- cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +50 -0
- cognee/infrastructure/databases/vector/vector_db_interface.py +35 -0
- cognee/infrastructure/engine/models/Edge.py +13 -1
- cognee/infrastructure/files/storage/s3_config.py +2 -0
- cognee/infrastructure/files/utils/guess_file_type.py +4 -0
- cognee/infrastructure/llm/LLMGateway.py +5 -2
- cognee/infrastructure/llm/config.py +37 -0
- cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +23 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +22 -18
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +5 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +153 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +47 -38
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +46 -37
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +20 -10
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +23 -11
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +36 -23
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +47 -36
- cognee/infrastructure/loaders/LoaderEngine.py +1 -0
- cognee/infrastructure/loaders/core/__init__.py +2 -1
- cognee/infrastructure/loaders/core/csv_loader.py +93 -0
- cognee/infrastructure/loaders/core/text_loader.py +1 -2
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +0 -9
- cognee/infrastructure/loaders/supported_loaders.py +2 -1
- cognee/memify_pipelines/create_triplet_embeddings.py +53 -0
- cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +55 -0
- cognee/modules/chunking/CsvChunker.py +35 -0
- cognee/modules/chunking/models/DocumentChunk.py +2 -1
- cognee/modules/chunking/text_chunker_with_overlap.py +124 -0
- cognee/modules/cognify/config.py +2 -0
- cognee/modules/data/deletion/prune_system.py +52 -2
- cognee/modules/data/methods/__init__.py +1 -0
- cognee/modules/data/methods/create_dataset.py +4 -2
- cognee/modules/data/methods/delete_dataset.py +26 -0
- cognee/modules/data/methods/get_dataset_ids.py +5 -1
- cognee/modules/data/methods/get_unique_data_id.py +68 -0
- cognee/modules/data/methods/get_unique_dataset_id.py +66 -4
- cognee/modules/data/models/Dataset.py +2 -0
- cognee/modules/data/processing/document_types/CsvDocument.py +33 -0
- cognee/modules/data/processing/document_types/__init__.py +1 -0
- cognee/modules/engine/models/Triplet.py +9 -0
- cognee/modules/engine/models/__init__.py +1 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +89 -39
- cognee/modules/graph/cognee_graph/CogneeGraphElements.py +8 -3
- cognee/modules/graph/utils/expand_with_nodes_and_edges.py +19 -2
- cognee/modules/graph/utils/resolve_edges_to_text.py +48 -49
- cognee/modules/ingestion/identify.py +4 -4
- cognee/modules/memify/memify.py +1 -7
- cognee/modules/notebooks/operations/run_in_local_sandbox.py +3 -0
- cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +55 -23
- cognee/modules/pipelines/operations/pipeline.py +18 -2
- cognee/modules/pipelines/operations/run_tasks_data_item.py +1 -1
- cognee/modules/retrieval/EntityCompletionRetriever.py +10 -3
- cognee/modules/retrieval/__init__.py +1 -1
- cognee/modules/retrieval/base_graph_retriever.py +7 -3
- cognee/modules/retrieval/base_retriever.py +7 -3
- cognee/modules/retrieval/completion_retriever.py +11 -4
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +10 -2
- cognee/modules/retrieval/graph_completion_cot_retriever.py +18 -51
- cognee/modules/retrieval/graph_completion_retriever.py +14 -1
- cognee/modules/retrieval/graph_summary_completion_retriever.py +4 -0
- cognee/modules/retrieval/register_retriever.py +10 -0
- cognee/modules/retrieval/registered_community_retrievers.py +1 -0
- cognee/modules/retrieval/temporal_retriever.py +13 -2
- cognee/modules/retrieval/triplet_retriever.py +182 -0
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +43 -11
- cognee/modules/retrieval/utils/completion.py +2 -22
- cognee/modules/run_custom_pipeline/__init__.py +1 -0
- cognee/modules/run_custom_pipeline/run_custom_pipeline.py +76 -0
- cognee/modules/search/methods/get_search_type_tools.py +54 -8
- cognee/modules/search/methods/no_access_control_search.py +4 -0
- cognee/modules/search/methods/search.py +26 -3
- cognee/modules/search/types/SearchType.py +1 -1
- cognee/modules/settings/get_settings.py +19 -0
- cognee/modules/users/methods/create_user.py +12 -27
- cognee/modules/users/methods/get_authenticated_user.py +3 -2
- cognee/modules/users/methods/get_default_user.py +4 -2
- cognee/modules/users/methods/get_user.py +1 -1
- cognee/modules/users/methods/get_user_by_email.py +1 -1
- cognee/modules/users/models/DatasetDatabase.py +24 -3
- cognee/modules/users/models/Tenant.py +6 -7
- cognee/modules/users/models/User.py +6 -5
- cognee/modules/users/models/UserTenant.py +12 -0
- cognee/modules/users/models/__init__.py +1 -0
- cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +13 -13
- cognee/modules/users/roles/methods/add_user_to_role.py +3 -1
- cognee/modules/users/tenants/methods/__init__.py +1 -0
- cognee/modules/users/tenants/methods/add_user_to_tenant.py +21 -12
- cognee/modules/users/tenants/methods/create_tenant.py +22 -8
- cognee/modules/users/tenants/methods/select_tenant.py +62 -0
- cognee/shared/logging_utils.py +6 -0
- cognee/shared/rate_limiting.py +30 -0
- cognee/tasks/chunks/__init__.py +1 -0
- cognee/tasks/chunks/chunk_by_row.py +94 -0
- cognee/tasks/documents/__init__.py +0 -1
- cognee/tasks/documents/classify_documents.py +2 -0
- cognee/tasks/feedback/generate_improved_answers.py +3 -3
- cognee/tasks/graph/extract_graph_from_data.py +9 -10
- cognee/tasks/ingestion/ingest_data.py +1 -1
- cognee/tasks/memify/__init__.py +2 -0
- cognee/tasks/memify/cognify_session.py +41 -0
- cognee/tasks/memify/extract_user_sessions.py +73 -0
- cognee/tasks/memify/get_triplet_datapoints.py +289 -0
- cognee/tasks/storage/add_data_points.py +142 -2
- cognee/tasks/storage/index_data_points.py +33 -22
- cognee/tasks/storage/index_graph_edges.py +37 -57
- cognee/tests/integration/documents/CsvDocument_test.py +70 -0
- cognee/tests/integration/retrieval/test_triplet_retriever.py +84 -0
- cognee/tests/integration/tasks/test_add_data_points.py +139 -0
- cognee/tests/integration/tasks/test_get_triplet_datapoints.py +69 -0
- cognee/tests/tasks/entity_extraction/entity_extraction_test.py +1 -1
- cognee/tests/test_add_docling_document.py +2 -2
- cognee/tests/test_cognee_server_start.py +84 -3
- cognee/tests/test_conversation_history.py +68 -5
- cognee/tests/test_data/example_with_header.csv +3 -0
- cognee/tests/test_dataset_database_handler.py +137 -0
- cognee/tests/test_dataset_delete.py +76 -0
- cognee/tests/test_edge_centered_payload.py +170 -0
- cognee/tests/test_edge_ingestion.py +27 -0
- cognee/tests/test_feedback_enrichment.py +1 -1
- cognee/tests/test_library.py +6 -4
- cognee/tests/test_load.py +62 -0
- cognee/tests/test_multi_tenancy.py +165 -0
- cognee/tests/test_parallel_databases.py +2 -0
- cognee/tests/test_pipeline_cache.py +164 -0
- cognee/tests/test_relational_db_migration.py +54 -2
- cognee/tests/test_search_db.py +44 -2
- cognee/tests/unit/api/test_conditional_authentication_endpoints.py +12 -3
- cognee/tests/unit/api/test_ontology_endpoint.py +252 -0
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +5 -0
- cognee/tests/unit/infrastructure/databases/test_index_data_points.py +27 -0
- cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +14 -16
- cognee/tests/unit/infrastructure/llm/test_llm_config.py +46 -0
- cognee/tests/unit/infrastructure/mock_embedding_engine.py +3 -7
- cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +0 -5
- cognee/tests/unit/modules/chunking/test_text_chunker.py +248 -0
- cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py +324 -0
- cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
- cognee/tests/unit/modules/graph/cognee_graph_test.py +406 -0
- cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +111 -0
- cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py +175 -0
- cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +214 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -51
- cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +1 -0
- cognee/tests/unit/modules/retrieval/structured_output_test.py +204 -0
- cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +1 -1
- cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +0 -1
- cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +608 -0
- cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +83 -0
- cognee/tests/unit/modules/users/test_conditional_authentication.py +0 -63
- cognee/tests/unit/processing/chunks/chunk_by_row_test.py +52 -0
- cognee/tests/unit/tasks/storage/test_add_data_points.py +288 -0
- {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/METADATA +11 -7
- {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/RECORD +212 -160
- {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/entry_points.txt +0 -1
- cognee/api/v1/cognify/code_graph_pipeline.py +0 -119
- cognee/api/v1/cognify/routers/get_code_pipeline_router.py +0 -90
- cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +0 -544
- cognee/modules/retrieval/code_retriever.py +0 -232
- cognee/tasks/code/enrich_dependency_graph_checker.py +0 -35
- cognee/tasks/code/get_local_dependencies_checker.py +0 -20
- cognee/tasks/code/get_repo_dependency_graph_checker.py +0 -35
- cognee/tasks/documents/check_permissions_on_dataset.py +0 -26
- cognee/tasks/repo_processor/__init__.py +0 -2
- cognee/tasks/repo_processor/get_local_dependencies.py +0 -335
- cognee/tasks/repo_processor/get_non_code_files.py +0 -158
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +0 -243
- {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/WHEEL +0 -0
- {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.4.1.dist-info → cognee-0.5.0.dist-info}/licenses/NOTICE.md +0 -0
cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py
ADDED
@@ -0,0 +1,55 @@
+from typing import Optional, List
+
+from cognee import memify
+from cognee.context_global_variables import (
+    set_database_global_context_variables,
+    set_session_user_context_variable,
+)
+from cognee.exceptions import CogneeValidationError
+from cognee.modules.data.methods import get_authorized_existing_datasets
+from cognee.shared.logging_utils import get_logger
+from cognee.modules.pipelines.tasks.task import Task
+from cognee.modules.users.models import User
+from cognee.tasks.memify import extract_user_sessions, cognify_session
+
+
+logger = get_logger("persist_sessions_in_knowledge_graph")
+
+
+async def persist_sessions_in_knowledge_graph_pipeline(
+    user: User,
+    session_ids: Optional[List[str]] = None,
+    dataset: str = "main_dataset",
+    run_in_background: bool = False,
+):
+    await set_session_user_context_variable(user)
+    dataset_to_write = await get_authorized_existing_datasets(
+        user=user, datasets=[dataset], permission_type="write"
+    )
+
+    if not dataset_to_write:
+        raise CogneeValidationError(
+            message=f"User (id: {str(user.id)}) does not have write access to dataset: {dataset}",
+            log=False,
+        )
+
+    await set_database_global_context_variables(
+        dataset_to_write[0].id, dataset_to_write[0].owner_id
+    )
+
+    extraction_tasks = [Task(extract_user_sessions, session_ids=session_ids)]
+
+    enrichment_tasks = [
+        Task(cognify_session, dataset_id=dataset_to_write[0].id),
+    ]
+
+    result = await memify(
+        extraction_tasks=extraction_tasks,
+        enrichment_tasks=enrichment_tasks,
+        dataset=dataset_to_write[0].id,
+        data=[{}],
+        run_in_background=run_in_background,
+    )
+
+    logger.info("Session persistence pipeline completed")
+    return result
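
For orientation, a minimal sketch of how this new pipeline might be invoked. The default-user helper is an assumption here (its module appears in this release's changed files, but the re-export path is not shown in the hunk above):

import asyncio

from cognee.modules.users.methods import get_default_user  # assumed re-export of get_default_user.py
from cognee.memify_pipelines.persist_sessions_in_knowledge_graph import (
    persist_sessions_in_knowledge_graph_pipeline,
)


async def main():
    user = await get_default_user()
    # Persist every recorded session into the "main_dataset" knowledge graph.
    await persist_sessions_in_knowledge_graph_pipeline(user=user, dataset="main_dataset")


asyncio.run(main())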
cognee/modules/chunking/CsvChunker.py
ADDED
@@ -0,0 +1,35 @@
+from cognee.shared.logging_utils import get_logger
+
+
+from cognee.tasks.chunks import chunk_by_row
+from cognee.modules.chunking.Chunker import Chunker
+from .models.DocumentChunk import DocumentChunk
+
+logger = get_logger()
+
+
+class CsvChunker(Chunker):
+    async def read(self):
+        async for content_text in self.get_text():
+            if content_text is None:
+                continue
+
+            for chunk_data in chunk_by_row(content_text, self.max_chunk_size):
+                if chunk_data["chunk_size"] <= self.max_chunk_size:
+                    yield DocumentChunk(
+                        id=chunk_data["chunk_id"],
+                        text=chunk_data["text"],
+                        chunk_size=chunk_data["chunk_size"],
+                        is_part_of=self.document,
+                        chunk_index=self.chunk_index,
+                        cut_type=chunk_data["cut_type"],
+                        contains=[],
+                        metadata={
+                            "index_fields": ["text"],
+                        },
+                    )
+                    self.chunk_index += 1
+                else:
+                    raise ValueError(
+                        f"Chunk size is larger than the maximum chunk size {self.max_chunk_size}"
+                    )
cognee/modules/chunking/models/DocumentChunk.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import List, Union
 
 from cognee.infrastructure.engine import DataPoint
+from cognee.infrastructure.engine.models.Edge import Edge
 from cognee.modules.data.processing.document_types import Document
 from cognee.modules.engine.models import Entity
 from cognee.tasks.temporal_graph.models import Event
@@ -31,6 +32,6 @@ class DocumentChunk(DataPoint):
     chunk_index: int
     cut_type: str
     is_part_of: Document
-    contains: List[Union[Entity, Event]] = None
+    contains: List[Union[Entity, Event, tuple[Edge, Entity]]] = None
 
     metadata: dict = {"index_fields": ["text"]}
cognee/modules/chunking/text_chunker_with_overlap.py
ADDED
@@ -0,0 +1,124 @@
+from cognee.shared.logging_utils import get_logger
+from uuid import NAMESPACE_OID, uuid5
+
+from cognee.tasks.chunks import chunk_by_paragraph
+from cognee.modules.chunking.Chunker import Chunker
+from .models.DocumentChunk import DocumentChunk
+
+logger = get_logger()
+
+
+class TextChunkerWithOverlap(Chunker):
+    def __init__(
+        self,
+        document,
+        get_text: callable,
+        max_chunk_size: int,
+        chunk_overlap_ratio: float = 0.0,
+        get_chunk_data: callable = None,
+    ):
+        super().__init__(document, get_text, max_chunk_size)
+        self._accumulated_chunk_data = []
+        self._accumulated_size = 0
+        self.chunk_overlap_ratio = chunk_overlap_ratio
+        self.chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)
+
+        if get_chunk_data is not None:
+            self.get_chunk_data = get_chunk_data
+        elif chunk_overlap_ratio > 0:
+            paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)
+            self.get_chunk_data = lambda text: chunk_by_paragraph(
+                text, paragraph_max_size, batch_paragraphs=True
+            )
+        else:
+            self.get_chunk_data = lambda text: chunk_by_paragraph(
+                text, self.max_chunk_size, batch_paragraphs=True
+            )
+
+    def _accumulation_overflows(self, chunk_data):
+        """Check if adding chunk_data would exceed max_chunk_size."""
+        return self._accumulated_size + chunk_data["chunk_size"] > self.max_chunk_size
+
+    def _accumulate_chunk_data(self, chunk_data):
+        """Add chunk_data to the current accumulation."""
+        self._accumulated_chunk_data.append(chunk_data)
+        self._accumulated_size += chunk_data["chunk_size"]
+
+    def _clear_accumulation(self):
+        """Reset accumulation, keeping overlap chunk_data based on chunk_overlap_ratio."""
+        if self.chunk_overlap == 0:
+            self._accumulated_chunk_data = []
+            self._accumulated_size = 0
+            return
+
+        # Keep chunk_data from the end that fit in overlap
+        overlap_chunk_data = []
+        overlap_size = 0
+
+        for chunk_data in reversed(self._accumulated_chunk_data):
+            if overlap_size + chunk_data["chunk_size"] <= self.chunk_overlap:
+                overlap_chunk_data.insert(0, chunk_data)
+                overlap_size += chunk_data["chunk_size"]
+            else:
+                break
+
+        self._accumulated_chunk_data = overlap_chunk_data
+        self._accumulated_size = overlap_size
+
+    def _create_chunk(self, text, size, cut_type, chunk_id=None):
+        """Create a DocumentChunk with standard metadata."""
+        try:
+            return DocumentChunk(
+                id=chunk_id or uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
+                text=text,
+                chunk_size=size,
+                is_part_of=self.document,
+                chunk_index=self.chunk_index,
+                cut_type=cut_type,
+                contains=[],
+                metadata={"index_fields": ["text"]},
+            )
+        except Exception as e:
+            logger.error(e)
+            raise e
+
+    def _create_chunk_from_accumulation(self):
+        """Create a DocumentChunk from current accumulated chunk_data."""
+        chunk_text = " ".join(chunk["text"] for chunk in self._accumulated_chunk_data)
+        return self._create_chunk(
+            text=chunk_text,
+            size=self._accumulated_size,
+            cut_type=self._accumulated_chunk_data[-1]["cut_type"],
+        )
+
+    def _emit_chunk(self, chunk_data):
+        """Emit a chunk when accumulation overflows."""
+        if len(self._accumulated_chunk_data) > 0:
+            chunk = self._create_chunk_from_accumulation()
+            self._clear_accumulation()
+            self._accumulate_chunk_data(chunk_data)
+        else:
+            # Handle single chunk_data exceeding max_chunk_size
+            chunk = self._create_chunk(
+                text=chunk_data["text"],
+                size=chunk_data["chunk_size"],
+                cut_type=chunk_data["cut_type"],
+                chunk_id=chunk_data["chunk_id"],
+            )
+
+        self.chunk_index += 1
+        return chunk
+
+    async def read(self):
+        async for content_text in self.get_text():
+            for chunk_data in self.get_chunk_data(content_text):
+                if not self._accumulation_overflows(chunk_data):
+                    self._accumulate_chunk_data(chunk_data)
+                    continue
+
+                yield self._emit_chunk(chunk_data)
+
+        if len(self._accumulated_chunk_data) == 0:
+            return
+
+        yield self._create_chunk_from_accumulation()
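
To make the overlap bookkeeping above concrete, a small standalone calculation (illustrative values, not taken from the package) using the same formulas as __init__:

max_chunk_size = 1024
chunk_overlap_ratio = 0.2

# Mirrors TextChunkerWithOverlap.__init__ above.
chunk_overlap = int(max_chunk_size * chunk_overlap_ratio)  # 204: tail units carried into the next chunk
paragraph_max_size = int(0.5 * chunk_overlap_ratio * max_chunk_size)  # 102: paragraph size when overlap is enabled

print(chunk_overlap, paragraph_max_size)  # 204 102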
cognee/modules/cognify/config.py
CHANGED
@@ -8,12 +8,14 @@ import os
 class CognifyConfig(BaseSettings):
     classification_model: object = DefaultContentPrediction
     summarization_model: object = SummarizedContent
+    triplet_embedding: bool = False
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
     def to_dict(self) -> dict:
         return {
             "classification_model": self.classification_model,
             "summarization_model": self.summarization_model,
+            "triplet_embedding": self.triplet_embedding,
         }
 
 
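
Since CognifyConfig is a pydantic BaseSettings class reading from .env, the new flag can presumably be toggled through the environment. A sketch assuming the standard pydantic-settings field-to-env mapping:

import os

os.environ["TRIPLET_EMBEDDING"] = "true"  # or put TRIPLET_EMBEDDING=true in .env

from cognee.modules.cognify.config import CognifyConfig

config = CognifyConfig()
print(config.to_dict()["triplet_embedding"])  # True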
cognee/modules/data/deletion/prune_system.py
CHANGED
@@ -1,17 +1,67 @@
+from sqlalchemy.exc import OperationalError
+
+from cognee.infrastructure.databases.exceptions import EntityNotFoundError
+from cognee.context_global_variables import backend_access_control_enabled
 from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
 from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.infrastructure.databases.utils import (
+    get_graph_dataset_database_handler,
+    get_vector_dataset_database_handler,
+)
 from cognee.shared.cache import delete_cache
+from cognee.modules.users.models import DatasetDatabase
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger()
+
+
+async def prune_graph_databases():
+    db_engine = get_relational_engine()
+    try:
+        dataset_databases = await db_engine.get_all_data_from_table("dataset_database")
+        # Go through each dataset database and delete the graph database
+        for dataset_database in dataset_databases:
+            handler = get_graph_dataset_database_handler(dataset_database)
+            await handler["handler_instance"].delete_dataset(dataset_database)
+    except (OperationalError, EntityNotFoundError) as e:
+        logger.debug(
+            "Skipping pruning of graph DB. Error when accessing dataset_database table: %s",
+            e,
+        )
+        return
+
+
+async def prune_vector_databases():
+    db_engine = get_relational_engine()
+    try:
+        dataset_databases = await db_engine.get_all_data_from_table("dataset_database")
+        # Go through each dataset database and delete the vector database
+        for dataset_database in dataset_databases:
+            handler = get_vector_dataset_database_handler(dataset_database)
+            await handler["handler_instance"].delete_dataset(dataset_database)
+    except (OperationalError, EntityNotFoundError) as e:
+        logger.debug(
+            "Skipping pruning of vector DB. Error when accessing dataset_database table: %s",
+            e,
+        )
+        return
 
 
 async def prune_system(graph=True, vector=True, metadata=True, cache=True):
-    if graph:
+    # Note: prune system should not be available through the API, it has no permission checks and will
+    # delete all graph and vector databases if called. It should only be used in development or testing environments.
+    if graph and not backend_access_control_enabled():
         graph_engine = await get_graph_engine()
         await graph_engine.delete_graph()
+    elif graph and backend_access_control_enabled():
+        await prune_graph_databases()
 
-    if vector:
+    if vector and not backend_access_control_enabled():
         vector_engine = get_vector_engine()
         await vector_engine.prune()
+    elif vector and backend_access_control_enabled():
+        await prune_vector_databases()
 
     if metadata:
         db_engine = get_relational_engine()
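
A development-only sketch of this entry point; as the new comment stresses, prune_system wipes all graph and vector data and should not be exposed through the API:

import asyncio

from cognee.modules.data.deletion.prune_system import prune_system


async def reset_everything():
    # With backend access control enabled, per-dataset databases are removed via the new
    # dataset database handlers; otherwise the single global graph and vector stores are wiped.
    await prune_system(graph=True, vector=True, metadata=True, cache=True)


asyncio.run(reset_everything())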
cognee/modules/data/methods/__init__.py
CHANGED
@@ -10,6 +10,7 @@ from .get_authorized_dataset import get_authorized_dataset
 from .get_authorized_dataset_by_name import get_authorized_dataset_by_name
 from .get_data import get_data
 from .get_unique_dataset_id import get_unique_dataset_id
+from .get_unique_data_id import get_unique_data_id
 from .get_authorized_existing_datasets import get_authorized_existing_datasets
 from .get_dataset_ids import get_dataset_ids
 
cognee/modules/data/methods/create_dataset.py
CHANGED
@@ -16,14 +16,16 @@ async def create_dataset(dataset_name: str, user: User, session: AsyncSession) -
             .options(joinedload(Dataset.data))
             .filter(Dataset.name == dataset_name)
             .filter(Dataset.owner_id == owner_id)
+            .filter(Dataset.tenant_id == user.tenant_id)
         )
     ).first()
 
     if dataset is None:
         # Dataset id should be generated based on dataset_name and owner_id/user so multiple users can use the same dataset_name
         dataset_id = await get_unique_dataset_id(dataset_name=dataset_name, user=user)
-        dataset = Dataset(
-
+        dataset = Dataset(
+            id=dataset_id, name=dataset_name, data=[], owner_id=owner_id, tenant_id=user.tenant_id
+        )
 
         session.add(dataset)
 
cognee/modules/data/methods/delete_dataset.py
CHANGED
@@ -1,8 +1,34 @@
+from cognee.modules.users.models import DatasetDatabase
+from sqlalchemy import select
+
 from cognee.modules.data.models import Dataset
+from cognee.infrastructure.databases.utils.get_vector_dataset_database_handler import (
+    get_vector_dataset_database_handler,
+)
+from cognee.infrastructure.databases.utils.get_graph_dataset_database_handler import (
+    get_graph_dataset_database_handler,
+)
 from cognee.infrastructure.databases.relational import get_relational_engine
 
 
 async def delete_dataset(dataset: Dataset):
     db_engine = get_relational_engine()
 
+    async with db_engine.get_async_session() as session:
+        stmt = select(DatasetDatabase).where(
+            DatasetDatabase.dataset_id == dataset.id,
+        )
+        dataset_database: DatasetDatabase = await session.scalar(stmt)
+        if dataset_database:
+            graph_dataset_database_handler = get_graph_dataset_database_handler(dataset_database)
+            vector_dataset_database_handler = get_vector_dataset_database_handler(dataset_database)
+            await graph_dataset_database_handler["handler_instance"].delete_dataset(
+                dataset_database
+            )
+            await vector_dataset_database_handler["handler_instance"].delete_dataset(
+                dataset_database
+            )
+    # TODO: Remove dataset from pipeline_run_status in Data objects related to dataset as well
+    # This blocks recreation of the dataset with the same name and data after deletion as
+    # it's marked as completed and will be just skipped even though it's empty.
     return await db_engine.delete_entity_by_id(dataset.__tablename__, dataset.id)
cognee/modules/data/methods/get_dataset_ids.py
CHANGED
@@ -27,7 +27,11 @@ async def get_dataset_ids(datasets: Union[list[str], list[UUID]], user):
         # Get all user owned dataset objects (If a user wants to write to a dataset he is not the owner of it must be provided through UUID.)
         user_datasets = await get_datasets(user.id)
         # Filter out non name mentioned datasets
-        dataset_ids = [dataset
+        dataset_ids = [dataset for dataset in user_datasets if dataset.name in datasets]
+        # Filter out non current tenant datasets
+        dataset_ids = [
+            dataset.id for dataset in dataset_ids if dataset.tenant_id == user.tenant_id
+        ]
     else:
         raise DatasetTypeError(
             f"One or more of the provided dataset types is not handled: f{datasets}"
cognee/modules/data/methods/get_unique_data_id.py
ADDED
@@ -0,0 +1,68 @@
+from uuid import uuid5, NAMESPACE_OID, UUID
+from sqlalchemy import select
+
+from cognee.modules.data.models.Data import Data
+from cognee.infrastructure.databases.relational import get_relational_engine
+from cognee.modules.users.models import User
+
+
+async def get_unique_data_id(data_identifier: str, user: User) -> UUID:
+    """
+    Function returns a unique UUID for data based on data identifier, user id and tenant id.
+    If data with legacy ID exists, return that ID to maintain compatibility.
+
+    Args:
+        data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+        user: User object adding the data
+        tenant_id: UUID of the tenant for which data is being added
+
+    Returns:
+        UUID: Unique identifier for the data
+    """
+
+    def _get_deprecated_unique_data_id(data_identifier: str, user: User) -> UUID:
+        """
+        Deprecated function, returns a unique UUID for data based on data identifier and user id.
+        Needed to support legacy data without tenant information.
+        Args:
+            data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+            user: User object adding the data
+
+        Returns:
+            UUID: Unique identifier for the data
+        """
+        # return UUID hash of file contents + owner id + tenant_id
+        return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}")
+
+    def _get_modern_unique_data_id(data_identifier: str, user: User) -> UUID:
+        """
+        Function returns a unique UUID for data based on data identifier, user id and tenant id.
+        Args:
+            data_identifier: A way to uniquely identify data (e.g. file hash, data name, etc.)
+            user: User object adding the data
+            tenant_id: UUID of the tenant for which data is being added
+
+        Returns:
+            UUID: Unique identifier for the data
+        """
+        # return UUID hash of file contents + owner id + tenant_id
+        return uuid5(NAMESPACE_OID, f"{data_identifier}{str(user.id)}{str(user.tenant_id)}")
+
+    # Get all possible data_id values
+    data_id = {
+        "modern_data_id": _get_modern_unique_data_id(data_identifier=data_identifier, user=user),
+        "legacy_data_id": _get_deprecated_unique_data_id(
+            data_identifier=data_identifier, user=user
+        ),
+    }
+
+    # Check if data item with legacy_data_id exists, if so use that one, else use modern_data_id
+    db_engine = get_relational_engine()
+    async with db_engine.get_async_session() as session:
+        legacy_data_point = (
+            await session.execute(select(Data).filter(Data.id == data_id["legacy_data_id"]))
+        ).scalar_one_or_none()
+
+        if not legacy_data_point:
+            return data_id["modern_data_id"]
+        return data_id["legacy_data_id"]
cognee/modules/data/methods/get_unique_dataset_id.py
CHANGED
@@ -1,9 +1,71 @@
 from uuid import UUID, uuid5, NAMESPACE_OID
-from cognee.modules.users.models import User
 from typing import Union
+from sqlalchemy import select
+
+from cognee.modules.data.models.Dataset import Dataset
+from cognee.modules.users.models import User
+from cognee.infrastructure.databases.relational import get_relational_engine
 
 
 async def get_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
-    if isinstance(dataset_name, UUID):
-        return dataset_name
-    return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
+    """
+    Function returns a unique UUID for dataset based on dataset name, user id and tenant id.
+    If dataset with legacy ID exists, return that ID to maintain compatibility.
+
+    Args:
+        dataset_name: string representing the dataset name
+        user: User object adding the dataset
+        tenant_id: UUID of the tenant for which dataset is being added
+
+    Returns:
+        UUID: Unique identifier for the dataset
+    """
+
+    def _get_legacy_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+        """
+        Legacy function, returns a unique UUID for dataset based on dataset name and user id.
+        Needed to support legacy datasets without tenant information.
+        Args:
+            dataset_name: string representing the dataset name
+            user: Current User object adding the dataset
+
+        Returns:
+            UUID: Unique identifier for the dataset
+        """
+        if isinstance(dataset_name, UUID):
+            return dataset_name
+        return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}")
+
+    def _get_modern_unique_dataset_id(dataset_name: Union[str, UUID], user: User) -> UUID:
+        """
+        Returns a unique UUID for dataset based on dataset name, user id and tenant_id.
+        Args:
+            dataset_name: string representing the dataset name
+            user: Current User object adding the dataset
+            tenant_id: UUID of the tenant for which dataset is being added
+
+        Returns:
+            UUID: Unique identifier for the dataset
+        """
+        if isinstance(dataset_name, UUID):
+            return dataset_name
+        return uuid5(NAMESPACE_OID, f"{dataset_name}{str(user.id)}{str(user.tenant_id)}")
+
+    # Get all possible dataset_id values
+    dataset_id = {
+        "modern_dataset_id": _get_modern_unique_dataset_id(dataset_name=dataset_name, user=user),
+        "legacy_dataset_id": _get_legacy_unique_dataset_id(dataset_name=dataset_name, user=user),
+    }
+
+    # Check if dataset with legacy_dataset_id exists, if so use that one, else use modern_dataset_id
+    db_engine = get_relational_engine()
+    async with db_engine.get_async_session() as session:
+        legacy_dataset = (
+            await session.execute(
+                select(Dataset).filter(Dataset.id == dataset_id["legacy_dataset_id"])
+            )
+        ).scalar_one_or_none()
+
+        if not legacy_dataset:
+            return dataset_id["modern_dataset_id"]
+        return dataset_id["legacy_dataset_id"]
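
The legacy/modern split above is plain uuid5 derivation; a standalone illustration with hypothetical IDs (no database involved):

from uuid import NAMESPACE_OID, uuid5

user_id = "2d3f6a9c-8a1b-4f3e-9c2d-7b5e4a1f0c88"    # hypothetical user id
tenant_id = "6f1e2d3c-4b5a-6978-8a9b-0c1d2e3f4a5b"  # hypothetical tenant id

legacy_id = uuid5(NAMESPACE_OID, f"my_dataset{user_id}")
modern_id = uuid5(NAMESPACE_OID, f"my_dataset{user_id}{tenant_id}")

# get_unique_dataset_id first checks whether a Dataset row exists under legacy_id and only
# falls back to the tenant-aware modern_id when it does not.
print(legacy_id != modern_id)  # True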
cognee/modules/data/models/Dataset.py
CHANGED
@@ -18,6 +18,7 @@ class Dataset(Base):
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
 
     owner_id = Column(UUID, index=True)
+    tenant_id = Column(UUID, index=True, nullable=True)
 
     acls = relationship("ACL", back_populates="dataset", cascade="all, delete-orphan")
 
@@ -36,5 +37,6 @@
             "createdAt": self.created_at.isoformat(),
             "updatedAt": self.updated_at.isoformat() if self.updated_at else None,
             "ownerId": str(self.owner_id),
+            "tenantId": str(self.tenant_id),
             "data": [data.to_json() for data in self.data],
         }
cognee/modules/data/processing/document_types/CsvDocument.py
ADDED
@@ -0,0 +1,33 @@
+import io
+import csv
+from typing import Type
+
+from cognee.modules.chunking.Chunker import Chunker
+from cognee.infrastructure.files.utils.open_data_file import open_data_file
+from .Document import Document
+
+
+class CsvDocument(Document):
+    type: str = "csv"
+    mime_type: str = "text/csv"
+
+    async def read(self, chunker_cls: Type[Chunker], max_chunk_size: int):
+        async def get_text():
+            async with open_data_file(
+                self.raw_data_location, mode="r", encoding="utf-8", newline=""
+            ) as file:
+                content = file.read()
+                file_like_obj = io.StringIO(content)
+                reader = csv.DictReader(file_like_obj)
+
+                for row in reader:
+                    pairs = [f"{str(k)}: {str(v)}" for k, v in row.items()]
+                    row_text = ", ".join(pairs)
+                    if not row_text.strip():
+                        break
+                    yield row_text
+
+        chunker = chunker_cls(self, max_chunk_size=max_chunk_size, get_text=get_text)
+
+        async for chunk in chunker.read():
+            yield chunk
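
A standalone illustration of the row flattening performed by get_text() above (same "key: value" shape as the loop; the sample data is made up):

import csv
import io

sample = "name,role\nAda,engineer\nGrace,admiral\n"

for row in csv.DictReader(io.StringIO(sample)):
    print(", ".join(f"{k}: {v}" for k, v in row.items()))
# name: Ada, role: engineer
# name: Grace, role: admiral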