cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/api/client.py +44 -4
- cognee/api/health.py +332 -0
- cognee/api/v1/add/add.py +5 -2
- cognee/api/v1/add/routers/get_add_router.py +3 -0
- cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
- cognee/api/v1/cognify/cognify.py +8 -0
- cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
- cognee/api/v1/config/config.py +3 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +2 -8
- cognee/api/v1/delete/delete.py +16 -12
- cognee/api/v1/responses/routers/get_responses_router.py +3 -1
- cognee/api/v1/search/search.py +10 -0
- cognee/api/v1/settings/routers/get_settings_router.py +0 -2
- cognee/base_config.py +1 -0
- cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
- cognee/infrastructure/databases/graph/config.py +2 -0
- cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
- cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
- cognee/infrastructure/databases/graph/kuzu/adapter.py +43 -16
- cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +151 -77
- cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
- cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
- cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
- cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +11 -3
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
- cognee/infrastructure/databases/vector/create_vector_engine.py +31 -23
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
- cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
- cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
- cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
- cognee/infrastructure/files/utils/guess_file_type.py +2 -2
- cognee/infrastructure/files/utils/open_data_file.py +4 -23
- cognee/infrastructure/llm/LLMGateway.py +137 -0
- cognee/infrastructure/llm/__init__.py +14 -4
- cognee/infrastructure/llm/config.py +29 -1
- cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
- cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
- cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
- cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
- cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
- cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
- cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
- cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
- cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
- cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
- cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
- cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
- cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
- cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
- cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
- cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
- cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
- cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
- cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
- cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
- cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
- cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
- cognee/infrastructure/llm/utils.py +3 -1
- cognee/infrastructure/loaders/LoaderEngine.py +156 -0
- cognee/infrastructure/loaders/LoaderInterface.py +73 -0
- cognee/infrastructure/loaders/__init__.py +18 -0
- cognee/infrastructure/loaders/core/__init__.py +7 -0
- cognee/infrastructure/loaders/core/audio_loader.py +98 -0
- cognee/infrastructure/loaders/core/image_loader.py +114 -0
- cognee/infrastructure/loaders/core/text_loader.py +90 -0
- cognee/infrastructure/loaders/create_loader_engine.py +32 -0
- cognee/infrastructure/loaders/external/__init__.py +22 -0
- cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
- cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
- cognee/infrastructure/loaders/get_loader_engine.py +18 -0
- cognee/infrastructure/loaders/supported_loaders.py +18 -0
- cognee/infrastructure/loaders/use_loader.py +21 -0
- cognee/infrastructure/loaders/utils/__init__.py +0 -0
- cognee/modules/data/methods/__init__.py +1 -0
- cognee/modules/data/methods/get_authorized_dataset.py +23 -0
- cognee/modules/data/models/Data.py +13 -3
- cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
- cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
- cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
- cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
- cognee/modules/engine/utils/generate_edge_id.py +5 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +45 -35
- cognee/modules/graph/methods/get_formatted_graph_data.py +8 -2
- cognee/modules/graph/utils/get_graph_from_model.py +93 -101
- cognee/modules/ingestion/data_types/TextData.py +8 -2
- cognee/modules/ingestion/save_data_to_file.py +1 -1
- cognee/modules/pipelines/exceptions/__init__.py +1 -0
- cognee/modules/pipelines/exceptions/exceptions.py +12 -0
- cognee/modules/pipelines/models/DataItemStatus.py +5 -0
- cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
- cognee/modules/pipelines/models/__init__.py +1 -0
- cognee/modules/pipelines/operations/pipeline.py +10 -2
- cognee/modules/pipelines/operations/run_tasks.py +252 -20
- cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
- cognee/modules/retrieval/chunks_retriever.py +23 -1
- cognee/modules/retrieval/code_retriever.py +66 -9
- cognee/modules/retrieval/completion_retriever.py +11 -9
- cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
- cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
- cognee/modules/retrieval/graph_completion_retriever.py +1 -1
- cognee/modules/retrieval/insights_retriever.py +4 -0
- cognee/modules/retrieval/natural_language_retriever.py +9 -15
- cognee/modules/retrieval/summaries_retriever.py +23 -1
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
- cognee/modules/retrieval/utils/completion.py +6 -9
- cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
- cognee/modules/search/methods/search.py +5 -1
- cognee/modules/search/operations/__init__.py +1 -0
- cognee/modules/search/operations/select_search_type.py +42 -0
- cognee/modules/search/types/SearchType.py +1 -0
- cognee/modules/settings/get_settings.py +0 -8
- cognee/modules/settings/save_vector_db_config.py +1 -1
- cognee/shared/data_models.py +3 -1
- cognee/shared/logging_utils.py +0 -5
- cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
- cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
- cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
- cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
- cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
- cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
- cognee/tasks/graph/extract_graph_from_code.py +3 -2
- cognee/tasks/graph/extract_graph_from_data.py +4 -3
- cognee/tasks/graph/infer_data_ontology.py +5 -6
- cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
- cognee/tasks/ingestion/ingest_data.py +91 -61
- cognee/tasks/ingestion/resolve_data_directories.py +3 -0
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
- cognee/tasks/storage/index_data_points.py +1 -1
- cognee/tasks/storage/index_graph_edges.py +4 -1
- cognee/tasks/summarization/summarize_code.py +2 -3
- cognee/tasks/summarization/summarize_text.py +3 -2
- cognee/tests/test_cognee_server_start.py +12 -7
- cognee/tests/test_deduplication.py +2 -2
- cognee/tests/test_deletion.py +58 -17
- cognee/tests/test_graph_visualization_permissions.py +161 -0
- cognee/tests/test_neptune_analytics_graph.py +309 -0
- cognee/tests/test_neptune_analytics_hybrid.py +176 -0
- cognee/tests/{test_weaviate.py → test_neptune_analytics_vector.py} +86 -11
- cognee/tests/test_pgvector.py +5 -5
- cognee/tests/test_s3.py +1 -6
- cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
- cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
- cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
- cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
- cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
- cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
- cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
- cognee/tests/unit/modules/search/search_methods_test.py +55 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/METADATA +13 -9
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/RECORD +203 -164
- cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
- cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
- cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
- cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
- cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
- cognee/modules/data/extraction/extract_categories.py +0 -14
- cognee/tests/test_qdrant.py +0 -99
- distributed/Dockerfile +0 -34
- distributed/app.py +0 -4
- distributed/entrypoint.py +0 -71
- distributed/entrypoint.sh +0 -5
- distributed/modal_image.py +0 -11
- distributed/queues.py +0 -5
- distributed/tasks/queued_add_data_points.py +0 -13
- distributed/tasks/queued_add_edges.py +0 -13
- distributed/tasks/queued_add_nodes.py +0 -13
- distributed/test.py +0 -28
- distributed/utils.py +0 -19
- distributed/workers/data_point_saving_worker.py +0 -93
- distributed/workers/graph_saving_worker.py +0 -104
- /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
- /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
- /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
- /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
- /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
- /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
- /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
- /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
- /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
- /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
- {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
- {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
- /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/WHEEL +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/NOTICE.md +0 -0
cognee/tasks/graph/extract_graph_from_code.py CHANGED

@@ -1,7 +1,8 @@
 import asyncio
 from typing import Type, List
 from pydantic import BaseModel
-
+
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.tasks.storage import add_data_points
 
@@ -17,7 +18,7 @@ async def extract_graph_from_code(
     - Graph nodes are stored using the `add_data_points` function for later retrieval or analysis.
     """
     chunk_graphs = await asyncio.gather(
-        *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
+        *[LLMGateway.extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
     )
 
     for chunk_index, chunk in enumerate(data_chunks):
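
Both hunks above follow the migration pattern that recurs throughout this release: call sites stop importing extraction helpers directly and instead route through the new LLMGateway facade (added in cognee/infrastructure/llm/LLMGateway.py, +137). A minimal sketch of the new call shape, assuming a hypothetical chunk list; the per-chunk call and both imports are taken verbatim from the hunks:

import asyncio

from cognee.infrastructure.llm.LLMGateway import LLMGateway
from cognee.shared.data_models import KnowledgeGraph


async def build_graphs(data_chunks, graph_model=KnowledgeGraph):
    # One extraction call per chunk, fanned out concurrently.
    return await asyncio.gather(
        *[LLMGateway.extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
    )
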
cognee/tasks/graph/extract_graph_from_data.py CHANGED

@@ -3,15 +3,15 @@ from typing import Type, List
 from pydantic import BaseModel
 
 from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.tasks.storage.add_data_points import add_data_points
 from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
-from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
 from cognee.modules.graph.utils import (
     expand_with_nodes_and_edges,
     retrieve_existing_edges,
 )
 from cognee.shared.data_models import KnowledgeGraph
-from cognee.
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
 
 
 async def integrate_chunk_graphs(
@@ -40,6 +40,7 @@ async def integrate_chunk_graphs(
 
     if len(graph_nodes) > 0:
         await add_data_points(graph_nodes)
+
     if len(graph_edges) > 0:
         await graph_engine.add_edges(graph_edges)
 
@@ -55,7 +56,7 @@ async def extract_graph_from_data(
     Extracts and integrates a knowledge graph from the text content of document chunks using a specified graph model.
     """
     chunk_graphs = await asyncio.gather(
-        *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
+        *[LLMGateway.extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
     )
 
     # Note: Filter edges with missing source or target nodes
cognee/tasks/graph/infer_data_ontology.py CHANGED

@@ -15,19 +15,19 @@ from pydantic import BaseModel
 
 from cognee.modules.graph.exceptions import EntityNotFoundError
 from cognee.modules.ingestion.exceptions import IngestionError
-
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
+
 from cognee.infrastructure.data.chunking.config import get_chunk_config
 from cognee.infrastructure.data.chunking.get_chunking_engine import get_chunk_engine
 from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
 from cognee.infrastructure.files.utils.extract_text_from_file import extract_text_from_file
 from cognee.infrastructure.files.utils.guess_file_type import guess_file_type, FileTypeException
-from cognee.modules.data.
+from cognee.modules.data.methods.add_model_class_to_graph import (
     add_model_class_to_graph,
 )
 from cognee.tasks.graph.models import NodeModel, GraphOntology
 from cognee.shared.data_models import KnowledgeGraph
 from cognee.modules.engine.utils import generate_node_id, generate_node_name
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
 
 logger = get_logger("task:infer_data_ontology")
 
@@ -52,11 +52,10 @@ async def extract_ontology(content: str, response_model: Type[BaseModel]):
 
         The structured ontology extracted from the content.
     """
-    llm_client = get_llm_client()
 
-    system_prompt = read_query_prompt("extract_ontology.txt")
+    system_prompt = LLMGateway.read_query_prompt("extract_ontology.txt")
 
-    ontology = await
+    ontology = await LLMGateway.acreate_structured_output(content, system_prompt, response_model)
 
     return ontology
 
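
The same facade also absorbs prompt loading and structured output: the old get_llm_client() plus standalone read_query_prompt pair is replaced by LLMGateway.read_query_prompt and LLMGateway.acreate_structured_output. A sketch of the new path, assuming a hypothetical Pydantic response model (both gateway calls appear verbatim in the hunk above):

from pydantic import BaseModel

from cognee.infrastructure.llm.LLMGateway import LLMGateway


class OntologyModel(BaseModel):  # hypothetical response model for illustration
    name: str
    description: str


async def extract(content: str) -> OntologyModel:
    # Load the system prompt, then ask the LLM for output parsed into the model.
    system_prompt = LLMGateway.read_query_prompt("extract_ontology.txt")
    return await LLMGateway.acreate_structured_output(content, system_prompt, OntologyModel)
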
cognee/tasks/ingestion/data_item_to_text_file.py ADDED

@@ -0,0 +1,79 @@
+import os
+from urllib.parse import urlparse
+from typing import List, Tuple
+from pathlib import Path
+import tempfile
+
+from cognee.infrastructure.loaders.LoaderInterface import LoaderInterface
+from cognee.modules.ingestion.exceptions import IngestionError
+from cognee.infrastructure.loaders import get_loader_engine
+from cognee.shared.logging_utils import get_logger
+from cognee.infrastructure.files.utils.open_data_file import open_data_file
+
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+logger = get_logger(__name__)
+
+
+class SaveDataSettings(BaseSettings):
+    accept_local_file_path: bool = True
+
+    model_config = SettingsConfigDict(env_file=".env", extra="allow")
+
+
+settings = SaveDataSettings()
+
+
+async def pull_from_s3(file_path, destination_file) -> None:
+    async with open_data_file(file_path) as file:
+        while True:
+            chunk = file.read(8192)
+            if not chunk:
+                break
+            destination_file.write(chunk)
+
+
+async def data_item_to_text_file(
+    data_item_path: str, preferred_loaders: List[str]
+) -> Tuple[str, LoaderInterface]:
+    if isinstance(data_item_path, str):
+        parsed_url = urlparse(data_item_path)
+
+        # data is s3 file path
+        if parsed_url.scheme == "s3":
+            # TODO: Rework this to work with file streams and not saving data to temp storage
+            # Note: proper suffix information is needed for OpenAI to handle mp3 files
+            path_info = Path(parsed_url.path)
+            with tempfile.NamedTemporaryFile(mode="wb", suffix=path_info.suffix) as temp_file:
+                await pull_from_s3(data_item_path, temp_file)
+                temp_file.flush()  # Data needs to be saved to local storage
+                loader = get_loader_engine()
+                return await loader.load_file(temp_file.name, preferred_loaders), loader.get_loader(
+                    temp_file.name, preferred_loaders
+                )
+
+        # data is local file path
+        elif parsed_url.scheme == "file":
+            if settings.accept_local_file_path:
+                loader = get_loader_engine()
+                return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader(
+                    data_item_path, preferred_loaders
+                )
+            else:
+                raise IngestionError(message="Local files are not accepted.")
+
+        # data is an absolute file path
+        elif data_item_path.startswith("/") or (
+            os.name == "nt" and len(data_item_path) > 1 and data_item_path[1] == ":"
+        ):
+            # Handle both Unix absolute paths (/path) and Windows absolute paths (C:\path)
+            if settings.accept_local_file_path:
+                loader = get_loader_engine()
+                return await loader.load_file(data_item_path, preferred_loaders), loader.get_loader(
+                    data_item_path, preferred_loaders
+                )
+            else:
+                raise IngestionError(message="Local files are not accepted.")
+
+    # data is not a supported type
+    raise IngestionError(message=f"Data type not supported: {type(data_item_path)}")
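
data_item_to_text_file is the entry point the reworked ingestion uses to normalize any supported input (an s3:// object, a file:// URL, or an absolute path) into a text file plus the loader that produced it. A usage sketch under those assumptions; the /tmp path is hypothetical, and loader_name is read the same way ingest_data.py reads it below:

from cognee.tasks.ingestion.data_item_to_text_file import data_item_to_text_file


async def normalize(path: str = "/tmp/report.pdf"):
    # Returns (path of the extracted text file, loader that handled the input).
    storage_path, loader = await data_item_to_text_file(path, preferred_loaders=None)
    return storage_path, loader.loader_name
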
cognee/tasks/ingestion/ingest_data.py CHANGED

@@ -1,16 +1,16 @@
 import json
 import inspect
-from os import path
 from uuid import UUID
 from typing import Union, BinaryIO, Any, List, Optional
 
 import cognee.modules.ingestion as ingestion
-from cognee.infrastructure.files.utils.open_data_file import open_data_file
 from cognee.infrastructure.databases.relational import get_relational_engine
 from cognee.modules.data.models import Data
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.users.permissions.methods import get_specific_user_permission_datasets
+from cognee.infrastructure.files.utils.open_data_file import open_data_file
+from cognee.infrastructure.files.utils.get_data_file_path import get_data_file_path
 from cognee.modules.data.methods import (
     get_authorized_existing_datasets,
     get_dataset_data,
@@ -18,6 +18,7 @@ from cognee.modules.data.methods import (
 )
 
 from .save_data_item_to_storage import save_data_item_to_storage
+from .data_item_to_text_file import data_item_to_text_file
 
 
 async def ingest_data(
@@ -26,6 +27,7 @@ async def ingest_data(
     user: User,
     node_set: Optional[List[str]] = None,
     dataset_id: UUID = None,
+    preferred_loaders: List[str] = None,
 ):
     if not user:
         user = await get_default_user()
@@ -42,6 +44,7 @@ async def ingest_data(
         user: User,
         node_set: Optional[List[str]] = None,
         dataset_id: UUID = None,
+        preferred_loaders: List[str] = None,
     ):
         new_datapoints = []
         existing_data_points = []
@@ -74,71 +77,96 @@ async def ingest_data(
         dataset_data_map = {str(data.id): True for data in dataset_data}
 
        for data_item in data:
-
+            # Get file path of data item or create a file it doesn't exist
+            original_file_path = await save_data_item_to_storage(data_item)
+
+            # Transform file path to be OS usable
+            actual_file_path = get_data_file_path(original_file_path)
 
-            #
-
+            # Store all input data as text files in Cognee data storage
+            cognee_storage_file_path, loader_engine = await data_item_to_text_file(
+                actual_file_path, preferred_loaders
+            )
+
+            # Find metadata from original file
+            async with open_data_file(original_file_path) as file:
                 classified_data = ingestion.classify(file)
 
-            # data_id is the hash of file contents + owner id to avoid duplicate data
+            # data_id is the hash of original file contents + owner id to avoid duplicate data
             data_id = ingestion.identify(classified_data, user)
+            original_file_metadata = classified_data.get_metadata()
 
-            [35 removed lines (old 86-120) are not rendered in the source diff view]
+            # Find metadata from Cognee data storage text file
+            async with open_data_file(cognee_storage_file_path) as file:
+                classified_data = ingestion.classify(file)
+                storage_file_metadata = classified_data.get_metadata()
+
+            from sqlalchemy import select
+
+            db_engine = get_relational_engine()
+
+            # Check to see if data should be updated
+            async with db_engine.get_async_session() as session:
+                data_point = (
+                    await session.execute(select(Data).filter(Data.id == data_id))
+                ).scalar_one_or_none()
+
+                # TODO: Maybe allow getting of external metadata through ingestion loader?
+                ext_metadata = get_external_metadata_dict(data_item)
+
+                if node_set:
+                    ext_metadata["node_set"] = node_set
+
+                if data_point is not None:
+                    data_point.name = original_file_metadata["name"]
+                    data_point.raw_data_location = cognee_storage_file_path
+                    data_point.original_data_location = original_file_metadata["file_path"]
+                    data_point.extension = storage_file_metadata["extension"]
+                    data_point.mime_type = storage_file_metadata["mime_type"]
+                    data_point.original_extension = original_file_metadata["extension"]
+                    data_point.original_mime_type = original_file_metadata["mime_type"]
+                    data_point.loader_engine = loader_engine.loader_name
+                    data_point.owner_id = user.id
+                    data_point.content_hash = original_file_metadata["content_hash"]
+                    data_point.raw_content_hash = storage_file_metadata["content_hash"]
+                    data_point.file_size = original_file_metadata["file_size"]
+                    data_point.external_metadata = ext_metadata
+                    data_point.node_set = json.dumps(node_set) if node_set else None
+                    data_point.tenant_id = user.tenant_id if user.tenant_id else None
+
+                    # Check if data is already in dataset
+                    if str(data_point.id) in dataset_data_map:
+                        existing_data_points.append(data_point)
                     else:
-
-                        continue
-
-                    data_point = Data(
-                        id=data_id,
-                        name=file_metadata["name"],
-                        raw_data_location=file_metadata["file_path"],
-                        extension=file_metadata["extension"],
-                        mime_type=file_metadata["mime_type"],
-                        owner_id=user.id,
-                        content_hash=file_metadata["content_hash"],
-                        external_metadata=ext_metadata,
-                        node_set=json.dumps(node_set) if node_set else None,
-                        data_size=file_metadata["file_size"],
-                        tenant_id=user.tenant_id if user.tenant_id else None,
-                        token_count=-1,
-                    )
-
-                    new_datapoints.append(data_point)
+                        dataset_new_data_points.append(data_point)
                     dataset_data_map[str(data_point.id)] = True
+                else:
+                    if str(data_id) in dataset_data_map:
+                        continue
+
+                    data_point = Data(
+                        id=data_id,
+                        name=original_file_metadata["name"],
+                        raw_data_location=cognee_storage_file_path,
+                        original_data_location=original_file_metadata["file_path"],
+                        extension=storage_file_metadata["extension"],
+                        mime_type=storage_file_metadata["mime_type"],
+                        original_extension=original_file_metadata["extension"],
+                        original_mime_type=original_file_metadata["mime_type"],
+                        loader_engine=loader_engine.loader_name,
+                        owner_id=user.id,
+                        content_hash=original_file_metadata["content_hash"],
+                        raw_content_hash=storage_file_metadata["content_hash"],
+                        external_metadata=ext_metadata,
+                        node_set=json.dumps(node_set) if node_set else None,
+                        data_size=original_file_metadata["file_size"],
+                        tenant_id=user.tenant_id if user.tenant_id else None,
+                        pipeline_status={},
+                        token_count=-1,
+                    )
+
+                    new_datapoints.append(data_point)
+                    dataset_data_map[str(data_point.id)] = True
 
         async with db_engine.get_async_session() as session:
             if dataset not in session:
@@ -160,4 +188,6 @@ async def ingest_data(
 
        return existing_data_points + dataset_new_data_points + new_datapoints
 
-    return await store_data_to_dataset(
+    return await store_data_to_dataset(
+        data, dataset_name, user, node_set, dataset_id, preferred_loaders
+    )
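
The new preferred_loaders parameter threads from the public entry point through store_data_to_dataset down to data_item_to_text_file, letting callers pin which loader handles each input. A sketch of a call that exercises it; the keyword names follow the store_data_to_dataset pass-through above, and "pypdf_loader" is an assumed loader name based on cognee/infrastructure/loaders/external/pypdf_loader.py in the file list:

from cognee.tasks.ingestion.ingest_data import ingest_data


async def ingest_reports(user, dataset_id):
    return await ingest_data(
        data=["/tmp/report.pdf"],  # hypothetical input path
        dataset_name="reports",
        user=user,
        node_set=None,
        dataset_id=dataset_id,
        preferred_loaders=["pypdf_loader"],  # assumed loader name
    )
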
cognee/tasks/ingestion/resolve_data_directories.py CHANGED

@@ -40,6 +40,9 @@ async def resolve_data_directories(
                 if include_subdirectories:
                     base_path = item if item.endswith("/") else item + "/"
                     s3_keys = fs.glob(base_path + "**")
+                    # If path is not directory attempt to add item directly
+                    if not s3_keys:
+                        s3_keys = fs.ls(item)
                 else:
                     s3_keys = fs.ls(item)
                 # Filter out keys that represent directories using fs.isdir
cognee/tasks/repo_processor/get_repo_file_dependencies.py CHANGED

@@ -103,6 +103,9 @@ async def get_repo_file_dependencies(
            extraction of dependencies (default is False). (default False)
     """
 
+    if isinstance(repo_path, list) and len(repo_path) == 1:
+        repo_path = repo_path[0]
+
     if not os.path.exists(repo_path):
         raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
 
cognee/tasks/storage/index_data_points.py CHANGED

@@ -38,7 +38,7 @@ async def index_data_points(data_points: list[DataPoint]):
         index_name = index_name_and_field[:first_occurence]
         field_name = index_name_and_field[first_occurence + 1 :]
         try:
-            # In case the
+            # In case the amount of indexable points is too large we need to send them in batches
             batch_size = 100
             for i in range(0, len(indexable_points), batch_size):
                 batch = indexable_points[i : i + batch_size]
cognee/tasks/storage/index_graph_edges.py CHANGED

@@ -1,3 +1,4 @@
+from cognee.modules.engine.utils.generate_edge_id import generate_edge_id
 from cognee.shared.logging_utils import get_logger, ERROR
 from collections import Counter
 
@@ -49,7 +50,9 @@ async def index_graph_edges(batch_size: int = 1024):
     )
 
     for text, count in edge_types.items():
-        edge = EdgeType(
+        edge = EdgeType(
+            id=generate_edge_id(edge_id=text), relationship_name=text, number_of_edges=count
+        )
         data_point_type = type(edge)
 
         for field_name in edge.metadata["index_fields"]:
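
Giving each EdgeType an explicit id via generate_edge_id (added in cognee/modules/engine/utils/generate_edge_id.py) makes edge-type nodes deterministic: the same relationship name maps to the same id, so re-indexing should not mint duplicates. A minimal sketch, assuming the function returns a stable id for a given name (the keyword call shape is taken from the hunk above; the counts are hypothetical):

from cognee.modules.engine.utils.generate_edge_id import generate_edge_id

# Hypothetical relationship-name counts, shaped like the edge_types mapping above.
edge_types = {"is_part_of": 12, "mentions": 7}

for text, count in edge_types.items():
    edge_id = generate_edge_id(edge_id=text)
    # Same name, same id on every run (assumed deterministic behavior).
    assert edge_id == generate_edge_id(edge_id=text)
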
cognee/tasks/summarization/summarize_code.py CHANGED

@@ -3,8 +3,7 @@ from typing import AsyncGenerator, Union
 from uuid import uuid5
 
 from cognee.infrastructure.engine import DataPoint
-from cognee.
-
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
 from .models import CodeSummary
 
 
@@ -17,7 +16,7 @@ async def summarize_code(
     code_data_points = [file for file in code_graph_nodes if hasattr(file, "source_code")]
 
     file_summaries = await asyncio.gather(
-        *[extract_code_summary(file.source_code) for file in code_data_points]
+        *[LLMGateway.extract_code_summary(file.source_code) for file in code_data_points]
     )
 
     file_summaries_map = {
cognee/tasks/summarization/summarize_text.py CHANGED

@@ -2,8 +2,9 @@ import asyncio
 from typing import Type
 from uuid import uuid5
 from pydantic import BaseModel
-
+
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
+from cognee.infrastructure.llm.LLMGateway import LLMGateway
 from cognee.modules.cognify.config import get_cognify_config
 from .models import TextSummary
 
@@ -42,7 +43,7 @@ async def summarize_text(
     summarization_model = cognee_config.summarization_model
 
     chunk_summaries = await asyncio.gather(
-        *[extract_summary(chunk.text, summarization_model) for chunk in data_chunks]
+        *[LLMGateway.extract_summary(chunk.text, summarization_model) for chunk in data_chunks]
     )
 
     summaries = [
cognee/tests/test_cognee_server_start.py CHANGED

@@ -6,6 +6,7 @@ import signal
 import requests
 from pathlib import Path
 import sys
+import uuid
 
 
 class TestCogneeServerStart(unittest.TestCase):
@@ -47,7 +48,7 @@ class TestCogneeServerStart(unittest.TestCase):
         """Test that the server is running and can accept connections."""
         # Test health endpoint
         health_response = requests.get("http://localhost:8000/health", timeout=15)
-        self.
+        self.assertIn(health_response.status_code, [200, 503])
 
         # Test root endpoint
         root_response = requests.get("http://localhost:8000/", timeout=15)
@@ -74,7 +75,8 @@ class TestCogneeServerStart(unittest.TestCase):
         file_path = Path(os.path.join(Path(__file__).parent, "test_data/example.png"))
         headers = {"Authorization": auth_var}
 
-
+        dataset_name = f"test_{uuid.uuid4().hex[:8]}"
+        form_data = {"datasetName": dataset_name}
 
         file = {
             "data": (
@@ -83,8 +85,11 @@ class TestCogneeServerStart(unittest.TestCase):
             )
         }
 
+        payload = {"datasets": [dataset_name]}
+
         add_response = requests.post(url, headers=headers, data=form_data, files=file, timeout=50)
-        add_response.
+        if add_response.status_code not in [200, 201, 409]:
+            add_response.raise_for_status()
 
         # Cognify request
         url = "http://127.0.0.1:8000/api/v1/cognify"
@@ -93,10 +98,9 @@ class TestCogneeServerStart(unittest.TestCase):
             "Content-Type": "application/json",
         }
 
-        payload = {"datasets": ["test"]}
-
         cognify_response = requests.post(url, headers=headers, json=payload, timeout=150)
-        cognify_response.
+        if cognify_response.status_code not in [200, 201, 409]:
+            cognify_response.raise_for_status()
 
         # TODO: Add test to verify cognify pipeline is complete before testing search
 
@@ -111,7 +115,8 @@ class TestCogneeServerStart(unittest.TestCase):
         payload = {"searchType": "GRAPH_COMPLETION", "query": "What's in the document?"}
 
         search_response = requests.post(url, headers=headers, json=payload, timeout=50)
-        search_response.
+        if search_response.status_code not in [200, 201, 409]:
+            search_response.raise_for_status()
 
 
 if __name__ == "__main__":
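
The test now treats 409 alongside 200/201 as acceptable and only raises on other statuses, so a rerun against a server that already holds the dataset no longer fails. The pattern, extracted for clarity (endpoint and timeout as in the test above; the randomized dataset name is hypothetical):

import requests

response = requests.post(
    "http://127.0.0.1:8000/api/v1/cognify",
    json={"datasets": ["test_ab12cd34"]},  # hypothetical randomized dataset name
    timeout=150,
)
if response.status_code not in [200, 201, 409]:
    response.raise_for_status()  # fail only on unexpected errors
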
cognee/tests/test_deduplication.py CHANGED

@@ -26,8 +26,8 @@ async def test_deduplication():
     explanation_file_path2 = os.path.join(
         pathlib.Path(__file__).parent, "test_data/Natural_language_processing_copy.txt"
     )
-    await cognee.add([explanation_file_path], dataset_name)
-    await cognee.add([explanation_file_path2], dataset_name2)
+    await cognee.add([explanation_file_path], dataset_name, incremental_loading=False)
+    await cognee.add([explanation_file_path2], dataset_name2, incremental_loading=False)
 
     result = await relational_engine.get_all_data_from_table("data")
     assert len(result) == 1, "More than one data entity was found."
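
Both add() calls now pass incremental_loading=False explicitly; the flag's default and exact semantics are not shown in this diff, so reading it as "process the files in full rather than skipping previously ingested content" is an assumption from its name. A minimal standalone sketch of the call:

import asyncio

import cognee


async def main():
    # Full (non-incremental) load of one file, as in the test above.
    await cognee.add(["notes.txt"], "my_dataset", incremental_loading=False)


asyncio.run(main())
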
cognee/tests/test_deletion.py CHANGED

@@ -12,7 +12,21 @@ async def main():
     await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
-
+    pdf_document = os.path.join(
+        pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf"
+    )
+
+    txt_document = os.path.join(
+        pathlib.Path(__file__).parent, "test_data/Natural_language_processing_copy.txt"
+    )
+
+    audio_document = os.path.join(pathlib.Path(__file__).parent, "test_data/text_to_speech.mp3")
+
+    image_document = os.path.join(pathlib.Path(__file__).parent, "test_data/example.png")
+
+    unstructured_document = os.path.join(pathlib.Path(__file__).parent, "test_data/example.pptx")
+
+    text_document_as_literal = """
     1. Audi
     Audi is known for its modern designs and advanced technology. Founded in the early 1900s, the brand has earned a reputation for precision engineering and innovation. With features like the Quattro all-wheel-drive system, Audi offers a range of vehicles from stylish sedans to high-performance sports cars.
 
@@ -31,27 +45,54 @@ async def main():
     Each of these car manufacturer contributes to Germany's reputation as a leader in the global automotive industry, showcasing a blend of innovation, performance, and design excellence.
     """
 
-
-    1. Apple
-    Apple is renowned for its innovative consumer electronics and software. Its product lineup includes the iPhone, iPad, Mac computers, and wearables like the Apple Watch. Known for its emphasis on sleek design and user-friendly interfaces, Apple has built a loyal customer base and created a seamless ecosystem that integrates hardware, software, and services.
+    ################### HARD DELETE
 
-
-
+    # Add documents and get dataset information
+    add_result = await cognee.add(
+        [
+            pdf_document,
+            txt_document,
+            text_document_as_literal,
+            unstructured_document,
+            audio_document,
+            image_document,
+        ]
+    )
+    dataset_id = add_result.dataset_id
 
-
-    Microsoft Corporation has been a dominant force in software for decades. Its Windows operating system and Microsoft Office suite are staples in both business and personal computing. In recent years, Microsoft has expanded into cloud computing with Azure, gaming with the Xbox platform, and even hardware through products like the Surface line. This evolution has helped the company maintain its relevance in a rapidly changing tech world.
+    await cognee.cognify()
 
-
-    What began as an online bookstore has grown into one of the largest e-commerce platforms globally. Amazon is known for its vast online marketplace, but its influence extends far beyond retail. With Amazon Web Services (AWS), the company has become a leader in cloud computing, offering robust solutions that power websites, applications, and businesses around the world. Amazon's constant drive for innovation continues to reshape both retail and technology sectors.
+    from cognee.infrastructure.databases.graph import get_graph_engine
 
-
-
+    graph_engine = await get_graph_engine()
+    nodes, edges = await graph_engine.get_graph_data()
+    assert len(nodes) > 10 and len(edges) > 10, "Graph database is not loaded."
 
-
-
+    # Get the data IDs from the dataset
+    dataset_data = await get_dataset_data(dataset_id)
+    assert len(dataset_data) > 0, "Dataset should contain data"
+
+    # Delete each document using its ID
+    for data_item in dataset_data:
+        await cognee.delete(data_item.id, dataset_id, mode="hard")
+
+    nodes, edges = await graph_engine.get_graph_data()
+
+    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted with hard delete."
+
+    ################### SOFT DELETE
 
     # Add documents and get dataset information
-    add_result = await cognee.add(
+    add_result = await cognee.add(
+        [
+            pdf_document,
+            txt_document,
+            text_document_as_literal,
+            unstructured_document,
+            audio_document,
+            image_document,
+        ]
+    )
     dataset_id = add_result.dataset_id
 
     await cognee.cognify()
@@ -68,11 +109,11 @@ async def main():
 
     # Delete each document using its ID
     for data_item in dataset_data:
-        await cognee.delete(data_item.id, dataset_id, mode="
+        await cognee.delete(data_item.id, dataset_id, mode="soft")
 
     nodes, edges = await graph_engine.get_graph_data()
 
-    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted."
+    assert len(nodes) == 0 and len(edges) == 0, "Document is not deleted with soft delete."
 
 
 if __name__ == "__main__":