cognee 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +2 -0
- cognee/alembic/README +1 -0
- cognee/alembic/env.py +107 -0
- cognee/alembic/script.py.mako +26 -0
- cognee/alembic/versions/1a58b986e6e1_enable_delete_for_old_tutorial_notebooks.py +52 -0
- cognee/alembic/versions/1d0bb7fede17_add_pipeline_run_status.py +33 -0
- cognee/alembic/versions/1daae0df1866_incremental_loading.py +48 -0
- cognee/alembic/versions/211ab850ef3d_add_sync_operations_table.py +118 -0
- cognee/alembic/versions/45957f0a9849_add_notebook_table.py +46 -0
- cognee/alembic/versions/46a6ce2bd2b2_expand_dataset_database_with_json_.py +333 -0
- cognee/alembic/versions/482cd6517ce4_add_default_user.py +30 -0
- cognee/alembic/versions/76625596c5c3_expand_dataset_database_for_multi_user.py +98 -0
- cognee/alembic/versions/8057ae7329c2_initial_migration.py +25 -0
- cognee/alembic/versions/9e7a3cb85175_loader_separation.py +104 -0
- cognee/alembic/versions/a1b2c3d4e5f6_add_label_column_to_data.py +38 -0
- cognee/alembic/versions/ab7e313804ae_permission_system_rework.py +236 -0
- cognee/alembic/versions/b9274c27a25a_kuzu_11_migration.py +75 -0
- cognee/alembic/versions/c946955da633_multi_tenant_support.py +137 -0
- cognee/alembic/versions/e1ec1dcb50b6_add_last_accessed_to_data.py +51 -0
- cognee/alembic/versions/e4ebee1091e7_expand_data_model_info.py +140 -0
- cognee/alembic.ini +117 -0
- cognee/api/v1/add/add.py +2 -1
- cognee/api/v1/add/routers/get_add_router.py +2 -0
- cognee/api/v1/cognify/cognify.py +11 -6
- cognee/api/v1/cognify/routers/get_cognify_router.py +8 -0
- cognee/api/v1/config/config.py +60 -0
- cognee/api/v1/datasets/routers/get_datasets_router.py +46 -3
- cognee/api/v1/memify/routers/get_memify_router.py +3 -0
- cognee/api/v1/search/routers/get_search_router.py +21 -6
- cognee/api/v1/search/search.py +21 -5
- cognee/api/v1/sync/routers/get_sync_router.py +3 -3
- cognee/cli/commands/add_command.py +1 -1
- cognee/cli/commands/cognify_command.py +6 -0
- cognee/cli/commands/config_command.py +1 -1
- cognee/context_global_variables.py +5 -1
- cognee/eval_framework/answer_generation/answer_generation_executor.py +7 -8
- cognee/infrastructure/databases/cache/cache_db_interface.py +38 -1
- cognee/infrastructure/databases/cache/config.py +6 -0
- cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +21 -0
- cognee/infrastructure/databases/cache/get_cache_engine.py +9 -3
- cognee/infrastructure/databases/cache/redis/RedisAdapter.py +60 -1
- cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +7 -0
- cognee/infrastructure/databases/graph/get_graph_engine.py +29 -1
- cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +62 -27
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +17 -4
- cognee/infrastructure/databases/relational/config.py +16 -1
- cognee/infrastructure/databases/relational/create_relational_engine.py +13 -3
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +26 -3
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +2 -0
- cognee/infrastructure/databases/vector/config.py +6 -0
- cognee/infrastructure/databases/vector/create_vector_engine.py +70 -16
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +64 -9
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +13 -2
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +16 -3
- cognee/infrastructure/databases/vector/models/ScoredResult.py +3 -3
- cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +16 -3
- cognee/infrastructure/databases/vector/pgvector/PGVectorDatasetDatabaseHandler.py +86 -0
- cognee/infrastructure/databases/vector/pgvector/create_db_and_tables.py +81 -2
- cognee/infrastructure/databases/vector/vector_db_interface.py +8 -0
- cognee/infrastructure/files/utils/get_data_file_path.py +33 -27
- cognee/infrastructure/llm/LLMGateway.py +0 -13
- cognee/infrastructure/llm/prompts/extract_query_time.txt +1 -1
- cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt +1 -1
- cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt +1 -1
- cognee/infrastructure/llm/prompts/generate_graph_prompt.txt +2 -2
- cognee/infrastructure/llm/prompts/generate_graph_prompt_guided.txt +1 -1
- cognee/infrastructure/llm/prompts/generate_graph_prompt_oneshot.txt +2 -2
- cognee/infrastructure/llm/prompts/generate_graph_prompt_simple.txt +1 -1
- cognee/infrastructure/llm/prompts/generate_graph_prompt_strict.txt +1 -1
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +6 -6
- cognee/infrastructure/llm/prompts/test.txt +1 -1
- cognee/infrastructure/llm/prompts/translate_content.txt +19 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +17 -12
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +31 -25
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +132 -7
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +29 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llama_cpp/adapter.py +191 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llm_interface.py +2 -6
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +58 -13
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +0 -1
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +25 -131
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/types.py +10 -0
- cognee/modules/chunking/models/DocumentChunk.py +0 -1
- cognee/modules/cognify/config.py +2 -0
- cognee/modules/data/models/Data.py +3 -1
- cognee/modules/engine/models/Entity.py +0 -1
- cognee/modules/engine/operations/setup.py +6 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +150 -37
- cognee/modules/graph/cognee_graph/CogneeGraphElements.py +48 -2
- cognee/modules/graph/utils/__init__.py +1 -0
- cognee/modules/graph/utils/get_entity_nodes_from_triplets.py +12 -0
- cognee/modules/notebooks/methods/__init__.py +1 -0
- cognee/modules/notebooks/methods/create_notebook.py +0 -34
- cognee/modules/notebooks/methods/create_tutorial_notebooks.py +191 -0
- cognee/modules/notebooks/methods/get_notebooks.py +12 -8
- cognee/modules/notebooks/tutorials/cognee-basics/cell-1.md +3 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-2.md +10 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-3.md +7 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-4.py +28 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-5.py +3 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-6.py +9 -0
- cognee/modules/notebooks/tutorials/cognee-basics/cell-7.py +17 -0
- cognee/modules/notebooks/tutorials/cognee-basics/config.json +4 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-1.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-10.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-11.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-12.py +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-13.md +7 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-14.py +6 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-15.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-16.py +7 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-2.md +9 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-3.md +7 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-4.md +9 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-5.md +5 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-6.py +13 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-7.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-8.md +3 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/cell-9.py +31 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/config.json +4 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/data/copilot_conversations.json +107 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/data/guido_contributions.json +976 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/data/my_developer_rules.md +79 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/data/pep_style_guide.md +74 -0
- cognee/modules/notebooks/tutorials/python-development-with-cognee/data/zen_principles.md +74 -0
- cognee/modules/retrieval/EntityCompletionRetriever.py +51 -38
- cognee/modules/retrieval/__init__.py +0 -1
- cognee/modules/retrieval/base_retriever.py +66 -10
- cognee/modules/retrieval/chunks_retriever.py +57 -49
- cognee/modules/retrieval/coding_rules_retriever.py +12 -5
- cognee/modules/retrieval/completion_retriever.py +29 -28
- cognee/modules/retrieval/cypher_search_retriever.py +25 -20
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +42 -46
- cognee/modules/retrieval/graph_completion_cot_retriever.py +68 -51
- cognee/modules/retrieval/graph_completion_retriever.py +78 -63
- cognee/modules/retrieval/graph_summary_completion_retriever.py +2 -0
- cognee/modules/retrieval/lexical_retriever.py +34 -12
- cognee/modules/retrieval/natural_language_retriever.py +18 -15
- cognee/modules/retrieval/summaries_retriever.py +51 -34
- cognee/modules/retrieval/temporal_retriever.py +59 -49
- cognee/modules/retrieval/triplet_retriever.py +32 -33
- cognee/modules/retrieval/utils/access_tracking.py +88 -0
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +99 -103
- cognee/modules/retrieval/utils/node_edge_vector_search.py +174 -0
- cognee/modules/search/methods/__init__.py +1 -0
- cognee/modules/search/methods/get_retriever_output.py +53 -0
- cognee/modules/search/methods/get_search_type_retriever_instance.py +252 -0
- cognee/modules/search/methods/search.py +90 -222
- cognee/modules/search/models/SearchResultPayload.py +67 -0
- cognee/modules/search/types/SearchResult.py +1 -8
- cognee/modules/search/types/SearchType.py +1 -2
- cognee/modules/search/types/__init__.py +1 -1
- cognee/modules/search/utils/__init__.py +1 -2
- cognee/modules/search/utils/transform_insights_to_graph.py +2 -2
- cognee/modules/search/utils/{transform_context_to_graph.py → transform_triplets_to_graph.py} +2 -2
- cognee/modules/users/authentication/default/default_transport.py +11 -1
- cognee/modules/users/authentication/get_api_auth_backend.py +2 -1
- cognee/modules/users/authentication/get_client_auth_backend.py +2 -1
- cognee/modules/users/methods/create_user.py +0 -9
- cognee/modules/users/permissions/methods/has_user_management_permission.py +29 -0
- cognee/modules/visualization/cognee_network_visualization.py +1 -1
- cognee/run_migrations.py +48 -0
- cognee/shared/exceptions/__init__.py +1 -3
- cognee/shared/exceptions/exceptions.py +11 -1
- cognee/shared/usage_logger.py +332 -0
- cognee/shared/utils.py +12 -5
- cognee/tasks/chunks/__init__.py +9 -0
- cognee/tasks/cleanup/cleanup_unused_data.py +172 -0
- cognee/tasks/graph/__init__.py +7 -0
- cognee/tasks/ingestion/data_item.py +8 -0
- cognee/tasks/ingestion/ingest_data.py +12 -1
- cognee/tasks/ingestion/save_data_item_to_storage.py +5 -0
- cognee/tasks/memify/__init__.py +8 -0
- cognee/tasks/memify/extract_usage_frequency.py +613 -0
- cognee/tasks/summarization/models.py +0 -2
- cognee/tasks/temporal_graph/__init__.py +0 -1
- cognee/tasks/translation/__init__.py +96 -0
- cognee/tasks/translation/config.py +110 -0
- cognee/tasks/translation/detect_language.py +190 -0
- cognee/tasks/translation/exceptions.py +62 -0
- cognee/tasks/translation/models.py +72 -0
- cognee/tasks/translation/providers/__init__.py +44 -0
- cognee/tasks/translation/providers/azure_provider.py +192 -0
- cognee/tasks/translation/providers/base.py +85 -0
- cognee/tasks/translation/providers/google_provider.py +158 -0
- cognee/tasks/translation/providers/llm_provider.py +143 -0
- cognee/tasks/translation/translate_content.py +282 -0
- cognee/tasks/web_scraper/default_url_crawler.py +6 -2
- cognee/tests/cli_tests/cli_unit_tests/test_cli_commands.py +1 -0
- cognee/tests/cli_tests/cli_unit_tests/test_cli_edge_cases.py +3 -0
- cognee/tests/integration/retrieval/test_brute_force_triplet_search_with_cognify.py +62 -0
- cognee/tests/integration/retrieval/test_chunks_retriever.py +351 -0
- cognee/tests/integration/retrieval/test_graph_completion_retriever.py +276 -0
- cognee/tests/integration/retrieval/test_graph_completion_retriever_context_extension.py +228 -0
- cognee/tests/integration/retrieval/test_graph_completion_retriever_cot.py +217 -0
- cognee/tests/integration/retrieval/test_rag_completion_retriever.py +319 -0
- cognee/tests/integration/retrieval/test_structured_output.py +258 -0
- cognee/tests/integration/retrieval/test_summaries_retriever.py +195 -0
- cognee/tests/integration/retrieval/test_temporal_retriever.py +336 -0
- cognee/tests/integration/retrieval/test_triplet_retriever.py +45 -1
- cognee/tests/integration/shared/test_usage_logger_integration.py +255 -0
- cognee/tests/tasks/translation/README.md +147 -0
- cognee/tests/tasks/translation/__init__.py +1 -0
- cognee/tests/tasks/translation/config_test.py +93 -0
- cognee/tests/tasks/translation/detect_language_test.py +118 -0
- cognee/tests/tasks/translation/providers_test.py +151 -0
- cognee/tests/tasks/translation/translate_content_test.py +213 -0
- cognee/tests/test_chromadb.py +1 -1
- cognee/tests/test_cleanup_unused_data.py +165 -0
- cognee/tests/test_custom_data_label.py +68 -0
- cognee/tests/test_delete_by_id.py +6 -6
- cognee/tests/test_extract_usage_frequency.py +308 -0
- cognee/tests/test_kuzu.py +17 -7
- cognee/tests/test_lancedb.py +3 -1
- cognee/tests/test_library.py +1 -1
- cognee/tests/test_neo4j.py +17 -7
- cognee/tests/test_neptune_analytics_vector.py +3 -1
- cognee/tests/test_permissions.py +172 -187
- cognee/tests/test_pgvector.py +3 -1
- cognee/tests/test_relational_db_migration.py +15 -1
- cognee/tests/test_remote_kuzu.py +3 -1
- cognee/tests/test_s3_file_storage.py +1 -1
- cognee/tests/test_search_db.py +345 -205
- cognee/tests/test_usage_logger_e2e.py +268 -0
- cognee/tests/unit/api/test_get_raw_data_endpoint.py +206 -0
- cognee/tests/unit/eval_framework/answer_generation_test.py +4 -3
- cognee/tests/unit/eval_framework/benchmark_adapters_test.py +25 -0
- cognee/tests/unit/eval_framework/corpus_builder_test.py +33 -4
- cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +2 -0
- cognee/tests/unit/infrastructure/databases/relational/test_RelationalConfig.py +69 -0
- cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +42 -2
- cognee/tests/unit/modules/graph/cognee_graph_test.py +329 -31
- cognee/tests/unit/modules/retrieval/chunks_retriever_test.py +122 -168
- cognee/tests/unit/modules/retrieval/conversation_history_test.py +338 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +486 -157
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +693 -155
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +619 -200
- cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +300 -171
- cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +184 -155
- cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +544 -79
- cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +476 -28
- cognee/tests/unit/modules/retrieval/test_completion.py +343 -0
- cognee/tests/unit/modules/retrieval/test_graph_summary_completion_retriever.py +157 -0
- cognee/tests/unit/modules/retrieval/test_node_edge_vector_search.py +273 -0
- cognee/tests/unit/modules/retrieval/test_user_qa_feedback.py +312 -0
- cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +267 -7
- cognee/tests/unit/modules/search/test_get_search_type_retriever_instance.py +125 -0
- cognee/tests/unit/modules/search/test_search.py +96 -20
- cognee/tests/unit/modules/search/test_search_prepare_search_result_contract.py +190 -0
- cognee/tests/unit/modules/users/test_tutorial_notebook_creation.py +511 -297
- cognee/tests/unit/shared/test_usage_logger.py +241 -0
- cognee/tests/unit/users/permissions/test_has_user_management_permission.py +46 -0
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/METADATA +22 -17
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/RECORD +258 -157
- cognee/api/.env.example +0 -5
- cognee/modules/retrieval/base_graph_retriever.py +0 -24
- cognee/modules/search/methods/get_search_type_tools.py +0 -223
- cognee/modules/search/methods/no_access_control_search.py +0 -62
- cognee/modules/search/utils/prepare_search_result.py +0 -63
- cognee/tests/test_feedback_enrichment.py +0 -174
- cognee/tests/unit/modules/retrieval/structured_output_test.py +0 -204
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/WHEEL +0 -0
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/entry_points.txt +0 -0
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.5.1.dist-info → cognee-0.5.2.dist-info}/licenses/NOTICE.md +0 -0
cognee/shared/utils.py
CHANGED
|
@@ -8,7 +8,8 @@ import http.server
|
|
|
8
8
|
import socketserver
|
|
9
9
|
from threading import Thread
|
|
10
10
|
import pathlib
|
|
11
|
-
from
|
|
11
|
+
from typing import Union, Any, Dict, List
|
|
12
|
+
from uuid import uuid4, uuid5, NAMESPACE_OID, UUID
|
|
12
13
|
|
|
13
14
|
from cognee.base_config import get_base_config
|
|
14
15
|
from cognee.shared.logging_utils import get_logger
|
|
@@ -58,7 +59,7 @@ def get_anonymous_id():
|
|
|
58
59
|
return anonymous_id
|
|
59
60
|
|
|
60
61
|
|
|
61
|
-
def _sanitize_nested_properties(obj, property_names: list[str]):
|
|
62
|
+
def _sanitize_nested_properties(obj: Any, property_names: list[str]) -> Any:
|
|
62
63
|
"""
|
|
63
64
|
Recursively replaces any property whose key matches one of `property_names`
|
|
64
65
|
(e.g., ['url', 'path']) in a nested dict or list with a uuid5 hash
|
|
@@ -78,7 +79,9 @@ def _sanitize_nested_properties(obj, property_names: list[str]):
|
|
|
78
79
|
return obj
|
|
79
80
|
|
|
80
81
|
|
|
81
|
-
def send_telemetry(event_name: str, user_id, additional_properties: dict = {}):
|
|
82
|
+
def send_telemetry(event_name: str, user_id: Union[str, UUID], additional_properties: dict = {}):
|
|
83
|
+
if additional_properties is None:
|
|
84
|
+
additional_properties = {}
|
|
82
85
|
if os.getenv("TELEMETRY_DISABLED"):
|
|
83
86
|
return
|
|
84
87
|
|
|
@@ -108,7 +111,7 @@ def send_telemetry(event_name: str, user_id, additional_properties: dict = {}):
|
|
|
108
111
|
print(f"Error sending telemetry through proxy: {response.status_code}")
|
|
109
112
|
|
|
110
113
|
|
|
111
|
-
def embed_logo(p, layout_scale, logo_alpha, position):
|
|
114
|
+
def embed_logo(p: Any, layout_scale: float, logo_alpha: float, position: str):
|
|
112
115
|
"""
|
|
113
116
|
Embed a logo into the graph visualization as a watermark.
|
|
114
117
|
"""
|
|
@@ -138,7 +141,11 @@ def embed_logo(p, layout_scale, logo_alpha, position):
|
|
|
138
141
|
|
|
139
142
|
|
|
140
143
|
def start_visualization_server(
|
|
141
|
-
host="0.0.0.0",
|
|
144
|
+
host: str = "0.0.0.0",
|
|
145
|
+
port: int = 8001,
|
|
146
|
+
handler_class: type[
|
|
147
|
+
http.server.SimpleHTTPRequestHandler
|
|
148
|
+
] = http.server.SimpleHTTPRequestHandler,
|
|
142
149
|
):
|
|
143
150
|
"""
|
|
144
151
|
Spin up a simple HTTP server in a background thread to serve files.
|
cognee/tasks/chunks/__init__.py
CHANGED
|
@@ -1,3 +1,12 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Text chunking and chunk management tasks.
|
|
3
|
+
|
|
4
|
+
This module provides functionality for splitting text into chunks using
|
|
5
|
+
different strategies (word, sentence, paragraph, or row-based) and for
|
|
6
|
+
cleaning up disconnected or obsolete chunks to support downstream
|
|
7
|
+
processing and knowledge graph workflows.
|
|
8
|
+
"""
|
|
9
|
+
|
|
1
10
|
from .chunk_by_word import chunk_by_word
|
|
2
11
|
from .chunk_by_sentence import chunk_by_sentence
|
|
3
12
|
from .chunk_by_paragraph import chunk_by_paragraph
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Task for automatically deleting unused data from the memify pipeline.
|
|
3
|
+
|
|
4
|
+
This task identifies and removes entire documents that haven't
|
|
5
|
+
been accessed by retrievers for a specified period, helping maintain system
|
|
6
|
+
efficiency and storage optimization through whole-document removal.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from datetime import datetime, timezone, timedelta
|
|
11
|
+
from typing import Optional, Dict, Any
|
|
12
|
+
from uuid import UUID
|
|
13
|
+
import os
|
|
14
|
+
from cognee.infrastructure.databases.graph import get_graph_engine
|
|
15
|
+
from cognee.infrastructure.databases.vector import get_vector_engine
|
|
16
|
+
from cognee.infrastructure.databases.relational import get_relational_engine
|
|
17
|
+
from cognee.modules.data.models import Data, DatasetData
|
|
18
|
+
from cognee.shared.logging_utils import get_logger
|
|
19
|
+
from sqlalchemy import select, or_
|
|
20
|
+
import cognee
|
|
21
|
+
import sqlalchemy as sa
|
|
22
|
+
from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
|
|
23
|
+
|
|
24
|
+
logger = get_logger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _skipped_cleanup_result(reason: str) -> Dict[str, Any]:
    """Build the result payload returned when the cleanup run is skipped for `reason`."""
    return {
        "status": "skipped",
        "reason": reason,
        "unused_count": 0,
        "deleted_count": {},
        "cleanup_date": datetime.now(timezone.utc).isoformat(),
    }


async def cleanup_unused_data(
    minutes_threshold: Optional[int], dry_run: bool = True, user_id: Optional[UUID] = None
) -> Dict[str, Any]:
    """
    Identify and remove unused data from the memify pipeline.

    The run is skipped (nothing is queried for deletion) unless the
    ENABLE_LAST_ACCESSED environment variable equals "true" (case-insensitive)
    and at least one Data row already has a non-NULL `last_accessed` value.
    Otherwise the cutoff timestamp is computed as "now (UTC) minus
    `minutes_threshold` minutes" and the work is delegated to `_cleanup_via_sql`.

    Parameters
    ----------
    minutes_threshold : int
        Minutes since last access to consider data unused. The annotation is
        Optional for signature compatibility, but None is rejected once the
        skip checks have passed.
    dry_run : bool
        If True, only report what would be deleted without actually deleting (default: True)
    user_id : UUID, optional
        Limit cleanup to specific user's data (default: None)

    Returns
    -------
    Dict[str, Any]
        Cleanup results with status, counts, and timestamp

    Raises
    ------
    TypeError
        If `minutes_threshold` is None when the cleanup would actually run.
    """
    # Check 1: Environment variable must be enabled
    if os.getenv("ENABLE_LAST_ACCESSED", "false").lower() != "true":
        logger.warning("Cleanup skipped: ENABLE_LAST_ACCESSED is not enabled.")
        return _skipped_cleanup_result("ENABLE_LAST_ACCESSED not enabled")

    # Check 2: Verify tracking has actually been running
    db_engine = get_relational_engine()
    async with db_engine.get_async_session() as session:
        # Count records with non-NULL last_accessed
        tracked_count = await session.execute(
            select(sa.func.count(Data.id)).where(Data.last_accessed.isnot(None))
        )
        tracked_records = tracked_count.scalar()

    if tracked_records == 0:
        logger.warning(
            "Cleanup skipped: No records have been tracked yet. "
            "ENABLE_LAST_ACCESSED may have been recently enabled. "
            "Wait for retrievers to update timestamps before running cleanup."
        )
        return _skipped_cleanup_result("No tracked records found - tracking may be newly enabled")

    # timedelta(minutes=None) fails with an opaque TypeError; raise the same
    # exception type with a message that names the offending argument.
    if minutes_threshold is None:
        raise TypeError("minutes_threshold must be a number of minutes, got None")

    logger.info(
        "Starting cleanup task",
        minutes_threshold=minutes_threshold,
        dry_run=dry_run,
        user_id=str(user_id) if user_id else None,
    )

    # Calculate cutoff timestamp
    cutoff_date = datetime.now(timezone.utc) - timedelta(minutes=minutes_threshold)

    # Document-level approach (recommended)
    return await _cleanup_via_sql(cutoff_date, dry_run, user_id)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
async def _cleanup_via_sql(
    cutoff_date: datetime, dry_run: bool, user_id: Optional[UUID] = None
) -> Dict[str, Any]:
    """
    Select stale documents from the Data table and remove them with cognee.delete().

    A document is selected when its `last_accessed` is older than `cutoff_date`
    or is NULL. Each selected (Data, DatasetData) pair is deleted in "hard" mode;
    a failure on one document is logged and does not stop the remaining deletions.

    Parameters
    ----------
    cutoff_date : datetime
        Documents last accessed before this moment are treated as unused
    dry_run : bool
        When True, nothing is deleted and only the number of matches is reported
    user_id : UUID, optional
        When given, only datasets owned by this user are considered

    Returns
    -------
    Dict[str, Any]
        Status, number of unused documents found, deletion counts and a timestamp
    """

    def _report(status: str, removed: int) -> Dict[str, Any]:
        # Common shape shared by the dry-run and the completed payloads.
        return {
            "status": status,
            "unused_count": len(unused_data),
            "deleted_count": {"data_items": removed, "documents": removed},
            "cleanup_date": datetime.now(timezone.utc).isoformat(),
        }

    engine = get_relational_engine()

    async with engine.get_async_session() as db_session:
        # Stale means: accessed before the cutoff, or never accessed at all.
        is_stale = or_(Data.last_accessed < cutoff_date, Data.last_accessed.is_(None))
        statement = select(Data, DatasetData).join(DatasetData, Data.id == DatasetData.data_id)
        statement = statement.where(is_stale)

        if user_id:
            from cognee.modules.data.models import Dataset

            # Restrict the selection to datasets owned by the requested user.
            statement = statement.join(Dataset, DatasetData.dataset_id == Dataset.id)
            statement = statement.where(Dataset.owner_id == user_id)

        unused_data = (await db_session.execute(statement)).all()

    logger.info(f"Found {len(unused_data)} unused documents in SQL")

    if dry_run:
        summary = _report("dry_run", 0)
        summary["preview"] = {"documents": len(unused_data)}
        return summary

    from cognee.modules.users.methods import get_default_user

    # NOTE(review): when user_id is supplied, `user` stays None and is passed
    # as-is to cognee.delete(); confirm that is the intended acting user.
    if user_id is None:
        user = await get_default_user()
    else:
        user = None

    deleted_count = 0
    for data, dataset_data in unused_data:
        try:
            await cognee.delete(
                data_id=data.id,
                dataset_id=dataset_data.dataset_id,
                mode="hard",  # Use hard mode to also remove orphaned entities
                user=user,
            )
            deleted_count += 1
            logger.info(f"Deleted document {data.id} from dataset {dataset_data.dataset_id}")
        except Exception as e:
            # Best effort: keep going so one bad document does not block the rest.
            logger.error(f"Failed to delete document {data.id}: {e}")

    logger.info("Cleanup completed", deleted_count=deleted_count)

    return _report("completed", deleted_count)
|
cognee/tasks/graph/__init__.py
CHANGED
|
@@ -1,2 +1,9 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Graph extraction and manipulation tasks.
|
|
3
|
+
|
|
4
|
+
This module provides tasks for extracting knowledge graphs from data,
|
|
5
|
+
building relationships between entities, and managing graph structures.
|
|
6
|
+
"""
|
|
7
|
+
|
|
1
8
|
from .extract_graph_from_data import extract_graph_from_data
|
|
2
9
|
from .extract_graph_from_code import extract_graph_from_code
|
|
@@ -20,6 +20,7 @@ from cognee.modules.data.methods import (
|
|
|
20
20
|
|
|
21
21
|
from .save_data_item_to_storage import save_data_item_to_storage
|
|
22
22
|
from .data_item_to_text_file import data_item_to_text_file
|
|
23
|
+
from .data_item import DataItem
|
|
23
24
|
|
|
24
25
|
|
|
25
26
|
async def ingest_data(
|
|
@@ -78,8 +79,16 @@ async def ingest_data(
|
|
|
78
79
|
dataset_data_map = {str(data.id): True for data in dataset_data}
|
|
79
80
|
|
|
80
81
|
for data_item in data:
|
|
82
|
+
# Support for DataItem (custom label + data wrapper)
|
|
83
|
+
current_label = None
|
|
84
|
+
underlying_data = data_item
|
|
85
|
+
|
|
86
|
+
if isinstance(data_item, DataItem):
|
|
87
|
+
underlying_data = data_item.data
|
|
88
|
+
current_label = data_item.label
|
|
89
|
+
|
|
81
90
|
# Get file path of data item or create a file if it doesn't exist
|
|
82
|
-
original_file_path = await save_data_item_to_storage(
|
|
91
|
+
original_file_path = await save_data_item_to_storage(underlying_data)
|
|
83
92
|
# Transform file path to be OS usable
|
|
84
93
|
actual_file_path = get_data_file_path(original_file_path)
|
|
85
94
|
|
|
@@ -139,6 +148,7 @@ async def ingest_data(
|
|
|
139
148
|
data_point.external_metadata = ext_metadata
|
|
140
149
|
data_point.node_set = json.dumps(node_set) if node_set else None
|
|
141
150
|
data_point.tenant_id = user.tenant_id if user.tenant_id else None
|
|
151
|
+
data_point.label = current_label
|
|
142
152
|
|
|
143
153
|
# Check if data is already in dataset
|
|
144
154
|
if str(data_point.id) in dataset_data_map:
|
|
@@ -169,6 +179,7 @@ async def ingest_data(
|
|
|
169
179
|
tenant_id=user.tenant_id if user.tenant_id else None,
|
|
170
180
|
pipeline_status={},
|
|
171
181
|
token_count=-1,
|
|
182
|
+
label=current_label,
|
|
172
183
|
)
|
|
173
184
|
|
|
174
185
|
new_datapoints.append(data_point)
|
|
@@ -9,6 +9,7 @@ from cognee.shared.logging_utils import get_logger
|
|
|
9
9
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
10
10
|
|
|
11
11
|
from cognee.tasks.web_scraper.utils import fetch_page_content
|
|
12
|
+
from cognee.tasks.ingestion.data_item import DataItem
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
logger = get_logger()
|
|
@@ -95,5 +96,9 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
|
|
|
95
96
|
# data is text, save it to data storage and return the file path
|
|
96
97
|
return await save_data_to_file(data_item)
|
|
97
98
|
|
|
99
|
+
if isinstance(data_item, DataItem):
|
|
100
|
+
# If instance is DataItem use the underlying data
|
|
101
|
+
return await save_data_item_to_storage(data_item.data)
|
|
102
|
+
|
|
98
103
|
# data is not a supported type
|
|
99
104
|
raise IngestionError(message=f"Data type not supported: {type(data_item)}")
|
cognee/tasks/memify/__init__.py
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Memory and subgraph extraction tasks.
|
|
3
|
+
|
|
4
|
+
This module provides tasks for extracting subgraphs, document chunks, and
|
|
5
|
+
user session data, as well as initiating session cognification workflows,
|
|
6
|
+
to support memory enrichment and downstream knowledge graph processing.
|
|
7
|
+
"""
|
|
8
|
+
|
|
1
9
|
from .extract_subgraph import extract_subgraph
|
|
2
10
|
from .extract_subgraph_chunks import extract_subgraph_chunks
|
|
3
11
|
from .cognify_session import cognify_session
|