cognee 0.2.3.dev1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__init__.py +2 -0
- cognee/__main__.py +4 -0
- cognee/api/client.py +28 -3
- cognee/api/health.py +10 -13
- cognee/api/v1/add/add.py +20 -6
- cognee/api/v1/add/routers/get_add_router.py +12 -37
- cognee/api/v1/cloud/routers/__init__.py +1 -0
- cognee/api/v1/cloud/routers/get_checks_router.py +23 -0
- cognee/api/v1/cognify/code_graph_pipeline.py +14 -3
- cognee/api/v1/cognify/cognify.py +67 -105
- cognee/api/v1/cognify/routers/get_cognify_router.py +11 -3
- cognee/api/v1/datasets/routers/get_datasets_router.py +16 -5
- cognee/api/v1/memify/routers/__init__.py +1 -0
- cognee/api/v1/memify/routers/get_memify_router.py +100 -0
- cognee/api/v1/notebooks/routers/__init__.py +1 -0
- cognee/api/v1/notebooks/routers/get_notebooks_router.py +96 -0
- cognee/api/v1/responses/default_tools.py +4 -0
- cognee/api/v1/responses/dispatch_function.py +6 -1
- cognee/api/v1/responses/models.py +1 -1
- cognee/api/v1/search/routers/get_search_router.py +20 -1
- cognee/api/v1/search/search.py +17 -4
- cognee/api/v1/sync/__init__.py +17 -0
- cognee/api/v1/sync/routers/__init__.py +3 -0
- cognee/api/v1/sync/routers/get_sync_router.py +241 -0
- cognee/api/v1/sync/sync.py +877 -0
- cognee/api/v1/ui/__init__.py +1 -0
- cognee/api/v1/ui/ui.py +529 -0
- cognee/api/v1/users/routers/get_auth_router.py +13 -1
- cognee/base_config.py +10 -1
- cognee/cli/__init__.py +10 -0
- cognee/cli/_cognee.py +273 -0
- cognee/cli/commands/__init__.py +1 -0
- cognee/cli/commands/add_command.py +80 -0
- cognee/cli/commands/cognify_command.py +128 -0
- cognee/cli/commands/config_command.py +225 -0
- cognee/cli/commands/delete_command.py +80 -0
- cognee/cli/commands/search_command.py +149 -0
- cognee/cli/config.py +33 -0
- cognee/cli/debug.py +21 -0
- cognee/cli/echo.py +45 -0
- cognee/cli/exceptions.py +23 -0
- cognee/cli/minimal_cli.py +97 -0
- cognee/cli/reference.py +26 -0
- cognee/cli/suppress_logging.py +12 -0
- cognee/eval_framework/corpus_builder/corpus_builder_executor.py +2 -2
- cognee/eval_framework/eval_config.py +1 -1
- cognee/infrastructure/databases/graph/config.py +10 -4
- cognee/infrastructure/databases/graph/get_graph_engine.py +4 -9
- cognee/infrastructure/databases/graph/kuzu/adapter.py +199 -2
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +138 -0
- cognee/infrastructure/databases/relational/__init__.py +2 -0
- cognee/infrastructure/databases/relational/get_async_session.py +15 -0
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +6 -1
- cognee/infrastructure/databases/relational/with_async_session.py +25 -0
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +1 -1
- cognee/infrastructure/databases/vector/config.py +13 -6
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +6 -4
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +16 -7
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +5 -5
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -2
- cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +2 -6
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +10 -7
- cognee/infrastructure/files/storage/LocalFileStorage.py +9 -0
- cognee/infrastructure/files/storage/S3FileStorage.py +5 -0
- cognee/infrastructure/files/storage/StorageManager.py +7 -1
- cognee/infrastructure/files/storage/storage.py +16 -0
- cognee/infrastructure/files/utils/get_data_file_path.py +14 -9
- cognee/infrastructure/files/utils/get_file_metadata.py +2 -1
- cognee/infrastructure/llm/LLMGateway.py +32 -5
- cognee/infrastructure/llm/config.py +6 -4
- cognee/infrastructure/llm/prompts/extract_query_time.txt +15 -0
- cognee/infrastructure/llm/prompts/generate_event_entity_prompt.txt +25 -0
- cognee/infrastructure/llm/prompts/generate_event_graph_prompt.txt +30 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +16 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +2 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_event_entities.py +44 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/__init__.py +1 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_content_graph.py +19 -15
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_event_graph.py +46 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +3 -3
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +3 -3
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +14 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +6 -4
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +28 -4
- cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +2 -2
- cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py +3 -3
- cognee/infrastructure/llm/tokenizer/Mistral/adapter.py +3 -3
- cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +6 -6
- cognee/infrastructure/llm/utils.py +7 -7
- cognee/infrastructure/utils/run_sync.py +8 -1
- cognee/modules/chunking/models/DocumentChunk.py +4 -3
- cognee/modules/cloud/exceptions/CloudApiKeyMissingError.py +15 -0
- cognee/modules/cloud/exceptions/CloudConnectionError.py +15 -0
- cognee/modules/cloud/exceptions/__init__.py +2 -0
- cognee/modules/cloud/operations/__init__.py +1 -0
- cognee/modules/cloud/operations/check_api_key.py +25 -0
- cognee/modules/data/deletion/prune_system.py +1 -1
- cognee/modules/data/methods/__init__.py +2 -0
- cognee/modules/data/methods/check_dataset_name.py +1 -1
- cognee/modules/data/methods/create_authorized_dataset.py +19 -0
- cognee/modules/data/methods/get_authorized_dataset.py +11 -5
- cognee/modules/data/methods/get_authorized_dataset_by_name.py +16 -0
- cognee/modules/data/methods/get_dataset_data.py +1 -1
- cognee/modules/data/methods/load_or_create_datasets.py +2 -20
- cognee/modules/engine/models/Event.py +16 -0
- cognee/modules/engine/models/Interval.py +8 -0
- cognee/modules/engine/models/Timestamp.py +13 -0
- cognee/modules/engine/models/__init__.py +3 -0
- cognee/modules/engine/utils/__init__.py +2 -0
- cognee/modules/engine/utils/generate_event_datapoint.py +46 -0
- cognee/modules/engine/utils/generate_timestamp_datapoint.py +51 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +2 -2
- cognee/modules/graph/methods/get_formatted_graph_data.py +3 -2
- cognee/modules/graph/utils/__init__.py +1 -0
- cognee/modules/graph/utils/resolve_edges_to_text.py +71 -0
- cognee/modules/memify/__init__.py +1 -0
- cognee/modules/memify/memify.py +118 -0
- cognee/modules/notebooks/methods/__init__.py +5 -0
- cognee/modules/notebooks/methods/create_notebook.py +26 -0
- cognee/modules/notebooks/methods/delete_notebook.py +13 -0
- cognee/modules/notebooks/methods/get_notebook.py +21 -0
- cognee/modules/notebooks/methods/get_notebooks.py +18 -0
- cognee/modules/notebooks/methods/update_notebook.py +17 -0
- cognee/modules/notebooks/models/Notebook.py +53 -0
- cognee/modules/notebooks/models/__init__.py +1 -0
- cognee/modules/notebooks/operations/__init__.py +1 -0
- cognee/modules/notebooks/operations/run_in_local_sandbox.py +55 -0
- cognee/modules/pipelines/__init__.py +1 -1
- cognee/modules/pipelines/exceptions/tasks.py +18 -0
- cognee/modules/pipelines/layers/__init__.py +1 -0
- cognee/modules/pipelines/layers/check_pipeline_run_qualification.py +59 -0
- cognee/modules/pipelines/layers/pipeline_execution_mode.py +127 -0
- cognee/modules/pipelines/layers/reset_dataset_pipeline_run_status.py +28 -0
- cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +34 -0
- cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +55 -0
- cognee/modules/pipelines/layers/setup_and_check_environment.py +41 -0
- cognee/modules/pipelines/layers/validate_pipeline_tasks.py +20 -0
- cognee/modules/pipelines/methods/__init__.py +2 -0
- cognee/modules/pipelines/methods/get_pipeline_runs_by_dataset.py +34 -0
- cognee/modules/pipelines/methods/reset_pipeline_run_status.py +16 -0
- cognee/modules/pipelines/operations/__init__.py +0 -1
- cognee/modules/pipelines/operations/log_pipeline_run_initiated.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +24 -138
- cognee/modules/pipelines/operations/run_tasks.py +17 -41
- cognee/modules/retrieval/base_feedback.py +11 -0
- cognee/modules/retrieval/base_graph_retriever.py +18 -0
- cognee/modules/retrieval/base_retriever.py +1 -1
- cognee/modules/retrieval/code_retriever.py +8 -0
- cognee/modules/retrieval/coding_rules_retriever.py +31 -0
- cognee/modules/retrieval/completion_retriever.py +9 -3
- cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -0
- cognee/modules/retrieval/cypher_search_retriever.py +1 -9
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +29 -13
- cognee/modules/retrieval/graph_completion_cot_retriever.py +30 -13
- cognee/modules/retrieval/graph_completion_retriever.py +107 -56
- cognee/modules/retrieval/graph_summary_completion_retriever.py +5 -1
- cognee/modules/retrieval/insights_retriever.py +14 -3
- cognee/modules/retrieval/natural_language_retriever.py +0 -4
- cognee/modules/retrieval/summaries_retriever.py +1 -1
- cognee/modules/retrieval/temporal_retriever.py +152 -0
- cognee/modules/retrieval/user_qa_feedback.py +83 -0
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +7 -32
- cognee/modules/retrieval/utils/completion.py +10 -3
- cognee/modules/retrieval/utils/extract_uuid_from_node.py +18 -0
- cognee/modules/retrieval/utils/models.py +40 -0
- cognee/modules/search/methods/get_search_type_tools.py +168 -0
- cognee/modules/search/methods/no_access_control_search.py +47 -0
- cognee/modules/search/methods/search.py +239 -118
- cognee/modules/search/types/SearchResult.py +21 -0
- cognee/modules/search/types/SearchType.py +3 -0
- cognee/modules/search/types/__init__.py +1 -0
- cognee/modules/search/utils/__init__.py +2 -0
- cognee/modules/search/utils/prepare_search_result.py +41 -0
- cognee/modules/search/utils/transform_context_to_graph.py +38 -0
- cognee/modules/settings/get_settings.py +2 -2
- cognee/modules/sync/__init__.py +1 -0
- cognee/modules/sync/methods/__init__.py +23 -0
- cognee/modules/sync/methods/create_sync_operation.py +53 -0
- cognee/modules/sync/methods/get_sync_operation.py +107 -0
- cognee/modules/sync/methods/update_sync_operation.py +248 -0
- cognee/modules/sync/models/SyncOperation.py +142 -0
- cognee/modules/sync/models/__init__.py +3 -0
- cognee/modules/users/__init__.py +0 -1
- cognee/modules/users/methods/__init__.py +4 -1
- cognee/modules/users/methods/create_user.py +26 -1
- cognee/modules/users/methods/get_authenticated_user.py +36 -42
- cognee/modules/users/methods/get_default_user.py +3 -1
- cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +2 -1
- cognee/root_dir.py +19 -0
- cognee/shared/CodeGraphEntities.py +1 -0
- cognee/shared/logging_utils.py +143 -32
- cognee/shared/utils.py +0 -1
- cognee/tasks/codingagents/coding_rule_associations.py +127 -0
- cognee/tasks/graph/extract_graph_from_data.py +6 -2
- cognee/tasks/ingestion/save_data_item_to_storage.py +23 -0
- cognee/tasks/memify/__init__.py +2 -0
- cognee/tasks/memify/extract_subgraph.py +7 -0
- cognee/tasks/memify/extract_subgraph_chunks.py +11 -0
- cognee/tasks/repo_processor/get_local_dependencies.py +2 -0
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +144 -47
- cognee/tasks/storage/add_data_points.py +33 -3
- cognee/tasks/temporal_graph/__init__.py +1 -0
- cognee/tasks/temporal_graph/add_entities_to_event.py +85 -0
- cognee/tasks/temporal_graph/enrich_events.py +34 -0
- cognee/tasks/temporal_graph/extract_events_and_entities.py +32 -0
- cognee/tasks/temporal_graph/extract_knowledge_graph_from_events.py +41 -0
- cognee/tasks/temporal_graph/models.py +49 -0
- cognee/tests/integration/cli/__init__.py +3 -0
- cognee/tests/integration/cli/test_cli_integration.py +331 -0
- cognee/tests/integration/documents/PdfDocument_test.py +2 -2
- cognee/tests/integration/documents/TextDocument_test.py +2 -4
- cognee/tests/integration/documents/UnstructuredDocument_test.py +5 -8
- cognee/tests/{test_deletion.py → test_delete_hard.py} +0 -37
- cognee/tests/test_delete_soft.py +85 -0
- cognee/tests/test_kuzu.py +2 -2
- cognee/tests/test_neo4j.py +2 -2
- cognee/tests/test_permissions.py +3 -3
- cognee/tests/test_relational_db_migration.py +7 -5
- cognee/tests/test_search_db.py +136 -23
- cognee/tests/test_temporal_graph.py +167 -0
- cognee/tests/unit/api/__init__.py +1 -0
- cognee/tests/unit/api/test_conditional_authentication_endpoints.py +246 -0
- cognee/tests/unit/cli/__init__.py +3 -0
- cognee/tests/unit/cli/test_cli_commands.py +483 -0
- cognee/tests/unit/cli/test_cli_edge_cases.py +625 -0
- cognee/tests/unit/cli/test_cli_main.py +173 -0
- cognee/tests/unit/cli/test_cli_runner.py +62 -0
- cognee/tests/unit/cli/test_cli_utils.py +127 -0
- cognee/tests/unit/modules/retrieval/chunks_retriever_test.py +18 -2
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +12 -15
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +10 -15
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +4 -3
- cognee/tests/unit/modules/retrieval/insights_retriever_test.py +4 -2
- cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +18 -2
- cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +225 -0
- cognee/tests/unit/modules/users/__init__.py +1 -0
- cognee/tests/unit/modules/users/test_conditional_authentication.py +277 -0
- cognee/tests/unit/processing/utils/utils_test.py +20 -1
- {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dist-info}/METADATA +13 -9
- {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dist-info}/RECORD +247 -135
- cognee-0.3.0.dist-info/entry_points.txt +2 -0
- cognee/infrastructure/databases/graph/networkx/adapter.py +0 -1017
- cognee/infrastructure/pipeline/models/Operation.py +0 -60
- cognee/notebooks/github_analysis_step_by_step.ipynb +0 -37
- cognee/tests/tasks/descriptive_metrics/networkx_metrics_test.py +0 -7
- cognee/tests/unit/modules/search/search_methods_test.py +0 -223
- /cognee/{infrastructure/databases/graph/networkx → api/v1/memify}/__init__.py +0 -0
- /cognee/{infrastructure/pipeline/models → tasks/codingagents}/__init__.py +0 -0
- {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dist-info}/WHEEL +0 -0
- {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.2.3.dev1.dist-info → cognee-0.3.0.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -1,16 +1,35 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import math
|
|
3
3
|
import os
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from typing import AsyncGenerator
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Set
|
|
6
|
+
from typing import AsyncGenerator, Optional, List
|
|
7
7
|
from uuid import NAMESPACE_OID, uuid5
|
|
8
8
|
|
|
9
9
|
from cognee.infrastructure.engine import DataPoint
|
|
10
10
|
from cognee.shared.CodeGraphEntities import CodeFile, Repository
|
|
11
11
|
|
|
12
|
-
|
|
13
|
-
|
|
12
|
+
# constant, declared only once
|
|
13
|
+
EXCLUDED_DIRS: Set[str] = {
|
|
14
|
+
".venv",
|
|
15
|
+
"venv",
|
|
16
|
+
"env",
|
|
17
|
+
".env",
|
|
18
|
+
"site-packages",
|
|
19
|
+
"node_modules",
|
|
20
|
+
"dist",
|
|
21
|
+
"build",
|
|
22
|
+
".git",
|
|
23
|
+
"tests",
|
|
24
|
+
"test",
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
async def get_source_code_files(
|
|
29
|
+
repo_path,
|
|
30
|
+
language_config: dict[str, list[str]] | None = None,
|
|
31
|
+
excluded_paths: Optional[List[str]] = None,
|
|
32
|
+
):
|
|
14
33
|
"""
|
|
15
34
|
Retrieve Python source code files from the specified repository path.
|
|
16
35
|
|
|
@@ -20,40 +39,70 @@ async def get_source_code_files(repo_path):
|
|
|
20
39
|
|
|
21
40
|
Parameters:
|
|
22
41
|
-----------
|
|
23
|
-
|
|
24
|
-
|
|
42
|
+
- repo_path: Root path of the repository to search
|
|
43
|
+
- language_config: dict mapping language names to file extensions, e.g.,
|
|
44
|
+
{'python': ['.py'], 'javascript': ['.js', '.jsx'], ...}
|
|
45
|
+
- excluded_paths: Optional list of path fragments or glob patterns to exclude
|
|
25
46
|
|
|
26
47
|
Returns:
|
|
27
48
|
--------
|
|
28
|
-
|
|
29
|
-
A list of absolute paths to .py files that contain source code, excluding empty
|
|
30
|
-
files, test files, and files from a virtual environment.
|
|
49
|
+
A list of (absolute_path, language) tuples for source code files.
|
|
31
50
|
"""
|
|
32
|
-
if not os.path.exists(repo_path):
|
|
33
|
-
return {}
|
|
34
|
-
|
|
35
|
-
py_files_paths = (
|
|
36
|
-
os.path.join(root, file)
|
|
37
|
-
for root, _, files in os.walk(repo_path)
|
|
38
|
-
for file in files
|
|
39
|
-
if (
|
|
40
|
-
file.endswith(".py")
|
|
41
|
-
and not file.startswith("test_")
|
|
42
|
-
and not file.endswith("_test")
|
|
43
|
-
and ".venv" not in file
|
|
44
|
-
)
|
|
45
|
-
)
|
|
46
51
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
52
|
+
def _get_language_from_extension(file, language_config):
|
|
53
|
+
for lang, exts in language_config.items():
|
|
54
|
+
for ext in exts:
|
|
55
|
+
if file.endswith(ext):
|
|
56
|
+
return lang
|
|
57
|
+
return None
|
|
58
|
+
|
|
59
|
+
# Default config if not provided
|
|
60
|
+
if language_config is None:
|
|
61
|
+
language_config = {
|
|
62
|
+
"python": [".py"],
|
|
63
|
+
"javascript": [".js", ".jsx"],
|
|
64
|
+
"typescript": [".ts", ".tsx"],
|
|
65
|
+
"java": [".java"],
|
|
66
|
+
"csharp": [".cs"],
|
|
67
|
+
"go": [".go"],
|
|
68
|
+
"rust": [".rs"],
|
|
69
|
+
"cpp": [".cpp", ".c", ".h", ".hpp"],
|
|
70
|
+
}
|
|
53
71
|
|
|
54
|
-
|
|
72
|
+
if not os.path.exists(repo_path):
|
|
73
|
+
return []
|
|
55
74
|
|
|
56
|
-
|
|
75
|
+
source_code_files = set()
|
|
76
|
+
for root, _, files in os.walk(repo_path):
|
|
77
|
+
for file in files:
|
|
78
|
+
lang = _get_language_from_extension(file, language_config)
|
|
79
|
+
if lang is None:
|
|
80
|
+
continue
|
|
81
|
+
# Exclude tests, common build/venv directories and files provided in exclude_paths
|
|
82
|
+
excluded_dirs = EXCLUDED_DIRS
|
|
83
|
+
excluded_paths = {Path(p).resolve() for p in (excluded_paths or [])} # full paths
|
|
84
|
+
|
|
85
|
+
root_path = Path(root).resolve()
|
|
86
|
+
root_parts = set(root_path.parts) # same as before
|
|
87
|
+
base_name, _ext = os.path.splitext(file)
|
|
88
|
+
if (
|
|
89
|
+
base_name.startswith("test_")
|
|
90
|
+
or base_name.endswith("_test")
|
|
91
|
+
or ".test." in file
|
|
92
|
+
or ".spec." in file
|
|
93
|
+
or (excluded_dirs & root_parts) # name match
|
|
94
|
+
or any(
|
|
95
|
+
root_path.is_relative_to(p) # full-path match
|
|
96
|
+
for p in excluded_paths
|
|
97
|
+
)
|
|
98
|
+
):
|
|
99
|
+
continue
|
|
100
|
+
file_path = os.path.abspath(os.path.join(root, file))
|
|
101
|
+
if os.path.getsize(file_path) == 0:
|
|
102
|
+
continue
|
|
103
|
+
source_code_files.add((file_path, lang))
|
|
104
|
+
|
|
105
|
+
return sorted(list(source_code_files))
|
|
57
106
|
|
|
58
107
|
|
|
59
108
|
def run_coroutine(coroutine_func, *args, **kwargs):
|
|
@@ -85,22 +134,26 @@ def run_coroutine(coroutine_func, *args, **kwargs):
|
|
|
85
134
|
|
|
86
135
|
|
|
87
136
|
async def get_repo_file_dependencies(
|
|
88
|
-
repo_path: str,
|
|
137
|
+
repo_path: str,
|
|
138
|
+
detailed_extraction: bool = False,
|
|
139
|
+
supported_languages: list = None,
|
|
140
|
+
excluded_paths: Optional[List[str]] = None,
|
|
89
141
|
) -> AsyncGenerator[DataPoint, None]:
|
|
90
142
|
"""
|
|
91
|
-
Generate a dependency graph for
|
|
143
|
+
Generate a dependency graph for source files (multi-language) in the given repository path.
|
|
92
144
|
|
|
93
145
|
Check the validity of the repository path and yield a repository object followed by the
|
|
94
|
-
dependencies of
|
|
146
|
+
dependencies of source files within that repository. Raise a FileNotFoundError if the
|
|
95
147
|
provided path does not exist. The extraction of detailed dependencies can be controlled
|
|
96
|
-
via the `detailed_extraction` argument.
|
|
148
|
+
via the `detailed_extraction` argument. Languages considered can be restricted via
|
|
149
|
+
the `supported_languages` argument.
|
|
97
150
|
|
|
98
151
|
Parameters:
|
|
99
152
|
-----------
|
|
100
153
|
|
|
101
|
-
- repo_path (str): The file path to the repository
|
|
102
|
-
- detailed_extraction (bool):
|
|
103
|
-
|
|
154
|
+
- repo_path (str): The file path to the repository to process.
|
|
155
|
+
- detailed_extraction (bool): Whether to perform a detailed extraction of code parts.
|
|
156
|
+
- supported_languages (list | None): Subset of languages to include; if None, use defaults.
|
|
104
157
|
"""
|
|
105
158
|
|
|
106
159
|
if isinstance(repo_path, list) and len(repo_path) == 1:
|
|
@@ -109,7 +162,28 @@ async def get_repo_file_dependencies(
|
|
|
109
162
|
if not os.path.exists(repo_path):
|
|
110
163
|
raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
|
|
111
164
|
|
|
112
|
-
|
|
165
|
+
# Build language config from supported_languages
|
|
166
|
+
default_language_config = {
|
|
167
|
+
"python": [".py"],
|
|
168
|
+
"javascript": [".js", ".jsx"],
|
|
169
|
+
"typescript": [".ts", ".tsx"],
|
|
170
|
+
"java": [".java"],
|
|
171
|
+
"csharp": [".cs"],
|
|
172
|
+
"go": [".go"],
|
|
173
|
+
"rust": [".rs"],
|
|
174
|
+
"cpp": [".cpp", ".c", ".h", ".hpp"],
|
|
175
|
+
"c": [".c", ".h"],
|
|
176
|
+
}
|
|
177
|
+
if supported_languages is not None:
|
|
178
|
+
language_config = {
|
|
179
|
+
k: v for k, v in default_language_config.items() if k in supported_languages
|
|
180
|
+
}
|
|
181
|
+
else:
|
|
182
|
+
language_config = default_language_config
|
|
183
|
+
|
|
184
|
+
source_code_files = await get_source_code_files(
|
|
185
|
+
repo_path, language_config=language_config, excluded_paths=excluded_paths
|
|
186
|
+
)
|
|
113
187
|
|
|
114
188
|
repo = Repository(
|
|
115
189
|
id=uuid5(NAMESPACE_OID, repo_path),
|
|
@@ -128,19 +202,42 @@ async def get_repo_file_dependencies(
|
|
|
128
202
|
for chunk_number in range(number_of_chunks)
|
|
129
203
|
]
|
|
130
204
|
|
|
131
|
-
#
|
|
205
|
+
# Import dependency extractors for each language (Python for now, extend later)
|
|
132
206
|
from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
|
|
207
|
+
import aiofiles
|
|
208
|
+
# TODO: Add other language extractors here
|
|
133
209
|
|
|
134
210
|
for start_range, end_range in chunk_ranges:
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
211
|
+
tasks = []
|
|
212
|
+
for file_path, lang in source_code_files[start_range : end_range + 1]:
|
|
213
|
+
# For now, only Python is supported; extend with other languages
|
|
214
|
+
if lang == "python":
|
|
215
|
+
tasks.append(
|
|
216
|
+
get_local_script_dependencies(repo_path, file_path, detailed_extraction)
|
|
217
|
+
)
|
|
218
|
+
else:
|
|
219
|
+
# Placeholder: create a minimal CodeFile for other languages
|
|
220
|
+
async def make_codefile_stub(file_path=file_path, lang=lang):
|
|
221
|
+
async with aiofiles.open(
|
|
222
|
+
file_path, "r", encoding="utf-8", errors="replace"
|
|
223
|
+
) as f:
|
|
224
|
+
source = await f.read()
|
|
225
|
+
return CodeFile(
|
|
226
|
+
id=uuid5(NAMESPACE_OID, file_path),
|
|
227
|
+
name=os.path.relpath(file_path, repo_path),
|
|
228
|
+
file_path=file_path,
|
|
229
|
+
language=lang,
|
|
230
|
+
source_code=source,
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
tasks.append(make_codefile_stub())
|
|
140
234
|
|
|
141
235
|
results: list[CodeFile] = await asyncio.gather(*tasks)
|
|
142
236
|
|
|
143
237
|
for source_code_file in results:
|
|
144
238
|
source_code_file.part_of = repo
|
|
145
|
-
|
|
239
|
+
if getattr(
|
|
240
|
+
source_code_file, "language", None
|
|
241
|
+
) is None and source_code_file.file_path.endswith(".py"):
|
|
242
|
+
source_code_file.language = "python"
|
|
146
243
|
yield source_code_file
|
|
@@ -10,7 +10,37 @@ from cognee.tasks.storage.exceptions import (
|
|
|
10
10
|
)
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
async def add_data_points(
|
|
13
|
+
async def add_data_points(
|
|
14
|
+
data_points: List[DataPoint], update_edge_collection: bool = True
|
|
15
|
+
) -> List[DataPoint]:
|
|
16
|
+
"""
|
|
17
|
+
Add a batch of data points to the graph database by extracting nodes and edges,
|
|
18
|
+
deduplicating them, and indexing them for retrieval.
|
|
19
|
+
|
|
20
|
+
This function parallelizes the graph extraction for each data point,
|
|
21
|
+
merges the resulting nodes and edges, and ensures uniqueness before
|
|
22
|
+
committing them to the underlying graph engine. It also updates the
|
|
23
|
+
associated retrieval indices for nodes and (optionally) edges.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
data_points (List[DataPoint]):
|
|
27
|
+
A list of data points to process and insert into the graph.
|
|
28
|
+
update_edge_collection (bool, optional):
|
|
29
|
+
Whether to update the edge index after adding edges.
|
|
30
|
+
Defaults to True.
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
List[DataPoint]:
|
|
34
|
+
The original list of data points after processing and insertion.
|
|
35
|
+
|
|
36
|
+
Side Effects:
|
|
37
|
+
- Calls `get_graph_from_model` concurrently for each data point.
|
|
38
|
+
- Deduplicates nodes and edges across all results.
|
|
39
|
+
- Updates the node index via `index_data_points`.
|
|
40
|
+
- Inserts nodes and edges into the graph engine.
|
|
41
|
+
- Optionally updates the edge index via `index_graph_edges`.
|
|
42
|
+
"""
|
|
43
|
+
|
|
14
44
|
if not isinstance(data_points, list):
|
|
15
45
|
raise InvalidDataPointsInAddDataPointsError("data_points must be a list.")
|
|
16
46
|
if not all(isinstance(dp, DataPoint) for dp in data_points):
|
|
@@ -48,7 +78,7 @@ async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
|
|
|
48
78
|
await graph_engine.add_nodes(nodes)
|
|
49
79
|
await graph_engine.add_edges(edges)
|
|
50
80
|
|
|
51
|
-
|
|
52
|
-
|
|
81
|
+
if update_edge_collection:
|
|
82
|
+
await index_graph_edges()
|
|
53
83
|
|
|
54
84
|
return data_points
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from cognee.modules.engine.models import Event
|
|
2
|
+
from cognee.tasks.temporal_graph.models import EventWithEntities
|
|
3
|
+
from cognee.modules.engine.models.Entity import Entity
|
|
4
|
+
from cognee.modules.engine.models.EntityType import EntityType
|
|
5
|
+
from cognee.infrastructure.engine.models.Edge import Edge
|
|
6
|
+
from cognee.modules.engine.utils import generate_node_id, generate_node_name
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def add_entities_to_event(event: Event, event_with_entities: EventWithEntities) -> None:
|
|
10
|
+
"""
|
|
11
|
+
Adds extracted entities to an Event object by populating its attributes field.
|
|
12
|
+
|
|
13
|
+
For each attribute in the provided EventWithEntities, the function ensures that
|
|
14
|
+
the corresponding entity type exists, creates an Entity node with metadata, and
|
|
15
|
+
links it to the event via an Edge representing the relationship. Entities are
|
|
16
|
+
cached by type to avoid duplication.
|
|
17
|
+
|
|
18
|
+
Args:
|
|
19
|
+
event (Event): The target Event object to enrich with entities.
|
|
20
|
+
event_with_entities (EventWithEntities): An event model containing extracted
|
|
21
|
+
attributes with entity, type, and relationship metadata.
|
|
22
|
+
|
|
23
|
+
Returns:
|
|
24
|
+
None
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
if not event_with_entities.attributes:
|
|
28
|
+
return
|
|
29
|
+
|
|
30
|
+
# Create entity types cache
|
|
31
|
+
entity_types = {}
|
|
32
|
+
|
|
33
|
+
# Process each attribute
|
|
34
|
+
for attribute in event_with_entities.attributes:
|
|
35
|
+
# Get or create entity type
|
|
36
|
+
entity_type = get_or_create_entity_type(entity_types, attribute.entity_type)
|
|
37
|
+
|
|
38
|
+
# Create entity
|
|
39
|
+
entity_id = generate_node_id(attribute.entity)
|
|
40
|
+
entity_name = generate_node_name(attribute.entity)
|
|
41
|
+
entity = Entity(
|
|
42
|
+
id=entity_id,
|
|
43
|
+
name=entity_name,
|
|
44
|
+
is_a=entity_type,
|
|
45
|
+
description=f"Entity {attribute.entity} of type {attribute.entity_type}",
|
|
46
|
+
ontology_valid=False,
|
|
47
|
+
belongs_to_set=None,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
# Create edge
|
|
51
|
+
edge = Edge(relationship_type=attribute.relationship)
|
|
52
|
+
|
|
53
|
+
# Add to event attributes
|
|
54
|
+
if event.attributes is None:
|
|
55
|
+
event.attributes = []
|
|
56
|
+
event.attributes.append((edge, [entity]))
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_or_create_entity_type(entity_types: dict, entity_type_name: str) -> EntityType:
|
|
60
|
+
"""
|
|
61
|
+
Retrieves an existing EntityType from the cache or creates a new one if it does not exist.
|
|
62
|
+
|
|
63
|
+
If the given entity type name is not already in the cache, a new EntityType is generated
|
|
64
|
+
with a unique ID, normalized name, and description, then added to the cache.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
entity_types (dict): A cache mapping entity type names to EntityType objects.
|
|
68
|
+
entity_type_name (str): The name of the entity type to retrieve or create.
|
|
69
|
+
|
|
70
|
+
Returns:
|
|
71
|
+
EntityType: The existing or newly created EntityType object.
|
|
72
|
+
"""
|
|
73
|
+
if entity_type_name not in entity_types:
|
|
74
|
+
type_id = generate_node_id(entity_type_name)
|
|
75
|
+
type_name = generate_node_name(entity_type_name)
|
|
76
|
+
entity_type = EntityType(
|
|
77
|
+
id=type_id,
|
|
78
|
+
name=type_name,
|
|
79
|
+
type=type_name,
|
|
80
|
+
description=f"Type for {entity_type_name}",
|
|
81
|
+
ontology_valid=False,
|
|
82
|
+
)
|
|
83
|
+
entity_types[entity_type_name] = entity_type
|
|
84
|
+
|
|
85
|
+
return entity_types[entity_type_name]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
from cognee.infrastructure.llm import LLMGateway
|
|
4
|
+
from cognee.modules.engine.models import Event
|
|
5
|
+
from cognee.tasks.temporal_graph.models import EventWithEntities, EventEntityList
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
async def enrich_events(events: List[Event]) -> List[EventWithEntities]:
|
|
9
|
+
"""
|
|
10
|
+
Enriches a list of events by extracting entities using an LLM.
|
|
11
|
+
|
|
12
|
+
The function serializes event data into JSON, sends it to the LLM for
|
|
13
|
+
entity extraction, and returns enriched events with associated entities.
|
|
14
|
+
|
|
15
|
+
Args:
|
|
16
|
+
events (List[Event]): A list of Event objects to be enriched.
|
|
17
|
+
|
|
18
|
+
Returns:
|
|
19
|
+
List[EventWithEntities]: A list of events augmented with extracted entities.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import json
|
|
23
|
+
|
|
24
|
+
# Convert events to JSON format for LLM processing
|
|
25
|
+
events_json = [
|
|
26
|
+
{"event_name": event.name, "description": event.description or ""} for event in events
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
events_json_str = json.dumps(events_json)
|
|
30
|
+
|
|
31
|
+
# Extract entities from events
|
|
32
|
+
entity_result = await LLMGateway.extract_event_entities(events_json_str, EventEntityList)
|
|
33
|
+
|
|
34
|
+
return entity_result.events
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import Type, List
|
|
3
|
+
from cognee.infrastructure.llm.LLMGateway import LLMGateway
|
|
4
|
+
from cognee.modules.chunking.models import DocumentChunk
|
|
5
|
+
from cognee.tasks.temporal_graph.models import EventList
|
|
6
|
+
from cognee.modules.engine.utils.generate_event_datapoint import generate_event_datapoint
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
async def extract_events_and_timestamps(data_chunks: List[DocumentChunk]) -> List[DocumentChunk]:
    """
    Runs LLM event extraction over every chunk and stores the results on the chunks.

    One ``LLMGateway.extract_event_graph`` call is issued per chunk (using the
    chunk's ``text`` and the ``EventList`` model) and all calls are awaited
    together. Each extracted event is converted with ``generate_event_datapoint``
    and appended to the ``contains`` list of the chunk it came from.

    Args:
        data_chunks (List[DocumentChunk]): Chunks whose ``text`` should be scanned for events.

    Returns:
        List[DocumentChunk]: The very same list object that was passed in; its
            chunks are mutated in place.
    """
    extraction_calls = [
        LLMGateway.extract_event_graph(chunk.text, EventList) for chunk in data_chunks
    ]
    # gather() yields results in the order of the awaitables, so the zip below
    # lines every result up with the chunk it was extracted from.
    extracted_per_chunk = await asyncio.gather(*extraction_calls)

    for chunk, extracted in zip(data_chunks, extracted_per_chunk):
        for raw_event in extracted.events:
            # NOTE(review): assumes chunk.contains is already a list whenever
            # at least one event was extracted - confirm it cannot be None here.
            chunk.contains.append(generate_event_datapoint(raw_event))

    return data_chunks
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from cognee.modules.chunking.models import DocumentChunk
|
|
3
|
+
from cognee.modules.engine.models import Event
|
|
4
|
+
from cognee.tasks.temporal_graph.enrich_events import enrich_events
|
|
5
|
+
from cognee.tasks.temporal_graph.add_entities_to_event import add_entities_to_event
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
async def extract_knowledge_graph_from_events(
    data_chunks: List[DocumentChunk],
) -> List[DocumentChunk]:
    """
    Attaches LLM-extracted entities to the events already stored on the chunks.

    All ``Event`` instances found in the chunks' ``contains`` lists are gathered
    into one flat list and sent through ``enrich_events`` in a single call.
    Each returned item is then handed, together with its source event, to
    ``add_entities_to_event``.

    Args:
        data_chunks (List[DocumentChunk]): Chunks whose ``contains`` lists may hold Event objects.

    Returns:
        List[DocumentChunk]: The same list that was passed in. When no chunk
            holds an Event it is returned without any enrichment call.
    """
    # Flatten every Event across all chunks, keeping chunk order and in-chunk order.
    chunk_events = [
        member
        for chunk in data_chunks
        for member in chunk.contains
        if isinstance(member, Event)
    ]

    if chunk_events:
        enrichments = await enrich_events(chunk_events)

        # NOTE(review): pairing is purely positional and zip() stops at the
        # shorter sequence - this relies on enrich_events returning one item
        # per input event, in input order.
        for source_event, enrichment in zip(chunk_events, enrichments):
            add_entities_to_event(source_event, enrichment)

    return data_chunks
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from typing import Optional, List
|
|
2
|
+
from pydantic import BaseModel, Field
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
# Point in time split into integer calendar/clock components.
# Every field is required (`...`) and range-checked with inclusive bounds.
# The fields are validated independently: nothing in this class rejects an
# impossible combination such as month=2, day=31.
class Timestamp(BaseModel):
    year: int = Field(..., ge=1, le=9999)  # 1..9999
    month: int = Field(..., ge=1, le=12)  # 1..12
    day: int = Field(..., ge=1, le=31)  # 1..31, not checked against the month
    hour: int = Field(..., ge=0, le=23)  # 0..23
    minute: int = Field(..., ge=0, le=59)  # 0..59
    second: int = Field(..., ge=0, le=59)  # 0..59, so a leap second (60) is rejected
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Time span with two mandatory Timestamp bounds.
# No ordering is enforced here: starts_at is not required to precede ends_at.
class Interval(BaseModel):
    starts_at: Timestamp  # required lower bound
    ends_at: Timestamp  # required upper bound
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Variant of Interval in which either bound may be omitted (defaults to None).
# NOTE(review): presumably describes a time range extracted from a search
# query, where one or both ends can be open - confirm against the caller.
class QueryInterval(BaseModel):
    starts_at: Optional[Timestamp] = None  # None when no lower bound is given
    ends_at: Optional[Timestamp] = None  # None when no upper bound is given
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Plain pydantic description of a single event; only `name` is required.
# NOTE(review): sibling modules import a different `Event` from
# cognee.modules.engine.models - this class appears to be the raw extraction
# schema, not that datapoint. Confirm before mixing the two.
class Event(BaseModel):
    name: str  # required event name
    description: Optional[str] = None  # optional free text
    time_from: Optional[Timestamp] = None  # optional start of the event
    time_to: Optional[Timestamp] = None  # optional end of the event
    location: Optional[str] = None  # optional location as free text
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Wrapper model holding a required list of Event objects.
# It is the model passed as the second argument to
# LLMGateway.extract_event_graph in extract_events_and_timestamps, which then
# iterates over `.events` of each result.
# Documented with comments on purpose: pydantic copies a model docstring into
# the generated JSON schema as its description.
class EventList(BaseModel):
    events: List[Event]  # required; may be an empty list
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# One entity attached to an event; all three fields are required strings.
# NOTE(review): field meanings below are inferred from the names - confirm
# against the extraction prompt.
class EntityAttribute(BaseModel):
    entity: str  # presumably the entity's name
    entity_type: str  # presumably the entity's category/type label
    relationship: str  # presumably how the entity relates to the event
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# An event identified by name together with the entities attached to it.
# enrich_events sends the LLM items keyed "event_name" and "description",
# which are also the first two field names here.
class EventWithEntities(BaseModel):
    event_name: str  # required event name
    description: Optional[str] = None  # optional free text
    # Defaults to an empty list when no entities are supplied. pydantic copies
    # field defaults per instance, so this `[]` is not shared between models.
    attributes: List[EntityAttribute] = []
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Wrapper model holding a required list of EventWithEntities.
# It is the model passed as the second argument to
# LLMGateway.extract_event_entities in enrich_events, which returns `.events`
# of the result.
# Documented with comments on purpose: pydantic copies a model docstring into
# the generated JSON schema as its description.
class EventEntityList(BaseModel):
    events: List[EventWithEntities]  # required; may be an empty list
|