cognee 0.2.3.dev1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/__main__.py +4 -0
- cognee/api/v1/add/add.py +18 -6
- cognee/api/v1/cognify/code_graph_pipeline.py +7 -1
- cognee/api/v1/cognify/cognify.py +22 -107
- cognee/api/v1/cognify/routers/get_cognify_router.py +11 -3
- cognee/api/v1/datasets/routers/get_datasets_router.py +1 -1
- cognee/api/v1/responses/default_tools.py +4 -0
- cognee/api/v1/responses/dispatch_function.py +6 -1
- cognee/api/v1/responses/models.py +1 -1
- cognee/api/v1/search/search.py +6 -0
- cognee/cli/__init__.py +10 -0
- cognee/cli/_cognee.py +180 -0
- cognee/cli/commands/__init__.py +1 -0
- cognee/cli/commands/add_command.py +80 -0
- cognee/cli/commands/cognify_command.py +128 -0
- cognee/cli/commands/config_command.py +225 -0
- cognee/cli/commands/delete_command.py +80 -0
- cognee/cli/commands/search_command.py +149 -0
- cognee/cli/config.py +33 -0
- cognee/cli/debug.py +21 -0
- cognee/cli/echo.py +45 -0
- cognee/cli/exceptions.py +23 -0
- cognee/cli/minimal_cli.py +97 -0
- cognee/cli/reference.py +26 -0
- cognee/cli/suppress_logging.py +12 -0
- cognee/eval_framework/corpus_builder/corpus_builder_executor.py +2 -2
- cognee/eval_framework/eval_config.py +1 -1
- cognee/infrastructure/databases/graph/get_graph_engine.py +4 -9
- cognee/infrastructure/databases/graph/kuzu/adapter.py +64 -2
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +49 -0
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +5 -3
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +16 -7
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +5 -5
- cognee/infrastructure/databases/vector/embeddings/config.py +2 -2
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +6 -6
- cognee/infrastructure/files/utils/get_data_file_path.py +14 -9
- cognee/infrastructure/files/utils/get_file_metadata.py +2 -1
- cognee/infrastructure/llm/LLMGateway.py +14 -5
- cognee/infrastructure/llm/config.py +5 -5
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +16 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_content_graph.py +19 -15
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +3 -3
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +3 -3
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +14 -8
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +6 -4
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +3 -3
- cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +2 -2
- cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py +3 -3
- cognee/infrastructure/llm/tokenizer/Mistral/adapter.py +3 -3
- cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +6 -6
- cognee/infrastructure/llm/utils.py +7 -7
- cognee/modules/data/methods/__init__.py +2 -0
- cognee/modules/data/methods/create_authorized_dataset.py +19 -0
- cognee/modules/data/methods/get_authorized_dataset.py +11 -5
- cognee/modules/data/methods/get_authorized_dataset_by_name.py +16 -0
- cognee/modules/data/methods/load_or_create_datasets.py +2 -20
- cognee/modules/graph/methods/get_formatted_graph_data.py +3 -2
- cognee/modules/pipelines/__init__.py +1 -1
- cognee/modules/pipelines/exceptions/tasks.py +18 -0
- cognee/modules/pipelines/layers/__init__.py +1 -0
- cognee/modules/pipelines/layers/check_pipeline_run_qualification.py +59 -0
- cognee/modules/pipelines/layers/pipeline_execution_mode.py +127 -0
- cognee/modules/pipelines/layers/reset_dataset_pipeline_run_status.py +12 -0
- cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +34 -0
- cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +55 -0
- cognee/modules/pipelines/layers/setup_and_check_environment.py +41 -0
- cognee/modules/pipelines/layers/validate_pipeline_tasks.py +20 -0
- cognee/modules/pipelines/methods/__init__.py +2 -0
- cognee/modules/pipelines/methods/get_pipeline_runs_by_dataset.py +34 -0
- cognee/modules/pipelines/methods/reset_pipeline_run_status.py +16 -0
- cognee/modules/pipelines/operations/__init__.py +0 -1
- cognee/modules/pipelines/operations/log_pipeline_run_initiated.py +1 -1
- cognee/modules/pipelines/operations/pipeline.py +23 -138
- cognee/modules/retrieval/base_feedback.py +11 -0
- cognee/modules/retrieval/cypher_search_retriever.py +1 -9
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +9 -2
- cognee/modules/retrieval/graph_completion_cot_retriever.py +13 -6
- cognee/modules/retrieval/graph_completion_retriever.py +89 -5
- cognee/modules/retrieval/graph_summary_completion_retriever.py +2 -0
- cognee/modules/retrieval/natural_language_retriever.py +0 -4
- cognee/modules/retrieval/user_qa_feedback.py +83 -0
- cognee/modules/retrieval/utils/extract_uuid_from_node.py +18 -0
- cognee/modules/retrieval/utils/models.py +40 -0
- cognee/modules/search/methods/search.py +46 -5
- cognee/modules/search/types/SearchType.py +1 -0
- cognee/modules/settings/get_settings.py +2 -2
- cognee/shared/CodeGraphEntities.py +1 -0
- cognee/shared/logging_utils.py +142 -31
- cognee/shared/utils.py +0 -1
- cognee/tasks/graph/extract_graph_from_data.py +6 -2
- cognee/tasks/repo_processor/get_local_dependencies.py +2 -0
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +120 -48
- cognee/tasks/storage/add_data_points.py +33 -3
- cognee/tests/integration/cli/__init__.py +3 -0
- cognee/tests/integration/cli/test_cli_integration.py +331 -0
- cognee/tests/integration/documents/PdfDocument_test.py +2 -2
- cognee/tests/integration/documents/TextDocument_test.py +2 -4
- cognee/tests/integration/documents/UnstructuredDocument_test.py +5 -8
- cognee/tests/{test_deletion.py → test_delete_hard.py} +0 -37
- cognee/tests/test_delete_soft.py +85 -0
- cognee/tests/test_kuzu.py +2 -2
- cognee/tests/test_neo4j.py +2 -2
- cognee/tests/test_search_db.py +126 -7
- cognee/tests/unit/cli/__init__.py +3 -0
- cognee/tests/unit/cli/test_cli_commands.py +483 -0
- cognee/tests/unit/cli/test_cli_edge_cases.py +625 -0
- cognee/tests/unit/cli/test_cli_main.py +173 -0
- cognee/tests/unit/cli/test_cli_runner.py +62 -0
- cognee/tests/unit/cli/test_cli_utils.py +127 -0
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +3 -3
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +3 -3
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +3 -3
- cognee/tests/unit/modules/search/search_methods_test.py +2 -0
- {cognee-0.2.3.dev1.dist-info → cognee-0.2.4.dist-info}/METADATA +7 -5
- {cognee-0.2.3.dev1.dist-info → cognee-0.2.4.dist-info}/RECORD +120 -83
- cognee-0.2.4.dist-info/entry_points.txt +2 -0
- cognee/infrastructure/databases/graph/networkx/__init__.py +0 -0
- cognee/infrastructure/databases/graph/networkx/adapter.py +0 -1017
- cognee/infrastructure/pipeline/models/Operation.py +0 -60
- cognee/infrastructure/pipeline/models/__init__.py +0 -0
- cognee/notebooks/github_analysis_step_by_step.ipynb +0 -37
- cognee/tests/tasks/descriptive_metrics/networkx_metrics_test.py +0 -7
- {cognee-0.2.3.dev1.dist-info → cognee-0.2.4.dist-info}/WHEEL +0 -0
- {cognee-0.2.3.dev1.dist-info → cognee-0.2.4.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.2.3.dev1.dist-info → cognee-0.2.4.dist-info}/licenses/NOTICE.md +0 -0
cognee/shared/logging_utils.py
CHANGED
|
@@ -15,14 +15,43 @@ from typing import Protocol
|
|
|
15
15
|
# Configure external library logging
|
|
16
16
|
def configure_external_library_logging():
|
|
17
17
|
"""Configure logging for external libraries to reduce verbosity"""
|
|
18
|
+
# Set environment variables to suppress LiteLLM logging
|
|
19
|
+
os.environ.setdefault("LITELLM_LOG", "ERROR")
|
|
20
|
+
os.environ.setdefault("LITELLM_SET_VERBOSE", "False")
|
|
21
|
+
|
|
18
22
|
# Configure LiteLLM logging to reduce verbosity
|
|
19
23
|
try:
|
|
20
24
|
import litellm
|
|
21
25
|
|
|
26
|
+
# Disable verbose logging
|
|
22
27
|
litellm.set_verbose = False
|
|
23
28
|
|
|
24
|
-
#
|
|
25
|
-
|
|
29
|
+
# Set additional LiteLLM configuration
|
|
30
|
+
if hasattr(litellm, "suppress_debug_info"):
|
|
31
|
+
litellm.suppress_debug_info = True
|
|
32
|
+
if hasattr(litellm, "turn_off_message"):
|
|
33
|
+
litellm.turn_off_message = True
|
|
34
|
+
if hasattr(litellm, "_turn_on_debug"):
|
|
35
|
+
litellm._turn_on_debug = False
|
|
36
|
+
|
|
37
|
+
# Comprehensive logger suppression
|
|
38
|
+
loggers_to_suppress = [
|
|
39
|
+
"litellm",
|
|
40
|
+
"litellm.litellm_core_utils.logging_worker",
|
|
41
|
+
"litellm.litellm_core_utils",
|
|
42
|
+
"litellm.proxy",
|
|
43
|
+
"litellm.router",
|
|
44
|
+
"openai._base_client",
|
|
45
|
+
"LiteLLM", # Capital case variant
|
|
46
|
+
"LiteLLM.core",
|
|
47
|
+
"LiteLLM.logging_worker",
|
|
48
|
+
"litellm.logging_worker",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
for logger_name in loggers_to_suppress:
|
|
52
|
+
logging.getLogger(logger_name).setLevel(logging.CRITICAL)
|
|
53
|
+
logging.getLogger(logger_name).disabled = True
|
|
54
|
+
|
|
26
55
|
except ImportError:
|
|
27
56
|
# LiteLLM not available, skip configuration
|
|
28
57
|
pass
|
|
@@ -173,29 +202,17 @@ def log_database_configuration(logger):
|
|
|
173
202
|
from cognee.infrastructure.databases.graph.config import get_graph_config
|
|
174
203
|
|
|
175
204
|
try:
|
|
176
|
-
#
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
vector_config = get_vectordb_config()
|
|
185
|
-
if vector_config.vector_db_provider == "lancedb":
|
|
186
|
-
logger.info(f"Vector database path: {vector_config.vector_db_url}")
|
|
187
|
-
else:
|
|
188
|
-
logger.info(f"Vector database URL: {vector_config.vector_db_url}")
|
|
189
|
-
|
|
190
|
-
# Log graph database configuration
|
|
191
|
-
graph_config = get_graph_config()
|
|
192
|
-
if graph_config.graph_database_provider == "kuzu":
|
|
193
|
-
logger.info(f"Graph database path: {graph_config.graph_file_path}")
|
|
194
|
-
else:
|
|
195
|
-
logger.info(f"Graph database URL: {graph_config.graph_database_url}")
|
|
205
|
+
# Get base database directory path
|
|
206
|
+
from cognee.base_config import get_base_config
|
|
207
|
+
|
|
208
|
+
base_config = get_base_config()
|
|
209
|
+
databases_path = os.path.join(base_config.system_root_directory, "databases")
|
|
210
|
+
|
|
211
|
+
# Log concise database info
|
|
212
|
+
logger.info(f"Database storage: {databases_path}")
|
|
196
213
|
|
|
197
214
|
except Exception as e:
|
|
198
|
-
logger.
|
|
215
|
+
logger.debug(f"Could not retrieve database configuration: {str(e)}")
|
|
199
216
|
|
|
200
217
|
|
|
201
218
|
def cleanup_old_logs(logs_dir, max_files):
|
|
@@ -216,13 +233,22 @@ def cleanup_old_logs(logs_dir, max_files):
|
|
|
216
233
|
|
|
217
234
|
# Remove old files that exceed the maximum
|
|
218
235
|
if len(log_files) > max_files:
|
|
236
|
+
deleted_count = 0
|
|
219
237
|
for old_file in log_files[max_files:]:
|
|
220
238
|
try:
|
|
221
239
|
old_file.unlink()
|
|
222
|
-
|
|
240
|
+
deleted_count += 1
|
|
241
|
+
# Only log individual files in non-CLI mode
|
|
242
|
+
if os.getenv("COGNEE_CLI_MODE") != "true":
|
|
243
|
+
logger.info(f"Deleted old log file: {old_file}")
|
|
223
244
|
except Exception as e:
|
|
245
|
+
# Always log errors
|
|
224
246
|
logger.error(f"Failed to delete old log file {old_file}: {e}")
|
|
225
247
|
|
|
248
|
+
# In CLI mode, show compact summary
|
|
249
|
+
if os.getenv("COGNEE_CLI_MODE") == "true" and deleted_count > 0:
|
|
250
|
+
logger.info(f"Cleaned up {deleted_count} old log files")
|
|
251
|
+
|
|
226
252
|
return True
|
|
227
253
|
except Exception as e:
|
|
228
254
|
logger.error(f"Error cleaning up log files: {e}")
|
|
@@ -241,11 +267,81 @@ def setup_logging(log_level=None, name=None):
|
|
|
241
267
|
"""
|
|
242
268
|
global _is_structlog_configured
|
|
243
269
|
|
|
270
|
+
# Regular detailed logging for non-CLI usage
|
|
244
271
|
log_level = log_level if log_level else log_levels[os.getenv("LOG_LEVEL", "INFO")]
|
|
245
272
|
|
|
246
273
|
# Configure external library logging early to suppress verbose output
|
|
247
274
|
configure_external_library_logging()
|
|
248
275
|
|
|
276
|
+
# Add custom filter to suppress LiteLLM worker cancellation errors
|
|
277
|
+
class LiteLLMCancellationFilter(logging.Filter):
    """Filter to suppress LiteLLM records and LiteLLM worker cancellation messages.

    A record is rejected (``filter`` returns ``False``) when either:

    - its logger name contains ``litellm`` (case-insensitive), or
    - its raw message, or its %-formatted message, contains one of
      ``_SUPPRESSED_KEYWORDS``.

    Every other record is allowed through.
    """

    # Lower-case substrings that mark a message as LiteLLM cancellation noise.
    # Kept in one place so the raw-message and formatted-message checks can
    # never drift apart.
    _SUPPRESSED_KEYWORDS = (
        "loggingworker cancelled",
        "logging_worker.py",
        "cancellederror",
        "litellm:error",
    )

    @classmethod
    def _is_noise(cls, text):
        """Return True when ``text`` contains any suppressed keyword."""
        lowered = text.lower()
        return any(keyword in lowered for keyword in cls._SUPPRESSED_KEYWORDS)

    def filter(self, record):
        """Return False for records that should be dropped, True otherwise."""
        # Anything emitted by a LiteLLM-related logger is dropped outright.
        logger_name = getattr(record, "name", None)
        if logger_name and "litellm" in logger_name.lower():
            return False

        # Check the raw (unformatted) message first; msg may be any object.
        raw_message = getattr(record, "msg", None)
        if raw_message and self._is_noise(str(raw_message)):
            return False

        # Then check the formatted message, which also covers text passed via
        # %-args. Formatting can raise on malformed args; a filter must never
        # raise, so such records are simply let through.
        try:
            if self._is_noise(record.getMessage()):
                return False
        except Exception:
            pass

        return True
|
|
317
|
+
|
|
318
|
+
# Apply the filter to root logger and specific loggers
|
|
319
|
+
cancellation_filter = LiteLLMCancellationFilter()
|
|
320
|
+
logging.getLogger().addFilter(cancellation_filter)
|
|
321
|
+
logging.getLogger("litellm").addFilter(cancellation_filter)
|
|
322
|
+
|
|
323
|
+
# Add custom filter to suppress LiteLLM worker cancellation errors
|
|
324
|
+
class LiteLLMFilter(logging.Filter):
    """Suppress LiteLLM worker cancellation / logging errors.

    A record is rejected when its raw string message or its %-formatted
    message contains one of ``_SUPPRESSED_PHRASES`` (case-insensitive).
    Checking the formatted message as well means a phrase passed through
    %-args, or carried by a non-string ``msg`` object, is also caught.
    """

    # Lower-case substrings identifying the noise to drop.
    _SUPPRESSED_PHRASES = (
        "loggingworker cancelled",
        "cancellederror",
        "logging_worker.py",
        "loggingerror",
    )

    def filter(self, record):
        """Return False for records that should be dropped, True otherwise."""
        candidates = []

        raw_message = getattr(record, "msg", None)
        if isinstance(raw_message, str):
            candidates.append(raw_message)

        # getMessage() applies %-args and stringifies non-str messages. It can
        # raise on malformed args; a filter must never raise, so fall back to
        # the raw message only in that case.
        try:
            candidates.append(record.getMessage())
        except Exception:
            pass

        for text in candidates:
            lowered = text.lower()
            if any(phrase in lowered for phrase in self._SUPPRESSED_PHRASES):
                return False
        return True
|
|
340
|
+
|
|
341
|
+
# Apply filter to root logger
|
|
342
|
+
litellm_filter = LiteLLMFilter()
|
|
343
|
+
logging.getLogger().addFilter(litellm_filter)
|
|
344
|
+
|
|
249
345
|
def exception_handler(logger, method_name, event_dict):
|
|
250
346
|
"""Custom processor to handle uncaught exceptions."""
|
|
251
347
|
# Check if there's an exc_info that needs to be processed
|
|
@@ -298,11 +394,6 @@ def setup_logging(log_level=None, name=None):
|
|
|
298
394
|
# Hand back to the original hook → prints traceback and exits
|
|
299
395
|
sys.__excepthook__(exc_type, exc_value, traceback)
|
|
300
396
|
|
|
301
|
-
logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai")
|
|
302
|
-
logger.info(
|
|
303
|
-
"Need help? Reach out to us on our Discord server: https://discord.gg/NQPKmU5CCg"
|
|
304
|
-
)
|
|
305
|
-
|
|
306
397
|
# Install exception handlers
|
|
307
398
|
sys.excepthook = handle_exception
|
|
308
399
|
|
|
@@ -380,18 +471,38 @@ def setup_logging(log_level=None, name=None):
|
|
|
380
471
|
# Mark logging as configured
|
|
381
472
|
_is_structlog_configured = True
|
|
382
473
|
|
|
474
|
+
from cognee.infrastructure.databases.relational.config import get_relational_config
|
|
475
|
+
from cognee.infrastructure.databases.vector.config import get_vectordb_config
|
|
476
|
+
from cognee.infrastructure.databases.graph.config import get_graph_config
|
|
477
|
+
|
|
478
|
+
graph_config = get_graph_config()
|
|
479
|
+
vector_config = get_vectordb_config()
|
|
480
|
+
relational_config = get_relational_config()
|
|
481
|
+
|
|
482
|
+
try:
|
|
483
|
+
# Get base database directory path
|
|
484
|
+
from cognee.base_config import get_base_config
|
|
485
|
+
|
|
486
|
+
base_config = get_base_config()
|
|
487
|
+
databases_path = os.path.join(base_config.system_root_directory, "databases")
|
|
488
|
+
except Exception as e:
|
|
489
|
+
raise ValueError from e
|
|
490
|
+
|
|
383
491
|
# Get a configured logger and log system information
|
|
384
492
|
logger = structlog.get_logger(name if name else __name__)
|
|
493
|
+
# Detailed initialization for regular usage
|
|
385
494
|
logger.info(
|
|
386
495
|
"Logging initialized",
|
|
387
496
|
python_version=PYTHON_VERSION,
|
|
388
497
|
structlog_version=STRUCTLOG_VERSION,
|
|
389
498
|
cognee_version=COGNEE_VERSION,
|
|
390
499
|
os_info=OS_INFO,
|
|
500
|
+
database_path=databases_path,
|
|
501
|
+
graph_database_name=graph_config.graph_database_name,
|
|
502
|
+
vector_config=vector_config.vector_db_provider,
|
|
503
|
+
relational_config=relational_config.db_name,
|
|
391
504
|
)
|
|
392
505
|
|
|
393
|
-
logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai")
|
|
394
|
-
|
|
395
506
|
# Log database configuration
|
|
396
507
|
log_database_configuration(logger)
|
|
397
508
|
|
cognee/shared/utils.py
CHANGED
|
cognee/tasks/graph/extract_graph_from_data.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
-
from typing import Type, List
|
|
2
|
+
from typing import Type, List, Optional
|
|
3
3
|
from pydantic import BaseModel
|
|
4
4
|
|
|
5
5
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
|
@@ -71,6 +71,7 @@ async def extract_graph_from_data(
|
|
|
71
71
|
data_chunks: List[DocumentChunk],
|
|
72
72
|
graph_model: Type[BaseModel],
|
|
73
73
|
ontology_adapter: OntologyResolver = None,
|
|
74
|
+
custom_prompt: Optional[str] = None,
|
|
74
75
|
) -> List[DocumentChunk]:
|
|
75
76
|
"""
|
|
76
77
|
Extracts and integrates a knowledge graph from the text content of document chunks using a specified graph model.
|
|
@@ -84,7 +85,10 @@ async def extract_graph_from_data(
|
|
|
84
85
|
raise InvalidGraphModelError(graph_model)
|
|
85
86
|
|
|
86
87
|
chunk_graphs = await asyncio.gather(
|
|
87
|
-
*[
|
|
88
|
+
*[
|
|
89
|
+
LLMGateway.extract_content_graph(chunk.text, graph_model, custom_prompt=custom_prompt)
|
|
90
|
+
for chunk in data_chunks
|
|
91
|
+
]
|
|
88
92
|
)
|
|
89
93
|
|
|
90
94
|
# Note: Filter edges with missing source or target nodes
|
|
cognee/tasks/repo_processor/get_local_dependencies.py
CHANGED
|
@@ -180,6 +180,7 @@ async def get_local_script_dependencies(
|
|
|
180
180
|
name=file_path_relative_to_repo,
|
|
181
181
|
source_code=source_code,
|
|
182
182
|
file_path=script_path,
|
|
183
|
+
language="python",
|
|
183
184
|
)
|
|
184
185
|
return code_file_node
|
|
185
186
|
|
|
@@ -188,6 +189,7 @@ async def get_local_script_dependencies(
|
|
|
188
189
|
name=file_path_relative_to_repo,
|
|
189
190
|
source_code=None,
|
|
190
191
|
file_path=script_path,
|
|
192
|
+
language="python",
|
|
191
193
|
)
|
|
192
194
|
|
|
193
195
|
async for part in extract_code_parts(source_code_tree.root_node, script_path=script_path):
|
|
cognee/tasks/repo_processor/get_repo_file_dependencies.py
CHANGED
|
@@ -10,50 +10,80 @@ from cognee.infrastructure.engine import DataPoint
|
|
|
10
10
|
from cognee.shared.CodeGraphEntities import CodeFile, Repository
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
async def get_source_code_files(repo_path):
|
|
13
|
+
async def get_source_code_files(repo_path, language_config: dict[str, list[str]] | None = None):
|
|
14
14
|
"""
|
|
15
|
-
Retrieve
|
|
16
|
-
|
|
17
|
-
This function scans the given repository path for files that have the .py extension
|
|
18
|
-
while excluding test files and files within a virtual environment. It returns a list of
|
|
19
|
-
absolute paths to the source code files that are not empty.
|
|
15
|
+
Retrieve source code files from the specified repository path for multiple languages.
|
|
20
16
|
|
|
21
17
|
Parameters:
|
|
22
18
|
-----------
|
|
23
|
-
|
|
24
|
-
-
|
|
19
|
+
- repo_path: The file path to the repository to search for source files.
|
|
20
|
+
- language_config: dict mapping language names to file extensions, e.g.,
|
|
21
|
+
{'python': ['.py'], 'javascript': ['.js', '.jsx'], ...}
|
|
25
22
|
|
|
26
23
|
Returns:
|
|
27
24
|
--------
|
|
28
|
-
|
|
29
|
-
A list of absolute paths to .py files that contain source code, excluding empty
|
|
30
|
-
files, test files, and files from a virtual environment.
|
|
25
|
+
A list of (absolute_path, language) tuples for source code files.
|
|
31
26
|
"""
|
|
32
|
-
if not os.path.exists(repo_path):
|
|
33
|
-
return {}
|
|
34
|
-
|
|
35
|
-
py_files_paths = (
|
|
36
|
-
os.path.join(root, file)
|
|
37
|
-
for root, _, files in os.walk(repo_path)
|
|
38
|
-
for file in files
|
|
39
|
-
if (
|
|
40
|
-
file.endswith(".py")
|
|
41
|
-
and not file.startswith("test_")
|
|
42
|
-
and not file.endswith("_test")
|
|
43
|
-
and ".venv" not in file
|
|
44
|
-
)
|
|
45
|
-
)
|
|
46
27
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
28
|
+
def _get_language_from_extension(file, language_config):
|
|
29
|
+
for lang, exts in language_config.items():
|
|
30
|
+
for ext in exts:
|
|
31
|
+
if file.endswith(ext):
|
|
32
|
+
return lang
|
|
33
|
+
return None
|
|
34
|
+
|
|
35
|
+
# Default config if not provided
|
|
36
|
+
if language_config is None:
|
|
37
|
+
language_config = {
|
|
38
|
+
"python": [".py"],
|
|
39
|
+
"javascript": [".js", ".jsx"],
|
|
40
|
+
"typescript": [".ts", ".tsx"],
|
|
41
|
+
"java": [".java"],
|
|
42
|
+
"csharp": [".cs"],
|
|
43
|
+
"go": [".go"],
|
|
44
|
+
"rust": [".rs"],
|
|
45
|
+
"cpp": [".cpp", ".c", ".h", ".hpp"],
|
|
46
|
+
}
|
|
53
47
|
|
|
54
|
-
|
|
48
|
+
if not os.path.exists(repo_path):
|
|
49
|
+
return []
|
|
55
50
|
|
|
56
|
-
|
|
51
|
+
source_code_files = set()
|
|
52
|
+
for root, _, files in os.walk(repo_path):
|
|
53
|
+
for file in files:
|
|
54
|
+
lang = _get_language_from_extension(file, language_config)
|
|
55
|
+
if lang is None:
|
|
56
|
+
continue
|
|
57
|
+
# Exclude tests and common build/venv directories
|
|
58
|
+
excluded_dirs = {
|
|
59
|
+
".venv",
|
|
60
|
+
"venv",
|
|
61
|
+
"env",
|
|
62
|
+
".env",
|
|
63
|
+
"site-packages",
|
|
64
|
+
"node_modules",
|
|
65
|
+
"dist",
|
|
66
|
+
"build",
|
|
67
|
+
".git",
|
|
68
|
+
"tests",
|
|
69
|
+
"test",
|
|
70
|
+
}
|
|
71
|
+
root_parts = set(os.path.normpath(root).split(os.sep))
|
|
72
|
+
base_name, _ext = os.path.splitext(file)
|
|
73
|
+
if (
|
|
74
|
+
base_name.startswith("test_")
|
|
75
|
+
or base_name.endswith("_test") # catches Go's *_test.go and similar
|
|
76
|
+
or ".test." in file
|
|
77
|
+
or ".spec." in file
|
|
78
|
+
or (excluded_dirs & root_parts)
|
|
79
|
+
):
|
|
80
|
+
continue
|
|
81
|
+
file_path = os.path.abspath(os.path.join(root, file))
|
|
82
|
+
if os.path.getsize(file_path) == 0:
|
|
83
|
+
continue
|
|
84
|
+
source_code_files.add((file_path, lang))
|
|
85
|
+
|
|
86
|
+
return sorted(list(source_code_files))
|
|
57
87
|
|
|
58
88
|
|
|
59
89
|
def run_coroutine(coroutine_func, *args, **kwargs):
|
|
@@ -85,22 +115,23 @@ def run_coroutine(coroutine_func, *args, **kwargs):
|
|
|
85
115
|
|
|
86
116
|
|
|
87
117
|
async def get_repo_file_dependencies(
|
|
88
|
-
repo_path: str, detailed_extraction: bool = False
|
|
118
|
+
repo_path: str, detailed_extraction: bool = False, supported_languages: list = None
|
|
89
119
|
) -> AsyncGenerator[DataPoint, None]:
|
|
90
120
|
"""
|
|
91
|
-
Generate a dependency graph for
|
|
121
|
+
Generate a dependency graph for source files (multi-language) in the given repository path.
|
|
92
122
|
|
|
93
123
|
Check the validity of the repository path and yield a repository object followed by the
|
|
94
|
-
dependencies of
|
|
124
|
+
dependencies of source files within that repository. Raise a FileNotFoundError if the
|
|
95
125
|
provided path does not exist. The extraction of detailed dependencies can be controlled
|
|
96
|
-
via the `detailed_extraction` argument.
|
|
126
|
+
via the `detailed_extraction` argument. Languages considered can be restricted via
|
|
127
|
+
the `supported_languages` argument.
|
|
97
128
|
|
|
98
129
|
Parameters:
|
|
99
130
|
-----------
|
|
100
131
|
|
|
101
|
-
- repo_path (str): The file path to the repository
|
|
102
|
-
- detailed_extraction (bool):
|
|
103
|
-
|
|
132
|
+
- repo_path (str): The file path to the repository to process.
|
|
133
|
+
- detailed_extraction (bool): Whether to perform a detailed extraction of code parts.
|
|
134
|
+
- supported_languages (list | None): Subset of languages to include; if None, use defaults.
|
|
104
135
|
"""
|
|
105
136
|
|
|
106
137
|
if isinstance(repo_path, list) and len(repo_path) == 1:
|
|
@@ -109,7 +140,25 @@ async def get_repo_file_dependencies(
|
|
|
109
140
|
if not os.path.exists(repo_path):
|
|
110
141
|
raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
|
|
111
142
|
|
|
112
|
-
|
|
143
|
+
# Build language config from supported_languages
|
|
144
|
+
default_language_config = {
|
|
145
|
+
"python": [".py"],
|
|
146
|
+
"javascript": [".js", ".jsx"],
|
|
147
|
+
"typescript": [".ts", ".tsx"],
|
|
148
|
+
"java": [".java"],
|
|
149
|
+
"csharp": [".cs"],
|
|
150
|
+
"go": [".go"],
|
|
151
|
+
"rust": [".rs"],
|
|
152
|
+
"cpp": [".cpp", ".c", ".h", ".hpp"],
|
|
153
|
+
}
|
|
154
|
+
if supported_languages is not None:
|
|
155
|
+
language_config = {
|
|
156
|
+
k: v for k, v in default_language_config.items() if k in supported_languages
|
|
157
|
+
}
|
|
158
|
+
else:
|
|
159
|
+
language_config = default_language_config
|
|
160
|
+
|
|
161
|
+
source_code_files = await get_source_code_files(repo_path, language_config=language_config)
|
|
113
162
|
|
|
114
163
|
repo = Repository(
|
|
115
164
|
id=uuid5(NAMESPACE_OID, repo_path),
|
|
@@ -128,19 +177,42 @@ async def get_repo_file_dependencies(
|
|
|
128
177
|
for chunk_number in range(number_of_chunks)
|
|
129
178
|
]
|
|
130
179
|
|
|
131
|
-
#
|
|
180
|
+
# Import dependency extractors for each language (Python for now, extend later)
|
|
132
181
|
from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
|
|
182
|
+
import aiofiles
|
|
183
|
+
# TODO: Add other language extractors here
|
|
133
184
|
|
|
134
185
|
for start_range, end_range in chunk_ranges:
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
186
|
+
tasks = []
|
|
187
|
+
for file_path, lang in source_code_files[start_range : end_range + 1]:
|
|
188
|
+
# For now, only Python is supported; extend with other languages
|
|
189
|
+
if lang == "python":
|
|
190
|
+
tasks.append(
|
|
191
|
+
get_local_script_dependencies(repo_path, file_path, detailed_extraction)
|
|
192
|
+
)
|
|
193
|
+
else:
|
|
194
|
+
# Placeholder: create a minimal CodeFile for other languages
|
|
195
|
+
async def make_codefile_stub(file_path=file_path, lang=lang):
|
|
196
|
+
async with aiofiles.open(
|
|
197
|
+
file_path, "r", encoding="utf-8", errors="replace"
|
|
198
|
+
) as f:
|
|
199
|
+
source = await f.read()
|
|
200
|
+
return CodeFile(
|
|
201
|
+
id=uuid5(NAMESPACE_OID, file_path),
|
|
202
|
+
name=os.path.relpath(file_path, repo_path),
|
|
203
|
+
file_path=file_path,
|
|
204
|
+
language=lang,
|
|
205
|
+
source_code=source,
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
tasks.append(make_codefile_stub())
|
|
140
209
|
|
|
141
210
|
results: list[CodeFile] = await asyncio.gather(*tasks)
|
|
142
211
|
|
|
143
212
|
for source_code_file in results:
|
|
144
213
|
source_code_file.part_of = repo
|
|
145
|
-
|
|
214
|
+
if getattr(
|
|
215
|
+
source_code_file, "language", None
|
|
216
|
+
) is None and source_code_file.file_path.endswith(".py"):
|
|
217
|
+
source_code_file.language = "python"
|
|
146
218
|
yield source_code_file
|
|
cognee/tasks/storage/add_data_points.py
CHANGED
|
@@ -10,7 +10,37 @@ from cognee.tasks.storage.exceptions import (
|
|
|
10
10
|
)
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
async def add_data_points(
|
|
13
|
+
async def add_data_points(
|
|
14
|
+
data_points: List[DataPoint], update_edge_collection: bool = True
|
|
15
|
+
) -> List[DataPoint]:
|
|
16
|
+
"""
|
|
17
|
+
Add a batch of data points to the graph database by extracting nodes and edges,
|
|
18
|
+
deduplicating them, and indexing them for retrieval.
|
|
19
|
+
|
|
20
|
+
This function parallelizes the graph extraction for each data point,
|
|
21
|
+
merges the resulting nodes and edges, and ensures uniqueness before
|
|
22
|
+
committing them to the underlying graph engine. It also updates the
|
|
23
|
+
associated retrieval indices for nodes and (optionally) edges.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
data_points (List[DataPoint]):
|
|
27
|
+
A list of data points to process and insert into the graph.
|
|
28
|
+
update_edge_collection (bool, optional):
|
|
29
|
+
Whether to update the edge index after adding edges.
|
|
30
|
+
Defaults to True.
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
List[DataPoint]:
|
|
34
|
+
The original list of data points after processing and insertion.
|
|
35
|
+
|
|
36
|
+
Side Effects:
|
|
37
|
+
- Calls `get_graph_from_model` concurrently for each data point.
|
|
38
|
+
- Deduplicates nodes and edges across all results.
|
|
39
|
+
- Updates the node index via `index_data_points`.
|
|
40
|
+
- Inserts nodes and edges into the graph engine.
|
|
41
|
+
- Optionally updates the edge index via `index_graph_edges`.
|
|
42
|
+
"""
|
|
43
|
+
|
|
14
44
|
if not isinstance(data_points, list):
|
|
15
45
|
raise InvalidDataPointsInAddDataPointsError("data_points must be a list.")
|
|
16
46
|
if not all(isinstance(dp, DataPoint) for dp in data_points):
|
|
@@ -48,7 +78,7 @@ async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
|
|
|
48
78
|
await graph_engine.add_nodes(nodes)
|
|
49
79
|
await graph_engine.add_edges(edges)
|
|
50
80
|
|
|
51
|
-
|
|
52
|
-
|
|
81
|
+
if update_edge_collection:
|
|
82
|
+
await index_graph_edges()
|
|
53
83
|
|
|
54
84
|
return data_points
|