cognee 0.2.3.dev1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. cognee/__main__.py +4 -0
  2. cognee/api/v1/add/add.py +18 -6
  3. cognee/api/v1/cognify/code_graph_pipeline.py +7 -1
  4. cognee/api/v1/cognify/cognify.py +22 -107
  5. cognee/api/v1/cognify/routers/get_cognify_router.py +11 -3
  6. cognee/api/v1/datasets/routers/get_datasets_router.py +1 -1
  7. cognee/api/v1/responses/default_tools.py +4 -0
  8. cognee/api/v1/responses/dispatch_function.py +6 -1
  9. cognee/api/v1/responses/models.py +1 -1
  10. cognee/api/v1/search/search.py +6 -0
  11. cognee/cli/__init__.py +10 -0
  12. cognee/cli/_cognee.py +180 -0
  13. cognee/cli/commands/__init__.py +1 -0
  14. cognee/cli/commands/add_command.py +80 -0
  15. cognee/cli/commands/cognify_command.py +128 -0
  16. cognee/cli/commands/config_command.py +225 -0
  17. cognee/cli/commands/delete_command.py +80 -0
  18. cognee/cli/commands/search_command.py +149 -0
  19. cognee/cli/config.py +33 -0
  20. cognee/cli/debug.py +21 -0
  21. cognee/cli/echo.py +45 -0
  22. cognee/cli/exceptions.py +23 -0
  23. cognee/cli/minimal_cli.py +97 -0
  24. cognee/cli/reference.py +26 -0
  25. cognee/cli/suppress_logging.py +12 -0
  26. cognee/eval_framework/corpus_builder/corpus_builder_executor.py +2 -2
  27. cognee/eval_framework/eval_config.py +1 -1
  28. cognee/infrastructure/databases/graph/get_graph_engine.py +4 -9
  29. cognee/infrastructure/databases/graph/kuzu/adapter.py +64 -2
  30. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +49 -0
  31. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +5 -3
  32. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +16 -7
  33. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +5 -5
  34. cognee/infrastructure/databases/vector/embeddings/config.py +2 -2
  35. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +6 -6
  36. cognee/infrastructure/files/utils/get_data_file_path.py +14 -9
  37. cognee/infrastructure/files/utils/get_file_metadata.py +2 -1
  38. cognee/infrastructure/llm/LLMGateway.py +14 -5
  39. cognee/infrastructure/llm/config.py +5 -5
  40. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +16 -5
  41. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/knowledge_graph/extract_content_graph.py +19 -15
  42. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +3 -3
  43. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +3 -3
  44. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +2 -2
  45. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +14 -8
  46. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +6 -4
  47. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +3 -3
  48. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +2 -2
  49. cognee/infrastructure/llm/tokenizer/HuggingFace/adapter.py +3 -3
  50. cognee/infrastructure/llm/tokenizer/Mistral/adapter.py +3 -3
  51. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +6 -6
  52. cognee/infrastructure/llm/utils.py +7 -7
  53. cognee/modules/data/methods/__init__.py +2 -0
  54. cognee/modules/data/methods/create_authorized_dataset.py +19 -0
  55. cognee/modules/data/methods/get_authorized_dataset.py +11 -5
  56. cognee/modules/data/methods/get_authorized_dataset_by_name.py +16 -0
  57. cognee/modules/data/methods/load_or_create_datasets.py +2 -20
  58. cognee/modules/graph/methods/get_formatted_graph_data.py +3 -2
  59. cognee/modules/pipelines/__init__.py +1 -1
  60. cognee/modules/pipelines/exceptions/tasks.py +18 -0
  61. cognee/modules/pipelines/layers/__init__.py +1 -0
  62. cognee/modules/pipelines/layers/check_pipeline_run_qualification.py +59 -0
  63. cognee/modules/pipelines/layers/pipeline_execution_mode.py +127 -0
  64. cognee/modules/pipelines/layers/reset_dataset_pipeline_run_status.py +12 -0
  65. cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +34 -0
  66. cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +55 -0
  67. cognee/modules/pipelines/layers/setup_and_check_environment.py +41 -0
  68. cognee/modules/pipelines/layers/validate_pipeline_tasks.py +20 -0
  69. cognee/modules/pipelines/methods/__init__.py +2 -0
  70. cognee/modules/pipelines/methods/get_pipeline_runs_by_dataset.py +34 -0
  71. cognee/modules/pipelines/methods/reset_pipeline_run_status.py +16 -0
  72. cognee/modules/pipelines/operations/__init__.py +0 -1
  73. cognee/modules/pipelines/operations/log_pipeline_run_initiated.py +1 -1
  74. cognee/modules/pipelines/operations/pipeline.py +23 -138
  75. cognee/modules/retrieval/base_feedback.py +11 -0
  76. cognee/modules/retrieval/cypher_search_retriever.py +1 -9
  77. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +9 -2
  78. cognee/modules/retrieval/graph_completion_cot_retriever.py +13 -6
  79. cognee/modules/retrieval/graph_completion_retriever.py +89 -5
  80. cognee/modules/retrieval/graph_summary_completion_retriever.py +2 -0
  81. cognee/modules/retrieval/natural_language_retriever.py +0 -4
  82. cognee/modules/retrieval/user_qa_feedback.py +83 -0
  83. cognee/modules/retrieval/utils/extract_uuid_from_node.py +18 -0
  84. cognee/modules/retrieval/utils/models.py +40 -0
  85. cognee/modules/search/methods/search.py +46 -5
  86. cognee/modules/search/types/SearchType.py +1 -0
  87. cognee/modules/settings/get_settings.py +2 -2
  88. cognee/shared/CodeGraphEntities.py +1 -0
  89. cognee/shared/logging_utils.py +142 -31
  90. cognee/shared/utils.py +0 -1
  91. cognee/tasks/graph/extract_graph_from_data.py +6 -2
  92. cognee/tasks/repo_processor/get_local_dependencies.py +2 -0
  93. cognee/tasks/repo_processor/get_repo_file_dependencies.py +120 -48
  94. cognee/tasks/storage/add_data_points.py +33 -3
  95. cognee/tests/integration/cli/__init__.py +3 -0
  96. cognee/tests/integration/cli/test_cli_integration.py +331 -0
  97. cognee/tests/integration/documents/PdfDocument_test.py +2 -2
  98. cognee/tests/integration/documents/TextDocument_test.py +2 -4
  99. cognee/tests/integration/documents/UnstructuredDocument_test.py +5 -8
  100. cognee/tests/{test_deletion.py → test_delete_hard.py} +0 -37
  101. cognee/tests/test_delete_soft.py +85 -0
  102. cognee/tests/test_kuzu.py +2 -2
  103. cognee/tests/test_neo4j.py +2 -2
  104. cognee/tests/test_search_db.py +126 -7
  105. cognee/tests/unit/cli/__init__.py +3 -0
  106. cognee/tests/unit/cli/test_cli_commands.py +483 -0
  107. cognee/tests/unit/cli/test_cli_edge_cases.py +625 -0
  108. cognee/tests/unit/cli/test_cli_main.py +173 -0
  109. cognee/tests/unit/cli/test_cli_runner.py +62 -0
  110. cognee/tests/unit/cli/test_cli_utils.py +127 -0
  111. cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +3 -3
  112. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +3 -3
  113. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +3 -3
  114. cognee/tests/unit/modules/search/search_methods_test.py +2 -0
  115. {cognee-0.2.3.dev1.dist-info → cognee-0.2.4.dist-info}/METADATA +7 -5
  116. {cognee-0.2.3.dev1.dist-info → cognee-0.2.4.dist-info}/RECORD +120 -83
  117. cognee-0.2.4.dist-info/entry_points.txt +2 -0
  118. cognee/infrastructure/databases/graph/networkx/__init__.py +0 -0
  119. cognee/infrastructure/databases/graph/networkx/adapter.py +0 -1017
  120. cognee/infrastructure/pipeline/models/Operation.py +0 -60
  121. cognee/infrastructure/pipeline/models/__init__.py +0 -0
  122. cognee/notebooks/github_analysis_step_by_step.ipynb +0 -37
  123. cognee/tests/tasks/descriptive_metrics/networkx_metrics_test.py +0 -7
  124. {cognee-0.2.3.dev1.dist-info → cognee-0.2.4.dist-info}/WHEEL +0 -0
  125. {cognee-0.2.3.dev1.dist-info → cognee-0.2.4.dist-info}/licenses/LICENSE +0 -0
  126. {cognee-0.2.3.dev1.dist-info → cognee-0.2.4.dist-info}/licenses/NOTICE.md +0 -0
@@ -15,14 +15,43 @@ from typing import Protocol
15
15
  # Configure external library logging
16
16
  def configure_external_library_logging():
17
17
  """Configure logging for external libraries to reduce verbosity"""
18
+ # Set environment variables to suppress LiteLLM logging
19
+ os.environ.setdefault("LITELLM_LOG", "ERROR")
20
+ os.environ.setdefault("LITELLM_SET_VERBOSE", "False")
21
+
18
22
  # Configure LiteLLM logging to reduce verbosity
19
23
  try:
20
24
  import litellm
21
25
 
26
+ # Disable verbose logging
22
27
  litellm.set_verbose = False
23
28
 
24
- # Suppress LiteLLM ERROR logging using standard logging
25
- logging.getLogger("litellm").setLevel(logging.CRITICAL)
29
+ # Set additional LiteLLM configuration
30
+ if hasattr(litellm, "suppress_debug_info"):
31
+ litellm.suppress_debug_info = True
32
+ if hasattr(litellm, "turn_off_message"):
33
+ litellm.turn_off_message = True
34
+ if hasattr(litellm, "_turn_on_debug"):
35
+ litellm._turn_on_debug = False
36
+
37
+ # Comprehensive logger suppression
38
+ loggers_to_suppress = [
39
+ "litellm",
40
+ "litellm.litellm_core_utils.logging_worker",
41
+ "litellm.litellm_core_utils",
42
+ "litellm.proxy",
43
+ "litellm.router",
44
+ "openai._base_client",
45
+ "LiteLLM", # Capital case variant
46
+ "LiteLLM.core",
47
+ "LiteLLM.logging_worker",
48
+ "litellm.logging_worker",
49
+ ]
50
+
51
+ for logger_name in loggers_to_suppress:
52
+ logging.getLogger(logger_name).setLevel(logging.CRITICAL)
53
+ logging.getLogger(logger_name).disabled = True
54
+
26
55
  except ImportError:
27
56
  # LiteLLM not available, skip configuration
28
57
  pass
@@ -173,29 +202,17 @@ def log_database_configuration(logger):
173
202
  from cognee.infrastructure.databases.graph.config import get_graph_config
174
203
 
175
204
  try:
176
- # Log relational database configuration
177
- relational_config = get_relational_config()
178
- if relational_config.db_provider == "postgres":
179
- logger.info(f"Postgres host: {relational_config.db_host}:{relational_config.db_port}")
180
- elif relational_config.db_provider == "sqlite":
181
- logger.info(f"SQLite path: {relational_config.db_path}")
182
-
183
- # Log vector database configuration
184
- vector_config = get_vectordb_config()
185
- if vector_config.vector_db_provider == "lancedb":
186
- logger.info(f"Vector database path: {vector_config.vector_db_url}")
187
- else:
188
- logger.info(f"Vector database URL: {vector_config.vector_db_url}")
189
-
190
- # Log graph database configuration
191
- graph_config = get_graph_config()
192
- if graph_config.graph_database_provider == "kuzu":
193
- logger.info(f"Graph database path: {graph_config.graph_file_path}")
194
- else:
195
- logger.info(f"Graph database URL: {graph_config.graph_database_url}")
205
+ # Get base database directory path
206
+ from cognee.base_config import get_base_config
207
+
208
+ base_config = get_base_config()
209
+ databases_path = os.path.join(base_config.system_root_directory, "databases")
210
+
211
+ # Log concise database info
212
+ logger.info(f"Database storage: {databases_path}")
196
213
 
197
214
  except Exception as e:
198
- logger.warning(f"Could not retrieve database configuration: {str(e)}")
215
+ logger.debug(f"Could not retrieve database configuration: {str(e)}")
199
216
 
200
217
 
201
218
  def cleanup_old_logs(logs_dir, max_files):
@@ -216,13 +233,22 @@ def cleanup_old_logs(logs_dir, max_files):
216
233
 
217
234
  # Remove old files that exceed the maximum
218
235
  if len(log_files) > max_files:
236
+ deleted_count = 0
219
237
  for old_file in log_files[max_files:]:
220
238
  try:
221
239
  old_file.unlink()
222
- logger.info(f"Deleted old log file: {old_file}")
240
+ deleted_count += 1
241
+ # Only log individual files in non-CLI mode
242
+ if os.getenv("COGNEE_CLI_MODE") != "true":
243
+ logger.info(f"Deleted old log file: {old_file}")
223
244
  except Exception as e:
245
+ # Always log errors
224
246
  logger.error(f"Failed to delete old log file {old_file}: {e}")
225
247
 
248
+ # In CLI mode, show compact summary
249
+ if os.getenv("COGNEE_CLI_MODE") == "true" and deleted_count > 0:
250
+ logger.info(f"Cleaned up {deleted_count} old log files")
251
+
226
252
  return True
227
253
  except Exception as e:
228
254
  logger.error(f"Error cleaning up log files: {e}")
@@ -241,11 +267,81 @@ def setup_logging(log_level=None, name=None):
241
267
  """
242
268
  global _is_structlog_configured
243
269
 
270
+ # Regular detailed logging for non-CLI usage
244
271
  log_level = log_level if log_level else log_levels[os.getenv("LOG_LEVEL", "INFO")]
245
272
 
246
273
  # Configure external library logging early to suppress verbose output
247
274
  configure_external_library_logging()
248
275
 
276
+ # Add custom filter to suppress LiteLLM worker cancellation errors
277
+ class LiteLLMCancellationFilter(logging.Filter):
278
+ """Filter to suppress LiteLLM worker cancellation messages"""
279
+
280
+ def filter(self, record):
281
+ # Check if this is a LiteLLM-related logger
282
+ if hasattr(record, "name") and "litellm" in record.name.lower():
283
+ return False
284
+
285
+ # Check message content for cancellation errors
286
+ if hasattr(record, "msg") and record.msg:
287
+ msg_str = str(record.msg).lower()
288
+ if any(
289
+ keyword in msg_str
290
+ for keyword in [
291
+ "loggingworker cancelled",
292
+ "logging_worker.py",
293
+ "cancellederror",
294
+ "litellm:error",
295
+ ]
296
+ ):
297
+ return False
298
+
299
+ # Check formatted message
300
+ try:
301
+ if hasattr(record, "getMessage"):
302
+ formatted_msg = record.getMessage().lower()
303
+ if any(
304
+ keyword in formatted_msg
305
+ for keyword in [
306
+ "loggingworker cancelled",
307
+ "logging_worker.py",
308
+ "cancellederror",
309
+ "litellm:error",
310
+ ]
311
+ ):
312
+ return False
313
+ except Exception:
314
+ pass
315
+
316
+ return True
317
+
318
+ # Apply the filter to root logger and specific loggers
319
+ cancellation_filter = LiteLLMCancellationFilter()
320
+ logging.getLogger().addFilter(cancellation_filter)
321
+ logging.getLogger("litellm").addFilter(cancellation_filter)
322
+
323
+ # Add custom filter to suppress LiteLLM worker cancellation errors
324
+ class LiteLLMFilter(logging.Filter):
325
+ def filter(self, record):
326
+ # Suppress LiteLLM worker cancellation errors
327
+ if hasattr(record, "msg") and isinstance(record.msg, str):
328
+ msg_lower = record.msg.lower()
329
+ if any(
330
+ phrase in msg_lower
331
+ for phrase in [
332
+ "loggingworker cancelled",
333
+ "cancellederror",
334
+ "logging_worker.py",
335
+ "loggingerror",
336
+ ]
337
+ ):
338
+ return False
339
+ return True
340
+
341
+ # Apply filter to root logger
342
+ litellm_filter = LiteLLMFilter()
343
+ logging.getLogger().addFilter(litellm_filter)
344
+
249
345
  def exception_handler(logger, method_name, event_dict):
250
346
  """Custom processor to handle uncaught exceptions."""
251
347
  # Check if there's an exc_info that needs to be processed
@@ -298,11 +394,6 @@ def setup_logging(log_level=None, name=None):
298
394
  # Hand back to the original hook → prints traceback and exits
299
395
  sys.__excepthook__(exc_type, exc_value, traceback)
300
396
 
301
- logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai")
302
- logger.info(
303
- "Need help? Reach out to us on our Discord server: https://discord.gg/NQPKmU5CCg"
304
- )
305
-
306
397
  # Install exception handlers
307
398
  sys.excepthook = handle_exception
308
399
 
@@ -380,18 +471,38 @@ def setup_logging(log_level=None, name=None):
380
471
  # Mark logging as configured
381
472
  _is_structlog_configured = True
382
473
 
474
+ from cognee.infrastructure.databases.relational.config import get_relational_config
475
+ from cognee.infrastructure.databases.vector.config import get_vectordb_config
476
+ from cognee.infrastructure.databases.graph.config import get_graph_config
477
+
478
+ graph_config = get_graph_config()
479
+ vector_config = get_vectordb_config()
480
+ relational_config = get_relational_config()
481
+
482
+ try:
483
+ # Get base database directory path
484
+ from cognee.base_config import get_base_config
485
+
486
+ base_config = get_base_config()
487
+ databases_path = os.path.join(base_config.system_root_directory, "databases")
488
+ except Exception as e:
489
+ raise ValueError from e
490
+
383
491
  # Get a configured logger and log system information
384
492
  logger = structlog.get_logger(name if name else __name__)
493
+ # Detailed initialization for regular usage
385
494
  logger.info(
386
495
  "Logging initialized",
387
496
  python_version=PYTHON_VERSION,
388
497
  structlog_version=STRUCTLOG_VERSION,
389
498
  cognee_version=COGNEE_VERSION,
390
499
  os_info=OS_INFO,
500
+ database_path=databases_path,
501
+ graph_database_name=graph_config.graph_database_name,
502
+ vector_config=vector_config.vector_db_provider,
503
+ relational_config=relational_config.db_name,
391
504
  )
392
505
 
393
- logger.info("Want to learn more? Visit the Cognee documentation: https://docs.cognee.ai")
394
-
395
506
  # Log database configuration
396
507
  log_database_configuration(logger)
397
508
 
cognee/shared/utils.py CHANGED
@@ -3,7 +3,6 @@
3
3
  import os
4
4
  import requests
5
5
  from datetime import datetime, timezone
6
- import networkx as nx
7
6
  import matplotlib.pyplot as plt
8
7
  import http.server
9
8
  import socketserver
@@ -1,5 +1,5 @@
1
1
  import asyncio
2
- from typing import Type, List
2
+ from typing import Type, List, Optional
3
3
  from pydantic import BaseModel
4
4
 
5
5
  from cognee.infrastructure.databases.graph import get_graph_engine
@@ -71,6 +71,7 @@ async def extract_graph_from_data(
71
71
  data_chunks: List[DocumentChunk],
72
72
  graph_model: Type[BaseModel],
73
73
  ontology_adapter: OntologyResolver = None,
74
+ custom_prompt: Optional[str] = None,
74
75
  ) -> List[DocumentChunk]:
75
76
  """
76
77
  Extracts and integrates a knowledge graph from the text content of document chunks using a specified graph model.
@@ -84,7 +85,10 @@ async def extract_graph_from_data(
84
85
  raise InvalidGraphModelError(graph_model)
85
86
 
86
87
  chunk_graphs = await asyncio.gather(
87
- *[LLMGateway.extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
88
+ *[
89
+ LLMGateway.extract_content_graph(chunk.text, graph_model, custom_prompt=custom_prompt)
90
+ for chunk in data_chunks
91
+ ]
88
92
  )
89
93
 
90
94
  # Note: Filter edges with missing source or target nodes
@@ -180,6 +180,7 @@ async def get_local_script_dependencies(
180
180
  name=file_path_relative_to_repo,
181
181
  source_code=source_code,
182
182
  file_path=script_path,
183
+ language="python",
183
184
  )
184
185
  return code_file_node
185
186
 
@@ -188,6 +189,7 @@ async def get_local_script_dependencies(
188
189
  name=file_path_relative_to_repo,
189
190
  source_code=None,
190
191
  file_path=script_path,
192
+ language="python",
191
193
  )
192
194
 
193
195
  async for part in extract_code_parts(source_code_tree.root_node, script_path=script_path):
@@ -10,50 +10,80 @@ from cognee.infrastructure.engine import DataPoint
10
10
  from cognee.shared.CodeGraphEntities import CodeFile, Repository
11
11
 
12
12
 
13
- async def get_source_code_files(repo_path):
13
+ async def get_source_code_files(repo_path, language_config: dict[str, list[str]] | None = None):
14
14
  """
15
- Retrieve Python source code files from the specified repository path.
16
-
17
- This function scans the given repository path for files that have the .py extension
18
- while excluding test files and files within a virtual environment. It returns a list of
19
- absolute paths to the source code files that are not empty.
15
+ Retrieve source code files from the specified repository path for multiple languages.
20
16
 
21
17
  Parameters:
22
18
  -----------
23
-
24
- - repo_path: The file path to the repository to search for Python source files.
19
+ - repo_path: The file path to the repository to search for source files.
20
+ - language_config: dict mapping language names to file extensions, e.g.,
21
+ {'python': ['.py'], 'javascript': ['.js', '.jsx'], ...}
25
22
 
26
23
  Returns:
27
24
  --------
28
-
29
- A list of absolute paths to .py files that contain source code, excluding empty
30
- files, test files, and files from a virtual environment.
25
+ A list of (absolute_path, language) tuples for source code files.
31
26
  """
32
- if not os.path.exists(repo_path):
33
- return {}
34
-
35
- py_files_paths = (
36
- os.path.join(root, file)
37
- for root, _, files in os.walk(repo_path)
38
- for file in files
39
- if (
40
- file.endswith(".py")
41
- and not file.startswith("test_")
42
- and not file.endswith("_test")
43
- and ".venv" not in file
44
- )
45
- )
46
27
 
47
- source_code_files = set()
48
- for file_path in py_files_paths:
49
- file_path = os.path.abspath(file_path)
50
-
51
- if os.path.getsize(file_path) == 0:
52
- continue
28
+ def _get_language_from_extension(file, language_config):
29
+ for lang, exts in language_config.items():
30
+ for ext in exts:
31
+ if file.endswith(ext):
32
+ return lang
33
+ return None
34
+
35
+ # Default config if not provided
36
+ if language_config is None:
37
+ language_config = {
38
+ "python": [".py"],
39
+ "javascript": [".js", ".jsx"],
40
+ "typescript": [".ts", ".tsx"],
41
+ "java": [".java"],
42
+ "csharp": [".cs"],
43
+ "go": [".go"],
44
+ "rust": [".rs"],
45
+ "cpp": [".cpp", ".c", ".h", ".hpp"],
46
+ }
53
47
 
54
- source_code_files.add(file_path)
48
+ if not os.path.exists(repo_path):
49
+ return []
55
50
 
56
- return list(source_code_files)
51
+ source_code_files = set()
52
+ for root, _, files in os.walk(repo_path):
53
+ for file in files:
54
+ lang = _get_language_from_extension(file, language_config)
55
+ if lang is None:
56
+ continue
57
+ # Exclude tests and common build/venv directories
58
+ excluded_dirs = {
59
+ ".venv",
60
+ "venv",
61
+ "env",
62
+ ".env",
63
+ "site-packages",
64
+ "node_modules",
65
+ "dist",
66
+ "build",
67
+ ".git",
68
+ "tests",
69
+ "test",
70
+ }
71
+ root_parts = set(os.path.normpath(root).split(os.sep))
72
+ base_name, _ext = os.path.splitext(file)
73
+ if (
74
+ base_name.startswith("test_")
75
+ or base_name.endswith("_test") # catches Go's *_test.go and similar
76
+ or ".test." in file
77
+ or ".spec." in file
78
+ or (excluded_dirs & root_parts)
79
+ ):
80
+ continue
81
+ file_path = os.path.abspath(os.path.join(root, file))
82
+ if os.path.getsize(file_path) == 0:
83
+ continue
84
+ source_code_files.add((file_path, lang))
85
+
86
+ return sorted(list(source_code_files))
57
87
 
58
88
 
59
89
  def run_coroutine(coroutine_func, *args, **kwargs):
@@ -85,22 +115,23 @@ def run_coroutine(coroutine_func, *args, **kwargs):
85
115
 
86
116
 
87
117
  async def get_repo_file_dependencies(
88
- repo_path: str, detailed_extraction: bool = False
118
+ repo_path: str, detailed_extraction: bool = False, supported_languages: list = None
89
119
  ) -> AsyncGenerator[DataPoint, None]:
90
120
  """
91
- Generate a dependency graph for Python files in the given repository path.
121
+ Generate a dependency graph for source files (multi-language) in the given repository path.
92
122
 
93
123
  Check the validity of the repository path and yield a repository object followed by the
94
- dependencies of Python files within that repository. Raise a FileNotFoundError if the
124
+ dependencies of source files within that repository. Raise a FileNotFoundError if the
95
125
  provided path does not exist. The extraction of detailed dependencies can be controlled
96
- via the `detailed_extraction` argument.
126
+ via the `detailed_extraction` argument. Languages considered can be restricted via
127
+ the `supported_languages` argument.
97
128
 
98
129
  Parameters:
99
130
  -----------
100
131
 
101
- - repo_path (str): The file path to the repository where Python files are located.
102
- - detailed_extraction (bool): A flag indicating whether to perform a detailed
103
- extraction of dependencies (default is False). (default False)
132
+ - repo_path (str): The file path to the repository to process.
133
+ - detailed_extraction (bool): Whether to perform a detailed extraction of code parts.
134
+ - supported_languages (list | None): Subset of languages to include; if None, use defaults.
104
135
  """
105
136
 
106
137
  if isinstance(repo_path, list) and len(repo_path) == 1:
@@ -109,7 +140,25 @@ async def get_repo_file_dependencies(
109
140
  if not os.path.exists(repo_path):
110
141
  raise FileNotFoundError(f"Repository path {repo_path} does not exist.")
111
142
 
112
- source_code_files = await get_source_code_files(repo_path)
143
+ # Build language config from supported_languages
144
+ default_language_config = {
145
+ "python": [".py"],
146
+ "javascript": [".js", ".jsx"],
147
+ "typescript": [".ts", ".tsx"],
148
+ "java": [".java"],
149
+ "csharp": [".cs"],
150
+ "go": [".go"],
151
+ "rust": [".rs"],
152
+ "cpp": [".cpp", ".c", ".h", ".hpp"],
153
+ }
154
+ if supported_languages is not None:
155
+ language_config = {
156
+ k: v for k, v in default_language_config.items() if k in supported_languages
157
+ }
158
+ else:
159
+ language_config = default_language_config
160
+
161
+ source_code_files = await get_source_code_files(repo_path, language_config=language_config)
113
162
 
114
163
  repo = Repository(
115
164
  id=uuid5(NAMESPACE_OID, repo_path),
@@ -128,19 +177,42 @@ async def get_repo_file_dependencies(
128
177
  for chunk_number in range(number_of_chunks)
129
178
  ]
130
179
 
131
- # Codegraph dependencies are not installed by default, so we import where we use them.
180
+ # Import dependency extractors for each language (Python for now, extend later)
132
181
  from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
182
+ import aiofiles
183
+ # TODO: Add other language extractors here
133
184
 
134
185
  for start_range, end_range in chunk_ranges:
135
- # with ProcessPoolExecutor(max_workers=12) as executor:
136
- tasks = [
137
- get_local_script_dependencies(repo_path, file_path, detailed_extraction)
138
- for file_path in source_code_files[start_range : end_range + 1]
139
- ]
186
+ tasks = []
187
+ for file_path, lang in source_code_files[start_range : end_range + 1]:
188
+ # For now, only Python is supported; extend with other languages
189
+ if lang == "python":
190
+ tasks.append(
191
+ get_local_script_dependencies(repo_path, file_path, detailed_extraction)
192
+ )
193
+ else:
194
+ # Placeholder: create a minimal CodeFile for other languages
195
+ async def make_codefile_stub(file_path=file_path, lang=lang):
196
+ async with aiofiles.open(
197
+ file_path, "r", encoding="utf-8", errors="replace"
198
+ ) as f:
199
+ source = await f.read()
200
+ return CodeFile(
201
+ id=uuid5(NAMESPACE_OID, file_path),
202
+ name=os.path.relpath(file_path, repo_path),
203
+ file_path=file_path,
204
+ language=lang,
205
+ source_code=source,
206
+ )
207
+
208
+ tasks.append(make_codefile_stub())
140
209
 
141
210
  results: list[CodeFile] = await asyncio.gather(*tasks)
142
211
 
143
212
  for source_code_file in results:
144
213
  source_code_file.part_of = repo
145
-
214
+ if getattr(
215
+ source_code_file, "language", None
216
+ ) is None and source_code_file.file_path.endswith(".py"):
217
+ source_code_file.language = "python"
146
218
  yield source_code_file
@@ -10,7 +10,37 @@ from cognee.tasks.storage.exceptions import (
10
10
  )
11
11
 
12
12
 
13
- async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
13
+ async def add_data_points(
14
+ data_points: List[DataPoint], update_edge_collection: bool = True
15
+ ) -> List[DataPoint]:
16
+ """
17
+ Add a batch of data points to the graph database by extracting nodes and edges,
18
+ deduplicating them, and indexing them for retrieval.
19
+
20
+ This function parallelizes the graph extraction for each data point,
21
+ merges the resulting nodes and edges, and ensures uniqueness before
22
+ committing them to the underlying graph engine. It also updates the
23
+ associated retrieval indices for nodes and (optionally) edges.
24
+
25
+ Args:
26
+ data_points (List[DataPoint]):
27
+ A list of data points to process and insert into the graph.
28
+ update_edge_collection (bool, optional):
29
+ Whether to update the edge index after adding edges.
30
+ Defaults to True.
31
+
32
+ Returns:
33
+ List[DataPoint]:
34
+ The original list of data points after processing and insertion.
35
+
36
+ Side Effects:
37
+ - Calls `get_graph_from_model` concurrently for each data point.
38
+ - Deduplicates nodes and edges across all results.
39
+ - Updates the node index via `index_data_points`.
40
+ - Inserts nodes and edges into the graph engine.
41
+ - Optionally updates the edge index via `index_graph_edges`.
42
+ """
43
+
14
44
  if not isinstance(data_points, list):
15
45
  raise InvalidDataPointsInAddDataPointsError("data_points must be a list.")
16
46
  if not all(isinstance(dp, DataPoint) for dp in data_points):
@@ -48,7 +78,7 @@ async def add_data_points(data_points: List[DataPoint]) -> List[DataPoint]:
48
78
  await graph_engine.add_nodes(nodes)
49
79
  await graph_engine.add_edges(edges)
50
80
 
51
- # This step has to happen after adding nodes and edges because we query the graph.
52
- await index_graph_edges()
81
+ if update_edge_collection:
82
+ await index_graph_edges()
53
83
 
54
84
  return data_points
@@ -0,0 +1,3 @@
1
+ """
2
+ CLI integration tests package.
3
+ """