cognee 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (227)
  1. cognee/__init__.py +1 -0
  2. cognee/api/client.py +9 -5
  3. cognee/api/v1/add/add.py +2 -1
  4. cognee/api/v1/add/routers/get_add_router.py +3 -1
  5. cognee/api/v1/cognify/cognify.py +24 -16
  6. cognee/api/v1/cognify/routers/__init__.py +0 -1
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +30 -1
  8. cognee/api/v1/datasets/routers/get_datasets_router.py +3 -3
  9. cognee/api/v1/ontologies/__init__.py +4 -0
  10. cognee/api/v1/ontologies/ontologies.py +158 -0
  11. cognee/api/v1/ontologies/routers/__init__.py +0 -0
  12. cognee/api/v1/ontologies/routers/get_ontology_router.py +109 -0
  13. cognee/api/v1/permissions/routers/get_permissions_router.py +41 -1
  14. cognee/api/v1/search/search.py +4 -0
  15. cognee/api/v1/ui/node_setup.py +360 -0
  16. cognee/api/v1/ui/npm_utils.py +50 -0
  17. cognee/api/v1/ui/ui.py +38 -68
  18. cognee/cli/commands/cognify_command.py +8 -1
  19. cognee/cli/config.py +1 -1
  20. cognee/context_global_variables.py +86 -9
  21. cognee/eval_framework/Dockerfile +29 -0
  22. cognee/eval_framework/answer_generation/answer_generation_executor.py +10 -0
  23. cognee/eval_framework/answer_generation/run_question_answering_module.py +1 -1
  24. cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py +0 -2
  25. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +4 -4
  26. cognee/eval_framework/eval_config.py +2 -2
  27. cognee/eval_framework/modal_run_eval.py +16 -28
  28. cognee/infrastructure/databases/cache/config.py +3 -1
  29. cognee/infrastructure/databases/cache/fscache/FsCacheAdapter.py +151 -0
  30. cognee/infrastructure/databases/cache/get_cache_engine.py +20 -10
  31. cognee/infrastructure/databases/dataset_database_handler/__init__.py +3 -0
  32. cognee/infrastructure/databases/dataset_database_handler/dataset_database_handler_interface.py +80 -0
  33. cognee/infrastructure/databases/dataset_database_handler/supported_dataset_database_handlers.py +18 -0
  34. cognee/infrastructure/databases/dataset_database_handler/use_dataset_database_handler.py +10 -0
  35. cognee/infrastructure/databases/exceptions/exceptions.py +16 -0
  36. cognee/infrastructure/databases/graph/config.py +7 -0
  37. cognee/infrastructure/databases/graph/get_graph_engine.py +3 -0
  38. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -0
  39. cognee/infrastructure/databases/graph/kuzu/KuzuDatasetDatabaseHandler.py +81 -0
  40. cognee/infrastructure/databases/graph/kuzu/adapter.py +228 -0
  41. cognee/infrastructure/databases/graph/neo4j_driver/Neo4jAuraDevDatasetDatabaseHandler.py +168 -0
  42. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +80 -1
  43. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +9 -0
  44. cognee/infrastructure/databases/utils/__init__.py +3 -0
  45. cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py +10 -0
  46. cognee/infrastructure/databases/utils/get_or_create_dataset_database.py +66 -18
  47. cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py +10 -0
  48. cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py +30 -0
  49. cognee/infrastructure/databases/vector/config.py +5 -0
  50. cognee/infrastructure/databases/vector/create_vector_engine.py +6 -1
  51. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +8 -6
  52. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +9 -7
  53. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -10
  54. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +2 -0
  55. cognee/infrastructure/databases/vector/lancedb/LanceDBDatasetDatabaseHandler.py +50 -0
  56. cognee/infrastructure/databases/vector/vector_db_interface.py +35 -0
  57. cognee/infrastructure/engine/models/Edge.py +13 -1
  58. cognee/infrastructure/files/storage/s3_config.py +2 -0
  59. cognee/infrastructure/files/utils/guess_file_type.py +4 -0
  60. cognee/infrastructure/llm/LLMGateway.py +5 -2
  61. cognee/infrastructure/llm/config.py +37 -0
  62. cognee/infrastructure/llm/extraction/knowledge_graph/extract_content_graph.py +2 -2
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +23 -8
  64. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +22 -18
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/__init__.py +5 -0
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/bedrock/adapter.py +153 -0
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +47 -38
  68. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +46 -37
  69. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +20 -10
  70. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/mistral/adapter.py +23 -11
  71. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/adapter.py +36 -23
  72. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +47 -36
  73. cognee/infrastructure/loaders/LoaderEngine.py +1 -0
  74. cognee/infrastructure/loaders/core/__init__.py +2 -1
  75. cognee/infrastructure/loaders/core/csv_loader.py +93 -0
  76. cognee/infrastructure/loaders/core/text_loader.py +1 -2
  77. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +0 -9
  78. cognee/infrastructure/loaders/supported_loaders.py +2 -1
  79. cognee/memify_pipelines/create_triplet_embeddings.py +53 -0
  80. cognee/memify_pipelines/persist_sessions_in_knowledge_graph.py +55 -0
  81. cognee/modules/chunking/CsvChunker.py +35 -0
  82. cognee/modules/chunking/models/DocumentChunk.py +2 -1
  83. cognee/modules/chunking/text_chunker_with_overlap.py +124 -0
  84. cognee/modules/cognify/config.py +2 -0
  85. cognee/modules/data/deletion/prune_system.py +52 -2
  86. cognee/modules/data/methods/__init__.py +1 -0
  87. cognee/modules/data/methods/create_dataset.py +4 -2
  88. cognee/modules/data/methods/delete_dataset.py +26 -0
  89. cognee/modules/data/methods/get_dataset_ids.py +5 -1
  90. cognee/modules/data/methods/get_unique_data_id.py +68 -0
  91. cognee/modules/data/methods/get_unique_dataset_id.py +66 -4
  92. cognee/modules/data/models/Dataset.py +2 -0
  93. cognee/modules/data/processing/document_types/CsvDocument.py +33 -0
  94. cognee/modules/data/processing/document_types/__init__.py +1 -0
  95. cognee/modules/engine/models/Triplet.py +9 -0
  96. cognee/modules/engine/models/__init__.py +1 -0
  97. cognee/modules/graph/cognee_graph/CogneeGraph.py +89 -39
  98. cognee/modules/graph/cognee_graph/CogneeGraphElements.py +8 -3
  99. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +19 -2
  100. cognee/modules/graph/utils/resolve_edges_to_text.py +48 -49
  101. cognee/modules/ingestion/identify.py +4 -4
  102. cognee/modules/memify/memify.py +1 -7
  103. cognee/modules/notebooks/operations/run_in_local_sandbox.py +3 -0
  104. cognee/modules/ontology/rdf_xml/RDFLibOntologyResolver.py +55 -23
  105. cognee/modules/pipelines/operations/pipeline.py +18 -2
  106. cognee/modules/pipelines/operations/run_tasks_data_item.py +1 -1
  107. cognee/modules/retrieval/EntityCompletionRetriever.py +10 -3
  108. cognee/modules/retrieval/__init__.py +1 -1
  109. cognee/modules/retrieval/base_graph_retriever.py +7 -3
  110. cognee/modules/retrieval/base_retriever.py +7 -3
  111. cognee/modules/retrieval/completion_retriever.py +11 -4
  112. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +10 -2
  113. cognee/modules/retrieval/graph_completion_cot_retriever.py +18 -51
  114. cognee/modules/retrieval/graph_completion_retriever.py +14 -1
  115. cognee/modules/retrieval/graph_summary_completion_retriever.py +4 -0
  116. cognee/modules/retrieval/register_retriever.py +10 -0
  117. cognee/modules/retrieval/registered_community_retrievers.py +1 -0
  118. cognee/modules/retrieval/temporal_retriever.py +13 -2
  119. cognee/modules/retrieval/triplet_retriever.py +182 -0
  120. cognee/modules/retrieval/utils/brute_force_triplet_search.py +43 -11
  121. cognee/modules/retrieval/utils/completion.py +2 -22
  122. cognee/modules/run_custom_pipeline/__init__.py +1 -0
  123. cognee/modules/run_custom_pipeline/run_custom_pipeline.py +76 -0
  124. cognee/modules/search/methods/get_search_type_tools.py +54 -8
  125. cognee/modules/search/methods/no_access_control_search.py +4 -0
  126. cognee/modules/search/methods/search.py +26 -3
  127. cognee/modules/search/types/SearchType.py +1 -1
  128. cognee/modules/settings/get_settings.py +19 -0
  129. cognee/modules/users/methods/create_user.py +12 -27
  130. cognee/modules/users/methods/get_authenticated_user.py +3 -2
  131. cognee/modules/users/methods/get_default_user.py +4 -2
  132. cognee/modules/users/methods/get_user.py +1 -1
  133. cognee/modules/users/methods/get_user_by_email.py +1 -1
  134. cognee/modules/users/models/DatasetDatabase.py +24 -3
  135. cognee/modules/users/models/Tenant.py +6 -7
  136. cognee/modules/users/models/User.py +6 -5
  137. cognee/modules/users/models/UserTenant.py +12 -0
  138. cognee/modules/users/models/__init__.py +1 -0
  139. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +13 -13
  140. cognee/modules/users/roles/methods/add_user_to_role.py +3 -1
  141. cognee/modules/users/tenants/methods/__init__.py +1 -0
  142. cognee/modules/users/tenants/methods/add_user_to_tenant.py +21 -12
  143. cognee/modules/users/tenants/methods/create_tenant.py +22 -8
  144. cognee/modules/users/tenants/methods/select_tenant.py +62 -0
  145. cognee/shared/logging_utils.py +6 -0
  146. cognee/shared/rate_limiting.py +30 -0
  147. cognee/tasks/chunks/__init__.py +1 -0
  148. cognee/tasks/chunks/chunk_by_row.py +94 -0
  149. cognee/tasks/documents/__init__.py +0 -1
  150. cognee/tasks/documents/classify_documents.py +2 -0
  151. cognee/tasks/feedback/generate_improved_answers.py +3 -3
  152. cognee/tasks/graph/extract_graph_from_data.py +9 -10
  153. cognee/tasks/ingestion/ingest_data.py +1 -1
  154. cognee/tasks/memify/__init__.py +2 -0
  155. cognee/tasks/memify/cognify_session.py +41 -0
  156. cognee/tasks/memify/extract_user_sessions.py +73 -0
  157. cognee/tasks/memify/get_triplet_datapoints.py +289 -0
  158. cognee/tasks/storage/add_data_points.py +142 -2
  159. cognee/tasks/storage/index_data_points.py +33 -22
  160. cognee/tasks/storage/index_graph_edges.py +37 -57
  161. cognee/tests/integration/documents/CsvDocument_test.py +70 -0
  162. cognee/tests/integration/retrieval/test_triplet_retriever.py +84 -0
  163. cognee/tests/integration/tasks/test_add_data_points.py +139 -0
  164. cognee/tests/integration/tasks/test_get_triplet_datapoints.py +69 -0
  165. cognee/tests/integration/web_url_crawler/test_default_url_crawler.py +1 -1
  166. cognee/tests/integration/web_url_crawler/test_tavily_crawler.py +1 -1
  167. cognee/tests/integration/web_url_crawler/test_url_adding_e2e.py +13 -27
  168. cognee/tests/tasks/entity_extraction/entity_extraction_test.py +1 -1
  169. cognee/tests/test_add_docling_document.py +2 -2
  170. cognee/tests/test_cognee_server_start.py +84 -3
  171. cognee/tests/test_conversation_history.py +68 -5
  172. cognee/tests/test_data/example_with_header.csv +3 -0
  173. cognee/tests/test_dataset_database_handler.py +137 -0
  174. cognee/tests/test_dataset_delete.py +76 -0
  175. cognee/tests/test_edge_centered_payload.py +170 -0
  176. cognee/tests/test_edge_ingestion.py +27 -0
  177. cognee/tests/test_feedback_enrichment.py +1 -1
  178. cognee/tests/test_library.py +6 -4
  179. cognee/tests/test_load.py +62 -0
  180. cognee/tests/test_multi_tenancy.py +165 -0
  181. cognee/tests/test_parallel_databases.py +2 -0
  182. cognee/tests/test_pipeline_cache.py +164 -0
  183. cognee/tests/test_relational_db_migration.py +54 -2
  184. cognee/tests/test_search_db.py +44 -2
  185. cognee/tests/unit/api/test_conditional_authentication_endpoints.py +12 -3
  186. cognee/tests/unit/api/test_ontology_endpoint.py +252 -0
  187. cognee/tests/unit/infrastructure/databases/cache/test_cache_config.py +5 -0
  188. cognee/tests/unit/infrastructure/databases/test_index_data_points.py +27 -0
  189. cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +14 -16
  190. cognee/tests/unit/infrastructure/llm/test_llm_config.py +46 -0
  191. cognee/tests/unit/infrastructure/mock_embedding_engine.py +3 -7
  192. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +0 -5
  193. cognee/tests/unit/modules/chunking/test_text_chunker.py +248 -0
  194. cognee/tests/unit/modules/chunking/test_text_chunker_with_overlap.py +324 -0
  195. cognee/tests/unit/modules/graph/cognee_graph_elements_test.py +2 -2
  196. cognee/tests/unit/modules/graph/cognee_graph_test.py +406 -0
  197. cognee/tests/unit/modules/memify_tasks/test_cognify_session.py +111 -0
  198. cognee/tests/unit/modules/memify_tasks/test_extract_user_sessions.py +175 -0
  199. cognee/tests/unit/modules/memify_tasks/test_get_triplet_datapoints.py +214 -0
  200. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -51
  201. cognee/tests/unit/modules/retrieval/rag_completion_retriever_test.py +1 -0
  202. cognee/tests/unit/modules/retrieval/structured_output_test.py +204 -0
  203. cognee/tests/unit/modules/retrieval/summaries_retriever_test.py +1 -1
  204. cognee/tests/unit/modules/retrieval/temporal_retriever_test.py +0 -1
  205. cognee/tests/unit/modules/retrieval/test_brute_force_triplet_search.py +608 -0
  206. cognee/tests/unit/modules/retrieval/triplet_retriever_test.py +83 -0
  207. cognee/tests/unit/modules/users/test_conditional_authentication.py +0 -63
  208. cognee/tests/unit/processing/chunks/chunk_by_row_test.py +52 -0
  209. cognee/tests/unit/tasks/storage/test_add_data_points.py +288 -0
  210. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/METADATA +11 -6
  211. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/RECORD +215 -163
  212. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/WHEEL +1 -1
  213. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/entry_points.txt +0 -1
  214. cognee/api/v1/cognify/code_graph_pipeline.py +0 -119
  215. cognee/api/v1/cognify/routers/get_code_pipeline_router.py +0 -90
  216. cognee/infrastructure/databases/vector/embeddings/embedding_rate_limiter.py +0 -544
  217. cognee/modules/retrieval/code_retriever.py +0 -232
  218. cognee/tasks/code/enrich_dependency_graph_checker.py +0 -35
  219. cognee/tasks/code/get_local_dependencies_checker.py +0 -20
  220. cognee/tasks/code/get_repo_dependency_graph_checker.py +0 -35
  221. cognee/tasks/documents/check_permissions_on_dataset.py +0 -26
  222. cognee/tasks/repo_processor/__init__.py +0 -2
  223. cognee/tasks/repo_processor/get_local_dependencies.py +0 -335
  224. cognee/tasks/repo_processor/get_non_code_files.py +0 -158
  225. cognee/tasks/repo_processor/get_repo_file_dependencies.py +0 -243
  226. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/LICENSE +0 -0
  227. {cognee-0.4.0.dist-info → cognee-0.5.0.dist-info}/licenses/NOTICE.md +0 -0
--- a/cognee/tasks/storage/index_data_points.py
+++ b/cognee/tasks/storage/index_data_points.py
@@ -8,47 +8,58 @@ logger = get_logger("index_data_points")
 
 
 async def index_data_points(data_points: list[DataPoint]):
-    created_indexes = {}
-    index_points = {}
+    """Index data points in the vector engine by creating embeddings for specified fields.
+
+    Process:
+    1. Groups data points into a nested dict: {type_name: {field_name: [points]}}
+    2. Creates vector indexes for each (type, field) combination on first encounter
+    3. Batches points per (type, field) and creates async indexing tasks
+    4. Executes all indexing tasks in parallel for efficient embedding generation
+
+    Args:
+        data_points: List of DataPoint objects to index. Each DataPoint's metadata must
+            contain an 'index_fields' list specifying which fields to embed.
+
+    Returns:
+        The original data_points list.
+    """
+    data_points_by_type = {}
 
     vector_engine = get_vector_engine()
 
     for data_point in data_points:
         data_point_type = type(data_point)
+        type_name = data_point_type.__name__
 
         for field_name in data_point.metadata["index_fields"]:
             if getattr(data_point, field_name, None) is None:
                 continue
 
-            index_name = f"{data_point_type.__name__}_{field_name}"
+            if type_name not in data_points_by_type:
+                data_points_by_type[type_name] = {}
 
-            if index_name not in created_indexes:
-                await vector_engine.create_vector_index(data_point_type.__name__, field_name)
-                created_indexes[index_name] = True
-
-            if index_name not in index_points:
-                index_points[index_name] = []
+            if field_name not in data_points_by_type[type_name]:
+                await vector_engine.create_vector_index(type_name, field_name)
+                data_points_by_type[type_name][field_name] = []
 
             indexed_data_point = data_point.model_copy()
             indexed_data_point.metadata["index_fields"] = [field_name]
-            index_points[index_name].append(indexed_data_point)
+            data_points_by_type[type_name][field_name].append(indexed_data_point)
 
-    tasks: list[asyncio.Task] = []
     batch_size = vector_engine.embedding_engine.get_batch_size()
 
-    for index_name_and_field, points in index_points.items():
-        first = index_name_and_field.index("_")
-        index_name = index_name_and_field[:first]
-        field_name = index_name_and_field[first + 1 :]
+    batches = (
+        (type_name, field_name, points[i : i + batch_size])
+        for type_name, fields in data_points_by_type.items()
+        for field_name, points in fields.items()
+        for i in range(0, len(points), batch_size)
+    )
 
-        # Create embedding requests per batch to run in parallel later
-        for i in range(0, len(points), batch_size):
-            batch = points[i : i + batch_size]
-            tasks.append(
-                asyncio.create_task(vector_engine.index_data_points(index_name, field_name, batch))
-            )
+    tasks = [
+        asyncio.create_task(vector_engine.index_data_points(type_name, field_name, batch_points))
+        for type_name, field_name, batch_points in batches
+    ]
 
-    # Run all embedding requests in parallel
     await asyncio.gather(*tasks)
 
     return data_points
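
The heart of this refactor is replacing the stringly-keyed `index_points` dict (whose `f"{type}_{field}"` keys were re-split at the first underscore, silently truncating any type name that itself contains an underscore) with a nested `{type_name: {field_name: [points]}}` dict. For readers skimming the diff, here is a minimal standalone sketch of the new grouping-and-batching pattern. It uses toy stand-ins (`Point`, `fake_index`) rather than cognee's real `DataPoint` and vector engine; those names are illustrative assumptions, not cognee APIs.

import asyncio
from dataclasses import dataclass, field


@dataclass
class Point:
    # Toy stand-in for cognee's DataPoint: metadata["index_fields"] names
    # the attributes whose values should be embedded.
    name: str
    metadata: dict = field(default_factory=lambda: {"index_fields": ["name"]})


async def fake_index(type_name: str, field_name: str, batch: list) -> None:
    # Stand-in for vector_engine.index_data_points(); just reports the batch.
    print(f"indexing {len(batch)} point(s) into {type_name}.{field_name}")


async def main() -> None:
    points = [Point(f"p{i}") for i in range(5)]
    batch_size = 2

    # Group into {type_name: {field_name: [points]}}, as in the diff above.
    by_type: dict[str, dict[str, list]] = {}
    for point in points:
        type_name = type(point).__name__
        for field_name in point.metadata["index_fields"]:
            if getattr(point, field_name, None) is None:
                continue
            by_type.setdefault(type_name, {}).setdefault(field_name, []).append(point)

    # Flatten the nested dict lazily into (type, field, batch) triples.
    batches = (
        (type_name, field_name, pts[i : i + batch_size])
        for type_name, fields in by_type.items()
        for field_name, pts in fields.items()
        for i in range(0, len(pts), batch_size)
    )

    # Run all batches concurrently, mirroring the asyncio.gather call above.
    await asyncio.gather(*(fake_index(t, f, b) for t, f, b in batches))


asyncio.run(main())

Because the nested dict keys type and field separately, no key parsing is needed at batching time, which removes the underscore failure mode entirely.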
--- a/cognee/tasks/storage/index_graph_edges.py
+++ b/cognee/tasks/storage/index_graph_edges.py
@@ -1,17 +1,44 @@
-import asyncio
+from collections import Counter
+from typing import Optional, Dict, Any, List, Tuple, Union
 
 from cognee.modules.engine.utils.generate_edge_id import generate_edge_id
 from cognee.shared.logging_utils import get_logger
-from collections import Counter
-from typing import Optional, Dict, Any, List, Tuple, Union
-from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.modules.graph.models.EdgeType import EdgeType
 from cognee.infrastructure.databases.graph.graph_db_interface import EdgeData
+from cognee.tasks.storage.index_data_points import index_data_points
 
 logger = get_logger()
 
 
+def _get_edge_text(item: dict) -> str:
+    """Extract edge text for embedding - prefers edge_text field with fallback."""
+    if "edge_text" in item:
+        return item["edge_text"]
+
+    if "relationship_name" in item:
+        return item["relationship_name"]
+
+    return ""
+
+
+def create_edge_type_datapoints(edges_data) -> list[EdgeType]:
+    """Transform raw edge data into EdgeType datapoints."""
+    edge_texts = [
+        _get_edge_text(item)
+        for edge in edges_data
+        for item in edge
+        if isinstance(item, dict) and "relationship_name" in item
+    ]
+
+    edge_types = Counter(edge_texts)
+
+    return [
+        EdgeType(id=generate_edge_id(edge_id=text), relationship_name=text, number_of_edges=count)
+        for text, count in edge_types.items()
+    ]
+
+
 async def index_graph_edges(
     edges_data: Union[List[EdgeData], List[Tuple[str, str, str, Optional[Dict[str, Any]]]]] = None,
 ):
@@ -23,24 +50,17 @@ async def index_graph_edges(
     the `relationship_name` field.
 
     Steps:
-    1. Initialize the vector engine and graph engine.
-    2. Retrieve graph edge data and count relationship types (`relationship_name`).
-    3. Create vector indexes for `relationship_name` if they don't exist.
-    4. Transform the counted relationships into `EdgeType` objects.
-    5. Index the transformed data points in the vector engine.
+    1. Initialize the graph engine if needed and retrieve edge data.
+    2. Transform edge data into EdgeType datapoints.
+    3. Index the EdgeType datapoints using the standard indexing function.
 
     Raises:
-        RuntimeError: If initialization of the vector engine or graph engine fails.
+        RuntimeError: If initialization of the graph engine fails.
 
     Returns:
         None
     """
     try:
-        created_indexes = {}
-        index_points = {}
-
-        vector_engine = get_vector_engine()
-
         if edges_data is None:
             graph_engine = await get_graph_engine()
             _, edges_data = await graph_engine.get_graph_data()
@@ -51,47 +71,7 @@
         logger.error("Failed to initialize engines: %s", e)
         raise RuntimeError("Initialization error") from e
 
-    edge_types = Counter(
-        item.get("relationship_name")
-        for edge in edges_data
-        for item in edge
-        if isinstance(item, dict) and "relationship_name" in item
-    )
-
-    for text, count in edge_types.items():
-        edge = EdgeType(
-            id=generate_edge_id(edge_id=text), relationship_name=text, number_of_edges=count
-        )
-        data_point_type = type(edge)
-
-        for field_name in edge.metadata["index_fields"]:
-            index_name = f"{data_point_type.__name__}.{field_name}"
-
-            if index_name not in created_indexes:
-                await vector_engine.create_vector_index(data_point_type.__name__, field_name)
-                created_indexes[index_name] = True
-
-            if index_name not in index_points:
-                index_points[index_name] = []
-
-            indexed_data_point = edge.model_copy()
-            indexed_data_point.metadata["index_fields"] = [field_name]
-            index_points[index_name].append(indexed_data_point)
-
-    # Get maximum batch size for embedding model
-    batch_size = vector_engine.embedding_engine.get_batch_size()
-    tasks: list[asyncio.Task] = []
-
-    for index_name, indexable_points in index_points.items():
-        index_name, field_name = index_name.split(".")
-
-        # Create embedding tasks to run in parallel later
-        for start in range(0, len(indexable_points), batch_size):
-            batch = indexable_points[start : start + batch_size]
-
-            tasks.append(vector_engine.index_data_points(index_name, field_name, batch))
-
-    # Start all embedding tasks and wait for completion
-    await asyncio.gather(*tasks)
+    edge_type_datapoints = create_edge_type_datapoints(edges_data)
+    await index_data_points(edge_type_datapoints)
 
     return None
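
Since `create_edge_type_datapoints` is now a pure function, its counting behavior can be sanity-checked in isolation. Below is a small sketch with a toy `EdgeType` NamedTuple and a fabricated id in place of cognee's `generate_edge_id`; both are assumptions for illustration, while the comprehension and `Counter` mirror the diff above.

from collections import Counter
from typing import NamedTuple


class EdgeType(NamedTuple):
    # Toy stand-in for cognee's EdgeType datapoint.
    id: str
    relationship_name: str
    number_of_edges: int


def get_edge_text(item: dict) -> str:
    # Mirrors _get_edge_text: prefer "edge_text", fall back to
    # "relationship_name", else return an empty string.
    if "edge_text" in item:
        return item["edge_text"]
    if "relationship_name" in item:
        return item["relationship_name"]
    return ""


def create_edge_type_datapoints(edges_data) -> list[EdgeType]:
    edge_texts = [
        get_edge_text(item)
        for edge in edges_data
        for item in edge
        if isinstance(item, dict) and "relationship_name" in item
    ]
    edge_types = Counter(edge_texts)
    return [
        EdgeType(id=f"edge-{text}", relationship_name=text, number_of_edges=count)
        for text, count in edge_types.items()
    ]


# Each edge is a tuple whose property dict carries the relationship metadata.
edges = [
    ("a", "b", {"relationship_name": "knows"}),
    ("b", "c", {"relationship_name": "knows"}),
    ("c", "d", {"relationship_name": "works_at", "edge_text": "works at"}),
]
print(create_edge_type_datapoints(edges))
# Expected: one EdgeType with relationship_name="knows" (number_of_edges=2)
# and one with relationship_name="works at" (number_of_edges=1).

The practical consequence of routing through `index_data_points` is that edge indexing now shares the same index creation, batching, and parallelism path as every other datapoint type instead of duplicating that logic inline.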
--- /dev/null
+++ b/cognee/tests/integration/documents/CsvDocument_test.py
@@ -0,0 +1,70 @@
+import os
+import sys
+import uuid
+import pytest
+import pathlib
+from unittest.mock import patch
+
+from cognee.modules.chunking.CsvChunker import CsvChunker
+from cognee.modules.data.processing.document_types.CsvDocument import CsvDocument
+from cognee.tests.integration.documents.AudioDocument_test import mock_get_embedding_engine
+from cognee.tests.integration.documents.async_gen_zip import async_gen_zip
+
+chunk_by_row_module = sys.modules.get("cognee.tasks.chunks.chunk_by_row")
+
+
+GROUND_TRUTH = {
+    "chunk_size_10": [
+        {"token_count": 9, "len_text": 26, "cut_type": "row_cut", "chunk_index": 0},
+        {"token_count": 6, "len_text": 29, "cut_type": "row_end", "chunk_index": 1},
+        {"token_count": 9, "len_text": 25, "cut_type": "row_cut", "chunk_index": 2},
+        {"token_count": 6, "len_text": 30, "cut_type": "row_end", "chunk_index": 3},
+    ],
+    "chunk_size_128": [
+        {"token_count": 15, "len_text": 57, "cut_type": "row_end", "chunk_index": 0},
+        {"token_count": 15, "len_text": 57, "cut_type": "row_end", "chunk_index": 1},
+    ],
+}
+
+
+@pytest.mark.parametrize(
+    "input_file,chunk_size",
+    [("example_with_header.csv", 10), ("example_with_header.csv", 128)],
+)
+@patch.object(chunk_by_row_module, "get_embedding_engine", side_effect=mock_get_embedding_engine)
+@pytest.mark.asyncio
+async def test_CsvDocument(mock_engine, input_file, chunk_size):
+    # Define file paths of test data
+    csv_file_path = os.path.join(
+        pathlib.Path(__file__).parent.parent.parent,
+        "test_data",
+        input_file,
+    )
+
+    # Define test documents
+    csv_document = CsvDocument(
+        id=uuid.uuid4(),
+        name="example_with_header.csv",
+        raw_data_location=csv_file_path,
+        external_metadata="",
+        mime_type="text/csv",
+    )
+
+    # TEST CSV
+    ground_truth_key = f"chunk_size_{chunk_size}"
+    async for ground_truth, row_data in async_gen_zip(
+        GROUND_TRUTH[ground_truth_key],
+        csv_document.read(chunker_cls=CsvChunker, max_chunk_size=chunk_size),
+    ):
+        assert ground_truth["token_count"] == row_data.chunk_size, (
+            f'{ground_truth["token_count"] = } != {row_data.chunk_size = }'
+        )
+        assert ground_truth["len_text"] == len(row_data.text), (
+            f'{ground_truth["len_text"] = } != {len(row_data.text) = }'
+        )
+        assert ground_truth["cut_type"] == row_data.cut_type, (
+            f'{ground_truth["cut_type"] = } != {row_data.cut_type = }'
+        )
+        assert ground_truth["chunk_index"] == row_data.chunk_index, (
+            f'{ground_truth["chunk_index"] = } != {row_data.chunk_index = }'
+        )
--- /dev/null
+++ b/cognee/tests/integration/retrieval/test_triplet_retriever.py
@@ -0,0 +1,84 @@
+import os
+import pytest
+import pathlib
+import pytest_asyncio
+import cognee
+
+from cognee.low_level import setup
+from cognee.tasks.storage import add_data_points
+from cognee.modules.retrieval.exceptions.exceptions import NoDataError
+from cognee.modules.retrieval.triplet_retriever import TripletRetriever
+from cognee.modules.engine.models import Triplet
+
+
+@pytest_asyncio.fixture
+async def setup_test_environment_with_triplets():
+    """Set up a clean test environment with triplets."""
+    base_dir = pathlib.Path(__file__).parent.parent.parent.parent
+    system_directory_path = str(base_dir / ".cognee_system/test_triplet_retriever_context_simple")
+    data_directory_path = str(base_dir / ".data_storage/test_triplet_retriever_context_simple")
+
+    cognee.config.system_root_directory(system_directory_path)
+    cognee.config.data_root_directory(data_directory_path)
+
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    await setup()
+
+    triplet1 = Triplet(
+        from_node_id="node1",
+        to_node_id="node2",
+        text="Alice knows Bob",
+    )
+    triplet2 = Triplet(
+        from_node_id="node2",
+        to_node_id="node3",
+        text="Bob works at Tech Corp",
+    )
+
+    triplets = [triplet1, triplet2]
+    await add_data_points(triplets)
+
+    yield
+
+    try:
+        await cognee.prune.prune_data()
+        await cognee.prune.prune_system(metadata=True)
+    except Exception:
+        pass
+
+
+@pytest_asyncio.fixture
+async def setup_test_environment_empty():
+    """Set up a clean test environment without triplets."""
+    base_dir = pathlib.Path(__file__).parent.parent.parent.parent
+    system_directory_path = str(
+        base_dir / ".cognee_system/test_triplet_retriever_context_empty_collection"
+    )
+    data_directory_path = str(
+        base_dir / ".data_storage/test_triplet_retriever_context_empty_collection"
+    )
+
+    cognee.config.system_root_directory(system_directory_path)
+    cognee.config.data_root_directory(data_directory_path)
+
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    yield
+
+    try:
+        await cognee.prune.prune_data()
+        await cognee.prune.prune_system(metadata=True)
+    except Exception:
+        pass
+
+
+@pytest.mark.asyncio
+async def test_triplet_retriever_context_simple(setup_test_environment_with_triplets):
+    """Integration test: verify TripletRetriever can retrieve triplet context."""
+    retriever = TripletRetriever(top_k=5)
+
+    context = await retriever.get_context("Alice")
+
+    assert "Alice knows Bob" in context, "Failed to get Alice triplet"
--- /dev/null
+++ b/cognee/tests/integration/tasks/test_add_data_points.py
@@ -0,0 +1,139 @@
+import pathlib
+import pytest
+import pytest_asyncio
+
+import cognee
+from cognee.low_level import setup
+from cognee.infrastructure.engine import DataPoint
+from cognee.tasks.storage.add_data_points import add_data_points
+from cognee.tasks.storage.exceptions import InvalidDataPointsInAddDataPointsError
+from cognee.infrastructure.databases.graph import get_graph_engine
+
+
+class Person(DataPoint):
+    name: str
+    age: int
+    metadata: dict = {"index_fields": ["name"]}
+
+
+class Company(DataPoint):
+    name: str
+    industry: str
+    metadata: dict = {"index_fields": ["name", "industry"]}
+
+
+@pytest_asyncio.fixture
+async def clean_test_environment():
+    """Set up a clean test environment for add_data_points tests."""
+    base_dir = pathlib.Path(__file__).parent.parent.parent.parent
+    system_directory_path = str(base_dir / ".cognee_system/test_add_data_points_integration")
+    data_directory_path = str(base_dir / ".data_storage/test_add_data_points_integration")
+
+    cognee.config.system_root_directory(system_directory_path)
+    cognee.config.data_root_directory(data_directory_path)
+
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    await setup()
+
+    yield
+
+    try:
+        await cognee.prune.prune_data()
+        await cognee.prune.prune_system(metadata=True)
+    except Exception:
+        pass
+
+
+@pytest.mark.asyncio
+async def test_add_data_points_comprehensive(clean_test_environment):
+    """Comprehensive integration test for add_data_points functionality."""
+
+    person1 = Person(name="Alice", age=30)
+    person2 = Person(name="Bob", age=25)
+    result = await add_data_points([person1, person2])
+
+    assert result == [person1, person2]
+    assert len(result) == 2
+
+    graph_engine = await get_graph_engine()
+    nodes, edges = await graph_engine.get_graph_data()
+    assert len(nodes) >= 2
+
+    result_empty = await add_data_points([])
+    assert result_empty == []
+
+    person3 = Person(name="Charlie", age=35)
+    person4 = Person(name="Diana", age=32)
+    custom_edge = (str(person3.id), str(person4.id), "knows", {"edge_text": "friends with"})
+
+    result_custom = await add_data_points([person3, person4], custom_edges=[custom_edge])
+    assert len(result_custom) == 2
+
+    nodes, edges = await graph_engine.get_graph_data()
+    assert len(edges) == 1
+    assert len(nodes) == 4
+
+    class Employee(DataPoint):
+        name: str
+        works_at: Company
+        metadata: dict = {"index_fields": ["name"]}
+
+    company = Company(name="TechCorp", industry="Technology")
+    employee = Employee(name="Eve", works_at=company)
+
+    result_rel = await add_data_points([employee])
+    assert len(result_rel) == 1
+
+    nodes, edges = await graph_engine.get_graph_data()
+    assert len(nodes) == 6
+    assert len(edges) == 2
+
+    person5 = Person(name="Frank", age=40)
+    person6 = Person(name="Grace", age=38)
+    triplet_edge = (str(person5.id), str(person6.id), "married_to", {"edge_text": "is married to"})
+
+    result_triplet = await add_data_points(
+        [person5, person6], custom_edges=[triplet_edge], embed_triplets=True
+    )
+    assert len(result_triplet) == 2
+
+    nodes, edges = await graph_engine.get_graph_data()
+    assert len(nodes) == 8
+    assert len(edges) == 3
+
+    batch1 = [Person(name="Leo", age=25), Person(name="Mia", age=30)]
+    batch2 = [Person(name="Noah", age=35), Person(name="Olivia", age=40)]
+
+    result_batch1 = await add_data_points(batch1)
+    result_batch2 = await add_data_points(batch2)
+
+    assert len(result_batch1) == 2
+    assert len(result_batch2) == 2
+
+    nodes, edges = await graph_engine.get_graph_data()
+    assert len(nodes) == 12
+    assert len(edges) == 3
+
+    person7 = Person(name="Paul", age=33)
+    person8 = Person(name="Quinn", age=31)
+    edge1 = (str(person7.id), str(person8.id), "colleague_of", {"edge_text": "works with"})
+    edge2 = (str(person8.id), str(person7.id), "colleague_of", {"edge_text": "works with"})
+
+    result_bi = await add_data_points([person7, person8], custom_edges=[edge1, edge2])
+    assert len(result_bi) == 2
+
+    nodes, edges = await graph_engine.get_graph_data()
+    assert len(nodes) == 14
+    assert len(edges) == 5
+
+    person_invalid = Person(name="Invalid", age=50)
+    with pytest.raises(InvalidDataPointsInAddDataPointsError, match="must be a list"):
+        await add_data_points(person_invalid)
+
+    with pytest.raises(InvalidDataPointsInAddDataPointsError, match="must be a DataPoint"):
+        await add_data_points(["not", "datapoints"])
+
+    final_nodes, final_edges = await graph_engine.get_graph_data()
+    assert len(final_nodes) == 14
+    assert len(final_edges) == 5
1
+ import os
2
+ import pathlib
3
+ import pytest
4
+ import pytest_asyncio
5
+ from unittest.mock import AsyncMock, patch
6
+
7
+ import cognee
8
+ from cognee.tasks.memify.get_triplet_datapoints import get_triplet_datapoints
9
+ from cognee.modules.engine.models import Triplet
10
+
11
+
12
+ @pytest_asyncio.fixture
13
+ async def setup_test_environment():
14
+ """Set up a clean test environment with a simple graph."""
15
+ base_dir = pathlib.Path(__file__).parent.parent.parent.parent
16
+ data_directory_path = str(base_dir / ".data_storage/test_get_triplet_datapoints_integration")
17
+ cognee_directory_path = str(base_dir / ".cognee_system/test_get_triplet_datapoints_integration")
18
+
19
+ cognee.config.data_root_directory(data_directory_path)
20
+ cognee.config.system_root_directory(cognee_directory_path)
21
+
22
+ await cognee.prune.prune_data()
23
+ await cognee.prune.prune_system(metadata=True)
24
+
25
+ dataset_name = "test_triplets"
26
+
27
+ text = "Volkswagen is a german car manufacturer from Wolfsburg. They produce different models such as Golf, Polo and Touareg."
28
+ await cognee.add(text, dataset_name)
29
+ await cognee.cognify([dataset_name])
30
+
31
+ yield dataset_name
32
+
33
+ await cognee.prune.prune_data()
34
+ await cognee.prune.prune_system(metadata=True)
35
+
36
+
37
+ @pytest.mark.asyncio
38
+ async def test_get_triplet_datapoints_integration(setup_test_environment):
39
+ """Integration test: verify get_triplet_datapoints works with real graph data."""
40
+
41
+ from cognee.infrastructure.databases.graph import get_graph_engine
42
+
43
+ graph_engine = await get_graph_engine()
44
+
45
+ if not hasattr(graph_engine, "get_triplets_batch"):
46
+ pytest.skip("Graph engine does not support get_triplets_batch")
47
+
48
+ triplets = []
49
+ with patch(
50
+ "cognee.tasks.memify.get_triplet_datapoints.index_data_points", new_callable=AsyncMock
51
+ ):
52
+ async for triplet in get_triplet_datapoints([{}], triplets_batch_size=10):
53
+ triplets.append(triplet)
54
+
55
+ nodes, edges = await graph_engine.get_graph_data()
56
+
57
+ if len(edges) > 0 and len(triplets) == 0:
58
+ test_triplets = await graph_engine.get_triplets_batch(offset=0, limit=10)
59
+ if len(test_triplets) == 0:
60
+ pytest.fail(
61
+ f"Edges exist in graph ({len(edges)} edges) but get_triplets_batch found none. "
62
+ f"This indicates the query pattern may not match the graph structure."
63
+ )
64
+
65
+ for triplet in triplets:
66
+ assert isinstance(triplet, Triplet), "Each item should be a Triplet instance"
67
+ assert triplet.from_node_id, "Triplet should have from_node_id"
68
+ assert triplet.to_node_id, "Triplet should have to_node_id"
69
+ assert triplet.text, "Triplet should have embeddable text"
--- a/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py
+++ b/cognee/tests/integration/web_url_crawler/test_default_url_crawler.py
@@ -5,7 +5,7 @@ from cognee.tasks.web_scraper import DefaultUrlCrawler
 @pytest.mark.asyncio
 async def test_fetch():
     crawler = DefaultUrlCrawler()
-    url = "https://en.wikipedia.org/wiki/Large_language_model"
+    url = "http://example.com/"
     results = await crawler.fetch_urls(url)
     assert len(results) == 1
     assert isinstance(results, dict)
--- a/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py
+++ b/cognee/tests/integration/web_url_crawler/test_tavily_crawler.py
@@ -11,7 +11,7 @@ skip_in_ci = pytest.mark.skipif(
 @skip_in_ci
 @pytest.mark.asyncio
 async def test_fetch():
-    url = "https://en.wikipedia.org/wiki/Large_language_model"
+    url = "http://example.com/"
     results = await fetch_with_tavily(url)
     assert isinstance(results, dict)
     assert len(results) == 1