cognee 0.3.4.dev3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. cognee/api/client.py +16 -7
  2. cognee/api/health.py +5 -9
  3. cognee/api/v1/add/add.py +3 -1
  4. cognee/api/v1/cognify/cognify.py +44 -7
  5. cognee/api/v1/permissions/routers/get_permissions_router.py +8 -4
  6. cognee/api/v1/search/search.py +3 -0
  7. cognee/api/v1/ui/__init__.py +1 -1
  8. cognee/api/v1/ui/ui.py +215 -150
  9. cognee/api/v1/update/__init__.py +1 -0
  10. cognee/api/v1/update/routers/__init__.py +1 -0
  11. cognee/api/v1/update/routers/get_update_router.py +90 -0
  12. cognee/api/v1/update/update.py +100 -0
  13. cognee/base_config.py +5 -2
  14. cognee/cli/_cognee.py +28 -10
  15. cognee/cli/commands/delete_command.py +34 -2
  16. cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +2 -2
  17. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +3 -2
  18. cognee/eval_framework/modal_eval_dashboard.py +9 -1
  19. cognee/infrastructure/databases/graph/config.py +9 -9
  20. cognee/infrastructure/databases/graph/get_graph_engine.py +4 -21
  21. cognee/infrastructure/databases/graph/kuzu/adapter.py +60 -9
  22. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +3 -3
  23. cognee/infrastructure/databases/relational/config.py +4 -4
  24. cognee/infrastructure/databases/relational/create_relational_engine.py +11 -3
  25. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +7 -3
  26. cognee/infrastructure/databases/vector/config.py +7 -7
  27. cognee/infrastructure/databases/vector/create_vector_engine.py +7 -15
  28. cognee/infrastructure/databases/vector/embeddings/EmbeddingEngine.py +9 -0
  29. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +11 -0
  30. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +19 -2
  31. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -0
  32. cognee/infrastructure/databases/vector/embeddings/config.py +8 -0
  33. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +5 -0
  34. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +11 -10
  35. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +48 -38
  36. cognee/infrastructure/databases/vector/vector_db_interface.py +8 -4
  37. cognee/infrastructure/files/storage/S3FileStorage.py +15 -5
  38. cognee/infrastructure/files/storage/s3_config.py +1 -0
  39. cognee/infrastructure/files/utils/open_data_file.py +7 -14
  40. cognee/infrastructure/llm/LLMGateway.py +19 -117
  41. cognee/infrastructure/llm/config.py +28 -13
  42. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_categories.py +2 -1
  43. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_event_entities.py +3 -2
  44. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_summary.py +3 -2
  45. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_content_graph.py +2 -1
  46. cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_event_graph.py +3 -2
  47. cognee/infrastructure/llm/prompts/read_query_prompt.py +3 -2
  48. cognee/infrastructure/llm/prompts/show_prompt.py +35 -0
  49. cognee/infrastructure/llm/prompts/test.txt +1 -0
  50. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +2 -2
  51. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +50 -397
  52. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +2 -3
  53. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +8 -88
  54. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +78 -0
  55. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +2 -99
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +49 -401
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +19 -882
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +2 -34
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +2 -107
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/acreate_structured_output.baml +26 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/__init__.py +1 -2
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +76 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/create_dynamic_baml_type.py +122 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +3 -3
  65. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +0 -32
  66. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +107 -98
  67. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +5 -6
  68. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +5 -6
  69. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llm_interface.py +0 -26
  70. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +17 -67
  71. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +8 -7
  72. cognee/infrastructure/llm/utils.py +4 -4
  73. cognee/infrastructure/loaders/LoaderEngine.py +5 -2
  74. cognee/infrastructure/loaders/external/__init__.py +7 -0
  75. cognee/infrastructure/loaders/external/advanced_pdf_loader.py +244 -0
  76. cognee/infrastructure/loaders/supported_loaders.py +7 -0
  77. cognee/modules/data/methods/create_authorized_dataset.py +9 -0
  78. cognee/modules/data/methods/get_authorized_dataset.py +1 -1
  79. cognee/modules/data/methods/get_authorized_dataset_by_name.py +11 -0
  80. cognee/modules/data/methods/get_deletion_counts.py +92 -0
  81. cognee/modules/graph/cognee_graph/CogneeGraph.py +1 -1
  82. cognee/modules/graph/utils/expand_with_nodes_and_edges.py +22 -8
  83. cognee/modules/graph/utils/retrieve_existing_edges.py +0 -2
  84. cognee/modules/ingestion/data_types/TextData.py +0 -1
  85. cognee/modules/notebooks/methods/create_notebook.py +3 -1
  86. cognee/modules/notebooks/methods/get_notebooks.py +27 -1
  87. cognee/modules/observability/get_observe.py +14 -0
  88. cognee/modules/observability/observers.py +1 -0
  89. cognee/modules/ontology/base_ontology_resolver.py +42 -0
  90. cognee/modules/ontology/get_default_ontology_resolver.py +41 -0
  91. cognee/modules/ontology/matching_strategies.py +53 -0
  92. cognee/modules/ontology/models.py +20 -0
  93. cognee/modules/ontology/ontology_config.py +24 -0
  94. cognee/modules/ontology/ontology_env_config.py +45 -0
  95. cognee/modules/ontology/rdf_xml/{OntologyResolver.py → RDFLibOntologyResolver.py} +20 -28
  96. cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +21 -24
  97. cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +3 -3
  98. cognee/modules/retrieval/code_retriever.py +2 -1
  99. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -4
  100. cognee/modules/retrieval/graph_completion_cot_retriever.py +6 -5
  101. cognee/modules/retrieval/graph_completion_retriever.py +0 -3
  102. cognee/modules/retrieval/insights_retriever.py +1 -1
  103. cognee/modules/retrieval/jaccard_retrival.py +60 -0
  104. cognee/modules/retrieval/lexical_retriever.py +123 -0
  105. cognee/modules/retrieval/natural_language_retriever.py +2 -1
  106. cognee/modules/retrieval/temporal_retriever.py +3 -2
  107. cognee/modules/retrieval/utils/brute_force_triplet_search.py +2 -12
  108. cognee/modules/retrieval/utils/completion.py +4 -7
  109. cognee/modules/search/methods/get_search_type_tools.py +7 -0
  110. cognee/modules/search/methods/no_access_control_search.py +1 -1
  111. cognee/modules/search/methods/search.py +32 -13
  112. cognee/modules/search/types/SearchType.py +1 -0
  113. cognee/modules/users/methods/create_user.py +0 -2
  114. cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py +12 -0
  115. cognee/modules/users/permissions/methods/check_permission_on_dataset.py +11 -0
  116. cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +10 -0
  117. cognee/modules/users/permissions/methods/get_document_ids_for_user.py +10 -0
  118. cognee/modules/users/permissions/methods/get_principal.py +9 -0
  119. cognee/modules/users/permissions/methods/get_principal_datasets.py +11 -0
  120. cognee/modules/users/permissions/methods/get_role.py +10 -0
  121. cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +3 -3
  122. cognee/modules/users/permissions/methods/get_tenant.py +9 -0
  123. cognee/modules/users/permissions/methods/give_default_permission_to_role.py +9 -0
  124. cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py +9 -0
  125. cognee/modules/users/permissions/methods/give_default_permission_to_user.py +9 -0
  126. cognee/modules/users/permissions/methods/give_permission_on_dataset.py +10 -0
  127. cognee/modules/users/roles/methods/add_user_to_role.py +11 -0
  128. cognee/modules/users/roles/methods/create_role.py +12 -1
  129. cognee/modules/users/tenants/methods/add_user_to_tenant.py +12 -0
  130. cognee/modules/users/tenants/methods/create_tenant.py +12 -1
  131. cognee/modules/visualization/cognee_network_visualization.py +13 -9
  132. cognee/shared/data_models.py +0 -1
  133. cognee/shared/utils.py +0 -32
  134. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  135. cognee/tasks/codingagents/coding_rule_associations.py +3 -2
  136. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +3 -2
  137. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +3 -2
  138. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +3 -2
  139. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +3 -2
  140. cognee/tasks/graph/extract_graph_from_code.py +2 -2
  141. cognee/tasks/graph/extract_graph_from_data.py +55 -12
  142. cognee/tasks/graph/extract_graph_from_data_v2.py +16 -4
  143. cognee/tasks/ingestion/migrate_relational_database.py +132 -41
  144. cognee/tasks/ingestion/resolve_data_directories.py +4 -1
  145. cognee/tasks/schema/ingest_database_schema.py +134 -0
  146. cognee/tasks/schema/models.py +40 -0
  147. cognee/tasks/storage/index_data_points.py +1 -1
  148. cognee/tasks/storage/index_graph_edges.py +3 -1
  149. cognee/tasks/summarization/summarize_code.py +2 -2
  150. cognee/tasks/summarization/summarize_text.py +2 -2
  151. cognee/tasks/temporal_graph/enrich_events.py +2 -2
  152. cognee/tasks/temporal_graph/extract_events_and_entities.py +2 -2
  153. cognee/tests/cli_tests/cli_unit_tests/test_cli_commands.py +13 -4
  154. cognee/tests/cli_tests/cli_unit_tests/test_cli_edge_cases.py +13 -3
  155. cognee/tests/test_advanced_pdf_loader.py +141 -0
  156. cognee/tests/test_chromadb.py +40 -0
  157. cognee/tests/test_cognee_server_start.py +6 -1
  158. cognee/tests/test_data/Quantum_computers.txt +9 -0
  159. cognee/tests/test_lancedb.py +211 -0
  160. cognee/tests/test_pgvector.py +40 -0
  161. cognee/tests/test_relational_db_migration.py +76 -0
  162. cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +2 -1
  163. cognee/tests/unit/modules/ontology/test_ontology_adapter.py +330 -13
  164. cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +0 -4
  165. cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -4
  166. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +0 -4
  167. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/METADATA +92 -96
  168. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/RECORD +176 -162
  169. distributed/pyproject.toml +0 -1
  170. cognee/infrastructure/data/utils/extract_keywords.py +0 -48
  171. cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py +0 -1227
  172. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +0 -109
  173. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +0 -343
  174. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_categories.py +0 -0
  175. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +0 -89
  176. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/__init__.py +0 -0
  177. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +0 -44
  178. cognee/tasks/graph/infer_data_ontology.py +0 -309
  179. cognee/tests/test_falkordb.py +0 -174
  180. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/__init__.py +0 -0
  181. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/__init__.py +0 -0
  182. /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/texts.json +0 -0
  183. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/WHEEL +0 -0
  184. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/entry_points.txt +0 -0
  185. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/licenses/LICENSE +0 -0
  186. {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/licenses/NOTICE.md +0 -0

cognee/tasks/codingagents/coding_rule_associations.py
@@ -4,6 +4,7 @@ from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.infrastructure.databases.vector import get_vector_engine
 
 from cognee.low_level import DataPoint
+from cognee.infrastructure.llm.prompts import render_prompt
 from cognee.infrastructure.llm import LLMGateway
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.engine.models import NodeSet
@@ -104,8 +105,8 @@ async def add_rule_associations(
 
     user_context = {"chat": data, "rules": existing_rules}
 
-    user_prompt = LLMGateway.render_prompt(user_prompt_location, context=user_context)
-    system_prompt = LLMGateway.render_prompt(system_prompt_location, context={})
+    user_prompt = render_prompt(user_prompt_location, context=user_context)
+    system_prompt = render_prompt(system_prompt_location, context={})
 
     rule_list = await LLMGateway.acreate_structured_output(
         text_input=user_prompt, system_prompt=system_prompt, response_model=RuleSet

cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py
@@ -3,6 +3,7 @@ from typing import List
 
 from pydantic import BaseModel
 
+from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt
 from cognee.infrastructure.entities.BaseEntityExtractor import BaseEntityExtractor
 from cognee.modules.engine.models import Entity
 from cognee.modules.engine.models.EntityType import EntityType
@@ -50,8 +51,8 @@ class LLMEntityExtractor(BaseEntityExtractor):
         try:
             logger.info(f"Extracting entities from text: {text[:100]}...")
 
-            user_prompt = LLMGateway.render_prompt(self.user_prompt_template, {"text": text})
-            system_prompt = LLMGateway.read_query_prompt(self.system_prompt_template)
+            user_prompt = render_prompt(self.user_prompt_template, {"text": text})
+            system_prompt = read_query_prompt(self.system_prompt_template)
 
             response = await LLMGateway.acreate_structured_output(
                 text_input=user_prompt,

cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py
@@ -1,6 +1,7 @@
 from typing import List, Tuple
 from pydantic import BaseModel
 
+from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt
 from cognee.infrastructure.llm.LLMGateway import LLMGateway
 from cognee.root_dir import get_absolute_path
 
@@ -32,12 +33,12 @@ async def extract_content_nodes_and_relationship_names(
     }
 
     base_directory = get_absolute_path("./tasks/graph/cascade_extract/prompts")
-    text_input = LLMGateway.render_prompt(
+    text_input = render_prompt(
         "extract_graph_relationship_names_prompt_input.txt",
         context,
         base_directory=base_directory,
     )
-    system_prompt = LLMGateway.read_query_prompt(
+    system_prompt = read_query_prompt(
         "extract_graph_relationship_names_prompt_system.txt", base_directory=base_directory
     )
     response = await LLMGateway.acreate_structured_output(

cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py
@@ -1,5 +1,6 @@
 from typing import List
 
+from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt
 from cognee.infrastructure.llm.LLMGateway import LLMGateway
 from cognee.shared.data_models import KnowledgeGraph
 from cognee.root_dir import get_absolute_path
@@ -26,10 +27,10 @@ async def extract_edge_triplets(
     }
 
     base_directory = get_absolute_path("./tasks/graph/cascade_extract/prompts")
-    text_input = LLMGateway.render_prompt(
+    text_input = render_prompt(
         "extract_graph_edge_triplets_prompt_input.txt", context, base_directory=base_directory
     )
-    system_prompt = LLMGateway.read_query_prompt(
+    system_prompt = read_query_prompt(
         "extract_graph_edge_triplets_prompt_system.txt", base_directory=base_directory
     )
     extracted_graph = await LLMGateway.acreate_structured_output(

cognee/tasks/graph/cascade_extract/utils/extract_nodes.py
@@ -1,6 +1,7 @@
 from typing import List
 from pydantic import BaseModel
 
+from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt
 from cognee.infrastructure.llm.LLMGateway import LLMGateway
 from cognee.root_dir import get_absolute_path
 
@@ -24,10 +25,10 @@ async def extract_nodes(text: str, n_rounds: int = 2) -> List[str]:
         "text": text,
     }
     base_directory = get_absolute_path("./tasks/graph/cascade_extract/prompts")
-    text_input = LLMGateway.render_prompt(
+    text_input = render_prompt(
         "extract_graph_nodes_prompt_input.txt", context, base_directory=base_directory
     )
-    system_prompt = LLMGateway.read_query_prompt(
+    system_prompt = read_query_prompt(
         "extract_graph_nodes_prompt_system.txt", base_directory=base_directory
     )
     response = await LLMGateway.acreate_structured_output(
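
All five diffs above apply the same refactor: render_prompt and read_query_prompt are no longer reached through the LLMGateway facade but are plain functions imported from cognee.infrastructure.llm.prompts. A minimal before/after sketch of the call sites (the template file names here are illustrative, not from the package):

# 0.3.4.dev3 style: prompt helpers accessed through the LLMGateway facade.
# user_prompt = LLMGateway.render_prompt("my_prompt_input.txt", {"text": "some input"})
# system_prompt = LLMGateway.read_query_prompt("my_prompt_system.txt")

# 0.3.5 style: import the helpers directly from the prompts package.
from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt

user_prompt = render_prompt("my_prompt_input.txt", {"text": "some input"})
system_prompt = read_query_prompt("my_prompt_system.txt")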

cognee/tasks/graph/extract_graph_from_code.py
@@ -2,7 +2,7 @@ import asyncio
 from typing import Type, List
 from pydantic import BaseModel
 
-from cognee.infrastructure.llm.LLMGateway import LLMGateway
+from cognee.infrastructure.llm.extraction import extract_content_graph
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.tasks.storage import add_data_points
 
@@ -18,7 +18,7 @@ async def extract_graph_from_code(
     - Graph nodes are stored using the `add_data_points` function for later retrieval or analysis.
     """
     chunk_graphs = await asyncio.gather(
-        *[LLMGateway.extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
+        *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
     )
 
     for chunk_index, chunk in enumerate(data_chunks):
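
Likewise, extract_content_graph is now imported from cognee.infrastructure.llm.extraction (the extraction package relocated out of structured_output_framework/litellm_instructor, per files 42-46 and 180-182 in the list) instead of being invoked as an LLMGateway method. A hedged sketch of the new call, reusing the default KnowledgeGraph model; the signature is taken from the call sites in this diff:

# Sketch of the relocated helper.
from cognee.infrastructure.llm.extraction import extract_content_graph
from cognee.shared.data_models import KnowledgeGraph

async def graph_for(text: str):
    # Returns an instance of the given graph model extracted from the text.
    return await extract_content_graph(text, KnowledgeGraph)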

cognee/tasks/graph/extract_graph_from_data.py
@@ -3,15 +3,21 @@ from typing import Type, List, Optional
 from pydantic import BaseModel
 
 from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
 from cognee.tasks.storage.add_data_points import add_data_points
-from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
+from cognee.modules.ontology.ontology_config import Config
+from cognee.modules.ontology.get_default_ontology_resolver import (
+    get_default_ontology_resolver,
+    get_ontology_resolver_from_env,
+)
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.modules.graph.utils import (
     expand_with_nodes_and_edges,
     retrieve_existing_edges,
 )
 from cognee.shared.data_models import KnowledgeGraph
-from cognee.infrastructure.llm.LLMGateway import LLMGateway
+from cognee.infrastructure.llm.extraction import extract_content_graph
 from cognee.tasks.graph.exceptions import (
     InvalidGraphModelError,
     InvalidDataChunksError,
@@ -24,9 +30,28 @@ async def integrate_chunk_graphs(
     data_chunks: list[DocumentChunk],
     chunk_graphs: list,
     graph_model: Type[BaseModel],
-    ontology_adapter: OntologyResolver,
+    ontology_resolver: BaseOntologyResolver,
 ) -> List[DocumentChunk]:
-    """Updates DocumentChunk objects, integrates data points and edges into databases."""
+    """Integrate chunk graphs with ontology validation and store in databases.
+
+    This function processes document chunks and their associated knowledge graphs,
+    validates entities against an ontology resolver, and stores the integrated
+    data points and edges in the configured databases.
+
+    Args:
+        data_chunks: List of document chunks containing source data
+        chunk_graphs: List of knowledge graphs corresponding to each chunk
+        graph_model: Pydantic model class for graph data validation
+        ontology_resolver: Resolver for validating entities against ontology
+
+    Returns:
+        List of updated DocumentChunk objects with integrated data
+
+    Raises:
+        InvalidChunkGraphInputError: If input validation fails
+        InvalidGraphModelError: If graph model validation fails
+        InvalidOntologyAdapterError: If ontology resolver validation fails
+    """
 
     if not isinstance(data_chunks, list) or not isinstance(chunk_graphs, list):
         raise InvalidChunkGraphInputError("data_chunks and chunk_graphs must be lists.")
@@ -36,9 +61,9 @@ async def integrate_chunk_graphs(
     )
     if not isinstance(graph_model, type) or not issubclass(graph_model, BaseModel):
         raise InvalidGraphModelError(graph_model)
-    if ontology_adapter is None or not hasattr(ontology_adapter, "get_subgraph"):
+    if ontology_resolver is None or not hasattr(ontology_resolver, "get_subgraph"):
         raise InvalidOntologyAdapterError(
-            type(ontology_adapter).__name__ if ontology_adapter else "None"
+            type(ontology_resolver).__name__ if ontology_resolver else "None"
         )
 
     graph_engine = await get_graph_engine()
@@ -55,7 +80,7 @@ async def integrate_chunk_graphs(
     )
 
     graph_nodes, graph_edges = expand_with_nodes_and_edges(
-        data_chunks, chunk_graphs, ontology_adapter, existing_edges_map
+        data_chunks, chunk_graphs, ontology_resolver, existing_edges_map
     )
 
     if len(graph_nodes) > 0:
@@ -70,7 +95,7 @@
 async def extract_graph_from_data(
     data_chunks: List[DocumentChunk],
     graph_model: Type[BaseModel],
-    ontology_adapter: OntologyResolver = None,
+    config: Config = None,
     custom_prompt: Optional[str] = None,
 ) -> List[DocumentChunk]:
     """
@@ -86,7 +111,7 @@
 
     chunk_graphs = await asyncio.gather(
         *[
-            LLMGateway.extract_content_graph(chunk.text, graph_model, custom_prompt=custom_prompt)
+            extract_content_graph(chunk.text, graph_model, custom_prompt=custom_prompt)
            for chunk in data_chunks
         ]
     )
@@ -101,6 +126,24 @@
         if edge.source_node_id in valid_node_ids and edge.target_node_id in valid_node_ids
     ]
 
-    return await integrate_chunk_graphs(
-        data_chunks, chunk_graphs, graph_model, ontology_adapter or OntologyResolver()
-    )
+    # Extract resolver from config if provided, otherwise get default
+    if config is None:
+        ontology_config = get_ontology_env_config()
+        if (
+            ontology_config.ontology_file_path
+            and ontology_config.ontology_resolver
+            and ontology_config.matching_strategy
+        ):
+            config: Config = {
+                "ontology_config": {
+                    "ontology_resolver": get_ontology_resolver_from_env(**ontology_config.to_dict())
+                }
+            }
+        else:
+            config: Config = {
+                "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
+            }
+
+    ontology_resolver = config["ontology_config"]["ontology_resolver"]
+
+    return await integrate_chunk_graphs(data_chunks, chunk_graphs, graph_model, ontology_resolver)
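
extract_graph_from_data thus swaps its ontology_adapter parameter for an optional Config mapping; when no config is passed, a resolver is built from the ontology environment settings if ontology_file_path, ontology_resolver, and matching_strategy are all set, and falls back to get_default_ontology_resolver() otherwise. A sketch of supplying a resolver explicitly; the constructor arguments of the renamed RDFLibOntologyResolver (file 95 above) are an assumption, not confirmed by this diff:

# Sketch only: pass a resolver through the new Config mapping.
from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
from cognee.shared.data_models import KnowledgeGraph
from cognee.tasks.graph.extract_graph_from_data import extract_graph_from_data

async def run(data_chunks):
    resolver = RDFLibOntologyResolver(ontology_file="domain.owl")  # kwargs assumed
    config = {"ontology_config": {"ontology_resolver": resolver}}
    return await extract_graph_from_data(data_chunks, KnowledgeGraph, config=config)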

cognee/tasks/graph/extract_graph_from_data_v2.py
@@ -3,7 +3,7 @@ from typing import List
 
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.shared.data_models import KnowledgeGraph
-from cognee.modules.ontology.rdf_xml.OntologyResolver import OntologyResolver
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
 from cognee.tasks.graph.cascade_extract.utils.extract_nodes import extract_nodes
 from cognee.tasks.graph.cascade_extract.utils.extract_content_nodes_and_relationship_names import (
     extract_content_nodes_and_relationship_names,
@@ -17,9 +17,21 @@ from cognee.tasks.graph.extract_graph_from_data import integrate_chunk_graphs
 async def extract_graph_from_data(
     data_chunks: List[DocumentChunk],
     n_rounds: int = 2,
-    ontology_adapter: OntologyResolver = None,
+    ontology_adapter: BaseOntologyResolver = None,
 ) -> List[DocumentChunk]:
-    """Extract and update graph data from document chunks in multiple steps."""
+    """Extract and update graph data from document chunks using cascade extraction.
+
+    This function performs multi-step graph extraction from document chunks,
+    using cascade extraction techniques to build comprehensive knowledge graphs.
+
+    Args:
+        data_chunks: List of document chunks to process
+        n_rounds: Number of extraction rounds to perform (default: 2)
+        ontology_adapter: Resolver for validating entities against ontology
+
+    Returns:
+        List of updated DocumentChunk objects with extracted graph data
+    """
     chunk_nodes = await asyncio.gather(
         *[extract_nodes(chunk.text, n_rounds) for chunk in data_chunks]
     )
@@ -44,5 +56,5 @@ async def extract_graph_from_data(
         data_chunks=data_chunks,
         chunk_graphs=chunk_graphs,
         graph_model=KnowledgeGraph,
-        ontology_adapter=ontology_adapter or OntologyResolver(),
+        ontology_adapter=ontology_adapter,
     )

cognee/tasks/ingestion/migrate_relational_database.py
@@ -4,16 +4,20 @@ from sqlalchemy import text
 from cognee.infrastructure.databases.relational.get_migration_relational_engine import (
     get_migration_relational_engine,
 )
+from cognee.infrastructure.databases.relational.config import get_migration_config
 
 from cognee.tasks.storage.index_data_points import index_data_points
 from cognee.tasks.storage.index_graph_edges import index_graph_edges
+from cognee.tasks.schema.ingest_database_schema import ingest_database_schema
 
 from cognee.modules.engine.models import TableRow, TableType, ColumnValue
 
 logger = logging.getLogger(__name__)
 
 
-async def migrate_relational_database(graph_db, schema, migrate_column_data=True):
+async def migrate_relational_database(
+    graph_db, schema, migrate_column_data=True, schema_only=False
+):
     """
     Migrates data from a relational database into a graph database.
 
@@ -26,11 +30,133 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
 
     Both TableType and TableRow inherit from DataPoint to maintain consistency with Cognee data model.
     """
-    engine = get_migration_relational_engine()
     # Create a mapping of node_id to node objects for referencing in edge creation
+    if schema_only:
+        node_mapping, edge_mapping = await schema_only_ingestion(schema)
+
+    else:
+        node_mapping, edge_mapping = await complete_database_ingestion(schema, migrate_column_data)
+
+    def _remove_duplicate_edges(edge_mapping):
+        seen = set()
+        unique_original_shape = []
+
+        for tup in edge_mapping:
+            # We go through all the tuples in the edge_mapping and we only add unique tuples to the list
+            # To eliminate duplicate edges.
+            source_id, target_id, rel_name, rel_dict = tup
+            # We need to convert the dictionary to a frozenset to be able to compare values for it
+            rel_dict_hashable = frozenset(sorted(rel_dict.items()))
+            hashable_tup = (source_id, target_id, rel_name, rel_dict_hashable)
+
+            # We use the seen set to keep track of unique edges
+            if hashable_tup not in seen:
+                # A list that has frozensets elements instead of dictionaries is needed to be able to compare values
+                seen.add(hashable_tup)
+                # append the original tuple shape (with the dictionary) if it's the first time we see it
+                unique_original_shape.append(tup)
+
+        return unique_original_shape
+
+    # Add all nodes and edges to the graph
+    # NOTE: Nodes and edges have to be added in batch for speed optimization, Especially for NetworkX.
+    # If we'd create nodes and add them to graph in real time the process would take too long.
+    # Every node and edge added to NetworkX is saved to file which is very slow when not done in batches.
+    await graph_db.add_nodes(list(node_mapping.values()))
+    await graph_db.add_edges(_remove_duplicate_edges(edge_mapping))
+
+    # In these steps we calculate the vector embeddings of our nodes and edges and save them to vector database
+    # Cognee uses this information to perform searches on the knowledge graph.
+    await index_data_points(list(node_mapping.values()))
+    await index_graph_edges()
+
+    logger.info("Data successfully migrated from relational database to desired graph database.")
+    return await graph_db.get_graph_data()
+
+
+async def schema_only_ingestion(schema):
     node_mapping = {}
     edge_mapping = []
 
+    # Calling the ingest_database_schema function to return DataPoint subclasses
+    result = await ingest_database_schema(
+        schema=schema,
+        max_sample_rows=5,
+    )
+    database_schema = result["database_schema"]
+    schema_tables = result["schema_tables"]
+    schema_relationships = result["relationships"]
+    database_node_id = database_schema.id
+    node_mapping[database_node_id] = database_schema
+    for table in schema_tables:
+        table_node_id = table.id
+        # Add TableSchema Datapoint as a node.
+        node_mapping[table_node_id] = table
+        edge_mapping.append(
+            (
+                table_node_id,
+                database_node_id,
+                "is_part_of",
+                dict(
+                    source_node_id=table_node_id,
+                    target_node_id=database_node_id,
+                    relationship_name="is_part_of",
+                ),
+            )
+        )
+    table_name_to_id = {t.name: t.id for t in schema_tables}
+    for rel in schema_relationships:
+        source_table_id = table_name_to_id.get(rel.source_table)
+        target_table_id = table_name_to_id.get(rel.target_table)
+
+        relationship_id = rel.id
+
+        # Add RelationshipTable DataPoint as a node.
+        node_mapping[relationship_id] = rel
+        edge_mapping.append(
+            (
+                source_table_id,
+                relationship_id,
+                "has_relationship",
+                dict(
+                    source_node_id=source_table_id,
+                    target_node_id=relationship_id,
+                    relationship_name=rel.relationship_type,
+                ),
+            )
+        )
+        edge_mapping.append(
+            (
+                relationship_id,
+                target_table_id,
+                "has_relationship",
+                dict(
+                    source_node_id=relationship_id,
+                    target_node_id=target_table_id,
+                    relationship_name=rel.relationship_type,
+                ),
+            )
+        )
+        edge_mapping.append(
+            (
+                source_table_id,
+                target_table_id,
+                rel.relationship_type,
+                dict(
+                    source_node_id=source_table_id,
+                    target_node_id=target_table_id,
+                    relationship_name=rel.relationship_type,
+                ),
+            )
+        )
+    return node_mapping, edge_mapping
+
+
+async def complete_database_ingestion(schema, migrate_column_data):
+    engine = get_migration_relational_engine()
+    # Create a mapping of node_id to node objects for referencing in edge creation
+    node_mapping = {}
+    edge_mapping = []
     async with engine.engine.begin() as cursor:
         # First, create table type nodes for all tables
         for table_name, details in schema.items():
@@ -38,7 +164,7 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
             table_node = TableType(
                 id=uuid5(NAMESPACE_OID, name=table_name),
                 name=table_name,
-                description=f"Table: {table_name}",
+                description=f'Relational database table with the following name: "{table_name}".',
             )
 
             # Add TableType node to mapping ( node will be added to the graph later based on this mapping )
@@ -75,7 +201,7 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
                     name=node_id,
                     is_a=table_node,
                     properties=str(row_properties),
-                    description=f"Row in {table_name} with {primary_key_col}={primary_key_value}",
+                    description=f'Row in relational database table from the table with the name: "{table_name}" with the following row data {str(row_properties)} where the dictionary key value is the column name and the value is the column value. This row has the id of: {node_id}',
                 )
 
                 # Store the node object in our mapping
@@ -113,7 +239,7 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
                         id=uuid5(NAMESPACE_OID, name=column_node_id),
                        name=column_node_id,
                         properties=f"{key} {value} {table_name}",
-                        description=f"Column name={key} and value={value} from column from table={table_name}",
+                        description=f"column from relational database table={table_name}. Column name={key} and value={value}. The value of the column is related to the following row with this id: {row_node.id}. This column has the following ID: {column_node_id}",
                     )
                     node_mapping[column_node_id] = column_node
 
@@ -180,39 +306,4 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
                     ),
                 )
             )
-
-    def _remove_duplicate_edges(edge_mapping):
-        seen = set()
-        unique_original_shape = []
-
-        for tup in edge_mapping:
-            # We go through all the tuples in the edge_mapping and we only add unique tuples to the list
-            # To eliminate duplicate edges.
-            source_id, target_id, rel_name, rel_dict = tup
-            # We need to convert the dictionary to a frozenset to be able to compare values for it
-            rel_dict_hashable = frozenset(sorted(rel_dict.items()))
-            hashable_tup = (source_id, target_id, rel_name, rel_dict_hashable)
-
-            # We use the seen set to keep track of unique edges
-            if hashable_tup not in seen:
-                # A list that has frozensets elements instead of dictionaries is needed to be able to compare values
-                seen.add(hashable_tup)
-                # append the original tuple shape (with the dictionary) if it's the first time we see it
-                unique_original_shape.append(tup)
-
-        return unique_original_shape
-
-    # Add all nodes and edges to the graph
-    # NOTE: Nodes and edges have to be added in batch for speed optimization, Especially for NetworkX.
-    # If we'd create nodes and add them to graph in real time the process would take too long.
-    # Every node and edge added to NetworkX is saved to file which is very slow when not done in batches.
-    await graph_db.add_nodes(list(node_mapping.values()))
-    await graph_db.add_edges(_remove_duplicate_edges(edge_mapping))
-
-    # In these steps we calculate the vector embeddings of our nodes and edges and save them to vector database
-    # Cognee uses this information to perform searches on the knowledge graph.
-    await index_data_points(list(node_mapping.values()))
-    await index_graph_edges()
-
-    logger.info("Data successfully migrated from relational database to desired graph database.")
-    return await graph_db.get_graph_data()
+    return node_mapping, edge_mapping
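
The migration entry point now splits into schema_only_ingestion and complete_database_ingestion, selected by the new schema_only flag; batching, edge deduplication, and vector indexing are shared by both paths. A usage sketch for a schema-only run; extract_schema() is assumed to exist on the migration engine, as in the pre-existing migration flow, and the migration DB settings are assumed to be configured:

# Sketch of a schema-only migration run.
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.infrastructure.databases.relational.get_migration_relational_engine import (
    get_migration_relational_engine,
)
from cognee.tasks.ingestion.migrate_relational_database import migrate_relational_database

async def migrate_schema_graph():
    relational_engine = get_migration_relational_engine()
    schema = await relational_engine.extract_schema()  # assumed engine helper
    graph_db = await get_graph_engine()
    # schema_only=True ingests tables, foreign keys, and sample rows, but no row data.
    return await migrate_relational_database(graph_db, schema, schema_only=True)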

cognee/tasks/ingestion/resolve_data_directories.py
@@ -32,7 +32,10 @@ async def resolve_data_directories(
         import s3fs
 
         fs = s3fs.S3FileSystem(
-            key=s3_config.aws_access_key_id, secret=s3_config.aws_secret_access_key, anon=False
+            key=s3_config.aws_access_key_id,
+            secret=s3_config.aws_secret_access_key,
+            token=s3_config.aws_session_token,
+            anon=False,
         )
 
         for item in data:
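
Passing token=s3_config.aws_session_token (backed by the new field in s3_config.py, file 38 above) means temporary STS credentials now work for S3 ingestion; previously only the long-lived key/secret pair was forwarded. The equivalent direct s3fs call, with placeholder credential and bucket values:

# Sketch: STS credentials are a triple; without the token, temporary keys fail.
import s3fs

fs = s3fs.S3FileSystem(
    key="ASIA...",  # placeholder temporary access key id
    secret="...",   # placeholder secret access key
    token="...",    # placeholder session token, newly forwarded in 0.3.5
    anon=False,
)
print(fs.ls("example-bucket"))  # bucket name is illustrative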

cognee/tasks/schema/ingest_database_schema.py
@@ -0,0 +1,134 @@
+import json
+from typing import List, Dict
+from uuid import uuid5, NAMESPACE_OID
+from cognee.infrastructure.engine.models.DataPoint import DataPoint
+from sqlalchemy import text
+from cognee.tasks.schema.models import DatabaseSchema, SchemaTable, SchemaRelationship
+from cognee.infrastructure.databases.relational.get_migration_relational_engine import (
+    get_migration_relational_engine,
+)
+from cognee.infrastructure.databases.relational.config import get_migration_config
+from datetime import datetime, timezone
+
+
+async def ingest_database_schema(
+    schema,
+    max_sample_rows: int = 0,
+) -> Dict[str, List[DataPoint] | DataPoint]:
+    """
+    Extract database schema metadata (optionally with sample data) and return DataPoint models for graph construction.
+
+    Args:
+        schema: Database schema
+        max_sample_rows: Maximum sample rows per table (0 means no sampling)
+
+    Returns:
+        Dict with keys:
+            "database_schema": DatabaseSchema
+            "schema_tables": List[SchemaTable]
+            "relationships": List[SchemaRelationship]
+    """
+
+    tables = {}
+    sample_data = {}
+    schema_tables = []
+    schema_relationships = []
+
+    migration_config = get_migration_config()
+    engine = get_migration_relational_engine()
+    qi = engine.engine.dialect.identifier_preparer.quote
+    try:
+        max_sample_rows = max(0, int(max_sample_rows))
+    except (TypeError, ValueError):
+        max_sample_rows = 0
+
+    def qname(name: str):
+        split_name = name.split(".")
+        return ".".join(qi(p) for p in split_name)
+
+    async with engine.engine.begin() as cursor:
+        for table_name, details in schema.items():
+            tn = qname(table_name)
+            if max_sample_rows > 0:
+                rows_result = await cursor.execute(
+                    text(f"SELECT * FROM {tn} LIMIT :limit;"),  # noqa: S608 - tn is fully quoted
+                    {"limit": max_sample_rows},
+                )
+                rows = [dict(r) for r in rows_result.mappings().all()]
+            else:
+                rows = []
+
+            if engine.engine.dialect.name == "postgresql":
+                if "." in table_name:
+                    schema_part, table_part = table_name.split(".", 1)
+                else:
+                    schema_part, table_part = "public", table_name
+                estimate = await cursor.execute(
+                    text(
+                        "SELECT reltuples::bigint AS estimate "
+                        "FROM pg_class c "
+                        "JOIN pg_namespace n ON n.oid = c.relnamespace "
+                        "WHERE n.nspname = :schema AND c.relname = :table"
+                    ),
+                    {"schema": schema_part, "table": table_part},
+                )
+                row_count_estimate = estimate.scalar() or 0
+            else:
+                count_result = await cursor.execute(text(f"SELECT COUNT(*) FROM {tn};"))  # noqa: S608 - tn is fully quoted
+                row_count_estimate = count_result.scalar()
+
+            schema_table = SchemaTable(
+                id=uuid5(NAMESPACE_OID, name=f"{table_name}"),
+                name=table_name,
+                columns=json.dumps(details["columns"], default=str),
+                primary_key=details.get("primary_key"),
+                foreign_keys=json.dumps(details.get("foreign_keys", []), default=str),
+                sample_rows=json.dumps(rows, default=str),
+                row_count_estimate=row_count_estimate,
+                description=f"Relational database table '{table_name}' with {len(details['columns'])} columns and approx. {row_count_estimate} rows. "
+                f"Here are the columns this table contains: {details['columns']} "
+                f"Here are a few sample_rows to show the contents of the table: {rows} "
+                f"Table is part of the database: {migration_config.migration_db_name}",
+            )
+            schema_tables.append(schema_table)
+            tables[table_name] = details
+            sample_data[table_name] = rows
+
+            for fk in details.get("foreign_keys", []):
+                ref_table_fq = fk["ref_table"]
+                if "." not in ref_table_fq and "." in table_name:
+                    ref_table_fq = f"{table_name.split('.', 1)[0]}.{ref_table_fq}"
+
+                relationship_name = (
+                    f"{table_name}:{fk['column']}->{ref_table_fq}:{fk['ref_column']}"
+                )
+                relationship = SchemaRelationship(
+                    id=uuid5(NAMESPACE_OID, name=relationship_name),
+                    name=relationship_name,
+                    source_table=table_name,
+                    target_table=ref_table_fq,
+                    relationship_type="foreign_key",
+                    source_column=fk["column"],
+                    target_column=fk["ref_column"],
+                    description=f"Relational database table foreign key relationship between: {table_name}.{fk['column']} → {ref_table_fq}.{fk['ref_column']}. "
+                    f"This foreign key relationship between table columns is a part of the following database: {migration_config.migration_db_name}",
+                )
+                schema_relationships.append(relationship)
+
+    id_str = f"{migration_config.migration_db_provider}:{migration_config.migration_db_name}"
+    database_schema = DatabaseSchema(
+        id=uuid5(NAMESPACE_OID, name=id_str),
+        name=migration_config.migration_db_name,
+        database_type=migration_config.migration_db_provider,
+        tables=json.dumps(tables, default=str),
+        sample_data=json.dumps(sample_data, default=str),
+        description=f"Database schema containing {len(schema_tables)} tables and {len(schema_relationships)} relationships. "
+        f"The database type is {migration_config.migration_db_provider}. "
+        f"The database contains the following tables: {tables}",
+    )
+
+    return {
+        "database_schema": database_schema,
+        "schema_tables": schema_tables,
+        "relationships": schema_relationships,
+    }
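
A note on the quoting in ingest_database_schema: table names are interpolated into the sampling and count statements, so each dot-separated part is passed through the dialect's IdentifierPreparer first, while the row limit travels as a bound parameter. A standalone sketch of the same pattern; the SQLite engine here is only for demonstration:

# Standalone sketch of the qname() idea with SQLAlchemy's identifier preparer.
from sqlalchemy import create_engine

engine = create_engine("sqlite:///:memory:")
quote = engine.dialect.identifier_preparer.quote

def qname(name: str) -> str:
    # Quote each "schema.table" part separately; quote() only adds quoting
    # when the identifier requires it (spaces, reserved words, odd casing).
    return ".".join(quote(part) for part in name.split("."))

print(qname("main.order details"))  # -> main."order details"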