cognee 0.3.4.dev3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/api/client.py +16 -7
- cognee/api/health.py +5 -9
- cognee/api/v1/add/add.py +3 -1
- cognee/api/v1/cognify/cognify.py +44 -7
- cognee/api/v1/permissions/routers/get_permissions_router.py +8 -4
- cognee/api/v1/search/search.py +3 -0
- cognee/api/v1/ui/__init__.py +1 -1
- cognee/api/v1/ui/ui.py +215 -150
- cognee/api/v1/update/__init__.py +1 -0
- cognee/api/v1/update/routers/__init__.py +1 -0
- cognee/api/v1/update/routers/get_update_router.py +90 -0
- cognee/api/v1/update/update.py +100 -0
- cognee/base_config.py +5 -2
- cognee/cli/_cognee.py +28 -10
- cognee/cli/commands/delete_command.py +34 -2
- cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py +2 -2
- cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +3 -2
- cognee/eval_framework/modal_eval_dashboard.py +9 -1
- cognee/infrastructure/databases/graph/config.py +9 -9
- cognee/infrastructure/databases/graph/get_graph_engine.py +4 -21
- cognee/infrastructure/databases/graph/kuzu/adapter.py +60 -9
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +3 -3
- cognee/infrastructure/databases/relational/config.py +4 -4
- cognee/infrastructure/databases/relational/create_relational_engine.py +11 -3
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +7 -3
- cognee/infrastructure/databases/vector/config.py +7 -7
- cognee/infrastructure/databases/vector/create_vector_engine.py +7 -15
- cognee/infrastructure/databases/vector/embeddings/EmbeddingEngine.py +9 -0
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +11 -0
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +19 -2
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +11 -0
- cognee/infrastructure/databases/vector/embeddings/config.py +8 -0
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +5 -0
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +11 -10
- cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +48 -38
- cognee/infrastructure/databases/vector/vector_db_interface.py +8 -4
- cognee/infrastructure/files/storage/S3FileStorage.py +15 -5
- cognee/infrastructure/files/storage/s3_config.py +1 -0
- cognee/infrastructure/files/utils/open_data_file.py +7 -14
- cognee/infrastructure/llm/LLMGateway.py +19 -117
- cognee/infrastructure/llm/config.py +28 -13
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_categories.py +2 -1
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_event_entities.py +3 -2
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/extract_summary.py +3 -2
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_content_graph.py +2 -1
- cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/extract_event_graph.py +3 -2
- cognee/infrastructure/llm/prompts/read_query_prompt.py +3 -2
- cognee/infrastructure/llm/prompts/show_prompt.py +35 -0
- cognee/infrastructure/llm/prompts/test.txt +1 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +50 -397
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +2 -3
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +8 -88
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +78 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +2 -99
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +49 -401
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +19 -882
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +2 -34
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +2 -107
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/acreate_structured_output.baml +26 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/__init__.py +1 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/acreate_structured_output.py +76 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/create_dynamic_baml_type.py +122 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +3 -3
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py +0 -32
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini/adapter.py +107 -98
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/adapter.py +5 -6
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/get_llm_client.py +5 -6
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/llm_interface.py +0 -26
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/adapter.py +17 -67
- cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +8 -7
- cognee/infrastructure/llm/utils.py +4 -4
- cognee/infrastructure/loaders/LoaderEngine.py +5 -2
- cognee/infrastructure/loaders/external/__init__.py +7 -0
- cognee/infrastructure/loaders/external/advanced_pdf_loader.py +244 -0
- cognee/infrastructure/loaders/supported_loaders.py +7 -0
- cognee/modules/data/methods/create_authorized_dataset.py +9 -0
- cognee/modules/data/methods/get_authorized_dataset.py +1 -1
- cognee/modules/data/methods/get_authorized_dataset_by_name.py +11 -0
- cognee/modules/data/methods/get_deletion_counts.py +92 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +1 -1
- cognee/modules/graph/utils/expand_with_nodes_and_edges.py +22 -8
- cognee/modules/graph/utils/retrieve_existing_edges.py +0 -2
- cognee/modules/ingestion/data_types/TextData.py +0 -1
- cognee/modules/notebooks/methods/create_notebook.py +3 -1
- cognee/modules/notebooks/methods/get_notebooks.py +27 -1
- cognee/modules/observability/get_observe.py +14 -0
- cognee/modules/observability/observers.py +1 -0
- cognee/modules/ontology/base_ontology_resolver.py +42 -0
- cognee/modules/ontology/get_default_ontology_resolver.py +41 -0
- cognee/modules/ontology/matching_strategies.py +53 -0
- cognee/modules/ontology/models.py +20 -0
- cognee/modules/ontology/ontology_config.py +24 -0
- cognee/modules/ontology/ontology_env_config.py +45 -0
- cognee/modules/ontology/rdf_xml/{OntologyResolver.py → RDFLibOntologyResolver.py} +20 -28
- cognee/modules/pipelines/layers/resolve_authorized_user_dataset.py +21 -24
- cognee/modules/pipelines/layers/resolve_authorized_user_datasets.py +3 -3
- cognee/modules/retrieval/code_retriever.py +2 -1
- cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +1 -4
- cognee/modules/retrieval/graph_completion_cot_retriever.py +6 -5
- cognee/modules/retrieval/graph_completion_retriever.py +0 -3
- cognee/modules/retrieval/insights_retriever.py +1 -1
- cognee/modules/retrieval/jaccard_retrival.py +60 -0
- cognee/modules/retrieval/lexical_retriever.py +123 -0
- cognee/modules/retrieval/natural_language_retriever.py +2 -1
- cognee/modules/retrieval/temporal_retriever.py +3 -2
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +2 -12
- cognee/modules/retrieval/utils/completion.py +4 -7
- cognee/modules/search/methods/get_search_type_tools.py +7 -0
- cognee/modules/search/methods/no_access_control_search.py +1 -1
- cognee/modules/search/methods/search.py +32 -13
- cognee/modules/search/types/SearchType.py +1 -0
- cognee/modules/users/methods/create_user.py +0 -2
- cognee/modules/users/permissions/methods/authorized_give_permission_on_datasets.py +12 -0
- cognee/modules/users/permissions/methods/check_permission_on_dataset.py +11 -0
- cognee/modules/users/permissions/methods/get_all_user_permission_datasets.py +10 -0
- cognee/modules/users/permissions/methods/get_document_ids_for_user.py +10 -0
- cognee/modules/users/permissions/methods/get_principal.py +9 -0
- cognee/modules/users/permissions/methods/get_principal_datasets.py +11 -0
- cognee/modules/users/permissions/methods/get_role.py +10 -0
- cognee/modules/users/permissions/methods/get_specific_user_permission_datasets.py +3 -3
- cognee/modules/users/permissions/methods/get_tenant.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_role.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_tenant.py +9 -0
- cognee/modules/users/permissions/methods/give_default_permission_to_user.py +9 -0
- cognee/modules/users/permissions/methods/give_permission_on_dataset.py +10 -0
- cognee/modules/users/roles/methods/add_user_to_role.py +11 -0
- cognee/modules/users/roles/methods/create_role.py +12 -1
- cognee/modules/users/tenants/methods/add_user_to_tenant.py +12 -0
- cognee/modules/users/tenants/methods/create_tenant.py +12 -1
- cognee/modules/visualization/cognee_network_visualization.py +13 -9
- cognee/shared/data_models.py +0 -1
- cognee/shared/utils.py +0 -32
- cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
- cognee/tasks/codingagents/coding_rule_associations.py +3 -2
- cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +3 -2
- cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +3 -2
- cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +3 -2
- cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +3 -2
- cognee/tasks/graph/extract_graph_from_code.py +2 -2
- cognee/tasks/graph/extract_graph_from_data.py +55 -12
- cognee/tasks/graph/extract_graph_from_data_v2.py +16 -4
- cognee/tasks/ingestion/migrate_relational_database.py +132 -41
- cognee/tasks/ingestion/resolve_data_directories.py +4 -1
- cognee/tasks/schema/ingest_database_schema.py +134 -0
- cognee/tasks/schema/models.py +40 -0
- cognee/tasks/storage/index_data_points.py +1 -1
- cognee/tasks/storage/index_graph_edges.py +3 -1
- cognee/tasks/summarization/summarize_code.py +2 -2
- cognee/tasks/summarization/summarize_text.py +2 -2
- cognee/tasks/temporal_graph/enrich_events.py +2 -2
- cognee/tasks/temporal_graph/extract_events_and_entities.py +2 -2
- cognee/tests/cli_tests/cli_unit_tests/test_cli_commands.py +13 -4
- cognee/tests/cli_tests/cli_unit_tests/test_cli_edge_cases.py +13 -3
- cognee/tests/test_advanced_pdf_loader.py +141 -0
- cognee/tests/test_chromadb.py +40 -0
- cognee/tests/test_cognee_server_start.py +6 -1
- cognee/tests/test_data/Quantum_computers.txt +9 -0
- cognee/tests/test_lancedb.py +211 -0
- cognee/tests/test_pgvector.py +40 -0
- cognee/tests/test_relational_db_migration.py +76 -0
- cognee/tests/unit/infrastructure/databases/test_index_graph_edges.py +2 -1
- cognee/tests/unit/modules/ontology/test_ontology_adapter.py +330 -13
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_context_extension_test.py +0 -4
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_cot_test.py +0 -4
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +0 -4
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/METADATA +92 -96
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/RECORD +176 -162
- distributed/pyproject.toml +0 -1
- cognee/infrastructure/data/utils/extract_keywords.py +0 -48
- cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py +0 -1227
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +0 -109
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +0 -343
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_categories.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +0 -89
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/__init__.py +0 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +0 -44
- cognee/tasks/graph/infer_data_ontology.py +0 -309
- cognee/tests/test_falkordb.py +0 -174
- /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/__init__.py +0 -0
- /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/knowledge_graph/__init__.py +0 -0
- /cognee/infrastructure/llm/{structured_output_framework/litellm_instructor/extraction → extraction}/texts.json +0 -0
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/WHEEL +0 -0
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/entry_points.txt +0 -0
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.3.4.dev3.dist-info → cognee-0.3.5.dist-info}/licenses/NOTICE.md +0 -0
cognee/tasks/codingagents/coding_rule_associations.py

@@ -4,6 +4,7 @@ from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.infrastructure.databases.vector import get_vector_engine

 from cognee.low_level import DataPoint
+from cognee.infrastructure.llm.prompts import render_prompt
 from cognee.infrastructure.llm import LLMGateway
 from cognee.shared.logging_utils import get_logger
 from cognee.modules.engine.models import NodeSet
@@ -104,8 +105,8 @@ async def add_rule_associations(

     user_context = {"chat": data, "rules": existing_rules}

-    user_prompt =
-    system_prompt =
+    user_prompt = render_prompt(user_prompt_location, context=user_context)
+    system_prompt = render_prompt(system_prompt_location, context={})

     rule_list = await LLMGateway.acreate_structured_output(
         text_input=user_prompt, system_prompt=system_prompt, response_model=RuleSet
cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py

@@ -3,6 +3,7 @@ from typing import List

 from pydantic import BaseModel

+from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt
 from cognee.infrastructure.entities.BaseEntityExtractor import BaseEntityExtractor
 from cognee.modules.engine.models import Entity
 from cognee.modules.engine.models.EntityType import EntityType
@@ -50,8 +51,8 @@ class LLMEntityExtractor(BaseEntityExtractor):
         try:
             logger.info(f"Extracting entities from text: {text[:100]}...")

-            user_prompt =
-            system_prompt =
+            user_prompt = render_prompt(self.user_prompt_template, {"text": text})
+            system_prompt = read_query_prompt(self.system_prompt_template)

             response = await LLMGateway.acreate_structured_output(
                 text_input=user_prompt,
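The pattern behind all of these prompt call sites is the same: render_prompt fills a template file with a context dict, read_query_prompt loads a prompt file verbatim, and both feed LLMGateway.acreate_structured_output together with a Pydantic response model. A minimal sketch of that pattern; the template names and response model below are illustrative, not taken from the package:

    from pydantic import BaseModel

    from cognee.infrastructure.llm.LLMGateway import LLMGateway
    from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt


    class ExtractedEntities(BaseModel):
        # Hypothetical response model, for illustration only.
        names: list[str]


    async def extract(text: str) -> ExtractedEntities:
        # "entities_user.txt" / "entities_system.txt" are assumed template names.
        user_prompt = render_prompt("entities_user.txt", {"text": text})
        system_prompt = read_query_prompt("entities_system.txt")
        return await LLMGateway.acreate_structured_output(
            text_input=user_prompt, system_prompt=system_prompt, response_model=ExtractedEntities
        )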
cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py

@@ -1,6 +1,7 @@
 from typing import List, Tuple
 from pydantic import BaseModel

+from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt
 from cognee.infrastructure.llm.LLMGateway import LLMGateway
 from cognee.root_dir import get_absolute_path

@@ -32,12 +33,12 @@ async def extract_content_nodes_and_relationship_names(
     }

     base_directory = get_absolute_path("./tasks/graph/cascade_extract/prompts")
-    text_input =
+    text_input = render_prompt(
         "extract_graph_relationship_names_prompt_input.txt",
         context,
         base_directory=base_directory,
     )
-    system_prompt =
+    system_prompt = read_query_prompt(
         "extract_graph_relationship_names_prompt_system.txt", base_directory=base_directory
     )
     response = await LLMGateway.acreate_structured_output(
cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py

@@ -1,5 +1,6 @@
 from typing import List

+from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt
 from cognee.infrastructure.llm.LLMGateway import LLMGateway
 from cognee.shared.data_models import KnowledgeGraph
 from cognee.root_dir import get_absolute_path
@@ -26,10 +27,10 @@ async def extract_edge_triplets(
     }

     base_directory = get_absolute_path("./tasks/graph/cascade_extract/prompts")
-    text_input =
+    text_input = render_prompt(
         "extract_graph_edge_triplets_prompt_input.txt", context, base_directory=base_directory
     )
-    system_prompt =
+    system_prompt = read_query_prompt(
         "extract_graph_edge_triplets_prompt_system.txt", base_directory=base_directory
     )
     extracted_graph = await LLMGateway.acreate_structured_output(
cognee/tasks/graph/cascade_extract/utils/extract_nodes.py

@@ -1,6 +1,7 @@
 from typing import List
 from pydantic import BaseModel

+from cognee.infrastructure.llm.prompts import render_prompt, read_query_prompt
 from cognee.infrastructure.llm.LLMGateway import LLMGateway
 from cognee.root_dir import get_absolute_path

@@ -24,10 +25,10 @@ async def extract_nodes(text: str, n_rounds: int = 2) -> List[str]:
         "text": text,
     }
     base_directory = get_absolute_path("./tasks/graph/cascade_extract/prompts")
-    text_input =
+    text_input = render_prompt(
         "extract_graph_nodes_prompt_input.txt", context, base_directory=base_directory
     )
-    system_prompt =
+    system_prompt = read_query_prompt(
         "extract_graph_nodes_prompt_system.txt", base_directory=base_directory
     )
     response = await LLMGateway.acreate_structured_output(
cognee/tasks/graph/extract_graph_from_code.py

@@ -2,7 +2,7 @@ import asyncio
 from typing import Type, List
 from pydantic import BaseModel

-from cognee.infrastructure.llm.
+from cognee.infrastructure.llm.extraction import extract_content_graph
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.tasks.storage import add_data_points

@@ -18,7 +18,7 @@ async def extract_graph_from_code(
         - Graph nodes are stored using the `add_data_points` function for later retrieval or analysis.
     """
     chunk_graphs = await asyncio.gather(
-        *[
+        *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
     )

     for chunk_index, chunk in enumerate(data_chunks):
cognee/tasks/graph/extract_graph_from_data.py

@@ -3,15 +3,21 @@ from typing import Type, List, Optional
 from pydantic import BaseModel

 from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.modules.ontology.ontology_env_config import get_ontology_env_config
 from cognee.tasks.storage.add_data_points import add_data_points
-from cognee.modules.ontology.
+from cognee.modules.ontology.ontology_config import Config
+from cognee.modules.ontology.get_default_ontology_resolver import (
+    get_default_ontology_resolver,
+    get_ontology_resolver_from_env,
+)
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.modules.graph.utils import (
     expand_with_nodes_and_edges,
     retrieve_existing_edges,
 )
 from cognee.shared.data_models import KnowledgeGraph
-from cognee.infrastructure.llm.
+from cognee.infrastructure.llm.extraction import extract_content_graph
 from cognee.tasks.graph.exceptions import (
     InvalidGraphModelError,
     InvalidDataChunksError,
@@ -24,9 +30,28 @@ async def integrate_chunk_graphs(
     data_chunks: list[DocumentChunk],
     chunk_graphs: list,
     graph_model: Type[BaseModel],
-
+    ontology_resolver: BaseOntologyResolver,
 ) -> List[DocumentChunk]:
-    """
+    """Integrate chunk graphs with ontology validation and store in databases.
+
+    This function processes document chunks and their associated knowledge graphs,
+    validates entities against an ontology resolver, and stores the integrated
+    data points and edges in the configured databases.
+
+    Args:
+        data_chunks: List of document chunks containing source data
+        chunk_graphs: List of knowledge graphs corresponding to each chunk
+        graph_model: Pydantic model class for graph data validation
+        ontology_resolver: Resolver for validating entities against ontology
+
+    Returns:
+        List of updated DocumentChunk objects with integrated data
+
+    Raises:
+        InvalidChunkGraphInputError: If input validation fails
+        InvalidGraphModelError: If graph model validation fails
+        InvalidOntologyAdapterError: If ontology resolver validation fails
+    """

     if not isinstance(data_chunks, list) or not isinstance(chunk_graphs, list):
         raise InvalidChunkGraphInputError("data_chunks and chunk_graphs must be lists.")
@@ -36,9 +61,9 @@ async def integrate_chunk_graphs(
         )
     if not isinstance(graph_model, type) or not issubclass(graph_model, BaseModel):
         raise InvalidGraphModelError(graph_model)
-    if
+    if ontology_resolver is None or not hasattr(ontology_resolver, "get_subgraph"):
         raise InvalidOntologyAdapterError(
-            type(
+            type(ontology_resolver).__name__ if ontology_resolver else "None"
         )

     graph_engine = await get_graph_engine()
@@ -55,7 +80,7 @@ async def integrate_chunk_graphs(
     )

     graph_nodes, graph_edges = expand_with_nodes_and_edges(
-        data_chunks, chunk_graphs,
+        data_chunks, chunk_graphs, ontology_resolver, existing_edges_map
     )

     if len(graph_nodes) > 0:
@@ -70,7 +95,7 @@
 async def extract_graph_from_data(
     data_chunks: List[DocumentChunk],
     graph_model: Type[BaseModel],
-
+    config: Config = None,
     custom_prompt: Optional[str] = None,
 ) -> List[DocumentChunk]:
     """
@@ -86,7 +111,7 @@ async def extract_graph_from_data(

     chunk_graphs = await asyncio.gather(
         *[
-
+            extract_content_graph(chunk.text, graph_model, custom_prompt=custom_prompt)
             for chunk in data_chunks
         ]
     )
@@ -101,6 +126,24 @@
         if edge.source_node_id in valid_node_ids and edge.target_node_id in valid_node_ids
     ]

-
-
-
+    # Extract resolver from config if provided, otherwise get default
+    if config is None:
+        ontology_config = get_ontology_env_config()
+        if (
+            ontology_config.ontology_file_path
+            and ontology_config.ontology_resolver
+            and ontology_config.matching_strategy
+        ):
+            config: Config = {
+                "ontology_config": {
+                    "ontology_resolver": get_ontology_resolver_from_env(**ontology_config.to_dict())
+                }
+            }
+        else:
+            config: Config = {
+                "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
+            }
+
+    ontology_resolver = config["ontology_config"]["ontology_resolver"]
+
+    return await integrate_chunk_graphs(data_chunks, chunk_graphs, graph_model, ontology_resolver)
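With this change extract_graph_from_data accepts an optional Config dict; when it is omitted, the function builds one itself from the ontology environment settings or falls back to the default resolver, as the last hunk above shows. A sketch of passing the config explicitly, assuming only what the diff shows about its shape:

    from cognee.modules.ontology.get_default_ontology_resolver import get_default_ontology_resolver
    from cognee.shared.data_models import KnowledgeGraph
    from cognee.tasks.graph.extract_graph_from_data import extract_graph_from_data

    # The Config shape used by the new code path: a dict with an "ontology_config"
    # key holding the resolver to validate extracted entities against.
    config = {"ontology_config": {"ontology_resolver": get_default_ontology_resolver()}}

    # Called from async code, with data_chunks prepared upstream:
    # chunks = await extract_graph_from_data(data_chunks, KnowledgeGraph, config=config)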
cognee/tasks/graph/extract_graph_from_data_v2.py

@@ -3,7 +3,7 @@ from typing import List

 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.shared.data_models import KnowledgeGraph
-from cognee.modules.ontology.
+from cognee.modules.ontology.base_ontology_resolver import BaseOntologyResolver
 from cognee.tasks.graph.cascade_extract.utils.extract_nodes import extract_nodes
 from cognee.tasks.graph.cascade_extract.utils.extract_content_nodes_and_relationship_names import (
     extract_content_nodes_and_relationship_names,
@@ -17,9 +17,21 @@ from cognee.tasks.graph.extract_graph_from_data import integrate_chunk_graphs
 async def extract_graph_from_data(
     data_chunks: List[DocumentChunk],
     n_rounds: int = 2,
-    ontology_adapter:
+    ontology_adapter: BaseOntologyResolver = None,
 ) -> List[DocumentChunk]:
-    """Extract and update graph data from document chunks
+    """Extract and update graph data from document chunks using cascade extraction.
+
+    This function performs multi-step graph extraction from document chunks,
+    using cascade extraction techniques to build comprehensive knowledge graphs.
+
+    Args:
+        data_chunks: List of document chunks to process
+        n_rounds: Number of extraction rounds to perform (default: 2)
+        ontology_adapter: Resolver for validating entities against ontology
+
+    Returns:
+        List of updated DocumentChunk objects with extracted graph data
+    """
     chunk_nodes = await asyncio.gather(
         *[extract_nodes(chunk.text, n_rounds) for chunk in data_chunks]
     )
@@ -44,5 +56,5 @@ async def extract_graph_from_data(
         data_chunks=data_chunks,
         chunk_graphs=chunk_graphs,
         graph_model=KnowledgeGraph,
-        ontology_adapter=ontology_adapter
+        ontology_adapter=ontology_adapter,
     )
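Per the rename in the file list, the RDF/XML resolver is now RDFLibOntologyResolver, and any BaseOntologyResolver can be supplied as ontology_adapter. A sketch of wiring one into the cascade extractor; the resolver's constructor argument is an assumption, not taken from this diff:

    import asyncio

    from cognee.modules.ontology.rdf_xml.RDFLibOntologyResolver import RDFLibOntologyResolver
    from cognee.tasks.graph.extract_graph_from_data_v2 import extract_graph_from_data


    async def run(data_chunks):
        # Constructor signature assumed for illustration.
        resolver = RDFLibOntologyResolver(ontology_file="ontology.owl")
        return await extract_graph_from_data(data_chunks, n_rounds=2, ontology_adapter=resolver)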
cognee/tasks/ingestion/migrate_relational_database.py

@@ -4,16 +4,20 @@ from sqlalchemy import text
 from cognee.infrastructure.databases.relational.get_migration_relational_engine import (
     get_migration_relational_engine,
 )
+from cognee.infrastructure.databases.relational.config import get_migration_config

 from cognee.tasks.storage.index_data_points import index_data_points
 from cognee.tasks.storage.index_graph_edges import index_graph_edges
+from cognee.tasks.schema.ingest_database_schema import ingest_database_schema

 from cognee.modules.engine.models import TableRow, TableType, ColumnValue

 logger = logging.getLogger(__name__)


-async def migrate_relational_database(
+async def migrate_relational_database(
+    graph_db, schema, migrate_column_data=True, schema_only=False
+):
     """
     Migrates data from a relational database into a graph database.

@@ -26,11 +30,133 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True

     Both TableType and TableRow inherit from DataPoint to maintain consistency with Cognee data model.
     """
-    engine = get_migration_relational_engine()
     # Create a mapping of node_id to node objects for referencing in edge creation
+    if schema_only:
+        node_mapping, edge_mapping = await schema_only_ingestion(schema)
+
+    else:
+        node_mapping, edge_mapping = await complete_database_ingestion(schema, migrate_column_data)
+
+    def _remove_duplicate_edges(edge_mapping):
+        seen = set()
+        unique_original_shape = []
+
+        for tup in edge_mapping:
+            # We go through all the tuples in the edge_mapping and we only add unique tuples to the list
+            # To eliminate duplicate edges.
+            source_id, target_id, rel_name, rel_dict = tup
+            # We need to convert the dictionary to a frozenset to be able to compare values for it
+            rel_dict_hashable = frozenset(sorted(rel_dict.items()))
+            hashable_tup = (source_id, target_id, rel_name, rel_dict_hashable)
+
+            # We use the seen set to keep track of unique edges
+            if hashable_tup not in seen:
+                # A list that has frozensets elements instead of dictionaries is needed to be able to compare values
+                seen.add(hashable_tup)
+                # append the original tuple shape (with the dictionary) if it's the first time we see it
+                unique_original_shape.append(tup)

+        return unique_original_shape
+
+    # Add all nodes and edges to the graph
+    # NOTE: Nodes and edges have to be added in batch for speed optimization, Especially for NetworkX.
+    # If we'd create nodes and add them to graph in real time the process would take too long.
+    # Every node and edge added to NetworkX is saved to file which is very slow when not done in batches.
+    await graph_db.add_nodes(list(node_mapping.values()))
+    await graph_db.add_edges(_remove_duplicate_edges(edge_mapping))
+
+    # In these steps we calculate the vector embeddings of our nodes and edges and save them to vector database
+    # Cognee uses this information to perform searches on the knowledge graph.
+    await index_data_points(list(node_mapping.values()))
+    await index_graph_edges()
+
+    logger.info("Data successfully migrated from relational database to desired graph database.")
+    return await graph_db.get_graph_data()
+
+
+async def schema_only_ingestion(schema):
     node_mapping = {}
     edge_mapping = []

+    # Calling the ingest_database_schema function to return DataPoint subclasses
+    result = await ingest_database_schema(
+        schema=schema,
+        max_sample_rows=5,
+    )
+    database_schema = result["database_schema"]
+    schema_tables = result["schema_tables"]
+    schema_relationships = result["relationships"]
+    database_node_id = database_schema.id
+    node_mapping[database_node_id] = database_schema
+    for table in schema_tables:
+        table_node_id = table.id
+        # Add TableSchema Datapoint as a node.
+        node_mapping[table_node_id] = table
+        edge_mapping.append(
+            (
+                table_node_id,
+                database_node_id,
+                "is_part_of",
+                dict(
+                    source_node_id=table_node_id,
+                    target_node_id=database_node_id,
+                    relationship_name="is_part_of",
+                ),
+            )
+        )
+    table_name_to_id = {t.name: t.id for t in schema_tables}
+    for rel in schema_relationships:
+        source_table_id = table_name_to_id.get(rel.source_table)
+        target_table_id = table_name_to_id.get(rel.target_table)
+
+        relationship_id = rel.id
+
+        # Add RelationshipTable DataPoint as a node.
+        node_mapping[relationship_id] = rel
+        edge_mapping.append(
+            (
+                source_table_id,
+                relationship_id,
+                "has_relationship",
+                dict(
+                    source_node_id=source_table_id,
+                    target_node_id=relationship_id,
+                    relationship_name=rel.relationship_type,
+                ),
+            )
+        )
+        edge_mapping.append(
+            (
+                relationship_id,
+                target_table_id,
+                "has_relationship",
+                dict(
+                    source_node_id=relationship_id,
+                    target_node_id=target_table_id,
+                    relationship_name=rel.relationship_type,
+                ),
+            )
+        )
+        edge_mapping.append(
+            (
+                source_table_id,
+                target_table_id,
+                rel.relationship_type,
+                dict(
+                    source_node_id=source_table_id,
+                    target_node_id=target_table_id,
+                    relationship_name=rel.relationship_type,
+                ),
+            )
+        )
+    return node_mapping, edge_mapping
+
+
+async def complete_database_ingestion(schema, migrate_column_data):
+    engine = get_migration_relational_engine()
+    # Create a mapping of node_id to node objects for referencing in edge creation
+    node_mapping = {}
+    edge_mapping = []
     async with engine.engine.begin() as cursor:
         # First, create table type nodes for all tables
         for table_name, details in schema.items():
@@ -38,7 +164,7 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
             table_node = TableType(
                 id=uuid5(NAMESPACE_OID, name=table_name),
                 name=table_name,
-                description=f
+                description=f'Relational database table with the following name: "{table_name}".',
             )

             # Add TableType node to mapping ( node will be added to the graph later based on this mapping )
@@ -75,7 +201,7 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
                     name=node_id,
                     is_a=table_node,
                     properties=str(row_properties),
-                    description=f
+                    description=f'Row in relational database table from the table with the name: "{table_name}" with the following row data {str(row_properties)} where the dictionary key value is the column name and the value is the column value. This row has the id of: {node_id}',
                 )

                 # Store the node object in our mapping
@@ -113,7 +239,7 @@ async def migrate_relational_database(graph_db, schema, migrate_column_data=True
                         id=uuid5(NAMESPACE_OID, name=column_node_id),
                         name=column_node_id,
                         properties=f"{key} {value} {table_name}",
-                        description=f"Column name={key} and value={value}
+                        description=f"column from relational database table={table_name}. Column name={key} and value={value}. The value of the column is related to the following row with this id: {row_node.id}. This column has the following ID: {column_node_id}",
                     )
                     node_mapping[column_node_id] = column_node

@@ -180,39 +306,4 @@
                     ),
                 )
             )
-
-    def _remove_duplicate_edges(edge_mapping):
-        seen = set()
-        unique_original_shape = []
-
-        for tup in edge_mapping:
-            # We go through all the tuples in the edge_mapping and we only add unique tuples to the list
-            # To eliminate duplicate edges.
-            source_id, target_id, rel_name, rel_dict = tup
-            # We need to convert the dictionary to a frozenset to be able to compare values for it
-            rel_dict_hashable = frozenset(sorted(rel_dict.items()))
-            hashable_tup = (source_id, target_id, rel_name, rel_dict_hashable)
-
-            # We use the seen set to keep track of unique edges
-            if hashable_tup not in seen:
-                # A list that has frozensets elements instead of dictionaries is needed to be able to compare values
-                seen.add(hashable_tup)
-                # append the original tuple shape (with the dictionary) if it's the first time we see it
-                unique_original_shape.append(tup)
-
-        return unique_original_shape
-
-    # Add all nodes and edges to the graph
-    # NOTE: Nodes and edges have to be added in batch for speed optimization, Especially for NetworkX.
-    # If we'd create nodes and add them to graph in real time the process would take too long.
-    # Every node and edge added to NetworkX is saved to file which is very slow when not done in batches.
-    await graph_db.add_nodes(list(node_mapping.values()))
-    await graph_db.add_edges(_remove_duplicate_edges(edge_mapping))
-
-    # In these steps we calculate the vector embeddings of our nodes and edges and save them to vector database
-    # Cognee uses this information to perform searches on the knowledge graph.
-    await index_data_points(list(node_mapping.values()))
-    await index_graph_edges()
-
-    logger.info("Data successfully migrated from relational database to desired graph database.")
-    return await graph_db.get_graph_data()
+    return node_mapping, edge_mapping
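The relocated _remove_duplicate_edges helper works around dicts being unhashable: each edge's property dict is frozen into a frozenset of its items so the full 4-tuple can be stored in a set, while the original tuple shape (with the dict) is what gets kept. A standalone illustration of the same trick:

    edges = [
        ("users", "orders", "foreign_key", {"relationship_name": "foreign_key"}),
        ("users", "orders", "foreign_key", {"relationship_name": "foreign_key"}),  # exact duplicate
    ]

    seen, unique = set(), []
    for source_id, target_id, rel_name, rel_dict in edges:
        # frozenset makes the unhashable dict usable as part of a set member
        key = (source_id, target_id, rel_name, frozenset(rel_dict.items()))
        if key not in seen:
            seen.add(key)
            unique.append((source_id, target_id, rel_name, rel_dict))

    assert len(unique) == 1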
cognee/tasks/ingestion/resolve_data_directories.py

@@ -32,7 +32,10 @@ async def resolve_data_directories(
         import s3fs

         fs = s3fs.S3FileSystem(
-            key=s3_config.aws_access_key_id,
+            key=s3_config.aws_access_key_id,
+            secret=s3_config.aws_secret_access_key,
+            token=s3_config.aws_session_token,
+            anon=False,
         )

         for item in data:
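The fix above matters for temporary credentials: previously only the access key id was forwarded, leaving the secret and the STS session token to ambient credential resolution. A sketch of the complete wiring; the get_s3_config accessor name is an assumption based on the module layout in the file list:

    import s3fs

    from cognee.infrastructure.files.storage.s3_config import get_s3_config  # accessor name assumed

    s3_config = get_s3_config()
    fs = s3fs.S3FileSystem(
        key=s3_config.aws_access_key_id,
        secret=s3_config.aws_secret_access_key,
        token=s3_config.aws_session_token,  # required for temporary STS credentials
        anon=False,  # explicitly disable anonymous access
    )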
cognee/tasks/schema/ingest_database_schema.py (new file)

@@ -0,0 +1,134 @@
+import json
+from typing import List, Dict
+from uuid import uuid5, NAMESPACE_OID
+from cognee.infrastructure.engine.models.DataPoint import DataPoint
+from sqlalchemy import text
+from cognee.tasks.schema.models import DatabaseSchema, SchemaTable, SchemaRelationship
+from cognee.infrastructure.databases.relational.get_migration_relational_engine import (
+    get_migration_relational_engine,
+)
+from cognee.infrastructure.databases.relational.config import get_migration_config
+from datetime import datetime, timezone
+
+
+async def ingest_database_schema(
+    schema,
+    max_sample_rows: int = 0,
+) -> Dict[str, List[DataPoint] | DataPoint]:
+    """
+    Extract database schema metadata (optionally with sample data) and return DataPoint models for graph construction.
+
+    Args:
+        schema: Database schema
+        max_sample_rows: Maximum sample rows per table (0 means no sampling)
+
+    Returns:
+        Dict with keys:
+            "database_schema": DatabaseSchema
+            "schema_tables": List[SchemaTable]
+            "relationships": List[SchemaRelationship]
+    """
+
+    tables = {}
+    sample_data = {}
+    schema_tables = []
+    schema_relationships = []
+
+    migration_config = get_migration_config()
+    engine = get_migration_relational_engine()
+    qi = engine.engine.dialect.identifier_preparer.quote
+    try:
+        max_sample_rows = max(0, int(max_sample_rows))
+    except (TypeError, ValueError):
+        max_sample_rows = 0
+
+    def qname(name: str):
+        split_name = name.split(".")
+        return ".".join(qi(p) for p in split_name)
+
+    async with engine.engine.begin() as cursor:
+        for table_name, details in schema.items():
+            tn = qname(table_name)
+            if max_sample_rows > 0:
+                rows_result = await cursor.execute(
+                    text(f"SELECT * FROM {tn} LIMIT :limit;"),  # noqa: S608 - tn is fully quoted
+                    {"limit": max_sample_rows},
+                )
+                rows = [dict(r) for r in rows_result.mappings().all()]
+            else:
+                rows = []
+
+            if engine.engine.dialect.name == "postgresql":
+                if "." in table_name:
+                    schema_part, table_part = table_name.split(".", 1)
+                else:
+                    schema_part, table_part = "public", table_name
+                estimate = await cursor.execute(
+                    text(
+                        "SELECT reltuples::bigint AS estimate "
+                        "FROM pg_class c "
+                        "JOIN pg_namespace n ON n.oid = c.relnamespace "
+                        "WHERE n.nspname = :schema AND c.relname = :table"
+                    ),
+                    {"schema": schema_part, "table": table_part},
+                )
+                row_count_estimate = estimate.scalar() or 0
+            else:
+                count_result = await cursor.execute(text(f"SELECT COUNT(*) FROM {tn};"))  # noqa: S608 - tn is fully quoted
+                row_count_estimate = count_result.scalar()
+
+            schema_table = SchemaTable(
+                id=uuid5(NAMESPACE_OID, name=f"{table_name}"),
+                name=table_name,
+                columns=json.dumps(details["columns"], default=str),
+                primary_key=details.get("primary_key"),
+                foreign_keys=json.dumps(details.get("foreign_keys", []), default=str),
+                sample_rows=json.dumps(rows, default=str),
+                row_count_estimate=row_count_estimate,
+                description=f"Relational database table with '{table_name}' with {len(details['columns'])} columns and approx. {row_count_estimate} rows."
+                f"Here are the columns this table contains: {details['columns']}"
+                f"Here are a few sample_rows to show the contents of the table: {rows}"
+                f"Table is part of the database: {migration_config.migration_db_name}",
+            )
+            schema_tables.append(schema_table)
+            tables[table_name] = details
+            sample_data[table_name] = rows
+
+            for fk in details.get("foreign_keys", []):
+                ref_table_fq = fk["ref_table"]
+                if "." not in ref_table_fq and "." in table_name:
+                    ref_table_fq = f"{table_name.split('.', 1)[0]}.{ref_table_fq}"
+
+                relationship_name = (
+                    f"{table_name}:{fk['column']}->{ref_table_fq}:{fk['ref_column']}"
+                )
+                relationship = SchemaRelationship(
+                    id=uuid5(NAMESPACE_OID, name=relationship_name),
+                    name=relationship_name,
+                    source_table=table_name,
+                    target_table=ref_table_fq,
+                    relationship_type="foreign_key",
+                    source_column=fk["column"],
+                    target_column=fk["ref_column"],
+                    description=f"Relational database table foreign key relationship between: {table_name}.{fk['column']} → {ref_table_fq}.{fk['ref_column']}"
+                    f"This foreing key relationship between table columns is a part of the following database: {migration_config.migration_db_name}",
+                )
+                schema_relationships.append(relationship)
+
+    id_str = f"{migration_config.migration_db_provider}:{migration_config.migration_db_name}"
+    database_schema = DatabaseSchema(
+        id=uuid5(NAMESPACE_OID, name=id_str),
+        name=migration_config.migration_db_name,
+        database_type=migration_config.migration_db_provider,
+        tables=json.dumps(tables, default=str),
+        sample_data=json.dumps(sample_data, default=str),
+        description=f"Database schema containing {len(schema_tables)} tables and {len(schema_relationships)} relationships. "
+        f"The database type is {migration_config.migration_db_provider}."
+        f"The database contains the following tables: {tables}",
+    )
+
+    return {
+        "database_schema": database_schema,
+        "schema_tables": schema_tables,
+        "relationships": schema_relationships,
+    }
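A usage sketch for the new task, assuming only what the diff shows: `schema` is the mapping the migration engine extracts (table name to column and foreign-key details), and the returned DataPoints are what schema_only_ingestion above turns into graph nodes and edges:

    from cognee.tasks.schema.ingest_database_schema import ingest_database_schema


    async def build_schema_graph(schema):
        # max_sample_rows=5 mirrors the call made by schema_only_ingestion.
        result = await ingest_database_schema(schema=schema, max_sample_rows=5)
        return (
            result["database_schema"],   # one DatabaseSchema DataPoint for the whole DB
            result["schema_tables"],     # a SchemaTable DataPoint per table
            result["relationships"],     # a SchemaRelationship DataPoint per foreign key
        )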