cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223)
  1. cognee/api/client.py +44 -4
  2. cognee/api/health.py +332 -0
  3. cognee/api/v1/add/add.py +5 -2
  4. cognee/api/v1/add/routers/get_add_router.py +3 -0
  5. cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
  6. cognee/api/v1/cognify/cognify.py +8 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
  8. cognee/api/v1/config/config.py +3 -1
  9. cognee/api/v1/datasets/routers/get_datasets_router.py +2 -8
  10. cognee/api/v1/delete/delete.py +16 -12
  11. cognee/api/v1/responses/routers/get_responses_router.py +3 -1
  12. cognee/api/v1/search/search.py +10 -0
  13. cognee/api/v1/settings/routers/get_settings_router.py +0 -2
  14. cognee/base_config.py +1 -0
  15. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
  16. cognee/infrastructure/databases/graph/config.py +2 -0
  17. cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
  18. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
  19. cognee/infrastructure/databases/graph/kuzu/adapter.py +43 -16
  20. cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
  21. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +151 -77
  22. cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
  23. cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
  24. cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
  25. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
  26. cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
  27. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
  28. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +11 -3
  29. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
  30. cognee/infrastructure/databases/vector/create_vector_engine.py +31 -23
  31. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
  32. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
  33. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
  34. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
  35. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
  36. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
  37. cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
  38. cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
  39. cognee/infrastructure/files/utils/guess_file_type.py +2 -2
  40. cognee/infrastructure/files/utils/open_data_file.py +4 -23
  41. cognee/infrastructure/llm/LLMGateway.py +137 -0
  42. cognee/infrastructure/llm/__init__.py +14 -4
  43. cognee/infrastructure/llm/config.py +29 -1
  44. cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
  45. cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
  46. cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
  47. cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
  48. cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
  49. cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
  50. cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
  51. cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
  52. cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
  53. cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
  54. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
  55. cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
  65. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
  66. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
  67. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
  68. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
  69. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
  70. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
  71. cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
  72. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
  73. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
  74. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
  75. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
  76. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
  77. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
  78. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
  79. cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
  80. cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
  81. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
  82. cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
  83. cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
  84. cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
  85. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
  86. cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
  87. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
  88. cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
  89. cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
  90. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
  91. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
  92. cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
  93. cognee/infrastructure/llm/utils.py +3 -1
  94. cognee/infrastructure/loaders/LoaderEngine.py +156 -0
  95. cognee/infrastructure/loaders/LoaderInterface.py +73 -0
  96. cognee/infrastructure/loaders/__init__.py +18 -0
  97. cognee/infrastructure/loaders/core/__init__.py +7 -0
  98. cognee/infrastructure/loaders/core/audio_loader.py +98 -0
  99. cognee/infrastructure/loaders/core/image_loader.py +114 -0
  100. cognee/infrastructure/loaders/core/text_loader.py +90 -0
  101. cognee/infrastructure/loaders/create_loader_engine.py +32 -0
  102. cognee/infrastructure/loaders/external/__init__.py +22 -0
  103. cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
  104. cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
  105. cognee/infrastructure/loaders/get_loader_engine.py +18 -0
  106. cognee/infrastructure/loaders/supported_loaders.py +18 -0
  107. cognee/infrastructure/loaders/use_loader.py +21 -0
  108. cognee/infrastructure/loaders/utils/__init__.py +0 -0
  109. cognee/modules/data/methods/__init__.py +1 -0
  110. cognee/modules/data/methods/get_authorized_dataset.py +23 -0
  111. cognee/modules/data/models/Data.py +13 -3
  112. cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
  113. cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
  114. cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
  115. cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
  116. cognee/modules/engine/utils/generate_edge_id.py +5 -0
  117. cognee/modules/graph/cognee_graph/CogneeGraph.py +45 -35
  118. cognee/modules/graph/methods/get_formatted_graph_data.py +8 -2
  119. cognee/modules/graph/utils/get_graph_from_model.py +93 -101
  120. cognee/modules/ingestion/data_types/TextData.py +8 -2
  121. cognee/modules/ingestion/save_data_to_file.py +1 -1
  122. cognee/modules/pipelines/exceptions/__init__.py +1 -0
  123. cognee/modules/pipelines/exceptions/exceptions.py +12 -0
  124. cognee/modules/pipelines/models/DataItemStatus.py +5 -0
  125. cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
  126. cognee/modules/pipelines/models/__init__.py +1 -0
  127. cognee/modules/pipelines/operations/pipeline.py +10 -2
  128. cognee/modules/pipelines/operations/run_tasks.py +252 -20
  129. cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
  130. cognee/modules/retrieval/chunks_retriever.py +23 -1
  131. cognee/modules/retrieval/code_retriever.py +66 -9
  132. cognee/modules/retrieval/completion_retriever.py +11 -9
  133. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
  134. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
  135. cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
  136. cognee/modules/retrieval/graph_completion_retriever.py +1 -1
  137. cognee/modules/retrieval/insights_retriever.py +4 -0
  138. cognee/modules/retrieval/natural_language_retriever.py +9 -15
  139. cognee/modules/retrieval/summaries_retriever.py +23 -1
  140. cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
  141. cognee/modules/retrieval/utils/completion.py +6 -9
  142. cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
  143. cognee/modules/search/methods/search.py +5 -1
  144. cognee/modules/search/operations/__init__.py +1 -0
  145. cognee/modules/search/operations/select_search_type.py +42 -0
  146. cognee/modules/search/types/SearchType.py +1 -0
  147. cognee/modules/settings/get_settings.py +0 -8
  148. cognee/modules/settings/save_vector_db_config.py +1 -1
  149. cognee/shared/data_models.py +3 -1
  150. cognee/shared/logging_utils.py +0 -5
  151. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  152. cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
  153. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
  154. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
  155. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
  156. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
  157. cognee/tasks/graph/extract_graph_from_code.py +3 -2
  158. cognee/tasks/graph/extract_graph_from_data.py +4 -3
  159. cognee/tasks/graph/infer_data_ontology.py +5 -6
  160. cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
  161. cognee/tasks/ingestion/ingest_data.py +91 -61
  162. cognee/tasks/ingestion/resolve_data_directories.py +3 -0
  163. cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
  164. cognee/tasks/storage/index_data_points.py +1 -1
  165. cognee/tasks/storage/index_graph_edges.py +4 -1
  166. cognee/tasks/summarization/summarize_code.py +2 -3
  167. cognee/tasks/summarization/summarize_text.py +3 -2
  168. cognee/tests/test_cognee_server_start.py +12 -7
  169. cognee/tests/test_deduplication.py +2 -2
  170. cognee/tests/test_deletion.py +58 -17
  171. cognee/tests/test_graph_visualization_permissions.py +161 -0
  172. cognee/tests/test_neptune_analytics_graph.py +309 -0
  173. cognee/tests/test_neptune_analytics_hybrid.py +176 -0
  174. cognee/tests/{test_weaviate.py → test_neptune_analytics_vector.py} +86 -11
  175. cognee/tests/test_pgvector.py +5 -5
  176. cognee/tests/test_s3.py +1 -6
  177. cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
  178. cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
  179. cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
  180. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
  181. cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
  182. cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
  183. cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
  184. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
  185. cognee/tests/unit/modules/search/search_methods_test.py +55 -0
  186. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/METADATA +13 -9
  187. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/RECORD +203 -164
  188. cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
  189. cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
  190. cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
  191. cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
  192. cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
  193. cognee/modules/data/extraction/extract_categories.py +0 -14
  194. cognee/tests/test_qdrant.py +0 -99
  195. distributed/Dockerfile +0 -34
  196. distributed/app.py +0 -4
  197. distributed/entrypoint.py +0 -71
  198. distributed/entrypoint.sh +0 -5
  199. distributed/modal_image.py +0 -11
  200. distributed/queues.py +0 -5
  201. distributed/tasks/queued_add_data_points.py +0 -13
  202. distributed/tasks/queued_add_edges.py +0 -13
  203. distributed/tasks/queued_add_nodes.py +0 -13
  204. distributed/test.py +0 -28
  205. distributed/utils.py +0 -19
  206. distributed/workers/data_point_saving_worker.py +0 -93
  207. distributed/workers/graph_saving_worker.py +0 -104
  208. /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
  209. /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
  210. /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
  211. /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
  212. /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
  213. /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
  214. /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
  215. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
  216. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
  217. /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
  218. {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
  219. {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
  220. /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
  221. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/WHEEL +0 -0
  222. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
  223. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/NOTICE.md +0 -0
@@ -0,0 +1,281 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Kuzu Database Migration Script
4
+
5
+ This script migrates Kuzu databases between different versions by:
6
+ 1. Setting up isolated Python environments for each Kuzu version
7
+ 2. Exporting data from the source database using the old version
8
+ 3. Importing data into the target database using the new version
9
+ 4. If overwrite is enabled target database will replace source database and source database will have the prefix _old
10
+ 5. If delete-old is enabled target database will be renamed to source database and source database will be deleted
11
+
12
+ The script automatically handles:
13
+ - Environment setup (creates virtual environments as needed)
14
+ - Export/import validation
15
+ - Error handling and reporting
16
+
17
+ Usage Examples:
18
+ # Basic migration from 0.9.0 to 0.11.0
19
+ python kuzu_migrate.py --old-version 0.9.0 --new-version 0.11.0 --old-db /path/to/old/database --new-db /path/to/new/database
20
+
21
+ Requirements:
22
+ - Python 3.7+
23
+ - Internet connection (to download Kuzu packages)
24
+ - Sufficient disk space for virtual environments and temporary exports
25
+
26
+ Notes:
27
+ - Can only be used to migrate to newer Kuzu versions, from 0.11.0 onwards
28
+ """
29
+
30
+ import tempfile
31
+ import sys
32
+ import struct
33
+ import shutil
34
+ import subprocess
35
+ import argparse
36
+ import os
37
+
38
+
39
# Maps the on-disk storage version code (read from catalog.kz) to the
# Kuzu release that produced it.
kuzu_version_mapping = {
    34: "0.7.0",
    35: "0.7.1",
    36: "0.8.2",
    37: "0.9.0",
    38: "0.10.1",
    39: "0.11.0",
}


def read_kuzu_storage_version(kuzu_db_path: str) -> str:
    """
    Read the Kùzu storage version code from the catalog file header and map it
    to the matching Kuzu release string.

    :param kuzu_db_path: Path to the Kuzu database file or directory.
    :return: Kuzu version string (e.g. "0.11.0") for the storage code found.
    :raises FileNotFoundError: If a directory database has no catalog.kz file.
    :raises ValueError: If the header is truncated or the code is unknown.
    """
    if os.path.isdir(kuzu_db_path):
        kuzu_version_file_path = os.path.join(kuzu_db_path, "catalog.kz")
        if not os.path.isfile(kuzu_version_file_path):
            # A missing catalog is a "not found" condition (the original raised
            # FileExistsError here, which means the opposite).
            raise FileNotFoundError("Kuzu catalog.kz file does not exist")
    else:
        kuzu_version_file_path = kuzu_db_path

    with open(kuzu_version_file_path, "rb") as f:
        # Skip the 3-byte magic "KUZ" and one byte of padding
        f.seek(4)
        # Read the next 8 bytes as a little-endian unsigned 64-bit integer
        data = f.read(8)
        if len(data) < 8:
            raise ValueError(
                f"File '{kuzu_version_file_path}' does not contain a storage version code."
            )
        version_code = struct.unpack("<Q", data)[0]

    if version_code in kuzu_version_mapping:
        return kuzu_version_mapping[version_code]
    raise ValueError("Could not map version_code to proper Kuzu version.")
78
+
79
+
80
def ensure_env(version: str, export_dir: str) -> str:
    """
    Create a fresh venv at {export_dir}/.kuzu_envs/{version} and install kuzu=={version}.

    Any pre-existing environment for this version is removed first, so the
    install always starts from a clean state.

    :param version: Kuzu package version to install (e.g. "0.11.0").
    :param export_dir: Working directory under which the venv is created.
    :return: Path to the venv's python executable.
    """
    # Use temp directory to create venv
    kuzu_envs_dir = os.path.join(export_dir, ".kuzu_envs")

    # venv base under the working directory
    base = os.path.join(kuzu_envs_dir, version)
    # Windows venvs place the interpreter under Scripts\, POSIX under bin/.
    bin_dir = "Scripts" if os.name == "nt" else "bin"
    py_bin = os.path.join(base, bin_dir, "python")
    # If environment already exists, remove it so we start from a clean state
    if os.path.isfile(py_bin):
        shutil.rmtree(base)

    print(f"→ Setting up venv for Kùzu {version}...", file=sys.stderr)
    # Create venv
    # NOTE: Running python in debug mode can cause issues with creating a virtual
    # environment from that python instance
    subprocess.run([sys.executable, "-m", "venv", base], check=True)
    # Install the specific Kùzu version
    subprocess.run([py_bin, "-m", "pip", "install", "--upgrade", "pip"], check=True)
    subprocess.run([py_bin, "-m", "pip", "install", f"kuzu=={version}"], check=True)
    return py_bin
103
+
104
+
105
def run_migration_step(python_exe: str, db_path: str, cypher: str):
    """
    Execute a single Cypher command against a Kùzu database using the given
    interpreter (one of the version-specific venv pythons).

    The database path and Cypher statement are passed as argv entries instead of
    being interpolated into the generated snippet, so values containing quotes
    cannot break (or inject into) the child script.

    Exits the current process with the child's return code on failure.
    """
    snippet = (
        "import sys\n"
        "import kuzu\n"
        "db = kuzu.Database(sys.argv[1])\n"
        "conn = kuzu.Connection(db)\n"
        "conn.execute(sys.argv[2])\n"
    )
    proc = subprocess.run(
        [python_exe, "-c", snippet, db_path, cypher], capture_output=True, text=True
    )
    if proc.returncode != 0:
        print(f"[ERROR] {cypher} failed:\n{proc.stderr}", file=sys.stderr)
        sys.exit(proc.returncode)
120
+
121
+
122
def kuzu_migration(new_db, old_db, new_version, old_version=None, overwrite=None, delete_old=None):
    """
    Run the complete migration: set up per-version environments, export from the
    old database, import into the new one, and optionally swap the databases.

    :param new_db: Target database path; must not already exist.
    :param old_db: Source database path (file or directory).
    :param new_version: Kuzu version to migrate to (e.g. "0.11.0").
    :param old_version: Source Kuzu version; auto-detected from the database when omitted.
    :param overwrite: When true, move new_db to old_db's location, keeping old_db as backup.
    :param delete_old: When true, move new_db to old_db's location and delete old_db.
    :raises FileExistsError: If a file already exists at new_db.
    :raises ValueError: If the export did not produce a usable schema file.
    """
    print(f"🔄 Migrating Kuzu database from {old_version} to {new_version}", file=sys.stderr)
    print(f"📂 Source: {old_db}", file=sys.stderr)
    print("", file=sys.stderr)

    # Check existence first so a missing source produces the friendly message
    # below instead of an open() error from version detection.
    if not os.path.exists(old_db):
        print(f"Source database '{old_db}' does not exist.", file=sys.stderr)
        sys.exit(1)

    # If version of old kuzu db is not provided try to determine it based on file info
    if not old_version:
        old_version = read_kuzu_storage_version(old_db)

    # Prepare target - ensure parent directory exists but refuse to clobber an existing target
    parent_dir = os.path.dirname(new_db)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if os.path.exists(new_db):
        raise FileExistsError(
            "File already exists at new database location, remove file or change new database file path to continue"
        )

    # Use temp directory for all processing, it will be cleaned up after with statement
    with tempfile.TemporaryDirectory() as export_dir:
        # Set up environments
        print(f"Setting up Kuzu {old_version} environment...", file=sys.stderr)
        old_py = ensure_env(old_version, export_dir)
        print(f"Setting up Kuzu {new_version} environment...", file=sys.stderr)
        new_py = ensure_env(new_version, export_dir)

        export_file = os.path.join(export_dir, "kuzu_export")
        print(f"Exporting old DB → {export_dir}", file=sys.stderr)
        run_migration_step(old_py, old_db, f"EXPORT DATABASE '{export_file}'")
        print("Export complete.", file=sys.stderr)

        # Check if export files were created and have content
        schema_file = os.path.join(export_file, "schema.cypher")
        if not os.path.exists(schema_file) or os.path.getsize(schema_file) == 0:
            raise ValueError(f"Schema file not found: {schema_file}")

        print(f"Importing into new DB at {new_db}", file=sys.stderr)
        run_migration_step(new_py, new_db, f"IMPORT DATABASE '{export_file}'")
        print("Import complete.", file=sys.stderr)

        # Rename new kuzu database to old kuzu database name if enabled
        if overwrite or delete_old:
            # Remove kuzu lock from migrated DB
            lock_file = new_db + ".lock"
            if os.path.exists(lock_file):
                os.remove(lock_file)
            rename_databases(old_db, old_version, new_db, delete_old)

    print("✅ Kuzu graph database migration finished successfully!")
180
+
181
+
182
def rename_databases(old_db: str, old_version: str, new_db: str, delete_old: bool):
    """
    When overwrite is enabled, back up the original old_db (file with .lock and .wal or directory)
    by renaming it to *_old, and replace it with the newly imported new_db files.

    When delete_old is enabled replace the old database with the new one and delete old database
    """
    base_dir = os.path.dirname(old_db)
    name = os.path.basename(old_db.rstrip(os.sep))
    # Add _old_ and version info to backup graph database
    backup_database_name = f"{name}_old_" + old_version.replace(".", "_")
    backup_base = os.path.join(base_dir, backup_database_name)

    if os.path.isfile(old_db):
        # File-based database: handle main file and accompanying lock/WAL
        # NOTE(review): only "" and ".wal" suffixes are processed here although
        # the docstring mentions .lock — confirm whether a leftover .lock file
        # should also be removed/renamed.
        for ext in ["", ".wal"]:
            src = old_db + ext
            dst = backup_base + ext
            if os.path.exists(src):
                if delete_old:
                    os.remove(src)
                else:
                    os.rename(src, dst)
                    print(f"Renamed '{src}' to '{dst}'", file=sys.stderr)
    elif os.path.isdir(old_db):
        # Directory-based Kuzu database: back up (or delete) the whole directory.
        backup_dir = backup_base
        if delete_old:
            shutil.rmtree(old_db)
        else:
            os.rename(old_db, backup_dir)
            print(f"Renamed directory '{old_db}' to '{backup_dir}'", file=sys.stderr)
    else:
        # Nothing to replace: the source database disappeared between import and rename.
        print(f"Original database path '{old_db}' not found for renaming.", file=sys.stderr)
        sys.exit(1)

    # Now move new files into place (os.rename also moves a directory-based DB,
    # since the bare "" suffix matches the directory path itself)
    for ext in ["", ".wal"]:
        src_new = new_db + ext
        dst_new = os.path.join(base_dir, name + ext)
        if os.path.exists(src_new):
            os.rename(src_new, dst_new)
            print(f"Renamed '{src_new}' to '{dst_new}'", file=sys.stderr)
225
+
226
+
227
def main():
    """Command-line entry point: define the CLI, parse argv, and run the migration."""
    parser = argparse.ArgumentParser(
        description="Migrate Kùzu DB via PyPI versions",
        epilog="""
Examples:
  %(prog)s --old-version 0.9.0 --new-version 0.11.0 \\
      --old-db /path/to/old/db --new-db /path/to/new/db --overwrite

Note: This script will create temporary virtual environments in .kuzu_envs/ directory
to isolate different Kuzu versions.
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Optional: when omitted, the source version is auto-detected from the DB header.
    parser.add_argument(
        "--old-version",
        default=None,
        help="Source Kuzu version (e.g., 0.9.0). If not provided automatic kuzu version detection will be attempted.",
    )
    parser.add_argument("--new-version", required=True, help="Target Kuzu version (e.g., 0.11.0)")
    parser.add_argument("--old-db", required=True, help="Path to source database directory")
    parser.add_argument(
        "--new-db",
        required=True,
        help="Path to target database directory, it can't be the same path as the old database. Use the overwrite flag if you want to replace the old database with the new one.",
    )
    # Flags controlling what happens to the source database after a successful import.
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Rename new-db to the old-db name and location, keeps old-db as backup if delete-old is not True",
    )
    parser.add_argument(
        "--delete-old",
        action="store_true",
        help="When overwrite and delete-old is True old-db will not be stored as backup",
    )

    opts = parser.parse_args()

    kuzu_migration(
        new_db=opts.new_db,
        old_db=opts.old_db,
        new_version=opts.new_version,
        old_version=opts.old_version,
        overwrite=opts.overwrite,
        delete_old=opts.delete_old,
    )
278
+
279
+
280
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
@@ -33,7 +33,7 @@ from .neo4j_metrics_utils import (
33
33
  from .deadlock_retry import deadlock_retry
34
34
 
35
35
 
36
- logger = get_logger("Neo4jAdapter", level=ERROR)
36
+ logger = get_logger("Neo4jAdapter")
37
37
 
38
38
  BASE_LABEL = "__Node__"
39
39
 
@@ -50,6 +50,7 @@ class Neo4jAdapter(GraphDBInterface):
50
50
  graph_database_url: str,
51
51
  graph_database_username: Optional[str] = None,
52
52
  graph_database_password: Optional[str] = None,
53
+ graph_database_name: Optional[str] = None,
53
54
  driver: Optional[Any] = None,
54
55
  ):
55
56
  # Only use auth if both username and password are provided
@@ -59,7 +60,7 @@ class Neo4jAdapter(GraphDBInterface):
59
60
  elif graph_database_username or graph_database_password:
60
61
  logger = get_logger(__name__)
61
62
  logger.warning("Neo4j credentials incomplete – falling back to anonymous connection.")
62
-
63
+ self.graph_database_name = graph_database_name
63
64
  self.driver = driver or AsyncGraphDatabase.driver(
64
65
  graph_database_url,
65
66
  auth=auth,
@@ -80,7 +81,7 @@ class Neo4jAdapter(GraphDBInterface):
80
81
  """
81
82
  Get a session for database operations.
82
83
  """
83
- async with self.driver.session() as session:
84
+ async with self.driver.session(database=self.graph_database_name) as session:
84
85
  yield session
85
86
 
86
87
  @deadlock_retry()
@@ -410,6 +411,38 @@ class Neo4jAdapter(GraphDBInterface):
410
411
 
411
412
  return await self.query(query, params)
412
413
 
414
+ def _flatten_edge_properties(self, properties: Dict[str, Any]) -> Dict[str, Any]:
415
+ """
416
+ Flatten edge properties to handle nested dictionaries like weights.
417
+
418
+ Neo4j doesn't support nested dictionaries as property values, so we need to
419
+ flatten the 'weights' dictionary into individual properties with prefixes.
420
+
421
+ Args:
422
+ properties: Dictionary of edge properties that may contain nested dicts
423
+
424
+ Returns:
425
+ Flattened properties dictionary suitable for Neo4j storage
426
+ """
427
+ flattened = {}
428
+
429
+ for key, value in properties.items():
430
+ if key == "weights" and isinstance(value, dict):
431
+ # Flatten weights dictionary into individual properties
432
+ for weight_name, weight_value in value.items():
433
+ flattened[f"weight_{weight_name}"] = weight_value
434
+ elif isinstance(value, dict):
435
+ # For other nested dictionaries, serialize as JSON string
436
+ flattened[f"{key}_json"] = json.dumps(value, cls=JSONEncoder)
437
+ elif isinstance(value, list):
438
+ # For lists, serialize as JSON string
439
+ flattened[f"{key}_json"] = json.dumps(value, cls=JSONEncoder)
440
+ else:
441
+ # Keep primitive types as-is
442
+ flattened[key] = value
443
+
444
+ return flattened
445
+
413
446
  @record_graph_changes
414
447
  @override_distributed(queued_add_edges)
415
448
  async def add_edges(self, edges: list[tuple[str, str, str, dict[str, Any]]]) -> None:
@@ -448,11 +481,13 @@ class Neo4jAdapter(GraphDBInterface):
448
481
  "from_node": str(edge[0]),
449
482
  "to_node": str(edge[1]),
450
483
  "relationship_name": edge[2],
451
- "properties": {
452
- **(edge[3] if edge[3] else {}),
453
- "source_node_id": str(edge[0]),
454
- "target_node_id": str(edge[1]),
455
- },
484
+ "properties": self._flatten_edge_properties(
485
+ {
486
+ **(edge[3] if edge[3] else {}),
487
+ "source_node_id": str(edge[0]),
488
+ "target_node_id": str(edge[1]),
489
+ }
490
+ ),
456
491
  }
457
492
  for edge in edges
458
493
  ]
@@ -870,34 +905,52 @@ class Neo4jAdapter(GraphDBInterface):
870
905
 
871
906
  A tuple containing two lists: nodes and edges with their properties.
872
907
  """
873
- query = "MATCH (n) RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties"
908
+ import time
874
909
 
875
- result = await self.query(query)
910
+ start_time = time.time()
876
911
 
877
- nodes = [
878
- (
879
- record["properties"]["id"],
880
- record["properties"],
912
+ try:
913
+ # Retrieve nodes
914
+ query = "MATCH (n) RETURN ID(n) AS id, labels(n) AS labels, properties(n) AS properties"
915
+ result = await self.query(query)
916
+
917
+ nodes = []
918
+ for record in result:
919
+ nodes.append(
920
+ (
921
+ record["properties"]["id"],
922
+ record["properties"],
923
+ )
924
+ )
925
+
926
+ # Retrieve edges
927
+ query = """
928
+ MATCH (n)-[r]->(m)
929
+ RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties
930
+ """
931
+ result = await self.query(query)
932
+
933
+ edges = []
934
+ for record in result:
935
+ edges.append(
936
+ (
937
+ record["properties"]["source_node_id"],
938
+ record["properties"]["target_node_id"],
939
+ record["type"],
940
+ record["properties"],
941
+ )
942
+ )
943
+
944
+ retrieval_time = time.time() - start_time
945
+ logger.info(
946
+ f"Retrieved {len(nodes)} nodes and {len(edges)} edges in {retrieval_time:.2f} seconds"
881
947
  )
882
- for record in result
883
- ]
884
948
 
885
- query = """
886
- MATCH (n)-[r]->(m)
887
- RETURN ID(n) AS source, ID(m) AS target, TYPE(r) AS type, properties(r) AS properties
888
- """
889
- result = await self.query(query)
890
- edges = [
891
- (
892
- record["properties"]["source_node_id"],
893
- record["properties"]["target_node_id"],
894
- record["type"],
895
- record["properties"],
896
- )
897
- for record in result
898
- ]
949
+ return (nodes, edges)
899
950
 
900
- return (nodes, edges)
951
+ except Exception as e:
952
+ logger.error(f"Error during graph data retrieval: {str(e)}")
953
+ raise
901
954
 
902
955
  async def get_nodeset_subgraph(
903
956
  self, node_type: Type[Any], node_name: List[str]
@@ -918,50 +971,71 @@ class Neo4jAdapter(GraphDBInterface):
918
971
  - Tuple[List[Tuple[int, dict]], List[Tuple[int, int, str, dict]]}: A tuple
919
972
  containing nodes and edges in the requested subgraph.
920
973
  """
921
- label = node_type.__name__
974
+ import time
922
975
 
923
- query = f"""
924
- UNWIND $names AS wantedName
925
- MATCH (n:`{label}`)
926
- WHERE n.name = wantedName
927
- WITH collect(DISTINCT n) AS primary
928
- UNWIND primary AS p
929
- OPTIONAL MATCH (p)--(nbr)
930
- WITH primary, collect(DISTINCT nbr) AS nbrs
931
- WITH primary + nbrs AS nodelist
932
- UNWIND nodelist AS node
933
- WITH collect(DISTINCT node) AS nodes
934
- MATCH (a)-[r]-(b)
935
- WHERE a IN nodes AND b IN nodes
936
- WITH nodes, collect(DISTINCT r) AS rels
937
- RETURN
938
- [n IN nodes |
939
- {{ id: n.id,
940
- properties: properties(n) }}] AS rawNodes,
941
- [r IN rels |
942
- {{ type: type(r),
943
- properties: properties(r) }}] AS rawRels
944
- """
976
+ start_time = time.time()
945
977
 
946
- result = await self.query(query, {"names": node_name})
947
- if not result:
948
- return [], []
978
+ try:
979
+ label = node_type.__name__
949
980
 
950
- raw_nodes = result[0]["rawNodes"]
951
- raw_rels = result[0]["rawRels"]
981
+ query = f"""
982
+ UNWIND $names AS wantedName
983
+ MATCH (n:`{label}`)
984
+ WHERE n.name = wantedName
985
+ WITH collect(DISTINCT n) AS primary
986
+ UNWIND primary AS p
987
+ OPTIONAL MATCH (p)--(nbr)
988
+ WITH primary, collect(DISTINCT nbr) AS nbrs
989
+ WITH primary + nbrs AS nodelist
990
+ UNWIND nodelist AS node
991
+ WITH collect(DISTINCT node) AS nodes
992
+ MATCH (a)-[r]-(b)
993
+ WHERE a IN nodes AND b IN nodes
994
+ WITH nodes, collect(DISTINCT r) AS rels
995
+ RETURN
996
+ [n IN nodes |
997
+ {{ id: n.id,
998
+ properties: properties(n) }}] AS rawNodes,
999
+ [r IN rels |
1000
+ {{ type: type(r),
1001
+ properties: properties(r) }}] AS rawRels
1002
+ """
952
1003
 
953
- nodes = [(n["properties"]["id"], n["properties"]) for n in raw_nodes]
954
- edges = [
955
- (
956
- r["properties"]["source_node_id"],
957
- r["properties"]["target_node_id"],
958
- r["type"],
959
- r["properties"],
1004
+ result = await self.query(query, {"names": node_name})
1005
+
1006
+ if not result:
1007
+ return [], []
1008
+
1009
+ raw_nodes = result[0]["rawNodes"]
1010
+ raw_rels = result[0]["rawRels"]
1011
+
1012
+ # Process nodes
1013
+ nodes = []
1014
+ for n in raw_nodes:
1015
+ nodes.append((n["properties"]["id"], n["properties"]))
1016
+
1017
+ # Process edges
1018
+ edges = []
1019
+ for r in raw_rels:
1020
+ edges.append(
1021
+ (
1022
+ r["properties"]["source_node_id"],
1023
+ r["properties"]["target_node_id"],
1024
+ r["type"],
1025
+ r["properties"],
1026
+ )
1027
+ )
1028
+
1029
+ retrieval_time = time.time() - start_time
1030
+ logger.info(
1031
+ f"Retrieved {len(nodes)} nodes and {len(edges)} edges for {node_type.__name__} in {retrieval_time:.2f} seconds"
960
1032
  )
961
- for r in raw_rels
962
- ]
963
1033
 
964
- return nodes, edges
1034
+ return nodes, edges
1035
+
1036
+ except Exception as e:
1037
+ logger.error(f"Error during nodeset subgraph retrieval: {str(e)}")
1038
+ raise
965
1039
 
966
1040
  async def get_filtered_graph_data(self, attribute_filters):
967
1041
  """
@@ -1011,8 +1085,8 @@ class Neo4jAdapter(GraphDBInterface):
1011
1085
 
1012
1086
  edges = [
1013
1087
  (
1014
- record["source"],
1015
- record["target"],
1088
+ record["properties"]["source_node_id"],
1089
+ record["properties"]["target_node_id"],
1016
1090
  record["type"],
1017
1091
  record["properties"],
1018
1092
  )
@@ -1178,7 +1252,7 @@ class Neo4jAdapter(GraphDBInterface):
1178
1252
 
1179
1253
  return mandatory_metrics | optional_metrics
1180
1254
 
1181
- async def get_document_subgraph(self, content_hash: str):
1255
+ async def get_document_subgraph(self, data_id: str):
1182
1256
  """
1183
1257
  Retrieve a subgraph related to a document identified by its data id, including
1184
1258
  related entities and chunks.
@@ -1196,21 +1270,21 @@ class Neo4jAdapter(GraphDBInterface):
1196
1270
  """
1197
1271
  query = """
1198
1272
  MATCH (doc)
1199
- WHERE (doc:TextDocument OR doc:PdfDocument)
1200
- AND doc.name = 'text_' + $content_hash
1273
+ WHERE (doc:TextDocument OR doc:PdfDocument OR doc:UnstructuredDocument OR doc:AudioDocument or doc:ImageDocument)
1274
+ AND doc.id = $data_id
1201
1275
 
1202
1276
  OPTIONAL MATCH (doc)<-[:is_part_of]-(chunk:DocumentChunk)
1203
1277
  OPTIONAL MATCH (chunk)-[:contains]->(entity:Entity)
1204
1278
  WHERE NOT EXISTS {
1205
1279
  MATCH (entity)<-[:contains]-(otherChunk:DocumentChunk)-[:is_part_of]->(otherDoc)
1206
- WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument)
1280
+ WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument OR otherDoc:UnstructuredDocument OR otherDoc:AudioDocument or otherDoc:ImageDocument)
1207
1281
  AND otherDoc.id <> doc.id
1208
1282
  }
1209
1283
  OPTIONAL MATCH (chunk)<-[:made_from]-(made_node:TextSummary)
1210
1284
  OPTIONAL MATCH (entity)-[:is_a]->(type:EntityType)
1211
1285
  WHERE NOT EXISTS {
1212
1286
  MATCH (type)<-[:is_a]-(otherEntity:Entity)<-[:contains]-(otherChunk:DocumentChunk)-[:is_part_of]->(otherDoc)
1213
- WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument)
1287
+ WHERE (otherDoc:TextDocument OR otherDoc:PdfDocument OR otherDoc:UnstructuredDocument OR otherDoc:AudioDocument or otherDoc:ImageDocument)
1214
1288
  AND otherDoc.id <> doc.id
1215
1289
  }
1216
1290
 
@@ -1221,7 +1295,7 @@ class Neo4jAdapter(GraphDBInterface):
1221
1295
  collect(DISTINCT made_node) as made_from_nodes,
1222
1296
  collect(DISTINCT type) as orphan_types
1223
1297
  """
1224
- result = await self.query(query, {"content_hash": content_hash})
1298
+ result = await self.query(query, {"data_id": data_id})
1225
1299
  return result[0] if result else None
1226
1300
 
1227
1301
  async def get_degree_one_nodes(self, node_type: str):
@@ -0,0 +1,15 @@
"""Neptune Analytics Driver Module

This module provides the Neptune Analytics adapter and utilities for interacting
with Amazon Neptune Analytics graph databases.
"""

# Re-export the adapter class and helper submodules at package level.
from .adapter import NeptuneGraphDB
from . import neptune_utils
from . import exceptions

# Explicit public API surface of this package.
__all__ = [
    "NeptuneGraphDB",
    "neptune_utils",
    "exceptions",
]