cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (223)
  1. cognee/api/client.py +44 -4
  2. cognee/api/health.py +332 -0
  3. cognee/api/v1/add/add.py +5 -2
  4. cognee/api/v1/add/routers/get_add_router.py +3 -0
  5. cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
  6. cognee/api/v1/cognify/cognify.py +8 -0
  7. cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
  8. cognee/api/v1/config/config.py +3 -1
  9. cognee/api/v1/datasets/routers/get_datasets_router.py +2 -8
  10. cognee/api/v1/delete/delete.py +16 -12
  11. cognee/api/v1/responses/routers/get_responses_router.py +3 -1
  12. cognee/api/v1/search/search.py +10 -0
  13. cognee/api/v1/settings/routers/get_settings_router.py +0 -2
  14. cognee/base_config.py +1 -0
  15. cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
  16. cognee/infrastructure/databases/graph/config.py +2 -0
  17. cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
  18. cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
  19. cognee/infrastructure/databases/graph/kuzu/adapter.py +43 -16
  20. cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
  21. cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +151 -77
  22. cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
  23. cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
  24. cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
  25. cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
  26. cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
  27. cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
  28. cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +11 -3
  29. cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
  30. cognee/infrastructure/databases/vector/create_vector_engine.py +31 -23
  31. cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
  32. cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
  33. cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
  34. cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
  35. cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
  36. cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
  37. cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
  38. cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
  39. cognee/infrastructure/files/utils/guess_file_type.py +2 -2
  40. cognee/infrastructure/files/utils/open_data_file.py +4 -23
  41. cognee/infrastructure/llm/LLMGateway.py +137 -0
  42. cognee/infrastructure/llm/__init__.py +14 -4
  43. cognee/infrastructure/llm/config.py +29 -1
  44. cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
  45. cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
  46. cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
  47. cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
  48. cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
  49. cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
  50. cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
  51. cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
  52. cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
  53. cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
  54. cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
  55. cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
  56. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
  57. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
  58. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
  59. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
  60. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
  61. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
  62. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
  63. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
  64. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
  65. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
  66. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
  67. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
  68. cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
  69. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
  70. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
  71. cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
  72. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
  73. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
  74. cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
  75. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
  76. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
  77. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
  78. cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
  79. cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
  80. cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
  81. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
  82. cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
  83. cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
  84. cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
  85. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
  86. cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
  87. cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
  88. cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
  89. cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
  90. cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
  91. cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
  92. cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
  93. cognee/infrastructure/llm/utils.py +3 -1
  94. cognee/infrastructure/loaders/LoaderEngine.py +156 -0
  95. cognee/infrastructure/loaders/LoaderInterface.py +73 -0
  96. cognee/infrastructure/loaders/__init__.py +18 -0
  97. cognee/infrastructure/loaders/core/__init__.py +7 -0
  98. cognee/infrastructure/loaders/core/audio_loader.py +98 -0
  99. cognee/infrastructure/loaders/core/image_loader.py +114 -0
  100. cognee/infrastructure/loaders/core/text_loader.py +90 -0
  101. cognee/infrastructure/loaders/create_loader_engine.py +32 -0
  102. cognee/infrastructure/loaders/external/__init__.py +22 -0
  103. cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
  104. cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
  105. cognee/infrastructure/loaders/get_loader_engine.py +18 -0
  106. cognee/infrastructure/loaders/supported_loaders.py +18 -0
  107. cognee/infrastructure/loaders/use_loader.py +21 -0
  108. cognee/infrastructure/loaders/utils/__init__.py +0 -0
  109. cognee/modules/data/methods/__init__.py +1 -0
  110. cognee/modules/data/methods/get_authorized_dataset.py +23 -0
  111. cognee/modules/data/models/Data.py +13 -3
  112. cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
  113. cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
  114. cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
  115. cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
  116. cognee/modules/engine/utils/generate_edge_id.py +5 -0
  117. cognee/modules/graph/cognee_graph/CogneeGraph.py +45 -35
  118. cognee/modules/graph/methods/get_formatted_graph_data.py +8 -2
  119. cognee/modules/graph/utils/get_graph_from_model.py +93 -101
  120. cognee/modules/ingestion/data_types/TextData.py +8 -2
  121. cognee/modules/ingestion/save_data_to_file.py +1 -1
  122. cognee/modules/pipelines/exceptions/__init__.py +1 -0
  123. cognee/modules/pipelines/exceptions/exceptions.py +12 -0
  124. cognee/modules/pipelines/models/DataItemStatus.py +5 -0
  125. cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
  126. cognee/modules/pipelines/models/__init__.py +1 -0
  127. cognee/modules/pipelines/operations/pipeline.py +10 -2
  128. cognee/modules/pipelines/operations/run_tasks.py +252 -20
  129. cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
  130. cognee/modules/retrieval/chunks_retriever.py +23 -1
  131. cognee/modules/retrieval/code_retriever.py +66 -9
  132. cognee/modules/retrieval/completion_retriever.py +11 -9
  133. cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
  134. cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
  135. cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
  136. cognee/modules/retrieval/graph_completion_retriever.py +1 -1
  137. cognee/modules/retrieval/insights_retriever.py +4 -0
  138. cognee/modules/retrieval/natural_language_retriever.py +9 -15
  139. cognee/modules/retrieval/summaries_retriever.py +23 -1
  140. cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
  141. cognee/modules/retrieval/utils/completion.py +6 -9
  142. cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
  143. cognee/modules/search/methods/search.py +5 -1
  144. cognee/modules/search/operations/__init__.py +1 -0
  145. cognee/modules/search/operations/select_search_type.py +42 -0
  146. cognee/modules/search/types/SearchType.py +1 -0
  147. cognee/modules/settings/get_settings.py +0 -8
  148. cognee/modules/settings/save_vector_db_config.py +1 -1
  149. cognee/shared/data_models.py +3 -1
  150. cognee/shared/logging_utils.py +0 -5
  151. cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
  152. cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
  153. cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
  154. cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
  155. cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
  156. cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
  157. cognee/tasks/graph/extract_graph_from_code.py +3 -2
  158. cognee/tasks/graph/extract_graph_from_data.py +4 -3
  159. cognee/tasks/graph/infer_data_ontology.py +5 -6
  160. cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
  161. cognee/tasks/ingestion/ingest_data.py +91 -61
  162. cognee/tasks/ingestion/resolve_data_directories.py +3 -0
  163. cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
  164. cognee/tasks/storage/index_data_points.py +1 -1
  165. cognee/tasks/storage/index_graph_edges.py +4 -1
  166. cognee/tasks/summarization/summarize_code.py +2 -3
  167. cognee/tasks/summarization/summarize_text.py +3 -2
  168. cognee/tests/test_cognee_server_start.py +12 -7
  169. cognee/tests/test_deduplication.py +2 -2
  170. cognee/tests/test_deletion.py +58 -17
  171. cognee/tests/test_graph_visualization_permissions.py +161 -0
  172. cognee/tests/test_neptune_analytics_graph.py +309 -0
  173. cognee/tests/test_neptune_analytics_hybrid.py +176 -0
  174. cognee/tests/{test_weaviate.py → test_neptune_analytics_vector.py} +86 -11
  175. cognee/tests/test_pgvector.py +5 -5
  176. cognee/tests/test_s3.py +1 -6
  177. cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
  178. cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
  179. cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
  180. cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
  181. cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
  182. cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
  183. cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
  184. cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
  185. cognee/tests/unit/modules/search/search_methods_test.py +55 -0
  186. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/METADATA +13 -9
  187. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/RECORD +203 -164
  188. cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
  189. cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
  190. cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
  191. cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
  192. cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
  193. cognee/modules/data/extraction/extract_categories.py +0 -14
  194. cognee/tests/test_qdrant.py +0 -99
  195. distributed/Dockerfile +0 -34
  196. distributed/app.py +0 -4
  197. distributed/entrypoint.py +0 -71
  198. distributed/entrypoint.sh +0 -5
  199. distributed/modal_image.py +0 -11
  200. distributed/queues.py +0 -5
  201. distributed/tasks/queued_add_data_points.py +0 -13
  202. distributed/tasks/queued_add_edges.py +0 -13
  203. distributed/tasks/queued_add_nodes.py +0 -13
  204. distributed/test.py +0 -28
  205. distributed/utils.py +0 -19
  206. distributed/workers/data_point_saving_worker.py +0 -93
  207. distributed/workers/graph_saving_worker.py +0 -104
  208. /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
  209. /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
  210. /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
  211. /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
  212. /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
  213. /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
  214. /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
  215. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
  216. /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
  217. /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
  218. {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
  219. {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
  220. /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
  221. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/WHEEL +0 -0
  222. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
  223. {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/NOTICE.md +0 -0
@@ -0,0 +1,449 @@
+ """Neptune Analytics Hybrid Adapter combining Vector and Graph functionality"""
+
+ import asyncio
+ import json
+ from typing import List, Optional, Any, Dict, Type, Tuple
+ from uuid import UUID
+
+ from cognee.exceptions import InvalidValueError
+ from cognee.infrastructure.databases.graph.neptune_driver.adapter import NeptuneGraphDB
+ from cognee.infrastructure.databases.vector.vector_db_interface import VectorDBInterface
+ from cognee.infrastructure.engine import DataPoint
+ from cognee.modules.storage.utils import JSONEncoder
+ from cognee.shared.logging_utils import get_logger
+ from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
+ from cognee.infrastructure.databases.vector.models.PayloadSchema import PayloadSchema
+ from cognee.infrastructure.databases.vector.models.ScoredResult import ScoredResult
+
+ logger = get_logger("NeptuneAnalyticsAdapter")
+
+
+ class IndexSchema(DataPoint):
+     """
+     Represents a schema for an index data point containing an ID and text.
+
+     Attributes:
+     - id: A string representing the unique identifier for the data point.
+     - text: A string representing the content of the data point.
+     - metadata: A dictionary with default index fields for the schema, currently configured
+       to include 'text'.
+     """
+
+     id: str
+     text: str
+     metadata: dict = {"index_fields": ["text"]}
+
+
+ NEPTUNE_ANALYTICS_ENDPOINT_URL = "neptune-graph://"
+
+
+ class NeptuneAnalyticsAdapter(NeptuneGraphDB, VectorDBInterface):
+     """
+     Hybrid adapter that combines Neptune Analytics Vector and Graph functionality.
+
+     This adapter extends NeptuneGraphDB and implements VectorDBInterface to provide
+     a unified interface for working with Neptune Analytics as both a vector store
+     and a graph database.
+     """
+
+     _VECTOR_NODE_LABEL = "COGNEE_NODE"
+     _COLLECTION_PREFIX = "VECTOR_COLLECTION"
+     _TOPK_LOWER_BOUND = 0
+     _TOPK_UPPER_BOUND = 10
+
+     def __init__(
+         self,
+         graph_id: str,
+         embedding_engine: Optional[EmbeddingEngine] = None,
+         region: Optional[str] = None,
+         aws_access_key_id: Optional[str] = None,
+         aws_secret_access_key: Optional[str] = None,
+         aws_session_token: Optional[str] = None,
+     ):
+         """
+         Initialize the Neptune Analytics hybrid adapter.
+
+         Parameters:
+         -----------
+         - graph_id (str): The Neptune Analytics graph identifier
+         - embedding_engine (Optional[EmbeddingEngine]): The embedding engine instance used to translate text into vectors.
+         - region (Optional[str]): AWS region where the graph is located (default: us-east-1)
+         - aws_access_key_id (Optional[str]): AWS access key ID
+         - aws_secret_access_key (Optional[str]): AWS secret access key
+         - aws_session_token (Optional[str]): AWS session token for temporary credentials
+         """
+         # Initialize the graph database functionality
+         super().__init__(
+             graph_id=graph_id,
+             region=region,
+             aws_access_key_id=aws_access_key_id,
+             aws_secret_access_key=aws_secret_access_key,
+             aws_session_token=aws_session_token,
+         )
+
+         # Add vector-specific attributes
+         self.embedding_engine = embedding_engine
+         logger.info(
+             f'Initialized Neptune Analytics hybrid adapter for graph: "{graph_id}" in region: "{self.region}"'
+         )
+
+     # VectorDBInterface methods implementation
+
+     async def get_connection(self):
+         """
+         This method is part of the default implementation but not defined in the interface.
+         No operation is performed and None is returned,
+         because the concept of a connection is not applicable in this context.
+         """
+         return None
+
+     async def embed_data(self, data: list[str]) -> list[list[float]]:
+         """
+         Embeds the provided textual data into vector representations.
+
+         Uses the embedding engine to convert the list of strings into a list of float vectors.
+
+         Parameters:
+         -----------
+         - data (list[str]): A list of strings representing the data to be embedded.
+
+         Returns:
+         --------
+         - list[list[float]]: A list of embedded vectors corresponding to the input data.
+         """
+         self._validate_embedding_engine()
+         return await self.embedding_engine.embed_text(data)
+
+     async def has_collection(self, collection_name: str) -> bool:
+         """
+         Neptune Analytics stores vectors at the node level,
+         so has_collection() implements the interface for compliance but performs no real check.
+
+         Parameters:
+         -----------
+         - collection_name (str): The name of the collection to check for existence.
+         Returns:
+         --------
+         - bool: Always returns True.
+         """
+         return True
+
+     async def create_collection(
+         self,
+         collection_name: str,
+         payload_schema: Optional[PayloadSchema] = None,
+     ):
+         """
+         Neptune Analytics stores vectors at the node level, so create_collection() implements the interface for compliance but performs no operations when called.
+         As a result, create_collection() is a no-op.
+
+         Parameters:
+         -----------
+         - collection_name (str): The name of the new collection to create.
+         - payload_schema (Optional[PayloadSchema]): An optional schema for the payloads
+           within this collection. (default None)
+         """
+         pass
+
+     async def get_collection(self, collection_name: str):
+         """
+         This method is part of the default implementation but not defined in the interface.
+         No operation is performed here because the concept of a collection is not applicable in the Neptune Analytics vector store.
+         """
+         return None
+
+     async def create_data_points(self, collection_name: str, data_points: List[DataPoint]):
+         """
+         Insert new data points into the specified collection by first inserting the node itself into the graph,
+         then executing neptune.algo.vectors.upsert() to insert the corresponding embedding.
+
+         Parameters:
+         -----------
+         - collection_name (str): The name of the collection where data points will be added.
+         - data_points (List[DataPoint]): A list of data points to be added to the
+           collection.
+         """
+         self._validate_embedding_engine()
+
+         # Fetch embeddings
+         texts = [DataPoint.get_embeddable_data(t) for t in data_points]
+         data_vectors = await self.embedding_engine.embed_text(texts)
+
+         for index, data_point in enumerate(data_points):
+             node_id = data_point.id
+             # Fetch embedding from list instead
+             data_vector = data_vectors[index]
+
+             # Fetch properties
+             properties = self._serialize_properties(data_point.model_dump())
+             properties[self._COLLECTION_PREFIX] = collection_name
+             params = dict(
+                 node_id=str(node_id),
+                 properties=properties,
+                 embedding=data_vector,
+                 collection_name=collection_name,
+             )
+
+             # Compose the query and send
+             query_string = (
+                 f"MERGE (n "
+                 f":{self._VECTOR_NODE_LABEL} "
+                 f" {{`~id`: $node_id}}) "
+                 f"ON CREATE SET n = $properties, n.updated_at = timestamp() "
+                 f"ON MATCH SET n += $properties, n.updated_at = timestamp() "
+                 f"WITH n, $embedding AS embedding "
+                 f"CALL neptune.algo.vectors.upsert(n, embedding) "
+                 f"YIELD success "
+                 f"RETURN success "
+             )
+
+             try:
+                 self._client.query(query_string, params)
+             except Exception as e:
+                 self._na_exception_handler(e, query_string)
+         pass
+
+     async def retrieve(self, collection_name: str, data_point_ids: list[str]):
+         """
+         Retrieve data points from a collection using their IDs.
+
+         Parameters:
+         -----------
+         - collection_name (str): The name of the collection from which to retrieve data
+           points.
+         - data_point_ids (list[str]): A list of IDs of the data points to retrieve.
+         """
+         # Do the fetch for each node
+         params = dict(node_ids=data_point_ids, collection_name=collection_name)
+         query_string = (
+             f"MATCH( n :{self._VECTOR_NODE_LABEL}) "
+             f"WHERE id(n) in $node_ids AND "
+             f"n.{self._COLLECTION_PREFIX} = $collection_name "
+             f"RETURN n as payload "
+         )
+
+         try:
+             result = self._client.query(query_string, params)
+             return [self._get_scored_result(item) for item in result]
+         except Exception as e:
+             self._na_exception_handler(e, query_string)
+
+     async def search(
+         self,
+         collection_name: str,
+         query_text: Optional[str] = None,
+         query_vector: Optional[List[float]] = None,
+         limit: int = None,
+         with_vector: bool = False,
+     ):
+         """
+         Perform a search in the specified collection using either a text query or a vector
+         query.
+
+         Parameters:
+         -----------
+         - collection_name (str): The name of the collection in which to perform the search.
+         - query_text (Optional[str]): An optional text query to search for in the
+           collection.
+         - query_vector (Optional[List[float]]): An optional vector representation for
+           searching the collection.
+         - limit (int): The maximum number of results to return from the search.
+         - with_vector (bool): Whether to return the vector representations with search
+           results; this is not supported for the Neptune Analytics backend at the moment.
+
+         Returns:
+         --------
+         A list of scored results that match the query.
+         """
+         self._validate_embedding_engine()
+
+         if with_vector:
+             logger.warning(
+                 "with_vector=True will include embedding vectors in the result. "
+                 "This may trigger a resource-intensive query and increase response time. "
+                 "Use this option only when vector data is required."
+             )
+
+         # In the case of an excessive, zero, or negative value, limit will be set to 10.
+         if not limit or limit <= self._TOPK_LOWER_BOUND or limit > self._TOPK_UPPER_BOUND:
+             logger.warning(
+                 "Provided limit (%s) is invalid (zero, negative, or exceeds maximum). "
+                 "Defaulting to limit=10.",
+                 limit,
+             )
+             limit = self._TOPK_UPPER_BOUND
+
+         if query_vector and query_text:
+             raise InvalidValueError(
+                 message="The search function accepts either text or embedding as input, but not both."
+             )
+         elif query_text is None and query_vector is None:
+             raise InvalidValueError(message="One of query_text or query_vector must be provided!")
+         elif query_vector:
+             embedding = query_vector
+         else:
+             data_vectors = await self.embedding_engine.embed_text([query_text])
+             embedding = data_vectors[0]
+
+         # Compose the parameters map
+         params = dict(embedding=embedding, param_topk=limit)
+         # Compose the query
+         query_string = f"""
+         CALL neptune.algo.vectors.topKByEmbeddingWithFiltering({{
+             topK: {limit},
+             embedding: {embedding},
+             nodeFilter: {{ equals: {{property: '{self._COLLECTION_PREFIX}', value: '{collection_name}'}} }}
+         }}
+         )
+         YIELD node, score
+         """
+
+         if with_vector:
+             query_string += """
+             WITH node, score, id(node) as node_id
+             MATCH (n)
+             WHERE id(n) = id(node)
+             CALL neptune.algo.vectors.get(n)
+             YIELD embedding
+             RETURN node as payload, score, embedding
+             """
+
+         else:
+             query_string += """
+             RETURN node as payload, score
+             """
+
+         try:
+             query_response = self._client.query(query_string, params)
+             return [self._get_scored_result(item=item, with_score=True) for item in query_response]
+         except Exception as e:
+             self._na_exception_handler(e, query_string)
+
+     async def batch_search(
+         self, collection_name: str, query_texts: List[str], limit: int, with_vectors: bool = False
+     ):
+         """
+         Perform a batch search using multiple text queries against a collection.
+
+         Parameters:
+         -----------
+         - collection_name (str): The name of the collection to conduct the batch search in.
+         - query_texts (List[str]): A list of text queries to use for the search.
+         - limit (int): The maximum number of results to return for each query.
+         - with_vectors (bool): Whether to include vector representations with search
+           results. (default False)
+
+         Returns:
+         --------
+         A list of search result sets, one for each query input.
+         """
+         self._validate_embedding_engine()
+
+         # Convert text to embedding array in batch
+         data_vectors = await self.embedding_engine.embed_text(query_texts)
+         return await asyncio.gather(
+             *[
+                 self.search(collection_name, None, vector, limit, with_vectors)
+                 for vector in data_vectors
+             ]
+         )
+
+     async def delete_data_points(self, collection_name: str, data_point_ids: list[str]):
+         """
+         Delete specified data points from a collection by executing an OpenCypher query
+         that matches the [vector_label, collection_label, node_id] combination.
+
+         Parameters:
+         -----------
+         - collection_name (str): The name of the collection from which to delete data
+           points.
+         - data_point_ids (list[str]): A list of IDs of the data points to delete.
+         """
+         params = dict(node_ids=data_point_ids, collection_name=collection_name)
+         query_string = (
+             f"MATCH (n :{self._VECTOR_NODE_LABEL}) "
+             f"WHERE id(n) IN $node_ids "
+             f"AND n.{self._COLLECTION_PREFIX} = $collection_name "
+             f"DETACH DELETE n"
+         )
+         try:
+             self._client.query(query_string, params)
+         except Exception as e:
+             self._na_exception_handler(e, query_string)
+         pass
+
+     async def create_vector_index(self, index_name: str, index_property_name: str):
+         """
+         Neptune Analytics stores vectors at the node level,
+         so create_vector_index() implements the interface for compliance but performs no operation when called.
+         As a result, create_vector_index() invokes create_collection(), which is also a no-op.
+         This ensures the logic flow remains consistent, even if the concept of collections is introduced in a future release.
+         """
+         await self.create_collection(f"{index_name}_{index_property_name}")
+
+     async def index_data_points(
+         self, index_name: str, index_property_name: str, data_points: list[DataPoint]
+     ):
+         """
+         Indexes a list of data points into Neptune Analytics by creating them as nodes.
+
+         This method constructs a unique collection name by combining the `index_name` and
+         `index_property_name`, then delegates to `create_data_points()` to store the data.
+
+         Args:
+             index_name (str): The base name of the index.
+             index_property_name (str): The property name to append to the index name for uniqueness.
+             data_points (list[DataPoint]): A list of `DataPoint` instances to be indexed.
+
+         Returns:
+             None
+         """
+         await self.create_data_points(
+             f"{index_name}_{index_property_name}",
+             [
+                 IndexSchema(
+                     id=str(data_point.id),
+                     text=getattr(data_point, data_point.metadata["index_fields"][0]),
+                 )
+                 for data_point in data_points
+             ],
+         )
+
+     async def prune(self):
+         """
+         Remove obsolete or unnecessary data from the database.
+         """
+         # Run actual truncate
+         self._client.query(f"MATCH (n :{self._VECTOR_NODE_LABEL}) DETACH DELETE n")
+         pass
+
+     @staticmethod
+     def _get_scored_result(
+         item: dict, with_vector: bool = False, with_score: bool = False
+     ) -> ScoredResult:
+         """
+         Utility method to simplify creation of a ScoredResult from an incoming Neptune Analytics payload response.
+         """
+         return ScoredResult(
+             id=item.get("payload").get("~id"),
+             payload=item.get("payload").get("~properties"),
+             score=item.get("score") if with_score else 0,
+             vector=item.get("embedding") if with_vector else None,
+         )
+
+     def _na_exception_handler(self, ex, query_string: str):
+         """
+         Generic exception handler for the Neptune Analytics (langchain) client.
+         """
+         logger.error("Neptune Analytics query failed: %s | Query: [%s]", ex, query_string)
+         raise ex
+
+     def _validate_embedding_engine(self):
+         """
+         Validates that the embedding_engine is defined.
+         :raises: ValueError if this object does not have a valid embedding_engine
+         """
+         if self.embedding_engine is None:
+             raise ValueError(
+                 "Neptune Analytics requires an embedder defined to make vector operations"
+             )
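Taken together, the new adapter lets a single Neptune Analytics graph serve as both the graph store and the vector store. Below is a minimal, hedged usage sketch based only on the constructor and VectorDBInterface methods shown above; the graph ID, region, collection name, and the pre-configured embedding engine are illustrative placeholders, not values shipped with the package.

# Illustrative sketch only: assumes an EmbeddingEngine instance is configured elsewhere in cognee
# and that AWS credentials are available in the environment. Names below are placeholders.
from cognee.infrastructure.databases.hybrid.neptune_analytics.NeptuneAnalyticsAdapter import (
    NeptuneAnalyticsAdapter,
)


async def demo_vector_search(embedding_engine):
    adapter = NeptuneAnalyticsAdapter(
        graph_id="g-0123456789",            # placeholder Neptune Analytics graph identifier
        embedding_engine=embedding_engine,  # required for embed_data(), search(), create_data_points()
        region="us-east-1",
    )

    # create_collection() is documented above as a no-op: vectors live on the nodes themselves.
    await adapter.create_collection("example_collection")

    # The adapter embeds the query text and delegates to
    # neptune.algo.vectors.topKByEmbeddingWithFiltering, filtering on the collection property.
    results = await adapter.search("example_collection", query_text="graph databases", limit=5)
    for result in results:
        print(result.id, result.score)

# Run with: asyncio.run(demo_vector_search(my_embedding_engine))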
@@ -49,9 +49,17 @@ class SQLAlchemyAdapter:
  
          run_sync(self.pull_from_s3())
  
-         self.engine = create_async_engine(
-             connection_string, poolclass=NullPool if "sqlite" in connection_string else None
-         )
+         if "sqlite" in connection_string:
+             self.engine = create_async_engine(
+                 connection_string,
+                 poolclass=NullPool,
+                 connect_args={"timeout": 30},
+             )
+         else:
+             self.engine = create_async_engine(
+                 connection_string, pool_size=12, max_overflow=12, poolclass=None
+             )
+ 
          self.sessionmaker = async_sessionmaker(bind=self.engine, expire_on_commit=False)
  
      async def push_to_s3(self) -> None:
@@ -1,4 +1,5 @@
  import json
+ import asyncio
  from uuid import UUID
  from typing import List, Optional
  from chromadb import AsyncHttpClient, Settings
@@ -161,6 +162,7 @@ class ChromaDBAdapter(VectorDBInterface):
          self.embedding_engine = embedding_engine
          self.url = url
          self.api_key = api_key
+         self.VECTOR_DB_LOCK = asyncio.Lock()
  
      async def get_connection(self) -> AsyncHttpClient:
          """
@@ -224,10 +226,13 @@ class ChromaDBAdapter(VectorDBInterface):
          - collection_name (str): The name of the collection to create.
          - payload_schema: The schema for the payload; can be None. (default None)
          """
-         client = await self.get_connection()
+         async with self.VECTOR_DB_LOCK:
+             client = await self.get_connection()
  
-         if not await self.has_collection(collection_name):
-             await client.create_collection(name=collection_name, metadata={"hnsw:space": "cosine"})
+             if not await self.has_collection(collection_name):
+                 await client.create_collection(
+                     name=collection_name, metadata={"hnsw:space": "cosine"}
+                 )
  
      async def get_collection(self, collection_name: str) -> AsyncHttpClient:
          """
@@ -19,7 +19,7 @@ def create_vector_engine(
      for each provider, raising an EnvironmentError if any are missing, or ImportError if the
      ChromaDB package is not installed.
  
-     Supported providers include: Weaviate, Qdrant, pgvector, FalkorDB, ChromaDB, and
+     Supported providers include: pgvector, FalkorDB, ChromaDB, and
      LanceDB.
  
      Parameters:
@@ -30,7 +30,7 @@ def create_vector_engine(
        providers.
      - vector_db_key (str): The API key or access token for the vector database instance.
      - vector_db_provider (str): The name of the vector database provider to use (e.g.,
-       'weaviate', 'qdrant').
+       'pgvector').
  
      Returns:
      --------
@@ -48,27 +48,7 @@ def create_vector_engine(
          embedding_engine=embedding_engine,
      )
  
-     if vector_db_provider == "weaviate":
-         from .weaviate_db import WeaviateAdapter
- 
-         if not (vector_db_url and vector_db_key):
-             raise EnvironmentError("Missing requred Weaviate credentials!")
- 
-         return WeaviateAdapter(vector_db_url, vector_db_key, embedding_engine=embedding_engine)
- 
-     elif vector_db_provider == "qdrant":
-         if not (vector_db_url and vector_db_key):
-             raise EnvironmentError("Missing requred Qdrant credentials!")
- 
-         from .qdrant.QDrantAdapter import QDrantAdapter
- 
-         return QDrantAdapter(
-             url=vector_db_url,
-             api_key=vector_db_key,
-             embedding_engine=embedding_engine,
-         )
- 
-     elif vector_db_provider == "pgvector":
+     if vector_db_provider == "pgvector":
          from cognee.infrastructure.databases.relational import get_relational_config
  
          # Get configuration for postgres database
@@ -122,6 +102,34 @@ def create_vector_engine(
              embedding_engine=embedding_engine,
          )
  
+     elif vector_db_provider == "neptune_analytics":
+         try:
+             from langchain_aws import NeptuneAnalyticsGraph
+         except ImportError:
+             raise ImportError(
+                 "langchain_aws is not installed. Please install it with 'pip install langchain_aws'"
+             )
+ 
+         if not vector_db_url:
+             raise EnvironmentError("Missing Neptune endpoint.")
+ 
+         from cognee.infrastructure.databases.hybrid.neptune_analytics.NeptuneAnalyticsAdapter import (
+             NeptuneAnalyticsAdapter,
+             NEPTUNE_ANALYTICS_ENDPOINT_URL,
+         )
+ 
+         if not vector_db_url.startswith(NEPTUNE_ANALYTICS_ENDPOINT_URL):
+             raise ValueError(
+                 f"Neptune endpoint must have the format '{NEPTUNE_ANALYTICS_ENDPOINT_URL}<GRAPH_ID>'"
+             )
+ 
+         graph_identifier = vector_db_url.replace(NEPTUNE_ANALYTICS_ENDPOINT_URL, "")
+ 
+         return NeptuneAnalyticsAdapter(
+             graph_id=graph_identifier,
+             embedding_engine=embedding_engine,
+         )
+ 
      else:
          from .lancedb.LanceDBAdapter import LanceDBAdapter
  
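The new branch implies that the configured vector database URL must carry the "neptune-graph://" prefix followed by the graph identifier. Here is a small sketch of that URL handling in isolation; the graph ID is a placeholder, and only the prefix convention comes from the diff above.

# Sketch of the URL convention enforced above; the graph ID is a placeholder.
NEPTUNE_ANALYTICS_ENDPOINT_URL = "neptune-graph://"

# hypothetical vector database URL (the value create_vector_engine receives as vector_db_url)
vector_db_url = "neptune-graph://g-0123456789"

if not vector_db_url.startswith(NEPTUNE_ANALYTICS_ENDPOINT_URL):
    raise ValueError(
        f"Neptune endpoint must have the format '{NEPTUNE_ANALYTICS_ENDPOINT_URL}<GRAPH_ID>'"
    )

graph_identifier = vector_db_url.replace(NEPTUNE_ANALYTICS_ENDPOINT_URL, "")
print(graph_identifier)  # -> g-0123456789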
@@ -5,7 +5,9 @@ import litellm
  import os
  from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
  from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
- from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer
+ from cognee.infrastructure.llm.tokenizer.TikToken import (
+     TikTokenTokenizer,
+ )
  
  litellm.set_verbose = False
  logger = get_logger("FastembedEmbeddingEngine")
@@ -7,11 +7,19 @@ import litellm
  import os
  from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
  from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
- from cognee.infrastructure.llm.tokenizer.Gemini import GeminiTokenizer
- from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
- from cognee.infrastructure.llm.tokenizer.Mistral import MistralTokenizer
- from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer
- from cognee.infrastructure.llm.embedding_rate_limiter import (
+ from cognee.infrastructure.llm.tokenizer.Gemini import (
+     GeminiTokenizer,
+ )
+ from cognee.infrastructure.llm.tokenizer.HuggingFace import (
+     HuggingFaceTokenizer,
+ )
+ from cognee.infrastructure.llm.tokenizer.Mistral import (
+     MistralTokenizer,
+ )
+ from cognee.infrastructure.llm.tokenizer.TikToken import (
+     TikTokenTokenizer,
+ )
+ from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import (
      embedding_rate_limit_async,
      embedding_sleep_and_retry_async,
  )
@@ -177,7 +185,14 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
          elif "mistral" in self.provider.lower():
              tokenizer = MistralTokenizer(model=model, max_tokens=self.max_tokens)
          else:
-             tokenizer = HuggingFaceTokenizer(model=self.model, max_tokens=self.max_tokens)
+             try:
+                 tokenizer = HuggingFaceTokenizer(
+                     model=self.model.replace("hosted_vllm/", ""), max_tokens=self.max_tokens
+                 )
+             except Exception as e:
+                 logger.warning(f"Could not get tokenizer from HuggingFace due to: {e}")
+                 logger.info("Switching to TikToken default tokenizer.")
+                 tokenizer = TikTokenTokenizer(model=None, max_tokens=self.max_tokens)
  
          logger.debug(f"Tokenizer loaded for model: {self.model}")
          return tokenizer
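The tokenizer change above adds a fallback path: when a HuggingFace tokenizer cannot be loaded (for example, for a "hosted_vllm/"-prefixed model name or in an offline environment), the engine now falls back to TikToken instead of failing. A hedged, standalone sketch of the same pattern follows; the model name is a placeholder, and only the cognee tokenizer classes and constructor arguments come from the diff.

# Standalone sketch of the fallback introduced above; the model name is a placeholder.
from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer

model = "hosted_vllm/BAAI/bge-m3"  # hypothetical hosted-vLLM embedding model name
max_tokens = 8192

try:
    # Strip the "hosted_vllm/" routing prefix so the bare repository id reaches HuggingFace.
    tokenizer = HuggingFaceTokenizer(model=model.replace("hosted_vllm/", ""), max_tokens=max_tokens)
except Exception as error:
    # Unknown repositories or offline environments end up here and use the TikToken default.
    print(f"Could not get tokenizer from HuggingFace: {error}")
    tokenizer = TikTokenTokenizer(model=None, max_tokens=max_tokens)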
@@ -7,9 +7,10 @@ import os
  import aiohttp.http_exceptions
  
  from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
- from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
- from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
- from cognee.infrastructure.llm.embedding_rate_limiter import (
+ from cognee.infrastructure.llm.tokenizer.HuggingFace import (
+     HuggingFaceTokenizer,
+ )
+ from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import (
      embedding_rate_limit_async,
      embedding_sleep_and_retry_async,
  )
@@ -1,5 +1,7 @@
  from cognee.infrastructure.databases.vector.embeddings.config import get_embedding_config
- from cognee.infrastructure.llm.config import get_llm_config
+ from cognee.infrastructure.llm.config import (
+     get_llm_config,
+ )
  from .EmbeddingEngine import EmbeddingEngine
  from functools import lru_cache