cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/api/client.py +44 -4
- cognee/api/health.py +332 -0
- cognee/api/v1/add/add.py +5 -2
- cognee/api/v1/add/routers/get_add_router.py +3 -0
- cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
- cognee/api/v1/cognify/cognify.py +8 -0
- cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
- cognee/api/v1/config/config.py +3 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +2 -8
- cognee/api/v1/delete/delete.py +16 -12
- cognee/api/v1/responses/routers/get_responses_router.py +3 -1
- cognee/api/v1/search/search.py +10 -0
- cognee/api/v1/settings/routers/get_settings_router.py +0 -2
- cognee/base_config.py +1 -0
- cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
- cognee/infrastructure/databases/graph/config.py +2 -0
- cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
- cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
- cognee/infrastructure/databases/graph/kuzu/adapter.py +43 -16
- cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +151 -77
- cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
- cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
- cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
- cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +11 -3
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
- cognee/infrastructure/databases/vector/create_vector_engine.py +31 -23
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
- cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
- cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
- cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
- cognee/infrastructure/files/utils/guess_file_type.py +2 -2
- cognee/infrastructure/files/utils/open_data_file.py +4 -23
- cognee/infrastructure/llm/LLMGateway.py +137 -0
- cognee/infrastructure/llm/__init__.py +14 -4
- cognee/infrastructure/llm/config.py +29 -1
- cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
- cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
- cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
- cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
- cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
- cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
- cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
- cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
- cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
- cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
- cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
- cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
- cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
- cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
- cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
- cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
- cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
- cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
- cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
- cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
- cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
- cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
- cognee/infrastructure/llm/utils.py +3 -1
- cognee/infrastructure/loaders/LoaderEngine.py +156 -0
- cognee/infrastructure/loaders/LoaderInterface.py +73 -0
- cognee/infrastructure/loaders/__init__.py +18 -0
- cognee/infrastructure/loaders/core/__init__.py +7 -0
- cognee/infrastructure/loaders/core/audio_loader.py +98 -0
- cognee/infrastructure/loaders/core/image_loader.py +114 -0
- cognee/infrastructure/loaders/core/text_loader.py +90 -0
- cognee/infrastructure/loaders/create_loader_engine.py +32 -0
- cognee/infrastructure/loaders/external/__init__.py +22 -0
- cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
- cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
- cognee/infrastructure/loaders/get_loader_engine.py +18 -0
- cognee/infrastructure/loaders/supported_loaders.py +18 -0
- cognee/infrastructure/loaders/use_loader.py +21 -0
- cognee/infrastructure/loaders/utils/__init__.py +0 -0
- cognee/modules/data/methods/__init__.py +1 -0
- cognee/modules/data/methods/get_authorized_dataset.py +23 -0
- cognee/modules/data/models/Data.py +13 -3
- cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
- cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
- cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
- cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
- cognee/modules/engine/utils/generate_edge_id.py +5 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +45 -35
- cognee/modules/graph/methods/get_formatted_graph_data.py +8 -2
- cognee/modules/graph/utils/get_graph_from_model.py +93 -101
- cognee/modules/ingestion/data_types/TextData.py +8 -2
- cognee/modules/ingestion/save_data_to_file.py +1 -1
- cognee/modules/pipelines/exceptions/__init__.py +1 -0
- cognee/modules/pipelines/exceptions/exceptions.py +12 -0
- cognee/modules/pipelines/models/DataItemStatus.py +5 -0
- cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
- cognee/modules/pipelines/models/__init__.py +1 -0
- cognee/modules/pipelines/operations/pipeline.py +10 -2
- cognee/modules/pipelines/operations/run_tasks.py +252 -20
- cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
- cognee/modules/retrieval/chunks_retriever.py +23 -1
- cognee/modules/retrieval/code_retriever.py +66 -9
- cognee/modules/retrieval/completion_retriever.py +11 -9
- cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
- cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
- cognee/modules/retrieval/graph_completion_retriever.py +1 -1
- cognee/modules/retrieval/insights_retriever.py +4 -0
- cognee/modules/retrieval/natural_language_retriever.py +9 -15
- cognee/modules/retrieval/summaries_retriever.py +23 -1
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
- cognee/modules/retrieval/utils/completion.py +6 -9
- cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
- cognee/modules/search/methods/search.py +5 -1
- cognee/modules/search/operations/__init__.py +1 -0
- cognee/modules/search/operations/select_search_type.py +42 -0
- cognee/modules/search/types/SearchType.py +1 -0
- cognee/modules/settings/get_settings.py +0 -8
- cognee/modules/settings/save_vector_db_config.py +1 -1
- cognee/shared/data_models.py +3 -1
- cognee/shared/logging_utils.py +0 -5
- cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
- cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
- cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
- cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
- cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
- cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
- cognee/tasks/graph/extract_graph_from_code.py +3 -2
- cognee/tasks/graph/extract_graph_from_data.py +4 -3
- cognee/tasks/graph/infer_data_ontology.py +5 -6
- cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
- cognee/tasks/ingestion/ingest_data.py +91 -61
- cognee/tasks/ingestion/resolve_data_directories.py +3 -0
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
- cognee/tasks/storage/index_data_points.py +1 -1
- cognee/tasks/storage/index_graph_edges.py +4 -1
- cognee/tasks/summarization/summarize_code.py +2 -3
- cognee/tasks/summarization/summarize_text.py +3 -2
- cognee/tests/test_cognee_server_start.py +12 -7
- cognee/tests/test_deduplication.py +2 -2
- cognee/tests/test_deletion.py +58 -17
- cognee/tests/test_graph_visualization_permissions.py +161 -0
- cognee/tests/test_neptune_analytics_graph.py +309 -0
- cognee/tests/test_neptune_analytics_hybrid.py +176 -0
- cognee/tests/{test_weaviate.py → test_neptune_analytics_vector.py} +86 -11
- cognee/tests/test_pgvector.py +5 -5
- cognee/tests/test_s3.py +1 -6
- cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
- cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
- cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
- cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
- cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
- cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
- cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
- cognee/tests/unit/modules/search/search_methods_test.py +55 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/METADATA +13 -9
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/RECORD +203 -164
- cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
- cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
- cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
- cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
- cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
- cognee/modules/data/extraction/extract_categories.py +0 -14
- cognee/tests/test_qdrant.py +0 -99
- distributed/Dockerfile +0 -34
- distributed/app.py +0 -4
- distributed/entrypoint.py +0 -71
- distributed/entrypoint.sh +0 -5
- distributed/modal_image.py +0 -11
- distributed/queues.py +0 -5
- distributed/tasks/queued_add_data_points.py +0 -13
- distributed/tasks/queued_add_edges.py +0 -13
- distributed/tasks/queued_add_nodes.py +0 -13
- distributed/test.py +0 -28
- distributed/utils.py +0 -19
- distributed/workers/data_point_saving_worker.py +0 -93
- distributed/workers/graph_saving_worker.py +0 -104
- /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
- /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
- /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
- /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
- /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
- /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
- /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
- /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
- /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
- /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
- {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
- {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
- /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/WHEEL +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -0,0 +1,449 @@
|
|
|
1
|
+
"""Neptune Analytics Hybrid Adapter combining Vector and Graph functionality"""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
from typing import List, Optional, Any, Dict, Type, Tuple
|
|
6
|
+
from uuid import UUID
|
|
7
|
+
|
|
8
|
+
from cognee.exceptions import InvalidValueError
|
|
9
|
+
from cognee.infrastructure.databases.graph.neptune_driver.adapter import NeptuneGraphDB
|
|
10
|
+
from cognee.infrastructure.databases.vector.vector_db_interface import VectorDBInterface
|
|
11
|
+
from cognee.infrastructure.engine import DataPoint
|
|
12
|
+
from cognee.modules.storage.utils import JSONEncoder
|
|
13
|
+
from cognee.shared.logging_utils import get_logger
|
|
14
|
+
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
|
|
15
|
+
from cognee.infrastructure.databases.vector.models.PayloadSchema import PayloadSchema
|
|
16
|
+
from cognee.infrastructure.databases.vector.models.ScoredResult import ScoredResult
|
|
17
|
+
|
|
18
|
+
logger = get_logger("NeptuneAnalyticsAdapter")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class IndexSchema(DataPoint):
    """
    Vector-store payload schema: one data point carrying an id and its embeddable text.

    Attributes:
    - id: Unique identifier for the data point, stored as a string.
    - text: The textual content to be embedded.
    - metadata: Default index configuration; 'text' is the only indexed field.
    """

    id: str
    text: str
    metadata: dict = {"index_fields": ["text"]}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# URI scheme prefix identifying a Neptune Analytics graph endpoint.
NEPTUNE_ANALYTICS_ENDPOINT_URL = "neptune-graph://"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class NeptuneAnalyticsAdapter(NeptuneGraphDB, VectorDBInterface):
|
|
41
|
+
"""
|
|
42
|
+
Hybrid adapter that combines Neptune Analytics Vector and Graph functionality.
|
|
43
|
+
|
|
44
|
+
This adapter extends NeptuneGraphDB and implements VectorDBInterface to provide
|
|
45
|
+
a unified interface for working with Neptune Analytics as both a vector store
|
|
46
|
+
and a graph database.
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
_VECTOR_NODE_LABEL = "COGNEE_NODE"
|
|
50
|
+
_COLLECTION_PREFIX = "VECTOR_COLLECTION"
|
|
51
|
+
_TOPK_LOWER_BOUND = 0
|
|
52
|
+
_TOPK_UPPER_BOUND = 10
|
|
53
|
+
|
|
54
|
+
def __init__(
|
|
55
|
+
self,
|
|
56
|
+
graph_id: str,
|
|
57
|
+
embedding_engine: Optional[EmbeddingEngine] = None,
|
|
58
|
+
region: Optional[str] = None,
|
|
59
|
+
aws_access_key_id: Optional[str] = None,
|
|
60
|
+
aws_secret_access_key: Optional[str] = None,
|
|
61
|
+
aws_session_token: Optional[str] = None,
|
|
62
|
+
):
|
|
63
|
+
"""
|
|
64
|
+
Initialize the Neptune Analytics hybrid adapter.
|
|
65
|
+
|
|
66
|
+
Parameters:
|
|
67
|
+
-----------
|
|
68
|
+
- graph_id (str): The Neptune Analytics graph identifier
|
|
69
|
+
- embedding_engine(Optional[EmbeddingEngine]): The embedding engine instance to translate text to vector.
|
|
70
|
+
- region (Optional[str]): AWS region where the graph is located (default: us-east-1)
|
|
71
|
+
- aws_access_key_id (Optional[str]): AWS access key ID
|
|
72
|
+
- aws_secret_access_key (Optional[str]): AWS secret access key
|
|
73
|
+
- aws_session_token (Optional[str]): AWS session token for temporary credentials
|
|
74
|
+
"""
|
|
75
|
+
# Initialize the graph database functionality
|
|
76
|
+
super().__init__(
|
|
77
|
+
graph_id=graph_id,
|
|
78
|
+
region=region,
|
|
79
|
+
aws_access_key_id=aws_access_key_id,
|
|
80
|
+
aws_secret_access_key=aws_secret_access_key,
|
|
81
|
+
aws_session_token=aws_session_token,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# Add vector-specific attributes
|
|
85
|
+
self.embedding_engine = embedding_engine
|
|
86
|
+
logger.info(
|
|
87
|
+
f'Initialized Neptune Analytics hybrid adapter for graph: "{graph_id}" in region: "{self.region}"'
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# VectorDBInterface methods implementation
|
|
91
|
+
|
|
92
|
+
async def get_connection(self):
|
|
93
|
+
"""
|
|
94
|
+
This method is part of the default implementation but not defined in the interface.
|
|
95
|
+
No operation is performed and None will be returned here,
|
|
96
|
+
because the concept of connection is not applicable in this context.
|
|
97
|
+
"""
|
|
98
|
+
return None
|
|
99
|
+
|
|
100
|
+
async def embed_data(self, data: list[str]) -> list[list[float]]:
|
|
101
|
+
"""
|
|
102
|
+
Embeds the provided textual data into vector representation.
|
|
103
|
+
|
|
104
|
+
Uses the embedding engine to convert the list of strings into a list of float vectors.
|
|
105
|
+
|
|
106
|
+
Parameters:
|
|
107
|
+
-----------
|
|
108
|
+
- data (list[str]): A list of strings representing the data to be embedded.
|
|
109
|
+
|
|
110
|
+
Returns:
|
|
111
|
+
--------
|
|
112
|
+
- list[list[float]]: A list of embedded vectors corresponding to the input data.
|
|
113
|
+
"""
|
|
114
|
+
self._validate_embedding_engine()
|
|
115
|
+
return await self.embedding_engine.embed_text(data)
|
|
116
|
+
|
|
117
|
+
async def has_collection(self, collection_name: str) -> bool:
|
|
118
|
+
"""
|
|
119
|
+
Neptune Analytics stores vector on a node level,
|
|
120
|
+
so create_collection() implements interface for compliance but performs no operations when called.
|
|
121
|
+
|
|
122
|
+
Parameters:
|
|
123
|
+
-----------
|
|
124
|
+
- collection_name (str): The name of the collection to check for existence.
|
|
125
|
+
Returns:
|
|
126
|
+
--------
|
|
127
|
+
- bool: Always return True.
|
|
128
|
+
"""
|
|
129
|
+
return True
|
|
130
|
+
|
|
131
|
+
async def create_collection(
|
|
132
|
+
self,
|
|
133
|
+
collection_name: str,
|
|
134
|
+
payload_schema: Optional[PayloadSchema] = None,
|
|
135
|
+
):
|
|
136
|
+
"""
|
|
137
|
+
Neptune Analytics stores vector on a node level, so create_collection() implements interface for compliance but performs no operations when called.
|
|
138
|
+
As the result, create_collection() will be no-op.
|
|
139
|
+
|
|
140
|
+
Parameters:
|
|
141
|
+
-----------
|
|
142
|
+
- collection_name (str): The name of the new collection to create.
|
|
143
|
+
- payload_schema (Optional[PayloadSchema]): An optional schema for the payloads
|
|
144
|
+
within this collection. (default None)
|
|
145
|
+
"""
|
|
146
|
+
pass
|
|
147
|
+
|
|
148
|
+
async def get_collection(self, collection_name: str):
|
|
149
|
+
"""
|
|
150
|
+
This method is part of the default implementation but not defined in the interface.
|
|
151
|
+
No operation is performed here because the concept of collection is not applicable in NeptuneAnalytics vector store.
|
|
152
|
+
"""
|
|
153
|
+
return None
|
|
154
|
+
|
|
155
|
+
async def create_data_points(self, collection_name: str, data_points: List[DataPoint]):
|
|
156
|
+
"""
|
|
157
|
+
Insert new data points into the specified collection, by first inserting the node itself on the graph,
|
|
158
|
+
then execute neptune.algo.vectors.upsert() to insert the corresponded embedding.
|
|
159
|
+
|
|
160
|
+
Parameters:
|
|
161
|
+
-----------
|
|
162
|
+
- collection_name (str): The name of the collection where data points will be added.
|
|
163
|
+
- data_points (List[DataPoint]): A list of data points to be added to the
|
|
164
|
+
collection.
|
|
165
|
+
"""
|
|
166
|
+
self._validate_embedding_engine()
|
|
167
|
+
|
|
168
|
+
# Fetch embeddings
|
|
169
|
+
texts = [DataPoint.get_embeddable_data(t) for t in data_points]
|
|
170
|
+
data_vectors = await self.embedding_engine.embed_text(texts)
|
|
171
|
+
|
|
172
|
+
for index, data_point in enumerate(data_points):
|
|
173
|
+
node_id = data_point.id
|
|
174
|
+
# Fetch embedding from list instead
|
|
175
|
+
data_vector = data_vectors[index]
|
|
176
|
+
|
|
177
|
+
# Fetch properties
|
|
178
|
+
properties = self._serialize_properties(data_point.model_dump())
|
|
179
|
+
properties[self._COLLECTION_PREFIX] = collection_name
|
|
180
|
+
params = dict(
|
|
181
|
+
node_id=str(node_id),
|
|
182
|
+
properties=properties,
|
|
183
|
+
embedding=data_vector,
|
|
184
|
+
collection_name=collection_name,
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
# Compose the query and send
|
|
188
|
+
query_string = (
|
|
189
|
+
f"MERGE (n "
|
|
190
|
+
f":{self._VECTOR_NODE_LABEL} "
|
|
191
|
+
f" {{`~id`: $node_id}}) "
|
|
192
|
+
f"ON CREATE SET n = $properties, n.updated_at = timestamp() "
|
|
193
|
+
f"ON MATCH SET n += $properties, n.updated_at = timestamp() "
|
|
194
|
+
f"WITH n, $embedding AS embedding "
|
|
195
|
+
f"CALL neptune.algo.vectors.upsert(n, embedding) "
|
|
196
|
+
f"YIELD success "
|
|
197
|
+
f"RETURN success "
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
try:
|
|
201
|
+
self._client.query(query_string, params)
|
|
202
|
+
except Exception as e:
|
|
203
|
+
self._na_exception_handler(e, query_string)
|
|
204
|
+
pass
|
|
205
|
+
|
|
206
|
+
async def retrieve(self, collection_name: str, data_point_ids: list[str]):
|
|
207
|
+
"""
|
|
208
|
+
Retrieve data points from a collection using their IDs.
|
|
209
|
+
|
|
210
|
+
Parameters:
|
|
211
|
+
-----------
|
|
212
|
+
- collection_name (str): The name of the collection from which to retrieve data
|
|
213
|
+
points.
|
|
214
|
+
- data_point_ids (list[str]): A list of IDs of the data points to retrieve.
|
|
215
|
+
"""
|
|
216
|
+
# Do the fetch for each node
|
|
217
|
+
params = dict(node_ids=data_point_ids, collection_name=collection_name)
|
|
218
|
+
query_string = (
|
|
219
|
+
f"MATCH( n :{self._VECTOR_NODE_LABEL}) "
|
|
220
|
+
f"WHERE id(n) in $node_ids AND "
|
|
221
|
+
f"n.{self._COLLECTION_PREFIX} = $collection_name "
|
|
222
|
+
f"RETURN n as payload "
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
try:
|
|
226
|
+
result = self._client.query(query_string, params)
|
|
227
|
+
return [self._get_scored_result(item) for item in result]
|
|
228
|
+
except Exception as e:
|
|
229
|
+
self._na_exception_handler(e, query_string)
|
|
230
|
+
|
|
231
|
+
async def search(
|
|
232
|
+
self,
|
|
233
|
+
collection_name: str,
|
|
234
|
+
query_text: Optional[str] = None,
|
|
235
|
+
query_vector: Optional[List[float]] = None,
|
|
236
|
+
limit: int = None,
|
|
237
|
+
with_vector: bool = False,
|
|
238
|
+
):
|
|
239
|
+
"""
|
|
240
|
+
Perform a search in the specified collection using either a text query or a vector
|
|
241
|
+
query.
|
|
242
|
+
|
|
243
|
+
Parameters:
|
|
244
|
+
-----------
|
|
245
|
+
- collection_name (str): The name of the collection in which to perform the search.
|
|
246
|
+
- query_text (Optional[str]): An optional text query to search for in the
|
|
247
|
+
collection.
|
|
248
|
+
- query_vector (Optional[List[float]]): An optional vector representation for
|
|
249
|
+
searching the collection.
|
|
250
|
+
- limit (int): The maximum number of results to return from the search.
|
|
251
|
+
- with_vector (bool): Whether to return the vector representations with search
|
|
252
|
+
results, this is not supported for Neptune Analytics backend at the moment.
|
|
253
|
+
|
|
254
|
+
Returns:
|
|
255
|
+
--------
|
|
256
|
+
A list of scored results that match the query.
|
|
257
|
+
"""
|
|
258
|
+
self._validate_embedding_engine()
|
|
259
|
+
|
|
260
|
+
if with_vector:
|
|
261
|
+
logger.warning(
|
|
262
|
+
"with_vector=True will include embedding vectors in the result. "
|
|
263
|
+
"This may trigger a resource-intensive query and increase response time. "
|
|
264
|
+
"Use this option only when vector data is required."
|
|
265
|
+
)
|
|
266
|
+
|
|
267
|
+
# In the case of excessive limit, or zero / negative value, limit will be set to 10.
|
|
268
|
+
if not limit or limit <= self._TOPK_LOWER_BOUND or limit > self._TOPK_UPPER_BOUND:
|
|
269
|
+
logger.warning(
|
|
270
|
+
"Provided limit (%s) is invalid (zero, negative, or exceeds maximum). "
|
|
271
|
+
"Defaulting to limit=10.",
|
|
272
|
+
limit,
|
|
273
|
+
)
|
|
274
|
+
limit = self._TOPK_UPPER_BOUND
|
|
275
|
+
|
|
276
|
+
if query_vector and query_text:
|
|
277
|
+
raise InvalidValueError(
|
|
278
|
+
message="The search function accepts either text or embedding as input, but not both."
|
|
279
|
+
)
|
|
280
|
+
elif query_text is None and query_vector is None:
|
|
281
|
+
raise InvalidValueError(message="One of query_text or query_vector must be provided!")
|
|
282
|
+
elif query_vector:
|
|
283
|
+
embedding = query_vector
|
|
284
|
+
else:
|
|
285
|
+
data_vectors = await self.embedding_engine.embed_text([query_text])
|
|
286
|
+
embedding = data_vectors[0]
|
|
287
|
+
|
|
288
|
+
# Compose the parameters map
|
|
289
|
+
params = dict(embedding=embedding, param_topk=limit)
|
|
290
|
+
# Compose the query
|
|
291
|
+
query_string = f"""
|
|
292
|
+
CALL neptune.algo.vectors.topKByEmbeddingWithFiltering({{
|
|
293
|
+
topK: {limit},
|
|
294
|
+
embedding: {embedding},
|
|
295
|
+
nodeFilter: {{ equals: {{property: '{self._COLLECTION_PREFIX}', value: '{collection_name}'}} }}
|
|
296
|
+
}}
|
|
297
|
+
)
|
|
298
|
+
YIELD node, score
|
|
299
|
+
"""
|
|
300
|
+
|
|
301
|
+
if with_vector:
|
|
302
|
+
query_string += """
|
|
303
|
+
WITH node, score, id(node) as node_id
|
|
304
|
+
MATCH (n)
|
|
305
|
+
WHERE id(n) = id(node)
|
|
306
|
+
CALL neptune.algo.vectors.get(n)
|
|
307
|
+
YIELD embedding
|
|
308
|
+
RETURN node as payload, score, embedding
|
|
309
|
+
"""
|
|
310
|
+
|
|
311
|
+
else:
|
|
312
|
+
query_string += """
|
|
313
|
+
RETURN node as payload, score
|
|
314
|
+
"""
|
|
315
|
+
|
|
316
|
+
try:
|
|
317
|
+
query_response = self._client.query(query_string, params)
|
|
318
|
+
return [self._get_scored_result(item=item, with_score=True) for item in query_response]
|
|
319
|
+
except Exception as e:
|
|
320
|
+
self._na_exception_handler(e, query_string)
|
|
321
|
+
|
|
322
|
+
async def batch_search(
|
|
323
|
+
self, collection_name: str, query_texts: List[str], limit: int, with_vectors: bool = False
|
|
324
|
+
):
|
|
325
|
+
"""
|
|
326
|
+
Perform a batch search using multiple text queries against a collection.
|
|
327
|
+
|
|
328
|
+
Parameters:
|
|
329
|
+
-----------
|
|
330
|
+
- collection_name (str): The name of the collection to conduct the batch search in.
|
|
331
|
+
- query_texts (List[str]): A list of text queries to use for the search.
|
|
332
|
+
- limit (int): The maximum number of results to return for each query.
|
|
333
|
+
- with_vectors (bool): Whether to include vector representations with search
|
|
334
|
+
results. (default False)
|
|
335
|
+
|
|
336
|
+
Returns:
|
|
337
|
+
--------
|
|
338
|
+
A list of search result sets, one for each query input.
|
|
339
|
+
"""
|
|
340
|
+
self._validate_embedding_engine()
|
|
341
|
+
|
|
342
|
+
# Convert text to embedding array in batch
|
|
343
|
+
data_vectors = await self.embedding_engine.embed_text(query_texts)
|
|
344
|
+
return await asyncio.gather(
|
|
345
|
+
*[
|
|
346
|
+
self.search(collection_name, None, vector, limit, with_vectors)
|
|
347
|
+
for vector in data_vectors
|
|
348
|
+
]
|
|
349
|
+
)
|
|
350
|
+
|
|
351
|
+
async def delete_data_points(self, collection_name: str, data_point_ids: list[str]):
|
|
352
|
+
"""
|
|
353
|
+
Delete specified data points from a collection, by executing an OpenCypher query,
|
|
354
|
+
with matching [vector_label, collection_label, node_id] combination.
|
|
355
|
+
|
|
356
|
+
Parameters:
|
|
357
|
+
-----------
|
|
358
|
+
- collection_name (str): The name of the collection from which to delete data
|
|
359
|
+
points.
|
|
360
|
+
- data_point_ids (list[str]): A list of IDs of the data points to delete.
|
|
361
|
+
"""
|
|
362
|
+
params = dict(node_ids=data_point_ids, collection_name=collection_name)
|
|
363
|
+
query_string = (
|
|
364
|
+
f"MATCH (n :{self._VECTOR_NODE_LABEL}) "
|
|
365
|
+
f"WHERE id(n) IN $node_ids "
|
|
366
|
+
f"AND n.{self._COLLECTION_PREFIX} = $collection_name "
|
|
367
|
+
f"DETACH DELETE n"
|
|
368
|
+
)
|
|
369
|
+
try:
|
|
370
|
+
self._client.query(query_string, params)
|
|
371
|
+
except Exception as e:
|
|
372
|
+
self._na_exception_handler(e, query_string)
|
|
373
|
+
pass
|
|
374
|
+
|
|
375
|
+
async def create_vector_index(self, index_name: str, index_property_name: str):
|
|
376
|
+
"""
|
|
377
|
+
Neptune Analytics stores vectors at the node level,
|
|
378
|
+
so create_vector_index() implements the interface for compliance but performs no operation when called.
|
|
379
|
+
As a result, create_vector_index() invokes create_collection(), which is also a no-op.
|
|
380
|
+
This ensures the logic flow remains consistent, even if the concept of collections is introduced in a future release.
|
|
381
|
+
"""
|
|
382
|
+
await self.create_collection(f"{index_name}_{index_property_name}")
|
|
383
|
+
|
|
384
|
+
async def index_data_points(
|
|
385
|
+
self, index_name: str, index_property_name: str, data_points: list[DataPoint]
|
|
386
|
+
):
|
|
387
|
+
"""
|
|
388
|
+
Indexes a list of data points into Neptune Analytics by creating them as nodes.
|
|
389
|
+
|
|
390
|
+
This method constructs a unique collection name by combining the `index_name` and
|
|
391
|
+
`index_property_name`, then delegates to `create_data_points()` to store the data.
|
|
392
|
+
|
|
393
|
+
Args:
|
|
394
|
+
index_name (str): The base name of the index.
|
|
395
|
+
index_property_name (str): The property name to append to the index name for uniqueness.
|
|
396
|
+
data_points (list[DataPoint]): A list of `DataPoint` instances to be indexed.
|
|
397
|
+
|
|
398
|
+
Returns:
|
|
399
|
+
None
|
|
400
|
+
"""
|
|
401
|
+
await self.create_data_points(
|
|
402
|
+
f"{index_name}_{index_property_name}",
|
|
403
|
+
[
|
|
404
|
+
IndexSchema(
|
|
405
|
+
id=str(data_point.id),
|
|
406
|
+
text=getattr(data_point, data_point.metadata["index_fields"][0]),
|
|
407
|
+
)
|
|
408
|
+
for data_point in data_points
|
|
409
|
+
],
|
|
410
|
+
)
|
|
411
|
+
|
|
412
|
+
async def prune(self):
|
|
413
|
+
"""
|
|
414
|
+
Remove obsolete or unnecessary data from the database.
|
|
415
|
+
"""
|
|
416
|
+
# Run actual truncate
|
|
417
|
+
self._client.query(f"MATCH (n :{self._VECTOR_NODE_LABEL}) DETACH DELETE n")
|
|
418
|
+
pass
|
|
419
|
+
|
|
420
|
+
@staticmethod
|
|
421
|
+
def _get_scored_result(
|
|
422
|
+
item: dict, with_vector: bool = False, with_score: bool = False
|
|
423
|
+
) -> ScoredResult:
|
|
424
|
+
"""
|
|
425
|
+
Util method to simplify the object creation of ScoredResult base on incoming NX payload response.
|
|
426
|
+
"""
|
|
427
|
+
return ScoredResult(
|
|
428
|
+
id=item.get("payload").get("~id"),
|
|
429
|
+
payload=item.get("payload").get("~properties"),
|
|
430
|
+
score=item.get("score") if with_score else 0,
|
|
431
|
+
vector=item.get("embedding") if with_vector else None,
|
|
432
|
+
)
|
|
433
|
+
|
|
434
|
+
def _na_exception_handler(self, ex, query_string: str):
|
|
435
|
+
"""
|
|
436
|
+
Generic exception handler for NA langchain.
|
|
437
|
+
"""
|
|
438
|
+
logger.error("Neptune Analytics query failed: %s | Query: [%s]", ex, query_string)
|
|
439
|
+
raise ex
|
|
440
|
+
|
|
441
|
+
def _validate_embedding_engine(self):
|
|
442
|
+
"""
|
|
443
|
+
Validates if the embedding_engine is defined
|
|
444
|
+
:raises: ValueError if this object does not have a valid embedding_engine
|
|
445
|
+
"""
|
|
446
|
+
if self.embedding_engine is None:
|
|
447
|
+
raise ValueError(
|
|
448
|
+
"Neptune Analytics requires an embedder defined to make vector operations"
|
|
449
|
+
)
|
|
@@ -49,9 +49,17 @@ class SQLAlchemyAdapter:
|
|
|
49
49
|
|
|
50
50
|
run_sync(self.pull_from_s3())
|
|
51
51
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
52
|
+
if "sqlite" in connection_string:
|
|
53
|
+
self.engine = create_async_engine(
|
|
54
|
+
connection_string,
|
|
55
|
+
poolclass=NullPool,
|
|
56
|
+
connect_args={"timeout": 30},
|
|
57
|
+
)
|
|
58
|
+
else:
|
|
59
|
+
self.engine = create_async_engine(
|
|
60
|
+
connection_string, pool_size=12, max_overflow=12, poolclass=None
|
|
61
|
+
)
|
|
62
|
+
|
|
55
63
|
self.sessionmaker = async_sessionmaker(bind=self.engine, expire_on_commit=False)
|
|
56
64
|
|
|
57
65
|
async def push_to_s3(self) -> None:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import asyncio
|
|
2
3
|
from uuid import UUID
|
|
3
4
|
from typing import List, Optional
|
|
4
5
|
from chromadb import AsyncHttpClient, Settings
|
|
@@ -161,6 +162,7 @@ class ChromaDBAdapter(VectorDBInterface):
|
|
|
161
162
|
self.embedding_engine = embedding_engine
|
|
162
163
|
self.url = url
|
|
163
164
|
self.api_key = api_key
|
|
165
|
+
self.VECTOR_DB_LOCK = asyncio.Lock()
|
|
164
166
|
|
|
165
167
|
async def get_connection(self) -> AsyncHttpClient:
|
|
166
168
|
"""
|
|
@@ -224,10 +226,13 @@ class ChromaDBAdapter(VectorDBInterface):
|
|
|
224
226
|
- collection_name (str): The name of the collection to create.
|
|
225
227
|
- payload_schema: The schema for the payload; can be None. (default None)
|
|
226
228
|
"""
|
|
227
|
-
|
|
229
|
+
async with self.VECTOR_DB_LOCK:
|
|
230
|
+
client = await self.get_connection()
|
|
228
231
|
|
|
229
|
-
|
|
230
|
-
|
|
232
|
+
if not await self.has_collection(collection_name):
|
|
233
|
+
await client.create_collection(
|
|
234
|
+
name=collection_name, metadata={"hnsw:space": "cosine"}
|
|
235
|
+
)
|
|
231
236
|
|
|
232
237
|
async def get_collection(self, collection_name: str) -> AsyncHttpClient:
|
|
233
238
|
"""
|
|
@@ -19,7 +19,7 @@ def create_vector_engine(
|
|
|
19
19
|
for each provider, raising an EnvironmentError if any are missing, or ImportError if the
|
|
20
20
|
ChromaDB package is not installed.
|
|
21
21
|
|
|
22
|
-
Supported providers include:
|
|
22
|
+
Supported providers include: pgvector, FalkorDB, ChromaDB, and
|
|
23
23
|
LanceDB.
|
|
24
24
|
|
|
25
25
|
Parameters:
|
|
@@ -30,7 +30,7 @@ def create_vector_engine(
|
|
|
30
30
|
providers.
|
|
31
31
|
- vector_db_key (str): The API key or access token for the vector database instance.
|
|
32
32
|
- vector_db_provider (str): The name of the vector database provider to use (e.g.,
|
|
33
|
-
'
|
|
33
|
+
'pgvector').
|
|
34
34
|
|
|
35
35
|
Returns:
|
|
36
36
|
--------
|
|
@@ -48,27 +48,7 @@ def create_vector_engine(
|
|
|
48
48
|
embedding_engine=embedding_engine,
|
|
49
49
|
)
|
|
50
50
|
|
|
51
|
-
if vector_db_provider == "
|
|
52
|
-
from .weaviate_db import WeaviateAdapter
|
|
53
|
-
|
|
54
|
-
if not (vector_db_url and vector_db_key):
|
|
55
|
-
raise EnvironmentError("Missing requred Weaviate credentials!")
|
|
56
|
-
|
|
57
|
-
return WeaviateAdapter(vector_db_url, vector_db_key, embedding_engine=embedding_engine)
|
|
58
|
-
|
|
59
|
-
elif vector_db_provider == "qdrant":
|
|
60
|
-
if not (vector_db_url and vector_db_key):
|
|
61
|
-
raise EnvironmentError("Missing requred Qdrant credentials!")
|
|
62
|
-
|
|
63
|
-
from .qdrant.QDrantAdapter import QDrantAdapter
|
|
64
|
-
|
|
65
|
-
return QDrantAdapter(
|
|
66
|
-
url=vector_db_url,
|
|
67
|
-
api_key=vector_db_key,
|
|
68
|
-
embedding_engine=embedding_engine,
|
|
69
|
-
)
|
|
70
|
-
|
|
71
|
-
elif vector_db_provider == "pgvector":
|
|
51
|
+
if vector_db_provider == "pgvector":
|
|
72
52
|
from cognee.infrastructure.databases.relational import get_relational_config
|
|
73
53
|
|
|
74
54
|
# Get configuration for postgres database
|
|
@@ -122,6 +102,34 @@ def create_vector_engine(
|
|
|
122
102
|
embedding_engine=embedding_engine,
|
|
123
103
|
)
|
|
124
104
|
|
|
105
|
+
elif vector_db_provider == "neptune_analytics":
|
|
106
|
+
try:
|
|
107
|
+
from langchain_aws import NeptuneAnalyticsGraph
|
|
108
|
+
except ImportError:
|
|
109
|
+
raise ImportError(
|
|
110
|
+
"langchain_aws is not installed. Please install it with 'pip install langchain_aws'"
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
if not vector_db_url:
|
|
114
|
+
raise EnvironmentError("Missing Neptune endpoint.")
|
|
115
|
+
|
|
116
|
+
from cognee.infrastructure.databases.hybrid.neptune_analytics.NeptuneAnalyticsAdapter import (
|
|
117
|
+
NeptuneAnalyticsAdapter,
|
|
118
|
+
NEPTUNE_ANALYTICS_ENDPOINT_URL,
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
if not vector_db_url.startswith(NEPTUNE_ANALYTICS_ENDPOINT_URL):
|
|
122
|
+
raise ValueError(
|
|
123
|
+
f"Neptune endpoint must have the format '{NEPTUNE_ANALYTICS_ENDPOINT_URL}<GRAPH_ID>'"
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
graph_identifier = vector_db_url.replace(NEPTUNE_ANALYTICS_ENDPOINT_URL, "")
|
|
127
|
+
|
|
128
|
+
return NeptuneAnalyticsAdapter(
|
|
129
|
+
graph_id=graph_identifier,
|
|
130
|
+
embedding_engine=embedding_engine,
|
|
131
|
+
)
|
|
132
|
+
|
|
125
133
|
else:
|
|
126
134
|
from .lancedb.LanceDBAdapter import LanceDBAdapter
|
|
127
135
|
|
|
@@ -5,7 +5,9 @@ import litellm
|
|
|
5
5
|
import os
|
|
6
6
|
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
|
|
7
7
|
from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
|
|
8
|
-
from cognee.infrastructure.llm.tokenizer.TikToken import
|
|
8
|
+
from cognee.infrastructure.llm.tokenizer.TikToken import (
|
|
9
|
+
TikTokenTokenizer,
|
|
10
|
+
)
|
|
9
11
|
|
|
10
12
|
litellm.set_verbose = False
|
|
11
13
|
logger = get_logger("FastembedEmbeddingEngine")
|
|
@@ -7,11 +7,19 @@ import litellm
|
|
|
7
7
|
import os
|
|
8
8
|
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
|
|
9
9
|
from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
|
|
10
|
-
from cognee.infrastructure.llm.tokenizer.Gemini import
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
from cognee.infrastructure.llm.tokenizer.
|
|
14
|
-
|
|
10
|
+
from cognee.infrastructure.llm.tokenizer.Gemini import (
|
|
11
|
+
GeminiTokenizer,
|
|
12
|
+
)
|
|
13
|
+
from cognee.infrastructure.llm.tokenizer.HuggingFace import (
|
|
14
|
+
HuggingFaceTokenizer,
|
|
15
|
+
)
|
|
16
|
+
from cognee.infrastructure.llm.tokenizer.Mistral import (
|
|
17
|
+
MistralTokenizer,
|
|
18
|
+
)
|
|
19
|
+
from cognee.infrastructure.llm.tokenizer.TikToken import (
|
|
20
|
+
TikTokenTokenizer,
|
|
21
|
+
)
|
|
22
|
+
from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import (
|
|
15
23
|
embedding_rate_limit_async,
|
|
16
24
|
embedding_sleep_and_retry_async,
|
|
17
25
|
)
|
|
@@ -177,7 +185,14 @@ class LiteLLMEmbeddingEngine(EmbeddingEngine):
|
|
|
177
185
|
elif "mistral" in self.provider.lower():
|
|
178
186
|
tokenizer = MistralTokenizer(model=model, max_tokens=self.max_tokens)
|
|
179
187
|
else:
|
|
180
|
-
|
|
188
|
+
try:
|
|
189
|
+
tokenizer = HuggingFaceTokenizer(
|
|
190
|
+
model=self.model.replace("hosted_vllm/", ""), max_tokens=self.max_tokens
|
|
191
|
+
)
|
|
192
|
+
except Exception as e:
|
|
193
|
+
logger.warning(f"Could not get tokenizer from HuggingFace due to: {e}")
|
|
194
|
+
logger.info("Switching to TikToken default tokenizer.")
|
|
195
|
+
tokenizer = TikTokenTokenizer(model=None, max_tokens=self.max_tokens)
|
|
181
196
|
|
|
182
197
|
logger.debug(f"Tokenizer loaded for model: {self.model}")
|
|
183
198
|
return tokenizer
|
|
@@ -7,9 +7,10 @@ import os
|
|
|
7
7
|
import aiohttp.http_exceptions
|
|
8
8
|
|
|
9
9
|
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
|
|
10
|
-
from cognee.infrastructure.
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
from cognee.infrastructure.llm.tokenizer.HuggingFace import (
|
|
11
|
+
HuggingFaceTokenizer,
|
|
12
|
+
)
|
|
13
|
+
from cognee.infrastructure.databases.vector.embeddings.embedding_rate_limiter import (
|
|
13
14
|
embedding_rate_limit_async,
|
|
14
15
|
embedding_sleep_and_retry_async,
|
|
15
16
|
)
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
from cognee.infrastructure.databases.vector.embeddings.config import get_embedding_config
|
|
2
|
-
from cognee.infrastructure.llm.config import
|
|
2
|
+
from cognee.infrastructure.llm.config import (
|
|
3
|
+
get_llm_config,
|
|
4
|
+
)
|
|
3
5
|
from .EmbeddingEngine import EmbeddingEngine
|
|
4
6
|
from functools import lru_cache
|
|
5
7
|
|