cognee 0.2.1.dev7__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee/api/client.py +44 -4
- cognee/api/health.py +332 -0
- cognee/api/v1/add/add.py +5 -2
- cognee/api/v1/add/routers/get_add_router.py +3 -0
- cognee/api/v1/cognify/code_graph_pipeline.py +3 -1
- cognee/api/v1/cognify/cognify.py +8 -0
- cognee/api/v1/cognify/routers/get_cognify_router.py +8 -1
- cognee/api/v1/config/config.py +3 -1
- cognee/api/v1/datasets/routers/get_datasets_router.py +2 -8
- cognee/api/v1/delete/delete.py +16 -12
- cognee/api/v1/responses/routers/get_responses_router.py +3 -1
- cognee/api/v1/search/search.py +10 -0
- cognee/api/v1/settings/routers/get_settings_router.py +0 -2
- cognee/base_config.py +1 -0
- cognee/eval_framework/evaluation/direct_llm_eval_adapter.py +5 -6
- cognee/infrastructure/databases/graph/config.py +2 -0
- cognee/infrastructure/databases/graph/get_graph_engine.py +58 -12
- cognee/infrastructure/databases/graph/graph_db_interface.py +15 -10
- cognee/infrastructure/databases/graph/kuzu/adapter.py +43 -16
- cognee/infrastructure/databases/graph/kuzu/kuzu_migrate.py +281 -0
- cognee/infrastructure/databases/graph/neo4j_driver/adapter.py +151 -77
- cognee/infrastructure/databases/graph/neptune_driver/__init__.py +15 -0
- cognee/infrastructure/databases/graph/neptune_driver/adapter.py +1427 -0
- cognee/infrastructure/databases/graph/neptune_driver/exceptions.py +115 -0
- cognee/infrastructure/databases/graph/neptune_driver/neptune_utils.py +224 -0
- cognee/infrastructure/databases/graph/networkx/adapter.py +3 -3
- cognee/infrastructure/databases/hybrid/neptune_analytics/NeptuneAnalyticsAdapter.py +449 -0
- cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py +11 -3
- cognee/infrastructure/databases/vector/chromadb/ChromaDBAdapter.py +8 -3
- cognee/infrastructure/databases/vector/create_vector_engine.py +31 -23
- cognee/infrastructure/databases/vector/embeddings/FastembedEmbeddingEngine.py +3 -1
- cognee/infrastructure/databases/vector/embeddings/LiteLLMEmbeddingEngine.py +21 -6
- cognee/infrastructure/databases/vector/embeddings/OllamaEmbeddingEngine.py +4 -3
- cognee/infrastructure/databases/vector/embeddings/get_embedding_engine.py +3 -1
- cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +22 -16
- cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +36 -34
- cognee/infrastructure/databases/vector/vector_db_interface.py +78 -7
- cognee/infrastructure/files/utils/get_data_file_path.py +39 -0
- cognee/infrastructure/files/utils/guess_file_type.py +2 -2
- cognee/infrastructure/files/utils/open_data_file.py +4 -23
- cognee/infrastructure/llm/LLMGateway.py +137 -0
- cognee/infrastructure/llm/__init__.py +14 -4
- cognee/infrastructure/llm/config.py +29 -1
- cognee/infrastructure/llm/prompts/answer_hotpot_question.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_hotpot_using_cognee_search.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_simple_question.txt +1 -1
- cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt +1 -1
- cognee/infrastructure/llm/prompts/categorize_categories.txt +1 -1
- cognee/infrastructure/llm/prompts/classify_content.txt +1 -1
- cognee/infrastructure/llm/prompts/context_for_question.txt +1 -1
- cognee/infrastructure/llm/prompts/graph_context_for_question.txt +1 -1
- cognee/infrastructure/llm/prompts/natural_language_retriever_system.txt +1 -1
- cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +1 -1
- cognee/infrastructure/llm/prompts/search_type_selector_prompt.txt +130 -0
- cognee/infrastructure/llm/prompts/summarize_code.txt +2 -2
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/__init__.py +57 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/async_client.py +533 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/config.py +94 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/globals.py +37 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/inlinedbaml.py +21 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/parser.py +131 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/runtime.py +266 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/stream_types.py +137 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/sync_client.py +550 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/tracing.py +26 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_builder.py +962 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/type_map.py +52 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_client/types.py +166 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_categories.baml +109 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extract_content_graph.baml +343 -0
- cognee/{modules/data → infrastructure/llm/structured_output_framework/baml/baml_src}/extraction/__init__.py +1 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/extract_summary.py +89 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/extraction/knowledge_graph/extract_content_graph.py +33 -0
- cognee/infrastructure/llm/structured_output_framework/baml/baml_src/generators.baml +18 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/__init__.py +3 -0
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/extraction/extract_categories.py +12 -0
- cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/extract_summary.py +16 -7
- cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/extract_content_graph.py +7 -6
- cognee/infrastructure/llm/{anthropic → structured_output_framework/litellm_instructor/llm/anthropic}/adapter.py +10 -4
- cognee/infrastructure/llm/{gemini → structured_output_framework/litellm_instructor/llm/gemini}/adapter.py +6 -5
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/generic_llm_api/__init__.py +0 -0
- cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/litellm_instructor/llm/generic_llm_api}/adapter.py +7 -3
- cognee/infrastructure/llm/{get_llm_client.py → structured_output_framework/litellm_instructor/llm/get_llm_client.py} +18 -6
- cognee/infrastructure/llm/{llm_interface.py → structured_output_framework/litellm_instructor/llm/llm_interface.py} +2 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/ollama/__init__.py +0 -0
- cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor/llm/ollama}/adapter.py +4 -2
- cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/openai/__init__.py +0 -0
- cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm/openai}/adapter.py +6 -4
- cognee/infrastructure/llm/{rate_limiter.py → structured_output_framework/litellm_instructor/llm/rate_limiter.py} +0 -5
- cognee/infrastructure/llm/tokenizer/Gemini/adapter.py +4 -2
- cognee/infrastructure/llm/tokenizer/TikToken/adapter.py +7 -3
- cognee/infrastructure/llm/tokenizer/__init__.py +4 -0
- cognee/infrastructure/llm/utils.py +3 -1
- cognee/infrastructure/loaders/LoaderEngine.py +156 -0
- cognee/infrastructure/loaders/LoaderInterface.py +73 -0
- cognee/infrastructure/loaders/__init__.py +18 -0
- cognee/infrastructure/loaders/core/__init__.py +7 -0
- cognee/infrastructure/loaders/core/audio_loader.py +98 -0
- cognee/infrastructure/loaders/core/image_loader.py +114 -0
- cognee/infrastructure/loaders/core/text_loader.py +90 -0
- cognee/infrastructure/loaders/create_loader_engine.py +32 -0
- cognee/infrastructure/loaders/external/__init__.py +22 -0
- cognee/infrastructure/loaders/external/pypdf_loader.py +96 -0
- cognee/infrastructure/loaders/external/unstructured_loader.py +127 -0
- cognee/infrastructure/loaders/get_loader_engine.py +18 -0
- cognee/infrastructure/loaders/supported_loaders.py +18 -0
- cognee/infrastructure/loaders/use_loader.py +21 -0
- cognee/infrastructure/loaders/utils/__init__.py +0 -0
- cognee/modules/data/methods/__init__.py +1 -0
- cognee/modules/data/methods/get_authorized_dataset.py +23 -0
- cognee/modules/data/models/Data.py +13 -3
- cognee/modules/data/processing/document_types/AudioDocument.py +2 -2
- cognee/modules/data/processing/document_types/ImageDocument.py +2 -2
- cognee/modules/data/processing/document_types/PdfDocument.py +4 -11
- cognee/modules/data/processing/document_types/UnstructuredDocument.py +2 -5
- cognee/modules/engine/utils/generate_edge_id.py +5 -0
- cognee/modules/graph/cognee_graph/CogneeGraph.py +45 -35
- cognee/modules/graph/methods/get_formatted_graph_data.py +8 -2
- cognee/modules/graph/utils/get_graph_from_model.py +93 -101
- cognee/modules/ingestion/data_types/TextData.py +8 -2
- cognee/modules/ingestion/save_data_to_file.py +1 -1
- cognee/modules/pipelines/exceptions/__init__.py +1 -0
- cognee/modules/pipelines/exceptions/exceptions.py +12 -0
- cognee/modules/pipelines/models/DataItemStatus.py +5 -0
- cognee/modules/pipelines/models/PipelineRunInfo.py +6 -0
- cognee/modules/pipelines/models/__init__.py +1 -0
- cognee/modules/pipelines/operations/pipeline.py +10 -2
- cognee/modules/pipelines/operations/run_tasks.py +252 -20
- cognee/modules/pipelines/operations/run_tasks_distributed.py +1 -1
- cognee/modules/retrieval/chunks_retriever.py +23 -1
- cognee/modules/retrieval/code_retriever.py +66 -9
- cognee/modules/retrieval/completion_retriever.py +11 -9
- cognee/modules/retrieval/context_providers/TripletSearchContextProvider.py +0 -2
- cognee/modules/retrieval/graph_completion_context_extension_retriever.py +0 -2
- cognee/modules/retrieval/graph_completion_cot_retriever.py +8 -9
- cognee/modules/retrieval/graph_completion_retriever.py +1 -1
- cognee/modules/retrieval/insights_retriever.py +4 -0
- cognee/modules/retrieval/natural_language_retriever.py +9 -15
- cognee/modules/retrieval/summaries_retriever.py +23 -1
- cognee/modules/retrieval/utils/brute_force_triplet_search.py +23 -4
- cognee/modules/retrieval/utils/completion.py +6 -9
- cognee/modules/retrieval/utils/description_to_codepart_search.py +2 -3
- cognee/modules/search/methods/search.py +5 -1
- cognee/modules/search/operations/__init__.py +1 -0
- cognee/modules/search/operations/select_search_type.py +42 -0
- cognee/modules/search/types/SearchType.py +1 -0
- cognee/modules/settings/get_settings.py +0 -8
- cognee/modules/settings/save_vector_db_config.py +1 -1
- cognee/shared/data_models.py +3 -1
- cognee/shared/logging_utils.py +0 -5
- cognee/tasks/chunk_naive_llm_classifier/chunk_naive_llm_classifier.py +2 -2
- cognee/tasks/documents/extract_chunks_from_documents.py +10 -12
- cognee/tasks/entity_completion/entity_extractors/llm_entity_extractor.py +4 -6
- cognee/tasks/graph/cascade_extract/utils/extract_content_nodes_and_relationship_names.py +4 -6
- cognee/tasks/graph/cascade_extract/utils/extract_edge_triplets.py +6 -7
- cognee/tasks/graph/cascade_extract/utils/extract_nodes.py +4 -7
- cognee/tasks/graph/extract_graph_from_code.py +3 -2
- cognee/tasks/graph/extract_graph_from_data.py +4 -3
- cognee/tasks/graph/infer_data_ontology.py +5 -6
- cognee/tasks/ingestion/data_item_to_text_file.py +79 -0
- cognee/tasks/ingestion/ingest_data.py +91 -61
- cognee/tasks/ingestion/resolve_data_directories.py +3 -0
- cognee/tasks/repo_processor/get_repo_file_dependencies.py +3 -0
- cognee/tasks/storage/index_data_points.py +1 -1
- cognee/tasks/storage/index_graph_edges.py +4 -1
- cognee/tasks/summarization/summarize_code.py +2 -3
- cognee/tasks/summarization/summarize_text.py +3 -2
- cognee/tests/test_cognee_server_start.py +12 -7
- cognee/tests/test_deduplication.py +2 -2
- cognee/tests/test_deletion.py +58 -17
- cognee/tests/test_graph_visualization_permissions.py +161 -0
- cognee/tests/test_neptune_analytics_graph.py +309 -0
- cognee/tests/test_neptune_analytics_hybrid.py +176 -0
- cognee/tests/{test_weaviate.py → test_neptune_analytics_vector.py} +86 -11
- cognee/tests/test_pgvector.py +5 -5
- cognee/tests/test_s3.py +1 -6
- cognee/tests/unit/infrastructure/databases/test_rate_limiter.py +11 -10
- cognee/tests/unit/infrastructure/databases/vector/__init__.py +0 -0
- cognee/tests/unit/infrastructure/mock_embedding_engine.py +1 -1
- cognee/tests/unit/infrastructure/test_embedding_rate_limiting_realistic.py +5 -5
- cognee/tests/unit/infrastructure/test_rate_limiting_realistic.py +6 -4
- cognee/tests/unit/infrastructure/test_rate_limiting_retry.py +1 -1
- cognee/tests/unit/interfaces/graph/get_graph_from_model_unit_test.py +61 -3
- cognee/tests/unit/modules/retrieval/graph_completion_retriever_test.py +84 -9
- cognee/tests/unit/modules/search/search_methods_test.py +55 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/METADATA +13 -9
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/RECORD +203 -164
- cognee/infrastructure/databases/vector/pinecone/adapter.py +0 -8
- cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +0 -514
- cognee/infrastructure/databases/vector/qdrant/__init__.py +0 -2
- cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +0 -527
- cognee/infrastructure/databases/vector/weaviate_db/__init__.py +0 -1
- cognee/modules/data/extraction/extract_categories.py +0 -14
- cognee/tests/test_qdrant.py +0 -99
- distributed/Dockerfile +0 -34
- distributed/app.py +0 -4
- distributed/entrypoint.py +0 -71
- distributed/entrypoint.sh +0 -5
- distributed/modal_image.py +0 -11
- distributed/queues.py +0 -5
- distributed/tasks/queued_add_data_points.py +0 -13
- distributed/tasks/queued_add_edges.py +0 -13
- distributed/tasks/queued_add_nodes.py +0 -13
- distributed/test.py +0 -28
- distributed/utils.py +0 -19
- distributed/workers/data_point_saving_worker.py +0 -93
- distributed/workers/graph_saving_worker.py +0 -104
- /cognee/infrastructure/databases/{graph/memgraph → hybrid/neptune_analytics}/__init__.py +0 -0
- /cognee/infrastructure/{llm → databases/vector/embeddings}/embedding_rate_limiter.py +0 -0
- /cognee/infrastructure/{databases/vector/pinecone → llm/structured_output_framework}/__init__.py +0 -0
- /cognee/infrastructure/llm/{anthropic → structured_output_framework/baml/baml_src}/__init__.py +0 -0
- /cognee/infrastructure/llm/{gemini/__init__.py → structured_output_framework/baml/baml_src/extraction/extract_categories.py} +0 -0
- /cognee/infrastructure/llm/{generic_llm_api → structured_output_framework/baml/baml_src/extraction/knowledge_graph}/__init__.py +0 -0
- /cognee/infrastructure/llm/{ollama → structured_output_framework/litellm_instructor}/__init__.py +0 -0
- /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/knowledge_graph/__init__.py +0 -0
- /cognee/{modules/data → infrastructure/llm/structured_output_framework/litellm_instructor}/extraction/texts.json +0 -0
- /cognee/infrastructure/llm/{openai → structured_output_framework/litellm_instructor/llm}/__init__.py +0 -0
- {distributed → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic}/__init__.py +0 -0
- {distributed/tasks → cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/gemini}/__init__.py +0 -0
- /cognee/modules/data/{extraction/knowledge_graph → methods}/add_model_class_to_graph.py +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/WHEEL +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
- {cognee-0.2.1.dev7.dist-info → cognee-0.2.2.dev1.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -1,514 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from typing import Dict, List, Optional
|
|
3
|
-
from qdrant_client import AsyncQdrantClient, models
|
|
4
|
-
|
|
5
|
-
from cognee.shared.logging_utils import get_logger
|
|
6
|
-
from cognee.infrastructure.engine.utils import parse_id
|
|
7
|
-
from cognee.exceptions import InvalidValueError
|
|
8
|
-
from cognee.infrastructure.engine import DataPoint
|
|
9
|
-
from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError
|
|
10
|
-
from cognee.infrastructure.databases.vector.models.ScoredResult import ScoredResult
|
|
11
|
-
|
|
12
|
-
from ..embeddings.EmbeddingEngine import EmbeddingEngine
|
|
13
|
-
from ..vector_db_interface import VectorDBInterface
|
|
14
|
-
|
|
15
|
-
logger = get_logger("QDrantAdapter")
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class IndexSchema(DataPoint):
|
|
19
|
-
"""
|
|
20
|
-
Represents a schema for indexing where each data point contains a text field.
|
|
21
|
-
|
|
22
|
-
This class inherits from DataPoint and defines a text attribute as well as metadata
|
|
23
|
-
containing index fields used for indexing operations.
|
|
24
|
-
"""
|
|
25
|
-
|
|
26
|
-
text: str
|
|
27
|
-
|
|
28
|
-
metadata: dict = {"index_fields": ["text"]}
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
# class CollectionConfig(BaseModel, extra = "forbid"):
|
|
32
|
-
# vector_config: Dict[str, models.VectorParams] = Field(..., description="Vectors configuration" )
|
|
33
|
-
# hnsw_config: Optional[models.HnswConfig] = Field(default = None, description="HNSW vector index configuration")
|
|
34
|
-
# optimizers_config: Optional[models.OptimizersConfig] = Field(default = None, description="Optimizers configuration")
|
|
35
|
-
# quantization_config: Optional[models.QuantizationConfig] = Field(default = None, description="Quantization configuration")
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def create_hnsw_config(hnsw_config: Dict):
|
|
39
|
-
"""
|
|
40
|
-
Create HNSW configuration.
|
|
41
|
-
|
|
42
|
-
This function returns an HNSW configuration object if the provided configuration is not
|
|
43
|
-
None, otherwise it returns None.
|
|
44
|
-
|
|
45
|
-
Parameters:
|
|
46
|
-
-----------
|
|
47
|
-
|
|
48
|
-
- hnsw_config (Dict): A dictionary containing HNSW configuration parameters.
|
|
49
|
-
|
|
50
|
-
Returns:
|
|
51
|
-
--------
|
|
52
|
-
|
|
53
|
-
An instance of models.HnswConfig if hnsw_config is not None, otherwise None.
|
|
54
|
-
"""
|
|
55
|
-
if hnsw_config is not None:
|
|
56
|
-
return models.HnswConfig()
|
|
57
|
-
return None
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def create_optimizers_config(optimizers_config: Dict):
|
|
61
|
-
"""
|
|
62
|
-
Create and return an OptimizersConfig instance if the input configuration is provided.
|
|
63
|
-
|
|
64
|
-
This function checks if the given optimizers configuration is not None. If valid, it
|
|
65
|
-
initializes and returns a new instance of the OptimizersConfig class from the models
|
|
66
|
-
module. If the configuration is None, it returns None instead.
|
|
67
|
-
|
|
68
|
-
Parameters:
|
|
69
|
-
-----------
|
|
70
|
-
|
|
71
|
-
- optimizers_config (Dict): A dictionary containing optimizer configuration
|
|
72
|
-
settings.
|
|
73
|
-
|
|
74
|
-
Returns:
|
|
75
|
-
--------
|
|
76
|
-
|
|
77
|
-
Returns an instance of OptimizersConfig if optimizers_config is provided; otherwise,
|
|
78
|
-
returns None.
|
|
79
|
-
"""
|
|
80
|
-
if optimizers_config is not None:
|
|
81
|
-
return models.OptimizersConfig()
|
|
82
|
-
return None
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def create_quantization_config(quantization_config: Dict):
|
|
86
|
-
"""
|
|
87
|
-
Create a quantization configuration based on the provided settings.
|
|
88
|
-
|
|
89
|
-
This function generates an instance of `QuantizationConfig` if the provided
|
|
90
|
-
`quantization_config` is not None. If it is None, the function returns None.
|
|
91
|
-
|
|
92
|
-
Parameters:
|
|
93
|
-
-----------
|
|
94
|
-
|
|
95
|
-
- quantization_config (Dict): A dictionary containing the quantization configuration
|
|
96
|
-
settings.
|
|
97
|
-
|
|
98
|
-
Returns:
|
|
99
|
-
--------
|
|
100
|
-
|
|
101
|
-
An instance of `QuantizationConfig` if `quantization_config` is provided; otherwise,
|
|
102
|
-
returns None.
|
|
103
|
-
"""
|
|
104
|
-
if quantization_config is not None:
|
|
105
|
-
return models.QuantizationConfig()
|
|
106
|
-
return None
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
class QDrantAdapter(VectorDBInterface):
|
|
110
|
-
"""
|
|
111
|
-
Adapt to the Qdrant vector database interface.
|
|
112
|
-
|
|
113
|
-
Public methods:
|
|
114
|
-
- get_qdrant_client
|
|
115
|
-
- embed_data
|
|
116
|
-
- has_collection
|
|
117
|
-
- create_collection
|
|
118
|
-
- create_data_points
|
|
119
|
-
- create_vector_index
|
|
120
|
-
- index_data_points
|
|
121
|
-
- retrieve
|
|
122
|
-
- search
|
|
123
|
-
- batch_search
|
|
124
|
-
- delete_data_points
|
|
125
|
-
- prune
|
|
126
|
-
"""
|
|
127
|
-
|
|
128
|
-
name = "Qdrant"
|
|
129
|
-
url: str = None
|
|
130
|
-
api_key: str = None
|
|
131
|
-
qdrant_path: str = None
|
|
132
|
-
|
|
133
|
-
def __init__(self, url, api_key, embedding_engine: EmbeddingEngine, qdrant_path=None):
|
|
134
|
-
self.embedding_engine = embedding_engine
|
|
135
|
-
|
|
136
|
-
if qdrant_path is not None:
|
|
137
|
-
self.qdrant_path = qdrant_path
|
|
138
|
-
else:
|
|
139
|
-
self.url = url
|
|
140
|
-
self.api_key = api_key
|
|
141
|
-
|
|
142
|
-
def get_qdrant_client(self) -> AsyncQdrantClient:
|
|
143
|
-
"""
|
|
144
|
-
Retrieve an instance of AsyncQdrantClient configured with the appropriate
|
|
145
|
-
settings based on the instance's attributes.
|
|
146
|
-
|
|
147
|
-
Returns an instance of AsyncQdrantClient configured to connect to the database.
|
|
148
|
-
|
|
149
|
-
Returns:
|
|
150
|
-
--------
|
|
151
|
-
- AsyncQdrantClient: An instance of AsyncQdrantClient configured for database
|
|
152
|
-
operations.
|
|
153
|
-
"""
|
|
154
|
-
is_prod = os.getenv("ENV").lower() == "prod"
|
|
155
|
-
|
|
156
|
-
if self.qdrant_path is not None:
|
|
157
|
-
return AsyncQdrantClient(path=self.qdrant_path, port=6333, https=is_prod)
|
|
158
|
-
elif self.url is not None:
|
|
159
|
-
return AsyncQdrantClient(url=self.url, api_key=self.api_key, port=6333, https=is_prod)
|
|
160
|
-
|
|
161
|
-
return AsyncQdrantClient(location=":memory:")
|
|
162
|
-
|
|
163
|
-
async def embed_data(self, data: List[str]) -> List[float]:
|
|
164
|
-
"""
|
|
165
|
-
Embed a list of text data into vector representations asynchronously.
|
|
166
|
-
|
|
167
|
-
Parameters:
|
|
168
|
-
-----------
|
|
169
|
-
|
|
170
|
-
- data (List[str]): A list of strings containing the text data to be embedded.
|
|
171
|
-
|
|
172
|
-
Returns:
|
|
173
|
-
--------
|
|
174
|
-
|
|
175
|
-
- List[float]: A list of floating-point vectors representing the embedded text data.
|
|
176
|
-
"""
|
|
177
|
-
return await self.embedding_engine.embed_text(data)
|
|
178
|
-
|
|
179
|
-
async def has_collection(self, collection_name: str) -> bool:
|
|
180
|
-
"""
|
|
181
|
-
Check if a specified collection exists in the Qdrant database asynchronously.
|
|
182
|
-
|
|
183
|
-
Parameters:
|
|
184
|
-
-----------
|
|
185
|
-
|
|
186
|
-
- collection_name (str): The name of the collection to check for existence.
|
|
187
|
-
|
|
188
|
-
Returns:
|
|
189
|
-
--------
|
|
190
|
-
|
|
191
|
-
- bool: True if the specified collection exists, False otherwise.
|
|
192
|
-
"""
|
|
193
|
-
client = self.get_qdrant_client()
|
|
194
|
-
result = await client.collection_exists(collection_name)
|
|
195
|
-
await client.close()
|
|
196
|
-
return result
|
|
197
|
-
|
|
198
|
-
async def create_collection(
|
|
199
|
-
self,
|
|
200
|
-
collection_name: str,
|
|
201
|
-
payload_schema=None,
|
|
202
|
-
):
|
|
203
|
-
"""
|
|
204
|
-
Create a new collection in the Qdrant database if it does not already exist.
|
|
205
|
-
|
|
206
|
-
If the collection already exists, this operation has no effect.
|
|
207
|
-
|
|
208
|
-
Parameters:
|
|
209
|
-
-----------
|
|
210
|
-
|
|
211
|
-
- collection_name (str): The name of the collection to create.
|
|
212
|
-
- payload_schema: Optional schema for the payload. Defaults to None. (default None)
|
|
213
|
-
"""
|
|
214
|
-
client = self.get_qdrant_client()
|
|
215
|
-
|
|
216
|
-
if not await client.collection_exists(collection_name):
|
|
217
|
-
await client.create_collection(
|
|
218
|
-
collection_name=collection_name,
|
|
219
|
-
vectors_config={
|
|
220
|
-
"text": models.VectorParams(
|
|
221
|
-
size=self.embedding_engine.get_vector_size(), distance="Cosine"
|
|
222
|
-
)
|
|
223
|
-
},
|
|
224
|
-
)
|
|
225
|
-
|
|
226
|
-
await client.close()
|
|
227
|
-
|
|
228
|
-
async def create_data_points(self, collection_name: str, data_points: List[DataPoint]):
|
|
229
|
-
"""
|
|
230
|
-
Create and upload data points to a specified collection in the database.
|
|
231
|
-
|
|
232
|
-
Raises CollectionNotFoundError if the collection does not exist.
|
|
233
|
-
|
|
234
|
-
Parameters:
|
|
235
|
-
-----------
|
|
236
|
-
|
|
237
|
-
- collection_name (str): The name of the collection to which data points will be
|
|
238
|
-
uploaded.
|
|
239
|
-
- data_points (List[DataPoint]): A list of DataPoint objects to be uploaded.
|
|
240
|
-
|
|
241
|
-
Returns:
|
|
242
|
-
--------
|
|
243
|
-
|
|
244
|
-
None if the operation is successful; raises exceptions on error.
|
|
245
|
-
"""
|
|
246
|
-
from qdrant_client.http.exceptions import UnexpectedResponse
|
|
247
|
-
|
|
248
|
-
client = self.get_qdrant_client()
|
|
249
|
-
|
|
250
|
-
data_vectors = await self.embed_data(
|
|
251
|
-
[DataPoint.get_embeddable_data(data_point) for data_point in data_points]
|
|
252
|
-
)
|
|
253
|
-
|
|
254
|
-
def convert_to_qdrant_point(data_point: DataPoint):
|
|
255
|
-
"""
|
|
256
|
-
Convert a DataPoint object into the format expected by Qdrant for upload.
|
|
257
|
-
|
|
258
|
-
Parameters:
|
|
259
|
-
-----------
|
|
260
|
-
|
|
261
|
-
- data_point (DataPoint): The DataPoint object to convert.
|
|
262
|
-
|
|
263
|
-
Returns:
|
|
264
|
-
--------
|
|
265
|
-
|
|
266
|
-
None; performs an operation without returning a value.
|
|
267
|
-
"""
|
|
268
|
-
return models.PointStruct(
|
|
269
|
-
id=str(data_point.id),
|
|
270
|
-
payload=data_point.model_dump(),
|
|
271
|
-
vector={"text": data_vectors[data_points.index(data_point)]},
|
|
272
|
-
)
|
|
273
|
-
|
|
274
|
-
points = [convert_to_qdrant_point(point) for point in data_points]
|
|
275
|
-
|
|
276
|
-
try:
|
|
277
|
-
client.upload_points(collection_name=collection_name, points=points)
|
|
278
|
-
except UnexpectedResponse as error:
|
|
279
|
-
if "Collection not found" in str(error):
|
|
280
|
-
raise CollectionNotFoundError(
|
|
281
|
-
message=f"Collection {collection_name} not found!"
|
|
282
|
-
) from error
|
|
283
|
-
else:
|
|
284
|
-
raise error
|
|
285
|
-
except Exception as error:
|
|
286
|
-
logger.error("Error uploading data points to Qdrant: %s", str(error))
|
|
287
|
-
raise error
|
|
288
|
-
finally:
|
|
289
|
-
await client.close()
|
|
290
|
-
|
|
291
|
-
async def create_vector_index(self, index_name: str, index_property_name: str):
|
|
292
|
-
"""
|
|
293
|
-
Create a vector index for a specified property name.
|
|
294
|
-
|
|
295
|
-
This is essentially a wrapper around create_collection, which allows for more
|
|
296
|
-
flexibility
|
|
297
|
-
in index naming.
|
|
298
|
-
|
|
299
|
-
Parameters:
|
|
300
|
-
-----------
|
|
301
|
-
|
|
302
|
-
- index_name (str): The base name for the index to be created.
|
|
303
|
-
- index_property_name (str): The property name that will be part of the index name.
|
|
304
|
-
"""
|
|
305
|
-
await self.create_collection(f"{index_name}_{index_property_name}")
|
|
306
|
-
|
|
307
|
-
async def index_data_points(
|
|
308
|
-
self, index_name: str, index_property_name: str, data_points: list[DataPoint]
|
|
309
|
-
):
|
|
310
|
-
"""
|
|
311
|
-
Index data points into a specific collection based on provided metadata.
|
|
312
|
-
|
|
313
|
-
Transforms DataPoint objects into an appropriate format and uploads them.
|
|
314
|
-
|
|
315
|
-
Parameters:
|
|
316
|
-
-----------
|
|
317
|
-
|
|
318
|
-
- index_name (str): The base name for the index used for naming the collection.
|
|
319
|
-
- index_property_name (str): The property name used for naming the collection.
|
|
320
|
-
- data_points (list[DataPoint]): A list of DataPoint objects to index.
|
|
321
|
-
"""
|
|
322
|
-
await self.create_data_points(
|
|
323
|
-
f"{index_name}_{index_property_name}",
|
|
324
|
-
[
|
|
325
|
-
IndexSchema(
|
|
326
|
-
id=data_point.id,
|
|
327
|
-
text=getattr(data_point, data_point.metadata["index_fields"][0]),
|
|
328
|
-
)
|
|
329
|
-
for data_point in data_points
|
|
330
|
-
],
|
|
331
|
-
)
|
|
332
|
-
|
|
333
|
-
async def retrieve(self, collection_name: str, data_point_ids: list[str]):
|
|
334
|
-
"""
|
|
335
|
-
Retrieve data points from a specified collection based on their IDs.
|
|
336
|
-
|
|
337
|
-
Returns the data corresponding to the provided IDs from the collection.
|
|
338
|
-
|
|
339
|
-
Parameters:
|
|
340
|
-
-----------
|
|
341
|
-
|
|
342
|
-
- collection_name (str): The name of the collection to retrieve from.
|
|
343
|
-
- data_point_ids (list[str]): A list of IDs of the data points to retrieve.
|
|
344
|
-
|
|
345
|
-
Returns:
|
|
346
|
-
--------
|
|
347
|
-
|
|
348
|
-
The retrieved data points, including payloads for each ID.
|
|
349
|
-
"""
|
|
350
|
-
client = self.get_qdrant_client()
|
|
351
|
-
results = await client.retrieve(collection_name, data_point_ids, with_payload=True)
|
|
352
|
-
await client.close()
|
|
353
|
-
return results
|
|
354
|
-
|
|
355
|
-
async def search(
    self,
    collection_name: str,
    query_text: Optional[str] = None,
    query_vector: Optional[List[float]] = None,
    limit: int = 15,
    with_vector: bool = False,
) -> List[ScoredResult]:
    """
    Search for data points in a collection based on either a textual query or a vector
    query.

    Raises InvalidValueError if both query_text and query_vector are None.

    Returns a list of scored results that match the search criteria.

    Parameters:
    -----------

        - collection_name (str): The name of the collection to search within.
        - query_text (Optional[str]): The text to be used in the search query; optional if
          query_vector is provided. (default None)
        - query_vector (Optional[List[float]]): The vector to be used in the search query;
          optional if query_text is provided. (default None)
        - limit (int): The maximum number of results to return; 0 (or a negative value)
          means "return the whole collection". (default 15)
        - with_vector (bool): Indicates whether to return vector data along with results;
          defaults to False. (default False)

    Returns:
    --------

        - List[ScoredResult]: A list of ScoredResult objects representing the results of the
          search; scores are converted to a distance-like value via ``1 - result.score``.
    """
    if query_text is None and query_vector is None:
        raise InvalidValueError(message="One of query_text or query_vector must be provided!")

    if not await self.has_collection(collection_name):
        return []

    # Embed the text only when no pre-computed vector was supplied.
    if query_vector is None:
        query_vector = (await self.embed_data([query_text]))[0]

    # Acquire the client BEFORE the try block so the finally clause can never
    # reference an unbound name if client construction itself raises.
    client = self.get_qdrant_client()
    try:
        if limit <= 0:
            # A non-positive limit means "no limit": use the collection size.
            # (Previously `collection_size` was only assigned for limit == 0,
            # so a negative limit raised NameError.)
            collection_size = await client.count(collection_name=collection_name)
            effective_limit = collection_size.count
        else:
            effective_limit = limit

        results = await client.search(
            collection_name=collection_name,
            # query_vector is guaranteed non-None here (embedded above), so the
            # old re-embedding fallback was dead code.
            query_vector=models.NamedVector(name="text", vector=query_vector),
            limit=effective_limit,
            with_vectors=with_vector,
        )

        return [
            ScoredResult(
                id=parse_id(result.id),
                payload={
                    **result.payload,
                    "id": parse_id(result.id),
                },
                # Qdrant returns a similarity; expose a distance-like score.
                score=1 - result.score,
            )
            for result in results
        ]
    finally:
        # Close exactly once, on success and failure alike (the original closed
        # inside the try AND in finally, closing the client twice).
        await client.close()
|
|
432
|
-
|
|
433
|
-
async def batch_search(
    self,
    collection_name: str,
    query_texts: List[str],
    limit: Optional[int] = None,
    with_vectors: bool = False,
):
    """
    Perform a batch search in a specified collection using multiple query texts.

    Returns the results of the search for each query, filtering for results with a score
    higher than 0.9.

    Parameters:
    -----------

        - collection_name (str): The name of the collection to search in.
        - query_texts (List[str]): A list of query texts to search for in the collection.
        - limit (Optional[int]): The maximum number of results to return for each search
          request; can be None. (default None)
        - with_vectors (bool): Indicates whether to include vector data in the results;
          defaults to False. (default False)

    Returns:
    --------

        A list of lists containing the filtered search results for each query text.
    """
    vectors = await self.embed_data(query_texts)

    # Generate dynamic search requests based on the provided embeddings.
    requests = [
        models.SearchRequest(
            vector=models.NamedVector(name="text", vector=vector),
            limit=limit,
            with_vector=with_vectors,
        )
        for vector in vectors
    ]

    client = self.get_qdrant_client()
    try:
        # Perform batch search with the dynamically generated requests.
        results = await client.search_batch(collection_name=collection_name, requests=requests)
    finally:
        # Release the client even if search_batch raises (the original leaked it).
        await client.close()

    # Materialize lists: the original returned lazy one-shot `filter` objects,
    # which can only be iterated once and don't support len()/indexing.
    return [
        [result for result in result_group if result.score > 0.9]
        for result_group in results
    ]
|
|
482
|
-
|
|
483
|
-
async def delete_data_points(self, collection_name: str, data_point_ids: list[str]):
    """
    Delete specific data points from a specified collection based on their IDs.

    Parameters:
    -----------

        - collection_name (str): The name of the collection from which to delete the data
          points.
        - data_point_ids (list[str]): The list of IDs of data points to be deleted.

    Returns:
    --------

        The result of the delete operation from the database.
    """
    client = self.get_qdrant_client()
    try:
        results = await client.delete(collection_name, data_point_ids)
    finally:
        # Always release the client, consistent with the other adapter methods
        # (the original never closed it, leaking the connection).
        await client.close()
    return results
|
|
502
|
-
|
|
503
|
-
async def prune(self):
    """
    Remove all collections from the Qdrant database asynchronously.
    """
    client = self.get_qdrant_client()
    try:
        response = await client.get_collections()

        for collection in response.collections:
            await client.delete_collection(collection.name)
    finally:
        # Release the client even if a delete fails partway through
        # (the original only closed it on the fully-successful path).
        await client.close()
|