graphrag-vectors 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphrag_vectors-3.0.0/.gitignore +65 -0
- graphrag_vectors-3.0.0/LICENSE +21 -0
- graphrag_vectors-3.0.0/PKG-INFO +136 -0
- graphrag_vectors-3.0.0/README.md +111 -0
- graphrag_vectors-3.0.0/graphrag_vectors/__init__.py +34 -0
- graphrag_vectors-3.0.0/graphrag_vectors/azure_ai_search.py +173 -0
- graphrag_vectors-3.0.0/graphrag_vectors/cosmosdb.py +244 -0
- graphrag_vectors-3.0.0/graphrag_vectors/index_schema.py +56 -0
- graphrag_vectors-3.0.0/graphrag_vectors/lancedb.py +128 -0
- graphrag_vectors-3.0.0/graphrag_vectors/types.py +8 -0
- graphrag_vectors-3.0.0/graphrag_vectors/vector_store.py +81 -0
- graphrag_vectors-3.0.0/graphrag_vectors/vector_store_config.py +53 -0
- graphrag_vectors-3.0.0/graphrag_vectors/vector_store_factory.py +99 -0
- graphrag_vectors-3.0.0/graphrag_vectors/vector_store_type.py +14 -0
- graphrag_vectors-3.0.0/pyproject.toml +49 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Python Artifacts
|
|
2
|
+
python/*/lib/
|
|
3
|
+
dist/
|
|
4
|
+
build/
|
|
5
|
+
*.egg-info/
|
|
6
|
+
|
|
7
|
+
# Test Output
|
|
8
|
+
.coverage
|
|
9
|
+
coverage/
|
|
10
|
+
licenses.txt
|
|
11
|
+
examples_notebooks/*/data
|
|
12
|
+
tests/fixtures/cache
|
|
13
|
+
tests/fixtures/*/cache
|
|
14
|
+
tests/fixtures/*/output
|
|
15
|
+
output/lancedb
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Random
|
|
19
|
+
.DS_Store
|
|
20
|
+
*.log*
|
|
21
|
+
.venv
|
|
22
|
+
venv/
|
|
23
|
+
.conda
|
|
24
|
+
.tmp
|
|
25
|
+
packages/graphrag-llm/notebooks/metrics
|
|
26
|
+
packages/graphrag-llm/notebooks/cache
|
|
27
|
+
|
|
28
|
+
.env
|
|
29
|
+
build.zip
|
|
30
|
+
|
|
31
|
+
.turbo
|
|
32
|
+
|
|
33
|
+
__pycache__
|
|
34
|
+
|
|
35
|
+
.pipeline
|
|
36
|
+
|
|
37
|
+
# Azurite
|
|
38
|
+
temp_azurite/
|
|
39
|
+
__azurite*.json
|
|
40
|
+
__blobstorage*.json
|
|
41
|
+
__blobstorage__/
|
|
42
|
+
|
|
43
|
+
# Getting started example
|
|
44
|
+
ragtest/
|
|
45
|
+
.ragtest/
|
|
46
|
+
.pipelines
|
|
47
|
+
.pipeline
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# mkdocs
|
|
51
|
+
site/
|
|
52
|
+
|
|
53
|
+
# Docs migration
|
|
54
|
+
docsite/
|
|
55
|
+
.yarn/
|
|
56
|
+
.pnp*
|
|
57
|
+
|
|
58
|
+
# PyCharm
|
|
59
|
+
.idea/
|
|
60
|
+
|
|
61
|
+
# Jupyter notebook
|
|
62
|
+
.ipynb_checkpoints/
|
|
63
|
+
|
|
64
|
+
# Root build assets
|
|
65
|
+
packages/*/LICENSE
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) Microsoft Corporation.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: graphrag-vectors
|
|
3
|
+
Version: 3.0.0
|
|
4
|
+
Summary: GraphRAG vector store package.
|
|
5
|
+
Project-URL: Source, https://github.com/microsoft/graphrag
|
|
6
|
+
Author: Mónica Carvajal
|
|
7
|
+
Author-email: Alonso Guevara Fernández <alonsog@microsoft.com>, Andrés Morales Esquivel <andresmor@microsoft.com>, Chris Trevino <chtrevin@microsoft.com>, David Tittsworth <datittsw@microsoft.com>, Dayenne de Souza <ddesouza@microsoft.com>, Derek Worthen <deworthe@microsoft.com>, Gaudy Blanco Meneses <gaudyb@microsoft.com>, Ha Trinh <trinhha@microsoft.com>, Jonathan Larson <jolarso@microsoft.com>, Josh Bradley <joshbradley@microsoft.com>, Kate Lytvynets <kalytv@microsoft.com>, Kenny Zhang <zhangken@microsoft.com>, Nathan Evans <naevans@microsoft.com>, Rodrigo Racanicci <rracanicci@microsoft.com>, Sarah Smith <smithsarah@microsoft.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Requires-Python: <3.14,>=3.11
|
|
15
|
+
Requires-Dist: azure-core~=1.32
|
|
16
|
+
Requires-Dist: azure-cosmos~=4.9
|
|
17
|
+
Requires-Dist: azure-identity~=1.19
|
|
18
|
+
Requires-Dist: azure-search-documents~=11.6
|
|
19
|
+
Requires-Dist: graphrag-common==3.0.0
|
|
20
|
+
Requires-Dist: lancedb~=0.24.1
|
|
21
|
+
Requires-Dist: numpy~=2.1
|
|
22
|
+
Requires-Dist: pyarrow~=22.0
|
|
23
|
+
Requires-Dist: pydantic~=2.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# GraphRAG Vectors
|
|
27
|
+
|
|
28
|
+
This package provides vector store implementations for GraphRAG with support for multiple backends including LanceDB, Azure AI Search, and Azure Cosmos DB. It offers both a convenient configuration-driven API and direct factory access for creating and managing vector stores with flexible index schema definitions.
|
|
29
|
+
|
|
30
|
+
## Basic usage with the utility function (recommended)
|
|
31
|
+
|
|
32
|
+
This demonstrates the recommended approach to create a vector store using the create_vector_store convenience function with configuration objects that specify the store type and index schema. The example shows setting up a LanceDB vector store with a defined index configuration, then connecting to it and creating the index for vector operations.
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from graphrag_vectors import (
|
|
36
|
+
create_vector_store,
|
|
37
|
+
VectorStoreConfig,
|
|
38
|
+
IndexSchema,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Create a vector store using the convenience function
|
|
42
|
+
store_config = VectorStoreConfig(
|
|
43
|
+
type="lancedb",
|
|
44
|
+
db_uri="lance"
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
schema_config = IndexSchema(
|
|
48
|
+
index_name="my_index",
|
|
49
|
+
vector_size=1536,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
vector_store = create_vector_store(
|
|
53
|
+
config=store_config,
|
|
54
|
+
index_schema=schema_config,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
vector_store.connect()
|
|
58
|
+
vector_store.create_index()
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Basic usage implementing the factory directly
|
|
62
|
+
|
|
63
|
+
This example shows a different approach to create vector stores by directly using the vector_store_factory with enum types and dictionary-based initialization arguments. This method provides more direct control over the factory creation process while bypassing the convenience function layer.
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from graphrag_vectors import (
|
|
67
|
+
VectorStoreFactory,
|
|
68
|
+
vector_store_factory,
|
|
69
|
+
VectorStoreType,
|
|
70
|
+
IndexSchema,
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Create a vector store using the factory
|
|
74
|
+
schema_config = IndexSchema(
|
|
75
|
+
index_name="my_index",
|
|
76
|
+
vector_size=1536,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
vector_store = vector_store_factory.create(
|
|
80
|
+
VectorStoreType.LanceDB,
|
|
81
|
+
{
|
|
82
|
+
"index_schema": schema_config,
|
|
83
|
+
"db_uri": "./lancedb"
|
|
84
|
+
}
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
vector_store.connect()
|
|
88
|
+
vector_store.create_index()
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Supported Vector Stores
|
|
92
|
+
|
|
93
|
+
- **LanceDB**: Local vector database
|
|
94
|
+
- **Azure AI Search**: Azure's managed search service with vector capabilities
|
|
95
|
+
- **Azure Cosmos DB**: Azure's NoSQL database with vector search support
|
|
96
|
+
|
|
97
|
+
## Custom Vector Store
|
|
98
|
+
|
|
99
|
+
You can register custom vector store implementations:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from graphrag_vectors import VectorStore, VectorStoreConfig, register_vector_store, create_vector_store
|
|
103
|
+
|
|
104
|
+
class MyCustomVectorStore(VectorStore):
|
|
105
|
+
def __init__(self, my_param):
|
|
106
|
+
self.my_param = my_param
|
|
107
|
+
|
|
108
|
+
def connect(self):
|
|
109
|
+
# Implementation
|
|
110
|
+
pass
|
|
111
|
+
|
|
112
|
+
def create_index(self):
|
|
113
|
+
# Implementation
|
|
114
|
+
pass
|
|
115
|
+
|
|
116
|
+
# ... implement other required methods
|
|
117
|
+
|
|
118
|
+
# Register your custom implementation
|
|
119
|
+
register_vector_store("my_custom_store", MyCustomVectorStore)
|
|
120
|
+
|
|
121
|
+
# Use your custom vector store
|
|
122
|
+
config = VectorStoreConfig(
|
|
123
|
+
type="my_custom_store",
|
|
124
|
+
my_param="something"
|
|
125
|
+
)
|
|
126
|
+
custom_store = create_vector_store(
|
|
127
|
+
config=config,
|
|
128
|
+
index_schema=schema_config,
|
|
129
|
+
)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Configuration
|
|
133
|
+
|
|
134
|
+
Vector stores are configured using:
|
|
135
|
+
- `VectorStoreConfig`: baseline parameters for the store
|
|
136
|
+
- `IndexSchema`: Schema configuration for the specific index to create/connect to (index name, field names, vector size)
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# GraphRAG Vectors
|
|
2
|
+
|
|
3
|
+
This package provides vector store implementations for GraphRAG with support for multiple backends including LanceDB, Azure AI Search, and Azure Cosmos DB. It offers both a convenient configuration-driven API and direct factory access for creating and managing vector stores with flexible index schema definitions.
|
|
4
|
+
|
|
5
|
+
## Basic usage with the utility function (recommended)
|
|
6
|
+
|
|
7
|
+
This demonstrates the recommended approach to create a vector store using the create_vector_store convenience function with configuration objects that specify the store type and index schema. The example shows setting up a LanceDB vector store with a defined index configuration, then connecting to it and creating the index for vector operations.
|
|
8
|
+
|
|
9
|
+
```python
|
|
10
|
+
from graphrag_vectors import (
|
|
11
|
+
create_vector_store,
|
|
12
|
+
VectorStoreConfig,
|
|
13
|
+
IndexSchema,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
# Create a vector store using the convenience function
|
|
17
|
+
store_config = VectorStoreConfig(
|
|
18
|
+
type="lancedb",
|
|
19
|
+
db_uri="lance"
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
schema_config = IndexSchema(
|
|
23
|
+
index_name="my_index",
|
|
24
|
+
vector_size=1536,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
vector_store = create_vector_store(
|
|
28
|
+
config=store_config,
|
|
29
|
+
index_schema=schema_config,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
vector_store.connect()
|
|
33
|
+
vector_store.create_index()
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Basic usage implementing the factory directly
|
|
37
|
+
|
|
38
|
+
This example shows a different approach to create vector stores by directly using the vector_store_factory with enum types and dictionary-based initialization arguments. This method provides more direct control over the factory creation process while bypassing the convenience function layer.
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from graphrag_vectors import (
|
|
42
|
+
VectorStoreFactory,
|
|
43
|
+
vector_store_factory,
|
|
44
|
+
VectorStoreType,
|
|
45
|
+
IndexSchema,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# Create a vector store using the factory
|
|
49
|
+
schema_config = IndexSchema(
|
|
50
|
+
index_name="my_index",
|
|
51
|
+
vector_size=1536,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
vector_store = vector_store_factory.create(
|
|
55
|
+
VectorStoreType.LanceDB,
|
|
56
|
+
{
|
|
57
|
+
"index_schema": schema_config,
|
|
58
|
+
"db_uri": "./lancedb"
|
|
59
|
+
}
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
vector_store.connect()
|
|
63
|
+
vector_store.create_index()
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Supported Vector Stores
|
|
67
|
+
|
|
68
|
+
- **LanceDB**: Local vector database
|
|
69
|
+
- **Azure AI Search**: Azure's managed search service with vector capabilities
|
|
70
|
+
- **Azure Cosmos DB**: Azure's NoSQL database with vector search support
|
|
71
|
+
|
|
72
|
+
## Custom Vector Store
|
|
73
|
+
|
|
74
|
+
You can register custom vector store implementations:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from graphrag_vectors import VectorStore, VectorStoreConfig, register_vector_store, create_vector_store
|
|
78
|
+
|
|
79
|
+
class MyCustomVectorStore(VectorStore):
|
|
80
|
+
def __init__(self, my_param):
|
|
81
|
+
self.my_param = my_param
|
|
82
|
+
|
|
83
|
+
def connect(self):
|
|
84
|
+
# Implementation
|
|
85
|
+
pass
|
|
86
|
+
|
|
87
|
+
def create_index(self):
|
|
88
|
+
# Implementation
|
|
89
|
+
pass
|
|
90
|
+
|
|
91
|
+
# ... implement other required methods
|
|
92
|
+
|
|
93
|
+
# Register your custom implementation
|
|
94
|
+
register_vector_store("my_custom_store", MyCustomVectorStore)
|
|
95
|
+
|
|
96
|
+
# Use your custom vector store
|
|
97
|
+
config = VectorStoreConfig(
|
|
98
|
+
type="my_custom_store",
|
|
99
|
+
my_param="something"
|
|
100
|
+
)
|
|
101
|
+
custom_store = create_vector_store(
|
|
102
|
+
config=config,
|
|
103
|
+
index_schema=schema_config,
|
|
104
|
+
)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Configuration
|
|
108
|
+
|
|
109
|
+
Vector stores are configured using:
|
|
110
|
+
- `VectorStoreConfig`: baseline parameters for the store
|
|
111
|
+
- `IndexSchema`: Schema configuration for the specific index to create/connect to (index name, field names, vector size)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""GraphRAG vector store implementations."""
|
|
5
|
+
|
|
6
|
+
from graphrag_vectors.index_schema import IndexSchema
|
|
7
|
+
from graphrag_vectors.types import TextEmbedder
|
|
8
|
+
from graphrag_vectors.vector_store import (
|
|
9
|
+
VectorStore,
|
|
10
|
+
VectorStoreDocument,
|
|
11
|
+
VectorStoreSearchResult,
|
|
12
|
+
)
|
|
13
|
+
from graphrag_vectors.vector_store_config import VectorStoreConfig
|
|
14
|
+
from graphrag_vectors.vector_store_factory import (
|
|
15
|
+
VectorStoreFactory,
|
|
16
|
+
create_vector_store,
|
|
17
|
+
register_vector_store,
|
|
18
|
+
vector_store_factory,
|
|
19
|
+
)
|
|
20
|
+
from graphrag_vectors.vector_store_type import VectorStoreType
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"IndexSchema",
|
|
24
|
+
"TextEmbedder",
|
|
25
|
+
"VectorStore",
|
|
26
|
+
"VectorStoreConfig",
|
|
27
|
+
"VectorStoreDocument",
|
|
28
|
+
"VectorStoreFactory",
|
|
29
|
+
"VectorStoreSearchResult",
|
|
30
|
+
"VectorStoreType",
|
|
31
|
+
"create_vector_store",
|
|
32
|
+
"register_vector_store",
|
|
33
|
+
"vector_store_factory",
|
|
34
|
+
]
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""A package containing the Azure AI Search vector store implementation."""
|
|
5
|
+
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from azure.core.credentials import AzureKeyCredential
|
|
9
|
+
from azure.identity import DefaultAzureCredential
|
|
10
|
+
from azure.search.documents import SearchClient
|
|
11
|
+
from azure.search.documents.indexes import SearchIndexClient
|
|
12
|
+
from azure.search.documents.indexes.models import (
|
|
13
|
+
HnswAlgorithmConfiguration,
|
|
14
|
+
HnswParameters,
|
|
15
|
+
SearchField,
|
|
16
|
+
SearchFieldDataType,
|
|
17
|
+
SearchIndex,
|
|
18
|
+
SimpleField,
|
|
19
|
+
VectorSearch,
|
|
20
|
+
VectorSearchAlgorithmMetric,
|
|
21
|
+
VectorSearchProfile,
|
|
22
|
+
)
|
|
23
|
+
from azure.search.documents.models import VectorizedQuery
|
|
24
|
+
|
|
25
|
+
from graphrag_vectors.vector_store import (
|
|
26
|
+
VectorStore,
|
|
27
|
+
VectorStoreDocument,
|
|
28
|
+
VectorStoreSearchResult,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class AzureAISearchVectorStore(VectorStore):
|
|
33
|
+
"""Azure AI Search vector storage implementation."""
|
|
34
|
+
|
|
35
|
+
index_client: SearchIndexClient
|
|
36
|
+
|
|
37
|
+
def __init__(
|
|
38
|
+
self,
|
|
39
|
+
url: str,
|
|
40
|
+
api_key: str | None = None,
|
|
41
|
+
audience: str | None = None,
|
|
42
|
+
vector_search_profile_name: str = "vectorSearchProfile",
|
|
43
|
+
**kwargs: Any,
|
|
44
|
+
):
|
|
45
|
+
super().__init__(**kwargs)
|
|
46
|
+
if not url:
|
|
47
|
+
msg = "url must be provided for Azure AI Search."
|
|
48
|
+
raise ValueError(msg)
|
|
49
|
+
self.url = url
|
|
50
|
+
self.api_key = api_key
|
|
51
|
+
self.audience = audience
|
|
52
|
+
self.vector_search_profile_name = vector_search_profile_name
|
|
53
|
+
|
|
54
|
+
def connect(self) -> Any:
|
|
55
|
+
"""Connect to AI search vector storage."""
|
|
56
|
+
audience_arg = (
|
|
57
|
+
{"audience": self.audience} if self.audience and not self.api_key else {}
|
|
58
|
+
)
|
|
59
|
+
self.db_connection = SearchClient(
|
|
60
|
+
endpoint=self.url,
|
|
61
|
+
index_name=self.index_name,
|
|
62
|
+
credential=(
|
|
63
|
+
AzureKeyCredential(self.api_key)
|
|
64
|
+
if self.api_key
|
|
65
|
+
else DefaultAzureCredential()
|
|
66
|
+
),
|
|
67
|
+
**audience_arg,
|
|
68
|
+
)
|
|
69
|
+
self.index_client = SearchIndexClient(
|
|
70
|
+
endpoint=self.url,
|
|
71
|
+
credential=(
|
|
72
|
+
AzureKeyCredential(self.api_key)
|
|
73
|
+
if self.api_key
|
|
74
|
+
else DefaultAzureCredential()
|
|
75
|
+
),
|
|
76
|
+
**audience_arg,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
def create_index(self) -> None:
|
|
80
|
+
"""Load documents into an Azure AI Search index."""
|
|
81
|
+
if (
|
|
82
|
+
self.index_name is not None
|
|
83
|
+
and self.index_name in self.index_client.list_index_names()
|
|
84
|
+
):
|
|
85
|
+
self.index_client.delete_index(self.index_name)
|
|
86
|
+
|
|
87
|
+
# Configure vector search profile
|
|
88
|
+
vector_search = VectorSearch(
|
|
89
|
+
algorithms=[
|
|
90
|
+
HnswAlgorithmConfiguration(
|
|
91
|
+
name="HnswAlg",
|
|
92
|
+
parameters=HnswParameters(
|
|
93
|
+
metric=VectorSearchAlgorithmMetric.COSINE
|
|
94
|
+
),
|
|
95
|
+
)
|
|
96
|
+
],
|
|
97
|
+
profiles=[
|
|
98
|
+
VectorSearchProfile(
|
|
99
|
+
name=self.vector_search_profile_name,
|
|
100
|
+
algorithm_configuration_name="HnswAlg",
|
|
101
|
+
)
|
|
102
|
+
],
|
|
103
|
+
)
|
|
104
|
+
# Configure the index
|
|
105
|
+
index = SearchIndex(
|
|
106
|
+
name=self.index_name,
|
|
107
|
+
fields=[
|
|
108
|
+
SimpleField(
|
|
109
|
+
name=self.id_field,
|
|
110
|
+
type=SearchFieldDataType.String,
|
|
111
|
+
key=True,
|
|
112
|
+
),
|
|
113
|
+
SearchField(
|
|
114
|
+
name=self.vector_field,
|
|
115
|
+
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
|
|
116
|
+
searchable=True,
|
|
117
|
+
hidden=False, # DRIFT needs to return the vector for client-side similarity
|
|
118
|
+
vector_search_dimensions=self.vector_size,
|
|
119
|
+
vector_search_profile_name=self.vector_search_profile_name,
|
|
120
|
+
),
|
|
121
|
+
],
|
|
122
|
+
vector_search=vector_search,
|
|
123
|
+
)
|
|
124
|
+
self.index_client.create_or_update_index(
|
|
125
|
+
index,
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
def load_documents(self, documents: list[VectorStoreDocument]) -> None:
|
|
129
|
+
"""Load documents into an Azure AI Search index."""
|
|
130
|
+
batch = [
|
|
131
|
+
{
|
|
132
|
+
self.id_field: doc.id,
|
|
133
|
+
self.vector_field: doc.vector,
|
|
134
|
+
}
|
|
135
|
+
for doc in documents
|
|
136
|
+
if doc.vector is not None
|
|
137
|
+
]
|
|
138
|
+
|
|
139
|
+
if len(batch) > 0:
|
|
140
|
+
self.db_connection.upload_documents(batch)
|
|
141
|
+
|
|
142
|
+
def similarity_search_by_vector(
|
|
143
|
+
self, query_embedding: list[float], k: int = 10
|
|
144
|
+
) -> list[VectorStoreSearchResult]:
|
|
145
|
+
"""Perform a vector-based similarity search."""
|
|
146
|
+
vectorized_query = VectorizedQuery(
|
|
147
|
+
vector=query_embedding, k_nearest_neighbors=k, fields=self.vector_field
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
response = self.db_connection.search(
|
|
151
|
+
vector_queries=[vectorized_query],
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
return [
|
|
155
|
+
VectorStoreSearchResult(
|
|
156
|
+
document=VectorStoreDocument(
|
|
157
|
+
id=doc.get(self.id_field, ""),
|
|
158
|
+
vector=doc.get(self.vector_field, []),
|
|
159
|
+
),
|
|
160
|
+
# Cosine similarity between 0.333 and 1.000
|
|
161
|
+
# https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking#scores-in-a-hybrid-search-results
|
|
162
|
+
score=doc["@search.score"],
|
|
163
|
+
)
|
|
164
|
+
for doc in response
|
|
165
|
+
]
|
|
166
|
+
|
|
167
|
+
def search_by_id(self, id: str) -> VectorStoreDocument:
|
|
168
|
+
"""Search for a document by id."""
|
|
169
|
+
response = self.db_connection.get_document(id)
|
|
170
|
+
return VectorStoreDocument(
|
|
171
|
+
id=response.get(self.id_field, ""),
|
|
172
|
+
vector=response.get(self.vector_field, []),
|
|
173
|
+
)
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""A package containing the CosmosDB vector store implementation."""
|
|
5
|
+
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from azure.cosmos import ContainerProxy, CosmosClient, DatabaseProxy
|
|
9
|
+
from azure.cosmos.exceptions import CosmosHttpResponseError
|
|
10
|
+
from azure.cosmos.partition_key import PartitionKey
|
|
11
|
+
from azure.identity import DefaultAzureCredential
|
|
12
|
+
|
|
13
|
+
from graphrag_vectors.vector_store import (
|
|
14
|
+
VectorStore,
|
|
15
|
+
VectorStoreDocument,
|
|
16
|
+
VectorStoreSearchResult,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CosmosDBVectorStore(VectorStore):
|
|
21
|
+
"""Azure CosmosDB vector storage implementation."""
|
|
22
|
+
|
|
23
|
+
_cosmos_client: CosmosClient
|
|
24
|
+
_database_client: DatabaseProxy
|
|
25
|
+
_container_client: ContainerProxy
|
|
26
|
+
|
|
27
|
+
def __init__(
|
|
28
|
+
self,
|
|
29
|
+
database_name: str,
|
|
30
|
+
connection_string: str | None = None,
|
|
31
|
+
url: str | None = None,
|
|
32
|
+
**kwargs,
|
|
33
|
+
):
|
|
34
|
+
super().__init__(**kwargs)
|
|
35
|
+
if self.id_field != "id":
|
|
36
|
+
msg = "CosmosDB requires the id_field to be 'id'."
|
|
37
|
+
raise ValueError(msg)
|
|
38
|
+
if not connection_string and not url:
|
|
39
|
+
msg = "Either connection_string or url must be provided for CosmosDB."
|
|
40
|
+
raise ValueError(msg)
|
|
41
|
+
|
|
42
|
+
self.database_name = database_name
|
|
43
|
+
self.connection_string = connection_string
|
|
44
|
+
self.url = url
|
|
45
|
+
|
|
46
|
+
def connect(self) -> Any:
|
|
47
|
+
"""Connect to CosmosDB vector storage."""
|
|
48
|
+
if self.connection_string:
|
|
49
|
+
self._cosmos_client = CosmosClient.from_connection_string(
|
|
50
|
+
self.connection_string
|
|
51
|
+
)
|
|
52
|
+
else:
|
|
53
|
+
self._cosmos_client = CosmosClient(
|
|
54
|
+
url=self.url, credential=DefaultAzureCredential()
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
self._create_database()
|
|
58
|
+
self._create_container()
|
|
59
|
+
|
|
60
|
+
def _create_database(self) -> None:
|
|
61
|
+
"""Create the database if it doesn't exist."""
|
|
62
|
+
self._cosmos_client.create_database_if_not_exists(id=self.database_name)
|
|
63
|
+
self._database_client = self._cosmos_client.get_database_client(
|
|
64
|
+
self.database_name
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
def _delete_database(self) -> None:
|
|
68
|
+
"""Delete the database if it exists."""
|
|
69
|
+
if self._database_exists():
|
|
70
|
+
self._cosmos_client.delete_database(self.database_name)
|
|
71
|
+
|
|
72
|
+
def _database_exists(self) -> bool:
|
|
73
|
+
"""Check if the database exists."""
|
|
74
|
+
existing_database_names = [
|
|
75
|
+
database["id"] for database in self._cosmos_client.list_databases()
|
|
76
|
+
]
|
|
77
|
+
return self.database_name in existing_database_names
|
|
78
|
+
|
|
79
|
+
def _create_container(self) -> None:
|
|
80
|
+
"""Create the container if it doesn't exist."""
|
|
81
|
+
partition_key = PartitionKey(path=f"/{self.id_field}", kind="Hash")
|
|
82
|
+
|
|
83
|
+
# Define the container vector policy
|
|
84
|
+
vector_embedding_policy = {
|
|
85
|
+
"vectorEmbeddings": [
|
|
86
|
+
{
|
|
87
|
+
"path": f"/{self.vector_field}",
|
|
88
|
+
"dataType": "float32",
|
|
89
|
+
"distanceFunction": "cosine",
|
|
90
|
+
"dimensions": self.vector_size,
|
|
91
|
+
}
|
|
92
|
+
]
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
# Define the vector indexing policy
|
|
96
|
+
indexing_policy = {
|
|
97
|
+
"indexingMode": "consistent",
|
|
98
|
+
"automatic": True,
|
|
99
|
+
"includedPaths": [{"path": "/*"}],
|
|
100
|
+
"excludedPaths": [
|
|
101
|
+
{"path": "/_etag/?"},
|
|
102
|
+
{"path": f"/{self.vector_field}/*"},
|
|
103
|
+
],
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
# Currently, the CosmosDB emulator does not support the diskANN policy.
|
|
107
|
+
try:
|
|
108
|
+
# First try with the standard diskANN policy
|
|
109
|
+
indexing_policy["vectorIndexes"] = [
|
|
110
|
+
{"path": f"/{self.vector_field}", "type": "diskANN"}
|
|
111
|
+
]
|
|
112
|
+
|
|
113
|
+
# Create the container and container client
|
|
114
|
+
self._database_client.create_container_if_not_exists(
|
|
115
|
+
id=self.index_name,
|
|
116
|
+
partition_key=partition_key,
|
|
117
|
+
indexing_policy=indexing_policy,
|
|
118
|
+
vector_embedding_policy=vector_embedding_policy,
|
|
119
|
+
)
|
|
120
|
+
except CosmosHttpResponseError:
|
|
121
|
+
# If diskANN fails (likely in emulator), retry without vector indexes
|
|
122
|
+
indexing_policy.pop("vectorIndexes", None)
|
|
123
|
+
|
|
124
|
+
# Create the container with compatible indexing policy
|
|
125
|
+
self._database_client.create_container_if_not_exists(
|
|
126
|
+
id=self.index_name,
|
|
127
|
+
partition_key=partition_key,
|
|
128
|
+
indexing_policy=indexing_policy,
|
|
129
|
+
vector_embedding_policy=vector_embedding_policy,
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
self._container_client = self._database_client.get_container_client(
|
|
133
|
+
self.index_name
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
def _delete_container(self) -> None:
|
|
137
|
+
"""Delete the vector store container in the database if it exists."""
|
|
138
|
+
if self._container_exists():
|
|
139
|
+
self._database_client.delete_container(self.index_name)
|
|
140
|
+
|
|
141
|
+
def _container_exists(self) -> bool:
|
|
142
|
+
"""Check if the container name exists in the database."""
|
|
143
|
+
existing_container_names = [
|
|
144
|
+
container["id"] for container in self._database_client.list_containers()
|
|
145
|
+
]
|
|
146
|
+
return self.index_name in existing_container_names
|
|
147
|
+
|
|
148
|
+
def create_index(self) -> None:
|
|
149
|
+
"""Load documents into CosmosDB."""
|
|
150
|
+
# Create a CosmosDB container on overwrite
|
|
151
|
+
self._delete_container()
|
|
152
|
+
self._create_container()
|
|
153
|
+
|
|
154
|
+
if self._container_client is None:
|
|
155
|
+
msg = "Container client is not initialized."
|
|
156
|
+
raise ValueError(msg)
|
|
157
|
+
|
|
158
|
+
def load_documents(self, documents: list[VectorStoreDocument]) -> None:
|
|
159
|
+
"""Load documents into CosmosDB."""
|
|
160
|
+
# Upload documents to CosmosDB
|
|
161
|
+
for doc in documents:
|
|
162
|
+
if doc.vector is not None:
|
|
163
|
+
doc_json = {
|
|
164
|
+
self.id_field: doc.id,
|
|
165
|
+
self.vector_field: doc.vector,
|
|
166
|
+
}
|
|
167
|
+
self._container_client.upsert_item(doc_json)
|
|
168
|
+
|
|
169
|
+
def similarity_search_by_vector(
|
|
170
|
+
self, query_embedding: list[float], k: int = 10
|
|
171
|
+
) -> list[VectorStoreSearchResult]:
|
|
172
|
+
"""Perform a vector-based similarity search."""
|
|
173
|
+
if self._container_client is None:
|
|
174
|
+
msg = "Container client is not initialized."
|
|
175
|
+
raise ValueError(msg)
|
|
176
|
+
|
|
177
|
+
try:
|
|
178
|
+
query = f"SELECT TOP {k} c.{self.id_field}, c.{self.vector_field}, VectorDistance(c.{self.vector_field}, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.{self.vector_field}, @embedding)" # noqa: S608
|
|
179
|
+
query_params = [{"name": "@embedding", "value": query_embedding}]
|
|
180
|
+
items = list(
|
|
181
|
+
self._container_client.query_items(
|
|
182
|
+
query=query,
|
|
183
|
+
parameters=query_params,
|
|
184
|
+
enable_cross_partition_query=True,
|
|
185
|
+
)
|
|
186
|
+
)
|
|
187
|
+
except (CosmosHttpResponseError, ValueError):
|
|
188
|
+
# Currently, the CosmosDB emulator does not support the VectorDistance function.
|
|
189
|
+
# For emulator or test environments - fetch all items and calculate distance locally
|
|
190
|
+
query = f"SELECT c.{self.id_field}, c.{self.vector_field} FROM c" # noqa: S608
|
|
191
|
+
items = list(
|
|
192
|
+
self._container_client.query_items(
|
|
193
|
+
query=query,
|
|
194
|
+
enable_cross_partition_query=True,
|
|
195
|
+
)
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
# Calculate cosine similarity locally (1 - cosine distance)
|
|
199
|
+
from numpy import dot
|
|
200
|
+
from numpy.linalg import norm
|
|
201
|
+
|
|
202
|
+
def cosine_similarity(a, b):
|
|
203
|
+
if norm(a) * norm(b) == 0:
|
|
204
|
+
return 0.0
|
|
205
|
+
return dot(a, b) / (norm(a) * norm(b))
|
|
206
|
+
|
|
207
|
+
# Calculate scores for all items
|
|
208
|
+
for item in items:
|
|
209
|
+
item_vector = item.get(self.vector_field, [])
|
|
210
|
+
similarity = cosine_similarity(query_embedding, item_vector)
|
|
211
|
+
item["SimilarityScore"] = similarity
|
|
212
|
+
|
|
213
|
+
# Sort by similarity score (higher is better) and take top k
|
|
214
|
+
items = sorted(
|
|
215
|
+
items, key=lambda x: x.get("SimilarityScore", 0.0), reverse=True
|
|
216
|
+
)[:k]
|
|
217
|
+
|
|
218
|
+
return [
|
|
219
|
+
VectorStoreSearchResult(
|
|
220
|
+
document=VectorStoreDocument(
|
|
221
|
+
id=item.get(self.id_field, ""),
|
|
222
|
+
vector=item.get(self.vector_field, []),
|
|
223
|
+
),
|
|
224
|
+
score=item.get("SimilarityScore", 0.0),
|
|
225
|
+
)
|
|
226
|
+
for item in items
|
|
227
|
+
]
|
|
228
|
+
|
|
229
|
+
def search_by_id(self, id: str) -> VectorStoreDocument:
|
|
230
|
+
"""Search for a document by id."""
|
|
231
|
+
if self._container_client is None:
|
|
232
|
+
msg = "Container client is not initialized."
|
|
233
|
+
raise ValueError(msg)
|
|
234
|
+
|
|
235
|
+
item = self._container_client.read_item(item=id, partition_key=id)
|
|
236
|
+
return VectorStoreDocument(
|
|
237
|
+
id=item.get(self.id_field, ""),
|
|
238
|
+
vector=item.get(self.vector_field, []),
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
def clear(self) -> None:
|
|
242
|
+
"""Clear the vector store."""
|
|
243
|
+
self._delete_container()
|
|
244
|
+
self._delete_database()
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""Parameterization settings for the default configuration."""
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field, model_validator
|
|
9
|
+
|
|
10
|
+
# Default embedding dimensionality used when a schema does not specify one.
DEFAULT_VECTOR_SIZE: int = 3072

# A safe identifier: a letter or underscore followed by letters, digits, or underscores.
VALID_IDENTIFIER_REGEX = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")


def is_valid_field_name(field: str) -> bool:
    """Check if a field name is valid for CosmosDB.

    Uses ``fullmatch`` rather than ``match``: with ``match``, the ``$`` anchor
    also matches just before a trailing newline, so a value such as ``"id\\n"``
    would be accepted and could leak an unexpected character into query text.
    ``fullmatch`` requires the entire string to be a valid identifier.
    """
    return VALID_IDENTIFIER_REGEX.fullmatch(field) is not None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class IndexSchema(BaseModel):
    """Schema settings for a single vector store index (table).

    Field names are validated after parsing so that unsafe values can never be
    interpolated into store-specific query strings.
    """

    index_name: str = Field(
        default="vector_index",
        description="The index name to use.",
    )

    id_field: str = Field(
        default="id",
        description="The ID field to use.",
    )

    vector_field: str = Field(
        default="vector",
        description="The vector field to use.",
    )

    vector_size: int = Field(
        default=DEFAULT_VECTOR_SIZE,
        description="The vector size to use.",
    )

    def _validate_schema(self) -> None:
        """Reject field names that are not safe identifiers."""
        for name in (self.id_field, self.vector_field):
            if is_valid_field_name(name):
                continue
            msg = f"Unsafe or invalid field name: {name}"
            raise ValueError(msg)

    @model_validator(mode="after")
    def _validate_model(self):
        """Run schema validation once all fields are populated."""
        self._validate_schema()
        return self
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""The LanceDB vector storage implementation package."""
|
|
5
|
+
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import lancedb
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
|
|
12
|
+
from graphrag_vectors.vector_store import (
|
|
13
|
+
VectorStore,
|
|
14
|
+
VectorStoreDocument,
|
|
15
|
+
VectorStoreSearchResult,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class LanceDBVectorStore(VectorStore):
    """LanceDB vector storage implementation.

    Stores documents in a LanceDB table whose columns are the configured
    ``id_field`` (string) and ``vector_field`` (fixed-size float32 list).
    """

    def __init__(self, db_uri: str = "lancedb", **kwargs: Any):
        """Create the store; remaining kwargs configure the VectorStore base.

        Args
        ----
        - db_uri: str
            Path/URI of the LanceDB database directory.
        """
        super().__init__(**kwargs)
        self.db_uri = db_uri

    def connect(self) -> Any:
        """Connect to the vector storage."""
        self.db_connection = lancedb.connect(self.db_uri)

        # Reopen an existing table if one with the configured name is present;
        # otherwise document_collection stays unset until create_index() runs.
        if self.index_name and self.index_name in self.db_connection.table_names():
            self.document_collection = self.db_connection.open_table(self.index_name)

    def create_index(self) -> None:
        """Create index.

        LanceDB needs data to establish a schema, so a single zero-filled
        "__DUMMY__" row is written first (removed again by load_documents).
        """
        dummy_vector = np.zeros(self.vector_size, dtype=np.float32)
        flat_array = pa.array(dummy_vector, type=pa.float32())
        vector_column = pa.FixedSizeListArray.from_arrays(flat_array, self.vector_size)

        data = pa.table({
            self.id_field: pa.array(["__DUMMY__"], type=pa.string()),
            self.vector_field: vector_column,
        })

        # "overwrite" replaces any pre-existing table with the same name.
        self.document_collection = self.db_connection.create_table(
            self.index_name if self.index_name else "",
            data=data,
            mode="overwrite",
            schema=data.schema,
        )

        # Step 5: Create index now that schema exists
        self.document_collection.create_index(
            vector_column_name=self.vector_field, index_type="IVF_FLAT"
        )

    def load_documents(self, documents: list[VectorStoreDocument]) -> None:
        """Load documents into vector storage."""
        # Remove the schema-bootstrap row written by create_index().
        self.document_collection.delete(f"{self.id_field} = '__DUMMY__'")

        # Step 1: Prepare data columns manually
        ids = []
        vectors = []

        for document in documents:
            # NOTE(review): vector_size is re-derived from each non-empty
            # document in turn; mixed-length vectors in one batch would make
            # the FixedSizeListArray construction below fail — confirm callers
            # always pass uniformly-sized embeddings.
            self.vector_size = (
                len(document.vector) if document.vector else self.vector_size
            )
            # Skip documents with no vector or a mismatched length.
            if document.vector is not None and len(document.vector) == self.vector_size:
                ids.append(document.id)
                vectors.append(np.array(document.vector, dtype=np.float32))

        # Step 2: Handle empty case
        if len(ids) == 0:
            data = None
        else:
            # Step 3: Flatten the vectors and build FixedSizeListArray manually
            flat_vector = np.concatenate(vectors).astype(np.float32)
            flat_array = pa.array(flat_vector, type=pa.float32())
            vector_column = pa.FixedSizeListArray.from_arrays(
                flat_array, self.vector_size
            )

            # Step 4: Create PyArrow table (let schema be inferred)
            data = pa.table({
                self.id_field: pa.array(ids, type=pa.string()),
                self.vector_field: vector_column,
            })

        if data:
            self.document_collection.add(data)

    def similarity_search_by_vector(
        self, query_embedding: list[float] | np.ndarray, k: int = 10
    ) -> list[VectorStoreSearchResult]:
        """Perform a vector-based similarity search.

        Args
        ----
        - query_embedding: list[float] | np.ndarray
            The query vector; coerced to float32 before searching.
        - k: int
            Maximum number of results to return.
        """
        query_embedding = np.array(query_embedding, dtype=np.float32)

        docs = (
            self.document_collection
            .search(query=query_embedding, vector_column_name=self.vector_field)
            .limit(k)
            .to_list()
        )
        return [
            VectorStoreSearchResult(
                document=VectorStoreDocument(
                    id=doc[self.id_field],
                    vector=doc[self.vector_field],
                ),
                # "_distance" is the distance LanceDB reports for each hit;
                # NOTE(review): the 1 - |distance| conversion assumes distances
                # in [0, 1] (e.g. cosine) — confirm against the table's metric.
                score=1 - abs(float(doc["_distance"])),
            )
            for doc in docs
        ]

    def search_by_id(self, id: str) -> VectorStoreDocument:
        """Search for a document by id.

        Returns a document with vector=None when no row matches.
        """
        # prefilter=True applies the id predicate before any vector scan.
        doc = (
            self.document_collection
            .search()
            .where(f"{self.id_field} == '{id}'", prefilter=True)
            .to_list()
        )
        if doc:
            return VectorStoreDocument(
                id=doc[0][self.id_field],
                vector=doc[0][self.vector_field],
            )
        return VectorStoreDocument(id=id, vector=None)
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""Base classes for vector stores."""
|
|
5
|
+
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from graphrag_vectors.types import TextEmbedder
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class VectorStoreDocument:
    """A document that is stored in vector storage."""

    id: str | int
    """unique id for the document"""

    # Embedding for the document; None when no vector is available.
    vector: list[float] | None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class VectorStoreSearchResult:
    """A vector storage search result (one hit of a similarity query)."""

    document: VectorStoreDocument
    """Document that was found."""

    score: float
    """Similarity score between -1 and 1. Higher is more similar."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class VectorStore(ABC):
    """Abstract base class for vector storage data-access implementations.

    Concrete stores share a common index layout: an index name, an id column,
    and a fixed-size vector column.
    """

    def __init__(
        self,
        index_name: str = "vector_index",
        id_field: str = "id",
        vector_field: str = "vector",
        vector_size: int = 3072,
        **kwargs: Any,
    ):
        """Record the index layout; extra kwargs are accepted for subclasses."""
        self.index_name = index_name
        self.id_field = id_field
        self.vector_field = vector_field
        self.vector_size = vector_size

    @abstractmethod
    def connect(self) -> None:
        """Open a connection to the underlying vector storage."""

    @abstractmethod
    def create_index(self) -> None:
        """Create the index (table/collection) in the store."""

    @abstractmethod
    def load_documents(self, documents: list[VectorStoreDocument]) -> None:
        """Write the given documents into the vector store."""

    @abstractmethod
    def similarity_search_by_vector(
        self, query_embedding: list[float], k: int = 10
    ) -> list[VectorStoreSearchResult]:
        """Run an approximate-nearest-neighbor search for a query vector."""

    def similarity_search_by_text(
        self, text: str, text_embedder: TextEmbedder, k: int = 10
    ) -> list[VectorStoreSearchResult]:
        """Embed the text and delegate to the vector-based search."""
        embedding = text_embedder(text)
        if not embedding:
            # The embedder produced nothing usable; no search is possible.
            return []
        return self.similarity_search_by_vector(query_embedding=embedding, k=k)

    @abstractmethod
    def search_by_id(self, id: str) -> VectorStoreDocument:
        """Fetch a single document by its id."""
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""Parameterization settings for the default configuration."""
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
7
|
+
|
|
8
|
+
from graphrag_vectors.index_schema import IndexSchema
|
|
9
|
+
from graphrag_vectors.vector_store_type import VectorStoreType
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class VectorStoreConfig(BaseModel):
    """The default configuration section for Vector Store.

    Connection-related fields are backend-specific; each field's description
    names the store type(s) that consume it.
    """

    model_config = ConfigDict(extra="allow")
    """Allow extra fields to support custom vector implementations."""

    type: str = Field(
        description="The vector store type to use.",
        default=VectorStoreType.LanceDB,
    )

    db_uri: str | None = Field(
        description="The database URI to use (only used by lancedb for built-in stores).",
        default=None,
    )

    url: str | None = Field(
        description="The database URL when type == azure_ai_search or cosmosdb.",
        default=None,
    )

    api_key: str | None = Field(
        description="The database API key when type == azure_ai_search.",
        default=None,
    )

    audience: str | None = Field(
        description="The database audience when type == azure_ai_search.",
        default=None,
    )

    connection_string: str | None = Field(
        description="The connection string when type == cosmosdb.",
        default=None,
    )

    database_name: str | None = Field(
        description="The database name to use when type == cosmosdb.",
        default=None,
    )

    # Declared via Field like every other attribute; default_factory makes the
    # mutable default explicit instead of relying on a bare `{}` class default.
    index_schema: dict[str, IndexSchema] = Field(
        description="Per-index schema settings, keyed by index name.",
        default_factory=dict,
    )
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""Factory functions for creating a vector store."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
from graphrag_common.factory import Factory, ServiceScope
|
|
11
|
+
|
|
12
|
+
from graphrag_vectors.vector_store import VectorStore
|
|
13
|
+
from graphrag_vectors.vector_store_type import VectorStoreType
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from collections.abc import Callable
|
|
17
|
+
|
|
18
|
+
from graphrag_vectors.index_schema import IndexSchema
|
|
19
|
+
from graphrag_vectors.vector_store_config import VectorStoreConfig
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class VectorStoreFactory(Factory[VectorStore]):
    """A factory for vector stores.

    Includes a method for users to register a custom vector store implementation.

    Configuration arguments are passed to each vector store implementation as kwargs
    for individual enforcement of required/optional arguments.

    No members are added here; the generic ``Factory[VectorStore]`` base
    supplies the registration and creation machinery.
    """
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Module-level singleton shared by register_vector_store() and create_vector_store().
vector_store_factory = VectorStoreFactory()


def register_vector_store(
    vector_store_type: str,
    vector_store_initializer: Callable[..., VectorStore],
    scope: ServiceScope = "transient",
) -> None:
    """Register a custom vector store implementation.

    Args
    ----
    - vector_store_type: str
        The vector store id to register.
    - vector_store_initializer: Callable[..., VectorStore]
        The vector store initializer to register.
    - scope: ServiceScope
        The service scope for the vector store (default: "transient").
    """
    # Thin wrapper over the shared factory so callers never touch it directly.
    vector_store_factory.register(vector_store_type, vector_store_initializer, scope)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def create_vector_store(
    config: VectorStoreConfig, index_schema: IndexSchema
) -> VectorStore:
    """Create a vector store implementation based on the given type and configuration.

    Args
    ----
    - config: VectorStoreConfig
        The base vector store configuration.
    - index_schema: IndexSchema
        The index schema configuration for the vector store instance - i.e., for the specific table we are reading/writing.

    Returns
    -------
    VectorStore
        The created vector store implementation.
    """
    strategy = config.type

    # Built-in implementations are imported lazily and registered on first use,
    # so a backend's dependencies load only when that backend is requested.
    if strategy not in vector_store_factory:
        if strategy == VectorStoreType.LanceDB:
            from graphrag_vectors.lancedb import LanceDBVectorStore

            register_vector_store(VectorStoreType.LanceDB, LanceDBVectorStore)
        elif strategy == VectorStoreType.AzureAISearch:
            from graphrag_vectors.azure_ai_search import AzureAISearchVectorStore

            register_vector_store(
                VectorStoreType.AzureAISearch, AzureAISearchVectorStore
            )
        elif strategy == VectorStoreType.CosmosDB:
            from graphrag_vectors.cosmosdb import CosmosDBVectorStore

            register_vector_store(VectorStoreType.CosmosDB, CosmosDBVectorStore)
        else:
            msg = f"Vector store type '{strategy}' is not registered in the VectorStoreFactory. Registered types: {', '.join(vector_store_factory.keys())}."
            raise ValueError(msg)

    # Collapse the base config and the per-index schema into one kwargs dict;
    # index schema values win on key collisions (same as {**a, **b}).
    init_args = config.model_dump() | index_schema.model_dump()
    return vector_store_factory.create(strategy, init_args=init_args)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Copyright (c) 2024 Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License
|
|
3
|
+
|
|
4
|
+
"""Vector store type enum."""
|
|
5
|
+
|
|
6
|
+
from enum import StrEnum
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class VectorStoreType(StrEnum):
    """The supported vector store types.

    Values are the string identifiers used in configuration files and for
    factory registration.
    """

    # Embedded/local store backed by LanceDB.
    LanceDB = "lancedb"
    # Hosted Azure AI Search service.
    AzureAISearch = "azure_ai_search"
    # Azure Cosmos DB backend.
    CosmosDB = "cosmosdb"
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "graphrag-vectors"
|
|
3
|
+
version = "3.0.0"
|
|
4
|
+
description = "GraphRAG vector store package."
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Alonso Guevara Fernández", email = "alonsog@microsoft.com"},
|
|
7
|
+
{name = "Andrés Morales Esquivel", email = "andresmor@microsoft.com"},
|
|
8
|
+
{name = "Chris Trevino", email = "chtrevin@microsoft.com"},
|
|
9
|
+
{name = "David Tittsworth", email = "datittsw@microsoft.com"},
|
|
10
|
+
{name = "Dayenne de Souza", email = "ddesouza@microsoft.com"},
|
|
11
|
+
{name = "Derek Worthen", email = "deworthe@microsoft.com"},
|
|
12
|
+
{name = "Gaudy Blanco Meneses", email = "gaudyb@microsoft.com"},
|
|
13
|
+
{name = "Ha Trinh", email = "trinhha@microsoft.com"},
|
|
14
|
+
{name = "Jonathan Larson", email = "jolarso@microsoft.com"},
|
|
15
|
+
{name = "Josh Bradley", email = "joshbradley@microsoft.com"},
|
|
16
|
+
{name = "Kate Lytvynets", email = "kalytv@microsoft.com"},
|
|
17
|
+
{name = "Kenny Zhang", email = "zhangken@microsoft.com"},
|
|
18
|
+
{name = "Mónica Carvajal"},
|
|
19
|
+
{name = "Nathan Evans", email = "naevans@microsoft.com"},
|
|
20
|
+
{name = "Rodrigo Racanicci", email = "rracanicci@microsoft.com"},
|
|
21
|
+
{name = "Sarah Smith", email = "smithsarah@microsoft.com"},
|
|
22
|
+
]
|
|
23
|
+
license = {text = "MIT"}
|
|
24
|
+
readme = "README.md"
|
|
25
|
+
requires-python = ">=3.11,<3.14"
|
|
26
|
+
classifiers = [
|
|
27
|
+
"Programming Language :: Python :: 3",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Programming Language :: Python :: 3.13",
|
|
31
|
+
]
|
|
32
|
+
dependencies = [
|
|
33
|
+
"azure-core~=1.32",
|
|
34
|
+
"azure-cosmos~=4.9",
|
|
35
|
+
"azure-identity~=1.19",
|
|
36
|
+
"azure-search-documents~=11.6",
|
|
37
|
+
"graphrag-common==3.0.0",
|
|
38
|
+
"lancedb~=0.24.1",
|
|
39
|
+
"numpy~=2.1",
|
|
40
|
+
"pyarrow~=22.0",
|
|
41
|
+
"pydantic~=2.10",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[project.urls]
|
|
45
|
+
Source = "https://github.com/microsoft/graphrag"
|
|
46
|
+
|
|
47
|
+
[build-system]
|
|
48
|
+
requires = ["hatchling>=1.27.0,<2.0.0"]
|
|
49
|
+
build-backend = "hatchling.build"
|