graphrag-vectors 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,65 @@
1
+ # Python Artifacts
2
+ python/*/lib/
3
+ dist/
4
+ build/
5
+ *.egg-info/
6
+
7
+ # Test Output
8
+ .coverage
9
+ coverage/
10
+ licenses.txt
11
+ examples_notebooks/*/data
12
+ tests/fixtures/cache
13
+ tests/fixtures/*/cache
14
+ tests/fixtures/*/output
15
+ output/lancedb
16
+
17
+
18
+ # Random
19
+ .DS_Store
20
+ *.log*
21
+ .venv
22
+ venv/
23
+ .conda
24
+ .tmp
25
+ packages/graphrag-llm/notebooks/metrics
26
+ packages/graphrag-llm/notebooks/cache
27
+
28
+ .env
29
+ build.zip
30
+
31
+ .turbo
32
+
33
+ __pycache__
34
+
35
+ .pipeline
36
+
37
+ # Azurite
38
+ temp_azurite/
39
+ __azurite*.json
40
+ __blobstorage*.json
41
+ __blobstorage__/
42
+
43
+ # Getting started example
44
+ ragtest/
45
+ .ragtest/
46
+ .pipelines
47
+ .pipeline
48
+
49
+
50
+ # mkdocs
51
+ site/
52
+
53
+ # Docs migration
54
+ docsite/
55
+ .yarn/
56
+ .pnp*
57
+
58
+ # PyCharm
59
+ .idea/
60
+
61
+ # Jupyter notebook
62
+ .ipynb_checkpoints/
63
+
64
+ # Root build assets
65
+ packages/*/LICENSE
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.4
2
+ Name: graphrag-vectors
3
+ Version: 3.0.0
4
+ Summary: GraphRAG vector store package.
5
+ Project-URL: Source, https://github.com/microsoft/graphrag
6
+ Author: Mónica Carvajal
7
+ Author-email: Alonso Guevara Fernández <alonsog@microsoft.com>, Andrés Morales Esquivel <andresmor@microsoft.com>, Chris Trevino <chtrevin@microsoft.com>, David Tittsworth <datittsw@microsoft.com>, Dayenne de Souza <ddesouza@microsoft.com>, Derek Worthen <deworthe@microsoft.com>, Gaudy Blanco Meneses <gaudyb@microsoft.com>, Ha Trinh <trinhha@microsoft.com>, Jonathan Larson <jolarso@microsoft.com>, Josh Bradley <joshbradley@microsoft.com>, Kate Lytvynets <kalytv@microsoft.com>, Kenny Zhang <zhangken@microsoft.com>, Nathan Evans <naevans@microsoft.com>, Rodrigo Racanicci <rracanicci@microsoft.com>, Sarah Smith <smithsarah@microsoft.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Python: <3.14,>=3.11
15
+ Requires-Dist: azure-core~=1.32
16
+ Requires-Dist: azure-cosmos~=4.9
17
+ Requires-Dist: azure-identity~=1.19
18
+ Requires-Dist: azure-search-documents~=11.6
19
+ Requires-Dist: graphrag-common==3.0.0
20
+ Requires-Dist: lancedb~=0.24.1
21
+ Requires-Dist: numpy~=2.1
22
+ Requires-Dist: pyarrow~=22.0
23
+ Requires-Dist: pydantic~=2.10
24
+ Description-Content-Type: text/markdown
25
+
26
+ # GraphRAG Vectors
27
+
28
+ This package provides vector store implementations for GraphRAG with support for multiple backends including LanceDB, Azure AI Search, and Azure Cosmos DB. It offers both a convenient configuration-driven API and direct factory access for creating and managing vector stores with flexible index schema definitions.
29
+
30
+ ## Basic usage with the utility function (recommended)
31
+
32
+ This demonstrates the recommended approach to create a vector store using the create_vector_store convenience function with configuration objects that specify the store type and index schema. The example shows setting up a LanceDB vector store with a defined index configuration, then connecting to it and creating the index for vector operations.
33
+
34
+ ```python
35
+ from graphrag_vectors import (
36
+ create_vector_store,
37
+ VectorStoreConfig,
+ VectorStoreType,
38
+ IndexSchema,
39
+ )
40
+
41
+ # Create a vector store using the convenience function
42
+ store_config = VectorStoreConfig(
43
+ type="lancedb",
44
+ db_uri="lance"
45
+ )
46
+
47
+ schema_config = IndexSchema(
48
+ index_name="my_index",
49
+ vector_size=1536,
50
+ )
51
+
52
+ vector_store = create_vector_store(
53
+ config=store_config,
54
+ index_schema=schema_config,
55
+ )
56
+
57
+ vector_store.connect()
58
+ vector_store.create_index()
59
+ ```
60
+
61
+ ## Basic usage implementing the factory directly
62
+
63
+ This example shows a different approach to create vector stores by directly using the vector_store_factory with enum types and dictionary-based initialization arguments. This method provides more direct control over the factory creation process while bypassing the convenience function layer.
64
+
65
+ ```python
66
+ from graphrag_vectors import (
67
+ VectorStoreFactory,
68
+ vector_store_factory,
69
+ VectorStoreType,
70
+ IndexSchema,
71
+ )
72
+
73
+ # Create a vector store using the factory
74
+ schema_config = IndexSchema(
75
+ index_name="my_index",
76
+ vector_size=1536,
77
+ )
78
+
79
+ vector_store = vector_store_factory.create(
80
+ VectorStoreType.LanceDB,
81
+ {
82
+ "index_schema": schema_config,
83
+ "db_uri": "./lancedb"
84
+ }
85
+ )
86
+
87
+ vector_store.connect()
88
+ vector_store.create_index()
89
+ ```
90
+
91
+ ## Supported Vector Stores
92
+
93
+ - **LanceDB**: Local vector database
94
+ - **Azure AI Search**: Azure's managed search service with vector capabilities
95
+ - **Azure Cosmos DB**: Azure's NoSQL database with vector search support
96
+
97
+ ## Custom Vector Store
98
+
99
+ You can register custom vector store implementations:
100
+
101
+ ```python
102
+ from graphrag_vectors import VectorStore, VectorStoreConfig, register_vector_store, create_vector_store
103
+
104
+ class MyCustomVectorStore(VectorStore):
105
+ def __init__(self, my_param):
106
+ self.my_param = my_param
107
+
108
+ def connect(self):
109
+ # Implementation
110
+ pass
111
+
112
+ def create_index(self):
113
+ # Implementation
114
+ pass
115
+
116
+ # ... implement other required methods
117
+
118
+ # Register your custom implementation
119
+ register_vector_store("my_custom_store", MyCustomVectorStore)
120
+
121
+ # Use your custom vector store
122
+ config = VectorStoreConfig(
123
+ type="my_custom_store",
124
+ my_param="something"
125
+ )
126
+ custom_store = create_vector_store(
127
+ config=config,
128
+ index_schema=schema_config,
129
+ )
130
+ ```
131
+
132
+ ## Configuration
133
+
134
+ Vector stores are configured using:
135
+ - `VectorStoreConfig`: baseline parameters for the store
136
+ - `IndexSchema`: Schema configuration for the specific index to create/connect to (index name, field names, vector size)
@@ -0,0 +1,111 @@
1
+ # GraphRAG Vectors
2
+
3
+ This package provides vector store implementations for GraphRAG with support for multiple backends including LanceDB, Azure AI Search, and Azure Cosmos DB. It offers both a convenient configuration-driven API and direct factory access for creating and managing vector stores with flexible index schema definitions.
4
+
5
+ ## Basic usage with the utility function (recommended)
6
+
7
+ This demonstrates the recommended approach to create a vector store using the create_vector_store convenience function with configuration objects that specify the store type and index schema. The example shows setting up a LanceDB vector store with a defined index configuration, then connecting to it and creating the index for vector operations.
8
+
9
+ ```python
10
+ from graphrag_vectors import (
11
+ create_vector_store,
12
+ VectorStoreConfig,
+ VectorStoreType,
13
+ IndexSchema,
14
+ )
15
+
16
+ # Create a vector store using the convenience function
17
+ store_config = VectorStoreConfig(
18
+ type="lancedb",
19
+ db_uri="lance"
20
+ )
21
+
22
+ schema_config = IndexSchema(
23
+ index_name="my_index",
24
+ vector_size=1536,
25
+ )
26
+
27
+ vector_store = create_vector_store(
28
+ config=store_config,
29
+ index_schema=schema_config,
30
+ )
31
+
32
+ vector_store.connect()
33
+ vector_store.create_index()
34
+ ```
35
+
36
+ ## Basic usage implementing the factory directly
37
+
38
+ This example shows a different approach to create vector stores by directly using the vector_store_factory with enum types and dictionary-based initialization arguments. This method provides more direct control over the factory creation process while bypassing the convenience function layer.
39
+
40
+ ```python
41
+ from graphrag_vectors import (
42
+ VectorStoreFactory,
43
+ vector_store_factory,
44
+ VectorStoreType,
45
+ IndexSchema,
46
+ )
47
+
48
+ # Create a vector store using the factory
49
+ schema_config = IndexSchema(
50
+ index_name="my_index",
51
+ vector_size=1536,
52
+ )
53
+
54
+ vector_store = vector_store_factory.create(
55
+ VectorStoreType.LanceDB,
56
+ {
57
+ "index_schema": schema_config,
58
+ "db_uri": "./lancedb"
59
+ }
60
+ )
61
+
62
+ vector_store.connect()
63
+ vector_store.create_index()
64
+ ```
65
+
66
+ ## Supported Vector Stores
67
+
68
+ - **LanceDB**: Local vector database
69
+ - **Azure AI Search**: Azure's managed search service with vector capabilities
70
+ - **Azure Cosmos DB**: Azure's NoSQL database with vector search support
71
+
72
+ ## Custom Vector Store
73
+
74
+ You can register custom vector store implementations:
75
+
76
+ ```python
77
+ from graphrag_vectors import VectorStore, VectorStoreConfig, register_vector_store, create_vector_store
78
+
79
+ class MyCustomVectorStore(VectorStore):
80
+ def __init__(self, my_param):
81
+ self.my_param = my_param
82
+
83
+ def connect(self):
84
+ # Implementation
85
+ pass
86
+
87
+ def create_index(self):
88
+ # Implementation
89
+ pass
90
+
91
+ # ... implement other required methods
92
+
93
+ # Register your custom implementation
94
+ register_vector_store("my_custom_store", MyCustomVectorStore)
95
+
96
+ # Use your custom vector store
97
+ config = VectorStoreConfig(
98
+ type="my_custom_store",
99
+ my_param="something"
100
+ )
101
+ custom_store = create_vector_store(
102
+ config=config,
103
+ index_schema=schema_config,
104
+ )
105
+ ```
106
+
107
+ ## Configuration
108
+
109
+ Vector stores are configured using:
110
+ - `VectorStoreConfig`: baseline parameters for the store
111
+ - `IndexSchema`: Schema configuration for the specific index to create/connect to (index name, field names, vector size)
@@ -0,0 +1,34 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """GraphRAG vector store implementations."""
5
+
6
+ from graphrag_vectors.index_schema import IndexSchema
7
+ from graphrag_vectors.types import TextEmbedder
8
+ from graphrag_vectors.vector_store import (
9
+ VectorStore,
10
+ VectorStoreDocument,
11
+ VectorStoreSearchResult,
12
+ )
13
+ from graphrag_vectors.vector_store_config import VectorStoreConfig
14
+ from graphrag_vectors.vector_store_factory import (
15
+ VectorStoreFactory,
16
+ create_vector_store,
17
+ register_vector_store,
18
+ vector_store_factory,
19
+ )
20
+ from graphrag_vectors.vector_store_type import VectorStoreType
21
+
22
+ __all__ = [
23
+ "IndexSchema",
24
+ "TextEmbedder",
25
+ "VectorStore",
26
+ "VectorStoreConfig",
27
+ "VectorStoreDocument",
28
+ "VectorStoreFactory",
29
+ "VectorStoreSearchResult",
30
+ "VectorStoreType",
31
+ "create_vector_store",
32
+ "register_vector_store",
33
+ "vector_store_factory",
34
+ ]
@@ -0,0 +1,173 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A package containing the Azure AI Search vector store implementation."""
5
+
6
+ from typing import Any
7
+
8
+ from azure.core.credentials import AzureKeyCredential
9
+ from azure.identity import DefaultAzureCredential
10
+ from azure.search.documents import SearchClient
11
+ from azure.search.documents.indexes import SearchIndexClient
12
+ from azure.search.documents.indexes.models import (
13
+ HnswAlgorithmConfiguration,
14
+ HnswParameters,
15
+ SearchField,
16
+ SearchFieldDataType,
17
+ SearchIndex,
18
+ SimpleField,
19
+ VectorSearch,
20
+ VectorSearchAlgorithmMetric,
21
+ VectorSearchProfile,
22
+ )
23
+ from azure.search.documents.models import VectorizedQuery
24
+
25
+ from graphrag_vectors.vector_store import (
26
+ VectorStore,
27
+ VectorStoreDocument,
28
+ VectorStoreSearchResult,
29
+ )
30
+
31
+
32
+ class AzureAISearchVectorStore(VectorStore):
33
+ """Azure AI Search vector storage implementation."""
34
+
35
+ index_client: SearchIndexClient
36
+
37
+ def __init__(
38
+ self,
39
+ url: str,
40
+ api_key: str | None = None,
41
+ audience: str | None = None,
42
+ vector_search_profile_name: str = "vectorSearchProfile",
43
+ **kwargs: Any,
44
+ ):
45
+ super().__init__(**kwargs)
46
+ if not url:
47
+ msg = "url must be provided for Azure AI Search."
48
+ raise ValueError(msg)
49
+ self.url = url
50
+ self.api_key = api_key
51
+ self.audience = audience
52
+ self.vector_search_profile_name = vector_search_profile_name
53
+
54
+ def connect(self) -> Any:
55
+ """Connect to AI search vector storage."""
56
+ audience_arg = (
57
+ {"audience": self.audience} if self.audience and not self.api_key else {}
58
+ )
59
+ self.db_connection = SearchClient(
60
+ endpoint=self.url,
61
+ index_name=self.index_name,
62
+ credential=(
63
+ AzureKeyCredential(self.api_key)
64
+ if self.api_key
65
+ else DefaultAzureCredential()
66
+ ),
67
+ **audience_arg,
68
+ )
69
+ self.index_client = SearchIndexClient(
70
+ endpoint=self.url,
71
+ credential=(
72
+ AzureKeyCredential(self.api_key)
73
+ if self.api_key
74
+ else DefaultAzureCredential()
75
+ ),
76
+ **audience_arg,
77
+ )
78
+
79
+ def create_index(self) -> None:
80
+ """Load documents into an Azure AI Search index."""
81
+ if (
82
+ self.index_name is not None
83
+ and self.index_name in self.index_client.list_index_names()
84
+ ):
85
+ self.index_client.delete_index(self.index_name)
86
+
87
+ # Configure vector search profile
88
+ vector_search = VectorSearch(
89
+ algorithms=[
90
+ HnswAlgorithmConfiguration(
91
+ name="HnswAlg",
92
+ parameters=HnswParameters(
93
+ metric=VectorSearchAlgorithmMetric.COSINE
94
+ ),
95
+ )
96
+ ],
97
+ profiles=[
98
+ VectorSearchProfile(
99
+ name=self.vector_search_profile_name,
100
+ algorithm_configuration_name="HnswAlg",
101
+ )
102
+ ],
103
+ )
104
+ # Configure the index
105
+ index = SearchIndex(
106
+ name=self.index_name,
107
+ fields=[
108
+ SimpleField(
109
+ name=self.id_field,
110
+ type=SearchFieldDataType.String,
111
+ key=True,
112
+ ),
113
+ SearchField(
114
+ name=self.vector_field,
115
+ type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
116
+ searchable=True,
117
+ hidden=False, # DRIFT needs to return the vector for client-side similarity
118
+ vector_search_dimensions=self.vector_size,
119
+ vector_search_profile_name=self.vector_search_profile_name,
120
+ ),
121
+ ],
122
+ vector_search=vector_search,
123
+ )
124
+ self.index_client.create_or_update_index(
125
+ index,
126
+ )
127
+
128
+ def load_documents(self, documents: list[VectorStoreDocument]) -> None:
129
+ """Load documents into an Azure AI Search index."""
130
+ batch = [
131
+ {
132
+ self.id_field: doc.id,
133
+ self.vector_field: doc.vector,
134
+ }
135
+ for doc in documents
136
+ if doc.vector is not None
137
+ ]
138
+
139
+ if len(batch) > 0:
140
+ self.db_connection.upload_documents(batch)
141
+
142
+ def similarity_search_by_vector(
143
+ self, query_embedding: list[float], k: int = 10
144
+ ) -> list[VectorStoreSearchResult]:
145
+ """Perform a vector-based similarity search."""
146
+ vectorized_query = VectorizedQuery(
147
+ vector=query_embedding, k_nearest_neighbors=k, fields=self.vector_field
148
+ )
149
+
150
+ response = self.db_connection.search(
151
+ vector_queries=[vectorized_query],
152
+ )
153
+
154
+ return [
155
+ VectorStoreSearchResult(
156
+ document=VectorStoreDocument(
157
+ id=doc.get(self.id_field, ""),
158
+ vector=doc.get(self.vector_field, []),
159
+ ),
160
+ # Cosine similarity between 0.333 and 1.000
161
+ # https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking#scores-in-a-hybrid-search-results
162
+ score=doc["@search.score"],
163
+ )
164
+ for doc in response
165
+ ]
166
+
167
+ def search_by_id(self, id: str) -> VectorStoreDocument:
168
+ """Search for a document by id."""
169
+ response = self.db_connection.get_document(id)
170
+ return VectorStoreDocument(
171
+ id=response.get(self.id_field, ""),
172
+ vector=response.get(self.vector_field, []),
173
+ )
@@ -0,0 +1,244 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """A package containing the CosmosDB vector store implementation."""
5
+
6
+ from typing import Any
7
+
8
+ from azure.cosmos import ContainerProxy, CosmosClient, DatabaseProxy
9
+ from azure.cosmos.exceptions import CosmosHttpResponseError
10
+ from azure.cosmos.partition_key import PartitionKey
11
+ from azure.identity import DefaultAzureCredential
12
+
13
+ from graphrag_vectors.vector_store import (
14
+ VectorStore,
15
+ VectorStoreDocument,
16
+ VectorStoreSearchResult,
17
+ )
18
+
19
+
20
+ class CosmosDBVectorStore(VectorStore):
21
+ """Azure CosmosDB vector storage implementation."""
22
+
23
+ _cosmos_client: CosmosClient
24
+ _database_client: DatabaseProxy
25
+ _container_client: ContainerProxy
26
+
27
+ def __init__(
28
+ self,
29
+ database_name: str,
30
+ connection_string: str | None = None,
31
+ url: str | None = None,
32
+ **kwargs,
33
+ ):
34
+ super().__init__(**kwargs)
35
+ if self.id_field != "id":
36
+ msg = "CosmosDB requires the id_field to be 'id'."
37
+ raise ValueError(msg)
38
+ if not connection_string and not url:
39
+ msg = "Either connection_string or url must be provided for CosmosDB."
40
+ raise ValueError(msg)
41
+
42
+ self.database_name = database_name
43
+ self.connection_string = connection_string
44
+ self.url = url
45
+
46
+ def connect(self) -> Any:
47
+ """Connect to CosmosDB vector storage."""
48
+ if self.connection_string:
49
+ self._cosmos_client = CosmosClient.from_connection_string(
50
+ self.connection_string
51
+ )
52
+ else:
53
+ self._cosmos_client = CosmosClient(
54
+ url=self.url, credential=DefaultAzureCredential()
55
+ )
56
+
57
+ self._create_database()
58
+ self._create_container()
59
+
60
+ def _create_database(self) -> None:
61
+ """Create the database if it doesn't exist."""
62
+ self._cosmos_client.create_database_if_not_exists(id=self.database_name)
63
+ self._database_client = self._cosmos_client.get_database_client(
64
+ self.database_name
65
+ )
66
+
67
+ def _delete_database(self) -> None:
68
+ """Delete the database if it exists."""
69
+ if self._database_exists():
70
+ self._cosmos_client.delete_database(self.database_name)
71
+
72
+ def _database_exists(self) -> bool:
73
+ """Check if the database exists."""
74
+ existing_database_names = [
75
+ database["id"] for database in self._cosmos_client.list_databases()
76
+ ]
77
+ return self.database_name in existing_database_names
78
+
79
+ def _create_container(self) -> None:
80
+ """Create the container if it doesn't exist."""
81
+ partition_key = PartitionKey(path=f"/{self.id_field}", kind="Hash")
82
+
83
+ # Define the container vector policy
84
+ vector_embedding_policy = {
85
+ "vectorEmbeddings": [
86
+ {
87
+ "path": f"/{self.vector_field}",
88
+ "dataType": "float32",
89
+ "distanceFunction": "cosine",
90
+ "dimensions": self.vector_size,
91
+ }
92
+ ]
93
+ }
94
+
95
+ # Define the vector indexing policy
96
+ indexing_policy = {
97
+ "indexingMode": "consistent",
98
+ "automatic": True,
99
+ "includedPaths": [{"path": "/*"}],
100
+ "excludedPaths": [
101
+ {"path": "/_etag/?"},
102
+ {"path": f"/{self.vector_field}/*"},
103
+ ],
104
+ }
105
+
106
+ # Currently, the CosmosDB emulator does not support the diskANN policy.
107
+ try:
108
+ # First try with the standard diskANN policy
109
+ indexing_policy["vectorIndexes"] = [
110
+ {"path": f"/{self.vector_field}", "type": "diskANN"}
111
+ ]
112
+
113
+ # Create the container and container client
114
+ self._database_client.create_container_if_not_exists(
115
+ id=self.index_name,
116
+ partition_key=partition_key,
117
+ indexing_policy=indexing_policy,
118
+ vector_embedding_policy=vector_embedding_policy,
119
+ )
120
+ except CosmosHttpResponseError:
121
+ # If diskANN fails (likely in emulator), retry without vector indexes
122
+ indexing_policy.pop("vectorIndexes", None)
123
+
124
+ # Create the container with compatible indexing policy
125
+ self._database_client.create_container_if_not_exists(
126
+ id=self.index_name,
127
+ partition_key=partition_key,
128
+ indexing_policy=indexing_policy,
129
+ vector_embedding_policy=vector_embedding_policy,
130
+ )
131
+
132
+ self._container_client = self._database_client.get_container_client(
133
+ self.index_name
134
+ )
135
+
136
+ def _delete_container(self) -> None:
137
+ """Delete the vector store container in the database if it exists."""
138
+ if self._container_exists():
139
+ self._database_client.delete_container(self.index_name)
140
+
141
+ def _container_exists(self) -> bool:
142
+ """Check if the container name exists in the database."""
143
+ existing_container_names = [
144
+ container["id"] for container in self._database_client.list_containers()
145
+ ]
146
+ return self.index_name in existing_container_names
147
+
148
+ def create_index(self) -> None:
149
+ """Load documents into CosmosDB."""
150
+ # Create a CosmosDB container on overwrite
151
+ self._delete_container()
152
+ self._create_container()
153
+
154
+ if self._container_client is None:
155
+ msg = "Container client is not initialized."
156
+ raise ValueError(msg)
157
+
158
+ def load_documents(self, documents: list[VectorStoreDocument]) -> None:
159
+ """Load documents into CosmosDB."""
160
+ # Upload documents to CosmosDB
161
+ for doc in documents:
162
+ if doc.vector is not None:
163
+ doc_json = {
164
+ self.id_field: doc.id,
165
+ self.vector_field: doc.vector,
166
+ }
167
+ self._container_client.upsert_item(doc_json)
168
+
169
+ def similarity_search_by_vector(
170
+ self, query_embedding: list[float], k: int = 10
171
+ ) -> list[VectorStoreSearchResult]:
172
+ """Perform a vector-based similarity search."""
173
+ if self._container_client is None:
174
+ msg = "Container client is not initialized."
175
+ raise ValueError(msg)
176
+
177
+ try:
178
+ query = f"SELECT TOP {k} c.{self.id_field}, c.{self.vector_field}, VectorDistance(c.{self.vector_field}, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.{self.vector_field}, @embedding)" # noqa: S608
179
+ query_params = [{"name": "@embedding", "value": query_embedding}]
180
+ items = list(
181
+ self._container_client.query_items(
182
+ query=query,
183
+ parameters=query_params,
184
+ enable_cross_partition_query=True,
185
+ )
186
+ )
187
+ except (CosmosHttpResponseError, ValueError):
188
+ # Currently, the CosmosDB emulator does not support the VectorDistance function.
189
+ # For emulator or test environments - fetch all items and calculate distance locally
190
+ query = f"SELECT c.{self.id_field}, c.{self.vector_field} FROM c" # noqa: S608
191
+ items = list(
192
+ self._container_client.query_items(
193
+ query=query,
194
+ enable_cross_partition_query=True,
195
+ )
196
+ )
197
+
198
+ # Calculate cosine similarity locally (1 - cosine distance)
199
+ from numpy import dot
200
+ from numpy.linalg import norm
201
+
202
+ def cosine_similarity(a, b):
203
+ if norm(a) * norm(b) == 0:
204
+ return 0.0
205
+ return dot(a, b) / (norm(a) * norm(b))
206
+
207
+ # Calculate scores for all items
208
+ for item in items:
209
+ item_vector = item.get(self.vector_field, [])
210
+ similarity = cosine_similarity(query_embedding, item_vector)
211
+ item["SimilarityScore"] = similarity
212
+
213
+ # Sort by similarity score (higher is better) and take top k
214
+ items = sorted(
215
+ items, key=lambda x: x.get("SimilarityScore", 0.0), reverse=True
216
+ )[:k]
217
+
218
+ return [
219
+ VectorStoreSearchResult(
220
+ document=VectorStoreDocument(
221
+ id=item.get(self.id_field, ""),
222
+ vector=item.get(self.vector_field, []),
223
+ ),
224
+ score=item.get("SimilarityScore", 0.0),
225
+ )
226
+ for item in items
227
+ ]
228
+
229
+ def search_by_id(self, id: str) -> VectorStoreDocument:
230
+ """Search for a document by id."""
231
+ if self._container_client is None:
232
+ msg = "Container client is not initialized."
233
+ raise ValueError(msg)
234
+
235
+ item = self._container_client.read_item(item=id, partition_key=id)
236
+ return VectorStoreDocument(
237
+ id=item.get(self.id_field, ""),
238
+ vector=item.get(self.vector_field, []),
239
+ )
240
+
241
+ def clear(self) -> None:
242
+ """Clear the vector store."""
243
+ self._delete_container()
244
+ self._delete_database()
@@ -0,0 +1,56 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Parameterization settings for the default configuration."""
5
+
6
+ import re
7
+
8
+ from pydantic import BaseModel, Field, model_validator
9
+
10
DEFAULT_VECTOR_SIZE: int = 3072

VALID_IDENTIFIER_REGEX = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")


def is_valid_field_name(field: str) -> bool:
    """Return True if ``field`` is a plain identifier.

    A valid name starts with an ASCII letter or underscore and continues with
    ASCII letters, digits or underscores only.

    ``fullmatch`` is used rather than ``match`` because ``$`` also matches just
    before a trailing newline, which would let a name such as ``"id\\n"`` pass.
    """
    return VALID_IDENTIFIER_REGEX.fullmatch(field) is not None
18
+
19
+
20
class IndexSchema(BaseModel):
    """Schema settings for one vector index: its name, field names and vector size."""

    index_name: str = Field(
        default="vector_index",
        description="The index name to use.",
    )

    id_field: str = Field(
        default="id",
        description="The ID field to use.",
    )

    vector_field: str = Field(
        default="vector",
        description="The vector field to use.",
    )

    vector_size: int = Field(
        default=DEFAULT_VECTOR_SIZE,
        description="The vector size to use.",
    )

    def _validate_schema(self) -> None:
        """Reject an id or vector field name that is not a plain identifier.

        Raises:
            ValueError: For the first field name that fails the check.
        """
        for field in (self.id_field, self.vector_field):
            if is_valid_field_name(field):
                continue
            msg = f"Unsafe or invalid field name: {field}"
            raise ValueError(msg)

    @model_validator(mode="after")
    def _validate_model(self):
        """Run the schema checks once the model's fields are populated."""
        self._validate_schema()
        return self
@@ -0,0 +1,128 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """The LanceDB vector storage implementation package."""
5
+
6
+ from typing import Any
7
+
8
+ import lancedb
9
+ import numpy as np
10
+ import pyarrow as pa
11
+
12
+ from graphrag_vectors.vector_store import (
13
+ VectorStore,
14
+ VectorStoreDocument,
15
+ VectorStoreSearchResult,
16
+ )
17
+
18
+
19
class LanceDBVectorStore(VectorStore):
    """LanceDB vector storage implementation.

    Each index is a LanceDB table named ``index_name`` with two columns: the
    id field (string) and the vector field (fixed-size list of float32).
    """

    def __init__(self, db_uri: str = "lancedb", **kwargs: Any):
        """Store the database URI; no connection is opened here.

        Args:
            db_uri: URI passed to ``lancedb.connect``.
            **kwargs: Forwarded to ``VectorStore.__init__`` (index name, field
                names, vector size).
        """
        super().__init__(**kwargs)
        self.db_uri = db_uri

    def connect(self) -> Any:
        """Connect to the database and open the index table if it already exists."""
        self.db_connection = lancedb.connect(self.db_uri)

        if self.index_name and self.index_name in self.db_connection.table_names():
            self.document_collection = self.db_connection.open_table(self.index_name)

    def create_index(self) -> None:
        """Create (or overwrite) the table and build an IVF_FLAT index on the vector column."""
        # Seed the table with one placeholder row so the schema, including the
        # fixed vector width, exists before any real document is loaded.
        dummy_vector = np.zeros(self.vector_size, dtype=np.float32)
        flat_array = pa.array(dummy_vector, type=pa.float32())
        vector_column = pa.FixedSizeListArray.from_arrays(flat_array, self.vector_size)

        data = pa.table({
            self.id_field: pa.array(["__DUMMY__"], type=pa.string()),
            self.vector_field: vector_column,
        })

        self.document_collection = self.db_connection.create_table(
            self.index_name if self.index_name else "",
            data=data,
            mode="overwrite",
            schema=data.schema,
        )

        # Create the vector index now that the schema exists.
        self.document_collection.create_index(
            vector_column_name=self.vector_field, index_type="IVF_FLAT"
        )

    def load_documents(self, documents: list[VectorStoreDocument]) -> None:
        """Load documents into vector storage.

        The placeholder row written by ``create_index`` is removed first.
        Documents without a vector are skipped.
        """
        self.document_collection.delete(f"{self.id_field} = '__DUMMY__'")

        ids = []
        vectors = []

        for document in documents:
            # NOTE(review): vector_size is overwritten by every document that
            # has a non-empty vector, so a batch with mixed vector lengths is
            # not rejected here - confirm whether that is intended.
            self.vector_size = (
                len(document.vector) if document.vector else self.vector_size
            )
            if document.vector is not None and len(document.vector) == self.vector_size:
                ids.append(document.id)
                vectors.append(np.array(document.vector, dtype=np.float32))

        # Nothing to add: return explicitly rather than relying on the
        # truthiness of a pyarrow table.
        if not ids:
            return

        # Flatten the vectors and build the FixedSizeListArray manually.
        flat_vector = np.concatenate(vectors).astype(np.float32)
        flat_array = pa.array(flat_vector, type=pa.float32())
        vector_column = pa.FixedSizeListArray.from_arrays(flat_array, self.vector_size)

        # Build the PyArrow table (schema is inferred from the arrays).
        data = pa.table({
            self.id_field: pa.array(ids, type=pa.string()),
            self.vector_field: vector_column,
        })

        self.document_collection.add(data)

    def similarity_search_by_vector(
        self, query_embedding: list[float] | np.ndarray, k: int = 10
    ) -> list[VectorStoreSearchResult]:
        """Return up to ``k`` results for ``query_embedding``.

        Each score is computed as ``1 - abs(_distance)`` from the distance
        reported by LanceDB.
        """
        query_embedding = np.array(query_embedding, dtype=np.float32)

        docs = (
            self.document_collection
            .search(query=query_embedding, vector_column_name=self.vector_field)
            .limit(k)
            .to_list()
        )
        return [
            VectorStoreSearchResult(
                document=VectorStoreDocument(
                    id=doc[self.id_field],
                    vector=doc[self.vector_field],
                ),
                score=1 - abs(float(doc["_distance"])),
            )
            for doc in docs
        ]

    def search_by_id(self, id: str) -> VectorStoreDocument:
        """Search for a document by id.

        Returns the first matching row, or a document with ``vector=None``
        carrying the requested id when nothing matches.
        """
        # Double any single quote so the id stays inside the SQL string
        # literal of the filter expression.
        escaped_id = str(id).replace("'", "''")
        doc = (
            self.document_collection
            .search()
            .where(f"{self.id_field} == '{escaped_id}'", prefilter=True)
            .to_list()
        )
        if doc:
            return VectorStoreDocument(
                id=doc[0][self.id_field],
                vector=doc[0][self.vector_field],
            )
        return VectorStoreDocument(id=id, vector=None)
@@ -0,0 +1,8 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Common types for vector stores."""
5
+
6
+ from collections.abc import Callable
7
+
8
+ TextEmbedder = Callable[[str], list[float]]
@@ -0,0 +1,81 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Base classes for vector stores."""
5
+
6
+ from abc import ABC, abstractmethod
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ from graphrag_vectors.types import TextEmbedder
11
+
12
+
13
+ @dataclass
14
+ class VectorStoreDocument:
15
+ """A document that is stored in vector storage."""
16
+
17
+ id: str | int
18
+ """unique id for the document"""
19
+
20
+ vector: list[float] | None
21
+
22
+
23
@dataclass
class VectorStoreSearchResult:
    """One hit returned by a vector storage search."""

    document: VectorStoreDocument
    """The stored document that matched the query."""

    score: float
    """Similarity score between -1 and 1. Larger values mean a closer match."""
32
+
33
+
34
class VectorStore(ABC):
    """The base class for vector storage data-access classes.

    Subclasses implement connection handling, index creation, document
    loading and lookups; this base class stores the shared index settings
    and provides a text-based search built on top of the vector search.
    """

    def __init__(
        self,
        index_name: str = "vector_index",
        id_field: str = "id",
        vector_field: str = "vector",
        vector_size: int = 3072,
        **kwargs: Any,
    ):
        """Store the index settings shared by all vector store implementations.

        Args
        ----
            - index_name: str
                Name of the index/table this store works against.
            - id_field: str
                Name of the field holding the document id.
            - vector_field: str
                Name of the field holding the embedding.
            - vector_size: int
                Dimensionality of the stored embeddings.
            - **kwargs: Any
                Extra arguments are accepted and ignored here so subclasses
                can be constructed from a wider configuration dict.
        """
        self.index_name = index_name
        self.id_field = id_field
        self.vector_field = vector_field
        self.vector_size = vector_size

    @abstractmethod
    def connect(self) -> None:
        """Connect to vector storage."""

    @abstractmethod
    def create_index(self) -> None:
        """Create index."""

    @abstractmethod
    def load_documents(self, documents: list[VectorStoreDocument]) -> None:
        """Load documents into the vector-store."""

    @abstractmethod
    def similarity_search_by_vector(
        self, query_embedding: list[float], k: int = 10
    ) -> list[VectorStoreSearchResult]:
        """Perform ANN search by vector."""

    def similarity_search_by_text(
        self, text: str, text_embedder: TextEmbedder, k: int = 10
    ) -> list[VectorStoreSearchResult]:
        """Perform a text-based similarity search.

        Args
        ----
            - text: str
                The query text to embed.
            - text_embedder: TextEmbedder
                Callable that turns ``text`` into an embedding.
            - k: int
                Maximum number of results to request.

        Returns
        -------
            list[VectorStoreSearchResult]
                The results of ``similarity_search_by_vector`` for the
                embedded text, or an empty list when the embedder yields
                no embedding (None or an empty sequence).
        """
        query_embedding = text_embedder(text)
        # Check None/length explicitly instead of relying on truthiness:
        # a numpy array with more than one element raises ValueError when
        # evaluated as a boolean, and embedders commonly return arrays.
        if query_embedding is None or len(query_embedding) == 0:
            return []
        return self.similarity_search_by_vector(
            query_embedding=query_embedding, k=k
        )

    @abstractmethod
    def search_by_id(self, id: str) -> VectorStoreDocument:
        """Search for a document by id."""
@@ -0,0 +1,53 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Parameterization settings for the default configuration."""
5
+
6
+ from pydantic import BaseModel, ConfigDict, Field
7
+
8
+ from graphrag_vectors.index_schema import IndexSchema
9
+ from graphrag_vectors.vector_store_type import VectorStoreType
10
+
11
+
12
class VectorStoreConfig(BaseModel):
    """The default configuration section for Vector Store."""

    # Unknown keys are kept rather than rejected, so custom vector store
    # implementations can carry their own settings through this model.
    model_config = ConfigDict(extra="allow")

    # Which vector store implementation to use; defaults to LanceDB.
    type: str = Field(
        default=VectorStoreType.LanceDB,
        description="The vector store type to use.",
    )

    db_uri: str | None = Field(
        default=None,
        description="The database URI to use (only used by lancedb for built-in stores).",
    )

    url: str | None = Field(
        default=None,
        description="The database URL when type == azure_ai_search or cosmosdb.",
    )

    api_key: str | None = Field(
        default=None,
        description="The database API key when type == azure_ai_search.",
    )

    audience: str | None = Field(
        default=None,
        description="The database audience when type == azure_ai_search.",
    )

    connection_string: str | None = Field(
        default=None,
        description="The connection string when type == cosmosdb.",
    )

    database_name: str | None = Field(
        default=None,
        description="The database name to use when type == cosmosdb.",
    )

    # Named index schemas; empty by default.
    # NOTE(review): presumably keyed by index/embedding name - confirm with callers.
    index_schema: dict[str, IndexSchema] = {}
@@ -0,0 +1,99 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Factory functions for creating a vector store."""
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import TYPE_CHECKING
9
+
10
+ from graphrag_common.factory import Factory, ServiceScope
11
+
12
+ from graphrag_vectors.vector_store import VectorStore
13
+ from graphrag_vectors.vector_store_type import VectorStoreType
14
+
15
+ if TYPE_CHECKING:
16
+ from collections.abc import Callable
17
+
18
+ from graphrag_vectors.index_schema import IndexSchema
19
+ from graphrag_vectors.vector_store_config import VectorStoreConfig
20
+
21
+
22
class VectorStoreFactory(Factory[VectorStore]):
    """Factory specialised for ``VectorStore`` implementations.

    Custom vector store implementations can be registered with it by users.

    Configuration values reach each implementation as keyword arguments, so
    every implementation decides for itself which arguments are required
    and which are optional.
    """
30
+
31
+
32
# Module-level factory instance shared by register_vector_store() and
# create_vector_store() in this module.
vector_store_factory = VectorStoreFactory()
33
+
34
+
35
def register_vector_store(
    vector_store_type: str,
    vector_store_initializer: Callable[..., VectorStore],
    scope: ServiceScope = "transient",
) -> None:
    """Add a vector store implementation to the module-level factory.

    Args
    ----
        - vector_store_type: str
            Identifier under which the implementation is registered.
        - vector_store_initializer: Callable[..., VectorStore]
            Callable that builds the vector store.
        - scope: ServiceScope
            Service scope passed to the factory (default: "transient").
    """
    vector_store_factory.register(
        vector_store_type,
        vector_store_initializer,
        scope,
    )
52
+
53
+
54
def create_vector_store(
    config: VectorStoreConfig, index_schema: IndexSchema
) -> VectorStore:
    """Build the vector store selected by ``config.type``.

    Args
    ----
        - config: VectorStoreConfig
            The base vector store configuration.
        - index_schema: IndexSchema
            Schema settings for the specific index/table this store
            instance reads and writes.

    Returns
    -------
        VectorStore
            The vector store created by the factory.

    Raises
    ------
        ValueError
            If ``config.type`` is neither registered nor a built-in type.
    """
    strategy = config.type

    # Built-in implementations are imported and registered on first use,
    # so their modules are only loaded when that store type is requested.
    if strategy not in vector_store_factory:
        if strategy == VectorStoreType.LanceDB:
            from graphrag_vectors.lancedb import LanceDBVectorStore

            register_vector_store(VectorStoreType.LanceDB, LanceDBVectorStore)
        elif strategy == VectorStoreType.AzureAISearch:
            from graphrag_vectors.azure_ai_search import AzureAISearchVectorStore

            register_vector_store(
                VectorStoreType.AzureAISearch, AzureAISearchVectorStore
            )
        elif strategy == VectorStoreType.CosmosDB:
            from graphrag_vectors.cosmosdb import CosmosDBVectorStore

            register_vector_store(VectorStoreType.CosmosDB, CosmosDBVectorStore)
        else:
            msg = f"Vector store type '{strategy}' is not registered in the VectorStoreFactory. Registered types: {', '.join(vector_store_factory.keys())}."
            raise ValueError(msg)

    # Flatten both models into one kwargs dict; on a key clash the
    # index-specific value overrides the base configuration value.
    init_args = config.model_dump() | index_schema.model_dump()
    return vector_store_factory.create(strategy, init_args=init_args)
@@ -0,0 +1,14 @@
1
+ # Copyright (c) 2024 Microsoft Corporation.
2
+ # Licensed under the MIT License
3
+
4
+ """Vector store type enum."""
5
+
6
+ from enum import StrEnum
7
+
8
+
9
+ class VectorStoreType(StrEnum):
10
+ """The supported vector store types."""
11
+
12
+ LanceDB = "lancedb"
13
+ AzureAISearch = "azure_ai_search"
14
+ CosmosDB = "cosmosdb"
@@ -0,0 +1,49 @@
1
+ [project]
2
+ name = "graphrag-vectors"
3
+ version = "3.0.0"
4
+ description = "GraphRAG vector store package."
5
+ authors = [
6
+ {name = "Alonso Guevara Fernández", email = "alonsog@microsoft.com"},
7
+ {name = "Andrés Morales Esquivel", email = "andresmor@microsoft.com"},
8
+ {name = "Chris Trevino", email = "chtrevin@microsoft.com"},
9
+ {name = "David Tittsworth", email = "datittsw@microsoft.com"},
10
+ {name = "Dayenne de Souza", email = "ddesouza@microsoft.com"},
11
+ {name = "Derek Worthen", email = "deworthe@microsoft.com"},
12
+ {name = "Gaudy Blanco Meneses", email = "gaudyb@microsoft.com"},
13
+ {name = "Ha Trinh", email = "trinhha@microsoft.com"},
14
+ {name = "Jonathan Larson", email = "jolarso@microsoft.com"},
15
+ {name = "Josh Bradley", email = "joshbradley@microsoft.com"},
16
+ {name = "Kate Lytvynets", email = "kalytv@microsoft.com"},
17
+ {name = "Kenny Zhang", email = "zhangken@microsoft.com"},
18
+ {name = "Mónica Carvajal"},
19
+ {name = "Nathan Evans", email = "naevans@microsoft.com"},
20
+ {name = "Rodrigo Racanicci", email = "rracanicci@microsoft.com"},
21
+ {name = "Sarah Smith", email = "smithsarah@microsoft.com"},
22
+ ]
23
+ license = {text = "MIT"}
24
+ readme = "README.md"
25
+ requires-python = ">=3.11,<3.14"
26
+ classifiers = [
27
+ "Programming Language :: Python :: 3",
28
+ "Programming Language :: Python :: 3.11",
29
+ "Programming Language :: Python :: 3.12",
30
+ "Programming Language :: Python :: 3.13",
31
+ ]
32
+ dependencies = [
33
+ "azure-core~=1.32",
34
+ "azure-cosmos~=4.9",
35
+ "azure-identity~=1.19",
36
+ "azure-search-documents~=11.6",
37
+ "graphrag-common==3.0.0",
38
+ "lancedb~=0.24.1",
39
+ "numpy~=2.1",
40
+ "pyarrow~=22.0",
41
+ "pydantic~=2.10",
42
+ ]
43
+
44
+ [project.urls]
45
+ Source = "https://github.com/microsoft/graphrag"
46
+
47
+ [build-system]
48
+ requires = ["hatchling>=1.27.0,<2.0.0"]
49
+ build-backend = "hatchling.build"