gllm-datastore-binary 0.5.50__cp312-cp312-macosx_13_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. gllm_datastore/__init__.pyi +0 -0
  2. gllm_datastore/cache/__init__.pyi +4 -0
  3. gllm_datastore/cache/base.pyi +84 -0
  4. gllm_datastore/cache/cache.pyi +137 -0
  5. gllm_datastore/cache/hybrid_cache/__init__.pyi +5 -0
  6. gllm_datastore/cache/hybrid_cache/file_system_hybrid_cache.pyi +50 -0
  7. gllm_datastore/cache/hybrid_cache/hybrid_cache.pyi +115 -0
  8. gllm_datastore/cache/hybrid_cache/in_memory_hybrid_cache.pyi +29 -0
  9. gllm_datastore/cache/hybrid_cache/key_matcher/__init__.pyi +5 -0
  10. gllm_datastore/cache/hybrid_cache/key_matcher/exact_key_matcher.pyi +44 -0
  11. gllm_datastore/cache/hybrid_cache/key_matcher/fuzzy_key_matcher.pyi +70 -0
  12. gllm_datastore/cache/hybrid_cache/key_matcher/key_matcher.pyi +60 -0
  13. gllm_datastore/cache/hybrid_cache/key_matcher/semantic_key_matcher.pyi +93 -0
  14. gllm_datastore/cache/hybrid_cache/redis_hybrid_cache.pyi +34 -0
  15. gllm_datastore/cache/hybrid_cache/utils.pyi +36 -0
  16. gllm_datastore/cache/utils.pyi +34 -0
  17. gllm_datastore/cache/vector_cache/__init__.pyi +0 -0
  18. gllm_datastore/cache/vector_cache/eviction_manager/__init__.pyi +0 -0
  19. gllm_datastore/cache/vector_cache/eviction_manager/asyncio_eviction_manager.pyi +48 -0
  20. gllm_datastore/cache/vector_cache/eviction_manager/eviction_manager.pyi +38 -0
  21. gllm_datastore/cache/vector_cache/eviction_strategy/__init__.pyi +0 -0
  22. gllm_datastore/cache/vector_cache/eviction_strategy/eviction_strategy.pyi +34 -0
  23. gllm_datastore/cache/vector_cache/eviction_strategy/ttl_eviction_strategy.pyi +34 -0
  24. gllm_datastore/cache/vector_cache/vector_cache.pyi +99 -0
  25. gllm_datastore/constants.pyi +66 -0
  26. gllm_datastore/core/__init__.pyi +7 -0
  27. gllm_datastore/core/capabilities/__init__.pyi +7 -0
  28. gllm_datastore/core/capabilities/encryption_capability.pyi +21 -0
  29. gllm_datastore/core/capabilities/fulltext_capability.pyi +73 -0
  30. gllm_datastore/core/capabilities/graph_capability.pyi +70 -0
  31. gllm_datastore/core/capabilities/hybrid_capability.pyi +184 -0
  32. gllm_datastore/core/capabilities/vector_capability.pyi +90 -0
  33. gllm_datastore/core/filters/__init__.pyi +4 -0
  34. gllm_datastore/core/filters/filter.pyi +340 -0
  35. gllm_datastore/core/filters/schema.pyi +149 -0
  36. gllm_datastore/data_store/__init__.pyi +8 -0
  37. gllm_datastore/data_store/_elastic_core/__init__.pyi +0 -0
  38. gllm_datastore/data_store/_elastic_core/client_factory.pyi +66 -0
  39. gllm_datastore/data_store/_elastic_core/constants.pyi +27 -0
  40. gllm_datastore/data_store/_elastic_core/elastic_like_core.pyi +115 -0
  41. gllm_datastore/data_store/_elastic_core/index_manager.pyi +37 -0
  42. gllm_datastore/data_store/_elastic_core/query_translator.pyi +89 -0
  43. gllm_datastore/data_store/base.pyi +176 -0
  44. gllm_datastore/data_store/chroma/__init__.pyi +4 -0
  45. gllm_datastore/data_store/chroma/_chroma_import.pyi +13 -0
  46. gllm_datastore/data_store/chroma/data_store.pyi +201 -0
  47. gllm_datastore/data_store/chroma/fulltext.pyi +134 -0
  48. gllm_datastore/data_store/chroma/query.pyi +266 -0
  49. gllm_datastore/data_store/chroma/query_translator.pyi +41 -0
  50. gllm_datastore/data_store/chroma/vector.pyi +197 -0
  51. gllm_datastore/data_store/elasticsearch/__init__.pyi +5 -0
  52. gllm_datastore/data_store/elasticsearch/data_store.pyi +147 -0
  53. gllm_datastore/data_store/elasticsearch/fulltext.pyi +238 -0
  54. gllm_datastore/data_store/elasticsearch/query.pyi +118 -0
  55. gllm_datastore/data_store/elasticsearch/query_translator.pyi +18 -0
  56. gllm_datastore/data_store/elasticsearch/vector.pyi +180 -0
  57. gllm_datastore/data_store/exceptions.pyi +35 -0
  58. gllm_datastore/data_store/in_memory/__init__.pyi +5 -0
  59. gllm_datastore/data_store/in_memory/data_store.pyi +71 -0
  60. gllm_datastore/data_store/in_memory/fulltext.pyi +131 -0
  61. gllm_datastore/data_store/in_memory/query.pyi +175 -0
  62. gllm_datastore/data_store/in_memory/vector.pyi +174 -0
  63. gllm_datastore/data_store/opensearch/__init__.pyi +5 -0
  64. gllm_datastore/data_store/opensearch/data_store.pyi +160 -0
  65. gllm_datastore/data_store/opensearch/fulltext.pyi +240 -0
  66. gllm_datastore/data_store/opensearch/query.pyi +89 -0
  67. gllm_datastore/data_store/opensearch/query_translator.pyi +18 -0
  68. gllm_datastore/data_store/opensearch/vector.pyi +211 -0
  69. gllm_datastore/data_store/redis/__init__.pyi +5 -0
  70. gllm_datastore/data_store/redis/data_store.pyi +153 -0
  71. gllm_datastore/data_store/redis/fulltext.pyi +128 -0
  72. gllm_datastore/data_store/redis/query.pyi +428 -0
  73. gllm_datastore/data_store/redis/query_translator.pyi +37 -0
  74. gllm_datastore/data_store/redis/vector.pyi +131 -0
  75. gllm_datastore/data_store/sql/__init__.pyi +4 -0
  76. gllm_datastore/data_store/sql/constants.pyi +5 -0
  77. gllm_datastore/data_store/sql/data_store.pyi +201 -0
  78. gllm_datastore/data_store/sql/fulltext.pyi +164 -0
  79. gllm_datastore/data_store/sql/query.pyi +81 -0
  80. gllm_datastore/data_store/sql/query_translator.pyi +51 -0
  81. gllm_datastore/data_store/sql/schema.pyi +16 -0
  82. gllm_datastore/encryptor/__init__.pyi +4 -0
  83. gllm_datastore/encryptor/aes_gcm_encryptor.pyi +45 -0
  84. gllm_datastore/encryptor/capability/__init__.pyi +3 -0
  85. gllm_datastore/encryptor/capability/mixin.pyi +32 -0
  86. gllm_datastore/encryptor/encryptor.pyi +52 -0
  87. gllm_datastore/encryptor/key_ring/__init__.pyi +3 -0
  88. gllm_datastore/encryptor/key_ring/in_memory_key_ring.pyi +52 -0
  89. gllm_datastore/encryptor/key_ring/key_ring.pyi +45 -0
  90. gllm_datastore/encryptor/key_rotating_encryptor.pyi +60 -0
  91. gllm_datastore/graph_data_store/__init__.pyi +6 -0
  92. gllm_datastore/graph_data_store/graph_data_store.pyi +151 -0
  93. gllm_datastore/graph_data_store/graph_rag_data_store.pyi +29 -0
  94. gllm_datastore/graph_data_store/light_rag_data_store.pyi +93 -0
  95. gllm_datastore/graph_data_store/light_rag_postgres_data_store.pyi +96 -0
  96. gllm_datastore/graph_data_store/llama_index_graph_rag_data_store.pyi +49 -0
  97. gllm_datastore/graph_data_store/llama_index_neo4j_graph_rag_data_store.pyi +78 -0
  98. gllm_datastore/graph_data_store/mixins/__init__.pyi +3 -0
  99. gllm_datastore/graph_data_store/mixins/agentic_graph_tools_mixin.pyi +175 -0
  100. gllm_datastore/graph_data_store/nebula_graph_data_store.pyi +206 -0
  101. gllm_datastore/graph_data_store/neo4j_graph_data_store.pyi +182 -0
  102. gllm_datastore/graph_data_store/schema.pyi +27 -0
  103. gllm_datastore/graph_data_store/utils/__init__.pyi +6 -0
  104. gllm_datastore/graph_data_store/utils/constants.pyi +21 -0
  105. gllm_datastore/graph_data_store/utils/light_rag_em_invoker_adapter.pyi +56 -0
  106. gllm_datastore/graph_data_store/utils/light_rag_lm_invoker_adapter.pyi +43 -0
  107. gllm_datastore/graph_data_store/utils/llama_index_em_invoker_adapter.pyi +45 -0
  108. gllm_datastore/graph_data_store/utils/llama_index_lm_invoker_adapter.pyi +169 -0
  109. gllm_datastore/signature/__init__.pyi +0 -0
  110. gllm_datastore/signature/webhook_signature.pyi +31 -0
  111. gllm_datastore/sql_data_store/__init__.pyi +4 -0
  112. gllm_datastore/sql_data_store/adapter/__init__.pyi +0 -0
  113. gllm_datastore/sql_data_store/adapter/sqlalchemy_adapter.pyi +38 -0
  114. gllm_datastore/sql_data_store/constants.pyi +6 -0
  115. gllm_datastore/sql_data_store/sql_data_store.pyi +86 -0
  116. gllm_datastore/sql_data_store/sqlalchemy_sql_data_store.pyi +216 -0
  117. gllm_datastore/sql_data_store/types.pyi +31 -0
  118. gllm_datastore/utils/__init__.pyi +6 -0
  119. gllm_datastore/utils/converter.pyi +51 -0
  120. gllm_datastore/utils/dict.pyi +21 -0
  121. gllm_datastore/utils/ttl.pyi +25 -0
  122. gllm_datastore/utils/types.pyi +32 -0
  123. gllm_datastore/vector_data_store/__init__.pyi +6 -0
  124. gllm_datastore/vector_data_store/chroma_vector_data_store.pyi +259 -0
  125. gllm_datastore/vector_data_store/elasticsearch_vector_data_store.pyi +357 -0
  126. gllm_datastore/vector_data_store/in_memory_vector_data_store.pyi +179 -0
  127. gllm_datastore/vector_data_store/mixin/__init__.pyi +0 -0
  128. gllm_datastore/vector_data_store/mixin/cache_compatible_mixin.pyi +145 -0
  129. gllm_datastore/vector_data_store/redis_vector_data_store.pyi +191 -0
  130. gllm_datastore/vector_data_store/vector_data_store.pyi +146 -0
  131. gllm_datastore.build/.gitignore +1 -0
  132. gllm_datastore.cpython-312-darwin.so +0 -0
  133. gllm_datastore.pyi +178 -0
  134. gllm_datastore_binary-0.5.50.dist-info/METADATA +185 -0
  135. gllm_datastore_binary-0.5.50.dist-info/RECORD +137 -0
  136. gllm_datastore_binary-0.5.50.dist-info/WHEEL +5 -0
  137. gllm_datastore_binary-0.5.50.dist-info/top_level.txt +1 -0
@@ -0,0 +1,31 @@
+ from pydantic import BaseModel
+ from typing import Any, Sequence
+
+ class QueryFilter(BaseModel):
+     '''Model for query filters.
+
+     Attributes:
+         conditions (dict[str, Any]): The conditions for filtering the query.
+
+     Example:
+         QueryFilter(conditions={"column1": "value1", "column2": "value2"})
+     '''
+     conditions: dict[str, Any]
+
+ class QueryOptions(BaseModel):
+     '''Model for query options.
+
+     Attributes:
+         columns (Sequence[str] | None): The columns to include in the query result. Defaults to None.
+         fields (Sequence[str] | None): The fields to include in the query result. Defaults to None.
+         order_by (str | None): The column to order the query result by. Defaults to None.
+         order_desc (bool): Whether to order the query result in descending order. Defaults to False.
+         limit (int | None): The maximum number of rows to return. Defaults to None.
+
+     Example:
+         QueryOptions(fields=["field1", "field2"], order_by="column1", order_desc=True, limit=10)
+     '''
+     columns: Sequence[str] | None
+     order_by: str | None
+     order_desc: bool
+     limit: int | None
@@ -0,0 +1,6 @@
+ from gllm_datastore.utils.converter import from_langchain as from_langchain
+ from gllm_datastore.utils.dict import flatten_dict as flatten_dict
+ from gllm_datastore.utils.ttl import convert_ttl_to_seconds as convert_ttl_to_seconds
+ from gllm_datastore.utils.types import QueryFilter as QueryFilter, QueryOptions as QueryOptions
+
+ __all__ = ['from_langchain', 'convert_ttl_to_seconds', 'flatten_dict', 'QueryFilter', 'QueryOptions']
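For orientation, here is a short sketch of the re-exported helpers in use. The values in the comments follow the docstrings further down in this diff; they are illustrative, not captured output.

```python
from gllm_datastore.utils import convert_ttl_to_seconds, flatten_dict

# Flatten nested metadata into dotted keys (documented in gllm_datastore/utils/dict.pyi).
meta = flatten_dict({"doc": {"source": "wiki", "lang": "en"}})
# meta == {"doc.source": "wiki", "doc.lang": "en"}

# Convert a human-readable TTL into seconds (documented in gllm_datastore/utils/ttl.pyi).
ttl_seconds = convert_ttl_to_seconds("2h")
# ttl_seconds == 7200
```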
@@ -0,0 +1,51 @@
+ from gllm_core.schema import Chunk
+ from gllm_datastore.constants import SIMILARITY_SCORE as SIMILARITY_SCORE
+ from langchain_core.documents import Document
+
+ def from_langchain(doc: Document, score: float | None = None) -> Chunk:
+     """Create a standardized Chunk from a LangChain Document.
+
+     Args:
+         doc (Document): The document to create a Chunk from.
+         score (float | None, optional): The score to assign to the Chunk. Defaults to None, in which case it will
+             attempt to get the score from the `score` metadata.
+
+     Returns:
+         Chunk: The standardized Chunk object.
+     """
+ def to_langchain(chunk: Chunk) -> Document:
+     """Create a LangChain Document from a standardized Chunk.
+
+     Args:
+         chunk (Chunk): The standardized Chunk to create a Document from.
+
+     Returns:
+         Document: The LangChain Document object.
+     """
+ def l2_distance_to_similarity_score(distance: float) -> float:
+     """Convert L2 distance to a similarity score.
+
+     Args:
+         distance (float): The distance value to convert. Ranges in [0, inf).
+
+     Returns:
+         float: The converted similarity value.
+     """
+ def cosine_distance_to_similarity_score(distance: float) -> float:
+     """Convert cosine distance to a similarity score.
+
+     Args:
+         distance (float): The cosine distance value to convert. Ranges in [0, 2].
+
+     Returns:
+         float: The converted similarity value. Ranges in [0, 1].
+     """
+ def similarity_score_to_cosine_distance(similarity: float) -> float:
+     """Convert a similarity score to cosine distance.
+
+     Args:
+         similarity (float): The similarity value to convert. Ranges in [0, 1].
+
+     Returns:
+         float: The converted cosine distance value. Ranges in [0, 2].
+     """
@@ -0,0 +1,21 @@
+ from typing import Any
+
+ def flatten_dict(nested_dict: dict[str, Any], parent_key: str = '', sep: str = '.') -> dict[str, Any]:
+     '''Flatten a nested dictionary into a single level dictionary.
+
+     Args:
+         nested_dict (dict[str, Any]): The nested dictionary to flatten.
+         parent_key (str, optional): The parent key to prepend to the keys in the flattened dictionary.
+             Defaults to empty string.
+         sep (str, optional): The separator to use between the parent key and the child key. Defaults to ".".
+
+     Returns:
+         dict[str, Any]: The flattened dictionary.
+
+     Examples:
+         ```python
+         nested = {"a": {"b": 1, "c": 2}, "d": 3}
+         flattened = flatten_dict(nested)
+         # Result: {"a.b": 1, "a.c": 2, "d": 3}
+         ```
+     '''
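A minimal implementation consistent with the docstring above; edge cases such as empty nested dicts are not specified by the stub, so this is a sketch, not the shipped code.

```python
from typing import Any

def flatten_dict(nested_dict: dict[str, Any], parent_key: str = '', sep: str = '.') -> dict[str, Any]:
    flat: dict[str, Any] = {}
    for key, value in nested_dict.items():
        full_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            # Recurse into nested dicts, prepending the accumulated key path.
            flat.update(flatten_dict(value, full_key, sep))
        else:
            flat[full_key] = value
    return flat

assert flatten_dict({"a": {"b": 1, "c": 2}, "d": 3}) == {"a.b": 1, "a.c": 2, "d": 3}
```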
@@ -0,0 +1,25 @@
+ from _typeshed import Incomplete
+
+ TIME_UNIT_TO_SECOND_MAPPING: Incomplete
+
+ def convert_ttl_to_seconds(ttl: str | int) -> int:
+     '''Convert TTL (time-to-live) string with time units to seconds.
+
+     Supported units: s (seconds), m (minutes), h (hours), d (days), w (weeks), y (years).
+
+     Examples:
+         "2m" -> 120 (2 minutes in seconds)
+         "1h" -> 3600 (1 hour in seconds)
+         "1y" -> 31536000 (1 year in seconds)
+         300 -> 300 (numeric input returned as is)
+
+     Args:
+         ttl (str | int): Time to live value with optional unit suffix (e.g., "2m", "1h", "1y")
+             or numeric value in seconds.
+
+     Returns:
+         int: TTL converted to seconds.
+
+     Raises:
+         ValueError: If the input format is invalid.
+     '''
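The unit table is typed only as Incomplete, so the mapping below is reconstructed from the documented examples ("1y" -> 31536000 implies a 365-day year); accepting only lowercase suffixes is an assumption.

```python
import re

# Reconstructed from the docstring examples; the stub types this as Incomplete.
TIME_UNIT_TO_SECOND_MAPPING = {
    "s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800, "y": 31536000,
}

def convert_ttl_to_seconds(ttl: str | int) -> int:
    if isinstance(ttl, int):
        return ttl  # numeric input is returned as-is, per the docstring
    match = re.fullmatch(r"(\d+)([smhdwy])", ttl.strip())
    if match is None:
        raise ValueError(f"Invalid TTL format: {ttl!r}")
    value, unit = match.groups()
    return int(value) * TIME_UNIT_TO_SECOND_MAPPING[unit]

assert convert_ttl_to_seconds("2m") == 120
assert convert_ttl_to_seconds(300) == 300
```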
@@ -0,0 +1,32 @@
+ from pydantic import BaseModel
+ from typing import Any, Sequence
+
+ class QueryFilter(BaseModel):
+     '''Model for query filters.
+
+     Attributes:
+         conditions (dict[str, Any]): The conditions for filtering the query.
+
+     Example:
+         QueryFilter(conditions={"column1": "value1", "column2": "value2"})
+     '''
+     conditions: dict[str, Any]
+
+ class QueryOptions(BaseModel):
+     '''Model for query options.
+
+     Attributes:
+         columns (Sequence[str] | None): The columns to include in the query result. Defaults to None.
+         fields (Sequence[str] | None): The fields to include in the query result. Defaults to None.
+         order_by (str | None): The column to order the query result by. Defaults to None.
+         order_desc (bool): Whether to order the query result in descending order. Defaults to False.
+         limit (int | None): The maximum number of rows to return. Defaults to None.
+
+     Example:
+         QueryOptions(fields=["field1", "field2"], order_by="column1", order_desc=True, limit=10)
+     '''
+     columns: Sequence[str] | None
+     fields: Sequence[str] | None
+     order_by: str | None
+     order_desc: bool
+     limit: int | None
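Both models validate via pydantic, so usage is straightforward. The stub does not show default values, so every field is passed explicitly in this sketch; `model_dump` assumes pydantic v2.

```python
from gllm_datastore.utils import QueryFilter, QueryOptions

# Filter rows where both conditions hold, per the QueryFilter docstring example.
flt = QueryFilter(conditions={"status": "active", "region": "us-east"})

opts = QueryOptions(
    columns=None,                 # the stub omits defaults, so pass every field explicitly
    fields=["id", "created_at"],
    order_by="created_at",
    order_desc=True,
    limit=10,
)

print(flt.conditions)     # {'status': 'active', 'region': 'us-east'}
print(opts.model_dump())  # serialization; assumes pydantic v2
```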
@@ -0,0 +1,6 @@
+ from gllm_datastore.vector_data_store.chroma_vector_data_store import ChromaVectorDataStore as ChromaVectorDataStore
+ from gllm_datastore.vector_data_store.elasticsearch_vector_data_store import ElasticsearchVectorDataStore as ElasticsearchVectorDataStore
+ from gllm_datastore.vector_data_store.in_memory_vector_data_store import InMemoryVectorDataStore as InMemoryVectorDataStore
+ from gllm_datastore.vector_data_store.redis_vector_data_store import RedisVectorDataStore as RedisVectorDataStore
+
+ __all__ = ['ChromaVectorDataStore', 'ElasticsearchVectorDataStore', 'InMemoryVectorDataStore', 'RedisVectorDataStore']
@@ -0,0 +1,259 @@
+ from _typeshed import Incomplete
+ from chromadb.types import Where, WhereDocument
+ from datetime import datetime
+ from enum import Enum
+ from gllm_core.schema.chunk import Chunk
+ from gllm_datastore.constants import DEFAULT_TOP_K as DEFAULT_TOP_K, METADATA_KEYS as METADATA_KEYS
+ from gllm_datastore.utils.converter import from_langchain as from_langchain, l2_distance_to_similarity_score as l2_distance_to_similarity_score, to_langchain as to_langchain
+ from gllm_datastore.vector_data_store.mixin.cache_compatible_mixin import CacheCompatibleMixin as CacheCompatibleMixin
+ from gllm_datastore.vector_data_store.vector_data_store import BaseVectorDataStore as BaseVectorDataStore
+ from gllm_inference.em_invoker.em_invoker import BaseEMInvoker
+ from langchain_core.embeddings import Embeddings
+ from typing import Any
+
+ DEFAULT_NUM_CANDIDATES: int
+
+ class ChromaClientType(str, Enum):
+     """Enum for different types of ChromaDB clients.
+
+     Attributes:
+         MEMORY (str): Client type for an in-memory data store.
+         PERSISTENT (str): Client type for a persistent data store.
+         HTTP (str): Client type for a client-server architecture.
+     """
+     MEMORY: str
+     PERSISTENT: str
+     HTTP: str
+
+ class ChromaVectorDataStore(BaseVectorDataStore, CacheCompatibleMixin):
+     """Datastore for interacting with ChromaDB.
+
+     This class provides methods to interact with ChromaDB for vector storage and retrieval
+     using the langchain-chroma integration.
+
+     Attributes:
+         vector_store (Chroma): The langchain Chroma vector store instance.
+         collection_name (str): The name of the ChromaDB collection to use.
+         num_candidates (int): The maximum number of candidates to consider during search.
+         embedding (BaseEMInvoker | Embeddings | None): The embedding model used for vectorization.
+     """
+     vector_store: Incomplete
+     collection_name: Incomplete
+     num_candidates: Incomplete
+     def __init__(self, collection_name: str, embedding: BaseEMInvoker | Embeddings | None = None, client_type: ChromaClientType = ..., persist_directory: str | None = None, host: str | None = None, port: int | None = None, headers: dict | None = None, num_candidates: int = ..., **kwargs: Any) -> None:
+         """Initialize the ChromaDB vector data store with langchain-chroma.
+
+         Args:
+             collection_name (str): Name of the collection to use in ChromaDB.
+             embedding (BaseEMInvoker | Embeddings | None, optional): The embedding model used for vectorization.
+                 Defaults to None.
+             client_type (ChromaClientType, optional): Type of ChromaDB client to use.
+                 Defaults to ChromaClientType.MEMORY.
+             persist_directory (str | None, optional): Directory to persist vector store data.
+                 Required for the PERSISTENT client type. Defaults to None.
+             host (str | None, optional): Host address for the ChromaDB server.
+                 Required for the HTTP client type. Defaults to None.
+             port (int | None, optional): Port for the ChromaDB server.
+                 Required for the HTTP client type. Defaults to None.
+             headers (dict | None, optional): Headers for the ChromaDB server.
+                 Used for the HTTP client type. Defaults to None.
+             num_candidates (int, optional): Maximum number of candidates to consider during search.
+                 Defaults to DEFAULT_NUM_CANDIDATES.
+             **kwargs: Additional parameters for Chroma initialization.
+
+         Note:
+             num_candidates caps the number of results considered during a search, so an index with
+             more documents needs a higher value for every document to be considered. This is due to
+             a bug in Chroma's search algorithm, discussed in
+             https://github.com/langchain-ai/langchain/issues/1946.
+         """
+     async def get_size(self) -> int:
+         """Returns the total number of vectors in the index.
+
+         If the index is not initialized, returns 0.
+
+         Returns:
+             int: The total number of vectors.
+         """
+     async def query(self, query: str, top_k: int = ..., retrieval_params: dict[str, Any] | None = None) -> list[Chunk]:
+         '''Query the vector data store for similar chunks with similarity scores.
+
+         Args:
+             query (str): The query string to find similar chunks for.
+             top_k (int, optional): Maximum number of results to return. Defaults to DEFAULT_TOP_K.
+             retrieval_params (dict[str, Any] | None, optional): Additional parameters for retrieval.
+                 - filter (Where, optional): A Where type dict used to filter the retrieval by the metadata keys.
+                   E.g. `{"$and": [{"color": "red"}, {"price": {"$gte": 4.20}}]}`.
+                 - where_document (WhereDocument, optional): A WhereDocument type dict used to filter the retrieval by
+                   the document content. E.g. `{"$contains": "hello"}`.
+                 Defaults to None.
+
+         Returns:
+             list[Chunk]: A list of Chunk objects matching the query, with similarity scores.
+         '''
+     async def query_by_id(self, id: str | list[str]) -> list[Chunk]:
+         """Retrieve chunks by their IDs.
+
+         Args:
+             id (str | list[str]): A single ID or a list of IDs to retrieve.
+
+         Returns:
+             list[Chunk]: A list of retrieved Chunk objects.
+         """
+     async def add_chunks(self, chunks: Chunk | list[Chunk], **kwargs) -> list[str]:
+         """Add chunks to the vector data store.
+
+         Args:
+             chunks (Chunk | list[Chunk]): A single chunk or list of chunks to add.
+             **kwargs: Additional keyword arguments for the add operation.
+
+         Returns:
+             list[str]: List of IDs of the added chunks.
+         """
+     async def delete_chunks(self, where: Where | None = None, where_document: WhereDocument | None = None, **kwargs: Any) -> None:
+         '''Delete chunks from the vector data store.
+
+         Args:
+             where (Where | None, optional): A Where type dict used to filter the deletion by metadata.
+                 E.g. `{"source": "mydoc"}`. Defaults to None.
+             where_document (WhereDocument | None, optional): A WhereDocument type dict used to filter the deletion by
+                 the document content. E.g. `{"$contains": "hello"}`. Defaults to None.
+             **kwargs: Additional keyword arguments for the delete operation.
+
+         Note:
+             If no filter criteria are provided, all chunks in the collection will be deleted. Use with caution.
+         '''
+     async def delete_chunks_by_ids(self, ids: str | list[str], **kwargs: Any) -> None:
+         """Delete chunks from the vector data store by IDs.
+
+         Args:
+             ids (str | list[str]): A single ID or a list of IDs to delete.
+             **kwargs: Additional keyword arguments.
+
+         Note:
+             If no IDs are provided, no chunks will be deleted.
+         """
+     async def exact_match(self, key: str, metadata: dict[str, Any] | None = None) -> Any | None:
+         '''Find chunks that exactly match the given key.
+
+         This method searches for documents with the exact original_key in metadata.
+
+         Args:
+             key (str): The key to match.
+             metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                 For example, `{"key": "value"}`. Defaults to None.
+
+         Returns:
+             Any: The value stored with the exact key match, or None if no match is found.
+         '''
+     async def fuzzy_match(self, key: str, max_distance: int = 2, metadata: dict[str, Any] | None = None) -> Any | None:
+         '''Find chunks that approximately match the given key using fuzzy matching.
+
+         Args:
+             key (str): The key to match.
+             max_distance (int): Maximum allowed Levenshtein distance for fuzzy matching.
+                 Higher values are more lenient. Defaults to 2.
+             metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                 For example, `{"key": "value"}`. Defaults to None.
+
+         Returns:
+             Any: The value with the closest fuzzy match to the key, or None if no match meets the threshold.
+         '''
+     async def semantic_match(self, key: str, min_similarity: float = 0.2, metadata: dict[str, Any] | None = None) -> Any | None:
+         '''Find chunks that semantically match the given key using vector similarity.
+
+         Args:
+             key (str): The key to match.
+             min_similarity (float): Minimum similarity score for semantic matching
+                 (higher values are more strict). Ranges from 0 to 1. Defaults to 0.2.
+             metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                 For example, `{"key": "value"}`. Defaults to None.
+
+         Returns:
+             Any: The semantically closest value, or None if no match meets the min_similarity.
+         '''
+     async def delete_expired_entries(self, now: datetime, max_size: int = 10000) -> None:
+         """Delete expired entries (for TTL eviction).
+
+         Args:
+             now (datetime): The current datetime for comparison.
+             max_size (int): The maximum number of entries to delete. Defaults to 10000.
+
+         Raises:
+             NotImplementedError: Currently, app-level eviction is not supported for ChromaVectorDataStore.
+         """
+     async def delete_least_frequently_used_entries(self, num_entries: int) -> None:
+         """Delete least frequently used entries (for LFU eviction).
+
+         Args:
+             num_entries (int): Number of entries to delete.
+
+         Raises:
+             NotImplementedError: Currently, app-level eviction is not supported for ChromaVectorDataStore.
+         """
+     async def delete_least_recently_used_entries(self, num_entries: int) -> None:
+         """Delete least recently used entries (for LRU eviction).
+
+         Args:
+             num_entries (int): Number of entries to delete.
+
+         Raises:
+             NotImplementedError: Currently, app-level eviction is not supported for ChromaVectorDataStore.
+         """
+     async def delete_entries_by_key(self, key: str, metadata: dict[str, Any] | None = None) -> None:
+         '''Delete entries by key.
+
+         Args:
+             key (str): The key to delete entries for.
+             metadata (dict[str, Any] | None, optional): Optional metadata filter to apply to the search.
+                 For example, `{"key": "value"}`. Defaults to None.
+
+         Raises:
+             NotImplementedError: Currently, app-level eviction is not supported for ChromaVectorDataStore.
+         '''
+     async def clear(self) -> None:
+         """Clear all entries in the storage.
+
+         Raises:
+             NotImplementedError: Currently, app-level eviction is not supported for ChromaVectorDataStore.
+         """
+     async def query_by_field(self, retrieval_params: dict[str, Any], limit: int | None = None, **kwargs) -> list[Chunk]:
+         """Retrieve documents that match specific metadata constraints.
+
+         This method filters and returns stored chunks based on metadata values
+         rather than vector similarity. It is particularly useful for structured lookups,
+         such as retrieving all chunks from a certain source, tagged with a specific label,
+         or authored by a particular user.
+
+         Unlike semantic search methods, `query_by_field` operates purely on metadata fields
+         associated with each document, allowing precise filtering based on key-value pairs.
+
+         Args:
+             retrieval_params (dict[str, Any]): A dictionary defining filter criteria. Common keys include:
+                 - `filter` (dict): A dictionary of metadata field conditions.
+                 - `where_document` (dict, optional): Conditions based on document content.
+             limit (int | None, optional): The maximum number of results to return. If None, all matching
+                 documents will be returned.
+             **kwargs: Additional arguments to support datastore-specific behavior or filtering logic.
+
+         Returns:
+             list[Chunk]: A list of `Chunk` objects that satisfy the metadata criteria.
+
+         Raises:
+             NotImplementedError: If not implemented in the subclass.
+         """
+     async def query_by_vector(self, vector: list[float], top_k: int = ..., min_similarity: float = 0.8, retrieval_params: dict | None = None) -> list[Chunk]:
+         """Search for documents that are similar to a given vector.
+
+         Args:
+             vector (list[float]): The query embedding vector to compare against stored vectors.
+             top_k (int, optional): The number of top results to return. Defaults to DEFAULT_TOP_K.
+             min_similarity (float, optional): Minimum similarity score for results. Defaults to 0.8.
+             retrieval_params (dict | None, optional): Filter parameters to narrow the search:
+                 - filter (Where): Metadata-based filter.
+                 - where_document (WhereDocument): Content-based filter.
+                 Defaults to None.
+
+         Returns:
+             list[Chunk]: A list of Chunk objects with similarity scores based on the input vector.
+         """