llama-stack 0.4.4__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. llama_stack/cli/stack/_list_deps.py +11 -7
  2. llama_stack/cli/stack/run.py +3 -25
  3. llama_stack/core/access_control/datatypes.py +78 -0
  4. llama_stack/core/configure.py +2 -2
  5. llama_stack/{distributions/meta-reference-gpu → core/connectors}/__init__.py +3 -1
  6. llama_stack/core/connectors/connectors.py +162 -0
  7. llama_stack/core/conversations/conversations.py +61 -58
  8. llama_stack/core/datatypes.py +54 -8
  9. llama_stack/core/library_client.py +60 -13
  10. llama_stack/core/prompts/prompts.py +43 -42
  11. llama_stack/core/routers/datasets.py +20 -17
  12. llama_stack/core/routers/eval_scoring.py +143 -53
  13. llama_stack/core/routers/inference.py +20 -9
  14. llama_stack/core/routers/safety.py +30 -42
  15. llama_stack/core/routers/vector_io.py +15 -7
  16. llama_stack/core/routing_tables/models.py +42 -3
  17. llama_stack/core/routing_tables/scoring_functions.py +19 -19
  18. llama_stack/core/routing_tables/shields.py +20 -17
  19. llama_stack/core/routing_tables/vector_stores.py +8 -5
  20. llama_stack/core/server/auth.py +192 -17
  21. llama_stack/core/server/fastapi_router_registry.py +40 -5
  22. llama_stack/core/server/server.py +24 -5
  23. llama_stack/core/stack.py +54 -10
  24. llama_stack/core/storage/datatypes.py +9 -0
  25. llama_stack/core/store/registry.py +1 -1
  26. llama_stack/core/utils/exec.py +2 -2
  27. llama_stack/core/utils/type_inspection.py +16 -2
  28. llama_stack/distributions/dell/config.yaml +4 -1
  29. llama_stack/distributions/dell/run-with-safety.yaml +4 -1
  30. llama_stack/distributions/nvidia/config.yaml +4 -1
  31. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
  32. llama_stack/distributions/oci/config.yaml +4 -1
  33. llama_stack/distributions/open-benchmark/config.yaml +9 -1
  34. llama_stack/distributions/postgres-demo/config.yaml +1 -1
  35. llama_stack/distributions/starter/build.yaml +62 -0
  36. llama_stack/distributions/starter/config.yaml +22 -3
  37. llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
  38. llama_stack/distributions/starter/starter.py +13 -1
  39. llama_stack/distributions/starter-gpu/build.yaml +62 -0
  40. llama_stack/distributions/starter-gpu/config.yaml +22 -3
  41. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
  42. llama_stack/distributions/template.py +10 -2
  43. llama_stack/distributions/watsonx/config.yaml +4 -1
  44. llama_stack/log.py +1 -0
  45. llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
  46. llama_stack/providers/inline/agents/meta_reference/agents.py +57 -61
  47. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +49 -51
  48. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +94 -22
  49. llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
  50. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
  51. llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
  52. llama_stack/providers/inline/batches/reference/batches.py +2 -1
  53. llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
  54. llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
  55. llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
  56. llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
  57. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
  58. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
  59. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +15 -18
  60. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
  61. llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
  62. llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
  63. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
  64. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
  65. llama_stack/providers/registry/agents.py +1 -0
  66. llama_stack/providers/registry/inference.py +1 -9
  67. llama_stack/providers/registry/vector_io.py +136 -16
  68. llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
  69. llama_stack/providers/remote/files/s3/config.py +5 -3
  70. llama_stack/providers/remote/files/s3/files.py +2 -2
  71. llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
  72. llama_stack/providers/remote/inference/openai/openai.py +2 -0
  73. llama_stack/providers/remote/inference/together/together.py +4 -0
  74. llama_stack/providers/remote/inference/vertexai/config.py +3 -3
  75. llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
  76. llama_stack/providers/remote/inference/vllm/config.py +37 -18
  77. llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
  78. llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
  79. llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
  80. llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
  81. llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
  82. llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
  83. llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
  84. llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
  85. llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
  86. llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
  87. llama_stack/providers/remote/vector_io/oci/config.py +41 -0
  88. llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
  89. llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
  90. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
  91. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
  92. llama_stack/providers/utils/bedrock/client.py +3 -3
  93. llama_stack/providers/utils/bedrock/config.py +7 -7
  94. llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
  95. llama_stack/providers/utils/inference/http_client.py +239 -0
  96. llama_stack/providers/utils/inference/litellm_openai_mixin.py +5 -0
  97. llama_stack/providers/utils/inference/model_registry.py +148 -2
  98. llama_stack/providers/utils/inference/openai_compat.py +2 -1
  99. llama_stack/providers/utils/inference/openai_mixin.py +41 -2
  100. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
  101. llama_stack/providers/utils/memory/vector_store.py +46 -19
  102. llama_stack/providers/utils/responses/responses_store.py +7 -7
  103. llama_stack/providers/utils/safety.py +114 -0
  104. llama_stack/providers/utils/tools/mcp.py +44 -3
  105. llama_stack/testing/api_recorder.py +9 -3
  106. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/METADATA +14 -2
  107. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/RECORD +111 -144
  108. llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
  109. llama_stack/distributions/meta-reference-gpu/doc_template.md +0 -119
  110. llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
  111. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
  112. llama_stack/models/llama/hadamard_utils.py +0 -88
  113. llama_stack/models/llama/llama3/args.py +0 -74
  114. llama_stack/models/llama/llama3/dog.jpg +0 -0
  115. llama_stack/models/llama/llama3/generation.py +0 -378
  116. llama_stack/models/llama/llama3/model.py +0 -304
  117. llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
  118. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
  119. llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
  120. llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
  121. llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
  122. llama_stack/models/llama/llama3/pasta.jpeg +0 -0
  123. llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
  124. llama_stack/models/llama/llama3/quantization/loader.py +0 -316
  125. llama_stack/models/llama/llama3_1/__init__.py +0 -12
  126. llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
  127. llama_stack/models/llama/llama3_1/prompts.py +0 -258
  128. llama_stack/models/llama/llama3_2/__init__.py +0 -5
  129. llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
  130. llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
  131. llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
  132. llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
  133. llama_stack/models/llama/llama3_3/__init__.py +0 -5
  134. llama_stack/models/llama/llama3_3/prompts.py +0 -259
  135. llama_stack/models/llama/llama4/args.py +0 -107
  136. llama_stack/models/llama/llama4/ffn.py +0 -58
  137. llama_stack/models/llama/llama4/moe.py +0 -214
  138. llama_stack/models/llama/llama4/preprocess.py +0 -435
  139. llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
  140. llama_stack/models/llama/llama4/quantization/loader.py +0 -226
  141. llama_stack/models/llama/llama4/vision/__init__.py +0 -5
  142. llama_stack/models/llama/llama4/vision/embedding.py +0 -210
  143. llama_stack/models/llama/llama4/vision/encoder.py +0 -412
  144. llama_stack/models/llama/quantize_impls.py +0 -316
  145. llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
  146. llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
  147. llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
  148. llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
  149. llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
  150. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
  151. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
  152. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/WHEEL +0 -0
  153. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/entry_points.txt +0 -0
  154. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  155. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0rc1.dist-info}/top_level.txt +0 -0
llama_stack/providers/remote/vector_io/pgvector/pgvector.py

@@ -37,7 +37,7 @@ from llama_stack_api import (
 )
 from llama_stack_api.internal.kvstore import KVStore
 
-from .config import PGVectorVectorIOConfig
+from .config import PGVectorIndexConfig, PGVectorIndexType, PGVectorVectorIOConfig
 
 log = get_logger(name=__name__, category="vector_io::pgvector")
 
@@ -81,6 +81,26 @@ def upsert_models(conn, keys_models: list[tuple[str, BaseModel]]):
         execute_values(cur, query, values, template="(%s, %s)")
 
 
+def remove_vector_store_metadata(conn: psycopg2.extensions.connection, vector_store_id: str) -> None:
+    """
+    Performs removal of vector store metadata from PGVector metadata_store table when vector store is unregistered
+
+    Args:
+        conn: active PostgreSQL connection
+        vector_store_id: identifier of VectorStore resource
+    """
+    try:
+        with conn.cursor() as cur:
+            cur.execute("DELETE FROM metadata_store WHERE key = %s", (vector_store_id,))
+            if cur.rowcount > 0:
+                log.info(f"Removed metadata for vector store '{vector_store_id}' from PGVector metadata_store table.")
+
+    except Exception as e:
+        raise RuntimeError(
+            f"Error removing metadata from PGVector metadata_store for vector_store: {vector_store_id}"
+        ) from e
+
+
 def load_models(cur, cls):
     cur.execute("SELECT key, data FROM metadata_store")
     rows = cur.fetchall()
@@ -89,22 +109,35 @@ def load_models(cur, cls):
 
 class PGVectorIndex(EmbeddingIndex):
     # reference: https://github.com/pgvector/pgvector?tab=readme-ov-file#querying
+    # Llama Stack supports only search functions that are applied for embeddings with vector type
     PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION: dict[str, str] = {
         "L2": "<->",
         "L1": "<+>",
         "COSINE": "<=>",
         "INNER_PRODUCT": "<#>",
-        "HAMMING": "<~>",
-        "JACCARD": "<%>",
     }
 
+    # reference: https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw
+    # Llama Stack supports only index operator classes that are applied for embeddings with vector type
+    PGVECTOR_DISTANCE_METRIC_TO_INDEX_OPERATOR_CLASS: dict[str, str] = {
+        "L2": "vector_l2_ops",
+        "L1": "vector_l1_ops",
+        "COSINE": "vector_cosine_ops",
+        "INNER_PRODUCT": "vector_ip_ops",
+    }
+
+    # pgvector's maximum embedding dimension for HNSW/IVFFlat indexes on column with type vector
+    # references: https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw and https://github.com/pgvector/pgvector?tab=readme-ov-file#ivfflat
+    MAX_EMBEDDING_DIMENSION_FOR_HNSW_AND_IVFFLAT_INDEX = 2000
+
     def __init__(
         self,
         vector_store: VectorStore,
         dimension: int,
         conn: psycopg2.extensions.connection,
+        distance_metric: str,
+        vector_index: PGVectorIndexConfig,
         kvstore: KVStore | None = None,
-        distance_metric: str = "COSINE",
     ):
         self.vector_store = vector_store
         self.dimension = dimension
@@ -112,6 +145,7 @@ class PGVectorIndex(EmbeddingIndex):
         self.kvstore = kvstore
         self.check_distance_metric_availability(distance_metric)
         self.distance_metric = distance_metric
+        self.vector_index = vector_index
         self.table_name = None
 
     async def initialize(self) -> None:
@@ -135,6 +169,28 @@ class PGVectorIndex(EmbeddingIndex):
                 """
             )
 
+            # pgvector's embedding dimensions requirement to create an index for Approximate Nearest Neighbor (ANN) search is up to 2,000 dimensions for column with type vector
+            if self.dimension <= self.MAX_EMBEDDING_DIMENSION_FOR_HNSW_AND_IVFFLAT_INDEX:
+                if self.vector_index.type == PGVectorIndexType.HNSW:
+                    await self.create_hnsw_vector_index(cur)
+
+                # Create the index only after the table has some data (https://github.com/pgvector/pgvector?tab=readme-ov-file#ivfflat)
+                elif (
+                    self.vector_index.type == PGVectorIndexType.IVFFlat
+                    and not await self.check_conflicting_vector_index_exists(cur)
+                ):
+                    log.info(
+                        f"Creation of {PGVectorIndexType.IVFFlat} vector index in vector_store: {self.vector_store.identifier} was deferred. It will be created when the table has some data."
+                    )
+
+            else:
+                log.info(
+                    f"Skip creation of {self.vector_index.type} vector index for embedding in PGVector for vector_store: {self.vector_store.identifier}"
+                )
+                log.info(
+                    "PGVector requires embedding dimensions are up to 2,000 to successfully create a vector index."
+                )
+
             # Create GIN index for full-text search performance
             cur.execute(
                 f"""
@@ -177,6 +233,13 @@ class PGVectorIndex(EmbeddingIndex):
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
             execute_values(cur, query, values, template="(%s, %s, %s::vector, %s, to_tsvector('english', %s))")
 
+            # Create the IVFFlat index only after the table has some data (https://github.com/pgvector/pgvector?tab=readme-ov-file#ivfflat)
+            if (
+                self.vector_index.type == PGVectorIndexType.IVFFlat
+                and self.dimension <= self.MAX_EMBEDDING_DIMENSION_FOR_HNSW_AND_IVFFLAT_INDEX
+            ):
+                await self.create_ivfflat_vector_index(cur)
+
     async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
         """
         Performs vector similarity search using PostgreSQL's search function. Default distance metric is COSINE.
@@ -192,6 +255,14 @@ class PGVectorIndex(EmbeddingIndex):
         pgvector_search_function = self.get_pgvector_search_function()
 
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            # Specify the number of probes to allow PGVector to use Index Scan using IVFFlat index if it was configured (https://github.com/pgvector/pgvector?tab=readme-ov-file#query-options-1)
+            if self.vector_index.type == PGVectorIndexType.IVFFlat:
+                cur.execute(
+                    f"""
+                    SET ivfflat.probes = {self.vector_index.probes};
+                    """
+                )
+
             cur.execute(
                 f"""
                 SELECT document, embedding {pgvector_search_function} %s::vector AS distance
@@ -324,6 +395,14 @@ class PGVectorIndex(EmbeddingIndex):
             # Fix: Use proper tuple parameter binding with explicit array cast
             cur.execute(f"DELETE FROM {self.table_name} WHERE id = ANY(%s::text[])", (chunk_ids,))
 
+    def get_pgvector_index_operator_class(self) -> str:
+        """Get the pgvector index operator class for the current distance metric.
+
+        Returns:
+            The operator class name.
+        """
+        return self.PGVECTOR_DISTANCE_METRIC_TO_INDEX_OPERATOR_CLASS[self.distance_metric]
+
     def get_pgvector_search_function(self) -> str:
         return self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION[self.distance_metric]
 
@@ -343,6 +422,160 @@ class PGVectorIndex(EmbeddingIndex):
                 f"Supported metrics are: {', '.join(supported_metrics)}"
             )
 
+    async def create_hnsw_vector_index(self, cur: cursor) -> None:
+        """Create PGVector HNSW vector index for Approximate Nearest Neighbor (ANN) search
+
+        Args:
+            cur: PostgreSQL cursor
+
+        Raises:
+            RuntimeError: If the error occurred when creating vector index in PGVector
+        """
+
+        # prevents from creating index for the table that already has conflicting index (HNSW or IVFFlat)
+        if await self.check_conflicting_vector_index_exists(cur):
+            return
+
+        try:
+            index_operator_class = self.get_pgvector_index_operator_class()
+
+            # Create HNSW (Hierarchical Navigable Small Worlds) index on embedding column to allow efficient and performant vector search in pgvector
+            # HNSW finds the approximate nearest neighbors by only calculating distance metric for vectors it visits during graph traversal instead of processing all vectors
+            cur.execute(
+                f"""
+                CREATE INDEX IF NOT EXISTS {self.table_name}_hnsw_idx
+                ON {self.table_name} USING hnsw(embedding {index_operator_class}) WITH (m = {self.vector_index.m}, ef_construction = {self.vector_index.ef_construction});
+                """
+            )
+            log.info(
+                f"{PGVectorIndexType.HNSW} vector index was created with parameters m = {self.vector_index.m}, ef_construction = {self.vector_index.ef_construction} for vector_store: {self.vector_store.identifier}."
+            )
+
+        except psycopg2.Error as e:
+            raise RuntimeError(
+                f"Failed to create {PGVectorIndexType.HNSW} vector index for vector_store: {self.vector_store.identifier}: {e}"
+            ) from e
+
+    async def create_ivfflat_vector_index(self, cur: cursor) -> None:
+        """Create PGVector IVFFlat vector index for Approximate Nearest Neighbor (ANN) search
+
+        Args:
+            cur: PostgreSQL cursor
+
+        Raises:
+            RuntimeError: If the error occurred when creating vector index in PGVector
+        """
+
+        # prevents from creating index for the table that already has conflicting index (HNSW or IVFFlat)
+        if await self.check_conflicting_vector_index_exists(cur):
+            return
+
+        # don't create index too early as it decreases a performance (https://github.com/pgvector/pgvector?tab=readme-ov-file#ivfflat)
+        # create IVFFLAT index only if vector store has rows >= lists * 1000
+        if await self.fetch_number_of_records(cur) < self.vector_index.lists * 1000:
+            log.info(
+                f"IVFFlat index wasn't created for vector_store {self.vector_store.identifier} because table doesn't have enough records."
+            )
+            return
+
+        try:
+            index_operator_class = self.get_pgvector_index_operator_class()
+
+            # Create Inverted File with Flat Compression (IVFFlat) index on embedding column to allow efficient and performant vector search in pgvector
+            # IVFFlat index divides vectors into lists, and then searches a subset of those lists that are closest to the query vector
+            # Index should be created only after the table has some data (https://github.com/pgvector/pgvector?tab=readme-ov-file#ivfflat)
+            cur.execute(
+                f"""
+                CREATE INDEX IF NOT EXISTS {self.table_name}_ivfflat_idx
+                ON {self.table_name} USING ivfflat(embedding {index_operator_class}) WITH (lists = {self.vector_index.lists});
+                """
+            )
+            log.info(
+                f"{PGVectorIndexType.IVFFlat} vector index was created with parameter lists = {self.vector_index.lists} for vector_store: {self.vector_store.identifier}."
+            )
+
+        except psycopg2.Error as e:
+            raise RuntimeError(
+                f"Failed to create {PGVectorIndexType.IVFFlat} vector index for vector_store: {self.vector_store.identifier}: {e}"
+            ) from e
+
+    async def check_conflicting_vector_index_exists(self, cur: cursor) -> bool:
+        """Check if vector index of any type has already been created for the table to prevent the conflict
+
+        Args:
+            cur: PostgreSQL cursor
+
+        Returns:
+            True if exists, otherwise False
+
+        Raises:
+            RuntimeError: If the error occurred when checking vector index exists in PGVector
+        """
+        try:
+            log.info(
+                f"Checking vector_store: {self.vector_store.identifier} for conflicting vector index in PGVector..."
+            )
+            cur.execute(
+                """
+                SELECT indexname FROM pg_indexes
+                WHERE (indexname LIKE %s OR indexname LIKE %s) AND tablename = %s;
+                """,
+                (
+                    "%hnsw%",
+                    "%ivfflat%",
+                    self.table_name,
+                ),
+            )
+            result = cur.fetchone()
+
+            if result:
+                log.warning(
+                    f"Conflicting vector index {result[0]} already exists in vector_store: {self.vector_store.identifier}"
+                )
+                log.warning(
+                    f"vector_store: {self.vector_store.identifier} will continue to use vector index {result[0]} to preserve performance."
+                )
+                return True
+
+            log.info(f"vector_store: {self.vector_store.identifier} currently doesn't have conflicting vector index")
+            log.info(f"Proceeding with creation of vector index for {self.vector_store.identifier}")
+            return False
+
+        except psycopg2.Error as e:
+            raise RuntimeError(f"Failed to check if vector index exists in PGVector: {e}") from e
+
+    async def fetch_number_of_records(self, cur: cursor) -> int:
+        """Returns number of records in a vector store
+
+        Args:
+            cur: PostgreSQL cursor
+
+        Returns:
+            number of records in a vector store
+
+        Raises:
+            RuntimeError: If the error occurred when fetching a number of records in a vector store in PGVector
+        """
+        try:
+            log.info(f"Fetching number of records in vector_store: {self.vector_store.identifier}...")
+            cur.execute(
+                f"""
+                SELECT COUNT(DISTINCT id)
+                FROM {self.table_name};
+                """
+            )
+            result = cur.fetchone()
+
+            if result:
+                log.info(f"vector_store: {self.vector_store.identifier} has {result[0]} records.")
+                return result[0]
+
+            log.info(f"vector_store: {self.vector_store.identifier} currently doesn't have any records.")
+            return 0
+
+        except psycopg2.Error as e:
+            raise RuntimeError(f"Failed to check if vector store has records in PGVector: {e}") from e
+
 
 class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
     def __init__(
@@ -401,6 +634,8 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
                 dimension=vector_store.embedding_dimension,
                 conn=self.conn,
                 kvstore=self.kvstore,
+                distance_metric=self.config.distance_metric,
+                vector_index=self.config.vector_index,
             )
             await pgvector_index.initialize()
             index = VectorStoreWithIndex(vector_store, index=pgvector_index, inference_api=self.inference_api)
@@ -427,7 +662,12 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
 
         # Create and cache the PGVector index table for the vector DB
         pgvector_index = PGVectorIndex(
-            vector_store=vector_store, dimension=vector_store.embedding_dimension, conn=self.conn, kvstore=self.kvstore
+            vector_store=vector_store,
+            dimension=vector_store.embedding_dimension,
+            conn=self.conn,
+            kvstore=self.kvstore,
+            distance_metric=self.config.distance_metric,
+            vector_index=self.config.vector_index,
         )
         await pgvector_index.initialize()
         index = VectorStoreWithIndex(vector_store, index=pgvector_index, inference_api=self.inference_api)
@@ -444,6 +684,9 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
             raise RuntimeError("KVStore not initialized. Call initialize() before unregistering vector stores.")
         await self.kvstore.delete(key=f"{VECTOR_DBS_PREFIX}{vector_store_id}")
 
+        # Delete vector store metadata from PGVector metadata_store table
+        remove_vector_store_metadata(self.conn, vector_store_id)
+
     async def insert_chunks(
         self, vector_store_id: str, chunks: list[EmbeddedChunk], ttl_seconds: int | None = None
     ) -> None:
@@ -470,7 +713,13 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
             raise VectorStoreNotFoundError(vector_store_id)
 
         vector_store = VectorStore.model_validate_json(vector_store_data)
-        index = PGVectorIndex(vector_store, vector_store.embedding_dimension, self.conn)
+        index = PGVectorIndex(
+            vector_store,
+            vector_store.embedding_dimension,
+            self.conn,
+            distance_metric=self.config.distance_metric,
+            vector_index=self.config.vector_index,
+        )
         await index.initialize()
         self.cache[vector_store_id] = VectorStoreWithIndex(vector_store, index, self.inference_api)
         return self.cache[vector_store_id]
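
For orientation, here is a minimal sketch of how the new ANN index options might be instantiated. The names distance_metric, vector_index, type, m, ef_construction, lists, and probes come from the diff; the exact constructor signatures and defaults are assumptions, so treat llama_stack/providers/remote/vector_io/pgvector/config.py as the authoritative schema.

# Illustrative only: field names taken from the diff; exact schema may differ.
from llama_stack.providers.remote.vector_io.pgvector.config import (
    PGVectorIndexConfig,
    PGVectorIndexType,
)

# HNSW index: created eagerly when the table is initialized, as long as the
# embedding dimension is <= 2000 (pgvector's limit for vector-typed columns).
hnsw_index = PGVectorIndexConfig(
    type=PGVectorIndexType.HNSW,
    m=16,                # graph connectivity per node (hypothetical value)
    ef_construction=64,  # build-time candidate list size (hypothetical value)
)

# IVFFlat index: creation is deferred until the table holds at least
# lists * 1000 rows; ivfflat.probes is set before each vector query.
ivfflat_index = PGVectorIndexConfig(
    type=PGVectorIndexType.IVFFlat,
    lists=100,  # number of inverted lists (hypothetical value)
    probes=10,  # lists probed at query time (hypothetical value)
)

# Both distance_metric and vector_index reach PGVectorIndex through the
# provider config (self.config.distance_metric / self.config.vector_index
# in the adapter code above).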
llama_stack/providers/remote/vector_io/qdrant/qdrant.py

@@ -16,6 +16,7 @@ from qdrant_client.models import PointStruct
 from llama_stack.core.storage.kvstore import kvstore_impl
 from llama_stack.log import get_logger
 from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig as InlineQdrantVectorIOConfig
+from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
 from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
 from llama_stack.providers.utils.memory.vector_store import ChunkForDeletion, EmbeddingIndex, VectorStoreWithIndex
 from llama_stack.providers.utils.vector_io.vector_utils import load_embedded_chunk_with_backward_compat
@@ -80,11 +81,16 @@ class QdrantIndex(EmbeddingIndex):
         points = []
         for chunk in chunks:
             chunk_id = chunk.chunk_id
+            content_text = interleaved_content_as_str(chunk.content)
             points.append(
                 PointStruct(
                     id=convert_id(chunk_id),
-                    vector=chunk.embedding,  # Already a list[float]
-                    payload={"chunk_content": chunk.model_dump()} | {CHUNK_ID_KEY: chunk_id},
+                    vector=chunk.embedding,
+                    payload={
+                        "chunk_content": chunk.model_dump(),
+                        "content_text": content_text,
+                        CHUNK_ID_KEY: chunk_id,
+                    },
                 )
             )
 
@@ -144,32 +150,32 @@ class QdrantIndex(EmbeddingIndex):
             QueryChunksResponse with chunks and scores matching the keyword query
         """
         try:
-            results = (
-                await self.client.query_points(
-                    collection_name=self.collection_name,
-                    query_filter=models.Filter(
-                        must=[
-                            models.FieldCondition(
-                                key="chunk_content.content", match=models.MatchText(text=query_string)
-                            )
-                        ]
-                    ),
-                    limit=k,
-                    with_payload=True,
-                    with_vectors=False,
-                    score_threshold=score_threshold,
-                )
-            ).points
+            # Use scroll for keyword-only search since query_points requires a query vector
+            # Scroll allows filtering without a query vector
+            query_words = query_string.lower().split()
+            if not query_words:
+                return QueryChunksResponse(chunks=[], scores=[])
+            scroll_result = await self.client.scroll(
+                collection_name=self.collection_name,
+                scroll_filter=models.Filter(
+                    should=[
+                        models.FieldCondition(key="content_text", match=models.MatchText(text=word))
+                        for word in query_words
+                    ]
+                ),
+                limit=k,
+                with_payload=True,
+                with_vectors=False,
+            )
+            results = scroll_result[0]
         except Exception as e:
             log.error(f"Error querying keyword search in Qdrant collection {self.collection_name}: {e}")
             raise
 
         chunks, scores = [], []
         for point in results:
-            if not isinstance(point, models.ScoredPoint):
-                raise RuntimeError(f"Expected ScoredPoint from Qdrant query, got {type(point).__name__}")
             if point.payload is None:
-                raise RuntimeError("Qdrant query returned point with no payload")
+                raise RuntimeError("Qdrant scroll returned point with no payload")
 
             try:
                 chunk = load_embedded_chunk_with_backward_compat(point.payload["chunk_content"])
@@ -182,8 +188,13 @@ class QdrantIndex(EmbeddingIndex):
                 )
                 continue
 
+            # For keyword search, use a fixed score of 1.0 since we're not doing vector similarity
+            score = 1.0
+            if score < score_threshold:
+                continue
+
             chunks.append(chunk)
-            scores.append(point.score)
+            scores.append(score)
 
         return QueryChunksResponse(chunks=chunks, scores=scores)
 
@@ -214,22 +225,35 @@ class QdrantIndex(EmbeddingIndex):
             QueryChunksResponse with filtered vector search results
         """
         try:
-            results = (
-                await self.client.query_points(
-                    collection_name=self.collection_name,
-                    query=embedding.tolist(),
-                    query_filter=models.Filter(
-                        must=[
-                            models.FieldCondition(
-                                key="chunk_content.content", match=models.MatchText(text=query_string)
-                            )
-                        ]
-                    ),
-                    limit=k,
-                    with_payload=True,
-                    score_threshold=score_threshold,
-                )
-            ).points
+            query_words = query_string.lower().split()
+            if not query_words:
+                # If no words, just do vector search without keyword filter
+                results = (
+                    await self.client.query_points(
+                        collection_name=self.collection_name,
+                        query=embedding.tolist(),
+                        limit=k,
+                        with_payload=True,
+                        score_threshold=score_threshold,
+                    )
+                ).points
+            else:
+                # Use should to match any of the query words
+                results = (
+                    await self.client.query_points(
+                        collection_name=self.collection_name,
+                        query=embedding.tolist(),
+                        query_filter=models.Filter(
+                            should=[
+                                models.FieldCondition(key="content_text", match=models.MatchText(text=word))
+                                for word in query_words
+                            ]
+                        ),
+                        limit=k,
+                        with_payload=True,
+                        score_threshold=score_threshold,
+                    )
+                ).points
         except Exception as e:
            log.error(f"Error querying hybrid search in Qdrant collection {self.collection_name}: {e}")
            raise
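
The keyword path above no longer goes through query_points (which needs a query vector); it scrolls the collection with a should-filter of per-word full-text matches against the new content_text payload field. A standalone sketch of that qdrant-client call pattern, with the collection name and client URL assumed for illustration:

# Minimal sketch of the scroll-based keyword filter used above; collection
# name and URL are placeholders, and MatchText assumes a full-text payload
# index exists on "content_text".
import asyncio

from qdrant_client import AsyncQdrantClient, models


async def keyword_points(query_string: str, k: int = 5):
    client = AsyncQdrantClient(url="http://localhost:6333")  # assumed local Qdrant
    words = query_string.lower().split()
    points, _next_offset = await client.scroll(
        collection_name="my_collection",
        scroll_filter=models.Filter(
            should=[
                models.FieldCondition(key="content_text", match=models.MatchText(text=word))
                for word in words
            ]
        ),
        limit=k,
        with_payload=True,
        with_vectors=False,
    )
    # scroll() does not score matches, which is why the adapter assigns a fixed 1.0.
    return points


if __name__ == "__main__":
    asyncio.run(keyword_points("llama stack"))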
llama_stack/providers/utils/bedrock/client.py

@@ -49,9 +49,9 @@ def create_bedrock_client(config: BedrockBaseConfig, service_name: str = "bedroc
     boto3_config = Config(**config_args)
 
     session_args = {
-        "aws_access_key_id": config.aws_access_key_id,
-        "aws_secret_access_key": config.aws_secret_access_key,
-        "aws_session_token": config.aws_session_token,
+        "aws_access_key_id": config.aws_access_key_id.get_secret_value(),
+        "aws_secret_access_key": config.aws_secret_access_key.get_secret_value(),
+        "aws_session_token": config.aws_session_token.get_secret_value() if config.aws_session_token else None,
         "region_name": config.region_name,
         "profile_name": config.profile_name,
         "session_ttl": config.session_ttl,
llama_stack/providers/utils/bedrock/config.py

@@ -6,23 +6,23 @@
 
 import os
 
-from pydantic import Field
+from pydantic import Field, SecretStr
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 
 
 class BedrockBaseConfig(RemoteInferenceProviderConfig):
     auth_credential: None = Field(default=None, exclude=True)
-    aws_access_key_id: str | None = Field(
-        default_factory=lambda: os.getenv("AWS_ACCESS_KEY_ID"),
+    aws_access_key_id: SecretStr | None = Field(
+        default_factory=lambda: SecretStr(val) if (val := os.getenv("AWS_ACCESS_KEY_ID")) else None,
         description="The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID",
     )
-    aws_secret_access_key: str | None = Field(
-        default_factory=lambda: os.getenv("AWS_SECRET_ACCESS_KEY"),
+    aws_secret_access_key: SecretStr | None = Field(
+        default_factory=lambda: SecretStr(val) if (val := os.getenv("AWS_SECRET_ACCESS_KEY")) else None,
         description="The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY",
     )
-    aws_session_token: str | None = Field(
-        default_factory=lambda: os.getenv("AWS_SESSION_TOKEN"),
+    aws_session_token: SecretStr | None = Field(
+        default_factory=lambda: SecretStr(val) if (val := os.getenv("AWS_SESSION_TOKEN")) else None,
         description="The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN",
     )
     region_name: str | None = Field(
llama_stack/providers/utils/inference/embedding_mixin.py

@@ -23,6 +23,7 @@ from llama_stack_api import (
     OpenAIEmbeddingsRequestWithExtraBody,
     OpenAIEmbeddingsResponse,
     OpenAIEmbeddingUsage,
+    validate_embeddings_input_is_text,
 )
 
 EMBEDDING_MODELS: dict[str, "SentenceTransformer"] = {}
@@ -41,6 +42,9 @@ class SentenceTransformerEmbeddingMixin:
         self,
         params: OpenAIEmbeddingsRequestWithExtraBody,
     ) -> OpenAIEmbeddingsResponse:
+        # Validate that input contains only text, not token arrays
+        validate_embeddings_input_is_text(params)
+
         # Convert input to list format if it's a single string
         input_list = [params.input] if isinstance(params.input, str) else params.input
         if not input_list:
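
The new validate_embeddings_input_is_text guard rejects OpenAI-style token-array inputs before sentence-transformers sees them; only the comment in the diff describes its intent, so the helper below is a hypothetical approximation, not the real llama_stack_api implementation.

# Hypothetical approximation of the guard referenced above; the real helper
# lives in llama_stack_api and may differ in name, signature, and error type.
def ensure_text_only_embeddings_input(input_value: str | list) -> list[str]:
    """Accept str or list[str]; reject OpenAI-style token arrays (list[int] / list[list[int]])."""
    items = [input_value] if isinstance(input_value, str) else list(input_value)
    if not all(isinstance(item, str) for item in items):
        raise ValueError("This provider only supports text inputs for embeddings, not token arrays.")
    return items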