llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between those package versions as they appear in the registry.
Files changed (738)
  1. llama_stack/__init__.py +5 -0
  2. llama_stack/apis/agents/__init__.py +1 -1
  3. llama_stack/apis/agents/agents.py +700 -281
  4. llama_stack/apis/agents/openai_responses.py +1311 -0
  5. llama_stack/{providers/adapters/memory/sample/config.py → apis/batches/__init__.py} +2 -5
  6. llama_stack/apis/batches/batches.py +100 -0
  7. llama_stack/apis/benchmarks/__init__.py +7 -0
  8. llama_stack/apis/benchmarks/benchmarks.py +108 -0
  9. llama_stack/apis/common/content_types.py +143 -0
  10. llama_stack/apis/common/errors.py +103 -0
  11. llama_stack/apis/common/job_types.py +38 -0
  12. llama_stack/apis/common/responses.py +36 -0
  13. llama_stack/apis/common/training_types.py +36 -5
  14. llama_stack/apis/common/type_system.py +158 -0
  15. llama_stack/apis/conversations/__init__.py +31 -0
  16. llama_stack/apis/conversations/conversations.py +286 -0
  17. llama_stack/apis/datasetio/__init__.py +7 -0
  18. llama_stack/apis/datasetio/datasetio.py +59 -0
  19. llama_stack/apis/datasets/__init__.py +7 -0
  20. llama_stack/apis/datasets/datasets.py +251 -0
  21. llama_stack/apis/datatypes.py +160 -0
  22. llama_stack/apis/eval/__init__.py +7 -0
  23. llama_stack/apis/eval/eval.py +169 -0
  24. llama_stack/apis/files/__init__.py +7 -0
  25. llama_stack/apis/files/files.py +199 -0
  26. llama_stack/apis/inference/__init__.py +1 -1
  27. llama_stack/apis/inference/inference.py +1169 -113
  28. llama_stack/apis/inspect/__init__.py +1 -1
  29. llama_stack/apis/inspect/inspect.py +69 -16
  30. llama_stack/apis/models/__init__.py +1 -1
  31. llama_stack/apis/models/models.py +148 -21
  32. llama_stack/apis/post_training/__init__.py +1 -1
  33. llama_stack/apis/post_training/post_training.py +265 -120
  34. llama_stack/{providers/adapters/agents/sample/config.py → apis/prompts/__init__.py} +2 -5
  35. llama_stack/apis/prompts/prompts.py +204 -0
  36. llama_stack/apis/providers/__init__.py +7 -0
  37. llama_stack/apis/providers/providers.py +69 -0
  38. llama_stack/apis/resource.py +37 -0
  39. llama_stack/apis/safety/__init__.py +1 -1
  40. llama_stack/apis/safety/safety.py +95 -12
  41. llama_stack/apis/scoring/__init__.py +7 -0
  42. llama_stack/apis/scoring/scoring.py +93 -0
  43. llama_stack/apis/scoring_functions/__init__.py +7 -0
  44. llama_stack/apis/scoring_functions/scoring_functions.py +208 -0
  45. llama_stack/apis/shields/__init__.py +1 -1
  46. llama_stack/apis/shields/shields.py +76 -33
  47. llama_stack/apis/synthetic_data_generation/__init__.py +1 -1
  48. llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +40 -17
  49. llama_stack/apis/telemetry/__init__.py +1 -1
  50. llama_stack/apis/telemetry/telemetry.py +322 -31
  51. llama_stack/apis/{dataset → tools}/__init__.py +2 -1
  52. llama_stack/apis/tools/rag_tool.py +218 -0
  53. llama_stack/apis/tools/tools.py +221 -0
  54. llama_stack/apis/vector_io/__init__.py +7 -0
  55. llama_stack/apis/vector_io/vector_io.py +960 -0
  56. llama_stack/apis/vector_stores/__init__.py +7 -0
  57. llama_stack/apis/vector_stores/vector_stores.py +51 -0
  58. llama_stack/apis/version.py +9 -0
  59. llama_stack/cli/llama.py +13 -5
  60. llama_stack/cli/stack/_list_deps.py +182 -0
  61. llama_stack/cli/stack/list_apis.py +1 -1
  62. llama_stack/cli/stack/list_deps.py +55 -0
  63. llama_stack/cli/stack/list_providers.py +24 -10
  64. llama_stack/cli/stack/list_stacks.py +56 -0
  65. llama_stack/cli/stack/remove.py +115 -0
  66. llama_stack/cli/stack/run.py +169 -56
  67. llama_stack/cli/stack/stack.py +18 -4
  68. llama_stack/cli/stack/utils.py +151 -0
  69. llama_stack/cli/table.py +23 -61
  70. llama_stack/cli/utils.py +29 -0
  71. llama_stack/core/access_control/access_control.py +131 -0
  72. llama_stack/core/access_control/conditions.py +129 -0
  73. llama_stack/core/access_control/datatypes.py +107 -0
  74. llama_stack/core/build.py +164 -0
  75. llama_stack/core/client.py +205 -0
  76. llama_stack/core/common.sh +37 -0
  77. llama_stack/{distribution → core}/configure.py +74 -55
  78. llama_stack/core/conversations/conversations.py +309 -0
  79. llama_stack/core/datatypes.py +625 -0
  80. llama_stack/core/distribution.py +276 -0
  81. llama_stack/core/external.py +54 -0
  82. llama_stack/core/id_generation.py +42 -0
  83. llama_stack/core/inspect.py +86 -0
  84. llama_stack/core/library_client.py +539 -0
  85. llama_stack/core/prompts/prompts.py +234 -0
  86. llama_stack/core/providers.py +137 -0
  87. llama_stack/core/request_headers.py +115 -0
  88. llama_stack/core/resolver.py +506 -0
  89. llama_stack/core/routers/__init__.py +101 -0
  90. llama_stack/core/routers/datasets.py +73 -0
  91. llama_stack/core/routers/eval_scoring.py +155 -0
  92. llama_stack/core/routers/inference.py +645 -0
  93. llama_stack/core/routers/safety.py +85 -0
  94. llama_stack/core/routers/tool_runtime.py +91 -0
  95. llama_stack/core/routers/vector_io.py +442 -0
  96. llama_stack/core/routing_tables/benchmarks.py +62 -0
  97. llama_stack/core/routing_tables/common.py +254 -0
  98. llama_stack/core/routing_tables/datasets.py +91 -0
  99. llama_stack/core/routing_tables/models.py +163 -0
  100. llama_stack/core/routing_tables/scoring_functions.py +66 -0
  101. llama_stack/core/routing_tables/shields.py +61 -0
  102. llama_stack/core/routing_tables/toolgroups.py +129 -0
  103. llama_stack/core/routing_tables/vector_stores.py +292 -0
  104. llama_stack/core/server/auth.py +187 -0
  105. llama_stack/core/server/auth_providers.py +494 -0
  106. llama_stack/core/server/quota.py +110 -0
  107. llama_stack/core/server/routes.py +141 -0
  108. llama_stack/core/server/server.py +542 -0
  109. llama_stack/core/server/tracing.py +80 -0
  110. llama_stack/core/stack.py +546 -0
  111. llama_stack/core/start_stack.sh +117 -0
  112. llama_stack/core/storage/datatypes.py +283 -0
  113. llama_stack/{cli/model → core/store}/__init__.py +1 -1
  114. llama_stack/core/store/registry.py +199 -0
  115. llama_stack/core/testing_context.py +49 -0
  116. llama_stack/core/ui/app.py +55 -0
  117. llama_stack/core/ui/modules/api.py +32 -0
  118. llama_stack/core/ui/modules/utils.py +42 -0
  119. llama_stack/core/ui/page/distribution/datasets.py +18 -0
  120. llama_stack/core/ui/page/distribution/eval_tasks.py +20 -0
  121. llama_stack/core/ui/page/distribution/models.py +18 -0
  122. llama_stack/core/ui/page/distribution/providers.py +27 -0
  123. llama_stack/core/ui/page/distribution/resources.py +48 -0
  124. llama_stack/core/ui/page/distribution/scoring_functions.py +18 -0
  125. llama_stack/core/ui/page/distribution/shields.py +19 -0
  126. llama_stack/core/ui/page/evaluations/app_eval.py +143 -0
  127. llama_stack/core/ui/page/evaluations/native_eval.py +253 -0
  128. llama_stack/core/ui/page/playground/chat.py +130 -0
  129. llama_stack/core/ui/page/playground/tools.py +352 -0
  130. llama_stack/core/utils/config.py +30 -0
  131. llama_stack/{distribution → core}/utils/config_dirs.py +3 -6
  132. llama_stack/core/utils/config_resolution.py +125 -0
  133. llama_stack/core/utils/context.py +84 -0
  134. llama_stack/core/utils/exec.py +96 -0
  135. llama_stack/{providers/impls/meta_reference/codeshield/config.py → core/utils/image_types.py} +4 -3
  136. llama_stack/{distribution → core}/utils/model_utils.py +2 -2
  137. llama_stack/{distribution → core}/utils/prompt_for_config.py +30 -63
  138. llama_stack/{apis/batch_inference → distributions/dell}/__init__.py +1 -1
  139. llama_stack/distributions/dell/build.yaml +33 -0
  140. llama_stack/distributions/dell/dell.py +158 -0
  141. llama_stack/distributions/dell/run-with-safety.yaml +141 -0
  142. llama_stack/distributions/dell/run.yaml +132 -0
  143. llama_stack/distributions/meta-reference-gpu/__init__.py +7 -0
  144. llama_stack/distributions/meta-reference-gpu/build.yaml +32 -0
  145. llama_stack/distributions/meta-reference-gpu/meta_reference.py +163 -0
  146. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +154 -0
  147. llama_stack/distributions/meta-reference-gpu/run.yaml +139 -0
  148. llama_stack/{apis/evals → distributions/nvidia}/__init__.py +1 -1
  149. llama_stack/distributions/nvidia/build.yaml +29 -0
  150. llama_stack/distributions/nvidia/nvidia.py +154 -0
  151. llama_stack/distributions/nvidia/run-with-safety.yaml +137 -0
  152. llama_stack/distributions/nvidia/run.yaml +116 -0
  153. llama_stack/distributions/open-benchmark/__init__.py +7 -0
  154. llama_stack/distributions/open-benchmark/build.yaml +36 -0
  155. llama_stack/distributions/open-benchmark/open_benchmark.py +303 -0
  156. llama_stack/distributions/open-benchmark/run.yaml +252 -0
  157. llama_stack/distributions/postgres-demo/__init__.py +7 -0
  158. llama_stack/distributions/postgres-demo/build.yaml +23 -0
  159. llama_stack/distributions/postgres-demo/postgres_demo.py +125 -0
  160. llama_stack/distributions/postgres-demo/run.yaml +115 -0
  161. llama_stack/{apis/memory → distributions/starter}/__init__.py +1 -1
  162. llama_stack/distributions/starter/build.yaml +61 -0
  163. llama_stack/distributions/starter/run-with-postgres-store.yaml +285 -0
  164. llama_stack/distributions/starter/run.yaml +276 -0
  165. llama_stack/distributions/starter/starter.py +345 -0
  166. llama_stack/distributions/starter-gpu/__init__.py +7 -0
  167. llama_stack/distributions/starter-gpu/build.yaml +61 -0
  168. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +288 -0
  169. llama_stack/distributions/starter-gpu/run.yaml +279 -0
  170. llama_stack/distributions/starter-gpu/starter_gpu.py +20 -0
  171. llama_stack/distributions/template.py +456 -0
  172. llama_stack/distributions/watsonx/__init__.py +7 -0
  173. llama_stack/distributions/watsonx/build.yaml +33 -0
  174. llama_stack/distributions/watsonx/run.yaml +133 -0
  175. llama_stack/distributions/watsonx/watsonx.py +95 -0
  176. llama_stack/env.py +24 -0
  177. llama_stack/log.py +314 -0
  178. llama_stack/models/llama/checkpoint.py +164 -0
  179. llama_stack/models/llama/datatypes.py +164 -0
  180. llama_stack/models/llama/hadamard_utils.py +86 -0
  181. llama_stack/models/llama/llama3/args.py +74 -0
  182. llama_stack/models/llama/llama3/chat_format.py +286 -0
  183. llama_stack/models/llama/llama3/generation.py +376 -0
  184. llama_stack/models/llama/llama3/interface.py +255 -0
  185. llama_stack/models/llama/llama3/model.py +304 -0
  186. llama_stack/models/llama/llama3/multimodal/__init__.py +12 -0
  187. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +180 -0
  188. llama_stack/models/llama/llama3/multimodal/image_transform.py +409 -0
  189. llama_stack/models/llama/llama3/multimodal/model.py +1430 -0
  190. llama_stack/models/llama/llama3/multimodal/utils.py +26 -0
  191. llama_stack/models/llama/llama3/prompt_templates/__init__.py +22 -0
  192. llama_stack/models/llama/llama3/prompt_templates/base.py +39 -0
  193. llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +319 -0
  194. llama_stack/models/llama/llama3/prompt_templates/tool_response.py +62 -0
  195. llama_stack/models/llama/llama3/quantization/loader.py +316 -0
  196. llama_stack/models/llama/llama3/template_data.py +116 -0
  197. llama_stack/models/llama/llama3/tokenizer.model +128000 -0
  198. llama_stack/models/llama/llama3/tokenizer.py +198 -0
  199. llama_stack/models/llama/llama3/tool_utils.py +266 -0
  200. llama_stack/models/llama/llama3_1/__init__.py +12 -0
  201. llama_stack/models/llama/llama3_1/prompt_format.md +358 -0
  202. llama_stack/models/llama/llama3_1/prompts.py +258 -0
  203. llama_stack/models/llama/llama3_2/prompts_text.py +229 -0
  204. llama_stack/models/llama/llama3_2/prompts_vision.py +126 -0
  205. llama_stack/models/llama/llama3_2/text_prompt_format.md +286 -0
  206. llama_stack/models/llama/llama3_2/vision_prompt_format.md +141 -0
  207. llama_stack/models/llama/llama3_3/prompts.py +259 -0
  208. llama_stack/models/llama/llama4/args.py +107 -0
  209. llama_stack/models/llama/llama4/chat_format.py +317 -0
  210. llama_stack/models/llama/llama4/datatypes.py +56 -0
  211. llama_stack/models/llama/llama4/ffn.py +58 -0
  212. llama_stack/models/llama/llama4/generation.py +313 -0
  213. llama_stack/models/llama/llama4/model.py +437 -0
  214. llama_stack/models/llama/llama4/moe.py +214 -0
  215. llama_stack/models/llama/llama4/preprocess.py +435 -0
  216. llama_stack/models/llama/llama4/prompt_format.md +304 -0
  217. llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +136 -0
  218. llama_stack/models/llama/llama4/prompts.py +279 -0
  219. llama_stack/models/llama/llama4/quantization/__init__.py +5 -0
  220. llama_stack/models/llama/llama4/quantization/loader.py +226 -0
  221. llama_stack/models/llama/llama4/tokenizer.model +200000 -0
  222. llama_stack/models/llama/llama4/tokenizer.py +263 -0
  223. llama_stack/models/llama/llama4/vision/__init__.py +5 -0
  224. llama_stack/models/llama/llama4/vision/embedding.py +210 -0
  225. llama_stack/models/llama/llama4/vision/encoder.py +412 -0
  226. llama_stack/models/llama/prompt_format.py +191 -0
  227. llama_stack/models/llama/quantize_impls.py +316 -0
  228. llama_stack/models/llama/sku_list.py +1029 -0
  229. llama_stack/models/llama/sku_types.py +233 -0
  230. llama_stack/models/llama/tokenizer_utils.py +40 -0
  231. llama_stack/providers/datatypes.py +136 -107
  232. llama_stack/providers/inline/__init__.py +5 -0
  233. llama_stack/providers/inline/agents/__init__.py +5 -0
  234. llama_stack/providers/{impls/meta_reference/agents → inline/agents/meta_reference}/__init__.py +12 -5
  235. llama_stack/providers/inline/agents/meta_reference/agent_instance.py +1024 -0
  236. llama_stack/providers/inline/agents/meta_reference/agents.py +383 -0
  237. llama_stack/providers/inline/agents/meta_reference/config.py +37 -0
  238. llama_stack/providers/inline/agents/meta_reference/persistence.py +228 -0
  239. llama_stack/providers/inline/agents/meta_reference/responses/__init__.py +5 -0
  240. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +423 -0
  241. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +1226 -0
  242. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +449 -0
  243. llama_stack/providers/inline/agents/meta_reference/responses/types.py +194 -0
  244. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +365 -0
  245. llama_stack/providers/inline/agents/meta_reference/safety.py +52 -0
  246. llama_stack/providers/inline/batches/__init__.py +5 -0
  247. llama_stack/providers/inline/batches/reference/__init__.py +36 -0
  248. llama_stack/providers/inline/batches/reference/batches.py +679 -0
  249. llama_stack/providers/inline/batches/reference/config.py +40 -0
  250. llama_stack/providers/inline/datasetio/__init__.py +5 -0
  251. llama_stack/providers/inline/datasetio/localfs/__init__.py +20 -0
  252. llama_stack/providers/inline/datasetio/localfs/config.py +23 -0
  253. llama_stack/providers/inline/datasetio/localfs/datasetio.py +113 -0
  254. llama_stack/providers/inline/eval/__init__.py +5 -0
  255. llama_stack/providers/inline/eval/meta_reference/__init__.py +28 -0
  256. llama_stack/providers/inline/eval/meta_reference/config.py +23 -0
  257. llama_stack/providers/inline/eval/meta_reference/eval.py +259 -0
  258. llama_stack/providers/inline/files/localfs/__init__.py +20 -0
  259. llama_stack/providers/inline/files/localfs/config.py +31 -0
  260. llama_stack/providers/inline/files/localfs/files.py +219 -0
  261. llama_stack/providers/inline/inference/__init__.py +5 -0
  262. llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/__init__.py +4 -4
  263. llama_stack/providers/inline/inference/meta_reference/common.py +24 -0
  264. llama_stack/providers/inline/inference/meta_reference/config.py +68 -0
  265. llama_stack/providers/inline/inference/meta_reference/generators.py +211 -0
  266. llama_stack/providers/inline/inference/meta_reference/inference.py +158 -0
  267. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +96 -0
  268. llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/parallel_utils.py +56 -73
  269. llama_stack/providers/inline/inference/sentence_transformers/__init__.py +22 -0
  270. llama_stack/providers/{impls/meta_reference/agents → inline/inference/sentence_transformers}/config.py +6 -4
  271. llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +83 -0
  272. llama_stack/providers/inline/post_training/__init__.py +5 -0
  273. llama_stack/providers/inline/post_training/common/__init__.py +5 -0
  274. llama_stack/providers/inline/post_training/common/utils.py +35 -0
  275. llama_stack/providers/inline/post_training/common/validator.py +36 -0
  276. llama_stack/providers/inline/post_training/huggingface/__init__.py +27 -0
  277. llama_stack/providers/inline/post_training/huggingface/config.py +83 -0
  278. llama_stack/providers/inline/post_training/huggingface/post_training.py +208 -0
  279. llama_stack/providers/inline/post_training/huggingface/recipes/__init__.py +5 -0
  280. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +519 -0
  281. llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +485 -0
  282. llama_stack/providers/inline/post_training/huggingface/utils.py +269 -0
  283. llama_stack/providers/inline/post_training/torchtune/__init__.py +27 -0
  284. llama_stack/providers/inline/post_training/torchtune/common/__init__.py +5 -0
  285. llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +240 -0
  286. llama_stack/providers/inline/post_training/torchtune/common/utils.py +99 -0
  287. llama_stack/providers/inline/post_training/torchtune/config.py +20 -0
  288. llama_stack/providers/inline/post_training/torchtune/datasets/__init__.py +5 -0
  289. llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +57 -0
  290. llama_stack/providers/inline/post_training/torchtune/datasets/sft.py +78 -0
  291. llama_stack/providers/inline/post_training/torchtune/post_training.py +178 -0
  292. llama_stack/providers/inline/post_training/torchtune/recipes/__init__.py +5 -0
  293. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +588 -0
  294. llama_stack/providers/inline/safety/__init__.py +5 -0
  295. llama_stack/providers/{impls/meta_reference/codeshield → inline/safety/code_scanner}/__init__.py +4 -2
  296. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +128 -0
  297. llama_stack/providers/{impls/meta_reference/memory → inline/safety/code_scanner}/config.py +5 -3
  298. llama_stack/providers/inline/safety/llama_guard/__init__.py +19 -0
  299. llama_stack/providers/inline/safety/llama_guard/config.py +19 -0
  300. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +489 -0
  301. llama_stack/providers/{adapters/memory/sample → inline/safety/prompt_guard}/__init__.py +4 -4
  302. llama_stack/providers/inline/safety/prompt_guard/config.py +32 -0
  303. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +131 -0
  304. llama_stack/providers/inline/scoring/__init__.py +5 -0
  305. llama_stack/providers/inline/scoring/basic/__init__.py +25 -0
  306. llama_stack/providers/{adapters/memory/weaviate → inline/scoring/basic}/config.py +5 -7
  307. llama_stack/providers/inline/scoring/basic/scoring.py +126 -0
  308. llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py +5 -0
  309. llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +240 -0
  310. llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +41 -0
  311. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py +5 -0
  312. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +21 -0
  313. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +21 -0
  314. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +23 -0
  315. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +27 -0
  316. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +71 -0
  317. llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +21 -0
  318. llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +80 -0
  319. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +66 -0
  320. llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +58 -0
  321. llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +38 -0
  322. llama_stack/providers/inline/scoring/basic/utils/__init__.py +5 -0
  323. llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py +3319 -0
  324. llama_stack/providers/inline/scoring/basic/utils/math_utils.py +330 -0
  325. llama_stack/providers/inline/scoring/braintrust/__init__.py +27 -0
  326. llama_stack/providers/inline/scoring/braintrust/braintrust.py +230 -0
  327. llama_stack/providers/inline/scoring/braintrust/config.py +21 -0
  328. llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py +5 -0
  329. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py +5 -0
  330. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +24 -0
  331. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +24 -0
  332. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +24 -0
  333. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +24 -0
  334. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +24 -0
  335. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +24 -0
  336. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +23 -0
  337. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +24 -0
  338. llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +24 -0
  339. llama_stack/providers/inline/scoring/llm_as_judge/__init__.py +21 -0
  340. llama_stack/providers/inline/scoring/llm_as_judge/config.py +14 -0
  341. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +113 -0
  342. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py +5 -0
  343. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py +5 -0
  344. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +96 -0
  345. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +20 -0
  346. llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +81 -0
  347. llama_stack/providers/inline/telemetry/__init__.py +5 -0
  348. llama_stack/providers/inline/telemetry/meta_reference/__init__.py +21 -0
  349. llama_stack/providers/inline/telemetry/meta_reference/config.py +47 -0
  350. llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +252 -0
  351. llama_stack/providers/inline/tool_runtime/__init__.py +5 -0
  352. llama_stack/providers/inline/tool_runtime/rag/__init__.py +19 -0
  353. llama_stack/providers/{impls/meta_reference/telemetry → inline/tool_runtime/rag}/config.py +5 -3
  354. llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +77 -0
  355. llama_stack/providers/inline/tool_runtime/rag/memory.py +332 -0
  356. llama_stack/providers/inline/vector_io/__init__.py +5 -0
  357. llama_stack/providers/inline/vector_io/chroma/__init__.py +19 -0
  358. llama_stack/providers/inline/vector_io/chroma/config.py +30 -0
  359. llama_stack/providers/inline/vector_io/faiss/__init__.py +21 -0
  360. llama_stack/providers/inline/vector_io/faiss/config.py +26 -0
  361. llama_stack/providers/inline/vector_io/faiss/faiss.py +293 -0
  362. llama_stack/providers/inline/vector_io/milvus/__init__.py +19 -0
  363. llama_stack/providers/inline/vector_io/milvus/config.py +29 -0
  364. llama_stack/providers/inline/vector_io/qdrant/__init__.py +20 -0
  365. llama_stack/providers/inline/vector_io/qdrant/config.py +29 -0
  366. llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +20 -0
  367. llama_stack/providers/inline/vector_io/sqlite_vec/config.py +26 -0
  368. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +483 -0
  369. llama_stack/providers/registry/agents.py +16 -18
  370. llama_stack/providers/registry/batches.py +26 -0
  371. llama_stack/providers/registry/datasetio.py +49 -0
  372. llama_stack/providers/registry/eval.py +46 -0
  373. llama_stack/providers/registry/files.py +31 -0
  374. llama_stack/providers/registry/inference.py +273 -118
  375. llama_stack/providers/registry/post_training.py +69 -0
  376. llama_stack/providers/registry/safety.py +46 -41
  377. llama_stack/providers/registry/scoring.py +51 -0
  378. llama_stack/providers/registry/tool_runtime.py +87 -0
  379. llama_stack/providers/registry/vector_io.py +828 -0
  380. llama_stack/providers/remote/__init__.py +5 -0
  381. llama_stack/providers/remote/agents/__init__.py +5 -0
  382. llama_stack/providers/remote/datasetio/__init__.py +5 -0
  383. llama_stack/providers/{adapters/memory/chroma → remote/datasetio/huggingface}/__init__.py +7 -4
  384. llama_stack/providers/remote/datasetio/huggingface/config.py +23 -0
  385. llama_stack/providers/remote/datasetio/huggingface/huggingface.py +99 -0
  386. llama_stack/providers/remote/datasetio/nvidia/__init__.py +23 -0
  387. llama_stack/providers/remote/datasetio/nvidia/config.py +61 -0
  388. llama_stack/providers/remote/datasetio/nvidia/datasetio.py +116 -0
  389. llama_stack/providers/remote/eval/__init__.py +5 -0
  390. llama_stack/providers/remote/eval/nvidia/__init__.py +31 -0
  391. llama_stack/providers/remote/eval/nvidia/config.py +29 -0
  392. llama_stack/providers/remote/eval/nvidia/eval.py +162 -0
  393. llama_stack/providers/remote/files/s3/__init__.py +19 -0
  394. llama_stack/providers/remote/files/s3/config.py +42 -0
  395. llama_stack/providers/remote/files/s3/files.py +313 -0
  396. llama_stack/providers/remote/inference/__init__.py +5 -0
  397. llama_stack/providers/{adapters/safety/sample → remote/inference/anthropic}/__init__.py +4 -6
  398. llama_stack/providers/remote/inference/anthropic/anthropic.py +36 -0
  399. llama_stack/providers/remote/inference/anthropic/config.py +28 -0
  400. llama_stack/providers/{impls/meta_reference/telemetry → remote/inference/azure}/__init__.py +4 -4
  401. llama_stack/providers/remote/inference/azure/azure.py +25 -0
  402. llama_stack/providers/remote/inference/azure/config.py +61 -0
  403. llama_stack/providers/{adapters → remote}/inference/bedrock/__init__.py +18 -17
  404. llama_stack/providers/remote/inference/bedrock/bedrock.py +142 -0
  405. llama_stack/providers/{adapters/inference/sample → remote/inference/bedrock}/config.py +3 -4
  406. llama_stack/providers/remote/inference/bedrock/models.py +29 -0
  407. llama_stack/providers/remote/inference/cerebras/__init__.py +19 -0
  408. llama_stack/providers/remote/inference/cerebras/cerebras.py +28 -0
  409. llama_stack/providers/remote/inference/cerebras/config.py +30 -0
  410. llama_stack/providers/{adapters → remote}/inference/databricks/__init__.py +4 -5
  411. llama_stack/providers/remote/inference/databricks/config.py +37 -0
  412. llama_stack/providers/remote/inference/databricks/databricks.py +44 -0
  413. llama_stack/providers/{adapters → remote}/inference/fireworks/__init__.py +8 -4
  414. llama_stack/providers/remote/inference/fireworks/config.py +27 -0
  415. llama_stack/providers/remote/inference/fireworks/fireworks.py +27 -0
  416. llama_stack/providers/{adapters/memory/pgvector → remote/inference/gemini}/__init__.py +4 -4
  417. llama_stack/providers/remote/inference/gemini/config.py +28 -0
  418. llama_stack/providers/remote/inference/gemini/gemini.py +82 -0
  419. llama_stack/providers/remote/inference/groq/__init__.py +15 -0
  420. llama_stack/providers/remote/inference/groq/config.py +34 -0
  421. llama_stack/providers/remote/inference/groq/groq.py +18 -0
  422. llama_stack/providers/remote/inference/llama_openai_compat/__init__.py +15 -0
  423. llama_stack/providers/remote/inference/llama_openai_compat/config.py +34 -0
  424. llama_stack/providers/remote/inference/llama_openai_compat/llama.py +46 -0
  425. llama_stack/providers/remote/inference/nvidia/__init__.py +23 -0
  426. llama_stack/providers/remote/inference/nvidia/config.py +64 -0
  427. llama_stack/providers/remote/inference/nvidia/nvidia.py +61 -0
  428. llama_stack/providers/{adapters/safety/sample/config.py → remote/inference/nvidia/utils.py} +3 -4
  429. llama_stack/providers/{impls/vllm → remote/inference/ollama}/__init__.py +4 -6
  430. llama_stack/providers/remote/inference/ollama/config.py +25 -0
  431. llama_stack/providers/remote/inference/ollama/ollama.py +102 -0
  432. llama_stack/providers/{adapters/telemetry/opentelemetry → remote/inference/openai}/__init__.py +4 -4
  433. llama_stack/providers/remote/inference/openai/config.py +39 -0
  434. llama_stack/providers/remote/inference/openai/openai.py +38 -0
  435. llama_stack/providers/remote/inference/passthrough/__init__.py +23 -0
  436. llama_stack/providers/remote/inference/passthrough/config.py +34 -0
  437. llama_stack/providers/remote/inference/passthrough/passthrough.py +122 -0
  438. llama_stack/providers/remote/inference/runpod/__init__.py +16 -0
  439. llama_stack/providers/remote/inference/runpod/config.py +32 -0
  440. llama_stack/providers/remote/inference/runpod/runpod.py +42 -0
  441. llama_stack/providers/remote/inference/sambanova/__init__.py +16 -0
  442. llama_stack/providers/remote/inference/sambanova/config.py +34 -0
  443. llama_stack/providers/remote/inference/sambanova/sambanova.py +28 -0
  444. llama_stack/providers/{adapters → remote}/inference/tgi/__init__.py +3 -4
  445. llama_stack/providers/remote/inference/tgi/config.py +76 -0
  446. llama_stack/providers/remote/inference/tgi/tgi.py +85 -0
  447. llama_stack/providers/{adapters → remote}/inference/together/__init__.py +8 -4
  448. llama_stack/providers/remote/inference/together/config.py +27 -0
  449. llama_stack/providers/remote/inference/together/together.py +102 -0
  450. llama_stack/providers/remote/inference/vertexai/__init__.py +15 -0
  451. llama_stack/providers/remote/inference/vertexai/config.py +48 -0
  452. llama_stack/providers/remote/inference/vertexai/vertexai.py +54 -0
  453. llama_stack/providers/remote/inference/vllm/__init__.py +22 -0
  454. llama_stack/providers/remote/inference/vllm/config.py +59 -0
  455. llama_stack/providers/remote/inference/vllm/vllm.py +111 -0
  456. llama_stack/providers/remote/inference/watsonx/__init__.py +15 -0
  457. llama_stack/providers/remote/inference/watsonx/config.py +45 -0
  458. llama_stack/providers/remote/inference/watsonx/watsonx.py +336 -0
  459. llama_stack/providers/remote/post_training/__init__.py +5 -0
  460. llama_stack/providers/remote/post_training/nvidia/__init__.py +23 -0
  461. llama_stack/providers/remote/post_training/nvidia/config.py +113 -0
  462. llama_stack/providers/remote/post_training/nvidia/models.py +27 -0
  463. llama_stack/providers/remote/post_training/nvidia/post_training.py +430 -0
  464. llama_stack/providers/remote/post_training/nvidia/utils.py +63 -0
  465. llama_stack/providers/remote/safety/__init__.py +5 -0
  466. llama_stack/providers/remote/safety/bedrock/bedrock.py +111 -0
  467. llama_stack/providers/remote/safety/bedrock/config.py +14 -0
  468. llama_stack/providers/{adapters/inference/sample → remote/safety/nvidia}/__init__.py +5 -4
  469. llama_stack/providers/remote/safety/nvidia/config.py +40 -0
  470. llama_stack/providers/remote/safety/nvidia/nvidia.py +161 -0
  471. llama_stack/providers/{adapters/agents/sample → remote/safety/sambanova}/__init__.py +5 -4
  472. llama_stack/providers/remote/safety/sambanova/config.py +37 -0
  473. llama_stack/providers/remote/safety/sambanova/sambanova.py +98 -0
  474. llama_stack/providers/remote/tool_runtime/__init__.py +5 -0
  475. llama_stack/providers/remote/tool_runtime/bing_search/__init__.py +21 -0
  476. llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +112 -0
  477. llama_stack/providers/remote/tool_runtime/bing_search/config.py +22 -0
  478. llama_stack/providers/remote/tool_runtime/brave_search/__init__.py +20 -0
  479. llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +148 -0
  480. llama_stack/providers/remote/tool_runtime/brave_search/config.py +27 -0
  481. llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py +15 -0
  482. llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +20 -0
  483. llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +73 -0
  484. llama_stack/providers/remote/tool_runtime/tavily_search/__init__.py +20 -0
  485. llama_stack/providers/remote/tool_runtime/tavily_search/config.py +27 -0
  486. llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +84 -0
  487. llama_stack/providers/remote/tool_runtime/wolfram_alpha/__init__.py +22 -0
  488. llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py +21 -0
  489. llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +140 -0
  490. llama_stack/providers/remote/vector_io/__init__.py +5 -0
  491. llama_stack/providers/remote/vector_io/chroma/__init__.py +17 -0
  492. llama_stack/providers/remote/vector_io/chroma/chroma.py +215 -0
  493. llama_stack/providers/remote/vector_io/chroma/config.py +28 -0
  494. llama_stack/providers/remote/vector_io/milvus/__init__.py +18 -0
  495. llama_stack/providers/remote/vector_io/milvus/config.py +35 -0
  496. llama_stack/providers/remote/vector_io/milvus/milvus.py +375 -0
  497. llama_stack/providers/remote/vector_io/pgvector/__init__.py +17 -0
  498. llama_stack/providers/remote/vector_io/pgvector/config.py +47 -0
  499. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +460 -0
  500. llama_stack/providers/remote/vector_io/qdrant/__init__.py +17 -0
  501. llama_stack/providers/remote/vector_io/qdrant/config.py +37 -0
  502. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +265 -0
  503. llama_stack/providers/remote/vector_io/weaviate/__init__.py +17 -0
  504. llama_stack/providers/remote/vector_io/weaviate/config.py +32 -0
  505. llama_stack/providers/remote/vector_io/weaviate/weaviate.py +393 -0
  506. llama_stack/providers/utils/bedrock/__init__.py +5 -0
  507. llama_stack/providers/utils/bedrock/client.py +74 -0
  508. llama_stack/providers/utils/bedrock/config.py +64 -0
  509. llama_stack/providers/utils/bedrock/refreshable_boto_session.py +112 -0
  510. llama_stack/providers/utils/common/__init__.py +5 -0
  511. llama_stack/providers/utils/common/data_schema_validator.py +103 -0
  512. llama_stack/providers/utils/datasetio/__init__.py +5 -0
  513. llama_stack/providers/utils/datasetio/url_utils.py +47 -0
  514. llama_stack/providers/utils/files/__init__.py +5 -0
  515. llama_stack/providers/utils/files/form_data.py +69 -0
  516. llama_stack/providers/utils/inference/__init__.py +8 -7
  517. llama_stack/providers/utils/inference/embedding_mixin.py +101 -0
  518. llama_stack/providers/utils/inference/inference_store.py +264 -0
  519. llama_stack/providers/utils/inference/litellm_openai_mixin.py +336 -0
  520. llama_stack/providers/utils/inference/model_registry.py +173 -23
  521. llama_stack/providers/utils/inference/openai_compat.py +1261 -49
  522. llama_stack/providers/utils/inference/openai_mixin.py +506 -0
  523. llama_stack/providers/utils/inference/prompt_adapter.py +365 -67
  524. llama_stack/providers/utils/kvstore/api.py +6 -6
  525. llama_stack/providers/utils/kvstore/config.py +28 -48
  526. llama_stack/providers/utils/kvstore/kvstore.py +61 -15
  527. llama_stack/providers/utils/kvstore/mongodb/__init__.py +9 -0
  528. llama_stack/providers/utils/kvstore/mongodb/mongodb.py +82 -0
  529. llama_stack/providers/utils/kvstore/postgres/__init__.py +7 -0
  530. llama_stack/providers/utils/kvstore/postgres/postgres.py +114 -0
  531. llama_stack/providers/utils/kvstore/redis/redis.py +33 -9
  532. llama_stack/providers/utils/kvstore/sqlite/config.py +2 -1
  533. llama_stack/providers/utils/kvstore/sqlite/sqlite.py +123 -22
  534. llama_stack/providers/utils/memory/file_utils.py +1 -1
  535. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +1304 -0
  536. llama_stack/providers/utils/memory/vector_store.py +220 -82
  537. llama_stack/providers/utils/pagination.py +43 -0
  538. llama_stack/providers/utils/responses/__init__.py +5 -0
  539. llama_stack/providers/utils/responses/responses_store.py +292 -0
  540. llama_stack/providers/utils/scheduler.py +270 -0
  541. llama_stack/providers/utils/scoring/__init__.py +5 -0
  542. llama_stack/providers/utils/scoring/aggregation_utils.py +75 -0
  543. llama_stack/providers/utils/scoring/base_scoring_fn.py +114 -0
  544. llama_stack/providers/utils/scoring/basic_scoring_utils.py +26 -0
  545. llama_stack/providers/utils/sqlstore/__init__.py +5 -0
  546. llama_stack/providers/utils/sqlstore/api.py +128 -0
  547. llama_stack/providers/utils/sqlstore/authorized_sqlstore.py +319 -0
  548. llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py +343 -0
  549. llama_stack/providers/utils/sqlstore/sqlstore.py +70 -0
  550. llama_stack/providers/utils/telemetry/trace_protocol.py +142 -0
  551. llama_stack/providers/utils/telemetry/tracing.py +192 -53
  552. llama_stack/providers/utils/tools/__init__.py +5 -0
  553. llama_stack/providers/utils/tools/mcp.py +148 -0
  554. llama_stack/providers/utils/tools/ttl_dict.py +70 -0
  555. llama_stack/providers/utils/vector_io/__init__.py +5 -0
  556. llama_stack/providers/utils/vector_io/vector_utils.py +156 -0
  557. llama_stack/schema_utils.py +118 -0
  558. llama_stack/strong_typing/__init__.py +19 -0
  559. llama_stack/strong_typing/auxiliary.py +228 -0
  560. llama_stack/strong_typing/classdef.py +440 -0
  561. llama_stack/strong_typing/core.py +46 -0
  562. llama_stack/strong_typing/deserializer.py +877 -0
  563. llama_stack/strong_typing/docstring.py +409 -0
  564. llama_stack/strong_typing/exception.py +23 -0
  565. llama_stack/strong_typing/inspection.py +1085 -0
  566. llama_stack/strong_typing/mapping.py +40 -0
  567. llama_stack/strong_typing/name.py +182 -0
  568. llama_stack/strong_typing/py.typed +0 -0
  569. llama_stack/strong_typing/schema.py +792 -0
  570. llama_stack/strong_typing/serialization.py +97 -0
  571. llama_stack/strong_typing/serializer.py +500 -0
  572. llama_stack/strong_typing/slots.py +27 -0
  573. llama_stack/strong_typing/topological.py +89 -0
  574. llama_stack/testing/__init__.py +5 -0
  575. llama_stack/testing/api_recorder.py +956 -0
  576. llama_stack/ui/node_modules/flatted/python/flatted.py +149 -0
  577. llama_stack-0.3.4.dist-info/METADATA +261 -0
  578. llama_stack-0.3.4.dist-info/RECORD +625 -0
  579. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/WHEEL +1 -1
  580. llama_stack/apis/agents/client.py +0 -292
  581. llama_stack/apis/agents/event_logger.py +0 -184
  582. llama_stack/apis/batch_inference/batch_inference.py +0 -72
  583. llama_stack/apis/common/deployment_types.py +0 -31
  584. llama_stack/apis/dataset/dataset.py +0 -63
  585. llama_stack/apis/evals/evals.py +0 -122
  586. llama_stack/apis/inference/client.py +0 -197
  587. llama_stack/apis/inspect/client.py +0 -82
  588. llama_stack/apis/memory/client.py +0 -155
  589. llama_stack/apis/memory/memory.py +0 -65
  590. llama_stack/apis/memory_banks/__init__.py +0 -7
  591. llama_stack/apis/memory_banks/client.py +0 -101
  592. llama_stack/apis/memory_banks/memory_banks.py +0 -78
  593. llama_stack/apis/models/client.py +0 -83
  594. llama_stack/apis/reward_scoring/__init__.py +0 -7
  595. llama_stack/apis/reward_scoring/reward_scoring.py +0 -55
  596. llama_stack/apis/safety/client.py +0 -105
  597. llama_stack/apis/shields/client.py +0 -79
  598. llama_stack/cli/download.py +0 -340
  599. llama_stack/cli/model/describe.py +0 -82
  600. llama_stack/cli/model/download.py +0 -24
  601. llama_stack/cli/model/list.py +0 -62
  602. llama_stack/cli/model/model.py +0 -34
  603. llama_stack/cli/model/prompt_format.py +0 -112
  604. llama_stack/cli/model/safety_models.py +0 -52
  605. llama_stack/cli/stack/build.py +0 -299
  606. llama_stack/cli/stack/configure.py +0 -178
  607. llama_stack/distribution/build.py +0 -123
  608. llama_stack/distribution/build_conda_env.sh +0 -136
  609. llama_stack/distribution/build_container.sh +0 -142
  610. llama_stack/distribution/common.sh +0 -40
  611. llama_stack/distribution/configure_container.sh +0 -47
  612. llama_stack/distribution/datatypes.py +0 -139
  613. llama_stack/distribution/distribution.py +0 -58
  614. llama_stack/distribution/inspect.py +0 -67
  615. llama_stack/distribution/request_headers.py +0 -57
  616. llama_stack/distribution/resolver.py +0 -323
  617. llama_stack/distribution/routers/__init__.py +0 -48
  618. llama_stack/distribution/routers/routers.py +0 -158
  619. llama_stack/distribution/routers/routing_tables.py +0 -173
  620. llama_stack/distribution/server/endpoints.py +0 -48
  621. llama_stack/distribution/server/server.py +0 -343
  622. llama_stack/distribution/start_conda_env.sh +0 -42
  623. llama_stack/distribution/start_container.sh +0 -64
  624. llama_stack/distribution/templates/local-bedrock-conda-example-build.yaml +0 -10
  625. llama_stack/distribution/templates/local-build.yaml +0 -10
  626. llama_stack/distribution/templates/local-databricks-build.yaml +0 -10
  627. llama_stack/distribution/templates/local-fireworks-build.yaml +0 -10
  628. llama_stack/distribution/templates/local-hf-endpoint-build.yaml +0 -10
  629. llama_stack/distribution/templates/local-hf-serverless-build.yaml +0 -10
  630. llama_stack/distribution/templates/local-ollama-build.yaml +0 -10
  631. llama_stack/distribution/templates/local-tgi-build.yaml +0 -10
  632. llama_stack/distribution/templates/local-together-build.yaml +0 -10
  633. llama_stack/distribution/templates/local-vllm-build.yaml +0 -10
  634. llama_stack/distribution/utils/exec.py +0 -105
  635. llama_stack/providers/adapters/agents/sample/sample.py +0 -18
  636. llama_stack/providers/adapters/inference/bedrock/bedrock.py +0 -451
  637. llama_stack/providers/adapters/inference/bedrock/config.py +0 -55
  638. llama_stack/providers/adapters/inference/databricks/config.py +0 -21
  639. llama_stack/providers/adapters/inference/databricks/databricks.py +0 -125
  640. llama_stack/providers/adapters/inference/fireworks/config.py +0 -20
  641. llama_stack/providers/adapters/inference/fireworks/fireworks.py +0 -130
  642. llama_stack/providers/adapters/inference/ollama/__init__.py +0 -19
  643. llama_stack/providers/adapters/inference/ollama/ollama.py +0 -175
  644. llama_stack/providers/adapters/inference/sample/sample.py +0 -23
  645. llama_stack/providers/adapters/inference/tgi/config.py +0 -43
  646. llama_stack/providers/adapters/inference/tgi/tgi.py +0 -200
  647. llama_stack/providers/adapters/inference/together/config.py +0 -22
  648. llama_stack/providers/adapters/inference/together/together.py +0 -143
  649. llama_stack/providers/adapters/memory/chroma/chroma.py +0 -157
  650. llama_stack/providers/adapters/memory/pgvector/config.py +0 -17
  651. llama_stack/providers/adapters/memory/pgvector/pgvector.py +0 -211
  652. llama_stack/providers/adapters/memory/sample/sample.py +0 -23
  653. llama_stack/providers/adapters/memory/weaviate/__init__.py +0 -15
  654. llama_stack/providers/adapters/memory/weaviate/weaviate.py +0 -190
  655. llama_stack/providers/adapters/safety/bedrock/bedrock.py +0 -113
  656. llama_stack/providers/adapters/safety/bedrock/config.py +0 -16
  657. llama_stack/providers/adapters/safety/sample/sample.py +0 -23
  658. llama_stack/providers/adapters/safety/together/__init__.py +0 -18
  659. llama_stack/providers/adapters/safety/together/config.py +0 -26
  660. llama_stack/providers/adapters/safety/together/together.py +0 -101
  661. llama_stack/providers/adapters/telemetry/opentelemetry/config.py +0 -12
  662. llama_stack/providers/adapters/telemetry/opentelemetry/opentelemetry.py +0 -201
  663. llama_stack/providers/adapters/telemetry/sample/__init__.py +0 -17
  664. llama_stack/providers/adapters/telemetry/sample/config.py +0 -12
  665. llama_stack/providers/adapters/telemetry/sample/sample.py +0 -18
  666. llama_stack/providers/impls/meta_reference/agents/agent_instance.py +0 -844
  667. llama_stack/providers/impls/meta_reference/agents/agents.py +0 -161
  668. llama_stack/providers/impls/meta_reference/agents/persistence.py +0 -84
  669. llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py +0 -74
  670. llama_stack/providers/impls/meta_reference/agents/safety.py +0 -57
  671. llama_stack/providers/impls/meta_reference/agents/tests/code_execution.py +0 -93
  672. llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py +0 -305
  673. llama_stack/providers/impls/meta_reference/agents/tools/base.py +0 -20
  674. llama_stack/providers/impls/meta_reference/agents/tools/builtin.py +0 -375
  675. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_env_prefix.py +0 -133
  676. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_execution.py +0 -256
  677. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/matplotlib_custom_backend.py +0 -87
  678. llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/utils.py +0 -21
  679. llama_stack/providers/impls/meta_reference/agents/tools/safety.py +0 -43
  680. llama_stack/providers/impls/meta_reference/codeshield/code_scanner.py +0 -58
  681. llama_stack/providers/impls/meta_reference/inference/config.py +0 -45
  682. llama_stack/providers/impls/meta_reference/inference/generation.py +0 -376
  683. llama_stack/providers/impls/meta_reference/inference/inference.py +0 -280
  684. llama_stack/providers/impls/meta_reference/inference/model_parallel.py +0 -99
  685. llama_stack/providers/impls/meta_reference/inference/quantization/fp8_impls.py +0 -184
  686. llama_stack/providers/impls/meta_reference/inference/quantization/fp8_txest_disabled.py +0 -76
  687. llama_stack/providers/impls/meta_reference/inference/quantization/loader.py +0 -97
  688. llama_stack/providers/impls/meta_reference/inference/quantization/scripts/quantize_checkpoint.py +0 -161
  689. llama_stack/providers/impls/meta_reference/memory/__init__.py +0 -19
  690. llama_stack/providers/impls/meta_reference/memory/faiss.py +0 -113
  691. llama_stack/providers/impls/meta_reference/safety/__init__.py +0 -17
  692. llama_stack/providers/impls/meta_reference/safety/base.py +0 -57
  693. llama_stack/providers/impls/meta_reference/safety/config.py +0 -48
  694. llama_stack/providers/impls/meta_reference/safety/llama_guard.py +0 -268
  695. llama_stack/providers/impls/meta_reference/safety/prompt_guard.py +0 -145
  696. llama_stack/providers/impls/meta_reference/safety/safety.py +0 -112
  697. llama_stack/providers/impls/meta_reference/telemetry/console.py +0 -89
  698. llama_stack/providers/impls/vllm/config.py +0 -35
  699. llama_stack/providers/impls/vllm/vllm.py +0 -241
  700. llama_stack/providers/registry/memory.py +0 -78
  701. llama_stack/providers/registry/telemetry.py +0 -44
  702. llama_stack/providers/tests/agents/test_agents.py +0 -210
  703. llama_stack/providers/tests/inference/test_inference.py +0 -257
  704. llama_stack/providers/tests/inference/test_prompt_adapter.py +0 -126
  705. llama_stack/providers/tests/memory/test_memory.py +0 -136
  706. llama_stack/providers/tests/resolver.py +0 -100
  707. llama_stack/providers/tests/safety/test_safety.py +0 -77
  708. llama_stack-0.0.42.dist-info/METADATA +0 -137
  709. llama_stack-0.0.42.dist-info/RECORD +0 -256
  710. /llama_stack/{distribution → core}/__init__.py +0 -0
  711. /llama_stack/{distribution/server → core/access_control}/__init__.py +0 -0
  712. /llama_stack/{distribution/utils → core/conversations}/__init__.py +0 -0
  713. /llama_stack/{providers/adapters → core/prompts}/__init__.py +0 -0
  714. /llama_stack/{providers/adapters/agents → core/routing_tables}/__init__.py +0 -0
  715. /llama_stack/{providers/adapters/inference → core/server}/__init__.py +0 -0
  716. /llama_stack/{providers/adapters/memory → core/storage}/__init__.py +0 -0
  717. /llama_stack/{providers/adapters/safety → core/ui}/__init__.py +0 -0
  718. /llama_stack/{providers/adapters/telemetry → core/ui/modules}/__init__.py +0 -0
  719. /llama_stack/{providers/impls → core/ui/page}/__init__.py +0 -0
  720. /llama_stack/{providers/impls/meta_reference → core/ui/page/distribution}/__init__.py +0 -0
  721. /llama_stack/{providers/impls/meta_reference/agents/rag → core/ui/page/evaluations}/__init__.py +0 -0
  722. /llama_stack/{providers/impls/meta_reference/agents/tests → core/ui/page/playground}/__init__.py +0 -0
  723. /llama_stack/{providers/impls/meta_reference/agents/tools → core/utils}/__init__.py +0 -0
  724. /llama_stack/{distribution → core}/utils/dynamic.py +0 -0
  725. /llama_stack/{distribution → core}/utils/serialize.py +0 -0
  726. /llama_stack/{providers/impls/meta_reference/agents/tools/ipython_tool → distributions}/__init__.py +0 -0
  727. /llama_stack/{providers/impls/meta_reference/inference/quantization → models}/__init__.py +0 -0
  728. /llama_stack/{providers/impls/meta_reference/inference/quantization/scripts → models/llama}/__init__.py +0 -0
  729. /llama_stack/{providers/tests → models/llama/llama3}/__init__.py +0 -0
  730. /llama_stack/{providers/tests/agents → models/llama/llama3/quantization}/__init__.py +0 -0
  731. /llama_stack/{providers/tests/inference → models/llama/llama3_2}/__init__.py +0 -0
  732. /llama_stack/{providers/tests/memory → models/llama/llama3_3}/__init__.py +0 -0
  733. /llama_stack/{providers/tests/safety → models/llama/llama4}/__init__.py +0 -0
  734. /llama_stack/{scripts → models/llama/llama4/prompt_templates}/__init__.py +0 -0
  735. /llama_stack/providers/{adapters → remote}/safety/bedrock/__init__.py +0 -0
  736. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/entry_points.txt +0 -0
  737. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info/licenses}/LICENSE +0 -0
  738. {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/top_level.txt +0 -0
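A large share of these changes are renames: the llama_stack.distribution tree becomes llama_stack.core, and the old providers/adapters and providers/impls trees become providers/remote and providers/inline. As a minimal illustration (hypothetical helper code, not shipped by either version), a consumer that must run against both releases could probe for the new layout first, assuming llama_stack itself is importable:

import importlib.util

# Probe for the 0.3.4 module layout; fall back to the 0.0.42 layout.
# Module paths are taken from entries 79 and 612 in the file list above.
if importlib.util.find_spec("llama_stack.core") is not None:
    from llama_stack.core import datatypes  # new location (entry 79)
else:
    from llama_stack.distribution import datatypes  # old location (entry 612)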
llama_stack/testing/api_recorder.py
@@ -0,0 +1,956 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the terms described in the LICENSE file in
5
+ # the root directory of this source tree.
6
+
7
+ from __future__ import annotations # for forward references
8
+
9
+ import hashlib
10
+ import json
11
+ import os
12
+ import re
13
+ from collections.abc import Callable, Generator
14
+ from contextlib import contextmanager
15
+ from enum import StrEnum
16
+ from pathlib import Path
17
+ from typing import Any, Literal, cast
18
+
19
+ from openai import NOT_GIVEN, OpenAI
20
+
21
+ from llama_stack.core.id_generation import reset_id_override, set_id_override
22
+ from llama_stack.log import get_logger
23
+
24
+ logger = get_logger(__name__, category="testing")
25
+
26
+ # Global state for the recording system
27
+ # Note: Using module globals instead of ContextVars because the session-scoped
28
+ # client initialization happens in one async context, but tests run in different
29
+ # contexts, and we need the mode/storage to persist across all contexts.
30
+ _current_mode: str | None = None
31
+ _current_storage: ResponseStorage | None = None
32
+ _original_methods: dict[str, Any] = {}
33
+
34
+ # Per-test deterministic ID counters (test_id -> id_kind -> counter)
35
+ _id_counters: dict[str, dict[str, int]] = {}
36
+
37
+ # Test context uses ContextVar since it changes per-test and needs async isolation
38
+ from openai.types.completion_choice import CompletionChoice
39
+
40
+ from llama_stack.core.testing_context import get_test_context, is_debug_mode
41
+
42
+ # update the "finish_reason" field, since its type definition is wrong (no None is accepted)
43
+ CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "length", "content_filter"] | None
44
+ CompletionChoice.model_rebuild()
45
+
46
+ REPO_ROOT = Path(__file__).parent.parent.parent
47
+ DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/common"
48
+
49
+
50
+ class APIRecordingMode(StrEnum):
51
+ LIVE = "live"
52
+ RECORD = "record"
53
+ REPLAY = "replay"
54
+ RECORD_IF_MISSING = "record-if-missing"
55
+
56
+
57
+ _ID_KIND_PREFIXES: dict[str, str] = {
58
+ "file": "file-",
59
+ "vector_store": "vs_",
60
+ "vector_store_file_batch": "batch_",
61
+ "tool_call": "call_",
62
+ }
63
+
64
+
65
+ _FLOAT_IN_STRING_PATTERN = re.compile(r"(-?\d+\.\d{4,})")
66
+
67
+
68
+ def _normalize_numeric_literal_strings(value: str) -> str:
69
+ """Round any long decimal literals embedded in strings for stable hashing."""
70
+
71
+ def _replace(match: re.Match[str]) -> str:
72
+ number = float(match.group(0))
73
+ return f"{number:.5f}"
74
+
75
+ return _FLOAT_IN_STRING_PATTERN.sub(_replace, value)
76
+
77
+
78
+ def _normalize_body_for_hash(value: Any) -> Any:
79
+ """Recursively normalize a JSON-like value to improve hash stability."""
80
+
81
+ if isinstance(value, dict):
82
+ return {key: _normalize_body_for_hash(item) for key, item in value.items()}
83
+ if isinstance(value, list):
84
+ return [_normalize_body_for_hash(item) for item in value]
85
+ if isinstance(value, tuple):
86
+ return tuple(_normalize_body_for_hash(item) for item in value)
87
+ if isinstance(value, float):
88
+ return round(value, 5)
89
+ if isinstance(value, str):
90
+ return _normalize_numeric_literal_strings(value)
91
+ return value
92
+
93
+
94
+ def _allocate_test_scoped_id(kind: str) -> str | None:
95
+ """Return the next deterministic ID for the given kind within the current test."""
96
+
97
+ global _id_counters
98
+
99
+ test_id = get_test_context()
100
+ prefix = _ID_KIND_PREFIXES.get(kind)
101
+
102
+ if prefix is None:
103
+ return None
104
+
105
+ if not test_id:
106
+ raise ValueError(f"Test ID is required for {kind} ID allocation")
107
+
108
+ key = test_id
109
+ if key not in _id_counters:
110
+ _id_counters[key] = {}
111
+
112
+ # each test should get a contiguous block of IDs otherwise we will get
113
+ # collisions between tests inside other systems (like file storage) which
114
+ # expect IDs to be unique
115
+ test_hash = hashlib.sha256(test_id.encode()).hexdigest()
116
+ test_hash_int = int(test_hash, 16)
117
+ counter = test_hash_int % 1000000000000
118
+
119
+ counter = _id_counters[key].get(kind, counter) + 1
120
+ _id_counters[key][kind] = counter
121
+
122
+ return f"{prefix}{counter}"
123
+
124
+
125
+ def _deterministic_id_override(kind: str, factory: Callable[[], str]) -> str:
126
+ deterministic_id = _allocate_test_scoped_id(kind)
127
+ if deterministic_id is not None:
128
+ return deterministic_id
129
+ return factory()
130
+
131
+
132
+ def normalize_inference_request(method: str, url: str, headers: dict[str, Any], body: dict[str, Any]) -> str:
+     """Create a normalized hash of the request for consistent matching.
+
+     Includes the test_id from context to ensure test isolation - identical requests
+     from different tests will have different hashes.
+
+     Exception: model-list endpoints (/v1/models, /api/tags) exclude the test_id since
+     they are shared infrastructure and must work across session setup and tests.
+     """
+
+     # Extract just the endpoint path
+     from urllib.parse import urlparse
+
+     parsed = urlparse(url)
+
+     body_for_hash = _normalize_body_for_hash(body)
+
+     test_id = get_test_context()
+     normalized: dict[str, Any] = {
+         "method": method.upper(),
+         "endpoint": parsed.path,
+         "body": body_for_hash,
+     }
+
+     # Include test_id for isolation, except for shared infrastructure endpoints
+     if parsed.path not in ("/api/tags", "/v1/models"):
+         normalized["test_id"] = test_id
+
+     normalized_json = json.dumps(normalized, sort_keys=True)
+     request_hash = hashlib.sha256(normalized_json.encode()).hexdigest()
+
+     if is_debug_mode():
+         logger.info("[RECORDING DEBUG] Hash computation:")
+         logger.info(f"  Test ID: {test_id}")
+         logger.info(f"  Method: {method.upper()}")
+         logger.info(f"  Endpoint: {parsed.path}")
+         logger.info(f"  Model: {body.get('model', 'N/A')}")
+         logger.info(f"  Computed hash: {request_hash}")
+
+     return request_hash
+
+
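A compact sketch of what actually gets hashed (field names as in the code above; the URL, body, and test ID are invented for the example):

key = {
    "body": {"messages": [{"content": "Hi", "role": "user"}], "model": "llama3.2:3b"},
    "endpoint": "/v1/chat/completions",
    "method": "POST",
    "test_id": "tests/integration/inference/test_basic.py::test_chat",  # omitted for /v1/models and /api/tags
}
# sha256 over json.dumps(key, sort_keys=True): two tests issuing the identical
# request therefore still produce two distinct hashes.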
+ def normalize_tool_request(provider_name: str, tool_name: str, kwargs: dict[str, Any]) -> str:
+     """Create a normalized hash of the tool request for consistent matching."""
+     normalized = {
+         "provider": provider_name,
+         "tool_name": tool_name,
+         "kwargs": kwargs,
+     }
+
+     # Create the hash - sort_keys=True ensures deterministic ordering
+     normalized_json = json.dumps(normalized, sort_keys=True)
+     return hashlib.sha256(normalized_json.encode()).hexdigest()
+
+
+ def patch_httpx_for_test_id():
+     """Patch client _prepare_request methods to inject the test ID into the provider data header.
+
+     This is needed for server mode, where the test ID must be transported from
+     client to server via HTTP headers. In library_client mode, this patch is a no-op
+     since everything runs in the same process.
+
+     We use the _prepare_request hook that Stainless clients provide for mutating
+     requests after construction but before sending.
+     """
+     from llama_stack_client import LlamaStackClient
+
+     if "llama_stack_client_prepare_request" in _original_methods:
+         return
+
+     _original_methods["llama_stack_client_prepare_request"] = LlamaStackClient._prepare_request
+     _original_methods["openai_prepare_request"] = OpenAI._prepare_request
+
+     def patched_prepare_request(self, request):
+         # Call the matching original first (a sync method that returns None),
+         # dispatching on the client type
+         if isinstance(self, LlamaStackClient):
+             _original_methods["llama_stack_client_prepare_request"](self, request)
+         else:
+             _original_methods["openai_prepare_request"](self, request)
+
+         # Only inject the test ID in server mode
+         stack_config_type = os.environ.get("LLAMA_STACK_TEST_STACK_CONFIG_TYPE", "library_client")
+         test_id = get_test_context()
+
+         if stack_config_type == "server" and test_id:
+             provider_data_header = request.headers.get("X-LlamaStack-Provider-Data")
+
+             if provider_data_header:
+                 provider_data = json.loads(provider_data_header)
+             else:
+                 provider_data = {}
+
+             provider_data["__test_id"] = test_id
+             request.headers["X-LlamaStack-Provider-Data"] = json.dumps(provider_data)
+
+             if is_debug_mode():
+                 logger.info("[RECORDING DEBUG] Injected test ID into request header:")
+                 logger.info(f"  Test ID: {test_id}")
+                 logger.info(f"  URL: {request.url}")
+
+         return None
+
+     LlamaStackClient._prepare_request = patched_prepare_request
+     OpenAI._prepare_request = patched_prepare_request
+
+
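On the wire, the injected header is just provider data with one extra key. A minimal sketch of what the server side can read back (the header value is invented for the example):

import json

header_value = '{"__test_id": "tests/integration/inference/test_basic.py::test_chat"}'
provider_data = json.loads(header_value)
test_id = provider_data.get("__test_id")  # -> the originating test's node ID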
+ # currently, unpatch is never called
+ def unpatch_httpx_for_test_id():
+     """Remove client _prepare_request patches for test ID injection."""
+     if "llama_stack_client_prepare_request" not in _original_methods:
+         return
+
+     from llama_stack_client import LlamaStackClient
+
+     LlamaStackClient._prepare_request = _original_methods["llama_stack_client_prepare_request"]
+     del _original_methods["llama_stack_client_prepare_request"]
+     OpenAI._prepare_request = _original_methods["openai_prepare_request"]
+     del _original_methods["openai_prepare_request"]
+
+
+ def get_api_recording_mode() -> APIRecordingMode:
+     return APIRecordingMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "replay").lower())
+
+
+ def setup_api_recording():
+     """
+     Returns a context manager that can be used to record or replay API requests (inference and tools).
+     Use this in tests to increase their reliability and reduce reliance on expensive, external services.
+
+     Currently supports:
+     - Inference: OpenAI and Ollama clients
+     - Tools: Search providers (Tavily)
+
+     Two environment variables are supported:
+     - LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', 'replay', or 'record-if-missing'. Default is 'replay'.
+       - 'live': Make all requests live without recording
+       - 'record': Record all requests (overwrites existing recordings)
+       - 'replay': Use only recorded responses (fails if a recording is not found)
+       - 'record-if-missing': Use recorded responses when available; record new ones when not found
+     - LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. Default is 'tests/integration/recordings'.
+
+     The recordings are stored as JSON files.
+     """
+     mode = get_api_recording_mode()
+     if mode == APIRecordingMode.LIVE:
+         return None
+
+     storage_dir = os.environ.get("LLAMA_STACK_TEST_RECORDING_DIR", DEFAULT_STORAGE_DIR)
+     return api_recording(mode=mode, storage_dir=storage_dir)
+
+
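A minimal sketch of how a test suite might wire this up (the fixture name and conftest placement are assumptions, not prescribed by this module):

# conftest.py (hypothetical)
import pytest

@pytest.fixture(autouse=True)
def recorded_apis():
    cm = setup_api_recording()
    if cm is None:  # 'live' mode: no recording/replay wrapper
        yield
        return
    with cm:
        yield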
+ def _normalize_response(data: dict[str, Any], request_hash: str) -> dict[str, Any]:
+     """Normalize fields that change between recordings but don't affect functionality.
+
+     This reduces noise in git diffs by making IDs deterministic and timestamps constant.
+     """
+     # Only normalize the ID for completion/chat responses, not for model objects:
+     # model objects have "object": "model", and there the ID is the actual model identifier
+     if "id" in data and data.get("object") != "model":
+         data["id"] = f"rec-{request_hash[:12]}"
+
+     # Normalize the created timestamp to epoch 0 for OpenAI-style responses,
+     # but not for model objects, where the created timestamp may be meaningful
+     if "created" in data and data.get("object") != "model":
+         data["created"] = 0
+
+     # Normalize Ollama-specific timestamp fields
+     if "created_at" in data:
+         data["created_at"] = "1970-01-01T00:00:00.000000Z"
+
+     # Normalize Ollama-specific duration fields (these vary with system load)
+     if "total_duration" in data and data["total_duration"] is not None:
+         data["total_duration"] = 0
+     if "load_duration" in data and data["load_duration"] is not None:
+         data["load_duration"] = 0
+     if "prompt_eval_duration" in data and data["prompt_eval_duration"] is not None:
+         data["prompt_eval_duration"] = 0
+     if "eval_duration" in data and data["eval_duration"] is not None:
+         data["eval_duration"] = 0
+
+     return data
+
+
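For example (values invented), a recorded chat completion would be rewritten like this before hitting disk:

raw = {"id": "chatcmpl-9xYz", "object": "chat.completion", "created": 1726000000}
normalized = _normalize_response(dict(raw), request_hash="abc123def45678")
assert normalized["id"] == "rec-abc123def456"  # first 12 hash chars
assert normalized["created"] == 0              # timestamp pinned to epoch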
+ def _serialize_response(response: Any, request_hash: str = "") -> Any:
+     if hasattr(response, "model_dump"):
+         data = response.model_dump(mode="json")
+         # Normalize fields to reduce noise
+         data = _normalize_response(data, request_hash)
+         return {
+             "__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
+             "__data__": data,
+         }
+     elif hasattr(response, "__dict__"):
+         return dict(response.__dict__)
+     else:
+         return response
+
+
+ def _deserialize_response(data: dict[str, Any]) -> Any:
+     # Check if this is a serialized Pydantic model with type information
+     if isinstance(data, dict) and "__type__" in data and "__data__" in data:
+         cls = None
+         try:
+             # Import the original class and reconstruct the object
+             module_path, class_name = data["__type__"].rsplit(".", 1)
+             module = __import__(module_path, fromlist=[class_name])
+             cls = getattr(module, class_name)
+
+             if not hasattr(cls, "model_validate"):
+                 raise ValueError(f"Pydantic class {cls} does not support model_validate?")
+
+             return cls.model_validate(data["__data__"])
+         except (ImportError, AttributeError, TypeError, ValueError) as e:
+             logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_validate: {e}")
+             # Only fall back to model_construct if the class was actually resolved
+             if cls is not None:
+                 try:
+                     return cls.model_construct(**data["__data__"])
+                 except Exception as e:
+                     logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_construct: {e}")
+             return data["__data__"]
+
+     return data
+
+
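The serializer/deserializer pair gives a simple round trip for Pydantic responses. A sketch with a stand-in model (the class below is invented for illustration and assumes a script context where __main__ is importable):

from pydantic import BaseModel

class FakeCompletion(BaseModel):  # stand-in for an OpenAI response type
    id: str
    created: int

obj = FakeCompletion(id="chatcmpl-123", created=1726000000)
wire = _serialize_response(obj, request_hash="abc123def45678")
# wire == {"__type__": "__main__.FakeCompletion",
#          "__data__": {"id": "rec-abc123def456", "created": 0}}
restored = _deserialize_response(wire)  # -> FakeCompletion via model_validate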
+ class ResponseStorage:
+     """Handles JSON file storage/retrieval for inference recordings."""
+
+     def __init__(self, base_dir: Path):
+         self.base_dir = base_dir
+         # Don't create responses_dir here - determine it per-test at runtime
+
+     def _get_test_dir(self) -> Path:
+         """Get the recordings directory in the test file's parent directory.
+
+         For the test "tests/integration/inference/test_foo.py::test_bar",
+         returns "tests/integration/inference/recordings/".
+         """
+         test_id = get_test_context()
+         if test_id:
+             # Extract the directory path from the test nodeid
+             # e.g., "tests/integration/inference/test_basic.py::test_foo[params]"
+             # -> "tests/integration/inference"
+             test_file = test_id.split("::")[0]  # Remove the test function part
+             test_dir = Path(test_file).parent  # Get the parent directory
+
+             if self.base_dir.is_absolute():
+                 # base_dir defaults to <repo>/tests/integration/recordings, so three
+                 # .parent hops recover the repo root
+                 repo_root = self.base_dir.parent.parent.parent
+                 result = repo_root / test_dir / "recordings"
+                 if is_debug_mode():
+                     logger.info("[RECORDING DEBUG] Path resolution (absolute base_dir):")
+                     logger.info(f"  Test ID: {test_id}")
+                     logger.info(f"  Base dir: {self.base_dir}")
+                     logger.info(f"  Repo root: {repo_root}")
+                     logger.info(f"  Test file: {test_file}")
+                     logger.info(f"  Test dir: {test_dir}")
+                     logger.info(f"  Recordings dir: {result}")
+                 return result
+             else:
+                 result = test_dir / "recordings"
+                 if is_debug_mode():
+                     logger.info("[RECORDING DEBUG] Path resolution (relative base_dir):")
+                     logger.info(f"  Test ID: {test_id}")
+                     logger.info(f"  Base dir: {self.base_dir}")
+                     logger.info(f"  Test dir: {test_dir}")
+                     logger.info(f"  Recordings dir: {result}")
+                 return result
+         else:
+             # Fallback for non-test contexts
+             result = self.base_dir / "recordings"
+             if is_debug_mode():
+                 logger.info("[RECORDING DEBUG] Path resolution (no test context):")
+                 logger.info(f"  Base dir: {self.base_dir}")
+                 logger.info(f"  Recordings dir: {result}")
+             return result
+
+     def _ensure_directory(self):
+         """Ensure test-specific directories exist."""
+         test_dir = self._get_test_dir()
+         test_dir.mkdir(parents=True, exist_ok=True)
+         return test_dir
+
+     def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]):
+         """Store a request/response pair."""
+         responses_dir = self._ensure_directory()
+
+         # Use the FULL hash (not truncated) as the filename
+         response_file = f"{request_hash}.json"
+
+         # Serialize the response body if needed
+         serialized_response = dict(response)
+         if "body" in serialized_response:
+             if isinstance(serialized_response["body"], list):
+                 # Handle streaming responses (list of chunks)
+                 serialized_response["body"] = [
+                     _serialize_response(chunk, request_hash) for chunk in serialized_response["body"]
+                 ]
+             else:
+                 # Handle a single response
+                 serialized_response["body"] = _serialize_response(serialized_response["body"], request_hash)
+
+         # For model-list endpoints, include a digest in the filename to distinguish different model sets
+         endpoint = request.get("endpoint")
+         if endpoint in ("/api/tags", "/v1/models"):
+             digest = _model_identifiers_digest(endpoint, response)
+             response_file = f"models-{request_hash}-{digest}.json"
+
+         response_path = responses_dir / response_file
+
+         if is_debug_mode():
+             logger.info("[RECORDING DEBUG] Storing recording:")
+             logger.info(f"  Request hash: {request_hash}")
+             logger.info(f"  File: {response_path}")
+             logger.info(f"  Test ID: {get_test_context()}")
+             logger.info(f"  Endpoint: {endpoint}")
+
+         # Save the response to a JSON file with metadata
+         with open(response_path, "w") as f:
+             json.dump(
+                 {
+                     "test_id": get_test_context(),
+                     "request": request,
+                     "response": serialized_response,
+                     "id_normalization_mapping": {},
+                 },
+                 f,
+                 indent=2,
+             )
+             f.write("\n")
+             f.flush()
+
+     def find_recording(self, request_hash: str) -> dict[str, Any] | None:
+         """Find a recorded response by request hash.
+
+         Uses a fallback: first checks the test-specific dir, then falls back to the base
+         recordings dir. This handles cases where recordings happen during session setup
+         (no test context) but are requested during tests (with test context).
+         """
+         response_file = f"{request_hash}.json"
+
+         # Try the test-specific directory first
+         test_dir = self._get_test_dir()
+         response_path = test_dir / response_file
+
+         if is_debug_mode():
+             logger.info("[RECORDING DEBUG] Looking up recording:")
+             logger.info(f"  Request hash: {request_hash}")
+             logger.info(f"  Primary path: {response_path}")
+             logger.info(f"  Primary exists: {response_path.exists()}")
+
+         if response_path.exists():
+             if is_debug_mode():
+                 logger.info("  Found in primary location")
+             return _recording_from_file(response_path)
+
+         # Fall back to the base recordings directory (for session-level recordings)
+         fallback_dir = self.base_dir / "recordings"
+         fallback_path = fallback_dir / response_file
+
+         if is_debug_mode():
+             logger.info(f"  Fallback path: {fallback_path}")
+             logger.info(f"  Fallback exists: {fallback_path.exists()}")
+
+         if fallback_path.exists():
+             if is_debug_mode():
+                 logger.info("  Found in fallback location")
+             return _recording_from_file(fallback_path)
+
+         if is_debug_mode():
+             logger.info("  Recording not found in either location")
+
+         return None
+
+     def _model_list_responses(self, request_hash: str) -> list[dict[str, Any]]:
+         """Find all model-list recordings with the given hash (different digests)."""
+         results: list[dict[str, Any]] = []
+
+         # Check the test-specific directory first
+         test_dir = self._get_test_dir()
+         if test_dir.exists():
+             for path in test_dir.glob(f"models-{request_hash}-*.json"):
+                 data = _recording_from_file(path)
+                 results.append(data)
+
+         # Also check the fallback directory
+         fallback_dir = self.base_dir / "recordings"
+         if fallback_dir.exists():
+             for path in fallback_dir.glob(f"models-{request_hash}-*.json"):
+                 data = _recording_from_file(path)
+                 results.append(data)
+
+         return results
+
+
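Putting the storage pieces together, the on-disk layout looks roughly like this (placeholders stand in for real hashes):

tests/integration/inference/recordings/
    <request-sha256>.json                         # one recording per request hash
    models-<request-sha256>-<models-digest>.json  # model-list endpoints add a model-set digest

# Each file holds a JSON object with the keys written by store_recording:
# "test_id", "request", "response" ({"body": ..., "is_streaming": ...}),
# and "id_normalization_mapping".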
+ def _recording_from_file(response_path) -> dict[str, Any]:
+     with open(response_path) as f:
+         data = json.load(f)
+
+     mapping = data.get("id_normalization_mapping") or {}
+     if mapping:
+         serialized = json.dumps(data)
+         for normalized, original in mapping.items():
+             serialized = serialized.replace(original, normalized)
+         data = json.loads(serialized)
+         data["id_normalization_mapping"] = {}
+
+     # Deserialize the response body if needed
+     if "response" in data and "body" in data["response"]:
+         if isinstance(data["response"]["body"], list):
+             # Handle streaming responses
+             data["response"]["body"] = [_deserialize_response(chunk) for chunk in data["response"]["body"]]
+         else:
+             # Handle a single response
+             data["response"]["body"] = _deserialize_response(data["response"]["body"])
+
+     return cast(dict[str, Any], data)
+
+
+ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
+     """Generate a digest from model identifiers for distinguishing different model sets."""
+
+     def _extract_model_identifiers():
+         """Extract a stable set of identifiers for model-list endpoints.
+
+         Supported endpoints:
+         - '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ]
+         - '/v1/models' (OpenAI): response body is [ { id: ... }, ... ]
+         Returns a sorted list of unique identifiers.
+         """
+         if "models" in response["body"]:
+             # ollama
+             items = response["body"]["models"]
+         else:
+             # openai
+             items = response["body"]
+         idents = [m.model if endpoint == "/api/tags" else m.id for m in items]
+         return sorted(set(idents))
+
+     identifiers = _extract_model_identifiers()
+     return hashlib.sha256(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
+
+
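Concretely (model names invented), the digest is the first 8 hex chars of a SHA-256 over the sorted, pipe-joined identifiers:

import hashlib

idents = sorted({"llama3.2:3b", "all-minilm:l6-v2"})
digest = hashlib.sha256("|".join(idents).encode("utf-8")).hexdigest()[:8]
# Two servers exposing different model sets therefore record to different
# "models-<hash>-<digest>.json" files instead of overwriting each other.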
+ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
+     """Return a single, unioned recording for supported model-list endpoints.
+
+     Merges multiple recordings with different model sets (from different servers) into
+     a single response containing all models.
+     """
+     if not records:
+         return None
+
+     seen: dict[str, dict[str, Any]] = {}
+     for rec in records:
+         body = rec["response"]["body"]
+         if endpoint == "/v1/models":
+             for m in body:
+                 key = m.id
+                 seen[key] = m
+         elif endpoint == "/api/tags":
+             for m in body.models:
+                 key = m.model
+                 seen[key] = m
+
+     ordered = [seen[k] for k in sorted(seen.keys())]
+     canonical = records[0]
+     canonical_req = canonical.get("request", {})
+     if isinstance(canonical_req, dict):
+         canonical_req["endpoint"] = endpoint
+     body = ordered
+     if endpoint == "/api/tags":
+         from ollama import ListResponse
+
+         body = ListResponse(models=ordered)
+     return {"request": canonical_req, "response": {"body": body, "is_streaming": False}}
+
+
+ async def _patched_tool_invoke_method(
+     original_method, provider_name: str, self, tool_name: str, kwargs: dict[str, Any]
+ ):
+     """Patched version of the tool runtime invoke_tool method for recording/replay."""
+     global _current_mode, _current_storage
+
+     if _current_mode == APIRecordingMode.LIVE or _current_storage is None:
+         # Normal operation
+         return await original_method(self, tool_name, kwargs)
+
+     request_hash = normalize_tool_request(provider_name, tool_name, kwargs)
+
+     if _current_mode in (APIRecordingMode.REPLAY, APIRecordingMode.RECORD_IF_MISSING):
+         recording = _current_storage.find_recording(request_hash)
+         if recording:
+             return recording["response"]["body"]
+         elif _current_mode == APIRecordingMode.REPLAY:
+             raise RuntimeError(
+                 f"Recording not found for {provider_name}.{tool_name} | Request: {kwargs}\n"
+                 f"\n"
+                 f"Run './scripts/integration-tests.sh --inference-mode record-if-missing' with required API keys to generate."
+             )
+         # If RECORD_IF_MISSING and no recording was found, fall through to record
+
+     if _current_mode in (APIRecordingMode.RECORD, APIRecordingMode.RECORD_IF_MISSING):
+         # Make the tool call and record it
+         result = await original_method(self, tool_name, kwargs)
+
+         request_data = {
+             "test_id": get_test_context(),
+             "provider": provider_name,
+             "tool_name": tool_name,
+             "kwargs": kwargs,
+         }
+         response_data = {"body": result, "is_streaming": False}
+
+         # Store the recording
+         _current_storage.store_recording(request_hash, request_data, response_data)
+         return result
+
+     else:
+         raise AssertionError(f"Invalid mode: {_current_mode}")
+
+
+ async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs):
+     global _current_mode, _current_storage
+
+     mode = _current_mode
+     storage = _current_storage
+
+     if is_debug_mode():
+         logger.info("[RECORDING DEBUG] Entering inference method:")
+         logger.info(f"  Mode: {mode}")
+         logger.info(f"  Client type: {client_type}")
+         logger.info(f"  Endpoint: {endpoint}")
+         logger.info(f"  Test context: {get_test_context()}")
+
+     if mode == APIRecordingMode.LIVE or storage is None:
+         if endpoint == "/v1/models":
+             return original_method(self, *args, **kwargs)
+         else:
+             return await original_method(self, *args, **kwargs)
+
+     # Get the base URL based on client type
+     if client_type == "openai":
+         base_url = str(self._client.base_url)
+
+         # the OpenAI client methods may pass NOT_GIVEN for unset parameters; filter these out
+         kwargs = {k: v for k, v in kwargs.items() if v is not NOT_GIVEN}
+     elif client_type == "ollama":
+         # Get the base URL from the client (the Ollama client uses a host attribute)
+         base_url = getattr(self, "host", "http://localhost:11434")
+         if not base_url.startswith("http"):
+             base_url = f"http://{base_url}"
+     else:
+         raise ValueError(f"Unknown client type: {client_type}")
+
+     url = base_url.rstrip("/") + endpoint
+     # Special handling for Databricks URLs to avoid leaking workspace info:
+     # e.g. https://adb-1234567890123456.7.cloud.databricks.com is replaced by
+     # "__databricks__" plus everything after the domain
+     if "cloud.databricks.com" in url:
+         url = "__databricks__" + url.split("cloud.databricks.com")[-1]
+     method = "POST"
+     headers = {}
+     body = kwargs
+
+     request_hash = normalize_inference_request(method, url, headers, body)
+
+     # Try to find an existing recording for REPLAY or RECORD_IF_MISSING modes
+     recording = None
+     if mode == APIRecordingMode.REPLAY or mode == APIRecordingMode.RECORD_IF_MISSING:
+         # Special handling for model-list endpoints: merge all recordings with this hash
+         if endpoint in ("/api/tags", "/v1/models"):
+             records = storage._model_list_responses(request_hash)
+             recording = _combine_model_list_responses(endpoint, records)
+         else:
+             recording = storage.find_recording(request_hash)
+
+         if recording:
+             response_body = recording["response"]["body"]
+
+             if recording["response"].get("is_streaming", False):
+
+                 async def replay_stream():
+                     for chunk in response_body:
+                         yield chunk
+
+                 return replay_stream()
+             else:
+                 return response_body
+         elif mode == APIRecordingMode.REPLAY:
+             # REPLAY mode requires the recording to exist
+             if is_debug_mode():
+                 logger.error("[RECORDING DEBUG] Recording not found!")
+                 logger.error(f"  Mode: {mode}")
+                 logger.error(f"  Request hash: {request_hash}")
+                 logger.error(f"  Method: {method}")
+                 logger.error(f"  URL: {url}")
+                 logger.error(f"  Endpoint: {endpoint}")
+                 logger.error(f"  Model: {body.get('model', 'unknown')}")
+                 logger.error(f"  Test context: {get_test_context()}")
+                 logger.error(
+                     f"  Stack config type: {os.environ.get('LLAMA_STACK_TEST_STACK_CONFIG_TYPE', 'library_client')}"
+                 )
+             raise RuntimeError(
+                 f"Recording not found for request hash: {request_hash}\n"
+                 f"Model: {body.get('model', 'unknown')} | Request: {method} {url}\n"
+                 f"\n"
+                 f"Run './scripts/integration-tests.sh --inference-mode record-if-missing' with required API keys to generate."
+             )
+
+     if mode == APIRecordingMode.RECORD or (mode == APIRecordingMode.RECORD_IF_MISSING and not recording):
+         if endpoint == "/v1/models":
+             response = original_method(self, *args, **kwargs)
+         else:
+             response = await original_method(self, *args, **kwargs)
+
+         # we want to store the result of the iterator, not the iterator itself
+         if endpoint == "/v1/models":
+             response = [m async for m in response]
+
+         request_data = {
+             "method": method,
+             "url": url,
+             "headers": headers,
+             "body": body,
+             "endpoint": endpoint,
+             "model": body.get("model", ""),
+         }
+
+         # Determine whether this is a streaming request based on request parameters
+         is_streaming = body.get("stream", False)
+
+         if is_streaming:
+             # For streaming responses, collect all chunks immediately before yielding;
+             # this ensures the recording is saved even if the generator isn't fully consumed
+             chunks: list[Any] = []
+             async for chunk in response:
+                 chunks.append(chunk)
+
+             # Store the recording immediately
+             response_data = {"body": chunks, "is_streaming": True}
+             storage.store_recording(request_hash, request_data, response_data)
+
+             # Return a generator that replays the stored chunks
+             async def replay_recorded_stream():
+                 for chunk in chunks:
+                     yield chunk
+
+             return replay_recorded_stream()
+         else:
+             response_data = {"body": response, "is_streaming": False}
+             storage.store_recording(request_hash, request_data, response_data)
+             return response
+
+     else:
+         raise AssertionError(f"Invalid mode: {mode}")
+
+
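The eager chunk collection above trades streaming latency for durability. A minimal sketch of the consumer-side effect, assuming an AsyncOpenAI client named client (model and message invented):

# In record mode, the patched method has already drained the provider's stream
# and persisted it by the time the caller iterates, so partial consumption
# cannot lose the recording:
stream = await client.chat.completions.create(
    model="llama3.2:3b", messages=[{"role": "user", "content": "Hi"}], stream=True
)
async for chunk in stream:  # replays the already-recorded chunks
    break                   # stopping early no longer discards the recording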
+ def patch_inference_clients():
+     """Install monkey patches for OpenAI client methods, Ollama AsyncClient methods, and tool runtime methods."""
+     global _original_methods
+
+     from ollama import AsyncClient as OllamaAsyncClient
+     from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
+     from openai.resources.completions import AsyncCompletions
+     from openai.resources.embeddings import AsyncEmbeddings
+     from openai.resources.models import AsyncModels
+
+     from llama_stack.providers.remote.tool_runtime.tavily_search.tavily_search import TavilySearchToolRuntimeImpl
+
+     # Store the original methods for the OpenAI and Ollama clients and tool runtimes
+     _original_methods = {
+         "chat_completions_create": AsyncChatCompletions.create,
+         "completions_create": AsyncCompletions.create,
+         "embeddings_create": AsyncEmbeddings.create,
+         "models_list": AsyncModels.list,
+         "ollama_generate": OllamaAsyncClient.generate,
+         "ollama_chat": OllamaAsyncClient.chat,
+         "ollama_embed": OllamaAsyncClient.embed,
+         "ollama_ps": OllamaAsyncClient.ps,
+         "ollama_pull": OllamaAsyncClient.pull,
+         "ollama_list": OllamaAsyncClient.list,
+         "tavily_invoke_tool": TavilySearchToolRuntimeImpl.invoke_tool,
+     }
+
+     # Create patched methods for the OpenAI client
+     async def patched_chat_completions_create(self, *args, **kwargs):
+         return await _patched_inference_method(
+             _original_methods["chat_completions_create"], self, "openai", "/v1/chat/completions", *args, **kwargs
+         )
+
+     async def patched_completions_create(self, *args, **kwargs):
+         return await _patched_inference_method(
+             _original_methods["completions_create"], self, "openai", "/v1/completions", *args, **kwargs
+         )
+
+     async def patched_embeddings_create(self, *args, **kwargs):
+         return await _patched_inference_method(
+             _original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs
+         )
+
+     def patched_models_list(self, *args, **kwargs):
+         async def _iter():
+             for item in await _patched_inference_method(
+                 _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
+             ):
+                 yield item
+
+         return _iter()
+
+     # Apply the OpenAI patches
+     AsyncChatCompletions.create = patched_chat_completions_create
+     AsyncCompletions.create = patched_completions_create
+     AsyncEmbeddings.create = patched_embeddings_create
+     AsyncModels.list = patched_models_list
+
+     # Create patched methods for the Ollama client
+     async def patched_ollama_generate(self, *args, **kwargs):
+         return await _patched_inference_method(
+             _original_methods["ollama_generate"], self, "ollama", "/api/generate", *args, **kwargs
+         )
+
+     async def patched_ollama_chat(self, *args, **kwargs):
+         return await _patched_inference_method(
+             _original_methods["ollama_chat"], self, "ollama", "/api/chat", *args, **kwargs
+         )
+
+     async def patched_ollama_embed(self, *args, **kwargs):
+         return await _patched_inference_method(
+             _original_methods["ollama_embed"], self, "ollama", "/api/embeddings", *args, **kwargs
+         )
+
+     async def patched_ollama_ps(self, *args, **kwargs):
+         return await _patched_inference_method(
+             _original_methods["ollama_ps"], self, "ollama", "/api/ps", *args, **kwargs
+         )
+
+     async def patched_ollama_pull(self, *args, **kwargs):
+         return await _patched_inference_method(
+             _original_methods["ollama_pull"], self, "ollama", "/api/pull", *args, **kwargs
+         )
+
+     async def patched_ollama_list(self, *args, **kwargs):
+         return await _patched_inference_method(
+             _original_methods["ollama_list"], self, "ollama", "/api/tags", *args, **kwargs
+         )
+
+     # Apply the Ollama patches
+     OllamaAsyncClient.generate = patched_ollama_generate
+     OllamaAsyncClient.chat = patched_ollama_chat
+     OllamaAsyncClient.embed = patched_ollama_embed
+     OllamaAsyncClient.ps = patched_ollama_ps
+     OllamaAsyncClient.pull = patched_ollama_pull
+     OllamaAsyncClient.list = patched_ollama_list
+
+     # Create patched methods for tool runtimes
+     async def patched_tavily_invoke_tool(self, tool_name: str, kwargs: dict[str, Any]):
+         return await _patched_tool_invoke_method(
+             _original_methods["tavily_invoke_tool"], "tavily", self, tool_name, kwargs
+         )
+
+     # Apply the tool runtime patches
+     TavilySearchToolRuntimeImpl.invoke_tool = patched_tavily_invoke_tool
+
+
+ def unpatch_inference_clients():
+     """Remove the monkey patches and restore the original OpenAI, Ollama client, and tool runtime methods."""
+     global _original_methods
+
+     if not _original_methods:
+         return
+
+     # Import here to avoid circular imports
+     from ollama import AsyncClient as OllamaAsyncClient
+     from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
+     from openai.resources.completions import AsyncCompletions
+     from openai.resources.embeddings import AsyncEmbeddings
+     from openai.resources.models import AsyncModels
+
+     from llama_stack.providers.remote.tool_runtime.tavily_search.tavily_search import TavilySearchToolRuntimeImpl
+
+     # Restore the OpenAI client methods
+     AsyncChatCompletions.create = _original_methods["chat_completions_create"]
+     AsyncCompletions.create = _original_methods["completions_create"]
+     AsyncEmbeddings.create = _original_methods["embeddings_create"]
+     AsyncModels.list = _original_methods["models_list"]
+
+     # Restore the Ollama client methods
+     OllamaAsyncClient.generate = _original_methods["ollama_generate"]
+     OllamaAsyncClient.chat = _original_methods["ollama_chat"]
+     OllamaAsyncClient.embed = _original_methods["ollama_embed"]
+     OllamaAsyncClient.ps = _original_methods["ollama_ps"]
+     OllamaAsyncClient.pull = _original_methods["ollama_pull"]
+     OllamaAsyncClient.list = _original_methods["ollama_list"]
+
+     # Restore the tool runtime methods
+     TavilySearchToolRuntimeImpl.invoke_tool = _original_methods["tavily_invoke_tool"]
+
+     _original_methods.clear()
+
+
+ @contextmanager
+ def api_recording(mode: str, storage_dir: str | Path | None = None) -> Generator[None, None, None]:
+     """Context manager for API recording/replaying (inference and tools)."""
+     global _current_mode, _current_storage
+
+     # Store the previous state
+     prev_mode = _current_mode
+     prev_storage = _current_storage
+     previous_override = None
+
+     try:
+         _current_mode = mode
+
+         if mode in ["record", "replay", "record-if-missing"]:
+             if storage_dir is None:
+                 raise ValueError("storage_dir is required for record, replay, and record-if-missing modes")
+             _current_storage = ResponseStorage(Path(storage_dir))
+             _id_counters.clear()
+             patch_inference_clients()
+             previous_override = set_id_override(_deterministic_id_override)
+
+         yield
+
+     finally:
+         # Restore the previous state
+         if mode in ["record", "replay", "record-if-missing"]:
+             unpatch_inference_clients()
+             reset_id_override(previous_override)
+
+         _current_mode = prev_mode
+         _current_storage = prev_storage
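
A minimal usage sketch of the context manager itself, bypassing the setup_api_recording helper (the directory path and the driver function are assumptions):

from pathlib import Path

# Record any missing interactions into the default layout, replaying the rest.
with api_recording(mode="record-if-missing", storage_dir=Path("tests/integration/recordings")):
    run_integration_suite()  # hypothetical: any code that drives the patched clients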