aeri-python 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (391)
  1. aeri/__init__.py +72 -0
  2. aeri/_client/_validation.py +204 -0
  3. aeri/_client/attributes.py +188 -0
  4. aeri/_client/client.py +3761 -0
  5. aeri/_client/constants.py +65 -0
  6. aeri/_client/datasets.py +302 -0
  7. aeri/_client/environment_variables.py +158 -0
  8. aeri/_client/get_client.py +149 -0
  9. aeri/_client/observe.py +661 -0
  10. aeri/_client/propagation.py +475 -0
  11. aeri/_client/resource_manager.py +510 -0
  12. aeri/_client/span.py +1519 -0
  13. aeri/_client/span_filter.py +76 -0
  14. aeri/_client/span_processor.py +206 -0
  15. aeri/_client/utils.py +132 -0
  16. aeri/_task_manager/media_manager.py +331 -0
  17. aeri/_task_manager/media_upload_consumer.py +44 -0
  18. aeri/_task_manager/media_upload_queue.py +12 -0
  19. aeri/_task_manager/score_ingestion_consumer.py +208 -0
  20. aeri/_task_manager/task_manager.py +475 -0
  21. aeri/_utils/__init__.py +19 -0
  22. aeri/_utils/environment.py +34 -0
  23. aeri/_utils/error_logging.py +47 -0
  24. aeri/_utils/parse_error.py +99 -0
  25. aeri/_utils/prompt_cache.py +188 -0
  26. aeri/_utils/request.py +137 -0
  27. aeri/_utils/serializer.py +205 -0
  28. aeri/api/.fern/metadata.json +14 -0
  29. aeri/api/__init__.py +836 -0
  30. aeri/api/annotation_queues/__init__.py +82 -0
  31. aeri/api/annotation_queues/client.py +1111 -0
  32. aeri/api/annotation_queues/raw_client.py +2288 -0
  33. aeri/api/annotation_queues/types/__init__.py +84 -0
  34. aeri/api/annotation_queues/types/annotation_queue.py +28 -0
  35. aeri/api/annotation_queues/types/annotation_queue_assignment_request.py +16 -0
  36. aeri/api/annotation_queues/types/annotation_queue_item.py +34 -0
  37. aeri/api/annotation_queues/types/annotation_queue_object_type.py +26 -0
  38. aeri/api/annotation_queues/types/annotation_queue_status.py +22 -0
  39. aeri/api/annotation_queues/types/create_annotation_queue_assignment_response.py +18 -0
  40. aeri/api/annotation_queues/types/create_annotation_queue_item_request.py +25 -0
  41. aeri/api/annotation_queues/types/create_annotation_queue_request.py +20 -0
  42. aeri/api/annotation_queues/types/delete_annotation_queue_assignment_response.py +14 -0
  43. aeri/api/annotation_queues/types/delete_annotation_queue_item_response.py +15 -0
  44. aeri/api/annotation_queues/types/paginated_annotation_queue_items.py +17 -0
  45. aeri/api/annotation_queues/types/paginated_annotation_queues.py +17 -0
  46. aeri/api/annotation_queues/types/update_annotation_queue_item_request.py +15 -0
  47. aeri/api/blob_storage_integrations/__init__.py +73 -0
  48. aeri/api/blob_storage_integrations/client.py +550 -0
  49. aeri/api/blob_storage_integrations/raw_client.py +976 -0
  50. aeri/api/blob_storage_integrations/types/__init__.py +77 -0
  51. aeri/api/blob_storage_integrations/types/blob_storage_export_frequency.py +26 -0
  52. aeri/api/blob_storage_integrations/types/blob_storage_export_mode.py +26 -0
  53. aeri/api/blob_storage_integrations/types/blob_storage_integration_deletion_response.py +14 -0
  54. aeri/api/blob_storage_integrations/types/blob_storage_integration_file_type.py +26 -0
  55. aeri/api/blob_storage_integrations/types/blob_storage_integration_response.py +64 -0
  56. aeri/api/blob_storage_integrations/types/blob_storage_integration_status_response.py +50 -0
  57. aeri/api/blob_storage_integrations/types/blob_storage_integration_type.py +26 -0
  58. aeri/api/blob_storage_integrations/types/blob_storage_integrations_response.py +15 -0
  59. aeri/api/blob_storage_integrations/types/blob_storage_sync_status.py +47 -0
  60. aeri/api/blob_storage_integrations/types/create_blob_storage_integration_request.py +91 -0
  61. aeri/api/client.py +679 -0
  62. aeri/api/comments/__init__.py +44 -0
  63. aeri/api/comments/client.py +407 -0
  64. aeri/api/comments/raw_client.py +750 -0
  65. aeri/api/comments/types/__init__.py +46 -0
  66. aeri/api/comments/types/create_comment_request.py +47 -0
  67. aeri/api/comments/types/create_comment_response.py +17 -0
  68. aeri/api/comments/types/get_comments_response.py +17 -0
  69. aeri/api/commons/__init__.py +210 -0
  70. aeri/api/commons/errors/__init__.py +56 -0
  71. aeri/api/commons/errors/access_denied_error.py +12 -0
  72. aeri/api/commons/errors/error.py +12 -0
  73. aeri/api/commons/errors/method_not_allowed_error.py +12 -0
  74. aeri/api/commons/errors/not_found_error.py +12 -0
  75. aeri/api/commons/errors/unauthorized_error.py +12 -0
  76. aeri/api/commons/types/__init__.py +190 -0
  77. aeri/api/commons/types/base_score.py +90 -0
  78. aeri/api/commons/types/base_score_v1.py +70 -0
  79. aeri/api/commons/types/boolean_score.py +26 -0
  80. aeri/api/commons/types/boolean_score_v1.py +26 -0
  81. aeri/api/commons/types/categorical_score.py +26 -0
  82. aeri/api/commons/types/categorical_score_v1.py +26 -0
  83. aeri/api/commons/types/comment.py +36 -0
  84. aeri/api/commons/types/comment_object_type.py +30 -0
  85. aeri/api/commons/types/config_category.py +15 -0
  86. aeri/api/commons/types/correction_score.py +26 -0
  87. aeri/api/commons/types/create_score_value.py +5 -0
  88. aeri/api/commons/types/dataset.py +49 -0
  89. aeri/api/commons/types/dataset_item.py +58 -0
  90. aeri/api/commons/types/dataset_run.py +63 -0
  91. aeri/api/commons/types/dataset_run_item.py +40 -0
  92. aeri/api/commons/types/dataset_run_with_items.py +19 -0
  93. aeri/api/commons/types/dataset_status.py +22 -0
  94. aeri/api/commons/types/map_value.py +11 -0
  95. aeri/api/commons/types/model.py +125 -0
  96. aeri/api/commons/types/model_price.py +14 -0
  97. aeri/api/commons/types/model_usage_unit.py +42 -0
  98. aeri/api/commons/types/numeric_score.py +17 -0
  99. aeri/api/commons/types/numeric_score_v1.py +17 -0
  100. aeri/api/commons/types/observation.py +142 -0
  101. aeri/api/commons/types/observation_level.py +30 -0
  102. aeri/api/commons/types/observation_v2.py +235 -0
  103. aeri/api/commons/types/observations_view.py +89 -0
  104. aeri/api/commons/types/pricing_tier.py +91 -0
  105. aeri/api/commons/types/pricing_tier_condition.py +68 -0
  106. aeri/api/commons/types/pricing_tier_input.py +76 -0
  107. aeri/api/commons/types/pricing_tier_operator.py +42 -0
  108. aeri/api/commons/types/score.py +201 -0
  109. aeri/api/commons/types/score_config.py +66 -0
  110. aeri/api/commons/types/score_config_data_type.py +26 -0
  111. aeri/api/commons/types/score_data_type.py +30 -0
  112. aeri/api/commons/types/score_source.py +26 -0
  113. aeri/api/commons/types/score_v1.py +131 -0
  114. aeri/api/commons/types/session.py +25 -0
  115. aeri/api/commons/types/session_with_traces.py +15 -0
  116. aeri/api/commons/types/trace.py +84 -0
  117. aeri/api/commons/types/trace_with_details.py +43 -0
  118. aeri/api/commons/types/trace_with_full_details.py +45 -0
  119. aeri/api/commons/types/usage.py +59 -0
  120. aeri/api/core/__init__.py +111 -0
  121. aeri/api/core/api_error.py +23 -0
  122. aeri/api/core/client_wrapper.py +141 -0
  123. aeri/api/core/datetime_utils.py +30 -0
  124. aeri/api/core/enum.py +20 -0
  125. aeri/api/core/file.py +70 -0
  126. aeri/api/core/force_multipart.py +18 -0
  127. aeri/api/core/http_client.py +711 -0
  128. aeri/api/core/http_response.py +55 -0
  129. aeri/api/core/http_sse/__init__.py +48 -0
  130. aeri/api/core/http_sse/_api.py +114 -0
  131. aeri/api/core/http_sse/_decoders.py +66 -0
  132. aeri/api/core/http_sse/_exceptions.py +7 -0
  133. aeri/api/core/http_sse/_models.py +17 -0
  134. aeri/api/core/jsonable_encoder.py +102 -0
  135. aeri/api/core/pydantic_utilities.py +310 -0
  136. aeri/api/core/query_encoder.py +60 -0
  137. aeri/api/core/remove_none_from_dict.py +11 -0
  138. aeri/api/core/request_options.py +35 -0
  139. aeri/api/core/serialization.py +282 -0
  140. aeri/api/dataset_items/__init__.py +52 -0
  141. aeri/api/dataset_items/client.py +499 -0
  142. aeri/api/dataset_items/raw_client.py +973 -0
  143. aeri/api/dataset_items/types/__init__.py +50 -0
  144. aeri/api/dataset_items/types/create_dataset_item_request.py +37 -0
  145. aeri/api/dataset_items/types/delete_dataset_item_response.py +17 -0
  146. aeri/api/dataset_items/types/paginated_dataset_items.py +17 -0
  147. aeri/api/dataset_run_items/__init__.py +43 -0
  148. aeri/api/dataset_run_items/client.py +323 -0
  149. aeri/api/dataset_run_items/raw_client.py +547 -0
  150. aeri/api/dataset_run_items/types/__init__.py +44 -0
  151. aeri/api/dataset_run_items/types/create_dataset_run_item_request.py +51 -0
  152. aeri/api/dataset_run_items/types/paginated_dataset_run_items.py +17 -0
  153. aeri/api/datasets/__init__.py +55 -0
  154. aeri/api/datasets/client.py +661 -0
  155. aeri/api/datasets/raw_client.py +1368 -0
  156. aeri/api/datasets/types/__init__.py +53 -0
  157. aeri/api/datasets/types/create_dataset_request.py +31 -0
  158. aeri/api/datasets/types/delete_dataset_run_response.py +14 -0
  159. aeri/api/datasets/types/paginated_dataset_runs.py +17 -0
  160. aeri/api/datasets/types/paginated_datasets.py +17 -0
  161. aeri/api/health/__init__.py +44 -0
  162. aeri/api/health/client.py +112 -0
  163. aeri/api/health/errors/__init__.py +42 -0
  164. aeri/api/health/errors/service_unavailable_error.py +13 -0
  165. aeri/api/health/raw_client.py +227 -0
  166. aeri/api/health/types/__init__.py +40 -0
  167. aeri/api/health/types/health_response.py +30 -0
  168. aeri/api/ingestion/__init__.py +169 -0
  169. aeri/api/ingestion/client.py +221 -0
  170. aeri/api/ingestion/raw_client.py +293 -0
  171. aeri/api/ingestion/types/__init__.py +169 -0
  172. aeri/api/ingestion/types/base_event.py +27 -0
  173. aeri/api/ingestion/types/create_event_body.py +14 -0
  174. aeri/api/ingestion/types/create_event_event.py +15 -0
  175. aeri/api/ingestion/types/create_generation_body.py +40 -0
  176. aeri/api/ingestion/types/create_generation_event.py +15 -0
  177. aeri/api/ingestion/types/create_observation_event.py +15 -0
  178. aeri/api/ingestion/types/create_span_body.py +19 -0
  179. aeri/api/ingestion/types/create_span_event.py +15 -0
  180. aeri/api/ingestion/types/ingestion_error.py +17 -0
  181. aeri/api/ingestion/types/ingestion_event.py +155 -0
  182. aeri/api/ingestion/types/ingestion_response.py +17 -0
  183. aeri/api/ingestion/types/ingestion_success.py +15 -0
  184. aeri/api/ingestion/types/ingestion_usage.py +8 -0
  185. aeri/api/ingestion/types/observation_body.py +53 -0
  186. aeri/api/ingestion/types/observation_type.py +54 -0
  187. aeri/api/ingestion/types/open_ai_completion_usage_schema.py +26 -0
  188. aeri/api/ingestion/types/open_ai_response_usage_schema.py +24 -0
  189. aeri/api/ingestion/types/open_ai_usage.py +28 -0
  190. aeri/api/ingestion/types/optional_observation_body.py +36 -0
  191. aeri/api/ingestion/types/score_body.py +75 -0
  192. aeri/api/ingestion/types/score_event.py +15 -0
  193. aeri/api/ingestion/types/sdk_log_body.py +14 -0
  194. aeri/api/ingestion/types/sdk_log_event.py +15 -0
  195. aeri/api/ingestion/types/trace_body.py +36 -0
  196. aeri/api/ingestion/types/trace_event.py +15 -0
  197. aeri/api/ingestion/types/update_event_body.py +14 -0
  198. aeri/api/ingestion/types/update_generation_body.py +40 -0
  199. aeri/api/ingestion/types/update_generation_event.py +15 -0
  200. aeri/api/ingestion/types/update_observation_event.py +15 -0
  201. aeri/api/ingestion/types/update_span_body.py +19 -0
  202. aeri/api/ingestion/types/update_span_event.py +15 -0
  203. aeri/api/ingestion/types/usage_details.py +10 -0
  204. aeri/api/legacy/__init__.py +61 -0
  205. aeri/api/legacy/client.py +105 -0
  206. aeri/api/legacy/metrics_v1/__init__.py +40 -0
  207. aeri/api/legacy/metrics_v1/client.py +214 -0
  208. aeri/api/legacy/metrics_v1/raw_client.py +322 -0
  209. aeri/api/legacy/metrics_v1/types/__init__.py +40 -0
  210. aeri/api/legacy/metrics_v1/types/metrics_response.py +19 -0
  211. aeri/api/legacy/observations_v1/__init__.py +43 -0
  212. aeri/api/legacy/observations_v1/client.py +523 -0
  213. aeri/api/legacy/observations_v1/raw_client.py +759 -0
  214. aeri/api/legacy/observations_v1/types/__init__.py +44 -0
  215. aeri/api/legacy/observations_v1/types/observations.py +17 -0
  216. aeri/api/legacy/observations_v1/types/observations_views.py +17 -0
  217. aeri/api/legacy/raw_client.py +13 -0
  218. aeri/api/legacy/score_v1/__init__.py +43 -0
  219. aeri/api/legacy/score_v1/client.py +329 -0
  220. aeri/api/legacy/score_v1/raw_client.py +545 -0
  221. aeri/api/legacy/score_v1/types/__init__.py +44 -0
  222. aeri/api/legacy/score_v1/types/create_score_request.py +75 -0
  223. aeri/api/legacy/score_v1/types/create_score_response.py +17 -0
  224. aeri/api/llm_connections/__init__.py +55 -0
  225. aeri/api/llm_connections/client.py +311 -0
  226. aeri/api/llm_connections/raw_client.py +541 -0
  227. aeri/api/llm_connections/types/__init__.py +53 -0
  228. aeri/api/llm_connections/types/llm_adapter.py +38 -0
  229. aeri/api/llm_connections/types/llm_connection.py +77 -0
  230. aeri/api/llm_connections/types/paginated_llm_connections.py +17 -0
  231. aeri/api/llm_connections/types/upsert_llm_connection_request.py +69 -0
  232. aeri/api/media/__init__.py +58 -0
  233. aeri/api/media/client.py +427 -0
  234. aeri/api/media/raw_client.py +739 -0
  235. aeri/api/media/types/__init__.py +56 -0
  236. aeri/api/media/types/get_media_response.py +55 -0
  237. aeri/api/media/types/get_media_upload_url_request.py +51 -0
  238. aeri/api/media/types/get_media_upload_url_response.py +28 -0
  239. aeri/api/media/types/media_content_type.py +232 -0
  240. aeri/api/media/types/patch_media_body.py +43 -0
  241. aeri/api/metrics/__init__.py +40 -0
  242. aeri/api/metrics/client.py +422 -0
  243. aeri/api/metrics/raw_client.py +530 -0
  244. aeri/api/metrics/types/__init__.py +40 -0
  245. aeri/api/metrics/types/metrics_v2response.py +19 -0
  246. aeri/api/models/__init__.py +43 -0
  247. aeri/api/models/client.py +523 -0
  248. aeri/api/models/raw_client.py +993 -0
  249. aeri/api/models/types/__init__.py +44 -0
  250. aeri/api/models/types/create_model_request.py +103 -0
  251. aeri/api/models/types/paginated_models.py +17 -0
  252. aeri/api/observations/__init__.py +43 -0
  253. aeri/api/observations/client.py +522 -0
  254. aeri/api/observations/raw_client.py +641 -0
  255. aeri/api/observations/types/__init__.py +44 -0
  256. aeri/api/observations/types/observations_v2meta.py +21 -0
  257. aeri/api/observations/types/observations_v2response.py +28 -0
  258. aeri/api/opentelemetry/__init__.py +67 -0
  259. aeri/api/opentelemetry/client.py +276 -0
  260. aeri/api/opentelemetry/raw_client.py +291 -0
  261. aeri/api/opentelemetry/types/__init__.py +65 -0
  262. aeri/api/opentelemetry/types/otel_attribute.py +27 -0
  263. aeri/api/opentelemetry/types/otel_attribute_value.py +46 -0
  264. aeri/api/opentelemetry/types/otel_resource.py +24 -0
  265. aeri/api/opentelemetry/types/otel_resource_span.py +32 -0
  266. aeri/api/opentelemetry/types/otel_scope.py +34 -0
  267. aeri/api/opentelemetry/types/otel_scope_span.py +28 -0
  268. aeri/api/opentelemetry/types/otel_span.py +76 -0
  269. aeri/api/opentelemetry/types/otel_trace_response.py +16 -0
  270. aeri/api/organizations/__init__.py +73 -0
  271. aeri/api/organizations/client.py +756 -0
  272. aeri/api/organizations/raw_client.py +1707 -0
  273. aeri/api/organizations/types/__init__.py +71 -0
  274. aeri/api/organizations/types/delete_membership_request.py +16 -0
  275. aeri/api/organizations/types/membership_deletion_response.py +17 -0
  276. aeri/api/organizations/types/membership_request.py +18 -0
  277. aeri/api/organizations/types/membership_response.py +20 -0
  278. aeri/api/organizations/types/membership_role.py +30 -0
  279. aeri/api/organizations/types/memberships_response.py +15 -0
  280. aeri/api/organizations/types/organization_api_key.py +31 -0
  281. aeri/api/organizations/types/organization_api_keys_response.py +19 -0
  282. aeri/api/organizations/types/organization_project.py +25 -0
  283. aeri/api/organizations/types/organization_projects_response.py +15 -0
  284. aeri/api/projects/__init__.py +67 -0
  285. aeri/api/projects/client.py +760 -0
  286. aeri/api/projects/raw_client.py +1577 -0
  287. aeri/api/projects/types/__init__.py +65 -0
  288. aeri/api/projects/types/api_key_deletion_response.py +18 -0
  289. aeri/api/projects/types/api_key_list.py +23 -0
  290. aeri/api/projects/types/api_key_response.py +30 -0
  291. aeri/api/projects/types/api_key_summary.py +35 -0
  292. aeri/api/projects/types/organization.py +22 -0
  293. aeri/api/projects/types/project.py +34 -0
  294. aeri/api/projects/types/project_deletion_response.py +15 -0
  295. aeri/api/projects/types/projects.py +15 -0
  296. aeri/api/prompt_version/__init__.py +4 -0
  297. aeri/api/prompt_version/client.py +157 -0
  298. aeri/api/prompt_version/raw_client.py +264 -0
  299. aeri/api/prompts/__init__.py +100 -0
  300. aeri/api/prompts/client.py +550 -0
  301. aeri/api/prompts/raw_client.py +987 -0
  302. aeri/api/prompts/types/__init__.py +96 -0
  303. aeri/api/prompts/types/base_prompt.py +42 -0
  304. aeri/api/prompts/types/chat_message.py +17 -0
  305. aeri/api/prompts/types/chat_message_type.py +15 -0
  306. aeri/api/prompts/types/chat_message_with_placeholders.py +8 -0
  307. aeri/api/prompts/types/chat_prompt.py +15 -0
  308. aeri/api/prompts/types/create_chat_prompt_request.py +37 -0
  309. aeri/api/prompts/types/create_chat_prompt_type.py +15 -0
  310. aeri/api/prompts/types/create_prompt_request.py +8 -0
  311. aeri/api/prompts/types/create_text_prompt_request.py +36 -0
  312. aeri/api/prompts/types/create_text_prompt_type.py +15 -0
  313. aeri/api/prompts/types/placeholder_message.py +16 -0
  314. aeri/api/prompts/types/placeholder_message_type.py +15 -0
  315. aeri/api/prompts/types/prompt.py +58 -0
  316. aeri/api/prompts/types/prompt_meta.py +35 -0
  317. aeri/api/prompts/types/prompt_meta_list_response.py +17 -0
  318. aeri/api/prompts/types/prompt_type.py +20 -0
  319. aeri/api/prompts/types/text_prompt.py +14 -0
  320. aeri/api/scim/__init__.py +94 -0
  321. aeri/api/scim/client.py +686 -0
  322. aeri/api/scim/raw_client.py +1528 -0
  323. aeri/api/scim/types/__init__.py +92 -0
  324. aeri/api/scim/types/authentication_scheme.py +20 -0
  325. aeri/api/scim/types/bulk_config.py +22 -0
  326. aeri/api/scim/types/empty_response.py +16 -0
  327. aeri/api/scim/types/filter_config.py +17 -0
  328. aeri/api/scim/types/resource_meta.py +17 -0
  329. aeri/api/scim/types/resource_type.py +27 -0
  330. aeri/api/scim/types/resource_types_response.py +21 -0
  331. aeri/api/scim/types/schema_extension.py +17 -0
  332. aeri/api/scim/types/schema_resource.py +19 -0
  333. aeri/api/scim/types/schemas_response.py +21 -0
  334. aeri/api/scim/types/scim_email.py +16 -0
  335. aeri/api/scim/types/scim_feature_support.py +14 -0
  336. aeri/api/scim/types/scim_name.py +14 -0
  337. aeri/api/scim/types/scim_user.py +24 -0
  338. aeri/api/scim/types/scim_users_list_response.py +25 -0
  339. aeri/api/scim/types/service_provider_config.py +36 -0
  340. aeri/api/scim/types/user_meta.py +20 -0
  341. aeri/api/score_configs/__init__.py +44 -0
  342. aeri/api/score_configs/client.py +526 -0
  343. aeri/api/score_configs/raw_client.py +1012 -0
  344. aeri/api/score_configs/types/__init__.py +46 -0
  345. aeri/api/score_configs/types/create_score_config_request.py +46 -0
  346. aeri/api/score_configs/types/score_configs.py +17 -0
  347. aeri/api/score_configs/types/update_score_config_request.py +53 -0
  348. aeri/api/scores/__init__.py +76 -0
  349. aeri/api/scores/client.py +420 -0
  350. aeri/api/scores/raw_client.py +656 -0
  351. aeri/api/scores/types/__init__.py +76 -0
  352. aeri/api/scores/types/get_scores_response.py +17 -0
  353. aeri/api/scores/types/get_scores_response_data.py +211 -0
  354. aeri/api/scores/types/get_scores_response_data_boolean.py +15 -0
  355. aeri/api/scores/types/get_scores_response_data_categorical.py +15 -0
  356. aeri/api/scores/types/get_scores_response_data_correction.py +15 -0
  357. aeri/api/scores/types/get_scores_response_data_numeric.py +15 -0
  358. aeri/api/scores/types/get_scores_response_trace_data.py +38 -0
  359. aeri/api/sessions/__init__.py +40 -0
  360. aeri/api/sessions/client.py +262 -0
  361. aeri/api/sessions/raw_client.py +500 -0
  362. aeri/api/sessions/types/__init__.py +40 -0
  363. aeri/api/sessions/types/paginated_sessions.py +17 -0
  364. aeri/api/trace/__init__.py +44 -0
  365. aeri/api/trace/client.py +728 -0
  366. aeri/api/trace/raw_client.py +1208 -0
  367. aeri/api/trace/types/__init__.py +46 -0
  368. aeri/api/trace/types/delete_trace_response.py +14 -0
  369. aeri/api/trace/types/sort.py +14 -0
  370. aeri/api/trace/types/traces.py +17 -0
  371. aeri/api/utils/__init__.py +44 -0
  372. aeri/api/utils/pagination/__init__.py +40 -0
  373. aeri/api/utils/pagination/types/__init__.py +40 -0
  374. aeri/api/utils/pagination/types/meta_response.py +38 -0
  375. aeri/batch_evaluation.py +1643 -0
  376. aeri/experiment.py +1044 -0
  377. aeri/langchain/CallbackHandler.py +1377 -0
  378. aeri/langchain/__init__.py +5 -0
  379. aeri/langchain/utils.py +212 -0
  380. aeri/logger.py +28 -0
  381. aeri/media.py +352 -0
  382. aeri/model.py +477 -0
  383. aeri/openai.py +1124 -0
  384. aeri/py.typed +0 -0
  385. aeri/span_filter.py +17 -0
  386. aeri/types.py +79 -0
  387. aeri/version.py +3 -0
  388. aeri_python-4.0.0.dist-info/METADATA +51 -0
  389. aeri_python-4.0.0.dist-info/RECORD +391 -0
  390. aeri_python-4.0.0.dist-info/WHEEL +4 -0
  391. aeri_python-4.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1643 @@
1
+ """Batch evaluation functionality for Aeri.
2
+
3
+ This module provides comprehensive batch evaluation capabilities for running evaluations
4
+ on traces and observations fetched from Aeri. It includes type definitions,
5
+ protocols, result classes, and the implementation for large-scale evaluation workflows
6
+ with error handling, retry logic, and resume capability.
7
+ """
8
+
9
+ import asyncio
10
+ import json
11
+ import time
12
+ from typing import (
13
+ TYPE_CHECKING,
14
+ Any,
15
+ Awaitable,
16
+ Dict,
17
+ List,
18
+ Optional,
19
+ Protocol,
20
+ Set,
21
+ Tuple,
22
+ Union,
23
+ cast,
24
+ )
25
+
26
+ from aeri.api import (
27
+ ObservationsView,
28
+ TraceWithFullDetails,
29
+ )
30
+ from aeri.experiment import Evaluation, EvaluatorFunction
31
+ from aeri.logger import aeri_logger as logger
32
+
33
+ if TYPE_CHECKING:
34
+ from aeri._client.client import Aeri
35
+
36
+
37
class EvaluatorInputs:
    """Input data structure for evaluators, returned by mapper functions.

    This class provides a strongly-typed container for transforming API response
    objects (traces, observations) into the standardized format expected
    by evaluator functions. It ensures consistent access to input, output, expected
    output, and metadata regardless of the source entity type.

    Attributes:
        input: The input data that was provided to generate the output being evaluated.
            For traces, this might be the initial prompt or request. For observations,
            this could be the span's input. The exact meaning depends on your use case.
        output: The actual output that was produced and needs to be evaluated.
            For traces, this is typically the final response. For observations,
            this might be the generation output or span result.
        expected_output: Optional ground truth or expected result for comparison.
            Used by evaluators to assess correctness. May be None if no ground truth
            is available for the entity being evaluated.
        metadata: Optional structured metadata providing additional context for evaluation.
            Can include information about the entity, execution context, user attributes,
            or any other relevant data that evaluators might use.

    Examples:
        Simple mapper for traces:
        ```python
        from aeri import EvaluatorInputs

        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,  # No ground truth available
                metadata={"user_id": trace.user_id, "tags": trace.tags}
            )
        ```

        Mapper for observations extracting specific fields:
        ```python
        def observation_mapper(observation):
            # Extract input/output from observation's data
            input_data = observation.input if hasattr(observation, 'input') else None
            output_data = observation.output if hasattr(observation, 'output') else None

            return EvaluatorInputs(
                input=input_data,
                output=output_data,
                expected_output=None,
                metadata={
                    "observation_type": observation.type,
                    "model": observation.model,
                    "latency_ms": observation.end_time - observation.start_time
                }
            )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        input: Any,
        output: Any,
        expected_output: Any = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Initialize EvaluatorInputs with the provided data.

        Args:
            input: The input data for evaluation.
            output: The output data to be evaluated.
            expected_output: Optional ground truth for comparison.
            metadata: Optional additional context for evaluation.

        Note:
            All arguments must be provided as keywords.
        """
        self.input = input
        self.output = output
        self.expected_output = expected_output
        self.metadata = metadata
+
122
class MapperFunction(Protocol):
    """Structural interface for batch-evaluation mapper functions.

    A mapper adapts a raw API response object (a trace or an observation) into
    the uniform ``EvaluatorInputs`` shape that evaluator functions consume. By
    supplying your own mapper you decide which entity fields become the
    evaluation input, output, expected output, and metadata.

    Implementations must:
    - accept a single keyword-only ``item`` (trace or observation),
    - return an ``EvaluatorInputs`` instance,
    - tolerate missing or malformed entity data,
    - and may be either sync or async (returning an awaitable).
    """

    def __call__(
        self,
        *,
        item: Union["TraceWithFullDetails", "ObservationsView"],
        **kwargs: Dict[str, Any],
    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
        """Convert one API response object into evaluator inputs.

        Args:
            item: The entity to transform. Its concrete type follows the
                evaluation scope:
                - ``TraceWithFullDetails`` when traces are being evaluated.
                - ``ObservationsView`` when observations are being evaluated.

        Returns:
            EvaluatorInputs: A container holding:
                - input: the data that produced the output under evaluation,
                - output: the result to be scored,
                - expected_output: optional ground truth for comparison,
                - metadata: optional extra context.

            May be returned directly, or as an awaitable when the mapper is
            async (e.g. it needs to fetch supplementary data first).

        Examples:
            Basic trace mapper:
            ```python
            def map_trace(trace):
                return EvaluatorInputs(
                    input=trace.input,
                    output=trace.output,
                    expected_output=None,
                    metadata={"trace_id": trace.id, "user": trace.user_id}
                )
            ```

            Observation mapper with conditional logic:
            ```python
            def map_observation(observation):
                # Choose source fields based on the observation type
                if observation.type == "GENERATION":
                    input_data = observation.input
                    output_data = observation.output
                else:
                    # Other observation types carry the data in metadata
                    input_data = observation.metadata.get("input")
                    output_data = observation.metadata.get("output")

                return EvaluatorInputs(
                    input=input_data,
                    output=output_data,
                    expected_output=None,
                    metadata={"obs_id": observation.id, "type": observation.type}
                )
            ```

            Async mapper (when extra processing is required):
            ```python
            async def map_trace_async(trace):
                processed_output = await some_async_transformation(trace.output)

                return EvaluatorInputs(
                    input=trace.input,
                    output=processed_output,
                    expected_output=None,
                    metadata={"trace_id": trace.id}
                )
            ```
        """
        ...
+
213
+ class CompositeEvaluatorFunction(Protocol):
214
+ """Protocol defining the interface for composite evaluator functions.
215
+
216
+ Composite evaluators create aggregate scores from multiple item-level evaluations.
217
+ This is commonly used to compute weighted averages, combined metrics, or other
218
+ composite assessments based on individual evaluation results.
219
+
220
+ Composite evaluators:
221
+ - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
222
+ plus the list of evaluations
223
+ - Return either a single Evaluation, a list of Evaluations, or a dict
224
+ - Can be either synchronous or asynchronous
225
+ - Have access to both raw item data and evaluation results
226
+ """
227
+
228
+ def __call__(
229
+ self,
230
+ *,
231
+ input: Optional[Any] = None,
232
+ output: Optional[Any] = None,
233
+ expected_output: Optional[Any] = None,
234
+ metadata: Optional[Dict[str, Any]] = None,
235
+ evaluations: List[Evaluation],
236
+ **kwargs: Dict[str, Any],
237
+ ) -> Union[
238
+ Evaluation,
239
+ List[Evaluation],
240
+ Dict[str, Any],
241
+ Awaitable[Evaluation],
242
+ Awaitable[List[Evaluation]],
243
+ Awaitable[Dict[str, Any]],
244
+ ]:
245
+ r"""Create a composite evaluation from item-level evaluation results.
246
+
247
+ This method combines multiple evaluation scores into a single composite metric.
248
+ Common use cases include weighted averages, pass/fail decisions based on multiple
249
+ criteria, or custom scoring logic that considers multiple dimensions.
250
+
251
+ Args:
252
+ input: The input data that was provided to the system being evaluated.
253
+ output: The output generated by the system being evaluated.
254
+ expected_output: The expected/reference output for comparison (if available).
255
+ metadata: Additional metadata about the evaluation context.
256
+ evaluations: List of evaluation results from item-level evaluators.
257
+ Each evaluation contains name, value, comment, and metadata.
258
+
259
+ Returns:
260
+ Can return any of:
261
+ - Evaluation: A single composite evaluation result
262
+ - List[Evaluation]: Multiple composite evaluations
263
+ - Dict: A dict that will be converted to an Evaluation
264
+ - name: Identifier for the composite metric (e.g., "composite_score")
265
+ - value: The computed composite value
266
+ - comment: Optional explanation of how the score was computed
267
+ - metadata: Optional details about the composition logic
268
+
269
+ Can return either a direct Evaluation instance or an awaitable
270
+ (for async composite evaluators).
271
+
272
+ Examples:
273
+ Simple weighted average:
274
+ ```python
275
+ def weighted_composite(*, input, output, expected_output, metadata, evaluations):
276
+ weights = {
277
+ "accuracy": 0.5,
278
+ "relevance": 0.3,
279
+ "safety": 0.2
280
+ }
281
+
282
+ total_score = 0.0
283
+ total_weight = 0.0
284
+
285
+ for eval in evaluations:
286
+ if eval.name in weights and isinstance(eval.value, (int, float)):
287
+ total_score += eval.value * weights[eval.name]
288
+ total_weight += weights[eval.name]
289
+
290
+ final_score = total_score / total_weight if total_weight > 0 else 0.0
291
+
292
+ return Evaluation(
293
+ name="composite_score",
294
+ value=final_score,
295
+ comment=f"Weighted average of {len(evaluations)} metrics"
296
+ )
297
+ ```
298
+
299
+ Pass/fail composite based on thresholds:
300
+ ```python
301
+ def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
302
+ # Must pass all criteria
303
+ thresholds = {
304
+ "accuracy": 0.7,
305
+ "safety": 0.9,
306
+ "relevance": 0.6
307
+ }
308
+
309
+ passes = True
310
+ failing_metrics = []
311
+
312
+ for metric, threshold in thresholds.items():
313
+ eval_result = next((e for e in evaluations if e.name == metric), None)
314
+ if eval_result and isinstance(eval_result.value, (int, float)):
315
+ if eval_result.value < threshold:
316
+ passes = False
317
+ failing_metrics.append(metric)
318
+
319
+ return Evaluation(
320
+ name="passes_all_checks",
321
+ value=passes,
322
+ comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
323
+ data_type="BOOLEAN"
324
+ )
325
+ ```
326
+
327
+ Async composite with external scoring:
328
+ ```python
329
+ async def llm_composite(*, input, output, expected_output, metadata, evaluations):
330
+ # Use LLM to synthesize multiple evaluation results
331
+ eval_summary = "\n".join(
332
+ f"- {e.name}: {e.value}" for e in evaluations
333
+ )
334
+
335
+ prompt = f"Given these evaluation scores:\n{eval_summary}\n"
336
+ prompt += f"For the output: {output}\n"
337
+ prompt += "Provide an overall quality score from 0-1."
338
+
339
+ response = await openai.chat.completions.create(
340
+ model="gpt-4",
341
+ messages=[{"role": "user", "content": prompt}]
342
+ )
343
+
344
+ score = float(response.choices[0].message.content.strip())
345
+
346
+ return Evaluation(
347
+ name="llm_composite_score",
348
+ value=score,
349
+ comment="LLM-synthesized composite score"
350
+ )
351
+ ```
352
+
353
+ Context-aware composite:
354
+ ```python
355
+ def context_composite(*, input, output, expected_output, metadata, evaluations):
356
+ # Adjust weighting based on metadata
357
+ base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
358
+
359
+ # If metadata indicates high importance, prioritize accuracy
360
+ if metadata and metadata.get('importance') == 'high':
361
+ weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
362
+ else:
363
+ weights = base_weights
364
+
365
+ total = sum(
366
+ e.value * weights.get(e.name, 0)
367
+ for e in evaluations
368
+ if isinstance(e.value, (int, float))
369
+ )
370
+
371
+ return Evaluation(
372
+ name="weighted_composite",
373
+ value=total,
374
+ comment="Context-aware weighted composite"
375
+ )
376
+ ```
377
+ """
378
+ ...
379
+
380
+
381
class EvaluatorStats:
    """Statistics for a single evaluator's performance during batch evaluation.

    Tracks how one evaluator behaved across every item in a batch evaluation
    run: invocation counts, the success/failure split, and how many scores it
    produced. Useful for spotting unreliable evaluators and tuning pipelines.

    Attributes:
        name: The name of the evaluator function (extracted from ``__name__``).
        total_runs: Total number of times the evaluator was invoked.
        successful_runs: Number of invocations that completed successfully.
        failed_runs: Number of invocations that raised an exception or failed.
        total_scores_created: Total evaluation scores created by this evaluator.
            Can exceed ``successful_runs`` when a single run returns multiple
            scores.

    Examples:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            print(f"Evaluator: {stats.name}")
            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")
            if stats.failed_runs > 0:
                print(f"  ⚠️ Failed {stats.failed_runs} times")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        name: str,
        total_runs: int = 0,
        successful_runs: int = 0,
        failed_runs: int = 0,
        total_scores_created: int = 0,
    ):
        """Initialize EvaluatorStats with the provided metrics.

        Args:
            name: The evaluator function name.
            total_runs: Total number of evaluator invocations.
            successful_runs: Number of successful completions.
            failed_runs: Number of failures.
            total_scores_created: Total scores created by this evaluator.

        Note:
            All arguments must be provided as keywords.
        """
        self.name = name
        self.total_runs = total_runs
        self.successful_runs = successful_runs
        self.failed_runs = failed_runs
        self.total_scores_created = total_scores_created

    def __repr__(self) -> str:
        """Return a debug-friendly representation of the stats."""
        return (
            f"{type(self).__name__}(name={self.name!r}, "
            f"total_runs={self.total_runs}, "
            f"successful_runs={self.successful_runs}, "
            f"failed_runs={self.failed_runs}, "
            f"total_scores_created={self.total_scores_created})"
        )
452
+
453
+
454
class BatchEvaluationResumeToken:
    """Token for resuming a failed or interrupted batch evaluation run.

    Encapsulates everything needed to continue a batch evaluation that stopped
    partway through. Resumption is timestamp-based: the stored timestamp is
    used to build a filter that only fetches items created after the last
    successfully processed item, so already-evaluated items are not
    re-processed even if the underlying dataset changed between runs.

    Attributes:
        scope: The type of items being evaluated ("traces", "observations").
        filter: The original JSON filter string used to query items.
        last_processed_timestamp: ISO 8601 timestamp of the last successfully
            processed item.
        last_processed_id: The ID of the last successfully processed item,
            kept for reference.
        items_processed: Count of items successfully processed before
            interruption.

    Examples:
        Persisting and restoring a token across runs:
        ```python
        result = client.run_batched_evaluation(...)

        if not result.completed and result.resume_token:
            # Persist the token (e.g. to a file or database)...
            import json
            with open("resume_token.json", "w") as f:
                json.dump(result.resume_token.to_dict(), f)

        # ...and later restore it to continue the evaluation.
        with open("resume_token.json") as f:
            token = BatchEvaluationResumeToken.from_dict(json.load(f))

        result = client.run_batched_evaluation(
            scope=token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=token,
        )
        print(f"Processed {result.total_items_processed} additional items")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
        Items created after the initial run but before the stored timestamp are
        intentionally skipped to avoid duplicate evaluations.
    """

    def __init__(
        self,
        *,
        scope: str,
        filter: Optional[str],
        last_processed_timestamp: str,
        last_processed_id: str,
        items_processed: int,
    ):
        """Initialize BatchEvaluationResumeToken with the provided state.

        Args:
            scope: The scope type ("traces", "observations").
            filter: The original JSON filter string (None if no filter was used).
            last_processed_timestamp: ISO 8601 timestamp of last processed item.
            last_processed_id: ID of last processed item.
            items_processed: Count of items processed before interruption.

        Note:
            All arguments must be provided as keywords.
        """
        self.scope = scope
        self.filter = filter
        self.last_processed_timestamp = last_processed_timestamp
        self.last_processed_id = last_processed_id
        self.items_processed = items_processed

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the token to a JSON-compatible dict (inverse of ``from_dict``)."""
        return {
            "scope": self.scope,
            "filter": self.filter,
            "last_processed_timestamp": self.last_processed_timestamp,
            "last_processed_id": self.last_processed_id,
            "items_processed": self.items_processed,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "BatchEvaluationResumeToken":
        """Reconstruct a token from a dict produced by ``to_dict``.

        Args:
            data: Mapping with the keys emitted by ``to_dict``. The "filter"
                key is optional and defaults to None.

        Returns:
            A new BatchEvaluationResumeToken.

        Raises:
            KeyError: If a required key is missing from ``data``.
        """
        return cls(
            scope=data["scope"],
            filter=data.get("filter"),
            last_processed_timestamp=data["last_processed_timestamp"],
            last_processed_id=data["last_processed_id"],
            items_processed=data["items_processed"],
        )

    def __repr__(self) -> str:
        """Return a debug-friendly representation of the token."""
        return (
            f"{type(self).__name__}(scope={self.scope!r}, "
            f"last_processed_timestamp={self.last_processed_timestamp!r}, "
            f"last_processed_id={self.last_processed_id!r}, "
            f"items_processed={self.items_processed})"
        )
574
+
575
+
576
class BatchEvaluationResult:
    """Complete result structure for batch evaluation execution.

    Encapsulates comprehensive statistics and metadata about a batch
    evaluation run: item counts, per-evaluator metrics, timing information,
    error details, and (for interrupted runs) a resume token.

    Attributes:
        total_items_fetched: Total number of items fetched from the API.
        total_items_processed: Number of items successfully evaluated.
        total_items_failed: Number of items that failed during evaluation.
        total_scores_created: Total scores created by all item-level evaluators.
        total_composite_scores_created: Scores created by the composite evaluator.
        total_evaluations_failed: Number of individual evaluator failures across all items.
        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
        resume_token: Token for resuming if evaluation was interrupted (None if completed).
        completed: True if all items were processed, False if stopped early or failed.
        duration_seconds: Total time taken to execute the batch evaluation.
        failed_item_ids: List of IDs for items that failed evaluation.
        error_summary: Dictionary mapping error type names to occurrence counts.
        has_more_items: True if the max_items limit was reached but more items exist.
        item_evaluations: Dictionary mapping item IDs to their evaluation results
            (both regular and composite).

    Examples:
        Basic result inspection:
        ```python
        result = client.run_batched_evaluation(...)

        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
        print(f"Scores created: {result.total_scores_created}")
        print(f"Duration: {result.duration_seconds:.2f}s")
        ```

        Handling incomplete runs:
        ```python
        result = client.run_batched_evaluation(...)

        if not result.completed and result.resume_token:
            # Continue from where the previous run stopped.
            result = client.run_batched_evaluation(
                scope=result.resume_token.scope,
                mapper=my_mapper,
                evaluators=my_evaluators,
                resume_from=result.resume_token,
            )

        if result.has_more_items:
            print("More items available beyond max_items limit")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        total_items_fetched: int,
        total_items_processed: int,
        total_items_failed: int,
        total_scores_created: int,
        total_composite_scores_created: int,
        total_evaluations_failed: int,
        # Forward references quoted (like "Evaluation" below) so the
        # annotations do not depend on definition order at class-creation time.
        evaluator_stats: List["EvaluatorStats"],
        resume_token: Optional["BatchEvaluationResumeToken"],
        completed: bool,
        duration_seconds: float,
        failed_item_ids: List[str],
        error_summary: Dict[str, int],
        has_more_items: bool,
        item_evaluations: Dict[str, List["Evaluation"]],
    ):
        """Initialize BatchEvaluationResult with comprehensive statistics.

        Args:
            total_items_fetched: Total items fetched from API.
            total_items_processed: Items successfully evaluated.
            total_items_failed: Items that failed evaluation.
            total_scores_created: Scores from item-level evaluators.
            total_composite_scores_created: Scores from composite evaluator.
            total_evaluations_failed: Individual evaluator failures.
            evaluator_stats: Per-evaluator statistics.
            resume_token: Token for resuming (None if completed).
            completed: Whether all items were processed.
            duration_seconds: Total execution time.
            failed_item_ids: IDs of failed items.
            error_summary: Error types and counts.
            has_more_items: Whether more items exist beyond max_items.
            item_evaluations: Dictionary mapping item IDs to their evaluation results.

        Note:
            All arguments must be provided as keywords.
        """
        self.total_items_fetched = total_items_fetched
        self.total_items_processed = total_items_processed
        self.total_items_failed = total_items_failed
        self.total_scores_created = total_scores_created
        self.total_composite_scores_created = total_composite_scores_created
        self.total_evaluations_failed = total_evaluations_failed
        self.evaluator_stats = evaluator_stats
        self.resume_token = resume_token
        self.completed = completed
        self.duration_seconds = duration_seconds
        self.failed_item_ids = failed_item_ids
        self.error_summary = error_summary
        self.has_more_items = has_more_items
        self.item_evaluations = item_evaluations

    def __str__(self) -> str:
        """Return a formatted string representation of the batch evaluation results.

        Returns:
            A multi-line string with a summary of the evaluation results.
        """
        lines = []
        lines.append("=" * 60)
        lines.append("Batch Evaluation Results")
        lines.append("=" * 60)

        # Summary statistics
        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
        lines.append(f"Duration: {self.duration_seconds:.2f}s")
        lines.append(f"\nItems fetched: {self.total_items_fetched}")
        lines.append(f"Items processed: {self.total_items_processed}")

        if self.total_items_failed > 0:
            lines.append(f"Items failed: {self.total_items_failed}")

        # Success rate (guard against division by zero when nothing was fetched)
        if self.total_items_fetched > 0:
            success_rate = self.total_items_processed / self.total_items_fetched * 100
            lines.append(f"Success rate: {success_rate:.1f}%")

        # Scores created
        lines.append(f"\nScores created: {self.total_scores_created}")
        if self.total_composite_scores_created > 0:
            lines.append(f"Composite scores: {self.total_composite_scores_created}")

        total_scores = self.total_scores_created + self.total_composite_scores_created
        lines.append(f"Total scores: {total_scores}")

        # Evaluator statistics
        if self.evaluator_stats:
            lines.append("\nEvaluator Performance:")
            for stats in self.evaluator_stats:
                lines.append(f"  {stats.name}:")
                if stats.total_runs > 0:
                    # total_runs > 0 is guaranteed by the guard above, so
                    # divide directly (the former inner conditional was dead code).
                    success_rate = stats.successful_runs / stats.total_runs * 100
                    lines.append(
                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
                        f"({success_rate:.1f}% success)"
                    )
                    lines.append(f"    Scores created: {stats.total_scores_created}")
                    if stats.failed_runs > 0:
                        lines.append(f"    Failed runs: {stats.failed_runs}")

        # Performance metrics
        if self.total_items_processed > 0 and self.duration_seconds > 0:
            items_per_sec = self.total_items_processed / self.duration_seconds
            lines.append("\nPerformance:")
            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
            if self.total_scores_created > 0:
                avg_scores = self.total_scores_created / self.total_items_processed
                lines.append(f"  Avg scores per item: {avg_scores:.2f}")

        # Errors and warnings
        if self.error_summary:
            lines.append("\nErrors encountered:")
            for error_type, count in self.error_summary.items():
                lines.append(f"  {error_type}: {count}")

        # Incomplete run information
        if not self.completed:
            lines.append("\nWarning: Evaluation incomplete")
            if self.resume_token:
                lines.append(
                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
                )
                lines.append(f"  Items processed: {self.resume_token.items_processed}")
                lines.append("  Use resume_from parameter to continue")

        if self.has_more_items:
            lines.append("\nNote: More items available beyond max_items limit")

        lines.append("=" * 60)
        return "\n".join(lines)
814
+
815
+
816
+ class BatchEvaluationRunner:
817
+ """Handles batch evaluation execution for a Aeri client.
818
+
819
+ This class encapsulates all the logic for fetching items, running evaluators,
820
+ creating scores, and managing the evaluation lifecycle. It provides a clean
821
+ separation of concerns from the main Aeri client class.
822
+
823
+ The runner uses a streaming/pipeline approach to process items in batches,
824
+ avoiding loading the entire dataset into memory. This makes it suitable for
825
+ evaluating large numbers of items.
826
+
827
+ Attributes:
828
+ client: The Aeri client instance used for API calls and score creation.
829
+ """
830
+
831
+ def __init__(self, client: "Aeri"):
832
+ """Initialize the batch evaluation runner.
833
+
834
+ Args:
835
+ client: The Aeri client instance.
836
+ """
837
+ self.client = client
838
+
839
    async def run_async(
        self,
        *,
        scope: str,
        mapper: MapperFunction,
        evaluators: List[EvaluatorFunction],
        filter: Optional[str] = None,
        fetch_batch_size: int = 50,
        fetch_trace_fields: Optional[str] = "io",
        max_items: Optional[int] = None,
        max_concurrency: int = 5,
        composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
        metadata: Optional[Dict[str, Any]] = None,
        _add_observation_scores_to_trace: bool = False,
        _additional_trace_tags: Optional[List[str]] = None,
        max_retries: int = 3,
        verbose: bool = False,
        resume_from: Optional[BatchEvaluationResumeToken] = None,
    ) -> BatchEvaluationResult:
        """Run batch evaluation asynchronously.

        This is the main implementation method that orchestrates the entire batch
        evaluation process: fetching items, mapping, evaluating, creating scores,
        and tracking statistics. Items are fetched page by page and each page is
        evaluated concurrently (bounded by ``max_concurrency``), so the full
        dataset is never held in memory at once.

        Args:
            scope: The type of items to evaluate ("traces", "observations").
            mapper: Function to transform API response items to evaluator inputs.
            evaluators: List of evaluation functions to run on each item.
            filter: JSON filter string for querying items.
            fetch_batch_size: Number of items to fetch per API call.
            fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'. Default: 'io'
            max_items: Maximum number of items to process (None = all).
            max_concurrency: Maximum number of concurrent evaluations.
            composite_evaluator: Optional function to create composite scores.
            metadata: Metadata to add to all created scores.
            _add_observation_scores_to_trace: Private option to duplicate
                observation-level scores onto the parent trace.
            _additional_trace_tags: Private option to add tags on traces via
                ingestion trace-create events.
            max_retries: Maximum retries for failed batch fetches.
            verbose: If True, log progress to console.
            resume_from: Resume token from a previous failed run.

        Returns:
            BatchEvaluationResult with comprehensive statistics. If fetching
            fails after all retries, the result carries ``completed=False`` and
            a resume token instead of raising.
        """
        start_time = time.time()

        # Initialize tracking variables
        total_items_fetched = 0
        total_items_processed = 0
        total_items_failed = 0
        total_scores_created = 0
        total_composite_scores_created = 0
        total_evaluations_failed = 0
        failed_item_ids: List[str] = []
        error_summary: Dict[str, int] = {}
        item_evaluations: Dict[str, List[Evaluation]] = {}

        # Initialize evaluator stats, keyed by the evaluator function's __name__
        evaluator_stats_dict = {
            getattr(evaluator, "__name__", "unknown_evaluator"): EvaluatorStats(
                name=getattr(evaluator, "__name__", "unknown_evaluator")
            )
            for evaluator in evaluators
        }

        # Handle resume token by modifying filter: the effective filter narrows
        # the query to items after resume_from's timestamp, while the caller's
        # original `filter` is kept untouched for any new resume token below.
        effective_filter = self._build_timestamp_filter(filter, resume_from)
        normalized_additional_trace_tags = (
            self._dedupe_tags(_additional_trace_tags)
            if _additional_trace_tags is not None
            else []
        )
        # Traces already tagged in this run; prevents duplicate tag ingestion.
        updated_trace_ids: Set[str] = set()

        # Create semaphore for concurrency control
        semaphore = asyncio.Semaphore(max_concurrency)

        # Pagination state
        page = 1
        has_more = True
        last_item_timestamp: Optional[str] = None
        last_item_id: Optional[str] = None

        if verbose:
            logger.info(f"Starting batch evaluation on {scope}")
            if scope == "traces" and fetch_trace_fields:
                logger.info(f"Fetching trace fields: {fetch_trace_fields}")
            if resume_from:
                logger.info(
                    f"Resuming from {resume_from.last_processed_timestamp} "
                    f"({resume_from.items_processed} items already processed)"
                )

        # Main pagination loop
        while has_more:
            # Check if we've reached max_items
            if max_items is not None and total_items_fetched >= max_items:
                if verbose:
                    logger.info(f"Reached max_items limit ({max_items})")
                has_more = True  # More items may exist
                break

            # Fetch next batch with retry logic
            try:
                items = await self._fetch_batch_with_retry(
                    scope=scope,
                    filter=effective_filter,
                    page=page,
                    limit=fetch_batch_size,
                    max_retries=max_retries,
                    fields=fetch_trace_fields,
                )
            except Exception as e:
                # Failed after max_retries - create resume token and return
                # a partial result instead of propagating the exception.
                error_msg = f"Failed to fetch batch after {max_retries} retries"
                logger.error(f"{error_msg}: {e}")

                resume_token = BatchEvaluationResumeToken(
                    scope=scope,
                    filter=filter,  # Original filter, not modified
                    last_processed_timestamp=last_item_timestamp or "",
                    last_processed_id=last_item_id or "",
                    items_processed=total_items_processed,
                )

                return self._build_result(
                    total_items_fetched=total_items_fetched,
                    total_items_processed=total_items_processed,
                    total_items_failed=total_items_failed,
                    total_scores_created=total_scores_created,
                    total_composite_scores_created=total_composite_scores_created,
                    total_evaluations_failed=total_evaluations_failed,
                    evaluator_stats_dict=evaluator_stats_dict,
                    resume_token=resume_token,
                    completed=False,
                    start_time=start_time,
                    failed_item_ids=failed_item_ids,
                    error_summary=error_summary,
                    has_more_items=has_more,
                    item_evaluations=item_evaluations,
                )

            # Check if we got any items
            if not items:
                has_more = False
                if verbose:
                    logger.info("No more items to fetch")
                break

            total_items_fetched += len(items)

            if verbose:
                logger.info(f"Fetched batch {page} ({len(items)} items)")

            # Limit items if max_items would be exceeded
            items_to_process = items
            if max_items is not None:
                # NOTE(review): capacity is measured against items *processed*,
                # while the fetch guards use items *fetched* — confirm this
                # asymmetry is intended when some items fail evaluation.
                remaining_capacity = max_items - total_items_processed
                if len(items) > remaining_capacity:
                    items_to_process = items[:remaining_capacity]
                    if verbose:
                        logger.info(
                            f"Limiting batch to {len(items_to_process)} items "
                            f"to respect max_items={max_items}"
                        )

            # Process items concurrently
            async def process_item(
                item: Union[TraceWithFullDetails, ObservationsView],
            ) -> Tuple[str, Union[Tuple[int, int, int, List[Evaluation]], Exception]]:
                """Process a single item and return (item_id, result).

                Exceptions are captured and returned as the result value, so
                the asyncio.gather below never raises for a failed item.
                """
                async with semaphore:
                    item_id = self._get_item_id(item, scope)
                    try:
                        result = await self._process_batch_evaluation_item(
                            item=item,
                            scope=scope,
                            mapper=mapper,
                            evaluators=evaluators,
                            composite_evaluator=composite_evaluator,
                            metadata=metadata,
                            _add_observation_scores_to_trace=_add_observation_scores_to_trace,
                            evaluator_stats_dict=evaluator_stats_dict,
                        )
                        return (item_id, result)
                    except Exception as e:
                        return (item_id, e)

            # Run all items in batch concurrently
            tasks = [process_item(item) for item in items_to_process]
            results = await asyncio.gather(*tasks)

            # Process results and update statistics
            for item, (item_id, result) in zip(items_to_process, results):
                if isinstance(result, Exception):
                    # Item processing failed
                    total_items_failed += 1
                    failed_item_ids.append(item_id)
                    error_type = type(result).__name__
                    error_summary[error_type] = error_summary.get(error_type, 0) + 1
                    logger.warning(f"Item {item_id} failed: {result}")
                else:
                    # Item processed successfully
                    total_items_processed += 1
                    scores_created, composite_created, evals_failed, evaluations = (
                        result
                    )
                    total_scores_created += scores_created
                    total_composite_scores_created += composite_created
                    total_evaluations_failed += evals_failed

                    # Store evaluations for this item
                    item_evaluations[item_id] = evaluations

                    if normalized_additional_trace_tags:
                        # For observations, tag the parent trace.
                        trace_id = (
                            item_id
                            if scope == "traces"
                            else cast(ObservationsView, item).trace_id
                        )

                        if trace_id and trace_id not in updated_trace_ids:
                            self.client._create_trace_tags_via_ingestion(
                                trace_id=trace_id,
                                tags=normalized_additional_trace_tags,
                            )
                            updated_trace_ids.add(trace_id)

                # Update last processed tracking. This runs for failed items
                # too, so a later resume advances past them rather than
                # retrying them.
                last_item_timestamp = self._get_item_timestamp(item, scope)
                last_item_id = item_id

            if verbose:
                if max_items is not None and max_items > 0:
                    progress_pct = total_items_processed / max_items * 100
                    logger.info(
                        f"Progress: {total_items_processed}/{max_items} items "
                        f"({progress_pct:.1f}%), {total_scores_created} scores created"
                    )
                else:
                    logger.info(
                        f"Progress: {total_items_processed} items processed, "
                        f"{total_scores_created} scores created"
                    )

            # Check if we should continue to next page
            if len(items) < fetch_batch_size:
                # Last page - no more items available
                has_more = False
            else:
                page += 1

                # Check max_items again before next fetch
                if max_items is not None and total_items_fetched >= max_items:
                    has_more = True  # More items exist but we're stopping
                    break

        # Flush all scores to Aeri
        if verbose:
            logger.info("Flushing scores to Aeri...")
        self.client.flush()

        # Build final result
        duration = time.time() - start_time

        if verbose:
            logger.info(
                f"Batch evaluation complete: {total_items_processed} items processed "
                f"in {duration:.2f}s"
            )

        # Completed successfully if we either:
        # 1. Ran out of items (has_more is False), OR
        # 2. Hit max_items limit (intentionally stopped)
        completed_successfully = not has_more or (
            max_items is not None and total_items_fetched >= max_items
        )

        return self._build_result(
            total_items_fetched=total_items_fetched,
            total_items_processed=total_items_processed,
            total_items_failed=total_items_failed,
            total_scores_created=total_scores_created,
            total_composite_scores_created=total_composite_scores_created,
            total_evaluations_failed=total_evaluations_failed,
            evaluator_stats_dict=evaluator_stats_dict,
            resume_token=None,  # No resume needed on successful completion
            completed=completed_successfully,
            start_time=start_time,
            failed_item_ids=failed_item_ids,
            error_summary=error_summary,
            has_more_items=(
                has_more and max_items is not None and total_items_fetched >= max_items
            ),
            item_evaluations=item_evaluations,
        )
1138
+
1139
+ async def _fetch_batch_with_retry(
1140
+ self,
1141
+ *,
1142
+ scope: str,
1143
+ filter: Optional[str],
1144
+ page: int,
1145
+ limit: int,
1146
+ max_retries: int,
1147
+ fields: Optional[str],
1148
+ ) -> List[Union[TraceWithFullDetails, ObservationsView]]:
1149
+ """Fetch a batch of items with retry logic.
1150
+
1151
+ Args:
1152
+ scope: The type of items ("traces", "observations").
1153
+ filter: JSON filter string for querying.
1154
+ page: Page number (1-indexed).
1155
+ limit: Number of items per page.
1156
+ max_retries: Maximum number of retry attempts.
1157
+ verbose: Whether to log retry attempts.
1158
+ fields: Trace fields to fetch
1159
+
1160
+ Returns:
1161
+ List of items from the API.
1162
+
1163
+ Raises:
1164
+ Exception: If all retry attempts fail.
1165
+ """
1166
+ if scope == "traces":
1167
+ response = self.client.api.trace.list(
1168
+ page=page,
1169
+ limit=limit,
1170
+ filter=filter,
1171
+ request_options={"max_retries": max_retries},
1172
+ fields=fields,
1173
+ ) # type: ignore
1174
+ return list(response.data) # type: ignore
1175
+ elif scope == "observations":
1176
+ response = self.client.api.legacy.observations_v1.get_many(
1177
+ page=page,
1178
+ limit=limit,
1179
+ filter=filter,
1180
+ request_options={"max_retries": max_retries},
1181
+ ) # type: ignore
1182
+ return list(response.data) # type: ignore
1183
+ else:
1184
+ error_message = f"Invalid scope: {scope}"
1185
+ raise ValueError(error_message)
1186
+
1187
+ async def _process_batch_evaluation_item(
1188
+ self,
1189
+ item: Union[TraceWithFullDetails, ObservationsView],
1190
+ scope: str,
1191
+ mapper: MapperFunction,
1192
+ evaluators: List[EvaluatorFunction],
1193
+ composite_evaluator: Optional[CompositeEvaluatorFunction],
1194
+ metadata: Optional[Dict[str, Any]],
1195
+ _add_observation_scores_to_trace: bool,
1196
+ evaluator_stats_dict: Dict[str, EvaluatorStats],
1197
+ ) -> Tuple[int, int, int, List[Evaluation]]:
1198
+ """Process a single item: map, evaluate, create scores.
1199
+
1200
+ Args:
1201
+ item: The API response object to evaluate.
1202
+ scope: The type of item ("traces", "observations").
1203
+ mapper: Function to transform item to evaluator inputs.
1204
+ evaluators: List of evaluator functions.
1205
+ composite_evaluator: Optional composite evaluator function.
1206
+ metadata: Additional metadata to add to scores.
1207
+ _add_observation_scores_to_trace: Whether to duplicate
1208
+ observation-level scores at trace level.
1209
+ evaluator_stats_dict: Dictionary tracking evaluator statistics.
1210
+
1211
+ Returns:
1212
+ Tuple of (scores_created, composite_scores_created, evaluations_failed, all_evaluations).
1213
+
1214
+ Raises:
1215
+ Exception: If mapping fails or item processing encounters fatal error.
1216
+ """
1217
+ scores_created = 0
1218
+ composite_scores_created = 0
1219
+ evaluations_failed = 0
1220
+
1221
+ # Run mapper to transform item
1222
+ evaluator_inputs = await self._run_mapper(mapper, item)
1223
+
1224
+ # Run all evaluators
1225
+ evaluations: List[Evaluation] = []
1226
+ for evaluator in evaluators:
1227
+ evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator")
1228
+ stats = evaluator_stats_dict[evaluator_name]
1229
+ stats.total_runs += 1
1230
+
1231
+ try:
1232
+ eval_results = await self._run_evaluator_internal(
1233
+ evaluator,
1234
+ input=evaluator_inputs.input,
1235
+ output=evaluator_inputs.output,
1236
+ expected_output=evaluator_inputs.expected_output,
1237
+ metadata=evaluator_inputs.metadata,
1238
+ )
1239
+
1240
+ stats.successful_runs += 1
1241
+ stats.total_scores_created += len(eval_results)
1242
+ evaluations.extend(eval_results)
1243
+
1244
+ except Exception as e:
1245
+ # Evaluator failed - log warning and continue with other evaluators
1246
+ stats.failed_runs += 1
1247
+ evaluations_failed += 1
1248
+ logger.warning(
1249
+ f"Evaluator {evaluator_name} failed on item "
1250
+ f"{self._get_item_id(item, scope)}: {e}"
1251
+ )
1252
+
1253
+ # Create scores for item-level evaluations
1254
+ item_id = self._get_item_id(item, scope)
1255
+ for evaluation in evaluations:
1256
+ scores_created += self._create_score_for_scope(
1257
+ scope=scope,
1258
+ item_id=item_id,
1259
+ trace_id=cast(ObservationsView, item).trace_id
1260
+ if scope == "observations"
1261
+ else None,
1262
+ evaluation=evaluation,
1263
+ additional_metadata=metadata,
1264
+ add_observation_score_to_trace=_add_observation_scores_to_trace,
1265
+ )
1266
+
1267
+ # Run composite evaluator if provided and we have evaluations
1268
+ if composite_evaluator and evaluations:
1269
+ try:
1270
+ composite_evals = await self._run_composite_evaluator(
1271
+ composite_evaluator,
1272
+ input=evaluator_inputs.input,
1273
+ output=evaluator_inputs.output,
1274
+ expected_output=evaluator_inputs.expected_output,
1275
+ metadata=evaluator_inputs.metadata,
1276
+ evaluations=evaluations,
1277
+ )
1278
+
1279
+ # Create scores for all composite evaluations
1280
+ for composite_eval in composite_evals:
1281
+ composite_scores_created += self._create_score_for_scope(
1282
+ scope=scope,
1283
+ item_id=item_id,
1284
+ trace_id=cast(ObservationsView, item).trace_id
1285
+ if scope == "observations"
1286
+ else None,
1287
+ evaluation=composite_eval,
1288
+ additional_metadata=metadata,
1289
+ add_observation_score_to_trace=_add_observation_scores_to_trace,
1290
+ )
1291
+
1292
+ # Add composite evaluations to the list
1293
+ evaluations.extend(composite_evals)
1294
+
1295
+ except Exception as e:
1296
+ logger.warning(f"Composite evaluator failed on item {item_id}: {e}")
1297
+
1298
+ return (
1299
+ scores_created,
1300
+ composite_scores_created,
1301
+ evaluations_failed,
1302
+ evaluations,
1303
+ )
1304
+
1305
+ async def _run_evaluator_internal(
1306
+ self,
1307
+ evaluator: EvaluatorFunction,
1308
+ **kwargs: Any,
1309
+ ) -> List[Evaluation]:
1310
+ """Run an evaluator function and normalize the result.
1311
+
1312
+ Unlike experiment._run_evaluator, this version raises exceptions
1313
+ so we can track failures in our statistics.
1314
+
1315
+ Args:
1316
+ evaluator: The evaluator function to run.
1317
+ **kwargs: Arguments to pass to the evaluator.
1318
+
1319
+ Returns:
1320
+ List of Evaluation objects.
1321
+
1322
+ Raises:
1323
+ Exception: If evaluator raises an exception (not caught).
1324
+ """
1325
+ result = evaluator(**kwargs)
1326
+
1327
+ # Handle async evaluators
1328
+ if asyncio.iscoroutine(result):
1329
+ result = await result
1330
+
1331
+ # Normalize to list
1332
+ if isinstance(result, (dict, Evaluation)):
1333
+ return [result] # type: ignore
1334
+ elif isinstance(result, list):
1335
+ return result
1336
+ else:
1337
+ return []
1338
+
1339
+ async def _run_mapper(
1340
+ self,
1341
+ mapper: MapperFunction,
1342
+ item: Union[TraceWithFullDetails, ObservationsView],
1343
+ ) -> EvaluatorInputs:
1344
+ """Run mapper function (handles both sync and async mappers).
1345
+
1346
+ Args:
1347
+ mapper: The mapper function to run.
1348
+ item: The API response object to map.
1349
+
1350
+ Returns:
1351
+ EvaluatorInputs instance.
1352
+
1353
+ Raises:
1354
+ Exception: If mapper raises an exception.
1355
+ """
1356
+ result = mapper(item=item)
1357
+ if asyncio.iscoroutine(result):
1358
+ return await result # type: ignore
1359
+ return result # type: ignore
1360
+
1361
+ async def _run_composite_evaluator(
1362
+ self,
1363
+ composite_evaluator: CompositeEvaluatorFunction,
1364
+ input: Optional[Any],
1365
+ output: Optional[Any],
1366
+ expected_output: Optional[Any],
1367
+ metadata: Optional[Dict[str, Any]],
1368
+ evaluations: List[Evaluation],
1369
+ ) -> List[Evaluation]:
1370
+ """Run composite evaluator function (handles both sync and async).
1371
+
1372
+ Args:
1373
+ composite_evaluator: The composite evaluator function.
1374
+ input: The input data provided to the system.
1375
+ output: The output generated by the system.
1376
+ expected_output: The expected/reference output.
1377
+ metadata: Additional metadata about the evaluation context.
1378
+ evaluations: List of item-level evaluations.
1379
+
1380
+ Returns:
1381
+ List of Evaluation objects (normalized from single or list return).
1382
+
1383
+ Raises:
1384
+ Exception: If composite evaluator raises an exception.
1385
+ """
1386
+ result = composite_evaluator(
1387
+ input=input,
1388
+ output=output,
1389
+ expected_output=expected_output,
1390
+ metadata=metadata,
1391
+ evaluations=evaluations,
1392
+ )
1393
+ if asyncio.iscoroutine(result):
1394
+ result = await result
1395
+
1396
+ # Normalize to list (same as regular evaluator)
1397
+ if isinstance(result, (dict, Evaluation)):
1398
+ return [result] # type: ignore
1399
+ elif isinstance(result, list):
1400
+ return result
1401
+ else:
1402
+ return []
1403
+
1404
+ def _create_score_for_scope(
1405
+ self,
1406
+ *,
1407
+ scope: str,
1408
+ item_id: str,
1409
+ trace_id: Optional[str] = None,
1410
+ evaluation: Evaluation,
1411
+ additional_metadata: Optional[Dict[str, Any]],
1412
+ add_observation_score_to_trace: bool = False,
1413
+ ) -> int:
1414
+ """Create a score linked to the appropriate entity based on scope.
1415
+
1416
+ Args:
1417
+ scope: The type of entity ("traces", "observations").
1418
+ item_id: The ID of the entity.
1419
+ trace_id: The trace ID of the entity; required if scope=observations
1420
+ evaluation: The evaluation result to create a score from.
1421
+ additional_metadata: Additional metadata to merge with evaluation metadata.
1422
+ add_observation_score_to_trace: Whether to duplicate observation
1423
+ score on parent trace as well.
1424
+
1425
+ Returns:
1426
+ Number of score events created.
1427
+ """
1428
+ # Merge metadata
1429
+ score_metadata = {
1430
+ **(evaluation.metadata or {}),
1431
+ **(additional_metadata or {}),
1432
+ }
1433
+
1434
+ if scope == "traces":
1435
+ self.client.create_score(
1436
+ trace_id=item_id,
1437
+ name=evaluation.name,
1438
+ value=evaluation.value, # type: ignore
1439
+ comment=evaluation.comment,
1440
+ metadata=score_metadata,
1441
+ data_type=evaluation.data_type, # type: ignore[arg-type]
1442
+ config_id=evaluation.config_id,
1443
+ )
1444
+ return 1
1445
+ elif scope == "observations":
1446
+ self.client.create_score(
1447
+ observation_id=item_id,
1448
+ trace_id=trace_id,
1449
+ name=evaluation.name,
1450
+ value=evaluation.value, # type: ignore
1451
+ comment=evaluation.comment,
1452
+ metadata=score_metadata,
1453
+ data_type=evaluation.data_type, # type: ignore[arg-type]
1454
+ config_id=evaluation.config_id,
1455
+ )
1456
+ score_count = 1
1457
+
1458
+ if add_observation_score_to_trace and trace_id:
1459
+ self.client.create_score(
1460
+ trace_id=trace_id,
1461
+ name=evaluation.name,
1462
+ value=evaluation.value, # type: ignore
1463
+ comment=evaluation.comment,
1464
+ metadata=score_metadata,
1465
+ data_type=evaluation.data_type, # type: ignore[arg-type]
1466
+ config_id=evaluation.config_id,
1467
+ )
1468
+ score_count += 1
1469
+
1470
+ return score_count
1471
+
1472
+ return 0
1473
+
1474
+ def _build_timestamp_filter(
1475
+ self,
1476
+ original_filter: Optional[str],
1477
+ resume_from: Optional[BatchEvaluationResumeToken],
1478
+ ) -> Optional[str]:
1479
+ """Build filter with timestamp constraint for resume capability.
1480
+
1481
+ Args:
1482
+ original_filter: The original JSON filter string.
1483
+ resume_from: Optional resume token with timestamp information.
1484
+
1485
+ Returns:
1486
+ Modified filter string with timestamp constraint, or original filter.
1487
+ """
1488
+ if not resume_from:
1489
+ return original_filter
1490
+
1491
+ # Parse original filter (should be array) or create empty array
1492
+ try:
1493
+ filter_list = json.loads(original_filter) if original_filter else []
1494
+ if not isinstance(filter_list, list):
1495
+ logger.warning(
1496
+ f"Filter should be a JSON array, got: {type(filter_list).__name__}"
1497
+ )
1498
+ filter_list = []
1499
+ except json.JSONDecodeError:
1500
+ logger.warning(
1501
+ f"Invalid JSON in original filter, ignoring: {original_filter}"
1502
+ )
1503
+ filter_list = []
1504
+
1505
+ # Add timestamp constraint to filter array
1506
+ timestamp_field = self._get_timestamp_field_for_scope(resume_from.scope)
1507
+ timestamp_filter = {
1508
+ "type": "datetime",
1509
+ "column": timestamp_field,
1510
+ "operator": ">",
1511
+ "value": resume_from.last_processed_timestamp,
1512
+ }
1513
+ filter_list.append(timestamp_filter)
1514
+
1515
+ return json.dumps(filter_list)
1516
+
1517
+ @staticmethod
1518
+ def _get_item_id(
1519
+ item: Union[TraceWithFullDetails, ObservationsView],
1520
+ scope: str,
1521
+ ) -> str:
1522
+ """Extract ID from item based on scope.
1523
+
1524
+ Args:
1525
+ item: The API response object.
1526
+ scope: The type of item.
1527
+
1528
+ Returns:
1529
+ The item's ID.
1530
+ """
1531
+ return item.id
1532
+
1533
+ @staticmethod
1534
+ def _get_item_timestamp(
1535
+ item: Union[TraceWithFullDetails, ObservationsView],
1536
+ scope: str,
1537
+ ) -> str:
1538
+ """Extract timestamp from item based on scope.
1539
+
1540
+ Args:
1541
+ item: The API response object.
1542
+ scope: The type of item.
1543
+
1544
+ Returns:
1545
+ ISO 8601 timestamp string.
1546
+ """
1547
+ if scope == "traces":
1548
+ # Type narrowing for traces
1549
+ if hasattr(item, "timestamp"):
1550
+ return item.timestamp.isoformat() # type: ignore[attr-defined]
1551
+ elif scope == "observations":
1552
+ # Type narrowing for observations
1553
+ if hasattr(item, "start_time"):
1554
+ return item.start_time.isoformat() # type: ignore[attr-defined]
1555
+ return ""
1556
+
1557
+ @staticmethod
1558
+ def _get_timestamp_field_for_scope(scope: str) -> str:
1559
+ """Get the timestamp field name for filtering based on scope.
1560
+
1561
+ Args:
1562
+ scope: The type of items.
1563
+
1564
+ Returns:
1565
+ The field name to use in filters.
1566
+ """
1567
+ if scope == "traces":
1568
+ return "timestamp"
1569
+ elif scope == "observations":
1570
+ return "start_time"
1571
+ return "timestamp" # Default
1572
+
1573
+ @staticmethod
1574
+ def _dedupe_tags(tags: Optional[List[str]]) -> List[str]:
1575
+ """Deduplicate tags while preserving order."""
1576
+ if tags is None:
1577
+ return []
1578
+
1579
+ deduped: List[str] = []
1580
+ seen = set()
1581
+ for tag in tags:
1582
+ if tag not in seen:
1583
+ deduped.append(tag)
1584
+ seen.add(tag)
1585
+
1586
+ return deduped
1587
+
1588
+ def _build_result(
1589
+ self,
1590
+ total_items_fetched: int,
1591
+ total_items_processed: int,
1592
+ total_items_failed: int,
1593
+ total_scores_created: int,
1594
+ total_composite_scores_created: int,
1595
+ total_evaluations_failed: int,
1596
+ evaluator_stats_dict: Dict[str, EvaluatorStats],
1597
+ resume_token: Optional[BatchEvaluationResumeToken],
1598
+ completed: bool,
1599
+ start_time: float,
1600
+ failed_item_ids: List[str],
1601
+ error_summary: Dict[str, int],
1602
+ has_more_items: bool,
1603
+ item_evaluations: Dict[str, List[Evaluation]],
1604
+ ) -> BatchEvaluationResult:
1605
+ """Build the final BatchEvaluationResult.
1606
+
1607
+ Args:
1608
+ total_items_fetched: Total items fetched.
1609
+ total_items_processed: Items successfully processed.
1610
+ total_items_failed: Items that failed.
1611
+ total_scores_created: Scores from item evaluators.
1612
+ total_composite_scores_created: Scores from composite evaluator.
1613
+ total_evaluations_failed: Individual evaluator failures.
1614
+ evaluator_stats_dict: Per-evaluator statistics.
1615
+ resume_token: Resume token if incomplete.
1616
+ completed: Whether evaluation completed fully.
1617
+ start_time: Start time (unix timestamp).
1618
+ failed_item_ids: IDs of failed items.
1619
+ error_summary: Error type counts.
1620
+ has_more_items: Whether more items exist.
1621
+ item_evaluations: Dictionary mapping item IDs to their evaluation results.
1622
+
1623
+ Returns:
1624
+ BatchEvaluationResult instance.
1625
+ """
1626
+ duration = time.time() - start_time
1627
+
1628
+ return BatchEvaluationResult(
1629
+ total_items_fetched=total_items_fetched,
1630
+ total_items_processed=total_items_processed,
1631
+ total_items_failed=total_items_failed,
1632
+ total_scores_created=total_scores_created,
1633
+ total_composite_scores_created=total_composite_scores_created,
1634
+ total_evaluations_failed=total_evaluations_failed,
1635
+ evaluator_stats=list(evaluator_stats_dict.values()),
1636
+ resume_token=resume_token,
1637
+ completed=completed,
1638
+ duration_seconds=duration,
1639
+ failed_item_ids=failed_item_ids,
1640
+ error_summary=error_summary,
1641
+ has_more_items=has_more_items,
1642
+ item_evaluations=item_evaluations,
1643
+ )