bigquery-agent-analytics 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240) hide show
  1. bigquery_agent_analytics-0.2.3/CHANGELOG.md +131 -0
  2. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/PKG-INFO +1 -1
  3. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/SDK.md +30 -1
  4. bigquery_agent_analytics-0.2.3/examples/ci/README.md +42 -0
  5. bigquery_agent_analytics-0.2.3/examples/ci/evaluate_thresholds.yml +78 -0
  6. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/pyproject.toml +1 -1
  7. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/cli.py +54 -9
  8. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/client.py +45 -15
  9. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/evaluators.py +171 -15
  10. bigquery_agent_analytics-0.2.3/tests/test_ai_generate_judge_live.py +203 -0
  11. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_cli.py +237 -0
  12. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_sdk_client.py +292 -0
  13. bigquery_agent_analytics-0.2.2/CHANGELOG.md +0 -50
  14. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/.github/workflows/ci.yml +0 -0
  15. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/.github/workflows/release.yml +0 -0
  16. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/.gitignore +0 -0
  17. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/CODE_OF_CONDUCT.md +0 -0
  18. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/CONTRIBUTING.md +0 -0
  19. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/LICENSE +0 -0
  20. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/README.md +0 -0
  21. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/SECURITY.md +0 -0
  22. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/autoformat.sh +0 -0
  23. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/dashboard/README.md +0 -0
  24. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/dashboard/agent_analytics_dashboard.ipynb +0 -0
  25. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/dashboard/app.py +0 -0
  26. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/dashboard/requirements.txt +0 -0
  27. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/README.md +0 -0
  28. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/README.md +0 -0
  29. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/bigtable_dashboard.sql +0 -0
  30. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/pubsub_alerting.sql +0 -0
  31. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/realtime_error_analysis.sql +0 -0
  32. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/session_scoring.sql +0 -0
  33. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/setup_reservation.md +0 -0
  34. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/python_udf/README.md +0 -0
  35. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/python_udf/register.sql +0 -0
  36. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/remote_function/README.md +0 -0
  37. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/remote_function/deploy.sh +0 -0
  38. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/remote_function/dispatch.py +0 -0
  39. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/remote_function/main.py +0 -0
  40. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/remote_function/register.sql +0 -0
  41. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/README.md +0 -0
  42. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/main.py +0 -0
  43. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/requirements.txt +0 -0
  44. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/setup.sh +0 -0
  45. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/trigger_query.sql +0 -0
  46. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/worker.py +0 -0
  47. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/README.md +0 -0
  48. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/context_graph_v2_design.md +0 -0
  49. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/context_graph_v3_design.md +0 -0
  50. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/design.md +0 -0
  51. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/entity_resolution_primitives.md +0 -0
  52. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/hatteras_evaluation.md +0 -0
  53. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/implementation_plan_concept_index_runtime.md +0 -0
  54. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/implementation_plan_remote_function.md +0 -0
  55. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/learning_ontology_and_context_graph.md +0 -0
  56. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/binding.md +0 -0
  57. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/cli.md +0 -0
  58. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/compilation.md +0 -0
  59. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/ontology.md +0 -0
  60. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/owl-import.md +0 -0
  61. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/scaffold.md +0 -0
  62. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology_graph_v4_design.md +0 -0
  63. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology_graph_v5_design.md +0 -0
  64. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/prd_unified_analytics_interface.md +0 -0
  65. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/proposal_bigquery_agent_cli.md +0 -0
  66. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/python_udf_support_design.md +0 -0
  67. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/remote_function_rationale.md +0 -0
  68. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/sdk_usage_tracking.md +0 -0
  69. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/README.md +0 -0
  70. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/DEMO_SCRIPT.md +0 -0
  71. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/README.md +0 -0
  72. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent/__init__.py +0 -0
  73. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent/agent.py +0 -0
  74. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent/prompts.py +0 -0
  75. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent/tools.py +0 -0
  76. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/__init__.py +0 -0
  77. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/config.py +0 -0
  78. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/config_loader.py +0 -0
  79. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/eval_runner.py +0 -0
  80. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/improver_agent.py +0 -0
  81. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/prompt_adapter.py +0 -0
  82. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/prompts.py +0 -0
  83. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/tool_introspection.py +0 -0
  84. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/config.json +0 -0
  85. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/eval/eval_cases.json +0 -0
  86. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/eval/generate_traffic.py +0 -0
  87. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/eval/run_eval.py +0 -0
  88. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/overview.png +0 -0
  89. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/reset.sh +0 -0
  90. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/run_cycle.sh +0 -0
  91. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/run_improvement.py +0 -0
  92. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/setup.sh +0 -0
  93. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/setup_vertex.py +0 -0
  94. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/show_prompt.sh +0 -0
  95. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ai_classify_side_by_side.sql +0 -0
  96. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ai_forecast_side_by_side.sql +0 -0
  97. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ai_ml_integration_demo.ipynb +0 -0
  98. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ai_similarity_validation.sql +0 -0
  99. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/categorical_dashboard.sql +0 -0
  100. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/categorical_evaluation_demo.ipynb +0 -0
  101. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ci_eval_pipeline.sh +0 -0
  102. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/cli_agent_tool.py +0 -0
  103. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/context_graph_adcp_demo.ipynb +0 -0
  104. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/continuous_query_alerting.sql +0 -0
  105. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/dashboard_v2.ipynb +0 -0
  106. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/dashboard_v2_bigframes.ipynb +0 -0
  107. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/e2e_demo.py +0 -0
  108. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/e2e_demo_output.txt +0 -0
  109. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/e2e_notebook_demo.ipynb +0 -0
  110. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/event_semantics_views_bigframes_demo.ipynb +0 -0
  111. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/memory_service_demo.ipynb +0 -0
  112. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/migration_v5_demo_notebook.ipynb +0 -0
  113. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/nba_agent_trace_analysis_notebook.ipynb +0 -0
  114. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ontology_graph_v4_demo.ipynb +0 -0
  115. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ontology_graph_v5_demo.ipynb +0 -0
  116. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/python_udf_eval_summary.sql +0 -0
  117. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/python_udf_evaluation.sql +0 -0
  118. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/python_udf_event_semantics.sql +0 -0
  119. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/remote_function_dashboard.sql +0 -0
  120. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ymgo_graph_spec.yaml +0 -0
  121. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/scripts/README.md +0 -0
  122. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/scripts/quality_report.py +0 -0
  123. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/scripts/quality_report.sh +0 -0
  124. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/scripts/sample_report.md +0 -0
  125. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/__init__.py +0 -0
  126. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/_deploy_runtime.py +0 -0
  127. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/_streaming_evaluation.py +0 -0
  128. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/_telemetry.py +0 -0
  129. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ai_ml_integration.py +0 -0
  130. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/bigframes_evaluator.py +0 -0
  131. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/categorical_evaluator.py +0 -0
  132. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/categorical_views.py +0 -0
  133. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/context_graph.py +0 -0
  134. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/eval_suite.py +0 -0
  135. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/eval_validator.py +0 -0
  136. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/event_semantics.py +0 -0
  137. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/extracted_models.py +0 -0
  138. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/feedback.py +0 -0
  139. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/formatter.py +0 -0
  140. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/grader_pipeline.py +0 -0
  141. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/insights.py +0 -0
  142. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/memory_service.py +0 -0
  143. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/multi_trial.py +0 -0
  144. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_graph.py +0 -0
  145. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_materializer.py +0 -0
  146. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_models.py +0 -0
  147. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_orchestrator.py +0 -0
  148. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_property_graph.py +0 -0
  149. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_schema_compiler.py +0 -0
  150. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/resolved_spec.py +0 -0
  151. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/runtime_spec.py +0 -0
  152. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/serialization.py +0 -0
  153. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/structured_extraction.py +0 -0
  154. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/trace.py +0 -0
  155. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/trace_evaluator.py +0 -0
  156. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ttl_importer.py +0 -0
  157. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/udf_kernels.py +0 -0
  158. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/udf_sql_templates.py +0 -0
  159. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/views.py +0 -0
  160. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/__init__.py +0 -0
  161. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/binding_loader.py +0 -0
  162. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/binding_models.py +0 -0
  163. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/cli.py +0 -0
  164. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/docs/user_manual.md +0 -0
  165. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/graph_ddl_compiler.py +0 -0
  166. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/graph_ddl_models.py +0 -0
  167. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/ontology_loader.py +0 -0
  168. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/ontology_models.py +0 -0
  169. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/owl_importer.py +0 -0
  170. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/scaffold.py +0 -0
  171. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/__init__.py +0 -0
  172. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/__init__.py +0 -0
  173. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_binding_loader.py +0 -0
  174. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_binding_models.py +0 -0
  175. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_cli.py +0 -0
  176. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_graph_ddl_compiler.py +0 -0
  177. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_ontology_models.py +0 -0
  178. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_owl_importer.py +0 -0
  179. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_scaffold.py +0 -0
  180. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_scaffold_cli.py +0 -0
  181. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/lineage_sessions.json +0 -0
  182. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/mixed_events.json +0 -0
  183. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/mixed_owl_skos.ttl +0 -0
  184. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/skos_taxonomy.ttl +0 -0
  185. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/test_binding.yaml +0 -0
  186. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/test_combined_spec.yaml +0 -0
  187. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/test_ontology.yaml +0 -0
  188. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/yamo_sample.ttl +0 -0
  189. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_abstract_adapter_filter.py +0 -0
  190. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ai_ml_integration.py +0 -0
  191. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ai_ml_integration_labels.py +0 -0
  192. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_bigframes_evaluator.py +0 -0
  193. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_bridge_hardening.py +0 -0
  194. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_categorical_evaluator.py +0 -0
  195. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_categorical_views.py +0 -0
  196. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_client_labels.py +0 -0
  197. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_context_graph.py +0 -0
  198. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_context_graph_labels.py +0 -0
  199. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_dual_loader.py +0 -0
  200. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_eval_suite.py +0 -0
  201. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_eval_validator.py +0 -0
  202. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_event_semantics.py +0 -0
  203. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_extracted_models.py +0 -0
  204. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_feedback_labels.py +0 -0
  205. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_formatter.py +0 -0
  206. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_grader_pipeline.py +0 -0
  207. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_integration_ontology_binding.py +0 -0
  208. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_memory_service.py +0 -0
  209. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_memory_service_labels.py +0 -0
  210. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_multi_trial.py +0 -0
  211. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_graph.py +0 -0
  212. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_labels.py +0 -0
  213. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_materializer.py +0 -0
  214. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_models.py +0 -0
  215. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_orchestrator.py +0 -0
  216. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_property_graph.py +0 -0
  217. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_schema_compiler.py +0 -0
  218. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_owl_import_bridge.py +0 -0
  219. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_pr16_fixes.py +0 -0
  220. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_pr17_fixes.py +0 -0
  221. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_pr19_fixes.py +0 -0
  222. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_quality_report_helpers.py +0 -0
  223. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_remote_function.py +0 -0
  224. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_resolved_spec.py +0 -0
  225. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_runtime_factory.py +0 -0
  226. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_runtime_spec.py +0 -0
  227. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_sdk_evaluators.py +0 -0
  228. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_sdk_feedback.py +0 -0
  229. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_sdk_insights.py +0 -0
  230. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_sdk_trace.py +0 -0
  231. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_serialization.py +0 -0
  232. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_streaming_evaluation.py +0 -0
  233. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_surface_tags.py +0 -0
  234. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_telemetry.py +0 -0
  235. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_trace_evaluator.py +0 -0
  236. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_trace_filter_factory.py +0 -0
  237. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_udf_kernels.py +0 -0
  238. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_udf_sql_generation.py +0 -0
  239. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_v5_golden.py +0 -0
  240. {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_views.py +0 -0
@@ -0,0 +1,131 @@
1
+ # Changelog
2
+
3
+ All notable changes to `bigquery-agent-analytics` are documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ### Fixed
11
+
12
+ - **LLM-as-Judge AI.GENERATE path now executes against current
13
+ BigQuery.** Earlier versions emitted a table-valued
14
+ ``FROM session_traces, AI.GENERATE(...) AS result`` shape with
15
+ ``output_schema`` and a flat ``model_params`` dict. Current
16
+ ``AI.GENERATE`` is a scalar function that returns a STRUCT;
17
+ the table-valued form raises ``Table-valued function not found``
18
+ and the flat ``model_params`` raises ``does not conform to the
19
+ GenerateContent request body``. Mocked unit tests passed because
20
+ they bypassed real query execution. The SDK now renders a
21
+ ``SELECT AI.GENERATE(...).score, ...`` query with a
22
+ ``generationConfig``-wrapped ``model_params`` and ``output_schema``
23
+ on the scalar form, runs against live BigQuery, and unwraps the
24
+ returned struct's ``score`` / ``justification`` / ``status``
25
+ fields.
26
+ - **LLM-as-Judge AI.GENERATE / ML.GENERATE_TEXT now uses the full
27
+ Python prompt template.** Previously both BQ-native paths sent
28
+ only ``prompt_template.split('{trace_text}')[0]`` to BigQuery,
29
+ silently dropping every instruction that followed the
30
+ placeholders — including the per-criterion output-format spec
31
+ the judge model needs to score consistently with the
32
+ API-fallback path. The two BQ paths and the Python API path now
33
+ produce comparable scores against the same prompt.
34
+
35
+ ### Added
36
+
37
+ - ``evaluators.render_ai_generate_judge_query(...)`` is the new
38
+ entry point that builds the AI.GENERATE batch SQL.
39
+ ``connection_id`` is optional — when omitted the call uses
40
+ end-user credentials; when supplied it inlines the
41
+ ``connection_id =>`` argument so callers can route through a
42
+ service-account-owned connection when their environment
43
+ requires it.
44
+ - ``Client.connection_id`` already existed; it is now plumbed
45
+ through to ``_ai_generate_judge`` so a connection set at client
46
+ construction propagates to the judge SQL automatically.
47
+ - Live BigQuery integration tests for the LLM-judge AI.GENERATE
48
+ path (``tests/test_ai_generate_judge_live.py``). Skipped by
49
+ default; opt in with ``BQAA_RUN_LIVE_TESTS=1`` plus
50
+ ``PROJECT_ID`` / ``DATASET_ID``. Three tests cover SQL parse
51
+ acceptance, expected result-schema column names, and the
52
+ ``connection_id`` escape hatch when
53
+ ``BQAA_AI_GENERATE_CONNECTION_ID`` is set. These tests catch the class of
54
+ mock-divergence bug that let the prior broken template ship.
55
+ - ``EvaluationReport.details["execution_mode"]`` is now populated
56
+ for LLM-as-Judge runs with one of ``ai_generate``,
57
+ ``ml_generate_text``, ``api_fallback``, or ``no_op`` — matching
58
+ the value space the categorical evaluator already exposes. When
59
+ an earlier tier raised before a later tier succeeded,
60
+ ``details["fallback_reason"]`` carries the chained exception
61
+ messages in attempt order, so CI and dashboards can audit which
62
+ path actually ran.
63
+ - ``evaluators.split_judge_prompt_template(prompt_template)`` is
64
+ the helper the SQL paths use to safely substitute the template
65
+ into ``CONCAT()``; exposed publicly for downstream code that
66
+ needs the same shape.
67
+ - ``bq-agent-sdk evaluate --exit-code`` FAIL lines now carry a
68
+ bounded ``feedback="…"`` snippet drawn from
69
+ ``SessionScore.llm_feedback`` for LLM-judge failures. The
70
+ snippet collapses internal whitespace to a single space,
71
+ truncates to 120 characters with an ellipsis, and is omitted
72
+ entirely for code-based metrics (which leave ``llm_feedback``
73
+ empty). CI logs now explain *why* the judge said the session
74
+ failed without forcing the reader to chase the JSON output.
75
+
76
+ ### Changed
77
+
78
+ - ``--strict`` help text and ``SDK.md §4`` clarified to match shipped
79
+ behavior. ``--strict`` is a *visibility* knob — it stamps
80
+ ``details['parse_error']=True`` on AI.GENERATE/ML.GENERATE_TEXT
81
+ judge rows whose ``scores`` dict is empty, and adds a report-level
82
+ ``parse_errors`` counter. It does **not** flip any session's
83
+ pass/fail outcome: both BQ-native judge methods compute ``passed``
84
+ as ``bool(scores) and all(...)``, so empty-scores rows already
85
+ fail without the flag. API-fallback parse errors coerce to
86
+ ``score=0.0``, so they fail as low-score failures rather than
87
+ parse errors. For pass/fail-only CI consumers ``--strict`` is a
88
+ no-op; reach for it when a dashboard needs to tell "no parseable
89
+ score" apart from "low score."
90
+
91
+ ## [0.2.2] - 2026-04-24
92
+
93
+ ### Changed (breaking)
94
+
95
+ - **Prebuilt `CodeEvaluator` gates now compare raw observed values
96
+ directly against the user-supplied budget.** `CodeEvaluator.latency`,
97
+ `.turn_count`, `.error_rate`, `.token_efficiency`, `.ttft`, and
98
+ `.cost_per_session` return `1.0` when the observed metric is within
99
+ budget and `0.0` otherwise. The previous implementation scored sessions
100
+ on a normalized `1.0 - (observed / budget)` scale against a `0.5` pass
101
+ cutoff, which effectively fired every gate at roughly half the budget
102
+ the user typed (e.g. `latency(threshold_ms=5000)` failed sessions at
103
+ `avg_latency_ms > 2500`). Users relying on the old sub-budget fail
104
+ behavior should lower their budgets to match their intent.
105
+ - The scheduled streaming evaluator (`streaming_observability_v1`) uses
106
+ the same raw-budget gate semantics for consistency with the prebuilt
107
+ `CodeEvaluator` factories.
108
+
109
+ ### Added
110
+
111
+ - `CodeEvaluator.add_metric` accepts `observed_key`, `observed_fn`, and
112
+ `budget` arguments that flow into `SessionScore.details[f"metric_{name}"]`
113
+ for downstream reporting. The CLI uses these to emit readable failure
114
+ lines without re-running the scorer.
115
+ - `bq-agent-sdk evaluate --exit-code` now prints a per-session failure
116
+ summary on stderr before exiting non-zero. Each line names the
117
+ session_id, failing metric, observed value, and the budget it blew
118
+ through. Output is capped at the first 10 failing sessions to keep
119
+ CI logs scannable.
120
+ - `bq-agent-sdk categorical-eval` gains `--exit-code`,
121
+ `--min-pass-rate`, and `--pass-category METRIC=CATEGORY`
122
+ (repeatable) flags. Declare which classification counts as passing
123
+ per metric, set a minimum pass rate across the run, and fail CI when
124
+ any metric falls below it. Multiple pass categories per metric are
125
+ OR'd together (e.g. `--pass-category tone=positive --pass-category
126
+ tone=neutral`). Missing metric names warn on stderr without failing
127
+ the run so configuration mistakes are visible in CI logs.
128
+
129
+ ## [0.2.1]
130
+
131
+ - See `git log` for prior changes.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bigquery-agent-analytics
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: SDK for analyzing and evaluating agent traces stored in BigQuery.
5
5
  Author: Google LLC
6
6
  License-Expression: Apache-2.0
@@ -275,7 +275,36 @@ print(report.summary())
275
275
 
276
276
  ### Strict Mode
277
277
 
278
- When `strict=True`, sessions where the LLM judge returns empty or unparseable output are marked as **failed** instead of silently passing. Operational counters are placed in `report.details` (not `aggregate_scores`) so downstream consumers can treat scores as purely normalized metrics:
278
+ `strict=True` adds **parse-error visibility** — it does not flip
279
+ any session's pass/fail outcome. Both BQ-native judge methods set
280
+ `passed = bool(scores) and all(score >= threshold for score in
281
+ scores.values())`, so a row whose `scores` dict is empty (the
282
+ judge model returned no parseable output) already fails. Without
283
+ `strict=True` you can't tell from the report whether a failed
284
+ session failed because the judge gave a low score or because the
285
+ judge gave nothing parseable at all.
286
+
287
+ `strict=True` walks the merged report and:
288
+
289
+ - Stamps `SessionScore.details["parse_error"] = True` on every
290
+ session whose `scores` dict is empty.
291
+ - Adds a report-level `details["parse_errors"]` count plus
292
+ `details["parse_error_rate"]` (fraction of `total_sessions`).
293
+
294
+ The API-fallback path coerces malformed model output to
295
+ `score=0.0` and always populates `scores`, so its failures look
296
+ like low-score failures rather than parse errors. `strict=True`
297
+ won't surface them as parse errors today; it's an AI.GENERATE /
298
+ ML.GENERATE_TEXT visibility knob in practice.
299
+
300
+ For pass/fail-only consumers (CI gates with `--exit-code`),
301
+ `strict=True` is a no-op. Reach for it when a dashboard or
302
+ investigation needs to distinguish "no parseable score" from
303
+ "low score" failures.
304
+
305
+ Operational counters are placed in `report.details` (not
306
+ `aggregate_scores`) so downstream consumers can treat scores as
307
+ purely normalized metrics:
279
308
 
280
309
  ```python
281
310
  report = client.evaluate(
@@ -0,0 +1,42 @@
1
+ # `examples/ci/`
2
+
3
+ Reference CI artifacts for agent quality gates backed by
4
+ BigQuery Agent Analytics.
5
+
6
+ ## `evaluate_thresholds.yml`
7
+
8
+ Drop-in GitHub Actions workflow that runs four deterministic
9
+ budgets (latency, token usage, tool error rate, turn count) on
10
+ every PR, scoring the last 24 hours of production traces from an
11
+ `agent_events` BigQuery table. Exits non-zero when any session
12
+ breaches its budget, so a bad merge lights up the PR status
13
+ before code ships.
14
+
15
+ See the companion Medium post, *Your Agent Events Table Is Also a
16
+ Test Suite*, for the narrative, threshold-setting guidance, and
17
+ the companion categorical-eval gate that pairs naturally with
18
+ this workflow.
19
+
20
+ ### Quick start
21
+
22
+ 1. Copy `evaluate_thresholds.yml` to `.github/workflows/` in
23
+ your agent repo.
24
+ 2. Set repository variables `PROJECT_ID` and `DATASET_ID` to the
25
+ GCP project + BigQuery dataset where your `agent_events` table
26
+ lives.
27
+ 3. Set the repository secret `GCP_SA_KEY` to a service-account JSON
28
+ with `bigquery.jobUser` + `bigquery.dataViewer` on the dataset.
29
+ 4. Replace `calendar_assistant` with your agent's name in all four
30
+ `--agent-id` flags inside the workflow.
31
+ 5. Tune the four `--threshold` numbers against your own production
32
+ distribution. A defensible starting point for each is "p95 of
33
+ the last 30 days + 10% buffer"; revisit after week one of CI
34
+ gating.
35
+
36
+ ### Requirements
37
+
38
+ - `bigquery-agent-analytics >= 0.2.2` — earlier releases shipped
39
+ normalized `1.0 - observed/budget` gate scoring with a `0.5`
40
+ pass cutoff, which fires every gate at roughly half the budget
41
+ the user typed. 0.2.2 switched to raw-budget binary gates so
42
+ the `--threshold` value means what it says.
@@ -0,0 +1,78 @@
1
+ # .github/workflows/evaluate_thresholds.yml
2
+ #
3
+ # Reference GitHub Actions workflow that gates every PR against the
4
+ # last 24 hours of production traces stored in an `agent_events`
5
+ # BigQuery table. Four deterministic budgets run as separate steps
6
+ # so a red PR status tells you which gate regressed.
7
+ #
8
+ # Companion to the Medium post "Your Agent Events Table Is Also a
9
+ # Test Suite." See the post for the narrative and for the sidebar
10
+ # on picking initial threshold values from 30-day production data.
11
+ #
12
+ # Requires bigquery-agent-analytics >= 0.2.2 — the first release
13
+ # with the raw-budget `--threshold` semantics and the tight
14
+ # `--exit-code` failure output this workflow depends on.
15
+ #
16
+ # To adopt this workflow in your own agent repo:
17
+ # 1. Copy this file to .github/workflows/evaluate_thresholds.yml.
18
+ # 2. Set repo variables PROJECT_ID and DATASET_ID to the GCP
19
+ # project + BigQuery dataset where your agent_events table
20
+ # lives.
21
+ # 3. Set the repo secret GCP_SA_KEY to a service account JSON
22
+ # with bigquery.jobUser + bigquery.dataViewer on the dataset.
23
+ # 4. Replace `calendar_assistant` with your agent's name in all
24
+ # four --agent-id flags.
25
+ # 5. Tune the four --threshold numbers against your own
26
+ # production distribution. A defensible starting point for
27
+ # each is "p95 of last 30 days + 10% buffer"; revisit after
28
+ # week one of CI gating.
29
+
30
+ name: Agent quality gate
31
+
32
+ on:
33
+ pull_request:
34
+ paths:
35
+ - 'agents/**'
36
+ - 'prompts/**'
37
+
38
+ jobs:
39
+ gate:
40
+ runs-on: ubuntu-latest
41
+ steps:
42
+ - uses: actions/checkout@v4
43
+ - uses: actions/setup-python@v5
44
+ with: { python-version: '3.12' }
45
+ - run: pip install 'bigquery-agent-analytics>=0.2.2,<0.3.0'
46
+ - uses: google-github-actions/auth@v2
47
+ with: { credentials_json: '${{ secrets.GCP_SA_KEY }}' }
48
+ - name: Latency budget
49
+ run: >
50
+ bq-agent-sdk evaluate --evaluator=latency --threshold=5000
51
+ --last=24h --agent-id=calendar_assistant --exit-code
52
+ --project-id=${{ vars.PROJECT_ID }}
53
+ --dataset-id=${{ vars.DATASET_ID }}
54
+ - name: Token budget
55
+ # Tune this to your agent's real token distribution. A short
56
+ # system prompt + few-turn sessions will land in the low
57
+ # thousands; production agents with longer instructions and
58
+ # multi-turn tool chains typically want tens of thousands.
59
+ # Run `bq-agent-sdk evaluate --evaluator=token_efficiency
60
+ # --last=30d` without `--exit-code` once to see your own
61
+ # baseline before picking a number.
62
+ run: >
63
+ bq-agent-sdk evaluate --evaluator=token_efficiency --threshold=5000
64
+ --last=24h --agent-id=calendar_assistant --exit-code
65
+ --project-id=${{ vars.PROJECT_ID }}
66
+ --dataset-id=${{ vars.DATASET_ID }}
67
+ - name: Tool error rate
68
+ run: >
69
+ bq-agent-sdk evaluate --evaluator=error_rate --threshold=0.1
70
+ --last=24h --agent-id=calendar_assistant --exit-code
71
+ --project-id=${{ vars.PROJECT_ID }}
72
+ --dataset-id=${{ vars.DATASET_ID }}
73
+ - name: Turn count
74
+ run: >
75
+ bq-agent-sdk evaluate --evaluator=turn_count --threshold=10
76
+ --last=24h --agent-id=calendar_assistant --exit-code
77
+ --project-id=${{ vars.PROJECT_ID }}
78
+ --dataset-id=${{ vars.DATASET_ID }}
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "bigquery-agent-analytics"
7
- version = "0.2.2"
7
+ version = "0.2.3"
8
8
  description = "SDK for analyzing and evaluating agent traces stored in BigQuery."
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -302,7 +302,14 @@ def evaluate(
302
302
  ),
303
303
  strict: bool = typer.Option(
304
304
  False,
305
- help="Fail sessions with unparseable judge output.",
305
+ help=(
306
+ "Stamp parse-error metadata on AI.GENERATE judge rows with"
307
+ " empty or NULL typed output. Those rows already fail"
308
+ " (empty score < threshold); --strict adds"
309
+ " details['parse_error']=True and a report-level"
310
+ " parse_errors counter so dashboards can tell 'no"
311
+ " parseable score' apart from 'low score' failures."
312
+ ),
306
313
  ),
307
314
  endpoint: Optional[str] = typer.Option(
308
315
  None,
@@ -368,6 +375,31 @@ def evaluate(
368
375
  raise typer.Exit(code=2)
369
376
 
370
377
 
378
+ _FEEDBACK_SNIPPET_MAX = 120
379
+
380
+
381
+ def _format_feedback_snippet(
382
+ feedback: Optional[str], max_chars: int = _FEEDBACK_SNIPPET_MAX
383
+ ) -> Optional[str]:
384
+ """Return a single-line, bounded snippet of an LLM-judge justification.
385
+
386
+ Collapses internal whitespace runs (including newlines) to a single
387
+ space so the snippet fits on one CI log line, then truncates to
388
+ ``max_chars`` with a trailing ``…`` when the original was longer.
389
+ Returns ``None`` for empty / whitespace-only input so callers can
390
+ cleanly skip the field.
391
+ """
392
+ if not feedback:
393
+ return None
394
+ collapsed = " ".join(feedback.split())
395
+ if not collapsed:
396
+ return None
397
+ if len(collapsed) <= max_chars:
398
+ return collapsed
399
+ # Reserve one char for the ellipsis to keep the visual width capped.
400
+ return collapsed[: max_chars - 1].rstrip() + "\u2026"
401
+
402
+
371
403
  def _emit_evaluate_failures(
372
404
  report: EvaluationReport, max_sessions: int = 10
373
405
  ) -> None:
@@ -377,10 +409,14 @@ def _emit_evaluate_failures(
377
409
  Prefers the raw observed + budget pair (``CodeEvaluator`` prebuilts);
378
410
  falls back to score + threshold when the metric didn't declare
379
411
  observed/budget (custom ``add_metric`` users, ``LLMAsJudge``
380
- criteria). A failing session is guaranteed to produce at least one
381
- FAIL line never just the summary header.
382
-
383
- Capped at ``max_sessions`` most-recent failures so CI logs stay scannable.
412
+ criteria). For LLM-judge failures the line also carries a bounded
413
+ ``feedback="…"`` snippet drawn from ``SessionScore.llm_feedback``
414
+ so CI logs explain *why* the judge said the session failed without
415
+ forcing the reader to chase the JSON output.
416
+
417
+ A failing session is guaranteed to produce at least one FAIL line —
418
+ never just the summary header. Capped at ``max_sessions`` most-recent
419
+ failures so CI logs stay scannable.
384
420
  """
385
421
  failed = [s for s in report.session_scores if not s.passed]
386
422
  if not failed:
@@ -393,6 +429,7 @@ def _emit_evaluate_failures(
393
429
  )
394
430
  shown = failed[:max_sessions]
395
431
  for s in shown:
432
+ feedback_snippet = _format_feedback_snippet(s.llm_feedback)
396
433
  emitted_for_session = False
397
434
  for metric_name, score in s.scores.items():
398
435
  detail = s.details.get(f"metric_{metric_name}") or {}
@@ -433,6 +470,12 @@ def _emit_evaluate_failures(
433
470
  parts.append(f"score={score:.4g}")
434
471
  if threshold is not None and isinstance(threshold, (int, float)):
435
472
  parts.append(f"threshold={threshold:.4g}")
473
+ # LLM judges populate ``SessionScore.llm_feedback`` with the
474
+ # judge's justification. Surface a bounded snippet on the FAIL
475
+ # line so CI logs explain *why* without dumping the full JSON.
476
+ # Code-based metrics leave ``llm_feedback`` empty and skip this.
477
+ if feedback_snippet is not None:
478
+ parts.append(f'feedback="{feedback_snippet}"')
436
479
  typer.echo(" " + " ".join(parts), err=True)
437
480
  emitted_for_session = True
438
481
 
@@ -441,10 +484,12 @@ def _emit_evaluate_failures(
441
484
  # while the session itself is flagged failed (a bug upstream) — we
442
485
  # still point the reader at the session id.
443
486
  if not emitted_for_session:
444
- typer.echo(
445
- f" FAIL session={s.session_id} (no per-metric detail available)",
446
- err=True,
447
- )
487
+ fallback = f" FAIL session={s.session_id}"
488
+ if feedback_snippet is not None:
489
+ fallback += f' feedback="{feedback_snippet}"'
490
+ else:
491
+ fallback += " (no per-metric detail available)"
492
+ typer.echo(fallback, err=True)
448
493
  if len(failed) > len(shown):
449
494
  typer.echo(
450
495
  f" ... {len(failed) - len(shown)} more failing session(s) "
@@ -78,8 +78,10 @@ from .evaluators import DEFAULT_ENDPOINT
78
78
  from .evaluators import EvaluationReport
79
79
  from .evaluators import LLM_JUDGE_BATCH_QUERY
80
80
  from .evaluators import LLMAsJudge
81
+ from .evaluators import render_ai_generate_judge_query
81
82
  from .evaluators import SESSION_SUMMARY_QUERY
82
83
  from .evaluators import SessionScore
84
+ from .evaluators import split_judge_prompt_template
83
85
  from .feedback import AnalysisConfig
84
86
  from .feedback import compute_drift
85
87
  from .feedback import compute_question_distribution
@@ -975,14 +977,27 @@ class Client:
975
977
  then falls back to the Gemini API. Each path evaluates
976
978
  every criterion in the evaluator and merges the per-session
977
979
  scores into a single report.
980
+
981
+ Stamps ``report.details["execution_mode"]`` with one of
982
+ ``ai_generate``, ``ml_generate_text``, ``api_fallback`` (or ``no_op`` when the evaluator has no criteria) so the
983
+ caller (and CI gates) can audit which path actually ran.
984
+ When an earlier tier raised before a later tier succeeded,
985
+ ``report.details["fallback_reason"]`` carries the chained
986
+ exception messages in attempt order. (The naming mirrors the
987
+ categorical evaluator's ``execution_mode`` value space for
988
+ consistency.)
978
989
  """
979
990
  criteria = evaluator._criteria
980
991
  if not criteria:
981
- return _build_report(
992
+ report = _build_report(
982
993
  evaluator_name=evaluator.name,
983
994
  dataset=f"{self._table_ref} WHERE {where}",
984
995
  session_scores=[],
985
996
  )
997
+ report.details["execution_mode"] = "no_op"
998
+ return report
999
+
1000
+ fallback_reasons: list[str] = []
986
1001
 
987
1002
  # Try AI.GENERATE (new path) when endpoint is not a legacy ref
988
1003
  if not self._is_legacy_model_ref(self.endpoint):
@@ -997,17 +1012,20 @@ class Client:
997
1012
  params,
998
1013
  )
999
1014
  criterion_reports.append((criterion, report))
1000
- return _merge_criterion_reports(
1015
+ merged = _merge_criterion_reports(
1001
1016
  evaluator.name,
1002
1017
  f"{self._table_ref} WHERE {where}",
1003
1018
  criteria,
1004
1019
  criterion_reports,
1005
1020
  )
1021
+ merged.details["execution_mode"] = "ai_generate"
1022
+ return merged
1006
1023
  except Exception as e:
1007
1024
  logger.debug(
1008
1025
  "AI.GENERATE judge failed, trying legacy: %s",
1009
1026
  e,
1010
1027
  )
1028
+ fallback_reasons.append(f"ai_generate: {e}")
1011
1029
 
1012
1030
  # Try legacy BQML batch evaluation
1013
1031
  text_model = (
@@ -1028,20 +1046,29 @@ class Client:
1028
1046
  text_model,
1029
1047
  )
1030
1048
  criterion_reports.append((criterion, report))
1031
- return _merge_criterion_reports(
1049
+ merged = _merge_criterion_reports(
1032
1050
  evaluator.name,
1033
1051
  f"{self._table_ref} WHERE {where}",
1034
1052
  criteria,
1035
1053
  criterion_reports,
1036
1054
  )
1055
+ merged.details["execution_mode"] = "ml_generate_text"
1056
+ if fallback_reasons:
1057
+ merged.details["fallback_reason"] = "; ".join(fallback_reasons)
1058
+ return merged
1037
1059
  except Exception as e:
1038
1060
  logger.debug(
1039
1061
  "BQML judge failed, falling back to API: %s",
1040
1062
  e,
1041
1063
  )
1064
+ fallback_reasons.append(f"ml_generate_text: {e}")
1042
1065
 
1043
1066
  # Fallback: fetch traces using same table/filter, evaluate via API
1044
- return self._api_judge(evaluator, table, where, params)
1067
+ api_report = self._api_judge(evaluator, table, where, params)
1068
+ api_report.details["execution_mode"] = "api_fallback"
1069
+ if fallback_reasons:
1070
+ api_report.details["fallback_reason"] = "; ".join(fallback_reasons)
1071
+ return api_report
1045
1072
 
1046
1073
  def _ai_generate_judge(
1047
1074
  self,
@@ -1054,20 +1081,22 @@ class Client:
1054
1081
  """Evaluates using BigQuery AI.GENERATE with typed output."""
1055
1082
  from google.cloud import bigquery as bq
1056
1083
 
1084
+ prefix, middle, suffix = split_judge_prompt_template(
1085
+ criterion.prompt_template
1086
+ )
1057
1087
  judge_params = list(params) + [
1058
- bq.ScalarQueryParameter(
1059
- "judge_prompt",
1060
- "STRING",
1061
- criterion.prompt_template.split("{trace_text}")[0],
1062
- ),
1088
+ bq.ScalarQueryParameter("judge_prompt_prefix", "STRING", prefix),
1089
+ bq.ScalarQueryParameter("judge_prompt_middle", "STRING", middle),
1090
+ bq.ScalarQueryParameter("judge_prompt_suffix", "STRING", suffix),
1063
1091
  ]
1064
1092
 
1065
- query = AI_GENERATE_JUDGE_BATCH_QUERY.format(
1093
+ query = render_ai_generate_judge_query(
1066
1094
  project=self.project_id,
1067
1095
  dataset=self.dataset_id,
1068
1096
  table=table,
1069
1097
  where=where,
1070
1098
  endpoint=self.endpoint,
1099
+ connection_id=self.connection_id,
1071
1100
  )
1072
1101
  job_config = bq.QueryJobConfig(
1073
1102
  query_parameters=judge_params,
@@ -1121,12 +1150,13 @@ class Client:
1121
1150
  """Evaluates using BigQuery ML.GENERATE_TEXT."""
1122
1151
  from google.cloud import bigquery as bq
1123
1152
 
1153
+ prefix, middle, suffix = split_judge_prompt_template(
1154
+ criterion.prompt_template
1155
+ )
1124
1156
  judge_params = list(params) + [
1125
- bq.ScalarQueryParameter(
1126
- "judge_prompt",
1127
- "STRING",
1128
- criterion.prompt_template.split("{trace_text}")[0],
1129
- ),
1157
+ bq.ScalarQueryParameter("judge_prompt_prefix", "STRING", prefix),
1158
+ bq.ScalarQueryParameter("judge_prompt_middle", "STRING", middle),
1159
+ bq.ScalarQueryParameter("judge_prompt_suffix", "STRING", suffix),
1130
1160
  ]
1131
1161
 
1132
1162
  query = LLM_JUDGE_BATCH_QUERY.format(