bigquery-agent-analytics 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bigquery_agent_analytics-0.2.3/CHANGELOG.md +131 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/PKG-INFO +1 -1
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/SDK.md +30 -1
- bigquery_agent_analytics-0.2.3/examples/ci/README.md +42 -0
- bigquery_agent_analytics-0.2.3/examples/ci/evaluate_thresholds.yml +78 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/pyproject.toml +1 -1
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/cli.py +54 -9
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/client.py +45 -15
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/evaluators.py +171 -15
- bigquery_agent_analytics-0.2.3/tests/test_ai_generate_judge_live.py +203 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_cli.py +237 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_sdk_client.py +292 -0
- bigquery_agent_analytics-0.2.2/CHANGELOG.md +0 -50
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/.github/workflows/ci.yml +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/.github/workflows/release.yml +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/.gitignore +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/CODE_OF_CONDUCT.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/CONTRIBUTING.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/LICENSE +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/README.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/SECURITY.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/autoformat.sh +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/dashboard/README.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/dashboard/agent_analytics_dashboard.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/dashboard/app.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/dashboard/requirements.txt +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/README.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/README.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/bigtable_dashboard.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/pubsub_alerting.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/realtime_error_analysis.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/session_scoring.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/continuous_queries/setup_reservation.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/python_udf/README.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/python_udf/register.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/remote_function/README.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/remote_function/deploy.sh +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/remote_function/dispatch.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/remote_function/main.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/remote_function/register.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/README.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/main.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/requirements.txt +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/setup.sh +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/trigger_query.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/deploy/streaming_evaluation/worker.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/README.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/context_graph_v2_design.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/context_graph_v3_design.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/design.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/entity_resolution_primitives.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/hatteras_evaluation.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/implementation_plan_concept_index_runtime.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/implementation_plan_remote_function.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/learning_ontology_and_context_graph.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/binding.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/cli.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/compilation.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/ontology.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/owl-import.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology/scaffold.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology_graph_v4_design.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/ontology_graph_v5_design.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/prd_unified_analytics_interface.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/proposal_bigquery_agent_cli.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/python_udf_support_design.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/remote_function_rationale.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/docs/sdk_usage_tracking.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/README.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/DEMO_SCRIPT.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/README.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent/__init__.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent/agent.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent/prompts.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent/tools.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/__init__.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/config.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/config_loader.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/eval_runner.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/improver_agent.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/prompt_adapter.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/prompts.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/agent_improvement/tool_introspection.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/config.json +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/eval/eval_cases.json +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/eval/generate_traffic.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/eval/run_eval.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/overview.png +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/reset.sh +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/run_cycle.sh +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/run_improvement.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/setup.sh +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/setup_vertex.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/agent_improvement_cycle/show_prompt.sh +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ai_classify_side_by_side.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ai_forecast_side_by_side.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ai_ml_integration_demo.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ai_similarity_validation.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/categorical_dashboard.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/categorical_evaluation_demo.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ci_eval_pipeline.sh +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/cli_agent_tool.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/context_graph_adcp_demo.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/continuous_query_alerting.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/dashboard_v2.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/dashboard_v2_bigframes.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/e2e_demo.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/e2e_demo_output.txt +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/e2e_notebook_demo.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/event_semantics_views_bigframes_demo.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/memory_service_demo.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/migration_v5_demo_notebook.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/nba_agent_trace_analysis_notebook.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ontology_graph_v4_demo.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ontology_graph_v5_demo.ipynb +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/python_udf_eval_summary.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/python_udf_evaluation.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/python_udf_event_semantics.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/remote_function_dashboard.sql +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/examples/ymgo_graph_spec.yaml +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/scripts/README.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/scripts/quality_report.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/scripts/quality_report.sh +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/scripts/sample_report.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/__init__.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/_deploy_runtime.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/_streaming_evaluation.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/_telemetry.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ai_ml_integration.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/bigframes_evaluator.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/categorical_evaluator.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/categorical_views.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/context_graph.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/eval_suite.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/eval_validator.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/event_semantics.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/extracted_models.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/feedback.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/formatter.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/grader_pipeline.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/insights.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/memory_service.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/multi_trial.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_graph.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_materializer.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_models.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_orchestrator.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_property_graph.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ontology_schema_compiler.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/resolved_spec.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/runtime_spec.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/serialization.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/structured_extraction.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/trace.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/trace_evaluator.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/ttl_importer.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/udf_kernels.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/udf_sql_templates.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_agent_analytics/views.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/__init__.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/binding_loader.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/binding_models.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/cli.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/docs/user_manual.md +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/graph_ddl_compiler.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/graph_ddl_models.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/ontology_loader.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/ontology_models.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/owl_importer.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/src/bigquery_ontology/scaffold.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/__init__.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/__init__.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_binding_loader.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_binding_models.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_cli.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_graph_ddl_compiler.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_ontology_models.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_owl_importer.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_scaffold.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/bigquery_ontology/test_scaffold_cli.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/lineage_sessions.json +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/mixed_events.json +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/mixed_owl_skos.ttl +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/skos_taxonomy.ttl +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/test_binding.yaml +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/test_combined_spec.yaml +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/test_ontology.yaml +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/fixtures/yamo_sample.ttl +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_abstract_adapter_filter.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ai_ml_integration.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ai_ml_integration_labels.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_bigframes_evaluator.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_bridge_hardening.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_categorical_evaluator.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_categorical_views.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_client_labels.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_context_graph.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_context_graph_labels.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_dual_loader.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_eval_suite.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_eval_validator.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_event_semantics.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_extracted_models.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_feedback_labels.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_formatter.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_grader_pipeline.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_integration_ontology_binding.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_memory_service.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_memory_service_labels.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_multi_trial.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_graph.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_labels.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_materializer.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_models.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_orchestrator.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_property_graph.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_ontology_schema_compiler.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_owl_import_bridge.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_pr16_fixes.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_pr17_fixes.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_pr19_fixes.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_quality_report_helpers.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_remote_function.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_resolved_spec.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_runtime_factory.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_runtime_spec.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_sdk_evaluators.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_sdk_feedback.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_sdk_insights.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_sdk_trace.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_serialization.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_streaming_evaluation.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_surface_tags.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_telemetry.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_trace_evaluator.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_trace_filter_factory.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_udf_kernels.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_udf_sql_generation.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_v5_golden.py +0 -0
- {bigquery_agent_analytics-0.2.2 → bigquery_agent_analytics-0.2.3}/tests/test_views.py +0 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `bigquery-agent-analytics` are documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.2.3]
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
|
|
12
|
+
- **LLM-as-Judge AI.GENERATE path now executes against current
|
|
13
|
+
BigQuery.** Earlier versions emitted a table-valued
|
|
14
|
+
``FROM session_traces, AI.GENERATE(...) AS result`` shape with
|
|
15
|
+
``output_schema`` and a flat ``model_params`` dict. Current
|
|
16
|
+
``AI.GENERATE`` is a scalar function that returns a STRUCT;
|
|
17
|
+
the table-valued form raises ``Table-valued function not found``
|
|
18
|
+
and the flat ``model_params`` raises ``does not conform to the
|
|
19
|
+
GenerateContent request body``. Mocked unit tests passed because
|
|
20
|
+
they bypassed real query execution. The SDK now renders a
|
|
21
|
+
``SELECT AI.GENERATE(...).score, ...`` query with a
|
|
22
|
+
``generationConfig``-wrapped ``model_params`` and ``output_schema``
|
|
23
|
+
on the scalar form, runs against live BigQuery, and unwraps the
|
|
24
|
+
returned struct's ``score`` / ``justification`` / ``status``
|
|
25
|
+
fields.
|
|
26
|
+
- **LLM-as-Judge AI.GENERATE / ML.GENERATE_TEXT now uses the full
|
|
27
|
+
Python prompt template.** Previously both BQ-native paths sent
|
|
28
|
+
only ``prompt_template.split('{trace_text}')[0]`` to BigQuery,
|
|
29
|
+
silently dropping every instruction that followed the
|
|
30
|
+
``{trace_text}`` placeholder — including the per-criterion output-format spec
|
|
31
|
+
the judge model needs to score consistently with the
|
|
32
|
+
API-fallback path. The two BQ paths and the Python API path now
|
|
33
|
+
produce comparable scores against the same prompt.
|
|
34
|
+
|
|
35
|
+
### Added
|
|
36
|
+
|
|
37
|
+
- ``evaluators.render_ai_generate_judge_query(...)`` is the new
|
|
38
|
+
entry point that builds the AI.GENERATE batch SQL.
|
|
39
|
+
``connection_id`` is optional — when omitted the call uses
|
|
40
|
+
end-user credentials; when supplied it inlines the
|
|
41
|
+
``connection_id =>`` argument so callers can route through a
|
|
42
|
+
service-account-owned connection when their environment
|
|
43
|
+
requires it.
|
|
44
|
+
- ``Client.connection_id`` already existed; it is now plumbed
|
|
45
|
+
through to ``_ai_generate_judge`` so a connection set at client
|
|
46
|
+
construction propagates to the judge SQL automatically.
|
|
47
|
+
- Live BigQuery integration tests for the LLM-judge AI.GENERATE
|
|
48
|
+
path (``tests/test_ai_generate_judge_live.py``). Skipped by
|
|
49
|
+
default; opt in with ``BQAA_RUN_LIVE_TESTS=1`` plus
|
|
50
|
+
``PROJECT_ID`` / ``DATASET_ID``. Three tests cover SQL parse
|
|
51
|
+
acceptance, expected result-schema column names, and the
|
|
52
|
+
``connection_id`` escape hatch when
|
|
53
|
+
``BQAA_AI_GENERATE_CONNECTION_ID`` is set. Catches the class of
|
|
54
|
+
mock-divergence bug that let the prior broken template ship.
|
|
55
|
+
- ``EvaluationReport.details["execution_mode"]`` is now populated
|
|
56
|
+
for LLM-as-Judge runs with one of ``ai_generate``,
|
|
57
|
+
``ml_generate_text``, ``api_fallback``, or ``no_op`` — matching
|
|
58
|
+
the value space the categorical evaluator already exposes. When
|
|
59
|
+
an earlier tier raised before a later tier succeeded,
|
|
60
|
+
``details["fallback_reason"]`` carries the chained exception
|
|
61
|
+
messages in attempt order, so CI and dashboards can audit which
|
|
62
|
+
path actually ran.
|
|
63
|
+
- ``evaluators.split_judge_prompt_template(prompt_template)`` is
|
|
64
|
+
the helper the SQL paths use to safely substitute the template
|
|
65
|
+
into ``CONCAT()``; exposed publicly for downstream code that
|
|
66
|
+
needs the same shape.
|
|
67
|
+
- ``bq-agent-sdk evaluate --exit-code`` FAIL lines now carry a
|
|
68
|
+
bounded ``feedback="…"`` snippet drawn from
|
|
69
|
+
``SessionScore.llm_feedback`` for LLM-judge failures. The
|
|
70
|
+
snippet collapses internal whitespace to a single space,
|
|
71
|
+
truncates to 120 characters with an ellipsis, and is omitted
|
|
72
|
+
entirely for code-based metrics (which leave ``llm_feedback``
|
|
73
|
+
empty). CI logs now explain *why* the judge said the session
|
|
74
|
+
failed without forcing the reader to chase the JSON output.
|
|
75
|
+
|
|
76
|
+
### Changed
|
|
77
|
+
|
|
78
|
+
- ``--strict`` help text and ``SDK.md §4`` clarified to match shipped
|
|
79
|
+
behavior. ``--strict`` is a *visibility* knob — it stamps
|
|
80
|
+
``details['parse_error']=True`` on AI.GENERATE/ML.GENERATE_TEXT
|
|
81
|
+
judge rows whose ``scores`` dict is empty, and adds a report-level
|
|
82
|
+
``parse_errors`` counter. It does **not** flip any session's
|
|
83
|
+
pass/fail outcome: both BQ-native judge methods compute ``passed``
|
|
84
|
+
as ``bool(scores) and all(...)``, so empty-scores rows already
|
|
85
|
+
fail without the flag. API-fallback parse errors coerce to
|
|
86
|
+
``score=0.0``, so they fail as low-score failures rather than
|
|
87
|
+
parse errors. For pass/fail-only CI consumers ``--strict`` is a
|
|
88
|
+
no-op; reach for it when a dashboard needs to tell "no parseable
|
|
89
|
+
score" apart from "low score."
|
|
90
|
+
|
|
91
|
+
## [0.2.2] - 2026-04-24
|
|
92
|
+
|
|
93
|
+
### Changed (breaking)
|
|
94
|
+
|
|
95
|
+
- **Prebuilt `CodeEvaluator` gates now compare raw observed values
|
|
96
|
+
directly against the user-supplied budget.** `CodeEvaluator.latency`,
|
|
97
|
+
`.turn_count`, `.error_rate`, `.token_efficiency`, `.ttft`, and
|
|
98
|
+
`.cost_per_session` return `1.0` when the observed metric is within
|
|
99
|
+
budget and `0.0` otherwise. The previous implementation scored sessions
|
|
100
|
+
on a normalized `1.0 - (observed / budget)` scale against a `0.5` pass
|
|
101
|
+
cutoff, which effectively fired every gate at roughly half the budget
|
|
102
|
+
the user typed (e.g. `latency(threshold_ms=5000)` failed sessions at
|
|
103
|
+
`avg_latency_ms > 2500`). Users relying on the old sub-budget fail
|
|
104
|
+
behavior should lower their budgets to match their intent.
|
|
105
|
+
- The scheduled streaming evaluator (`streaming_observability_v1`) uses
|
|
106
|
+
the same raw-budget gate semantics for consistency with the prebuilt
|
|
107
|
+
`CodeEvaluator` factories.
|
|
108
|
+
|
|
109
|
+
### Added
|
|
110
|
+
|
|
111
|
+
- `CodeEvaluator.add_metric` accepts `observed_key`, `observed_fn`, and
|
|
112
|
+
`budget` arguments that flow into `SessionScore.details[f"metric_{name}"]`
|
|
113
|
+
for downstream reporting. The CLI uses these to emit readable failure
|
|
114
|
+
lines without re-running the scorer.
|
|
115
|
+
- `bq-agent-sdk evaluate --exit-code` now prints a per-session failure
|
|
116
|
+
summary on stderr before exiting non-zero. Each line names the
|
|
117
|
+
session_id, failing metric, observed value, and the budget it exceeded.
|
|
118
|
+
through. Output is capped at the first 10 failing sessions to keep
|
|
119
|
+
CI logs scannable.
|
|
120
|
+
- `bq-agent-sdk categorical-eval` gains `--exit-code`,
|
|
121
|
+
`--min-pass-rate`, and `--pass-category METRIC=CATEGORY`
|
|
122
|
+
(repeatable) flags. Declare which classification counts as passing
|
|
123
|
+
per metric, set a minimum pass rate across the run, and fail CI when
|
|
124
|
+
any metric falls below it. Multiple pass categories per metric are
|
|
125
|
+
OR'd together (e.g. `--pass-category tone=positive --pass-category
|
|
126
|
+
tone=neutral`). Missing metric names warn on stderr without failing
|
|
127
|
+
the run so configuration mistakes are visible in CI logs.
|
|
128
|
+
|
|
129
|
+
## [0.2.1]
|
|
130
|
+
|
|
131
|
+
- See `git log` for prior changes.
|
|
@@ -275,7 +275,36 @@ print(report.summary())
|
|
|
275
275
|
|
|
276
276
|
### Strict Mode
|
|
277
277
|
|
|
278
|
-
|
|
278
|
+
`strict=True` adds **parse-error visibility** — it does not flip
|
|
279
|
+
any session's pass/fail outcome. Both BQ-native judge methods set
|
|
280
|
+
`passed = bool(scores) and all(score >= threshold for score in
|
|
281
|
+
scores.values())`, so a row whose `scores` dict is empty (the
|
|
282
|
+
judge model returned no parseable output) already fails. Without
|
|
283
|
+
`strict=True` you can't tell from the report whether a failed
|
|
284
|
+
session failed because the judge gave a low score or because the
|
|
285
|
+
judge gave nothing parseable at all.
|
|
286
|
+
|
|
287
|
+
`strict=True` walks the merged report and:
|
|
288
|
+
|
|
289
|
+
- Stamps `SessionScore.details["parse_error"] = True` on every
|
|
290
|
+
session whose `scores` dict is empty.
|
|
291
|
+
- Adds a report-level `details["parse_errors"]` count plus
|
|
292
|
+
`details["parse_error_rate"]` (fraction of `total_sessions`).
|
|
293
|
+
|
|
294
|
+
The API-fallback path coerces malformed model output to
|
|
295
|
+
`score=0.0` and always populates `scores`, so its failures look
|
|
296
|
+
like low-score failures rather than parse errors. `strict=True`
|
|
297
|
+
won't surface them as parse errors today; it's an AI.GENERATE /
|
|
298
|
+
ML.GENERATE_TEXT visibility knob in practice.
|
|
299
|
+
|
|
300
|
+
For pass/fail-only consumers (CI gates with `--exit-code`),
|
|
301
|
+
`strict=True` is a no-op. Reach for it when a dashboard or
|
|
302
|
+
investigation needs to distinguish "no parseable score" from
|
|
303
|
+
"low score" failures.
|
|
304
|
+
|
|
305
|
+
Operational counters are placed in `report.details` (not
|
|
306
|
+
`aggregate_scores`) so downstream consumers can treat scores as
|
|
307
|
+
purely normalized metrics:
|
|
279
308
|
|
|
280
309
|
```python
|
|
281
310
|
report = client.evaluate(
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# `examples/ci/`
|
|
2
|
+
|
|
3
|
+
Reference CI artifacts for agent quality gates backed by
|
|
4
|
+
BigQuery Agent Analytics.
|
|
5
|
+
|
|
6
|
+
## `evaluate_thresholds.yml`
|
|
7
|
+
|
|
8
|
+
Drop-in GitHub Actions workflow that runs four deterministic
|
|
9
|
+
budgets (latency, token usage, tool error rate, turn count) on
|
|
10
|
+
every PR, scoring the last 24 hours of production traces from an
|
|
11
|
+
`agent_events` BigQuery table. Exits non-zero when any session
|
|
12
|
+
breaches its budget, so a bad merge lights up the PR status
|
|
13
|
+
before code ships.
|
|
14
|
+
|
|
15
|
+
See the companion Medium post, *Your Agent Events Table Is Also a
|
|
16
|
+
Test Suite*, for the narrative, threshold-setting guidance, and
|
|
17
|
+
the companion categorical-eval gate that pairs naturally with
|
|
18
|
+
this workflow.
|
|
19
|
+
|
|
20
|
+
### Quick start
|
|
21
|
+
|
|
22
|
+
1. Copy `evaluate_thresholds.yml` to `.github/workflows/` in
|
|
23
|
+
your agent repo.
|
|
24
|
+
2. Set repository variables `PROJECT_ID` and `DATASET_ID` to the
|
|
25
|
+
GCP project + BigQuery dataset where your `agent_events` table
|
|
26
|
+
lives.
|
|
27
|
+
3. Set the repository secret `GCP_SA_KEY` to a service-account JSON
|
|
28
|
+
with `roles/bigquery.jobUser` on the project and `roles/bigquery.dataViewer` on the dataset.
|
|
29
|
+
4. Replace `calendar_assistant` with your agent's name in all four
|
|
30
|
+
`--agent-id` flags inside the workflow.
|
|
31
|
+
5. Tune the four `--threshold` numbers against your own production
|
|
32
|
+
distribution. A defensible starting point for each is "p95 of
|
|
33
|
+
the last 30 days + 10% buffer"; revisit after week one of CI
|
|
34
|
+
gating.
|
|
35
|
+
|
|
36
|
+
### Requirements
|
|
37
|
+
|
|
38
|
+
- `bigquery-agent-analytics >= 0.2.2` — earlier releases shipped
|
|
39
|
+
normalized `1.0 - observed/budget` gate scoring with a `0.5`
|
|
40
|
+
pass cutoff, which fired every gate at roughly half the budget
|
|
41
|
+
the user typed. 0.2.2 switched to raw-budget binary gates so
|
|
42
|
+
the `--threshold` value means what it says.
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# .github/workflows/evaluate_thresholds.yml
|
|
2
|
+
#
|
|
3
|
+
# Reference GitHub Actions workflow that gates every PR against the
|
|
4
|
+
# last 24 hours of production traces stored in an `agent_events`
|
|
5
|
+
# BigQuery table. Four deterministic budgets run as separate steps
|
|
6
|
+
# so a red PR status tells you which gate regressed.
|
|
7
|
+
#
|
|
8
|
+
# Companion to the Medium post "Your Agent Events Table Is Also a
|
|
9
|
+
# Test Suite." See the post for the narrative and for the sidebar
|
|
10
|
+
# on picking initial threshold values from 30-day production data.
|
|
11
|
+
#
|
|
12
|
+
# Requires bigquery-agent-analytics >= 0.2.2 — the first release
|
|
13
|
+
# with the raw-budget `--threshold` semantics and the tight
|
|
14
|
+
# `--exit-code` failure output this workflow depends on.
|
|
15
|
+
#
|
|
16
|
+
# To adopt this workflow in your own agent repo:
|
|
17
|
+
# 1. Copy this file to .github/workflows/evaluate_thresholds.yml.
|
|
18
|
+
# 2. Set repo variables PROJECT_ID and DATASET_ID to the GCP
|
|
19
|
+
# project + BigQuery dataset where your agent_events table
|
|
20
|
+
# lives.
|
|
21
|
+
# 3. Set the repo secret GCP_SA_KEY to a service account JSON
|
|
22
|
+
# with bigquery.jobUser on the project + bigquery.dataViewer on the dataset.
|
|
23
|
+
# 4. Replace `calendar_assistant` with your agent's name in all
|
|
24
|
+
# four --agent-id flags.
|
|
25
|
+
# 5. Tune the four --threshold numbers against your own
|
|
26
|
+
# production distribution. A defensible starting point for
|
|
27
|
+
# each is "p95 of last 30 days + 10% buffer"; revisit after
|
|
28
|
+
# week one of CI gating.
|
|
29
|
+
|
|
30
|
+
name: Agent quality gate
|
|
31
|
+
|
|
32
|
+
on:
|
|
33
|
+
pull_request:
|
|
34
|
+
paths:
|
|
35
|
+
- 'agents/**'
|
|
36
|
+
- 'prompts/**'
|
|
37
|
+
|
|
38
|
+
jobs:
|
|
39
|
+
gate:
|
|
40
|
+
runs-on: ubuntu-latest
|
|
41
|
+
steps:
|
|
42
|
+
- uses: actions/checkout@v4
|
|
43
|
+
- uses: actions/setup-python@v5
|
|
44
|
+
with: { python-version: '3.12' }
|
|
45
|
+
- run: pip install 'bigquery-agent-analytics>=0.2.2,<0.3.0'
|
|
46
|
+
- uses: google-github-actions/auth@v2
|
|
47
|
+
with: { credentials_json: '${{ secrets.GCP_SA_KEY }}' }
|
|
48
|
+
- name: Latency budget
|
|
49
|
+
run: >
|
|
50
|
+
bq-agent-sdk evaluate --evaluator=latency --threshold=5000
|
|
51
|
+
--last=24h --agent-id=calendar_assistant --exit-code
|
|
52
|
+
--project-id=${{ vars.PROJECT_ID }}
|
|
53
|
+
--dataset-id=${{ vars.DATASET_ID }}
|
|
54
|
+
- name: Token budget
|
|
55
|
+
# Tune this to your agent's real token distribution. A short
|
|
56
|
+
# system prompt + few-turn sessions will land in the low
|
|
57
|
+
# thousands; production agents with longer instructions and
|
|
58
|
+
# multi-turn tool chains typically want tens of thousands.
|
|
59
|
+
# Run `bq-agent-sdk evaluate --evaluator=token_efficiency
|
|
60
|
+
# --last=30d` without `--exit-code` once to see your own
|
|
61
|
+
# baseline before picking a number.
|
|
62
|
+
run: >
|
|
63
|
+
bq-agent-sdk evaluate --evaluator=token_efficiency --threshold=5000
|
|
64
|
+
--last=24h --agent-id=calendar_assistant --exit-code
|
|
65
|
+
--project-id=${{ vars.PROJECT_ID }}
|
|
66
|
+
--dataset-id=${{ vars.DATASET_ID }}
|
|
67
|
+
- name: Tool error rate
|
|
68
|
+
run: >
|
|
69
|
+
bq-agent-sdk evaluate --evaluator=error_rate --threshold=0.1
|
|
70
|
+
--last=24h --agent-id=calendar_assistant --exit-code
|
|
71
|
+
--project-id=${{ vars.PROJECT_ID }}
|
|
72
|
+
--dataset-id=${{ vars.DATASET_ID }}
|
|
73
|
+
- name: Turn count
|
|
74
|
+
run: >
|
|
75
|
+
bq-agent-sdk evaluate --evaluator=turn_count --threshold=10
|
|
76
|
+
--last=24h --agent-id=calendar_assistant --exit-code
|
|
77
|
+
--project-id=${{ vars.PROJECT_ID }}
|
|
78
|
+
--dataset-id=${{ vars.DATASET_ID }}
|
|
@@ -302,7 +302,14 @@ def evaluate(
|
|
|
302
302
|
),
|
|
303
303
|
strict: bool = typer.Option(
|
|
304
304
|
False,
|
|
305
|
-
help=
|
|
305
|
+
help=(
|
|
306
|
+
"Stamp parse-error metadata on AI.GENERATE judge rows with"
|
|
307
|
+
" empty or NULL typed output. Those rows already fail"
|
|
308
|
+
" (empty score < threshold); --strict adds"
|
|
309
|
+
" details['parse_error']=True and a report-level"
|
|
310
|
+
" parse_errors counter so dashboards can tell 'no"
|
|
311
|
+
" parseable score' apart from 'low score' failures."
|
|
312
|
+
),
|
|
306
313
|
),
|
|
307
314
|
endpoint: Optional[str] = typer.Option(
|
|
308
315
|
None,
|
|
@@ -368,6 +375,31 @@ def evaluate(
|
|
|
368
375
|
raise typer.Exit(code=2)
|
|
369
376
|
|
|
370
377
|
|
|
378
|
+
_FEEDBACK_SNIPPET_MAX = 120
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def _format_feedback_snippet(
|
|
382
|
+
feedback: Optional[str], max_chars: int = _FEEDBACK_SNIPPET_MAX
|
|
383
|
+
) -> Optional[str]:
|
|
384
|
+
"""Return a single-line, bounded snippet of an LLM-judge justification.
|
|
385
|
+
|
|
386
|
+
Collapses internal whitespace runs (including newlines) to a single
|
|
387
|
+
space so the snippet fits on one CI log line, then truncates to
|
|
388
|
+
``max_chars`` with a trailing ``…`` when the original was longer.
|
|
389
|
+
Returns ``None`` for empty / whitespace-only input so callers can
|
|
390
|
+
cleanly skip the field.
|
|
391
|
+
"""
|
|
392
|
+
if not feedback:
|
|
393
|
+
return None
|
|
394
|
+
collapsed = " ".join(feedback.split())
|
|
395
|
+
if not collapsed:
|
|
396
|
+
return None
|
|
397
|
+
if len(collapsed) <= max_chars:
|
|
398
|
+
return collapsed
|
|
399
|
+
# Reserve one char for the ellipsis to keep the visual width capped.
|
|
400
|
+
return collapsed[: max_chars - 1].rstrip() + "\u2026"
|
|
401
|
+
|
|
402
|
+
|
|
371
403
|
def _emit_evaluate_failures(
|
|
372
404
|
report: EvaluationReport, max_sessions: int = 10
|
|
373
405
|
) -> None:
|
|
@@ -377,10 +409,14 @@ def _emit_evaluate_failures(
|
|
|
377
409
|
Prefers the raw observed + budget pair (``CodeEvaluator`` prebuilts);
|
|
378
410
|
falls back to score + threshold when the metric didn't declare
|
|
379
411
|
observed/budget (custom ``add_metric`` users, ``LLMAsJudge``
|
|
380
|
-
criteria).
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
412
|
+
criteria). For LLM-judge failures the line also carries a bounded
|
|
413
|
+
``feedback="…"`` snippet drawn from ``SessionScore.llm_feedback``
|
|
414
|
+
so CI logs explain *why* the judge said the session failed without
|
|
415
|
+
forcing the reader to chase the JSON output.
|
|
416
|
+
|
|
417
|
+
A failing session is guaranteed to produce at least one FAIL line —
|
|
418
|
+
never just the summary header. Capped at the first ``max_sessions``
|
|
419
|
+
failures so CI logs stay scannable.
|
|
384
420
|
"""
|
|
385
421
|
failed = [s for s in report.session_scores if not s.passed]
|
|
386
422
|
if not failed:
|
|
@@ -393,6 +429,7 @@ def _emit_evaluate_failures(
|
|
|
393
429
|
)
|
|
394
430
|
shown = failed[:max_sessions]
|
|
395
431
|
for s in shown:
|
|
432
|
+
feedback_snippet = _format_feedback_snippet(s.llm_feedback)
|
|
396
433
|
emitted_for_session = False
|
|
397
434
|
for metric_name, score in s.scores.items():
|
|
398
435
|
detail = s.details.get(f"metric_{metric_name}") or {}
|
|
@@ -433,6 +470,12 @@ def _emit_evaluate_failures(
|
|
|
433
470
|
parts.append(f"score={score:.4g}")
|
|
434
471
|
if threshold is not None and isinstance(threshold, (int, float)):
|
|
435
472
|
parts.append(f"threshold={threshold:.4g}")
|
|
473
|
+
# LLM judges populate ``SessionScore.llm_feedback`` with the
|
|
474
|
+
# judge's justification. Surface a bounded snippet on the FAIL
|
|
475
|
+
# line so CI logs explain *why* without dumping the full JSON.
|
|
476
|
+
# Code-based metrics leave ``llm_feedback`` empty and skip this.
|
|
477
|
+
if feedback_snippet is not None:
|
|
478
|
+
parts.append(f'feedback="{feedback_snippet}"')
|
|
436
479
|
typer.echo(" " + " ".join(parts), err=True)
|
|
437
480
|
emitted_for_session = True
|
|
438
481
|
|
|
@@ -441,10 +484,12 @@ def _emit_evaluate_failures(
|
|
|
441
484
|
# while the session itself is flagged failed (a bug upstream) — we
|
|
442
485
|
# still point the reader at the session id.
|
|
443
486
|
if not emitted_for_session:
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
487
|
+
fallback = f" FAIL session={s.session_id}"
|
|
488
|
+
if feedback_snippet is not None:
|
|
489
|
+
fallback += f' feedback="{feedback_snippet}"'
|
|
490
|
+
else:
|
|
491
|
+
fallback += " (no per-metric detail available)"
|
|
492
|
+
typer.echo(fallback, err=True)
|
|
448
493
|
if len(failed) > len(shown):
|
|
449
494
|
typer.echo(
|
|
450
495
|
f" ... {len(failed) - len(shown)} more failing session(s) "
|
|
@@ -78,8 +78,10 @@ from .evaluators import DEFAULT_ENDPOINT
|
|
|
78
78
|
from .evaluators import EvaluationReport
|
|
79
79
|
from .evaluators import LLM_JUDGE_BATCH_QUERY
|
|
80
80
|
from .evaluators import LLMAsJudge
|
|
81
|
+
from .evaluators import render_ai_generate_judge_query
|
|
81
82
|
from .evaluators import SESSION_SUMMARY_QUERY
|
|
82
83
|
from .evaluators import SessionScore
|
|
84
|
+
from .evaluators import split_judge_prompt_template
|
|
83
85
|
from .feedback import AnalysisConfig
|
|
84
86
|
from .feedback import compute_drift
|
|
85
87
|
from .feedback import compute_question_distribution
|
|
@@ -975,14 +977,27 @@ class Client:
|
|
|
975
977
|
then falls back to the Gemini API. Each path evaluates
|
|
976
978
|
every criterion in the evaluator and merges the per-session
|
|
977
979
|
scores into a single report.
|
|
980
|
+
|
|
981
|
+
Stamps ``report.details["execution_mode"]`` with one of
|
|
982
|
+
``ai_generate``, ``ml_generate_text``, ``api_fallback`` (or ``no_op`` when the evaluator has no criteria) so the
|
|
983
|
+
caller (and CI gates) can audit which path actually ran.
|
|
984
|
+
When an earlier tier raised before a later tier succeeded,
|
|
985
|
+
``report.details["fallback_reason"]`` carries the chained
|
|
986
|
+
exception messages in attempt order. (The naming mirrors the
|
|
987
|
+
categorical evaluator's ``execution_mode`` value space for
|
|
988
|
+
consistency.)
|
|
978
989
|
"""
|
|
979
990
|
criteria = evaluator._criteria
|
|
980
991
|
if not criteria:
|
|
981
|
-
|
|
992
|
+
report = _build_report(
|
|
982
993
|
evaluator_name=evaluator.name,
|
|
983
994
|
dataset=f"{self._table_ref} WHERE {where}",
|
|
984
995
|
session_scores=[],
|
|
985
996
|
)
|
|
997
|
+
report.details["execution_mode"] = "no_op"
|
|
998
|
+
return report
|
|
999
|
+
|
|
1000
|
+
fallback_reasons: list[str] = []
|
|
986
1001
|
|
|
987
1002
|
# Try AI.GENERATE (new path) when endpoint is not a legacy ref
|
|
988
1003
|
if not self._is_legacy_model_ref(self.endpoint):
|
|
@@ -997,17 +1012,20 @@ class Client:
|
|
|
997
1012
|
params,
|
|
998
1013
|
)
|
|
999
1014
|
criterion_reports.append((criterion, report))
|
|
1000
|
-
|
|
1015
|
+
merged = _merge_criterion_reports(
|
|
1001
1016
|
evaluator.name,
|
|
1002
1017
|
f"{self._table_ref} WHERE {where}",
|
|
1003
1018
|
criteria,
|
|
1004
1019
|
criterion_reports,
|
|
1005
1020
|
)
|
|
1021
|
+
merged.details["execution_mode"] = "ai_generate"
|
|
1022
|
+
return merged
|
|
1006
1023
|
except Exception as e:
|
|
1007
1024
|
logger.debug(
|
|
1008
1025
|
"AI.GENERATE judge failed, trying legacy: %s",
|
|
1009
1026
|
e,
|
|
1010
1027
|
)
|
|
1028
|
+
fallback_reasons.append(f"ai_generate: {e}")
|
|
1011
1029
|
|
|
1012
1030
|
# Try legacy BQML batch evaluation
|
|
1013
1031
|
text_model = (
|
|
@@ -1028,20 +1046,29 @@ class Client:
|
|
|
1028
1046
|
text_model,
|
|
1029
1047
|
)
|
|
1030
1048
|
criterion_reports.append((criterion, report))
|
|
1031
|
-
|
|
1049
|
+
merged = _merge_criterion_reports(
|
|
1032
1050
|
evaluator.name,
|
|
1033
1051
|
f"{self._table_ref} WHERE {where}",
|
|
1034
1052
|
criteria,
|
|
1035
1053
|
criterion_reports,
|
|
1036
1054
|
)
|
|
1055
|
+
merged.details["execution_mode"] = "ml_generate_text"
|
|
1056
|
+
if fallback_reasons:
|
|
1057
|
+
merged.details["fallback_reason"] = "; ".join(fallback_reasons)
|
|
1058
|
+
return merged
|
|
1037
1059
|
except Exception as e:
|
|
1038
1060
|
logger.debug(
|
|
1039
1061
|
"BQML judge failed, falling back to API: %s",
|
|
1040
1062
|
e,
|
|
1041
1063
|
)
|
|
1064
|
+
fallback_reasons.append(f"ml_generate_text: {e}")
|
|
1042
1065
|
|
|
1043
1066
|
# Fallback: fetch traces using same table/filter, evaluate via API
|
|
1044
|
-
|
|
1067
|
+
api_report = self._api_judge(evaluator, table, where, params)
|
|
1068
|
+
api_report.details["execution_mode"] = "api_fallback"
|
|
1069
|
+
if fallback_reasons:
|
|
1070
|
+
api_report.details["fallback_reason"] = "; ".join(fallback_reasons)
|
|
1071
|
+
return api_report
|
|
1045
1072
|
|
|
1046
1073
|
def _ai_generate_judge(
|
|
1047
1074
|
self,
|
|
@@ -1054,20 +1081,22 @@ class Client:
|
|
|
1054
1081
|
"""Evaluates using BigQuery AI.GENERATE with typed output."""
|
|
1055
1082
|
from google.cloud import bigquery as bq
|
|
1056
1083
|
|
|
1084
|
+
prefix, middle, suffix = split_judge_prompt_template(
|
|
1085
|
+
criterion.prompt_template
|
|
1086
|
+
)
|
|
1057
1087
|
judge_params = list(params) + [
|
|
1058
|
-
bq.ScalarQueryParameter(
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
criterion.prompt_template.split("{trace_text}")[0],
|
|
1062
|
-
),
|
|
1088
|
+
bq.ScalarQueryParameter("judge_prompt_prefix", "STRING", prefix),
|
|
1089
|
+
bq.ScalarQueryParameter("judge_prompt_middle", "STRING", middle),
|
|
1090
|
+
bq.ScalarQueryParameter("judge_prompt_suffix", "STRING", suffix),
|
|
1063
1091
|
]
|
|
1064
1092
|
|
|
1065
|
-
query =
|
|
1093
|
+
query = render_ai_generate_judge_query(
|
|
1066
1094
|
project=self.project_id,
|
|
1067
1095
|
dataset=self.dataset_id,
|
|
1068
1096
|
table=table,
|
|
1069
1097
|
where=where,
|
|
1070
1098
|
endpoint=self.endpoint,
|
|
1099
|
+
connection_id=self.connection_id,
|
|
1071
1100
|
)
|
|
1072
1101
|
job_config = bq.QueryJobConfig(
|
|
1073
1102
|
query_parameters=judge_params,
|
|
@@ -1121,12 +1150,13 @@ class Client:
|
|
|
1121
1150
|
"""Evaluates using BigQuery ML.GENERATE_TEXT."""
|
|
1122
1151
|
from google.cloud import bigquery as bq
|
|
1123
1152
|
|
|
1153
|
+
prefix, middle, suffix = split_judge_prompt_template(
|
|
1154
|
+
criterion.prompt_template
|
|
1155
|
+
)
|
|
1124
1156
|
judge_params = list(params) + [
|
|
1125
|
-
bq.ScalarQueryParameter(
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
criterion.prompt_template.split("{trace_text}")[0],
|
|
1129
|
-
),
|
|
1157
|
+
bq.ScalarQueryParameter("judge_prompt_prefix", "STRING", prefix),
|
|
1158
|
+
bq.ScalarQueryParameter("judge_prompt_middle", "STRING", middle),
|
|
1159
|
+
bq.ScalarQueryParameter("judge_prompt_suffix", "STRING", suffix),
|
|
1130
1160
|
]
|
|
1131
1161
|
|
|
1132
1162
|
query = LLM_JUDGE_BATCH_QUERY.format(
|