benchmax 0.1.2.dev25__tar.gz → 0.1.2.dev27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/PKG-INFO +1 -1
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/pyproject.toml +1 -1
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/config.py +0 -5
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/base_env.py +25 -8
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/client.py +96 -20
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/namespace.py +52 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/source.py +34 -3
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/pipeline.py +27 -15
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/rubric.py +44 -2
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/PKG-INFO +1 -1
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/LICENSE +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/README.md +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/setup.cfg +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/bundle.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/crm/crm_env.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/crm/workdir/reward_fn.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/example_id.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/data_utils.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/excel_env.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/excel_code_runner_mcp.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/excel_utils.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/reward_fn.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/logging.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/math/math_env.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/math/workdir/reward_fn.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/example_workdir/demo_mcp_server.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/example_workdir/reward_fn.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/parallel_mcp_env.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/base_provisioner.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/local_provisioner.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/manual_provisioner.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/skypilot_provisioner.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/utils.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/proxy_server.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/server_pool.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/utils.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/postgres_search/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/postgres_search/linker_env.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/postgres_search/search_env.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/reward_helpers.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/types.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/wikipedia/utils.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/wikipedia/wiki_env.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/caller.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/clients.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/example_usage.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/inspector.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/models.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/pricing.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/credentials.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/exceptions.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/training_run.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/validation.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/prompts/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/prompts/tools.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/email.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/inspector.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/markdown.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/models.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/storage.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/client.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/files.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/filter_mapper.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/search.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/source.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/files.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/filter_mapper.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/index_client.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/search.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/source.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/client.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/exceptions.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/filter_mapper.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/models.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/search.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/source.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_client.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/builders.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/dsl_parser.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/search_exceptions.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/search_types.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/source.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/files.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/filter_mapper.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/search.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/clean_bodies.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/dedupe.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/filter_automated_email_qas.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/filter_automated_emails.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/mbox.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/schema.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/anchor_selector.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/auto_tune.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/batch_processor.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/checkpoint.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/corpus_capabilities.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/corpus_profile.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/deterministic_guards.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/env_rollout.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/grounding_llm.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/hop_count_validity.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/quality_gate.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/retrieval_llm.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/formatters/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/formatters/train_eval.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/generated_qa.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/generators/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/generators/direct_llm.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/helpers.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/metadata_linker.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/metrics.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/models.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/pipeline_config.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/protocols.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/query_rewriter.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/response_parsers.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/retrieval_query.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/scoring.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/search_agent_linker.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/storage.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/style_controls.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/transformers/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/transformers/base.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/transformers/dedup.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/wiki_builder.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/wiki_chunk_linker.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/_utils.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/adaptive.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/cache.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/prompts.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/reward_fns.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/adapter.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/braintrust/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/braintrust/adapter.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/braintrust/message_extraction.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/http.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/pipeline.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/pivot.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/processing.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/registry.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/utils/__init__.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/utils/checkpoint.py +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/SOURCES.txt +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/dependency_links.txt +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/requires.txt +0 -0
- {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/top_level.txt +0 -0
|
@@ -38,8 +38,3 @@ def web_app_url() -> str:
|
|
|
38
38
|
def llm_url() -> str:
|
|
39
39
|
"""OpenAI-compatible LLM endpoint hosted by the platform."""
|
|
40
40
|
return os.environ.get("CASTFORM_LLM_URL") or f"https://llm.{base_domain()}/v1"
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def rollout_url() -> str:
|
|
44
|
-
"""Rollout / inference server."""
|
|
45
|
-
return os.environ.get("CASTFORM_ROLLOUT_URL") or f"https://autobots.{base_domain()}"
|
|
@@ -88,20 +88,37 @@ class BaseEnv(ABC):
|
|
|
88
88
|
)
|
|
89
89
|
|
|
90
90
|
@classmethod
|
|
91
|
-
def playground_preprocess(
|
|
92
|
-
|
|
91
|
+
def playground_preprocess(
|
|
92
|
+
cls,
|
|
93
|
+
prompt: str | None = None,
|
|
94
|
+
messages: Messages | None = None,
|
|
95
|
+
**kwargs: Any,
|
|
96
|
+
) -> Example:
|
|
97
|
+
"""Wrap a playground input into an :class:`Example`.
|
|
98
|
+
|
|
99
|
+
Accepts either ``prompt`` (single user string — the typical one-shot
|
|
100
|
+
chat case) or ``messages`` (a full chat list, used when replaying a
|
|
101
|
+
multi-turn eval prompt). Exactly one must be provided.
|
|
93
102
|
|
|
94
103
|
Classmethod (like :meth:`dataset_preprocess`), reading the static
|
|
95
|
-
``cls.system_prompt`` class attribute — so a
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
104
|
+
``cls.system_prompt`` class attribute — so a playground input is
|
|
105
|
+
preprocessed without constructing an env instance, and the system
|
|
106
|
+
prompt matches what training uses. ``cls.system_prompt`` is prepended
|
|
107
|
+
unless the caller already supplied a system message (a replayed eval
|
|
108
|
+
prompt typically does). ``task=None`` — the rollout worker skips
|
|
99
109
|
reward computation for playground examples.
|
|
100
110
|
"""
|
|
111
|
+
if messages is None:
|
|
112
|
+
if not prompt:
|
|
113
|
+
raise ValueError(
|
|
114
|
+
"playground_preprocess requires either 'prompt' or 'messages'"
|
|
115
|
+
)
|
|
116
|
+
messages = [{"role": "user", "content": prompt}]
|
|
117
|
+
has_system = any(m.get("role") == "system" for m in messages)
|
|
101
118
|
return make_example(
|
|
102
|
-
prompt_messages=
|
|
119
|
+
prompt_messages=messages,
|
|
103
120
|
task=None,
|
|
104
|
-
system_prompt=cls.system_prompt,
|
|
121
|
+
system_prompt=None if has_system else cls.system_prompt,
|
|
105
122
|
)
|
|
106
123
|
|
|
107
124
|
@classmethod
|
|
@@ -29,7 +29,9 @@ from .exceptions import (
|
|
|
29
29
|
)
|
|
30
30
|
|
|
31
31
|
if TYPE_CHECKING:
|
|
32
|
-
|
|
32
|
+
from types import ModuleType
|
|
33
|
+
|
|
34
|
+
from benchmax.envs.base_env import BaseEnv
|
|
33
35
|
|
|
34
36
|
|
|
35
37
|
@dataclass(frozen=True)
|
|
@@ -279,7 +281,9 @@ class StorageClient:
|
|
|
279
281
|
# Stream from disk instead of read_bytes() to keep memory bounded for
|
|
280
282
|
# multi-GB datasets. httpx infers Content-Length from the file size.
|
|
281
283
|
url_response = self._get_upload_url(
|
|
282
|
-
path,
|
|
284
|
+
path,
|
|
285
|
+
mime_type,
|
|
286
|
+
expires_in_minutes=expires_in_minutes,
|
|
283
287
|
)
|
|
284
288
|
with file_path.open("rb") as fh:
|
|
285
289
|
self._put_to_signed_url(url_response["uploadUrl"], fh, mime_type)
|
|
@@ -437,7 +441,11 @@ class TrainerClient:
|
|
|
437
441
|
specs = self.list_launch_args()
|
|
438
442
|
print(_hdr("Launch args accepted by POST /train/runs/launch"))
|
|
439
443
|
for spec in specs:
|
|
440
|
-
req =
|
|
444
|
+
req = (
|
|
445
|
+
_RED + "required" + _RESET
|
|
446
|
+
if spec.required
|
|
447
|
+
else _CYAN + "optional" + _RESET
|
|
448
|
+
)
|
|
441
449
|
header = f" {_BOLD}{spec.name}{_RESET} ({spec.type}, {req})"
|
|
442
450
|
bits: list[str] = []
|
|
443
451
|
if spec.default is not None:
|
|
@@ -652,7 +660,9 @@ def _print_event(
|
|
|
652
660
|
tool_text,
|
|
653
661
|
)
|
|
654
662
|
else:
|
|
655
|
-
preview = textwrap.shorten(
|
|
663
|
+
preview = textwrap.shorten(
|
|
664
|
+
tool_text, width=120, placeholder="…"
|
|
665
|
+
)
|
|
656
666
|
print(
|
|
657
667
|
f"{prefix} → message [{role}/tool_result] "
|
|
658
668
|
f"(chars={len(tool_text)}): {preview}"
|
|
@@ -690,7 +700,13 @@ def _print_event(
|
|
|
690
700
|
|
|
691
701
|
|
|
692
702
|
class RolloutClient:
|
|
693
|
-
"""Thin synchronous client for the
|
|
703
|
+
"""Thin synchronous client for the rollout-stream endpoint.
|
|
704
|
+
|
|
705
|
+
Rollouts are reached through platform-service. platform-service is the API-key
|
|
706
|
+
gate: it validates the ``sk_`` key and mints a short-lived act_as JWT that
|
|
707
|
+
rollout-service accepts (rollout-service's own auth only takes
|
|
708
|
+
auth-service-minted JWTs — never a raw platform key). The proxy is mounted at
|
|
709
|
+
``/v1/rollout/stream``.
|
|
694
710
|
|
|
695
711
|
Supports two ways to provide the environment:
|
|
696
712
|
|
|
@@ -700,8 +716,11 @@ class RolloutClient:
|
|
|
700
716
|
raw file contents; they will be base64-encoded and sent inline.
|
|
701
717
|
|
|
702
718
|
Args:
|
|
703
|
-
api_key:
|
|
704
|
-
|
|
719
|
+
api_key: Platform API key (``sk_``); forwarded as the Bearer token
|
|
720
|
+
platform-service validates.
|
|
721
|
+
server_url: Base URL of platform-service. Defaults to
|
|
722
|
+
``config.platform_url()``; the ``/v1/rollout/stream`` path is
|
|
723
|
+
appended per request.
|
|
705
724
|
timeout: Per-request timeout in seconds (default 300 — rollouts can be slow).
|
|
706
725
|
"""
|
|
707
726
|
|
|
@@ -716,7 +735,9 @@ class RolloutClient:
|
|
|
716
735
|
self._api_key = api_key
|
|
717
736
|
# Resolve at construction time, not import time, so env-var changes
|
|
718
737
|
# take effect (mirrors StorageClient/TrainerClient default_factory pattern).
|
|
719
|
-
|
|
738
|
+
# Target platform-service (the API-key gate), not the rollout-service
|
|
739
|
+
# host directly — see the class docstring for why.
|
|
740
|
+
self._server_url = (server_url or config.platform_url()).rstrip("/")
|
|
720
741
|
self._timeout = timeout
|
|
721
742
|
|
|
722
743
|
@staticmethod
|
|
@@ -734,7 +755,9 @@ class RolloutClient:
|
|
|
734
755
|
has_bytes = env_cls_bytes is not None and env_metadata_bytes is not None
|
|
735
756
|
|
|
736
757
|
if has_paths and has_bytes:
|
|
737
|
-
raise ValueError(
|
|
758
|
+
raise ValueError(
|
|
759
|
+
"Provide either blob paths or raw bytes for the env, not both."
|
|
760
|
+
)
|
|
738
761
|
if not has_paths and not has_bytes:
|
|
739
762
|
raise ValueError(
|
|
740
763
|
"Provide either (env_cls_path, env_metadata_path) or "
|
|
@@ -844,7 +867,9 @@ class RolloutClient:
|
|
|
844
867
|
},
|
|
845
868
|
}
|
|
846
869
|
|
|
847
|
-
|
|
870
|
+
# platform-service mounts the proxy at /v1/rollout/stream; it validates
|
|
871
|
+
# the platform key and forwards to rollout-service with an act_as JWT.
|
|
872
|
+
url = f"{self._server_url}/v1/rollout/stream"
|
|
848
873
|
headers = {"Authorization": f"Bearer {self._api_key}"}
|
|
849
874
|
|
|
850
875
|
with httpx.stream(
|
|
@@ -858,7 +883,10 @@ class RolloutClient:
|
|
|
858
883
|
body = response.read().decode()
|
|
859
884
|
# Typed errors so callers can distinguish retryable from
|
|
860
885
|
# caller-fix from auth-fix without parsing exception messages.
|
|
861
|
-
|
|
886
|
+
# 403 too: rollouts route through platform-service's optionalAuth
|
|
887
|
+
# gate, which rejects a missing/invalid/expired key as 403
|
|
888
|
+
# ("sign in to run rollouts") rather than 401 — same fix (the key).
|
|
889
|
+
if response.status_code in (401, 403):
|
|
862
890
|
raise AuthenticationError(body[:300], response.status_code)
|
|
863
891
|
if response.status_code == 404:
|
|
864
892
|
raise RolloutNotFound(body[:300], response.status_code)
|
|
@@ -922,6 +950,10 @@ class RolloutClient:
|
|
|
922
950
|
env_metadata_path: str | None = None,
|
|
923
951
|
n: int = 2,
|
|
924
952
|
*,
|
|
953
|
+
env_class: type[BaseEnv] | None = None,
|
|
954
|
+
constructor_args: dict[str, Any] | None = None,
|
|
955
|
+
pip_dependencies: list[str] | None = None,
|
|
956
|
+
local_modules: list[ModuleType] | None = None,
|
|
925
957
|
env_cls_bytes: bytes | None = None,
|
|
926
958
|
env_metadata_bytes: bytes | None = None,
|
|
927
959
|
llm_model: str = _VALIDATION_MODEL,
|
|
@@ -930,14 +962,22 @@ class RolloutClient:
|
|
|
930
962
|
) -> ValidationResult:
|
|
931
963
|
"""Run rollouts on the first *n* examples and report pass/fail.
|
|
932
964
|
|
|
933
|
-
The environment can be specified
|
|
934
|
-
(
|
|
965
|
+
The environment can be specified three ways (mutually exclusive): an
|
|
966
|
+
**env class** (bundled to bytes here, so validation needs no prior
|
|
967
|
+
upload — preferred for a pre-launch smoke test), **blob paths** to an
|
|
968
|
+
already-uploaded env, or **raw bytes** (see class docstring).
|
|
935
969
|
|
|
936
970
|
Args:
|
|
937
971
|
examples: Full dataset (list of raw dicts).
|
|
938
972
|
env_cls_path: Blob path to the uploaded env .pkl file.
|
|
939
973
|
env_metadata_path: Blob path to the uploaded env-meta .json file.
|
|
940
974
|
n: Number of examples to validate (default 2).
|
|
975
|
+
env_class: BaseEnv subclass to bundle and validate without
|
|
976
|
+
uploading. Mutually exclusive with paths/bytes.
|
|
977
|
+
constructor_args: kwargs baked into the env bundle (env_class only).
|
|
978
|
+
pip_dependencies: Pip deps recorded in the bundle (env_class only).
|
|
979
|
+
local_modules: Modules to pickle by-value (env_class only; for
|
|
980
|
+
envs that import from local .py files).
|
|
941
981
|
env_cls_bytes: Raw bytes of the pickled env class (will be base64-encoded).
|
|
942
982
|
env_metadata_bytes: Raw bytes of the env metadata JSON (will be base64-encoded).
|
|
943
983
|
verbose: Print colored progress to stdout (default True for
|
|
@@ -949,12 +989,39 @@ class RolloutClient:
|
|
|
949
989
|
"did everything pass" check, with per-example detail in
|
|
950
990
|
``result.examples`` for richer reporting.
|
|
951
991
|
"""
|
|
992
|
+
# An env class is bundled to bytes here so validation can run a smoke
|
|
993
|
+
# test BEFORE uploading anything (the launch flow uploads only after
|
|
994
|
+
# validation passes). Mutually exclusive with explicit paths/bytes.
|
|
995
|
+
if env_class is not None:
|
|
996
|
+
if any(
|
|
997
|
+
(env_cls_path, env_metadata_path, env_cls_bytes, env_metadata_bytes)
|
|
998
|
+
):
|
|
999
|
+
raise ValueError(
|
|
1000
|
+
"Provide env_class OR explicit env paths/bytes, not both."
|
|
1001
|
+
)
|
|
1002
|
+
from benchmax.bundle import dump_bundle
|
|
1003
|
+
|
|
1004
|
+
bundle = dump_bundle(
|
|
1005
|
+
env_class,
|
|
1006
|
+
constructor_args=constructor_args,
|
|
1007
|
+
pip_dependencies=pip_dependencies,
|
|
1008
|
+
local_modules=local_modules,
|
|
1009
|
+
)
|
|
1010
|
+
env_cls_bytes = bundle.pickled
|
|
1011
|
+
env_metadata_bytes = bundle.metadata.to_json_bytes()
|
|
1012
|
+
|
|
952
1013
|
# Validate env args early so we fail before running any rollouts.
|
|
953
|
-
self._build_env(
|
|
1014
|
+
self._build_env(
|
|
1015
|
+
env_cls_path, env_metadata_path, env_cls_bytes, env_metadata_bytes
|
|
1016
|
+
)
|
|
954
1017
|
|
|
955
1018
|
sample = examples[:n]
|
|
956
1019
|
if verbose:
|
|
957
|
-
print(
|
|
1020
|
+
print(
|
|
1021
|
+
_hdr(
|
|
1022
|
+
f"── Remote validation: {len(sample)} example(s) on {llm_model} ──"
|
|
1023
|
+
)
|
|
1024
|
+
)
|
|
958
1025
|
|
|
959
1026
|
per_example: list[ExampleValidation] = []
|
|
960
1027
|
for i, example in enumerate(sample):
|
|
@@ -972,10 +1039,15 @@ class RolloutClient:
|
|
|
972
1039
|
max_turns=max_turns,
|
|
973
1040
|
)
|
|
974
1041
|
ok = bool(final.get("success"))
|
|
975
|
-
per_example.append(
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
1042
|
+
per_example.append(
|
|
1043
|
+
ExampleValidation(
|
|
1044
|
+
index=i,
|
|
1045
|
+
ok=ok,
|
|
1046
|
+
error=None
|
|
1047
|
+
if ok
|
|
1048
|
+
else (final.get("error") or "rollout reported success=False"),
|
|
1049
|
+
)
|
|
1050
|
+
)
|
|
979
1051
|
except (RolloutError, RuntimeError) as exc:
|
|
980
1052
|
if verbose:
|
|
981
1053
|
print(_err(f" Example {i} failed: {exc}"))
|
|
@@ -987,6 +1059,10 @@ class RolloutClient:
|
|
|
987
1059
|
if result.ok:
|
|
988
1060
|
print(_ok("Remote validation passed"))
|
|
989
1061
|
else:
|
|
990
|
-
print(
|
|
1062
|
+
print(
|
|
1063
|
+
_err(
|
|
1064
|
+
"Remote validation failed — check output above before launching a full job"
|
|
1065
|
+
)
|
|
1066
|
+
)
|
|
991
1067
|
|
|
992
1068
|
return result
|
{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/namespace.py
RENAMED
|
@@ -218,6 +218,29 @@ class TpufNamespace:
|
|
|
218
218
|
|
|
219
219
|
return len(all_chunks)
|
|
220
220
|
|
|
221
|
+
# ------------------------------------------------------------------
|
|
222
|
+
# Namespace metadata
|
|
223
|
+
# ------------------------------------------------------------------
|
|
224
|
+
|
|
225
|
+
def get_approx_row_count(self) -> int | None:
|
|
226
|
+
"""Return the approximate row count from namespace metadata.
|
|
227
|
+
|
|
228
|
+
Uses the tpuf metadata endpoint which returns ``approx_row_count``.
|
|
229
|
+
Unlike ``get_max_id()``, this reflects actual rows (accounting for
|
|
230
|
+
deletions) rather than the highest assigned ID.
|
|
231
|
+
"""
|
|
232
|
+
try:
|
|
233
|
+
meta = self._ns.metadata()
|
|
234
|
+
count = getattr(meta, "approx_row_count", None)
|
|
235
|
+
if isinstance(count, int):
|
|
236
|
+
return count
|
|
237
|
+
# Fallback: some SDK versions return a dict
|
|
238
|
+
if isinstance(meta, dict):
|
|
239
|
+
return meta.get("approx_row_count")
|
|
240
|
+
return None
|
|
241
|
+
except Exception:
|
|
242
|
+
return None
|
|
243
|
+
|
|
221
244
|
# ------------------------------------------------------------------
|
|
222
245
|
# ID pagination
|
|
223
246
|
# ------------------------------------------------------------------
|
|
@@ -237,6 +260,35 @@ class TpufNamespace:
|
|
|
237
260
|
return None
|
|
238
261
|
return rows[0].id
|
|
239
262
|
|
|
263
|
+
def scan_all_rows(self, limit: int | None = None, page_size: int = 10_000) -> list[Any]:
|
|
264
|
+
"""Sequentially scan all rows with attributes via cursor pagination.
|
|
265
|
+
|
|
266
|
+
Much faster than random-ID sampling for large fetches — single pass,
|
|
267
|
+
no retries, no ID collisions. Returns up to ``limit`` rows (all if
|
|
268
|
+
None).
|
|
269
|
+
"""
|
|
270
|
+
all_rows: list[Any] = []
|
|
271
|
+
last_id = 0
|
|
272
|
+
|
|
273
|
+
while True:
|
|
274
|
+
result = self._ns.query(
|
|
275
|
+
rank_by=["id", "asc"],
|
|
276
|
+
filters=["id", "Gt", last_id],
|
|
277
|
+
top_k=page_size,
|
|
278
|
+
include_attributes=True,
|
|
279
|
+
)
|
|
280
|
+
rows = result.rows
|
|
281
|
+
if not rows:
|
|
282
|
+
break
|
|
283
|
+
all_rows.extend(rows)
|
|
284
|
+
last_id = rows[-1].id
|
|
285
|
+
if limit is not None and len(all_rows) >= limit:
|
|
286
|
+
return all_rows[:limit]
|
|
287
|
+
if len(rows) < page_size:
|
|
288
|
+
break
|
|
289
|
+
|
|
290
|
+
return all_rows
|
|
291
|
+
|
|
240
292
|
def paginate_all_ids(self, page_size: int = 1000) -> list[int]:
|
|
241
293
|
"""Return all row IDs in the namespace via cursor pagination."""
|
|
242
294
|
all_ids: list[int] = []
|
|
@@ -198,9 +198,37 @@ class TpufChunkSource:
|
|
|
198
198
|
# ------------------------------------------------------------------
|
|
199
199
|
|
|
200
200
|
def get_chunk_count(self) -> int:
|
|
201
|
-
"""Return the total number of chunks in the namespace.
|
|
201
|
+
"""Return the total number of chunks in the namespace.
|
|
202
|
+
|
|
203
|
+
Prefers ``approx_row_count`` from the metadata endpoint (reflects
|
|
204
|
+
actual rows after deletions). Falls back to ``get_max_id()`` which
|
|
205
|
+
can over-count in sparse namespaces.
|
|
206
|
+
"""
|
|
207
|
+
approx = self._client.get_approx_row_count()
|
|
208
|
+
if approx is not None:
|
|
209
|
+
return approx
|
|
202
210
|
return self._client.get_max_id() or 0
|
|
203
211
|
|
|
212
|
+
def scan_chunks(self, limit: int | None = None, min_chars: int = 0) -> list[Chunk]:
|
|
213
|
+
"""Sequentially scan chunks via cursor pagination.
|
|
214
|
+
|
|
215
|
+
Much faster than ``sample_chunks`` for large fetches (single pass, no
|
|
216
|
+
retries). Returns chunks in ID order, not random. Use this when you
|
|
217
|
+
need most or all of the namespace (e.g. materialization).
|
|
218
|
+
"""
|
|
219
|
+
# Over-fetch to account for min_chars filtering
|
|
220
|
+
fetch_limit = None if limit is None else int(limit * (3 if min_chars > 0 else 1.1))
|
|
221
|
+
rows = self._client.scan_all_rows(limit=fetch_limit)
|
|
222
|
+
collected: list[Chunk] = []
|
|
223
|
+
for row in rows:
|
|
224
|
+
chunk = self._client.row_to_chunk(row)
|
|
225
|
+
if min_chars > 0 and len(chunk.content) < min_chars:
|
|
226
|
+
continue
|
|
227
|
+
collected.append(chunk)
|
|
228
|
+
if limit is not None and len(collected) >= limit:
|
|
229
|
+
break
|
|
230
|
+
return collected
|
|
231
|
+
|
|
204
232
|
def sample_chunks(self, n: int, min_chars: int = 0) -> list[Chunk]:
|
|
205
233
|
"""Return n randomly sampled chunks, optionally filtered by minimum length.
|
|
206
234
|
|
|
@@ -357,8 +385,11 @@ class TpufChunkSource:
|
|
|
357
385
|
return []
|
|
358
386
|
|
|
359
387
|
# Skip expensive full-namespace pagination for large namespaces.
|
|
360
|
-
# Use
|
|
361
|
-
#
|
|
388
|
+
# Use approx_row_count (actual rows) rather than paginating all IDs
|
|
389
|
+
# just to count them — that's O(N) API calls for large namespaces.
|
|
390
|
+
chunk_count = self.get_chunk_count()
|
|
391
|
+
if chunk_count > 50_000:
|
|
392
|
+
return []
|
|
362
393
|
all_ids = self._client.paginate_all_ids()
|
|
363
394
|
if len(all_ids) > 50_000:
|
|
364
395
|
return []
|
|
@@ -1332,30 +1332,42 @@ class Pipeline:
|
|
|
1332
1332
|
# resolve any chunk by hash.
|
|
1333
1333
|
max_materialize = 50_000
|
|
1334
1334
|
if getattr(source, "collection", None) is None and chunk_count > 0:
|
|
1335
|
-
|
|
1336
|
-
from benchmax.rag.chunkers.models import ChunkCollection # noqa: PLC0415
|
|
1335
|
+
from benchmax.rag.chunkers.models import ChunkCollection # noqa: PLC0415
|
|
1337
1336
|
|
|
1337
|
+
materialize_count = min(chunk_count, max_materialize)
|
|
1338
|
+
if chunk_count > max_materialize:
|
|
1339
|
+
logger.warning(
|
|
1340
|
+
"Corpus has %d chunks (limit %d). Materialising a capped "
|
|
1341
|
+
"sample so entity extraction and the chunk graph still work.",
|
|
1342
|
+
chunk_count,
|
|
1343
|
+
max_materialize,
|
|
1344
|
+
)
|
|
1345
|
+
else:
|
|
1338
1346
|
logger.info(
|
|
1339
1347
|
"Materialising %d chunks from API backend into memory...",
|
|
1340
1348
|
chunk_count,
|
|
1341
1349
|
)
|
|
1342
|
-
|
|
1343
|
-
|
|
1350
|
+
|
|
1351
|
+
# Use sequential scan when available — cursor pagination avoids
|
|
1352
|
+
# the ID-collision overhead of random sampling at high fill rates.
|
|
1353
|
+
# ~1.9x faster for 50k chunks from a 65k namespace.
|
|
1354
|
+
if hasattr(source, "scan_chunks"):
|
|
1355
|
+
all_chunks = source.scan_chunks(
|
|
1356
|
+
limit=materialize_count,
|
|
1344
1357
|
min_chars=cfg.corpus.min_chunk_chars,
|
|
1345
1358
|
)
|
|
1346
|
-
if all_chunks:
|
|
1347
|
-
source.collection = ChunkCollection(chunks=all_chunks) # type: ignore[attr-defined]
|
|
1348
|
-
logger.info(
|
|
1349
|
-
"Cached %d/%d chunks on source.collection",
|
|
1350
|
-
len(all_chunks),
|
|
1351
|
-
chunk_count,
|
|
1352
|
-
)
|
|
1353
1359
|
else:
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1360
|
+
all_chunks = source.sample_chunks(
|
|
1361
|
+
materialize_count,
|
|
1362
|
+
min_chars=cfg.corpus.min_chunk_chars,
|
|
1363
|
+
)
|
|
1364
|
+
|
|
1365
|
+
if all_chunks:
|
|
1366
|
+
source.collection = ChunkCollection(chunks=all_chunks) # type: ignore[attr-defined]
|
|
1367
|
+
logger.info(
|
|
1368
|
+
"Cached %d/%d chunks on source.collection",
|
|
1369
|
+
len(all_chunks),
|
|
1357
1370
|
chunk_count,
|
|
1358
|
-
max_materialize,
|
|
1359
1371
|
)
|
|
1360
1372
|
|
|
1361
1373
|
profile_sample = diverse_profile_sample(
|
|
@@ -1,9 +1,12 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
import os
|
|
2
3
|
from dataclasses import dataclass
|
|
3
4
|
from typing import Any, Dict, List, Literal, Optional
|
|
4
5
|
|
|
5
6
|
from openai import AsyncOpenAI
|
|
6
7
|
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
7
10
|
from benchmax.platform.credentials import platform_bearer
|
|
8
11
|
|
|
9
12
|
from ._utils import _extract_json
|
|
@@ -76,6 +79,7 @@ async def evaluate_single_rubric(
|
|
|
76
79
|
ground_truth: Optional[str] = None,
|
|
77
80
|
api_key: str = "",
|
|
78
81
|
timeout: Optional[float] = None,
|
|
82
|
+
enable_logging: bool = True,
|
|
79
83
|
) -> Dict[str, Any]:
|
|
80
84
|
"""
|
|
81
85
|
Evaluate a single response against a single rubric.
|
|
@@ -146,11 +150,26 @@ async def evaluate_single_rubric(
|
|
|
146
150
|
return {"score": 0, "reasoning": "Empty response", "llm_output": ""}
|
|
147
151
|
|
|
148
152
|
result = _extract_json(content)
|
|
149
|
-
|
|
153
|
+
out = {
|
|
150
154
|
"score": result.get("score", 0),
|
|
151
155
|
"reasoning": result.get("reasoning", ""),
|
|
152
156
|
"llm_output": content,
|
|
153
157
|
}
|
|
158
|
+
if enable_logging:
|
|
159
|
+
logger.info(
|
|
160
|
+
"\n┌─ rubric: %s ─────────────────────\n"
|
|
161
|
+
"│ ground_truth : %s\n"
|
|
162
|
+
"│ score : %s\n"
|
|
163
|
+
"│ reasoning : %s\n"
|
|
164
|
+
"│ llm_output :\n%s\n"
|
|
165
|
+
"└──────────────────────────────────────────────────",
|
|
166
|
+
rubric.title,
|
|
167
|
+
(ground_truth or "").strip() or "(none)",
|
|
168
|
+
out["score"],
|
|
169
|
+
out["reasoning"],
|
|
170
|
+
content,
|
|
171
|
+
)
|
|
172
|
+
return out
|
|
154
173
|
|
|
155
174
|
except Exception as e:
|
|
156
175
|
print(f"Error evaluating rubric '{rubric.title}': {e}\njudge output:\n{content}")
|
|
@@ -166,6 +185,7 @@ async def evaluate_rubric_ranking(
|
|
|
166
185
|
api_key: str = "",
|
|
167
186
|
timeout: Optional[float] = None,
|
|
168
187
|
ground_truth: Optional[str] = None,
|
|
188
|
+
enable_logging: bool = True,
|
|
169
189
|
) -> Dict[str, Any]:
|
|
170
190
|
"""
|
|
171
191
|
Rank N responses against a single rubric in one judge call and convert the
|
|
@@ -276,12 +296,34 @@ async def evaluate_rubric_ranking(
|
|
|
276
296
|
for j, p in pos_of.items():
|
|
277
297
|
scores[nonempty[j][0]] = 1.0 - p / max_pos if max_pos > 0 else 1.0
|
|
278
298
|
|
|
279
|
-
|
|
299
|
+
out = {
|
|
280
300
|
"scores": scores,
|
|
281
301
|
"ranking": ranking,
|
|
282
302
|
"reasoning": result.get("reasoning", ""),
|
|
283
303
|
"llm_output": content,
|
|
284
304
|
}
|
|
305
|
+
if enable_logging:
|
|
306
|
+
scores_fmt = " ".join(f"[{i}]={s:.3f}" for i, s in enumerate(scores))
|
|
307
|
+
ranking_fmt = " > ".join(
|
|
308
|
+
f"[{', '.join(str(j) for j in tier)}]" if isinstance(tier, list) else str(tier)
|
|
309
|
+
for tier in ranking
|
|
310
|
+
)
|
|
311
|
+
logger.info(
|
|
312
|
+
"\n┌─ ranked rubric: %s ────────────────────\n"
|
|
313
|
+
"│ ground_truth : %s\n"
|
|
314
|
+
"│ ranking : %s\n"
|
|
315
|
+
"│ scores : %s\n"
|
|
316
|
+
"│ reasoning : %s\n"
|
|
317
|
+
"│ llm_output :\n%s\n"
|
|
318
|
+
"└──────────────────────────────────────────────────",
|
|
319
|
+
rubric.title,
|
|
320
|
+
(ground_truth or "").strip() or "(none)",
|
|
321
|
+
ranking_fmt or "(empty)",
|
|
322
|
+
scores_fmt,
|
|
323
|
+
out["reasoning"],
|
|
324
|
+
content,
|
|
325
|
+
)
|
|
326
|
+
return out
|
|
285
327
|
except Exception as e:
|
|
286
328
|
print(f"Error ranking rubric '{rubric.title}': {e}\njudge output:\n{content}")
|
|
287
329
|
return {"scores": scores, "ranking": [], "reasoning": f"Error: {e}", "llm_output": content}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/excel_utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/example_workdir/reward_fn.py
RENAMED
|
File without changes
|
|
File without changes
|
{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/__init__.py
RENAMED
|
File without changes
|
{benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/base_provisioner.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|