benchmax 0.1.2.dev25__tar.gz → 0.1.2.dev27__tar.gz

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (165)
  1. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/PKG-INFO +1 -1
  2. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/pyproject.toml +1 -1
  3. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/config.py +0 -5
  4. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/base_env.py +25 -8
  5. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/client.py +96 -20
  6. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/namespace.py +52 -0
  7. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/source.py +34 -3
  8. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/pipeline.py +27 -15
  9. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/rubric.py +44 -2
  10. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/PKG-INFO +1 -1
  11. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/LICENSE +0 -0
  12. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/README.md +0 -0
  13. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/setup.cfg +0 -0
  14. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/bundle.py +0 -0
  15. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/__init__.py +0 -0
  16. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/crm/crm_env.py +0 -0
  17. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/crm/workdir/reward_fn.py +0 -0
  18. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/example_id.py +0 -0
  19. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/data_utils.py +0 -0
  20. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/excel_env.py +0 -0
  21. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/__init__.py +0 -0
  22. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/excel_code_runner_mcp.py +0 -0
  23. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/excel_utils.py +0 -0
  24. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/reward_fn.py +0 -0
  25. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/logging.py +0 -0
  26. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/math/math_env.py +0 -0
  27. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/math/workdir/reward_fn.py +0 -0
  28. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/__init__.py +0 -0
  29. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/example_workdir/demo_mcp_server.py +0 -0
  30. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/example_workdir/reward_fn.py +0 -0
  31. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/parallel_mcp_env.py +0 -0
  32. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/__init__.py +0 -0
  33. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/base_provisioner.py +0 -0
  34. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/local_provisioner.py +0 -0
  35. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/manual_provisioner.py +0 -0
  36. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/skypilot_provisioner.py +0 -0
  37. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/utils.py +0 -0
  38. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/proxy_server.py +0 -0
  39. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/server_pool.py +0 -0
  40. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/utils.py +0 -0
  41. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/postgres_search/__init__.py +0 -0
  42. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/postgres_search/linker_env.py +0 -0
  43. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/postgres_search/search_env.py +0 -0
  44. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/reward_helpers.py +0 -0
  45. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/types.py +0 -0
  46. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/wikipedia/utils.py +0 -0
  47. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/envs/wikipedia/wiki_env.py +0 -0
  48. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/__init__.py +0 -0
  49. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/caller.py +0 -0
  50. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/clients.py +0 -0
  51. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/example_usage.py +0 -0
  52. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/inspector.py +0 -0
  53. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/models.py +0 -0
  54. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/pricing.py +0 -0
  55. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/__init__.py +0 -0
  56. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/credentials.py +0 -0
  57. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/exceptions.py +0 -0
  58. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/training_run.py +0 -0
  59. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/platform/validation.py +0 -0
  60. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/prompts/__init__.py +0 -0
  61. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/prompts/tools.py +0 -0
  62. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/__init__.py +0 -0
  63. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/email.py +0 -0
  64. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/inspector.py +0 -0
  65. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/markdown.py +0 -0
  66. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/models.py +0 -0
  67. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/storage.py +0 -0
  68. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/__init__.py +0 -0
  69. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/__init__.py +0 -0
  70. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/client.py +0 -0
  71. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/files.py +0 -0
  72. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/filter_mapper.py +0 -0
  73. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/search.py +0 -0
  74. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/source.py +0 -0
  75. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/__init__.py +0 -0
  76. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/files.py +0 -0
  77. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/filter_mapper.py +0 -0
  78. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/index_client.py +0 -0
  79. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/search.py +0 -0
  80. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/source.py +0 -0
  81. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/__init__.py +0 -0
  82. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/client.py +0 -0
  83. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/exceptions.py +0 -0
  84. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/filter_mapper.py +0 -0
  85. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/models.py +0 -0
  86. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/search.py +0 -0
  87. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/source.py +0 -0
  88. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_client.py +0 -0
  89. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/__init__.py +0 -0
  90. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/builders.py +0 -0
  91. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/dsl_parser.py +0 -0
  92. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/search_exceptions.py +0 -0
  93. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/search_types.py +0 -0
  94. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/source.py +0 -0
  95. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/__init__.py +0 -0
  96. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/files.py +0 -0
  97. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/filter_mapper.py +0 -0
  98. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/search.py +0 -0
  99. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/__init__.py +0 -0
  100. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/__init__.py +0 -0
  101. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/clean_bodies.py +0 -0
  102. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/dedupe.py +0 -0
  103. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/filter_automated_email_qas.py +0 -0
  104. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/filter_automated_emails.py +0 -0
  105. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/mbox.py +0 -0
  106. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/schema.py +0 -0
  107. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/__init__.py +0 -0
  108. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/anchor_selector.py +0 -0
  109. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/auto_tune.py +0 -0
  110. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/batch_processor.py +0 -0
  111. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/checkpoint.py +0 -0
  112. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/corpus_capabilities.py +0 -0
  113. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/corpus_profile.py +0 -0
  114. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/__init__.py +0 -0
  115. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/deterministic_guards.py +0 -0
  116. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/env_rollout.py +0 -0
  117. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/grounding_llm.py +0 -0
  118. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/hop_count_validity.py +0 -0
  119. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/quality_gate.py +0 -0
  120. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/retrieval_llm.py +0 -0
  121. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/formatters/__init__.py +0 -0
  122. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/formatters/train_eval.py +0 -0
  123. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/generated_qa.py +0 -0
  124. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/generators/__init__.py +0 -0
  125. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/generators/direct_llm.py +0 -0
  126. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/helpers.py +0 -0
  127. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/metadata_linker.py +0 -0
  128. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/metrics.py +0 -0
  129. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/models.py +0 -0
  130. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/pipeline_config.py +0 -0
  131. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/protocols.py +0 -0
  132. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/query_rewriter.py +0 -0
  133. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/response_parsers.py +0 -0
  134. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/retrieval_query.py +0 -0
  135. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/scoring.py +0 -0
  136. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/search_agent_linker.py +0 -0
  137. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/storage.py +0 -0
  138. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/style_controls.py +0 -0
  139. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/transformers/__init__.py +0 -0
  140. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/transformers/base.py +0 -0
  141. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/transformers/dedup.py +0 -0
  142. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/wiki_builder.py +0 -0
  143. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/wiki_chunk_linker.py +0 -0
  144. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/__init__.py +0 -0
  145. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/_utils.py +0 -0
  146. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/adaptive.py +0 -0
  147. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/cache.py +0 -0
  148. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/prompts.py +0 -0
  149. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/reward_fns.py +0 -0
  150. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/__init__.py +0 -0
  151. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/adapter.py +0 -0
  152. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/braintrust/__init__.py +0 -0
  153. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/braintrust/adapter.py +0 -0
  154. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/braintrust/message_extraction.py +0 -0
  155. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/http.py +0 -0
  156. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/pipeline.py +0 -0
  157. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/pivot.py +0 -0
  158. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/processing.py +0 -0
  159. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/traces/registry.py +0 -0
  160. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/utils/__init__.py +0 -0
  161. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax/utils/checkpoint.py +0 -0
  162. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/SOURCES.txt +0 -0
  163. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/dependency_links.txt +0 -0
  164. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/requires.txt +0 -0
  165. {benchmax-0.1.2.dev25 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmax
3
- Version: 0.1.2.dev25
3
+ Version: 0.1.2.dev27
4
4
  Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
5
5
  Author: castie@castform.com
6
6
  Classifier: Programming Language :: Python :: 3
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "benchmax"
3
- version = "0.1.2.dev25"
3
+ version = "0.1.2.dev27"
4
4
  description = "Framework-Agnostic RL Environments for LLM Fine-Tuning"
5
5
  readme = "README.md"
6
6
  authors = [{ name = "castie@castform.com" }]
@@ -38,8 +38,3 @@ def web_app_url() -> str:
38
38
  def llm_url() -> str:
39
39
  """OpenAI-compatible LLM endpoint hosted by the platform."""
40
40
  return os.environ.get("CASTFORM_LLM_URL") or f"https://llm.{base_domain()}/v1"
41
-
42
-
43
- def rollout_url() -> str:
44
- """Rollout / inference server."""
45
- return os.environ.get("CASTFORM_ROLLOUT_URL") or f"https://autobots.{base_domain()}"
@@ -88,20 +88,37 @@ class BaseEnv(ABC):
88
88
  )
89
89
 
90
90
  @classmethod
91
- def playground_preprocess(cls, prompt: str, **kwargs: Any) -> Example:
92
- """Wrap a one-shot playground prompt into an :class:`Example`.
91
+ def playground_preprocess(
92
+ cls,
93
+ prompt: str | None = None,
94
+ messages: Messages | None = None,
95
+ **kwargs: Any,
96
+ ) -> Example:
97
+ """Wrap a playground input into an :class:`Example`.
98
+
99
+ Accepts either ``prompt`` (single user string — the typical one-shot
100
+ chat case) or ``messages`` (a full chat list, used when replaying a
101
+ multi-turn eval prompt). Exactly one must be provided.
93
102
 
94
103
  Classmethod (like :meth:`dataset_preprocess`), reading the static
95
- ``cls.system_prompt`` class attribute — so a one-shot playground
96
- prompt is preprocessed without constructing an env instance, and the
97
- system prompt matches what training uses. Prepends ``cls.system_prompt``
98
- via :func:`make_example` with ``task=None`` the rollout worker skips
104
+ ``cls.system_prompt`` class attribute — so a playground input is
105
+ preprocessed without constructing an env instance, and the system
106
+ prompt matches what training uses. ``cls.system_prompt`` is prepended
107
+ unless the caller already supplied a system message (a replayed eval
108
+ prompt typically does). ``task=None`` — the rollout worker skips
99
109
  reward computation for playground examples.
100
110
  """
111
+ if messages is None:
112
+ if not prompt:
113
+ raise ValueError(
114
+ "playground_preprocess requires either 'prompt' or 'messages'"
115
+ )
116
+ messages = [{"role": "user", "content": prompt}]
117
+ has_system = any(m.get("role") == "system" for m in messages)
101
118
  return make_example(
102
- prompt_messages=[{"role": "user", "content": prompt}],
119
+ prompt_messages=messages,
103
120
  task=None,
104
- system_prompt=cls.system_prompt,
121
+ system_prompt=None if has_system else cls.system_prompt,
105
122
  )
106
123
 
107
124
  @classmethod
@@ -29,7 +29,9 @@ from .exceptions import (
29
29
  )
30
30
 
31
31
  if TYPE_CHECKING:
32
- pass
32
+ from types import ModuleType
33
+
34
+ from benchmax.envs.base_env import BaseEnv
33
35
 
34
36
 
35
37
  @dataclass(frozen=True)
@@ -279,7 +281,9 @@ class StorageClient:
279
281
  # Stream from disk instead of read_bytes() to keep memory bounded for
280
282
  # multi-GB datasets. httpx infers Content-Length from the file size.
281
283
  url_response = self._get_upload_url(
282
- path, mime_type, expires_in_minutes=expires_in_minutes,
284
+ path,
285
+ mime_type,
286
+ expires_in_minutes=expires_in_minutes,
283
287
  )
284
288
  with file_path.open("rb") as fh:
285
289
  self._put_to_signed_url(url_response["uploadUrl"], fh, mime_type)
@@ -437,7 +441,11 @@ class TrainerClient:
437
441
  specs = self.list_launch_args()
438
442
  print(_hdr("Launch args accepted by POST /train/runs/launch"))
439
443
  for spec in specs:
440
- req = _RED + "required" + _RESET if spec.required else _CYAN + "optional" + _RESET
444
+ req = (
445
+ _RED + "required" + _RESET
446
+ if spec.required
447
+ else _CYAN + "optional" + _RESET
448
+ )
441
449
  header = f" {_BOLD}{spec.name}{_RESET} ({spec.type}, {req})"
442
450
  bits: list[str] = []
443
451
  if spec.default is not None:
@@ -652,7 +660,9 @@ def _print_event(
652
660
  tool_text,
653
661
  )
654
662
  else:
655
- preview = textwrap.shorten(tool_text, width=120, placeholder="…")
663
+ preview = textwrap.shorten(
664
+ tool_text, width=120, placeholder="…"
665
+ )
656
666
  print(
657
667
  f"{prefix} → message [{role}/tool_result] "
658
668
  f"(chars={len(tool_text)}): {preview}"
@@ -690,7 +700,13 @@ def _print_event(
690
700
 
691
701
 
692
702
  class RolloutClient:
693
- """Thin synchronous client for the /rollout/stream endpoint.
703
+ """Thin synchronous client for the rollout-stream endpoint.
704
+
705
+ Rollouts are reached through platform-service. platform-service is the API-key
706
+ gate: it validates the ``sk_`` key and mints a short-lived act_as JWT that
707
+ rollout-service accepts (rollout-service's own auth only takes
708
+ auth-service-minted JWTs — never a raw platform key). The proxy is mounted at
709
+ ``/v1/rollout/stream``.
694
710
 
695
711
  Supports two ways to provide the environment:
696
712
 
@@ -700,8 +716,11 @@ class RolloutClient:
700
716
  raw file contents; they will be base64-encoded and sent inline.
701
717
 
702
718
  Args:
703
- api_key: Bearer token for the rollout server.
704
- server_url: Base URL of the rollout server.
719
+ api_key: Platform API key (``sk_``); forwarded as the Bearer token
720
+ platform-service validates.
721
+ server_url: Base URL of platform-service. Defaults to
722
+ ``config.platform_url()``; the ``/v1/rollout/stream`` path is
723
+ appended per request.
705
724
  timeout: Per-request timeout in seconds (default 300 — rollouts can be slow).
706
725
  """
707
726
 
@@ -716,7 +735,9 @@ class RolloutClient:
716
735
  self._api_key = api_key
717
736
  # Resolve at construction time, not import time, so env-var changes
718
737
  # take effect (mirrors StorageClient/TrainerClient default_factory pattern).
719
- self._server_url = (server_url or config.rollout_url()).rstrip("/")
738
+ # Target platform-service (the API-key gate), not the rollout-service
739
+ # host directly — see the class docstring for why.
740
+ self._server_url = (server_url or config.platform_url()).rstrip("/")
720
741
  self._timeout = timeout
721
742
 
722
743
  @staticmethod
@@ -734,7 +755,9 @@ class RolloutClient:
734
755
  has_bytes = env_cls_bytes is not None and env_metadata_bytes is not None
735
756
 
736
757
  if has_paths and has_bytes:
737
- raise ValueError("Provide either blob paths or raw bytes for the env, not both.")
758
+ raise ValueError(
759
+ "Provide either blob paths or raw bytes for the env, not both."
760
+ )
738
761
  if not has_paths and not has_bytes:
739
762
  raise ValueError(
740
763
  "Provide either (env_cls_path, env_metadata_path) or "
@@ -844,7 +867,9 @@ class RolloutClient:
844
867
  },
845
868
  }
846
869
 
847
- url = f"{self._server_url}/rollout/stream"
870
+ # platform-service mounts the proxy at /v1/rollout/stream; it validates
871
+ # the platform key and forwards to rollout-service with an act_as JWT.
872
+ url = f"{self._server_url}/v1/rollout/stream"
848
873
  headers = {"Authorization": f"Bearer {self._api_key}"}
849
874
 
850
875
  with httpx.stream(
@@ -858,7 +883,10 @@ class RolloutClient:
858
883
  body = response.read().decode()
859
884
  # Typed errors so callers can distinguish retryable from
860
885
  # caller-fix from auth-fix without parsing exception messages.
861
- if response.status_code == 401:
886
+ # 403 too: rollouts route through platform-service's optionalAuth
887
+ # gate, which rejects a missing/invalid/expired key as 403
888
+ # ("sign in to run rollouts") rather than 401 — same fix (the key).
889
+ if response.status_code in (401, 403):
862
890
  raise AuthenticationError(body[:300], response.status_code)
863
891
  if response.status_code == 404:
864
892
  raise RolloutNotFound(body[:300], response.status_code)
@@ -922,6 +950,10 @@ class RolloutClient:
922
950
  env_metadata_path: str | None = None,
923
951
  n: int = 2,
924
952
  *,
953
+ env_class: type[BaseEnv] | None = None,
954
+ constructor_args: dict[str, Any] | None = None,
955
+ pip_dependencies: list[str] | None = None,
956
+ local_modules: list[ModuleType] | None = None,
925
957
  env_cls_bytes: bytes | None = None,
926
958
  env_metadata_bytes: bytes | None = None,
927
959
  llm_model: str = _VALIDATION_MODEL,
@@ -930,14 +962,22 @@ class RolloutClient:
930
962
  ) -> ValidationResult:
931
963
  """Run rollouts on the first *n* examples and report pass/fail.
932
964
 
933
- The environment can be specified via **blob paths** or **raw bytes**
934
- (mutually exclusive see class docstring).
965
+ The environment can be specified three ways (mutually exclusive): an
966
+ **env class** (bundled to bytes here, so validation needs no prior
967
+ upload — preferred for a pre-launch smoke test), **blob paths** to an
968
+ already-uploaded env, or **raw bytes** (see class docstring).
935
969
 
936
970
  Args:
937
971
  examples: Full dataset (list of raw dicts).
938
972
  env_cls_path: Blob path to the uploaded env .pkl file.
939
973
  env_metadata_path: Blob path to the uploaded env-meta .json file.
940
974
  n: Number of examples to validate (default 2).
975
+ env_class: BaseEnv subclass to bundle and validate without
976
+ uploading. Mutually exclusive with paths/bytes.
977
+ constructor_args: kwargs baked into the env bundle (env_class only).
978
+ pip_dependencies: Pip deps recorded in the bundle (env_class only).
979
+ local_modules: Modules to pickle by-value (env_class only; for
980
+ envs that import from local .py files).
941
981
  env_cls_bytes: Raw bytes of the pickled env class (will be base64-encoded).
942
982
  env_metadata_bytes: Raw bytes of the env metadata JSON (will be base64-encoded).
943
983
  verbose: Print colored progress to stdout (default True for
@@ -949,12 +989,39 @@ class RolloutClient:
949
989
  "did everything pass" check, with per-example detail in
950
990
  ``result.examples`` for richer reporting.
951
991
  """
992
+ # An env class is bundled to bytes here so validation can run a smoke
993
+ # test BEFORE uploading anything (the launch flow uploads only after
994
+ # validation passes). Mutually exclusive with explicit paths/bytes.
995
+ if env_class is not None:
996
+ if any(
997
+ (env_cls_path, env_metadata_path, env_cls_bytes, env_metadata_bytes)
998
+ ):
999
+ raise ValueError(
1000
+ "Provide env_class OR explicit env paths/bytes, not both."
1001
+ )
1002
+ from benchmax.bundle import dump_bundle
1003
+
1004
+ bundle = dump_bundle(
1005
+ env_class,
1006
+ constructor_args=constructor_args,
1007
+ pip_dependencies=pip_dependencies,
1008
+ local_modules=local_modules,
1009
+ )
1010
+ env_cls_bytes = bundle.pickled
1011
+ env_metadata_bytes = bundle.metadata.to_json_bytes()
1012
+
952
1013
  # Validate env args early so we fail before running any rollouts.
953
- self._build_env(env_cls_path, env_metadata_path, env_cls_bytes, env_metadata_bytes)
1014
+ self._build_env(
1015
+ env_cls_path, env_metadata_path, env_cls_bytes, env_metadata_bytes
1016
+ )
954
1017
 
955
1018
  sample = examples[:n]
956
1019
  if verbose:
957
- print(_hdr(f"── Remote validation: {len(sample)} example(s) on {llm_model} ──"))
1020
+ print(
1021
+ _hdr(
1022
+ f"── Remote validation: {len(sample)} example(s) on {llm_model} ──"
1023
+ )
1024
+ )
958
1025
 
959
1026
  per_example: list[ExampleValidation] = []
960
1027
  for i, example in enumerate(sample):
@@ -972,10 +1039,15 @@ class RolloutClient:
972
1039
  max_turns=max_turns,
973
1040
  )
974
1041
  ok = bool(final.get("success"))
975
- per_example.append(ExampleValidation(
976
- index=i, ok=ok,
977
- error=None if ok else (final.get("error") or "rollout reported success=False"),
978
- ))
1042
+ per_example.append(
1043
+ ExampleValidation(
1044
+ index=i,
1045
+ ok=ok,
1046
+ error=None
1047
+ if ok
1048
+ else (final.get("error") or "rollout reported success=False"),
1049
+ )
1050
+ )
979
1051
  except (RolloutError, RuntimeError) as exc:
980
1052
  if verbose:
981
1053
  print(_err(f" Example {i} failed: {exc}"))
@@ -987,6 +1059,10 @@ class RolloutClient:
987
1059
  if result.ok:
988
1060
  print(_ok("Remote validation passed"))
989
1061
  else:
990
- print(_err("Remote validation failed — check output above before launching a full job"))
1062
+ print(
1063
+ _err(
1064
+ "Remote validation failed — check output above before launching a full job"
1065
+ )
1066
+ )
991
1067
 
992
1068
  return result
@@ -218,6 +218,29 @@ class TpufNamespace:
218
218
 
219
219
  return len(all_chunks)
220
220
 
221
+ # ------------------------------------------------------------------
222
+ # Namespace metadata
223
+ # ------------------------------------------------------------------
224
+
225
+ def get_approx_row_count(self) -> int | None:
226
+ """Return the approximate row count from namespace metadata.
227
+
228
+ Uses the tpuf metadata endpoint which returns ``approx_row_count``.
229
+ Unlike ``get_max_id()``, this reflects actual rows (accounting for
230
+ deletions) rather than the highest assigned ID.
231
+ """
232
+ try:
233
+ meta = self._ns.metadata()
234
+ count = getattr(meta, "approx_row_count", None)
235
+ if isinstance(count, int):
236
+ return count
237
+ # Fallback: some SDK versions return a dict
238
+ if isinstance(meta, dict):
239
+ return meta.get("approx_row_count")
240
+ return None
241
+ except Exception:
242
+ return None
243
+
221
244
  # ------------------------------------------------------------------
222
245
  # ID pagination
223
246
  # ------------------------------------------------------------------
@@ -237,6 +260,35 @@ class TpufNamespace:
237
260
  return None
238
261
  return rows[0].id
239
262
 
263
+ def scan_all_rows(self, limit: int | None = None, page_size: int = 10_000) -> list[Any]:
264
+ """Sequentially scan all rows with attributes via cursor pagination.
265
+
266
+ Much faster than random-ID sampling for large fetches — single pass,
267
+ no retries, no ID collisions. Returns up to ``limit`` rows (all if
268
+ None).
269
+ """
270
+ all_rows: list[Any] = []
271
+ last_id = 0
272
+
273
+ while True:
274
+ result = self._ns.query(
275
+ rank_by=["id", "asc"],
276
+ filters=["id", "Gt", last_id],
277
+ top_k=page_size,
278
+ include_attributes=True,
279
+ )
280
+ rows = result.rows
281
+ if not rows:
282
+ break
283
+ all_rows.extend(rows)
284
+ last_id = rows[-1].id
285
+ if limit is not None and len(all_rows) >= limit:
286
+ return all_rows[:limit]
287
+ if len(rows) < page_size:
288
+ break
289
+
290
+ return all_rows
291
+
240
292
  def paginate_all_ids(self, page_size: int = 1000) -> list[int]:
241
293
  """Return all row IDs in the namespace via cursor pagination."""
242
294
  all_ids: list[int] = []
@@ -198,9 +198,37 @@ class TpufChunkSource:
198
198
  # ------------------------------------------------------------------
199
199
 
200
200
  def get_chunk_count(self) -> int:
201
- """Return the total number of chunks in the namespace."""
201
+ """Return the total number of chunks in the namespace.
202
+
203
+ Prefers ``approx_row_count`` from the metadata endpoint (reflects
204
+ actual rows after deletions). Falls back to ``get_max_id()`` which
205
+ can over-count in sparse namespaces.
206
+ """
207
+ approx = self._client.get_approx_row_count()
208
+ if approx is not None:
209
+ return approx
202
210
  return self._client.get_max_id() or 0
203
211
 
212
def scan_chunks(self, limit: int | None = None, min_chars: int = 0) -> list[Chunk]:
    """Sequentially scan chunks via cursor pagination.

    Much faster than ``sample_chunks`` for large fetches (single pass, no
    retries). Returns chunks in ID order, not random. Use this when you
    need most or all of the namespace (e.g. materialization).

    Args:
        limit: Maximum number of chunks to return; ``None`` scans everything.
            A limit of zero or less returns an empty list.
        min_chars: Chunks whose content is shorter than this are skipped.
            Values of zero or less disable the filter.

    Returns:
        Up to ``limit`` chunks. With ``min_chars > 0`` the result can be
        shorter than ``limit`` even if more qualifying chunks exist,
        because only ``3 * limit`` rows are scanned.
    """
    if limit is not None and limit <= 0:
        return []

    filtering = min_chars > 0
    if limit is None:
        fetch_limit = None
    elif filtering:
        # Over-fetch to leave headroom for rows dropped by the length filter.
        fetch_limit = limit * 3
    else:
        # No row is ever skipped without a filter, so extra rows would
        # only be fetched and thrown away.
        fetch_limit = limit

    rows = self._client.scan_all_rows(limit=fetch_limit)
    collected: list[Chunk] = []
    for row in rows:
        chunk = self._client.row_to_chunk(row)
        if filtering and len(chunk.content) < min_chars:
            continue
        collected.append(chunk)
        if limit is not None and len(collected) >= limit:
            break
    return collected
231
+
204
232
  def sample_chunks(self, n: int, min_chars: int = 0) -> list[Chunk]:
205
233
  """Return n randomly sampled chunks, optionally filtered by minimum length.
206
234
 
@@ -357,8 +385,11 @@ class TpufChunkSource:
357
385
  return []
358
386
 
359
387
  # Skip expensive full-namespace pagination for large namespaces.
360
- # Use actual row count (not max_id) to handle sparse ID spaces where
361
- # max_id >> row_count due to deletions or non-sequential assignment.
388
+ # Use approx_row_count (actual rows) rather than paginating all IDs
389
+ # just to count them — that's O(N) API calls for large namespaces.
390
+ chunk_count = self.get_chunk_count()
391
+ if chunk_count > 50_000:
392
+ return []
362
393
  all_ids = self._client.paginate_all_ids()
363
394
  if len(all_ids) > 50_000:
364
395
  return []
@@ -1332,30 +1332,42 @@ class Pipeline:
1332
1332
  # resolve any chunk by hash.
1333
1333
  max_materialize = 50_000
1334
1334
  if getattr(source, "collection", None) is None and chunk_count > 0:
1335
- if chunk_count <= max_materialize:
1336
- from benchmax.rag.chunkers.models import ChunkCollection # noqa: PLC0415
1335
+ from benchmax.rag.chunkers.models import ChunkCollection # noqa: PLC0415
1337
1336
 
1337
+ materialize_count = min(chunk_count, max_materialize)
1338
+ if chunk_count > max_materialize:
1339
+ logger.warning(
1340
+ "Corpus has %d chunks (limit %d). Materialising a capped "
1341
+ "sample so entity extraction and the chunk graph still work.",
1342
+ chunk_count,
1343
+ max_materialize,
1344
+ )
1345
+ else:
1338
1346
  logger.info(
1339
1347
  "Materialising %d chunks from API backend into memory...",
1340
1348
  chunk_count,
1341
1349
  )
1342
- all_chunks = source.sample_chunks(
1343
- chunk_count,
1350
+
1351
+ # Use sequential scan when available — cursor pagination avoids
1352
+ # the ID-collision overhead of random sampling at high fill rates.
1353
+ # ~1.9x faster for 50k chunks from a 65k namespace.
1354
+ if hasattr(source, "scan_chunks"):
1355
+ all_chunks = source.scan_chunks(
1356
+ limit=materialize_count,
1344
1357
  min_chars=cfg.corpus.min_chunk_chars,
1345
1358
  )
1346
- if all_chunks:
1347
- source.collection = ChunkCollection(chunks=all_chunks) # type: ignore[attr-defined]
1348
- logger.info(
1349
- "Cached %d/%d chunks on source.collection",
1350
- len(all_chunks),
1351
- chunk_count,
1352
- )
1353
1359
  else:
1354
- logger.warning(
1355
- "Corpus too large to materialise (%d chunks > %d cap); "
1356
- "entity-chunk graph will use profile sample only.",
1360
+ all_chunks = source.sample_chunks(
1361
+ materialize_count,
1362
+ min_chars=cfg.corpus.min_chunk_chars,
1363
+ )
1364
+
1365
+ if all_chunks:
1366
+ source.collection = ChunkCollection(chunks=all_chunks) # type: ignore[attr-defined]
1367
+ logger.info(
1368
+ "Cached %d/%d chunks on source.collection",
1369
+ len(all_chunks),
1357
1370
  chunk_count,
1358
- max_materialize,
1359
1371
  )
1360
1372
 
1361
1373
  profile_sample = diverse_profile_sample(
@@ -1,9 +1,12 @@
1
+ import logging
1
2
  import os
2
3
  from dataclasses import dataclass
3
4
  from typing import Any, Dict, List, Literal, Optional
4
5
 
5
6
  from openai import AsyncOpenAI
6
7
 
8
+ logger = logging.getLogger(__name__)
9
+
7
10
  from benchmax.platform.credentials import platform_bearer
8
11
 
9
12
  from ._utils import _extract_json
@@ -76,6 +79,7 @@ async def evaluate_single_rubric(
76
79
  ground_truth: Optional[str] = None,
77
80
  api_key: str = "",
78
81
  timeout: Optional[float] = None,
82
+ enable_logging: bool = True,
79
83
  ) -> Dict[str, Any]:
80
84
  """
81
85
  Evaluate a single response against a single rubric.
@@ -146,11 +150,26 @@ async def evaluate_single_rubric(
146
150
  return {"score": 0, "reasoning": "Empty response", "llm_output": ""}
147
151
 
148
152
  result = _extract_json(content)
149
- return {
153
+ out = {
150
154
  "score": result.get("score", 0),
151
155
  "reasoning": result.get("reasoning", ""),
152
156
  "llm_output": content,
153
157
  }
158
+ if enable_logging:
159
+ logger.info(
160
+ "\n┌─ rubric: %s ─────────────────────\n"
161
+ "│ ground_truth : %s\n"
162
+ "│ score : %s\n"
163
+ "│ reasoning : %s\n"
164
+ "│ llm_output :\n%s\n"
165
+ "└──────────────────────────────────────────────────",
166
+ rubric.title,
167
+ (ground_truth or "").strip() or "(none)",
168
+ out["score"],
169
+ out["reasoning"],
170
+ content,
171
+ )
172
+ return out
154
173
 
155
174
  except Exception as e:
156
175
  print(f"Error evaluating rubric '{rubric.title}': {e}\njudge output:\n{content}")
@@ -166,6 +185,7 @@ async def evaluate_rubric_ranking(
166
185
  api_key: str = "",
167
186
  timeout: Optional[float] = None,
168
187
  ground_truth: Optional[str] = None,
188
+ enable_logging: bool = True,
169
189
  ) -> Dict[str, Any]:
170
190
  """
171
191
  Rank N responses against a single rubric in one judge call and convert the
@@ -276,12 +296,34 @@ async def evaluate_rubric_ranking(
276
296
  for j, p in pos_of.items():
277
297
  scores[nonempty[j][0]] = 1.0 - p / max_pos if max_pos > 0 else 1.0
278
298
 
279
- return {
299
+ out = {
280
300
  "scores": scores,
281
301
  "ranking": ranking,
282
302
  "reasoning": result.get("reasoning", ""),
283
303
  "llm_output": content,
284
304
  }
305
+ if enable_logging:
306
+ scores_fmt = " ".join(f"[{i}]={s:.3f}" for i, s in enumerate(scores))
307
+ ranking_fmt = " > ".join(
308
+ f"[{', '.join(str(j) for j in tier)}]" if isinstance(tier, list) else str(tier)
309
+ for tier in ranking
310
+ )
311
+ logger.info(
312
+ "\n┌─ ranked rubric: %s ────────────────────\n"
313
+ "│ ground_truth : %s\n"
314
+ "│ ranking : %s\n"
315
+ "│ scores : %s\n"
316
+ "│ reasoning : %s\n"
317
+ "│ llm_output :\n%s\n"
318
+ "└──────────────────────────────────────────────────",
319
+ rubric.title,
320
+ (ground_truth or "").strip() or "(none)",
321
+ ranking_fmt or "(empty)",
322
+ scores_fmt,
323
+ out["reasoning"],
324
+ content,
325
+ )
326
+ return out
285
327
  except Exception as e:
286
328
  print(f"Error ranking rubric '{rubric.title}': {e}\njudge output:\n{content}")
287
329
  return {"scores": scores, "ranking": [], "reasoning": f"Error: {e}", "llm_output": content}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmax
3
- Version: 0.1.2.dev25
3
+ Version: 0.1.2.dev27
4
4
  Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
5
5
  Author: castie@castform.com
6
6
  Classifier: Programming Language :: Python :: 3
File without changes
File without changes
File without changes