benchmax 0.1.2.dev26__tar.gz → 0.1.2.dev27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/PKG-INFO +1 -1
  2. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/pyproject.toml +1 -1
  3. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/rubric.py +44 -2
  4. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/PKG-INFO +1 -1
  5. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/LICENSE +0 -0
  6. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/README.md +0 -0
  7. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/setup.cfg +0 -0
  8. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/bundle.py +0 -0
  9. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/config.py +0 -0
  10. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/__init__.py +0 -0
  11. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/base_env.py +0 -0
  12. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/crm/crm_env.py +0 -0
  13. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/crm/workdir/reward_fn.py +0 -0
  14. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/example_id.py +0 -0
  15. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/data_utils.py +0 -0
  16. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/excel_env.py +0 -0
  17. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/__init__.py +0 -0
  18. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/excel_code_runner_mcp.py +0 -0
  19. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/excel_utils.py +0 -0
  20. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/excel/workdir/reward_fn.py +0 -0
  21. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/logging.py +0 -0
  22. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/math/math_env.py +0 -0
  23. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/math/workdir/reward_fn.py +0 -0
  24. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/__init__.py +0 -0
  25. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/example_workdir/demo_mcp_server.py +0 -0
  26. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/example_workdir/reward_fn.py +0 -0
  27. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/parallel_mcp_env.py +0 -0
  28. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/__init__.py +0 -0
  29. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/base_provisioner.py +0 -0
  30. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/local_provisioner.py +0 -0
  31. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/manual_provisioner.py +0 -0
  32. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/skypilot_provisioner.py +0 -0
  33. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/provisioners/utils.py +0 -0
  34. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/proxy_server.py +0 -0
  35. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/server_pool.py +0 -0
  36. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/mcp/utils.py +0 -0
  37. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/postgres_search/__init__.py +0 -0
  38. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/postgres_search/linker_env.py +0 -0
  39. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/postgres_search/search_env.py +0 -0
  40. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/reward_helpers.py +0 -0
  41. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/types.py +0 -0
  42. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/wikipedia/utils.py +0 -0
  43. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/envs/wikipedia/wiki_env.py +0 -0
  44. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/__init__.py +0 -0
  45. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/caller.py +0 -0
  46. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/clients.py +0 -0
  47. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/example_usage.py +0 -0
  48. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/inspector.py +0 -0
  49. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/models.py +0 -0
  50. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/multi_model/pricing.py +0 -0
  51. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/platform/__init__.py +0 -0
  52. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/platform/client.py +0 -0
  53. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/platform/credentials.py +0 -0
  54. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/platform/exceptions.py +0 -0
  55. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/platform/training_run.py +0 -0
  56. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/platform/validation.py +0 -0
  57. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/prompts/__init__.py +0 -0
  58. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/prompts/tools.py +0 -0
  59. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/__init__.py +0 -0
  60. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/email.py +0 -0
  61. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/inspector.py +0 -0
  62. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/markdown.py +0 -0
  63. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/models.py +0 -0
  64. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/chunkers/storage.py +0 -0
  65. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/__init__.py +0 -0
  66. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/__init__.py +0 -0
  67. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/client.py +0 -0
  68. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/files.py +0 -0
  69. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/filter_mapper.py +0 -0
  70. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/search.py +0 -0
  71. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/chroma/source.py +0 -0
  72. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/__init__.py +0 -0
  73. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/files.py +0 -0
  74. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/filter_mapper.py +0 -0
  75. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/index_client.py +0 -0
  76. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/search.py +0 -0
  77. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/pinecone/source.py +0 -0
  78. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/__init__.py +0 -0
  79. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/client.py +0 -0
  80. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/exceptions.py +0 -0
  81. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/filter_mapper.py +0 -0
  82. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/models.py +0 -0
  83. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/search.py +0 -0
  84. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/postgres/source.py +0 -0
  85. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_client.py +0 -0
  86. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/__init__.py +0 -0
  87. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/builders.py +0 -0
  88. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/dsl_parser.py +0 -0
  89. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/search_exceptions.py +0 -0
  90. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/search_schema/search_types.py +0 -0
  91. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/source.py +0 -0
  92. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/__init__.py +0 -0
  93. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/files.py +0 -0
  94. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/filter_mapper.py +0 -0
  95. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/namespace.py +0 -0
  96. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/search.py +0 -0
  97. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/corpus/turbopuffer/source.py +0 -0
  98. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/__init__.py +0 -0
  99. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/__init__.py +0 -0
  100. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/clean_bodies.py +0 -0
  101. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/dedupe.py +0 -0
  102. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/filter_automated_email_qas.py +0 -0
  103. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/filter_automated_emails.py +0 -0
  104. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/mbox.py +0 -0
  105. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/preprocess/email/schema.py +0 -0
  106. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/__init__.py +0 -0
  107. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/anchor_selector.py +0 -0
  108. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/auto_tune.py +0 -0
  109. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/batch_processor.py +0 -0
  110. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/checkpoint.py +0 -0
  111. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/corpus_capabilities.py +0 -0
  112. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/corpus_profile.py +0 -0
  113. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/__init__.py +0 -0
  114. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/deterministic_guards.py +0 -0
  115. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/env_rollout.py +0 -0
  116. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/grounding_llm.py +0 -0
  117. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/hop_count_validity.py +0 -0
  118. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/quality_gate.py +0 -0
  119. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/filters/retrieval_llm.py +0 -0
  120. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/formatters/__init__.py +0 -0
  121. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/formatters/train_eval.py +0 -0
  122. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/generated_qa.py +0 -0
  123. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/generators/__init__.py +0 -0
  124. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/generators/direct_llm.py +0 -0
  125. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/helpers.py +0 -0
  126. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/metadata_linker.py +0 -0
  127. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/metrics.py +0 -0
  128. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/models.py +0 -0
  129. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/pipeline.py +0 -0
  130. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/pipeline_config.py +0 -0
  131. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/protocols.py +0 -0
  132. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/query_rewriter.py +0 -0
  133. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/response_parsers.py +0 -0
  134. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/retrieval_query.py +0 -0
  135. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/scoring.py +0 -0
  136. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/search_agent_linker.py +0 -0
  137. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/storage.py +0 -0
  138. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/style_controls.py +0 -0
  139. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/transformers/__init__.py +0 -0
  140. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/transformers/base.py +0 -0
  141. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/transformers/dedup.py +0 -0
  142. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/wiki_builder.py +0 -0
  143. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rag/qa_generation/wiki_chunk_linker.py +0 -0
  144. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/__init__.py +0 -0
  145. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/_utils.py +0 -0
  146. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/adaptive.py +0 -0
  147. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/cache.py +0 -0
  148. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/prompts.py +0 -0
  149. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/rubrics/reward_fns.py +0 -0
  150. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/traces/__init__.py +0 -0
  151. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/traces/adapter.py +0 -0
  152. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/traces/braintrust/__init__.py +0 -0
  153. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/traces/braintrust/adapter.py +0 -0
  154. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/traces/braintrust/message_extraction.py +0 -0
  155. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/traces/http.py +0 -0
  156. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/traces/pipeline.py +0 -0
  157. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/traces/pivot.py +0 -0
  158. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/traces/processing.py +0 -0
  159. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/traces/registry.py +0 -0
  160. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/utils/__init__.py +0 -0
  161. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax/utils/checkpoint.py +0 -0
  162. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/SOURCES.txt +0 -0
  163. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/dependency_links.txt +0 -0
  164. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/requires.txt +0 -0
  165. {benchmax-0.1.2.dev26 → benchmax-0.1.2.dev27}/src/benchmax.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmax
3
- Version: 0.1.2.dev26
3
+ Version: 0.1.2.dev27
4
4
  Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
5
5
  Author: castie@castform.com
6
6
  Classifier: Programming Language :: Python :: 3
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "benchmax"
3
- version = "0.1.2.dev26"
3
+ version = "0.1.2.dev27"
4
4
  description = "Framework-Agnostic RL Environments for LLM Fine-Tuning"
5
5
  readme = "README.md"
6
6
  authors = [{ name = "castie@castform.com" }]
@@ -1,9 +1,12 @@
1
+ import logging
1
2
  import os
2
3
  from dataclasses import dataclass
3
4
  from typing import Any, Dict, List, Literal, Optional
4
5
 
5
6
  from openai import AsyncOpenAI
6
7
 
8
+ logger = logging.getLogger(__name__)
9
+
7
10
  from benchmax.platform.credentials import platform_bearer
8
11
 
9
12
  from ._utils import _extract_json
@@ -76,6 +79,7 @@ async def evaluate_single_rubric(
76
79
  ground_truth: Optional[str] = None,
77
80
  api_key: str = "",
78
81
  timeout: Optional[float] = None,
82
+ enable_logging: bool = True,
79
83
  ) -> Dict[str, Any]:
80
84
  """
81
85
  Evaluate a single response against a single rubric.
@@ -146,11 +150,26 @@ async def evaluate_single_rubric(
146
150
  return {"score": 0, "reasoning": "Empty response", "llm_output": ""}
147
151
 
148
152
  result = _extract_json(content)
149
- return {
153
+ out = {
150
154
  "score": result.get("score", 0),
151
155
  "reasoning": result.get("reasoning", ""),
152
156
  "llm_output": content,
153
157
  }
158
+ if enable_logging:
159
+ logger.info(
160
+ "\n┌─ rubric: %s ─────────────────────\n"
161
+ "│ ground_truth : %s\n"
162
+ "│ score : %s\n"
163
+ "│ reasoning : %s\n"
164
+ "│ llm_output :\n%s\n"
165
+ "└──────────────────────────────────────────────────",
166
+ rubric.title,
167
+ (ground_truth or "").strip() or "(none)",
168
+ out["score"],
169
+ out["reasoning"],
170
+ content,
171
+ )
172
+ return out
154
173
 
155
174
  except Exception as e:
156
175
  print(f"Error evaluating rubric '{rubric.title}': {e}\njudge output:\n{content}")
@@ -166,6 +185,7 @@ async def evaluate_rubric_ranking(
166
185
  api_key: str = "",
167
186
  timeout: Optional[float] = None,
168
187
  ground_truth: Optional[str] = None,
188
+ enable_logging: bool = True,
169
189
  ) -> Dict[str, Any]:
170
190
  """
171
191
  Rank N responses against a single rubric in one judge call and convert the
@@ -276,12 +296,34 @@ async def evaluate_rubric_ranking(
276
296
  for j, p in pos_of.items():
277
297
  scores[nonempty[j][0]] = 1.0 - p / max_pos if max_pos > 0 else 1.0
278
298
 
279
- return {
299
+ out = {
280
300
  "scores": scores,
281
301
  "ranking": ranking,
282
302
  "reasoning": result.get("reasoning", ""),
283
303
  "llm_output": content,
284
304
  }
305
+ if enable_logging:
306
+ scores_fmt = " ".join(f"[{i}]={s:.3f}" for i, s in enumerate(scores))
307
+ ranking_fmt = " > ".join(
308
+ f"[{', '.join(str(j) for j in tier)}]" if isinstance(tier, list) else str(tier)
309
+ for tier in ranking
310
+ )
311
+ logger.info(
312
+ "\n┌─ ranked rubric: %s ────────────────────\n"
313
+ "│ ground_truth : %s\n"
314
+ "│ ranking : %s\n"
315
+ "│ scores : %s\n"
316
+ "│ reasoning : %s\n"
317
+ "│ llm_output :\n%s\n"
318
+ "└──────────────────────────────────────────────────",
319
+ rubric.title,
320
+ (ground_truth or "").strip() or "(none)",
321
+ ranking_fmt or "(empty)",
322
+ scores_fmt,
323
+ out["reasoning"],
324
+ content,
325
+ )
326
+ return out
285
327
  except Exception as e:
286
328
  print(f"Error ranking rubric '{rubric.title}': {e}\njudge output:\n{content}")
287
329
  return {"scores": scores, "ranking": [], "reasoning": f"Error: {e}", "llm_output": content}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmax
3
- Version: 0.1.2.dev26
3
+ Version: 0.1.2.dev27
4
4
  Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
5
5
  Author: castie@castform.com
6
6
  Classifier: Programming Language :: Python :: 3
File without changes
File without changes
File without changes