benchmax 0.1.2.dev23__tar.gz → 0.1.2.dev26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/PKG-INFO +68 -72
  2. benchmax-0.1.2.dev26/README.md +138 -0
  3. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/pyproject.toml +25 -22
  4. benchmax-0.1.2.dev26/src/benchmax/bundle.py +303 -0
  5. benchmax-0.1.2.dev26/src/benchmax/config.py +40 -0
  6. benchmax-0.1.2.dev26/src/benchmax/envs/base_env.py +229 -0
  7. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/crm/crm_env.py +13 -16
  8. benchmax-0.1.2.dev26/src/benchmax/envs/example_id.py +147 -0
  9. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/excel/excel_env.py +42 -40
  10. benchmax-0.1.2.dev26/src/benchmax/envs/logging.py +122 -0
  11. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/math/math_env.py +7 -6
  12. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/parallel_mcp_env.py +21 -7
  13. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/provisioners/utils.py +2 -11
  14. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/proxy_server.py +19 -11
  15. benchmax-0.1.2.dev26/src/benchmax/envs/postgres_search/linker_env.py +237 -0
  16. benchmax-0.1.2.dev26/src/benchmax/envs/postgres_search/search_env.py +541 -0
  17. benchmax-0.1.2.dev26/src/benchmax/envs/reward_helpers.py +217 -0
  18. benchmax-0.1.2.dev26/src/benchmax/envs/types.py +39 -0
  19. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/wikipedia/wiki_env.py +18 -21
  20. benchmax-0.1.2.dev26/src/benchmax/multi_model/caller.py +504 -0
  21. benchmax-0.1.2.dev26/src/benchmax/multi_model/clients.py +144 -0
  22. benchmax-0.1.2.dev26/src/benchmax/multi_model/example_usage.py +325 -0
  23. benchmax-0.1.2.dev26/src/benchmax/multi_model/inspector.py +140 -0
  24. benchmax-0.1.2.dev26/src/benchmax/multi_model/models.py +108 -0
  25. benchmax-0.1.2.dev26/src/benchmax/multi_model/pricing.py +67 -0
  26. benchmax-0.1.2.dev26/src/benchmax/platform/__init__.py +12 -0
  27. benchmax-0.1.2.dev26/src/benchmax/platform/client.py +1068 -0
  28. benchmax-0.1.2.dev26/src/benchmax/platform/credentials.py +125 -0
  29. benchmax-0.1.2.dev26/src/benchmax/platform/exceptions.py +46 -0
  30. benchmax-0.1.2.dev26/src/benchmax/platform/training_run.py +148 -0
  31. benchmax-0.1.2.dev26/src/benchmax/platform/validation.py +494 -0
  32. benchmax-0.1.2.dev26/src/benchmax/rag/chunkers/__init__.py +0 -0
  33. benchmax-0.1.2.dev26/src/benchmax/rag/chunkers/email.py +793 -0
  34. benchmax-0.1.2.dev26/src/benchmax/rag/chunkers/inspector.py +230 -0
  35. benchmax-0.1.2.dev26/src/benchmax/rag/chunkers/markdown.py +347 -0
  36. benchmax-0.1.2.dev26/src/benchmax/rag/chunkers/models.py +253 -0
  37. benchmax-0.1.2.dev26/src/benchmax/rag/chunkers/storage.py +78 -0
  38. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/__init__.py +1 -0
  39. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/chroma/__init__.py +0 -0
  40. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/chroma/client.py +409 -0
  41. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/chroma/files.py +162 -0
  42. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/chroma/filter_mapper.py +149 -0
  43. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/chroma/search.py +133 -0
  44. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/chroma/source.py +754 -0
  45. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/pinecone/__init__.py +0 -0
  46. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/pinecone/files.py +159 -0
  47. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/pinecone/filter_mapper.py +194 -0
  48. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/pinecone/index_client.py +387 -0
  49. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/pinecone/search.py +123 -0
  50. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/pinecone/source.py +533 -0
  51. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/postgres/__init__.py +0 -0
  52. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/postgres/client.py +566 -0
  53. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/postgres/exceptions.py +53 -0
  54. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/postgres/filter_mapper.py +119 -0
  55. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/postgres/models.py +63 -0
  56. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/postgres/search.py +113 -0
  57. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/postgres/source.py +402 -0
  58. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/search_client.py +62 -0
  59. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/search_schema/__init__.py +0 -0
  60. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/search_schema/builders.py +58 -0
  61. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/search_schema/dsl_parser.py +47 -0
  62. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/search_schema/search_exceptions.py +45 -0
  63. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/search_schema/search_types.py +176 -0
  64. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/source.py +127 -0
  65. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/turbopuffer/__init__.py +0 -0
  66. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/turbopuffer/files.py +175 -0
  67. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/turbopuffer/filter_mapper.py +139 -0
  68. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/turbopuffer/namespace.py +323 -0
  69. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/turbopuffer/search.py +203 -0
  70. benchmax-0.1.2.dev26/src/benchmax/rag/corpus/turbopuffer/source.py +716 -0
  71. benchmax-0.1.2.dev26/src/benchmax/rag/preprocess/__init__.py +0 -0
  72. benchmax-0.1.2.dev26/src/benchmax/rag/preprocess/email/__init__.py +0 -0
  73. benchmax-0.1.2.dev26/src/benchmax/rag/preprocess/email/clean_bodies.py +513 -0
  74. benchmax-0.1.2.dev26/src/benchmax/rag/preprocess/email/dedupe.py +799 -0
  75. benchmax-0.1.2.dev26/src/benchmax/rag/preprocess/email/filter_automated_email_qas.py +320 -0
  76. benchmax-0.1.2.dev26/src/benchmax/rag/preprocess/email/filter_automated_emails.py +560 -0
  77. benchmax-0.1.2.dev26/src/benchmax/rag/preprocess/email/mbox.py +257 -0
  78. benchmax-0.1.2.dev26/src/benchmax/rag/preprocess/email/schema.py +180 -0
  79. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/__init__.py +64 -0
  80. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/anchor_selector.py +17 -0
  81. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/auto_tune.py +255 -0
  82. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/batch_processor.py +374 -0
  83. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/checkpoint.py +294 -0
  84. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/corpus_capabilities.py +94 -0
  85. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/corpus_profile.py +1688 -0
  86. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/filters/__init__.py +21 -0
  87. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/filters/deterministic_guards.py +225 -0
  88. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/filters/env_rollout.py +280 -0
  89. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/filters/grounding_llm.py +498 -0
  90. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/filters/hop_count_validity.py +992 -0
  91. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/filters/quality_gate.py +243 -0
  92. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/filters/retrieval_llm.py +802 -0
  93. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/formatters/__init__.py +5 -0
  94. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/formatters/train_eval.py +123 -0
  95. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/generated_qa.py +125 -0
  96. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/generators/__init__.py +5 -0
  97. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/generators/direct_llm.py +685 -0
  98. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/helpers.py +133 -0
  99. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/metadata_linker.py +771 -0
  100. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/metrics.py +95 -0
  101. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/models.py +36 -0
  102. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/pipeline.py +2650 -0
  103. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/pipeline_config.py +1111 -0
  104. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/protocols.py +66 -0
  105. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/query_rewriter.py +149 -0
  106. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/response_parsers.py +63 -0
  107. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/retrieval_query.py +76 -0
  108. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/scoring.py +114 -0
  109. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/search_agent_linker.py +609 -0
  110. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/storage.py +142 -0
  111. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/style_controls.py +230 -0
  112. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/transformers/__init__.py +7 -0
  113. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/transformers/base.py +125 -0
  114. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/transformers/dedup.py +195 -0
  115. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/wiki_builder.py +539 -0
  116. benchmax-0.1.2.dev26/src/benchmax/rag/qa_generation/wiki_chunk_linker.py +303 -0
  117. benchmax-0.1.2.dev26/src/benchmax/rubrics/__init__.py +17 -0
  118. benchmax-0.1.2.dev26/src/benchmax/rubrics/_utils.py +50 -0
  119. benchmax-0.1.2.dev26/src/benchmax/rubrics/adaptive.py +135 -0
  120. benchmax-0.1.2.dev26/src/benchmax/rubrics/cache.py +178 -0
  121. benchmax-0.1.2.dev26/src/benchmax/rubrics/prompts.py +178 -0
  122. benchmax-0.1.2.dev26/src/benchmax/rubrics/reward_fns.py +349 -0
  123. benchmax-0.1.2.dev26/src/benchmax/rubrics/rubric.py +287 -0
  124. benchmax-0.1.2.dev26/src/benchmax/traces/__init__.py +3 -0
  125. benchmax-0.1.2.dev26/src/benchmax/traces/adapter.py +291 -0
  126. benchmax-0.1.2.dev26/src/benchmax/traces/braintrust/__init__.py +0 -0
  127. benchmax-0.1.2.dev26/src/benchmax/traces/braintrust/adapter.py +322 -0
  128. benchmax-0.1.2.dev26/src/benchmax/traces/braintrust/message_extraction.py +318 -0
  129. benchmax-0.1.2.dev26/src/benchmax/traces/http.py +90 -0
  130. benchmax-0.1.2.dev26/src/benchmax/traces/pipeline.py +278 -0
  131. benchmax-0.1.2.dev26/src/benchmax/traces/pivot.py +664 -0
  132. benchmax-0.1.2.dev26/src/benchmax/traces/processing.py +776 -0
  133. benchmax-0.1.2.dev26/src/benchmax/traces/registry.py +32 -0
  134. benchmax-0.1.2.dev26/src/benchmax/utils/__init__.py +14 -0
  135. benchmax-0.1.2.dev26/src/benchmax/utils/checkpoint.py +87 -0
  136. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax.egg-info/PKG-INFO +68 -72
  137. benchmax-0.1.2.dev26/src/benchmax.egg-info/SOURCES.txt +163 -0
  138. benchmax-0.1.2.dev26/src/benchmax.egg-info/requires.txt +48 -0
  139. benchmax-0.1.2.dev23/README.md +0 -168
  140. benchmax-0.1.2.dev23/src/benchmax/adapters/benchmax_wrapper.py +0 -283
  141. benchmax-0.1.2.dev23/src/benchmax/adapters/skyrl/benchmax_data_process.py +0 -199
  142. benchmax-0.1.2.dev23/src/benchmax/adapters/skyrl/skyrl_adapter.py +0 -311
  143. benchmax-0.1.2.dev23/src/benchmax/bundle/bundler.py +0 -186
  144. benchmax-0.1.2.dev23/src/benchmax/bundle/errors.py +0 -28
  145. benchmax-0.1.2.dev23/src/benchmax/bundle/loader.py +0 -220
  146. benchmax-0.1.2.dev23/src/benchmax/bundle/payload.py +0 -78
  147. benchmax-0.1.2.dev23/src/benchmax/bundle/validator.py +0 -269
  148. benchmax-0.1.2.dev23/src/benchmax/envs/base_env.py +0 -195
  149. benchmax-0.1.2.dev23/src/benchmax/envs/crm/workdir/salesforce_mcp.py +0 -1135
  150. benchmax-0.1.2.dev23/src/benchmax/envs/tracking.py +0 -134
  151. benchmax-0.1.2.dev23/src/benchmax/envs/types.py +0 -19
  152. benchmax-0.1.2.dev23/src/benchmax.egg-info/SOURCES.txt +0 -50
  153. benchmax-0.1.2.dev23/src/benchmax.egg-info/requires.txt +0 -14
  154. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/LICENSE +0 -0
  155. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/setup.cfg +0 -0
  156. {benchmax-0.1.2.dev23/src/benchmax/adapters → benchmax-0.1.2.dev26/src/benchmax/envs}/__init__.py +0 -0
  157. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/crm/workdir/reward_fn.py +0 -0
  158. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/excel/data_utils.py +0 -0
  159. {benchmax-0.1.2.dev23/src/benchmax/bundle → benchmax-0.1.2.dev26/src/benchmax/envs/excel/workdir}/__init__.py +0 -0
  160. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/excel/workdir/excel_code_runner_mcp.py +0 -0
  161. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/excel/workdir/excel_utils.py +0 -0
  162. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/excel/workdir/reward_fn.py +0 -0
  163. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/math/workdir/reward_fn.py +0 -0
  164. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/__init__.py +0 -0
  165. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/example_workdir/demo_mcp_server.py +0 -0
  166. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/example_workdir/reward_fn.py +0 -0
  167. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/provisioners/__init__.py +0 -0
  168. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/provisioners/base_provisioner.py +0 -0
  169. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/provisioners/local_provisioner.py +0 -0
  170. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/provisioners/manual_provisioner.py +0 -0
  171. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/provisioners/skypilot_provisioner.py +0 -0
  172. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/server_pool.py +0 -0
  173. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/mcp/utils.py +0 -0
  174. {benchmax-0.1.2.dev23/src/benchmax/envs → benchmax-0.1.2.dev26/src/benchmax/envs/postgres_search}/__init__.py +0 -0
  175. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/envs/wikipedia/utils.py +0 -0
  176. {benchmax-0.1.2.dev23/src/benchmax/envs/excel/workdir → benchmax-0.1.2.dev26/src/benchmax/multi_model}/__init__.py +0 -0
  177. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/prompts/__init__.py +0 -0
  178. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax/prompts/tools.py +0 -0
  179. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax.egg-info/dependency_links.txt +0 -0
  180. {benchmax-0.1.2.dev23 → benchmax-0.1.2.dev26}/src/benchmax.egg-info/top_level.txt +0 -0
@@ -1,18 +1,21 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: benchmax
3
- Version: 0.1.2.dev23
3
+ Version: 0.1.2.dev26
4
4
  Summary: Framework-Agnostic RL Environments for LLM Fine-Tuning
5
- Author: cgft.io
5
+ Author: castie@castform.com
6
6
  Classifier: Programming Language :: Python :: 3
7
7
  Classifier: Operating System :: OS Independent
8
- Requires-Python: >=3.12
8
+ Requires-Python: ==3.12.*
9
9
  Description-Content-Type: text/markdown
10
10
  License-File: LICENSE
11
11
  Requires-Dist: aiohttp>=3.13.1
12
12
  Requires-Dist: asyncio>=4.0.0
13
13
  Requires-Dist: cloudpickle>=3.0.0
14
14
  Requires-Dist: datasets>=4.0.0
15
- Requires-Dist: expt-logger>=0.1.0.dev22
15
+ Requires-Dist: httpx>=0.27.0
16
+ Requires-Dist: json-repair>=0.59.10
17
+ Requires-Dist: openai>=2.15.0
18
+ Requires-Dist: pydantic>=2.0.0
16
19
  Provides-Extra: mcp
17
20
  Requires-Dist: fastmcp~=2.12.0; extra == "mcp"
18
21
  Requires-Dist: pyjwt>=2.10.1; extra == "mcp"
@@ -20,6 +23,29 @@ Provides-Extra: skypilot
20
23
  Requires-Dist: skypilot[aws,gcp]~=0.8.1; extra == "skypilot"
21
24
  Requires-Dist: pip>=25.3; extra == "skypilot"
22
25
  Requires-Dist: msrestazure>=0.6.4.post1; extra == "skypilot"
26
+ Provides-Extra: excel
27
+ Requires-Dist: openpyxl>=3.1.5; extra == "excel"
28
+ Provides-Extra: excel-mac-windows
29
+ Requires-Dist: openpyxl>=3.1.5; extra == "excel-mac-windows"
30
+ Requires-Dist: xlwings>=0.33.16; extra == "excel-mac-windows"
31
+ Provides-Extra: crm
32
+ Requires-Dist: python-dateutil>=2.9.0.post0; extra == "crm"
33
+ Provides-Extra: rag
34
+ Requires-Dist: keybert>=0.8; extra == "rag"
35
+ Requires-Dist: langchain-text-splitters>=0.3.0; extra == "rag"
36
+ Requires-Dist: nest-asyncio>=1.5.0; extra == "rag"
37
+ Requires-Dist: ragas>=0.4.3; extra == "rag"
38
+ Requires-Dist: ruamel-yaml>=0.19.1; extra == "rag"
39
+ Requires-Dist: scikit-learn>=1.8.0; extra == "rag"
40
+ Requires-Dist: sentence-transformers>=5.2.3; extra == "rag"
41
+ Requires-Dist: tqdm>=4.66.0; extra == "rag"
42
+ Provides-Extra: traces
43
+ Provides-Extra: chroma
44
+ Requires-Dist: chromadb>=1.0.0; extra == "chroma"
45
+ Provides-Extra: pinecone
46
+ Requires-Dist: pinecone>=5.0.0; extra == "pinecone"
47
+ Provides-Extra: turbopuffer
48
+ Requires-Dist: turbopuffer>=1.16.2; extra == "turbopuffer"
23
49
  Dynamic: license-file
24
50
 
25
51
  <picture>
@@ -31,57 +57,26 @@ Dynamic: license-file
31
57
  <div align="center">
32
58
  </div>
33
59
  <div id="badges" align="center">
34
- <a href="https://cgft.io">
35
- <img src="https://img.shields.io/badge/cgft.io-blue?style=for-the-badge" alt="Website"/>
36
- </a>
37
- <a href="https://x.com/cgftlabs">
38
- <img src="https://img.shields.io/badge/Follow @cgftlabs-black?style=for-the-badge&logo=X&logoColor=white" alt="@cgftlabs"/>
60
+ <a href="https://castform.com">
61
+ <img src="https://img.shields.io/badge/castform.com-blue?style=for-the-badge" alt="Website"/>
39
62
  </a>
40
63
  </div>
41
64
  <div align="center" style="line-height: 1;">
42
- <a href="https://github.com/girishbarca/benchmax/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/License-Apache_2.0-blue.svg"/></a>
65
+ <a href="./LICENSE"><img alt="License" src="https://img.shields.io/badge/License-Apache_2.0-blue.svg"/></a>
43
66
  </div>
44
67
 
45
68
  ## 📌 News
46
69
 
47
70
  - **[29 Oct 2025]** 🎉 Added support for easy multi-node parallelization across all major cloud providers using [SkyPilot](https://github.com/skypilot-org/skypilot)
48
- - **[29 Oct 2025]** 🎉 Integration with [SkyRL](https://github.com/NovaSky-AI/SkyRL) for distributed RL training across clusters
49
- - **[Upcoming]** 🛠️ Integration with Tinker API.
50
-
51
- ## 📘 Quickstart
52
-
53
- **Example: Multi-node parallelization of Excel Env with SkyRL and SkyPilot**
54
-
55
- RL environments can be computationally expensive to run (e.g. running tests). To handle these workloads efficiently, we distribute rollouts across multiple nodes using **SkyPilot**, horizontally scaling `benchmax` across cloud providers like GCP, AWS, Azure, etc.
56
-
57
- **SkyRL** is a training framework `benchmax` is currently integrated with. Use our ***SkyRL*** integration to RL finetune Qwen-2.5 to do spreadsheet manipulation using a excel MCP parallelized across multiple nodes. The environment is defined in [`benchmax.envs.excel.excel_env.ExcelEnvSkypilot`](/src/benchmax/envs/excel/excel_env.py)
58
-
59
- 1. **Prepare the dataset**
60
-
61
- ```bash
62
- uv run src/benchmax/adapters/skyrl/benchmax_data_process.py \
63
- --local_dir ~/data/excel \
64
- --dataset_name spreadsheetbench \
65
- --env_path benchmax.envs.excel.excel_env.ExcelEnvLocal
66
- ```
67
-
68
- Note: We are using `ExcelEnvLocal` instead of `ExcelEnvSkypilot` because the MCP is only used for listing tools to prepare the system prompt.
69
-
70
- 2. **Run training and parallelize Excel environment**
71
-
72
- ```bash
73
- bash examples/skyrl/run_benchmax_excel.sh
74
- ```
75
-
76
- This excel env example will spin up 5 nodes with 20 servers per node (total 100 MCP server in parallel). For more details, check out [multi-node parallelization](/src/benchmax/envs/mcp/README.md) and [SkyRL integration](/examples/skyrl/README.md).
77
71
 
78
72
  ## ℹ️ Overview
79
73
 
80
74
  `benchmax` comes with:
81
75
 
82
76
  - A collection of ready-to-use reinforcement learning (RL) environments for LLM fine-tuning ranging from multi-hop search to spreadsheet manipulation to CRM agents
83
- - An easy to define, compose, and parallelize your own environments, including leveraging the existing ecosystem of MCP servers
84
- - Built-in integrations with popular RL training libraries (skyrl, etc.). `benchmax` is trainer-agnostic by design
77
+ - An easy way to define, compose, and parallelize your own environments, including leveraging the existing ecosystem of MCP servers
78
+ - Trainer-agnostic by design — `BaseEnv` exposes a small async interface (`list_tools`, `run_tool`, `compute_reward`, plus optional rollout lifecycle hooks) that any rollout loop can drive
79
+ - Optional batteries-included add-ons: synthetic RAG dataset generation (`benchmax[rag]`), agent trace import (`benchmax[traces]`), and clients for the Castform training platform (`benchmax.platform`)
85
80
 
86
81
  Define your environment as:
87
82
 
@@ -93,26 +88,22 @@ Rollout management, parallel execution, etc. comes out of the box.
93
88
 
94
89
  ⭐ Star our repository to show your support!
95
90
 
96
- ## 💡 Core Features
91
+ ## 💡 Core Features
97
92
 
98
93
  **Built-in examples & templates**
99
94
 
100
95
  Get started with ready-to-use recipes, from Wikipedia search to spreadsheet manipulation. Easy to copy, customize, and extend. And yes, more are on the way.
101
96
 
102
- **Trainer integrations**
103
-
104
- Use your own trainer or training framework - no lock-in. `benchmax` is already integrated into SkyRL, with more integrations (Tinker, etc.) coming soon!
105
-
106
97
  **MCP support**
107
98
 
108
99
  Tap into the growing MCP ecosystem and integrate MCP servers as tools within your environments.
109
100
 
110
101
  **Multi-node parallel execution**
111
102
 
112
- Multi-node parallelization enabled out of the box with state isolation across roll-outs (e.g. editing files on filesystem, etc.).
103
+ Multi-node parallelization enabled out of the box with state isolation across roll-outs (e.g. editing files on filesystem, etc.).
113
104
 
114
105
 
115
- ## 🌐 Creating & Training with Environments
106
+ ## 🌐 Creating Environments
116
107
 
117
108
  ### What is an environment?
118
109
 
@@ -128,9 +119,10 @@ We also support MCP servers natively, allowing you to easily leverage the many s
128
119
  Ready-to-use environments with pre-configured tools and reward functions.
129
120
 
130
121
  - [CRM](/src/benchmax/envs/crm/README.md)
131
- - [Excel](/src/benchmax/envs/excel/README.md)
122
+ - [Excel](/src/benchmax/envs/excel/README.md)
132
123
  - [Math](/src/benchmax/envs/math/README.md)
133
124
  - [Wikipedia](/src/benchmax/envs/wikipedia/README.md)
125
+ - [PostgreSQL search](/src/benchmax/envs/postgres_search/) (`benchmax[rag]`)
134
126
 
135
127
  ### How do I create a custom environment?
136
128
 
@@ -142,12 +134,6 @@ Ready-to-use environments with pre-configured tools and reward functions.
142
134
 
143
135
  - Check out our excel spreadsheet RL environment: `benchmax.envs.excel.excel_env.ExcelEnv`
144
136
 
145
- ### How do I use an environment with my preferred RL Trainer?
146
-
147
- We currently have integrations with SkyRL. More incoming!
148
-
149
- [`benchmax` environments with skyrl](/examples/skyrl/README.md)
150
-
151
137
  ### I want a specific environment
152
138
 
153
139
  Open an issue and tag us & we will look into building you one!
@@ -157,36 +143,46 @@ Open an issue and tag us & we will look into building you one!
157
143
  ## 🎯 Motivation
158
144
 
159
145
  - **Modularity and Simplicity**:
160
-
146
+
161
147
  We set out to build a lightweight, modular system for defining RL environments—breaking them down into simple, composable parts: tools, tool output parsing, and reward functions.
162
-
163
- The goals to make it easy for software engineers to build and experiment with RL environments without needing deep RL expertise.
164
-
165
- - **Trainer Integrations**:
166
-
167
- There’s been lots of new RL training frameworks popping up (e.g., numerous forks of verl) & we expect this to continue. They are often tightly coupled with specific environments, leading to fragmentation and limited compatibility.
168
-
169
- We are building `benchmax` as a standalone library with integrations to these different training frameworks & as an easy way for new frameworks to tap into an existing pool of environments. We're already integrated with SkyRL (Tinker coming soon)!
170
-
148
+
149
+ The goal is to make it easy for software engineers to build and experiment with RL environments without needing deep RL expertise.
150
+
171
151
  - **Task Recipes and Ideas**:
172
-
152
+
173
153
  We want `benchmax` to be a living library of reusable, RL-compatible task recipes, ready to inspire and extend beyond the usual suspects like math and coding. We aim to support more real-world workflows, including open-ended and long-horizon tasks.
174
-
154
+
175
155
  - **Parallelization and Cloud Compatibility**:
176
156
  - Enable efficient parallelization with maintained statefulness between rollouts.
177
157
  - Facilitate easy deployment and scalability in cloud environments.
178
158
 
179
159
  - **MCP as a first class citizen**:
180
-
181
- There has been an explosion of MCP servers/tools built out for use-cases ranging from browser use to excel to game creation.`benchmax` allows folks to leverage and compose these existing MCP servers to build environments integrated with real world systems e.g. excel
182
-
160
+
161
+ There has been an explosion of MCP servers/tools built out for use cases ranging from browser use to Excel to game creation. `benchmax` allows folks to leverage and compose these existing MCP servers to build environments integrated with real-world systems, e.g. Excel.
162
+
183
163
 
184
164
  ## 🤝 Contributing
185
165
 
186
- We welcome new environment recipes, bug reports, and trainer integrations!
166
+ We welcome new environment recipes and bug reports!
187
167
 
188
168
  ⭐ Star our repository to show your support!
189
169
 
170
+ ## 📦 Add-ons
171
+
172
+ In addition to the core env library, `benchmax` ships several optional
173
+ modules behind extras:
174
+
175
+ | Extra | Module | Purpose |
176
+ |---|---|---|
177
+ | `benchmax[rag]` | `benchmax.rag.*` | Markdown chunking, corpus indexing (Postgres / Chroma / Pinecone / Turbopuffer), synthetic QA dataset generation, RAG-specific reward rubrics |
178
+ | `benchmax[traces]` | `benchmax.traces` | Agentic trace import (Braintrust today, Langfuse coming) and provider-agnostic processing pipeline |
179
+ | `benchmax[chroma]` / `[pinecone]` / `[turbopuffer]` | `benchmax.rag.corpus.*` | Corpus-backend pins (combine with `[rag]`) |
180
+ | _(core)_ | `benchmax.platform` | HTTP clients for the Castform platform — storage uploads, training-job launch, rollout server. Used both internally by `benchmax.rag` and by the high-level [`castform-sdk`](https://pypi.org/project/castform-sdk/). |
181
+
182
+ All platform URLs derive from `CASTFORM_BASE_DOMAIN` (default
183
+ `castform.com`) with per-component overrides; see
184
+ [`benchmax.config`](src/benchmax/config.py).
185
+
190
186
  ## 📜 License
191
187
 
192
- Apache 2.0 © 2025 CGFT Inc.
188
+ Apache 2.0 © 2026 Castform
@@ -0,0 +1,138 @@
1
+ <picture>
2
+ <img alt="Benchmax" src="./static/benchmax.png" width="full">
3
+ </picture>
4
+
5
+ ## benchmax: Framework-Agnostic RL Environments for LLM Fine-Tuning
6
+ *A lightweight, training-framework agnostic library for defining, running, and parallelizing environments, to fine-tune OSS LLMs with reinforcement learning.*
7
+ <div align="center">
8
+ </div>
9
+ <div id="badges" align="center">
10
+ <a href="https://castform.com">
11
+ <img src="https://img.shields.io/badge/castform.com-blue?style=for-the-badge" alt="Website"/>
12
+ </a>
13
+ </div>
14
+ <div align="center" style="line-height: 1;">
15
+ <a href="./LICENSE"><img alt="License" src="https://img.shields.io/badge/License-Apache_2.0-blue.svg"/></a>
16
+ </div>
17
+
18
+ ## 📌 News
19
+
20
+ - **[29 Oct 2025]** 🎉 Added support for easy multi-node parallelization across all major cloud providers using [SkyPilot](https://github.com/skypilot-org/skypilot)
21
+
22
+ ## ℹ️ Overview
23
+
24
+ `benchmax` comes with:
25
+
26
+ - A collection of ready-to-use reinforcement learning (RL) environments for LLM fine-tuning ranging from multi-hop search to spreadsheet manipulation to CRM agents
27
+ - An easy way to define, compose, and parallelize your own environments, including leveraging the existing ecosystem of MCP servers
28
+ - Trainer-agnostic by design — `BaseEnv` exposes a small async interface (`list_tools`, `run_tool`, `compute_reward`, plus optional rollout lifecycle hooks) that any rollout loop can drive
29
+ - Optional batteries-included add-ons: synthetic RAG dataset generation (`benchmax[rag]`), agent trace import (`benchmax[traces]`), and clients for the Castform training platform (`benchmax.platform`)
30
+
31
+ Define your environment as:
32
+
33
+ 1. A **toolset** (LLM calls, external APIs, calculators, MCPs, etc.).
34
+ 2. **Output parsing** logic to extract structured observations.
35
+ 3. **Reward functions** to score model outputs.
36
+
37
+ Rollout management, parallel execution, etc. come out of the box.
38
+
39
+ ⭐ Star our repository to show your support!
40
+
41
+ ## 💡 Core Features
42
+
43
+ **Built-in examples & templates**
44
+
45
+ Get started with ready-to-use recipes, from Wikipedia search to spreadsheet manipulation. Easy to copy, customize, and extend. And yes, more are on the way.
46
+
47
+ **MCP support**
48
+
49
+ Tap into the growing MCP ecosystem and integrate MCP servers as tools within your environments.
50
+
51
+ **Multi-node parallel execution**
52
+
53
+ Multi-node parallelization enabled out of the box with state isolation across roll-outs (e.g. editing files on filesystem, etc.).
54
+
55
+
56
+ ## 🌐 Creating Environments
57
+
58
+ ### What is an environment?
59
+
60
+ An environment consists of:
61
+
62
+ - A list of tools that an LLM can call
63
+ - A list of reward functions that evaluate the quality & correctness of the model's final output.
64
+
65
+ We also support MCP servers natively, allowing you to easily leverage the many servers built by the community.
66
+
67
+ ### Pre-built environments
68
+
69
+ Ready-to-use environments with pre-configured tools and reward functions.
70
+
71
+ - [CRM](/src/benchmax/envs/crm/README.md)
72
+ - [Excel](/src/benchmax/envs/excel/README.md)
73
+ - [Math](/src/benchmax/envs/math/README.md)
74
+ - [Wikipedia](/src/benchmax/envs/wikipedia/README.md)
75
+ - [PostgreSQL search](/src/benchmax/envs/postgres_search/) (`benchmax[rag]`)
76
+
77
+ ### How do I create a custom environment?
78
+
79
+ 1. [With existing MCP servers](/src/benchmax/envs/mcp/README.md) (Built-in support for multi-node parallelization)
80
+
81
+ 2. [Extend BaseEnv](/src/benchmax/envs/README.md)
82
+
83
+ ### How about more complex environments?
84
+
85
+ - Check out our excel spreadsheet RL environment: `benchmax.envs.excel.excel_env.ExcelEnv`
86
+
87
+ ### I want a specific environment
88
+
89
+ Open an issue and tag us & we will look into building you one!
90
+
91
+ ---
92
+
93
+ ## 🎯 Motivation
94
+
95
+ - **Modularity and Simplicity**:
96
+
97
+ We set out to build a lightweight, modular system for defining RL environments—breaking them down into simple, composable parts: tools, tool output parsing, and reward functions.
98
+
99
+ The goal is to make it easy for software engineers to build and experiment with RL environments without needing deep RL expertise.
100
+
101
+ - **Task Recipes and Ideas**:
102
+
103
+ We want `benchmax` to be a living library of reusable, RL-compatible task recipes, ready to inspire and extend beyond the usual suspects like math and coding. We aim to support more real-world workflows, including open-ended and long-horizon tasks.
104
+
105
+ - **Parallelization and Cloud Compatibility**:
106
+ - Enable efficient parallelization with maintained statefulness between rollouts.
107
+ - Facilitate easy deployment and scalability in cloud environments.
108
+
109
+ - **MCP as a first class citizen**:
110
+
111
+ There has been an explosion of MCP servers/tools built out for use cases ranging from browser use to Excel to game creation. `benchmax` allows folks to leverage and compose these existing MCP servers to build environments integrated with real-world systems, e.g. Excel.
112
+
113
+
114
+ ## 🤝 Contributing
115
+
116
+ We welcome new environment recipes and bug reports!
117
+
118
+ ⭐ Star our repository to show your support!
119
+
120
+ ## 📦 Add-ons
121
+
122
+ In addition to the core env library, `benchmax` ships several optional
123
+ modules behind extras:
124
+
125
+ | Extra | Module | Purpose |
126
+ |---|---|---|
127
+ | `benchmax[rag]` | `benchmax.rag.*` | Markdown chunking, corpus indexing (Postgres / Chroma / Pinecone / Turbopuffer), synthetic QA dataset generation, RAG-specific reward rubrics |
128
+ | `benchmax[traces]` | `benchmax.traces` | Agentic trace import (Braintrust today, Langfuse coming) and provider-agnostic processing pipeline |
129
+ | `benchmax[chroma]` / `[pinecone]` / `[turbopuffer]` | `benchmax.rag.corpus.*` | Corpus-backend pins (combine with `[rag]`) |
130
+ | _(core)_ | `benchmax.platform` | HTTP clients for the Castform platform — storage uploads, training-job launch, rollout server. Used both internally by `benchmax.rag` and by the high-level [`castform-sdk`](https://pypi.org/project/castform-sdk/). |
131
+
132
+ All platform URLs derive from `CASTFORM_BASE_DOMAIN` (default
133
+ `castform.com`) with per-component overrides; see
134
+ [`benchmax.config`](src/benchmax/config.py).
135
+
136
+ ## 📜 License
137
+
138
+ Apache 2.0 © 2026 Castform
@@ -1,16 +1,19 @@
1
1
  [project]
2
2
  name = "benchmax"
3
- version = "0.1.2.dev23"
3
+ version = "0.1.2.dev26"
4
4
  description = "Framework-Agnostic RL Environments for LLM Fine-Tuning"
5
5
  readme = "README.md"
6
- authors = [{ name = "cgft.io" }]
7
- requires-python = ">=3.12"
6
+ authors = [{ name = "castie@castform.com" }]
7
+ requires-python = "==3.12.*"
8
8
  dependencies = [
9
9
  "aiohttp>=3.13.1",
10
10
  "asyncio>=4.0.0",
11
11
  "cloudpickle>=3.0.0",
12
12
  "datasets>=4.0.0",
13
- "expt-logger>=0.1.0.dev22",
13
+ "httpx>=0.27.0",
14
+ "json-repair>=0.59.10",
15
+ "openai>=2.15.0",
16
+ "pydantic>=2.0.0",
14
17
  ]
15
18
  classifiers = [
16
19
  "Programming Language :: Python :: 3",
@@ -34,6 +37,23 @@ skypilot = [
34
37
  "pip>=25.3",
35
38
  "msrestazure>=0.6.4.post1",
36
39
  ]
40
+ excel = ["openpyxl>=3.1.5"]
41
+ excel-mac-windows = ["openpyxl>=3.1.5", "xlwings>=0.33.16"]
42
+ crm = ["python-dateutil>=2.9.0.post0"]
43
+ rag = [
44
+ "keybert>=0.8",
45
+ "langchain-text-splitters>=0.3.0",
46
+ "nest-asyncio>=1.5.0",
47
+ "ragas>=0.4.3",
48
+ "ruamel-yaml>=0.19.1",
49
+ "scikit-learn>=1.8.0",
50
+ "sentence-transformers>=5.2.3",
51
+ "tqdm>=4.66.0",
52
+ ]
53
+ traces = []
54
+ chroma = ["chromadb>=1.0.0"]
55
+ pinecone = ["pinecone>=5.0.0"]
56
+ turbopuffer = ["turbopuffer>=1.16.2"]
37
57
 
38
58
  [dependency-groups]
39
59
  dev = [
@@ -51,23 +71,6 @@ skypilot = [
51
71
  "pip>=25.3", # Added as needed for skypilot launch
52
72
  "msrestazure>=0.6.4.post1",
53
73
  ]
54
- skyrl = [
55
- "grpcio>=1.60.0",
56
- "hydra-core>=1.3.2",
57
- "omegaconf>=2.3.0",
58
- "ray>=2.48.0",
59
- "skyrl-gym>=0.1.1",
60
- "skyrl-train[vllm]>=0.2.0",
61
- ]
62
74
  excel = ["openpyxl>=3.1.5"]
63
75
  excel-mac-windows = ["openpyxl>=3.1.5", "xlwings>=0.33.16"]
64
- crm = ["python-dateutil>=2.9.0.post0", "simple-salesforce>=1.12.9"]
65
-
66
- [tool.uv]
67
- conflicts = [[{ group = "skypilot" }, { group = "skyrl" }]]
68
-
69
- # [tool.uv.extra-build-dependencies]
70
- # flash-attn = [{ requirement = "torch", match-runtime = true }]
71
-
72
- # [tool.uv.extra-build-variables]
73
- # flash-attn = { FLASH_ATTENTION_SKIP_CUDA_BUILD = "TRUE" }
76
+ crm = ["python-dateutil>=2.9.0.post0"]