@researai/deepscientist 1.5.17 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (894)
  1. package/AGENTS.md +309 -130
  2. package/AISB/catalog/aisb.b1.agentic_coding.yaml +244 -0
  3. package/AISB/catalog/aisb.b10.climate_earth.yaml +235 -0
  4. package/AISB/catalog/aisb.b11.model_efficiency.yaml +231 -0
  5. package/AISB/catalog/aisb.b12.embodied_ai.yaml +238 -0
  6. package/AISB/catalog/aisb.b2.agent_systems.yaml +229 -0
  7. package/AISB/catalog/aisb.b3.self_evolving_rl.yaml +237 -0
  8. package/AISB/catalog/aisb.b4.lm_reasoning.yaml +240 -0
  9. package/AISB/catalog/aisb.b5.math_proof.yaml +235 -0
  10. package/AISB/catalog/aisb.b6.research_process.yaml +243 -0
  11. package/AISB/catalog/aisb.b7.multimodal_fusion.yaml +232 -0
  12. package/AISB/catalog/aisb.b8.lifesci_drug.yaml +275 -0
  13. package/AISB/catalog/aisb.b9.material_science.yaml +237 -0
  14. package/AISB/catalog/aisb.t3.001_savvy.yaml +159 -0
  15. package/AISB/catalog/aisb.t3.001_savvy.zh.yaml +121 -0
  16. package/AISB/catalog/aisb.t3.002_pinet.yaml +189 -0
  17. package/AISB/catalog/aisb.t3.002_pinet.zh.yaml +130 -0
  18. package/AISB/catalog/aisb.t3.004_decentralattn.yaml +184 -0
  19. package/AISB/catalog/aisb.t3.004_decentralattn.zh.yaml +153 -0
  20. package/AISB/catalog/aisb.t3.005_tsae.yaml +193 -0
  21. package/AISB/catalog/aisb.t3.005_tsae.zh.yaml +139 -0
  22. package/AISB/catalog/aisb.t3.006_physense.yaml +194 -0
  23. package/AISB/catalog/aisb.t3.006_physense.zh.yaml +118 -0
  24. package/AISB/catalog/aisb.t3.007_reasoningiqa.yaml +169 -0
  25. package/AISB/catalog/aisb.t3.007_reasoningiqa.zh.yaml +133 -0
  26. package/AISB/catalog/aisb.t3.008_meanflows.yaml +188 -0
  27. package/AISB/catalog/aisb.t3.008_meanflows.zh.yaml +140 -0
  28. package/AISB/catalog/aisb.t3.009_scoremissing.yaml +179 -0
  29. package/AISB/catalog/aisb.t3.009_scoremissing.zh.yaml +119 -0
  30. package/AISB/catalog/aisb.t3.010_suitabilityfilter.yaml +221 -0
  31. package/AISB/catalog/aisb.t3.010_suitabilityfilter.zh.yaml +141 -0
  32. package/AISB/catalog/aisb.t3.011_osd.yaml +206 -0
  33. package/AISB/catalog/aisb.t3.011_osd.zh.yaml +163 -0
  34. package/AISB/catalog/aisb.t3.012_efficientqat.yaml +206 -0
  35. package/AISB/catalog/aisb.t3.012_efficientqat.zh.yaml +159 -0
  36. package/AISB/catalog/aisb.t3.013_appl.yaml +152 -0
  37. package/AISB/catalog/aisb.t3.013_appl.zh.yaml +126 -0
  38. package/AISB/catalog/aisb.t3.014_piguard.yaml +207 -0
  39. package/AISB/catalog/aisb.t3.014_piguard.zh.yaml +164 -0
  40. package/AISB/catalog/aisb.t3.015_frspec.yaml +209 -0
  41. package/AISB/catalog/aisb.t3.015_frspec.zh.yaml +163 -0
  42. package/AISB/catalog/aisb.t3.016_mathfusion.yaml +166 -0
  43. package/AISB/catalog/aisb.t3.016_mathfusion.zh.yaml +145 -0
  44. package/AISB/catalog/aisb.t3.017_multimodalglp.yaml +171 -0
  45. package/AISB/catalog/aisb.t3.017_multimodalglp.zh.yaml +122 -0
  46. package/AISB/catalog/aisb.t3.018_cotsynth.yaml +206 -0
  47. package/AISB/catalog/aisb.t3.018_cotsynth.zh.yaml +162 -0
  48. package/AISB/catalog/aisb.t3.019_dyscaleut.yaml +211 -0
  49. package/AISB/catalog/aisb.t3.019_dyscaleut.zh.yaml +148 -0
  50. package/AISB/catalog/aisb.t3.020_aristotle.yaml +173 -0
  51. package/AISB/catalog/aisb.t3.020_aristotle.zh.yaml +119 -0
  52. package/AISB/catalog/aisb.t3.021_tokenrecycling.yaml +160 -0
  53. package/AISB/catalog/aisb.t3.021_tokenrecycling.zh.yaml +129 -0
  54. package/AISB/catalog/aisb.t3.022_chainofreasoning.yaml +204 -0
  55. package/AISB/catalog/aisb.t3.022_chainofreasoning.zh.yaml +161 -0
  56. package/AISB/catalog/aisb.t3.023_guidedembed.yaml +211 -0
  57. package/AISB/catalog/aisb.t3.023_guidedembed.zh.yaml +189 -0
  58. package/AISB/catalog/aisb.t3.024_outputcentric.yaml +148 -0
  59. package/AISB/catalog/aisb.t3.024_outputcentric.zh.yaml +131 -0
  60. package/AISB/catalog/aisb.t3.025_deeper.yaml +143 -0
  61. package/AISB/catalog/aisb.t3.025_deeper.zh.yaml +116 -0
  62. package/AISB/catalog/aisb.t3.026_gartkg.yaml +195 -0
  63. package/AISB/catalog/aisb.t3.026_gartkg.zh.yaml +127 -0
  64. package/AISB/catalog/aisb.t3.027_citeeval.yaml +182 -0
  65. package/AISB/catalog/aisb.t3.027_citeeval.zh.yaml +135 -0
  66. package/AISB/catalog/aisb.t3.028_sbam.yaml +206 -0
  67. package/AISB/catalog/aisb.t3.028_sbam.zh.yaml +166 -0
  68. package/AISB/catalog/aisb.t3.029_cdqgeoembed.yaml +224 -0
  69. package/AISB/catalog/aisb.t3.029_cdqgeoembed.zh.yaml +142 -0
  70. package/AISB/catalog/aisb.t3.030_processrm.yaml +211 -0
  71. package/AISB/catalog/aisb.t3.030_processrm.zh.yaml +166 -0
  72. package/AISB/catalog/aisb.t3.031_circuitstability.yaml +172 -0
  73. package/AISB/catalog/aisb.t3.031_circuitstability.zh.yaml +134 -0
  74. package/AISB/catalog/aisb.t3.032_ptsolver.yaml +169 -0
  75. package/AISB/catalog/aisb.t3.032_ptsolver.zh.yaml +135 -0
  76. package/AISB/catalog/aisb.t3.033_gcse.yaml +144 -0
  77. package/AISB/catalog/aisb.t3.033_gcse.zh.yaml +126 -0
  78. package/AISB/catalog/aisb.t3.034_ensemblewm.yaml +183 -0
  79. package/AISB/catalog/aisb.t3.034_ensemblewm.zh.yaml +146 -0
  80. package/AISB/catalog/aisb.t3.035_moralvalueswa.yaml +207 -0
  81. package/AISB/catalog/aisb.t3.035_moralvalueswa.zh.yaml +165 -0
  82. package/AISB/catalog/aisb.t3.036_weakstrongpref.yaml +210 -0
  83. package/AISB/catalog/aisb.t3.036_weakstrongpref.zh.yaml +194 -0
  84. package/AISB/catalog/aisb.t3.037_dementiamask.yaml +172 -0
  85. package/AISB/catalog/aisb.t3.037_dementiamask.zh.yaml +132 -0
  86. package/AISB/catalog/aisb.t3.038_tinysam.yaml +284 -0
  87. package/AISB/catalog/aisb.t3.038_tinysam.zh.yaml +240 -0
  88. package/AISB/catalog/aisb.t3.039_calf.yaml +224 -0
  89. package/AISB/catalog/aisb.t3.039_calf.zh.yaml +194 -0
  90. package/AISB/catalog/aisb.t3.040_graniteguardian.yaml +199 -0
  91. package/AISB/catalog/aisb.t3.040_graniteguardian.zh.yaml +174 -0
  92. package/AISB/catalog/aisb.t3.041_amdm.yaml +149 -0
  93. package/AISB/catalog/aisb.t3.041_amdm.zh.yaml +137 -0
  94. package/AISB/catalog/aisb.t3.042_xpatch.yaml +216 -0
  95. package/AISB/catalog/aisb.t3.042_xpatch.zh.yaml +182 -0
  96. package/AISB/catalog/aisb.t3.043_vhm.yaml +268 -0
  97. package/AISB/catalog/aisb.t3.043_vhm.zh.yaml +193 -0
  98. package/AISB/catalog/aisb.t3.044_rgvi.yaml +224 -0
  99. package/AISB/catalog/aisb.t3.044_rgvi.zh.yaml +176 -0
  100. package/AISB/catalog/aisb.t3.045_pslstm.yaml +203 -0
  101. package/AISB/catalog/aisb.t3.045_pslstm.zh.yaml +179 -0
  102. package/AISB/catalog/aisb.t3.046_nonstatts.yaml +208 -0
  103. package/AISB/catalog/aisb.t3.046_nonstatts.zh.yaml +194 -0
  104. package/AISB/catalog/aisb.t3.047_timepfn.yaml +156 -0
  105. package/AISB/catalog/aisb.t3.047_timepfn.zh.yaml +124 -0
  106. package/AISB/catalog/aisb.t3.048_proxyspex.yaml +148 -0
  107. package/AISB/catalog/aisb.t3.048_proxyspex.zh.yaml +125 -0
  108. package/AISB/catalog/aisb.t3.049_hogwildinference.yaml +183 -0
  109. package/AISB/catalog/aisb.t3.049_hogwildinference.zh.yaml +138 -0
  110. package/AISB/catalog/aisb.t3.050_causalpfn.yaml +214 -0
  111. package/AISB/catalog/aisb.t3.050_causalpfn.zh.yaml +190 -0
  112. package/AISB/catalog/aisb.t3.051_flashtp.yaml +169 -0
  113. package/AISB/catalog/aisb.t3.051_flashtp.zh.yaml +124 -0
  114. package/AISB/catalog/aisb.t3.052_nsdiff.yaml +155 -0
  115. package/AISB/catalog/aisb.t3.052_nsdiff.zh.yaml +138 -0
  116. package/AISB/catalog/aisb.t3.053_k2vae.yaml +158 -0
  117. package/AISB/catalog/aisb.t3.053_k2vae.zh.yaml +132 -0
  118. package/AISB/catalog/aisb.t3.054_timebase.yaml +178 -0
  119. package/AISB/catalog/aisb.t3.054_timebase.zh.yaml +158 -0
  120. package/AISB/catalog/aisb.t3.055_csbrain.yaml +238 -0
  121. package/AISB/catalog/aisb.t3.055_csbrain.zh.yaml +184 -0
  122. package/AISB/catalog/aisb.t3.056_infosam.yaml +224 -0
  123. package/AISB/catalog/aisb.t3.056_infosam.zh.yaml +189 -0
  124. package/AISB/catalog/aisb.t3.057_mdreid.yaml +129 -0
  125. package/AISB/catalog/aisb.t3.057_mdreid.zh.yaml +117 -0
  126. package/AISB/catalog/aisb.t3.058_mindglitch.yaml +171 -0
  127. package/AISB/catalog/aisb.t3.058_mindglitch.zh.yaml +145 -0
  128. package/AISB/catalog/aisb.t3.059_selfsupervised.yaml +154 -0
  129. package/AISB/catalog/aisb.t3.059_selfsupervised.zh.yaml +125 -0
  130. package/AISB/catalog/aisb.t3.060_iaggad.yaml +121 -0
  131. package/AISB/catalog/aisb.t3.060_iaggad.zh.yaml +100 -0
  132. package/AISB/catalog/aisb.t3.061_hsgkn.yaml +136 -0
  133. package/AISB/catalog/aisb.t3.061_hsgkn.zh.yaml +113 -0
  134. package/AISB/catalog/aisb.t3.062_visionts.yaml +237 -0
  135. package/AISB/catalog/aisb.t3.062_visionts.zh.yaml +216 -0
  136. package/AISB/catalog/aisb.t3.063_tsrag.yaml +162 -0
  137. package/AISB/catalog/aisb.t3.063_tsrag.zh.yaml +138 -0
  138. package/AISB/catalog/aisb.t3.064_pir.yaml +221 -0
  139. package/AISB/catalog/aisb.t3.064_pir.zh.yaml +197 -0
  140. package/AISB/catalog/aisb.t3.065_proteinbinding.yaml +234 -0
  141. package/AISB/catalog/aisb.t3.065_proteinbinding.zh.yaml +167 -0
  142. package/AISB/catalog/aisb.t3.066_tropicalattention.yaml +267 -0
  143. package/AISB/catalog/aisb.t3.066_tropicalattention.zh.yaml +229 -0
  144. package/AISB/catalog/aisb.t3.067_kanad.yaml +193 -0
  145. package/AISB/catalog/aisb.t3.067_kanad.zh.yaml +167 -0
  146. package/AISB/catalog/aisb.t3.068_sempo.yaml +187 -0
  147. package/AISB/catalog/aisb.t3.068_sempo.zh.yaml +148 -0
  148. package/AISB/catalog/aisb.t3.069_treehfd.yaml +129 -0
  149. package/AISB/catalog/aisb.t3.069_treehfd.zh.yaml +111 -0
  150. package/AISB/catalog/aisb.t3.070_certifiedunlearning.yaml +224 -0
  151. package/AISB/catalog/aisb.t3.070_certifiedunlearning.zh.yaml +171 -0
  152. package/AISB/catalog/aisb.t3.071_neuralmjd.yaml +142 -0
  153. package/AISB/catalog/aisb.t3.071_neuralmjd.zh.yaml +120 -0
  154. package/AISB/catalog/aisb.t3.072_fedgmt.yaml +181 -0
  155. package/AISB/catalog/aisb.t3.072_fedgmt.zh.yaml +158 -0
  156. package/AISB/catalog/aisb.t3.073_rld.yaml +161 -0
  157. package/AISB/catalog/aisb.t3.073_rld.zh.yaml +129 -0
  158. package/AISB/catalog/aisb.t3.074_lsvi.yaml +163 -0
  159. package/AISB/catalog/aisb.t3.074_lsvi.zh.yaml +129 -0
  160. package/AISB/catalog/aisb.t3.075_treeslicedentropy.yaml +201 -0
  161. package/AISB/catalog/aisb.t3.075_treeslicedentropy.zh.yaml +148 -0
  162. package/AISB/catalog/aisb.t3.076_aanet.yaml +169 -0
  163. package/AISB/catalog/aisb.t3.076_aanet.zh.yaml +129 -0
  164. package/AISB/catalog/aisb.t3.077_cmnn.yaml +199 -0
  165. package/AISB/catalog/aisb.t3.077_cmnn.zh.yaml +165 -0
  166. package/AISB/catalog/aisb.t3.078_conformalanomaly.yaml +146 -0
  167. package/AISB/catalog/aisb.t3.078_conformalanomaly.zh.yaml +117 -0
  168. package/AISB/catalog/aisb.t3.079_dpfkmeans.yaml +131 -0
  169. package/AISB/catalog/aisb.t3.079_dpfkmeans.zh.yaml +104 -0
  170. package/AISB/catalog/aisb.t3.080_latentscorereweight.yaml +169 -0
  171. package/AISB/catalog/aisb.t3.080_latentscorereweight.zh.yaml +123 -0
  172. package/AISB/catalog/aisb.t3.081_qmamba.yaml +150 -0
  173. package/AISB/catalog/aisb.t3.081_qmamba.zh.yaml +117 -0
  174. package/AISB/catalog/aisb.t3.082_onlinellmrouting.yaml +160 -0
  175. package/AISB/catalog/aisb.t3.082_onlinellmrouting.zh.yaml +133 -0
  176. package/AISB/catalog/aisb.t3.083_starformer.yaml +178 -0
  177. package/AISB/catalog/aisb.t3.083_starformer.zh.yaml +140 -0
  178. package/AISB/catalog/aisb.t3.084_ift.yaml +139 -0
  179. package/AISB/catalog/aisb.t3.084_ift.zh.yaml +111 -0
  180. package/AISB/catalog/aisb.t3.085_neuralsurv.yaml +183 -0
  181. package/AISB/catalog/aisb.t3.085_neuralsurv.zh.yaml +143 -0
  182. package/AISB/catalog/aisb.t3.086_stella.yaml +197 -0
  183. package/AISB/catalog/aisb.t3.086_stella.zh.yaml +142 -0
  184. package/AISB/catalog/aisb.t3.087_moses.yaml +167 -0
  185. package/AISB/catalog/aisb.t3.087_moses.zh.yaml +132 -0
  186. package/AISB/catalog/aisb.t3.088_channelnorm.yaml +140 -0
  187. package/AISB/catalog/aisb.t3.088_channelnorm.zh.yaml +109 -0
  188. package/AISB/catalog/aisb.t3.089_causalvelocity.yaml +730 -0
  189. package/AISB/catalog/aisb.t3.089_causalvelocity.zh.yaml +668 -0
  190. package/AISB/catalog/aisb.t3.090_rstib.yaml +144 -0
  191. package/AISB/catalog/aisb.t3.090_rstib.zh.yaml +109 -0
  192. package/AISB/catalog/aisb.t3.091_timeawarecausal.yaml +132 -0
  193. package/AISB/catalog/aisb.t3.091_timeawarecausal.zh.yaml +107 -0
  194. package/AISB/catalog/aisb.t3.092_kmeanslocalopt.yaml +138 -0
  195. package/AISB/catalog/aisb.t3.092_kmeanslocalopt.zh.yaml +110 -0
  196. package/AISB/catalog/aisb.t3.093_fedwmsam.yaml +134 -0
  197. package/AISB/catalog/aisb.t3.093_fedwmsam.zh.yaml +106 -0
  198. package/AISB/catalog/aisb.t3.094_boundre.yaml +147 -0
  199. package/AISB/catalog/aisb.t3.094_boundre.zh.yaml +114 -0
  200. package/AISB/catalog/aisb.t3.095_fastfeaturecp.yaml +153 -0
  201. package/AISB/catalog/aisb.t3.095_fastfeaturecp.zh.yaml +118 -0
  202. package/AISB/catalog/aisb.t3.096_m3svm.yaml +189 -0
  203. package/AISB/catalog/aisb.t3.096_m3svm.zh.yaml +149 -0
  204. package/AISB/catalog/aisb.t3.097_wassersteintl.yaml +212 -0
  205. package/AISB/catalog/aisb.t3.097_wassersteintl.zh.yaml +169 -0
  206. package/AISB/catalog/aisb.t3.098_xmahalanobis.yaml +171 -0
  207. package/AISB/catalog/aisb.t3.098_xmahalanobis.zh.yaml +127 -0
  208. package/AISB/catalog/aisb.t3.099_ollalanding.yaml +248 -0
  209. package/AISB/catalog/aisb.t3.099_ollalanding.zh.yaml +182 -0
  210. package/AISB/catalog/aisb.t3.100_invmissingdata.yaml +179 -0
  211. package/AISB/catalog/aisb.t3.100_invmissingdata.zh.yaml +150 -0
  212. package/AISB/catalog/aisb.t3.101_acia.yaml +164 -0
  213. package/AISB/catalog/aisb.t3.101_acia.zh.yaml +109 -0
  214. package/AISB/catalog/aisb.t3.102_stochasticff.yaml +178 -0
  215. package/AISB/catalog/aisb.t3.102_stochasticff.zh.yaml +130 -0
  216. package/AISB/catalog/aisb.t3.103_qdcp.yaml +150 -0
  217. package/AISB/catalog/aisb.t3.103_qdcp.zh.yaml +116 -0
  218. package/AISB/catalog/aisb.t3.104_balancedactiveinf.yaml +137 -0
  219. package/AISB/catalog/aisb.t3.104_balancedactiveinf.zh.yaml +104 -0
  220. package/AISB/catalog/aisb.t3.105_binaryclasseval.yaml +161 -0
  221. package/AISB/catalog/aisb.t3.105_binaryclasseval.zh.yaml +130 -0
  222. package/AISB/image/001_aisb.t3.001_savvy.jpg +0 -0
  223. package/AISB/image/002_aisb.t3.002_pinet.jpg +0 -0
  224. package/AISB/image/003_aisb.t3.003_dmsqd.jpg +0 -0
  225. package/AISB/image/004_aisb.t3.004_decentralattn.jpg +0 -0
  226. package/AISB/image/005_aisb.t3.005_tsae.jpg +0 -0
  227. package/AISB/image/006_aisb.t3.006_physense.jpg +0 -0
  228. package/AISB/image/007_aisb.t3.007_reasoningiqa.jpg +0 -0
  229. package/AISB/image/008_aisb.t3.008_meanflows.jpg +0 -0
  230. package/AISB/image/009_aisb.t3.009_scoremissing.jpg +0 -0
  231. package/AISB/image/010_aisb.t3.010_suitabilityfilter.jpg +0 -0
  232. package/AISB/image/011_aisb.t3.011_osd.jpg +0 -0
  233. package/AISB/image/012_aisb.t3.012_efficientqat.jpg +0 -0
  234. package/AISB/image/013_aisb.t3.013_appl.jpg +0 -0
  235. package/AISB/image/014_aisb.t3.014_piguard.jpg +0 -0
  236. package/AISB/image/015_aisb.t3.015_frspec.jpg +0 -0
  237. package/AISB/image/016_aisb.t3.016_mathfusion.jpg +0 -0
  238. package/AISB/image/017_aisb.t3.017_multimodalglp.jpg +0 -0
  239. package/AISB/image/018_aisb.t3.018_cotsynth.jpg +0 -0
  240. package/AISB/image/019_aisb.t3.019_dyscaleut.jpg +0 -0
  241. package/AISB/image/020_aisb.t3.020_aristotle.jpg +0 -0
  242. package/AISB/image/021_aisb.t3.021_tokenrecycling.jpg +0 -0
  243. package/AISB/image/022_aisb.t3.022_chainofreasoning.jpg +0 -0
  244. package/AISB/image/023_aisb.t3.023_guidedembed.jpg +0 -0
  245. package/AISB/image/024_aisb.t3.024_outputcentric.jpg +0 -0
  246. package/AISB/image/025_aisb.t3.025_deeper.jpg +0 -0
  247. package/AISB/image/026_aisb.t3.026_gartkg.jpg +0 -0
  248. package/AISB/image/027_aisb.t3.027_citeeval.jpg +0 -0
  249. package/AISB/image/028_aisb.t3.028_sbam.jpg +0 -0
  250. package/AISB/image/029_aisb.t3.029_cdqgeoembed.jpg +0 -0
  251. package/AISB/image/030_aisb.t3.030_processrm.jpg +0 -0
  252. package/AISB/image/031_aisb.t3.031_circuitstability.jpg +0 -0
  253. package/AISB/image/032_aisb.t3.032_ptsolver.jpg +0 -0
  254. package/AISB/image/033_aisb.t3.033_gcse.jpg +0 -0
  255. package/AISB/image/034_aisb.t3.034_ensemblewm.jpg +0 -0
  256. package/AISB/image/035_aisb.t3.035_moralvalueswa.jpg +0 -0
  257. package/AISB/image/036_aisb.t3.036_weakstrongpref.jpg +0 -0
  258. package/AISB/image/037_aisb.t3.037_dementiamask.jpg +0 -0
  259. package/AISB/image/038_aisb.t3.038_tinysam.jpg +0 -0
  260. package/AISB/image/039_aisb.t3.039_calf.jpg +0 -0
  261. package/AISB/image/040_aisb.t3.040_graniteguardian.jpg +0 -0
  262. package/AISB/image/041_aisb.t3.041_amdm.jpg +0 -0
  263. package/AISB/image/042_aisb.t3.042_xpatch.jpg +0 -0
  264. package/AISB/image/043_aisb.t3.043_vhm.jpg +0 -0
  265. package/AISB/image/044_aisb.t3.044_rgvi.jpg +0 -0
  266. package/AISB/image/045_aisb.t3.045_pslstm.jpg +0 -0
  267. package/AISB/image/046_aisb.t3.046_nonstatts.jpg +0 -0
  268. package/AISB/image/047_aisb.t3.047_timepfn.jpg +0 -0
  269. package/AISB/image/048_aisb.t3.048_proxyspex.jpg +0 -0
  270. package/AISB/image/049_aisb.t3.049_hogwildinference.jpg +0 -0
  271. package/AISB/image/050_aisb.t3.050_causalpfn.jpg +0 -0
  272. package/AISB/image/051_aisb.t3.051_flashtp.jpg +0 -0
  273. package/AISB/image/052_aisb.t3.052_nsdiff.jpg +0 -0
  274. package/AISB/image/053_aisb.t3.053_k2vae.jpg +0 -0
  275. package/AISB/image/054_aisb.t3.054_timebase.jpg +0 -0
  276. package/AISB/image/055_aisb.t3.055_csbrain.jpg +0 -0
  277. package/AISB/image/056_aisb.t3.056_infosam.jpg +0 -0
  278. package/AISB/image/057_aisb.t3.057_mdreid.jpg +0 -0
  279. package/AISB/image/058_aisb.t3.058_mindglitch.jpg +0 -0
  280. package/AISB/image/059_aisb.t3.059_selfsupervised.jpg +0 -0
  281. package/AISB/image/060_aisb.t3.060_iaggad.jpg +0 -0
  282. package/AISB/image/061_aisb.t3.061_hsgkn.jpg +0 -0
  283. package/AISB/image/062_aisb.t3.062_visionts.jpg +0 -0
  284. package/AISB/image/063_aisb.t3.063_tsrag.jpg +0 -0
  285. package/AISB/image/064_aisb.t3.064_pir.jpg +0 -0
  286. package/AISB/image/065_aisb.t3.065_proteinbinding.jpg +0 -0
  287. package/AISB/image/066_aisb.t3.066_tropicalattention.jpg +0 -0
  288. package/AISB/image/067_aisb.t3.067_kanad.jpg +0 -0
  289. package/AISB/image/068_aisb.t3.068_sempo.jpg +0 -0
  290. package/AISB/image/069_aisb.t3.069_treehfd.jpg +0 -0
  291. package/AISB/image/070_aisb.t3.070_certifiedunlearning.jpg +0 -0
  292. package/AISB/image/071_aisb.t3.071_neuralmjd.jpg +0 -0
  293. package/AISB/image/072_aisb.t3.072_fedgmt.jpg +0 -0
  294. package/AISB/image/073_aisb.t3.073_rld.jpg +0 -0
  295. package/AISB/image/074_aisb.t3.074_lsvi.jpg +0 -0
  296. package/AISB/image/075_aisb.t3.075_treeslicedentropy.jpg +0 -0
  297. package/AISB/image/076_aisb.t3.076_aanet.jpg +0 -0
  298. package/AISB/image/077_aisb.t3.077_cmnn.jpg +0 -0
  299. package/AISB/image/078_aisb.t3.078_conformalanomaly.jpg +0 -0
  300. package/AISB/image/079_aisb.t3.079_dpfkmeans.jpg +0 -0
  301. package/AISB/image/080_aisb.t3.080_latentscorereweight.jpg +0 -0
  302. package/AISB/image/081_aisb.t3.081_qmamba.jpg +0 -0
  303. package/AISB/image/082_aisb.t3.082_onlinellmrouting.jpg +0 -0
  304. package/AISB/image/083_aisb.t3.083_starformer.jpg +0 -0
  305. package/AISB/image/084_aisb.t3.084_ift.jpg +0 -0
  306. package/AISB/image/085_aisb.t3.085_neuralsurv.jpg +0 -0
  307. package/AISB/image/086_aisb.t3.086_stella.jpg +0 -0
  308. package/AISB/image/087_aisb.t3.087_moses.jpg +0 -0
  309. package/AISB/image/088_aisb.t3.088_channelnorm.jpg +0 -0
  310. package/AISB/image/089_aisb.t3.089_causalvelocity.jpg +0 -0
  311. package/AISB/image/090_aisb.t3.090_rstib.jpg +0 -0
  312. package/AISB/image/091_aisb.t3.091_timeawarecausal.jpg +0 -0
  313. package/AISB/image/092_aisb.t3.092_kmeanslocalopt.jpg +0 -0
  314. package/AISB/image/093_aisb.t3.093_fedwmsam.jpg +0 -0
  315. package/AISB/image/094_aisb.t3.094_boundre.jpg +0 -0
  316. package/AISB/image/095_aisb.t3.095_fastfeaturecp.jpg +0 -0
  317. package/AISB/image/096_aisb.t3.096_m3svm.jpg +0 -0
  318. package/AISB/image/097_aisb.t3.097_wassersteintl.jpg +0 -0
  319. package/AISB/image/098_aisb.t3.098_xmahalanobis.jpg +0 -0
  320. package/AISB/image/099_aisb.t3.099_ollalanding.jpg +0 -0
  321. package/AISB/image/100_aisb.t3.100_invmissingdata.jpg +0 -0
  322. package/AISB/image/101_aisb.t3.101_acia.jpg +0 -0
  323. package/AISB/image/102_aisb.t3.102_stochasticff.jpg +0 -0
  324. package/AISB/image/103_aisb.t3.103_qdcp.jpg +0 -0
  325. package/AISB/image/104_aisb.t3.104_balancedactiveinf.jpg +0 -0
  326. package/AISB/image/105_aisb.t3.105_binaryclasseval.jpg +0 -0
  327. package/AISB/image/106_aisb.t1.reasoning_lite.jpg +0 -0
  328. package/AISB/image/107_aisb.t2.paper_audit.jpg +0 -0
  329. package/AISB/image/108_aisb.t3.multi_gpu_search.jpg +0 -0
  330. package/AISB/image/109_aisb.t3.tdc_admet.jpg +0 -0
  331. package/AISB/image/aisb.b1.agentic_coding.svg +16 -0
  332. package/AISB/image/aisb.b10.climate_earth.svg +16 -0
  333. package/AISB/image/aisb.b11.model_efficiency.svg +16 -0
  334. package/AISB/image/aisb.b12.embodied_ai.svg +16 -0
  335. package/AISB/image/aisb.b2.agent_systems.svg +16 -0
  336. package/AISB/image/aisb.b3.self_evolving_rl.svg +16 -0
  337. package/AISB/image/aisb.b4.lm_reasoning.svg +16 -0
  338. package/AISB/image/aisb.b5.math_proof.svg +16 -0
  339. package/AISB/image/aisb.b6.research_process.svg +16 -0
  340. package/AISB/image/aisb.b7.multimodal_fusion.svg +16 -0
  341. package/AISB/image/aisb.b8.lifesci_drug.svg +16 -0
  342. package/AISB/image/aisb.b9.material_science.svg +16 -0
  343. package/README.md +132 -11
  344. package/bin/ds.js +376 -49
  345. package/docs/en/00_QUICK_START.md +135 -18
  346. package/docs/en/01_SETTINGS_REFERENCE.md +468 -96
  347. package/docs/en/02_START_RESEARCH_GUIDE.md +26 -5
  348. package/docs/en/03_QQ_CONNECTOR_GUIDE.md +14 -3
  349. package/docs/en/04_LINGZHU_CONNECTOR_GUIDE.md +2 -0
  350. package/docs/en/05_TUI_GUIDE.md +171 -2
  351. package/docs/en/07_MEMORY_AND_MCP.md +38 -2
  352. package/docs/en/09_DOCTOR.md +64 -4
  353. package/docs/en/10_WEIXIN_CONNECTOR_GUIDE.md +38 -1
  354. package/docs/en/11_LICENSE_AND_RISK.md +4 -0
  355. package/docs/en/12_GUIDED_WORKFLOW_TOUR.md +15 -0
  356. package/docs/en/14_PROMPT_SKILLS_AND_MCP_GUIDE.md +9 -0
  357. package/docs/en/15_CODEX_PROVIDER_SETUP.md +622 -187
  358. package/docs/en/16_TELEGRAM_CONNECTOR_GUIDE.md +14 -0
  359. package/docs/en/17_WHATSAPP_CONNECTOR_GUIDE.md +14 -0
  360. package/docs/en/18_FEISHU_CONNECTOR_GUIDE.md +14 -0
  361. package/docs/en/21_LOCAL_MODEL_BACKENDS_GUIDE.md +105 -2
  362. package/docs/en/22_BENCHSTORE_YAML_REFERENCE.md +469 -0
  363. package/docs/en/23_BENCHSTORE_GITHUB_RELEASES_SPEC.md +316 -0
  364. package/docs/en/24_CLAUDE_CODE_PROVIDER_SETUP.md +469 -0
  365. package/docs/en/25_OPENCODE_PROVIDER_SETUP.md +653 -0
  366. package/docs/en/26_CITATION_AND_ATTRIBUTION.md +119 -0
  367. package/docs/en/27_KIMI_CODE_PROVIDER_SETUP.md +180 -0
  368. package/docs/en/28_DISCORD_CONNECTOR_GUIDE.md +61 -0
  369. package/docs/en/29_SLACK_CONNECTOR_GUIDE.md +60 -0
  370. package/docs/en/30_SETTINGS_CONTROL_CENTER_GUIDE.md +371 -0
  371. package/docs/en/{19_LOCAL_BROWSER_AUTH.md → 31_LOCAL_BROWSER_AUTH.md} +1 -1
  372. package/docs/en/32_WINDOWS_WSL2_DEPLOYMENT_GUIDE.md +273 -0
  373. package/docs/en/33_WORKSPACE_EXPLORER_QA.md +121 -0
  374. package/docs/en/91_DEVELOPMENT.md +29 -0
  375. package/docs/en/99_ACKNOWLEDGEMENTS.md +24 -19
  376. package/docs/en/README.md +44 -7
  377. package/docs/images/admin/admin-connectors-health-en.png +0 -0
  378. package/docs/images/admin/admin-controllers-en.png +0 -0
  379. package/docs/images/admin/admin-diagnostics-en.png +0 -0
  380. package/docs/images/admin/admin-errors-en.png +0 -0
  381. package/docs/images/admin/admin-issues-en.png +0 -0
  382. package/docs/images/admin/admin-logs-en.png +0 -0
  383. package/docs/images/admin/admin-quest-detail-en.png +0 -0
  384. package/docs/images/admin/admin-quests-en.png +0 -0
  385. package/docs/images/admin/admin-repairs-en.png +0 -0
  386. package/docs/images/admin/admin-runtime-en.png +0 -0
  387. package/docs/images/admin/admin-search-en.png +0 -0
  388. package/docs/images/admin/admin-stats-en.png +0 -0
  389. package/docs/images/admin/admin-summary-en.png +0 -0
  390. package/docs/images/connectors/connector-discord-en.png +0 -0
  391. package/docs/images/connectors/connector-feishu-en.png +0 -0
  392. package/docs/images/connectors/connector-lingzhu-en.png +0 -0
  393. package/docs/images/connectors/connector-qq-en.png +0 -0
  394. package/docs/images/connectors/connector-slack-en.png +0 -0
  395. package/docs/images/connectors/connector-telegram-en.png +0 -0
  396. package/docs/images/connectors/connector-weixin-en.png +0 -0
  397. package/docs/images/connectors/connector-whatsapp-en.png +0 -0
  398. package/docs/images/settings/settings-baselines-en.png +0 -0
  399. package/docs/images/settings/settings-config-en.png +0 -0
  400. package/docs/images/settings/settings-connectors-overview-en.png +0 -0
  401. package/docs/images/settings/settings-deepxiv-en.png +0 -0
  402. package/docs/images/settings/settings-mcp-servers-en.png +0 -0
  403. package/docs/images/settings/settings-plugins-en.png +0 -0
  404. package/docs/images/settings/settings-runners-en.png +0 -0
  405. package/docs/zh/00_QUICK_START.md +92 -17
  406. package/docs/zh/01_SETTINGS_REFERENCE.md +219 -98
  407. package/docs/zh/02_START_RESEARCH_GUIDE.md +26 -5
  408. package/docs/zh/05_TUI_GUIDE.md +171 -2
  409. package/docs/zh/07_MEMORY_AND_MCP.md +29 -2
  410. package/docs/zh/09_DOCTOR.md +39 -4
  411. package/docs/zh/10_WEIXIN_CONNECTOR_GUIDE.md +24 -1
  412. package/docs/zh/11_LICENSE_AND_RISK.md +4 -0
  413. package/docs/zh/12_GUIDED_WORKFLOW_TOUR.md +15 -0
  414. package/docs/zh/14_PROMPT_SKILLS_AND_MCP_GUIDE.md +9 -0
  415. package/docs/zh/15_CODEX_PROVIDER_SETUP.md +550 -188
  416. package/docs/zh/21_LOCAL_MODEL_BACKENDS_GUIDE.md +105 -2
  417. package/docs/zh/22_BENCHSTORE_YAML_REFERENCE.md +459 -0
  418. package/docs/zh/23_BENCHSTORE_GITHUB_RELEASES_SPEC.md +287 -0
  419. package/docs/zh/23_CLAUDE_RUNNER_GUIDE.md +103 -0
  420. package/docs/zh/24_CLAUDE_CODE_PROVIDER_SETUP.md +460 -0
  421. package/docs/zh/25_OPENCODE_PROVIDER_SETUP.md +660 -0
  422. package/docs/zh/26_CITATION_AND_ATTRIBUTION.md +102 -0
  423. package/docs/zh/27_KIMI_CODE_PROVIDER_SETUP.md +51 -0
  424. package/docs/zh/{19_LOCAL_BROWSER_AUTH.md → 31_LOCAL_BROWSER_AUTH.md} +1 -1
  425. package/docs/zh/32_WINDOWS_WSL2_DEPLOYMENT_GUIDE.md +264 -0
  426. package/docs/zh/33_WORKSPACE_EXPLORER_QA.md +127 -0
  427. package/docs/zh/99_ACKNOWLEDGEMENTS.md +23 -19
  428. package/docs/zh/README.md +29 -7
  429. package/install.sh +122 -16
  430. package/package.json +4 -1
  431. package/pyproject.toml +2 -1
  432. package/src/deepscientist/__init__.py +1 -1
  433. package/src/deepscientist/acp/envelope.py +13 -0
  434. package/src/deepscientist/admin/__init__.py +3 -0
  435. package/src/deepscientist/admin/charts.py +681 -0
  436. package/src/deepscientist/admin/logs.py +119 -0
  437. package/src/deepscientist/admin/repairs.py +217 -0
  438. package/src/deepscientist/admin/service.py +1310 -0
  439. package/src/deepscientist/admin/system_info.py +700 -0
  440. package/src/deepscientist/admin/tasks.py +465 -0
  441. package/src/deepscientist/admin/tool_metrics.py +600 -0
  442. package/src/deepscientist/artifact/guidance.py +8 -4
  443. package/src/deepscientist/artifact/schemas.py +115 -0
  444. package/src/deepscientist/artifact/service.py +4268 -260
  445. package/src/deepscientist/bash_exec/monitor.py +30 -3
  446. package/src/deepscientist/bash_exec/service.py +134 -1
  447. package/src/deepscientist/benchstore/__init__.py +4 -0
  448. package/src/deepscientist/benchstore/prompt_builder.py +224 -0
  449. package/src/deepscientist/benchstore/service.py +1716 -0
  450. package/src/deepscientist/channels/weixin_ilink.py +8 -1
  451. package/src/deepscientist/cli.py +92 -17
  452. package/src/deepscientist/codex_cli_compat.py +2 -2
  453. package/src/deepscientist/config/models.py +82 -11
  454. package/src/deepscientist/config/service.py +927 -91
  455. package/src/deepscientist/connector/weixin_support.py +48 -17
  456. package/src/deepscientist/daemon/api/handlers.py +697 -210
  457. package/src/deepscientist/daemon/api/router.py +76 -1
  458. package/src/deepscientist/daemon/app.py +1054 -51
  459. package/src/deepscientist/diagnostics/runner_failures.py +147 -0
  460. package/src/deepscientist/doctor.py +212 -65
  461. package/src/deepscientist/evidence_packets.py +590 -0
  462. package/src/deepscientist/home.py +52 -4
  463. package/src/deepscientist/kimi_cli_compat.py +50 -0
  464. package/src/deepscientist/latex_runtime.py +2 -2
  465. package/src/deepscientist/mcp/context.py +2 -0
  466. package/src/deepscientist/mcp/schemas.py +114 -0
  467. package/src/deepscientist/mcp/server.py +1566 -126
  468. package/src/deepscientist/memory/service.py +203 -16
  469. package/src/deepscientist/process_control.py +8 -1
  470. package/src/deepscientist/prompts/builder.py +836 -92
  471. package/src/deepscientist/quest/__init__.py +2 -2
  472. package/src/deepscientist/quest/layout.py +12 -1
  473. package/src/deepscientist/quest/node_traces.py +10 -0
  474. package/src/deepscientist/quest/service.py +1430 -139
  475. package/src/deepscientist/quest/stage_views.py +1 -1
  476. package/src/deepscientist/runners/__init__.py +18 -0
  477. package/src/deepscientist/runners/base.py +89 -1
  478. package/src/deepscientist/runners/builtins.py +13 -1
  479. package/src/deepscientist/runners/claude.py +391 -0
  480. package/src/deepscientist/runners/codex.py +421 -21
  481. package/src/deepscientist/runners/codex_telemetry.py +127 -0
  482. package/src/deepscientist/runners/kimi.py +334 -0
  483. package/src/deepscientist/runners/metadata.py +68 -0
  484. package/src/deepscientist/runners/opencode.py +414 -0
  485. package/src/deepscientist/runners/runtime_overrides.py +100 -0
  486. package/src/deepscientist/runners/simple_cli.py +538 -0
  487. package/src/deepscientist/runtime_storage.py +303 -0
  488. package/src/deepscientist/shared.py +61 -16
  489. package/src/deepscientist/skills/installer.py +37 -0
  490. package/src/deepscientist/skills/registry.py +2 -0
  491. package/src/deepscientist/tinytex.py +2 -2
  492. package/src/deepscientist/tui.py +10 -3
  493. package/src/prompts/benchstore/system.md +77 -0
  494. package/src/prompts/connectors/qq.md +33 -2
  495. package/src/prompts/connectors/weixin.md +208 -23
  496. package/src/prompts/contracts/admin_ops.md +74 -0
  497. package/src/prompts/contracts/admin_ops_knowledge.md +138 -0
  498. package/src/prompts/contracts/shared_interaction.md +5 -11
  499. package/src/prompts/start_setup/system.md +422 -0
  500. package/src/prompts/system.md +409 -315
  501. package/src/prompts/system_copilot.md +88 -12
  502. package/src/skills/analysis-campaign/SKILL.md +239 -578
  503. package/src/skills/analysis-campaign/references/artifact-flow-examples.md +102 -0
  504. package/src/skills/analysis-campaign/references/boundary-cases.md +98 -0
  505. package/src/skills/analysis-campaign/references/campaign-checklist-template.md +39 -24
  506. package/src/skills/analysis-campaign/references/campaign-design.md +26 -10
  507. package/src/skills/analysis-campaign/references/campaign-plan-template.md +53 -54
  508. package/src/skills/analysis-campaign/references/operational-guidance.md +97 -0
  509. package/src/skills/analysis-campaign/references/writing-facing-slice-examples.md +10 -20
  510. package/src/skills/baseline/SKILL.md +183 -461
  511. package/src/skills/baseline/references/artifact-flow-examples.md +106 -0
  512. package/src/skills/baseline/references/artifact-payload-examples.md +1 -1
  513. package/src/skills/baseline/references/baseline-checklist-template.md +27 -35
  514. package/src/skills/baseline/references/baseline-plan-template.md +37 -76
  515. package/src/skills/baseline/references/boundary-cases.md +86 -0
  516. package/src/skills/baseline/references/codebase-audit-checklist.md +2 -6
  517. package/src/skills/baseline/references/comparability-contract.md +7 -12
  518. package/src/skills/baseline/references/operational-guidance.md +56 -0
  519. package/src/skills/baseline/references/route-selection.md +5 -25
  520. package/src/skills/decision/SKILL.md +113 -306
  521. package/src/skills/decision/references/checkpoint-memory-template.md +47 -0
  522. package/src/skills/decision/references/operational-guidance.md +94 -0
  523. package/src/skills/decision/references/research-route-criteria.md +7 -8
  524. package/src/skills/decision/references/strategic-decision-template.md +13 -26
  525. package/src/skills/experiment/SKILL.md +132 -670
  526. package/src/skills/experiment/references/execution-playbook.md +374 -0
  527. package/src/skills/experiment/references/main-experiment-checklist-template.md +26 -2
  528. package/src/skills/experiment/references/main-experiment-plan-template.md +28 -17
  529. package/src/skills/experiment/references/operational-guidance.md +108 -0
  530. package/src/skills/finalize/SKILL.md +62 -0
  531. package/src/skills/finalize/references/checkpoint-memory-template.md +49 -0
  532. package/src/skills/finalize/references/resume-packet-template.md +7 -0
  533. package/src/skills/idea/SKILL.md +228 -15
  534. package/src/skills/idea/references/controlled-brainstorming-playbook.md +78 -0
  535. package/src/skills/idea/references/current-board-packet-template.md +61 -0
  536. package/src/skills/idea/references/high-value-idea-sourcing.md +119 -0
  537. package/src/skills/idea/references/idea-generation-playbook.md +21 -0
  538. package/src/skills/idea/references/idea-thinking-flow.md +6 -0
  539. package/src/skills/idea/references/literature-survey-template.md +3 -0
  540. package/src/skills/idea/references/objective-contract-template.md +54 -0
  541. package/src/skills/idea/references/outline-seeding-example.md +56 -0
  542. package/src/skills/idea/references/pre-idea-draft-template.md +105 -0
  543. package/src/skills/idea/references/related-work-playbook.md +75 -2
  544. package/src/skills/idea/references/research-history-playbook.md +114 -0
  545. package/src/skills/idea/references/selection-gate.md +58 -6
  546. package/src/skills/intake-audit/SKILL.md +43 -2
  547. package/src/skills/intake-audit/references/state-audit-template.md +10 -0
  548. package/src/skills/nature-data/SKILL.md +128 -0
  549. package/src/skills/nature-data/UPSTREAM_LICENSE.txt +21 -0
  550. package/src/skills/nature-data/agents/openai.yaml +4 -0
  551. package/src/skills/nature-data/references/chinese-author-alignment.md +84 -0
  552. package/src/skills/nature-data/references/fair-metadata-checklist.md +105 -0
  553. package/src/skills/nature-data/references/policy-principles.md +103 -0
  554. package/src/skills/nature-data/references/repository-and-identifiers.md +96 -0
  555. package/src/skills/nature-data/references/source-basis.md +54 -0
  556. package/src/skills/nature-data/references/statement-patterns.md +153 -0
  557. package/src/skills/nature-figure/SKILL.md +197 -0
  558. package/src/skills/nature-figure/UPSTREAM_LICENSE.txt +21 -0
  559. package/src/skills/nature-figure/agents/openai.yaml +4 -0
  560. package/src/skills/nature-figure/evals/evals.json +37 -0
  561. package/src/skills/nature-figure/references/api.md +428 -0
  562. package/src/skills/nature-figure/references/backend-selection.md +100 -0
  563. package/src/skills/nature-figure/references/chart-types.md +281 -0
  564. package/src/skills/nature-figure/references/common-patterns.md +349 -0
  565. package/src/skills/nature-figure/references/design-theory.md +436 -0
  566. package/src/skills/nature-figure/references/figure-contract.md +93 -0
  567. package/src/skills/nature-figure/references/nature-2026-observations.md +112 -0
  568. package/src/skills/nature-figure/references/qa-contract.md +119 -0
  569. package/src/skills/nature-figure/references/r-template-index.md +66 -0
  570. package/src/skills/nature-figure/references/r-workflow.md +161 -0
  571. package/src/skills/nature-figure/references/tutorials.md +250 -0
  572. package/src/skills/nature-paper2ppt/SKILL.md +507 -0
  573. package/src/skills/nature-paper2ppt/UPSTREAM_LICENSE.txt +21 -0
  574. package/src/skills/nature-paper2ppt/agents/openai.yaml +4 -0
  575. package/src/skills/nature-polishing/SKILL.md +385 -0
  576. package/src/skills/nature-polishing/UPSTREAM_LICENSE.txt +21 -0
  577. package/src/skills/nature-polishing/agents/openai.yaml +4 -0
  578. package/src/skills/nature-polishing/references/phrasebank-playbook.md +162 -0
  579. package/src/skills/nature-polishing/references/section-moves.md +240 -0
  580. package/src/skills/nature-polishing/references/style-guardrails.md +94 -0
  581. package/src/skills/nature-polishing/references/writing-strategy.md +148 -0
  582. package/src/skills/optimize/SKILL.md +177 -1568
  583. package/src/skills/optimize/references/brief-shaping-playbook.md +95 -0
  584. package/src/skills/optimize/references/candidate-board-template.md +13 -0
  585. package/src/skills/optimize/references/candidate-ranking-template.md +51 -0
  586. package/src/skills/optimize/references/codegen-route-playbook.md +50 -0
  587. package/src/skills/optimize/references/debug-response-template.md +29 -0
  588. package/src/skills/optimize/references/frontier-review-template.md +32 -0
  589. package/src/skills/optimize/references/fusion-playbook.md +36 -0
  590. package/src/skills/optimize/references/method-brief-template.md +73 -0
  591. package/src/skills/optimize/references/operational-guidance.md +621 -0
  592. package/src/skills/optimize/references/optimization-memory-template.md +30 -0
  593. package/src/skills/optimize/references/optimize-checklist-template.md +18 -0
  594. package/src/skills/optimize/references/plateau-response-playbook.md +28 -0
  595. package/src/skills/optimize/references/prompt-patterns.md +49 -0
  596. package/src/skills/paper-outline/SKILL.md +227 -0
  597. package/src/skills/paper-outline/references/outline-patterns.md +87 -0
  598. package/src/skills/paper-plot/SKILL.md +79 -0
  599. package/src/skills/paper-plot/agents/openai.yaml +4 -0
  600. package/src/skills/paper-plot/references/bar_grouped_hatch.md +96 -0
  601. package/src/skills/paper-plot/references/bar_paired_delta.md +72 -0
  602. package/src/skills/paper-plot/references/line_confidence_band.md +75 -0
  603. package/src/skills/paper-plot/references/line_loss_with_inset.md +65 -0
  604. package/src/skills/paper-plot/references/line_training_curve.md +44 -0
  605. package/src/skills/paper-plot/references/radar_dual_series.md +59 -0
  606. package/src/skills/paper-plot/references/scatter_broken_axis.md +59 -0
  607. package/src/skills/paper-plot/references/scatter_tsne_cluster.md +72 -0
  608. package/src/skills/paper-plot/scripts/bar_memevolve.py +109 -0
  609. package/src/skills/paper-plot/scripts/bar_spice.py +166 -0
  610. package/src/skills/paper-plot/scripts/line_aime.py +94 -0
  611. package/src/skills/paper-plot/scripts/line_loss_inset.py +157 -0
  612. package/src/skills/paper-plot/scripts/line_selfdistill.py +168 -0
  613. package/src/skills/paper-plot/scripts/radar_dora.py +151 -0
  614. package/src/skills/paper-plot/scripts/scatter_break.py +169 -0
  615. package/src/skills/paper-plot/scripts/scatter_tsne.py +133 -0
  616. package/src/skills/rebuttal/SKILL.md +9 -0
  617. package/src/skills/references/tool-usage-by-stage.md +438 -0
  618. package/src/skills/review/SKILL.md +105 -7
  619. package/src/skills/science/PROVENANCE.md +44 -0
  620. package/src/skills/science/SKILL.md +137 -0
  621. package/src/skills/science/references/artifact-science-tool.md +110 -0
  622. package/src/skills/science/references/claim-type-discipline.md +56 -0
  623. package/src/skills/science/references/domain-index.md +422 -0
  624. package/src/skills/science/references/hpc-via-bash-exec.md +42 -0
  625. package/src/skills/science/references/package-check-playbook.md +64 -0
  626. package/src/skills/science/references/package-index.min.json +3616 -0
  627. package/src/skills/science/references/packages/abinit.md +80 -0
  628. package/src/skills/science/references/packages/acts.md +73 -0
  629. package/src/skills/science/references/packages/aiida-core.md +80 -0
  630. package/src/skills/science/references/packages/alamode.md +80 -0
  631. package/src/skills/science/references/packages/amuse.md +88 -0
  632. package/src/skills/science/references/packages/anndata.md +88 -0
  633. package/src/skills/science/references/packages/arbor.md +80 -0
  634. package/src/skills/science/references/packages/arc.md +73 -0
  635. package/src/skills/science/references/packages/astropy.md +88 -0
  636. package/src/skills/science/references/packages/astroquery.md +88 -0
  637. package/src/skills/science/references/packages/atomate2.md +80 -0
  638. package/src/skills/science/references/packages/atomsmltr.md +73 -0
  639. package/src/skills/science/references/packages/awkward.md +73 -0
  640. package/src/skills/science/references/packages/batman.md +88 -0
  641. package/src/skills/science/references/packages/biopython.md +88 -0
  642. package/src/skills/science/references/packages/bloqade.md +73 -0
  643. package/src/skills/science/references/packages/brian2.md +73 -0
  644. package/src/skills/science/references/packages/bullet3.md +73 -0
  645. package/src/skills/science/references/packages/calculix.md +80 -0
  646. package/src/skills/science/references/packages/cantera.md +73 -0
  647. package/src/skills/science/references/packages/cavity-md-ipi.md +80 -0
  648. package/src/skills/science/references/packages/ccdproc.md +88 -0
  649. package/src/skills/science/references/packages/celerite2.md +88 -0
  650. package/src/skills/science/references/packages/cellrank.md +73 -0
  651. package/src/skills/science/references/packages/cesm.md +80 -0
  652. package/src/skills/science/references/packages/chemicals.md +73 -0
  653. package/src/skills/science/references/packages/chempy.md +73 -0
  654. package/src/skills/science/references/packages/cirq.md +73 -0
  655. package/src/skills/science/references/packages/coffea.md +73 -0
  656. package/src/skills/science/references/packages/cp2k.md +88 -0
  657. package/src/skills/science/references/packages/custodian.md +80 -0
  658. package/src/skills/science/references/packages/dart.md +73 -0
  659. package/src/skills/science/references/packages/datamol.md +88 -0
  660. package/src/skills/science/references/packages/dd4hep.md +73 -0
  661. package/src/skills/science/references/packages/dealii.md +80 -0
  662. package/src/skills/science/references/packages/deepchem.md +88 -0
  663. package/src/skills/science/references/packages/delphes.md +73 -0
  664. package/src/skills/science/references/packages/devito.md +80 -0
  665. package/src/skills/science/references/packages/dftb.md +88 -0
  666. package/src/skills/science/references/packages/dftd4.md +88 -0
  667. package/src/skills/science/references/packages/dftk-jl.md +80 -0
  668. package/src/skills/science/references/packages/dolfinx.md +80 -0
  669. package/src/skills/science/references/packages/drake.md +73 -0
  670. package/src/skills/science/references/packages/dumux.md +73 -0
  671. package/src/skills/science/references/packages/elk.md +80 -0
  672. package/src/skills/science/references/packages/elmerfem.md +80 -0
  673. package/src/skills/science/references/packages/enzo-e.md +88 -0
  674. package/src/skills/science/references/packages/espresso.md +80 -0
  675. package/src/skills/science/references/packages/exoplanet.md +88 -0
  676. package/src/skills/science/references/packages/fairroot.md +73 -0
  677. package/src/skills/science/references/packages/fbpic.md +80 -0
  678. package/src/skills/science/references/packages/fdtdbath-meep.md +80 -0
  679. package/src/skills/science/references/packages/geant4.md +73 -0
  680. package/src/skills/science/references/packages/geosx.md +80 -0
  681. package/src/skills/science/references/packages/gprmax.md +80 -0
  682. package/src/skills/science/references/packages/gromacs.md +80 -0
  683. package/src/skills/science/references/packages/gwaslab.md +73 -0
  684. package/src/skills/science/references/packages/gz-sim.md +73 -0
  685. package/src/skills/science/references/packages/hail.md +88 -0
  686. package/src/skills/science/references/packages/hiphive.md +80 -0
  687. package/src/skills/science/references/packages/hoomd-blue.md +80 -0
  688. package/src/skills/science/references/packages/itensor.md +73 -0
  689. package/src/skills/science/references/packages/itensors-jl.md +73 -0
  690. package/src/skills/science/references/packages/jdftx.md +73 -0
  691. package/src/skills/science/references/packages/jobflow.md +80 -0
  692. package/src/skills/science/references/packages/kadanoffbaym-jl.md +73 -0
  693. package/src/skills/science/references/packages/kite.md +80 -0
  694. package/src/skills/science/references/packages/kratos.md +80 -0
  695. package/src/skills/science/references/packages/kwant.md +73 -0
  696. package/src/skills/science/references/packages/lammps.md +80 -0
  697. package/src/skills/science/references/packages/lightkurve.md +88 -0
  698. package/src/skills/science/references/packages/limix.md +73 -0
  699. package/src/skills/science/references/packages/maxwelllink.md +80 -0
  700. package/src/skills/science/references/packages/mcdc.md +73 -0
  701. package/src/skills/science/references/packages/meep.md +80 -0
  702. package/src/skills/science/references/packages/mfem.md +80 -0
  703. package/src/skills/science/references/packages/mitgcm.md +73 -0
  704. package/src/skills/science/references/packages/modflow6.md +73 -0
  705. package/src/skills/science/references/packages/molecool.md +73 -0
  706. package/src/skills/science/references/packages/mom6.md +73 -0
  707. package/src/skills/science/references/packages/moose.md +80 -0
  708. package/src/skills/science/references/packages/mpas-model.md +73 -0
  709. package/src/skills/science/references/packages/mujoco.md +73 -0
  710. package/src/skills/science/references/packages/mumax3.md +73 -0
  711. package/src/skills/science/references/packages/nekrs.md +80 -0
  712. package/src/skills/science/references/packages/nessi.md +73 -0
  713. package/src/skills/science/references/packages/nest-simulator.md +73 -0
  714. package/src/skills/science/references/packages/netket.md +73 -0
  715. package/src/skills/science/references/packages/neuron.md +73 -0
  716. package/src/skills/science/references/packages/nextflow.md +88 -0
  717. package/src/skills/science/references/packages/nwchem.md +88 -0
  718. package/src/skills/science/references/packages/openbabel.md +88 -0
  719. package/src/skills/science/references/packages/openems.md +80 -0
  720. package/src/skills/science/references/packages/openff-toolkit.md +88 -0
  721. package/src/skills/science/references/packages/openfoam-dev.md +80 -0
  722. package/src/skills/science/references/packages/openmc.md +73 -0
  723. package/src/skills/science/references/packages/openmm.md +80 -0
  724. package/src/skills/science/references/packages/openmoc.md +73 -0
  725. package/src/skills/science/references/packages/openmx.md +80 -0
  726. package/src/skills/science/references/packages/opensees.md +80 -0
  727. package/src/skills/science/references/packages/opensn.md +80 -0
  728. package/src/skills/science/references/packages/opm-simulators.md +73 -0
  729. package/src/skills/science/references/packages/oqupy.md +73 -0
  730. package/src/skills/science/references/packages/packmol.md +80 -0
  731. package/src/skills/science/references/packages/palabos.md +80 -0
  732. package/src/skills/science/references/packages/parflow.md +80 -0
  733. package/src/skills/science/references/packages/pennylane.md +88 -0
  734. package/src/skills/science/references/packages/perceval.md +73 -0
  735. package/src/skills/science/references/packages/phono3py.md +73 -0
  736. package/src/skills/science/references/packages/phonopy.md +73 -0
  737. package/src/skills/science/references/packages/photutils.md +88 -0
  738. package/src/skills/science/references/packages/picongpu.md +80 -0
  739. package/src/skills/science/references/packages/plink-ng.md +88 -0
  740. package/src/skills/science/references/packages/precice.md +73 -0
  741. package/src/skills/science/references/packages/psc.md +80 -0
  742. package/src/skills/science/references/packages/psi4.md +88 -0
  743. package/src/skills/science/references/packages/pybinding.md +73 -0
  744. package/src/skills/science/references/packages/pyfr.md +80 -0
  745. package/src/skills/science/references/packages/pyhf.md +73 -0
  746. package/src/skills/science/references/packages/pyiron_base.md +80 -0
  747. package/src/skills/science/references/packages/pylcp.md +73 -0
  748. package/src/skills/science/references/packages/pylith.md +80 -0
  749. package/src/skills/science/references/packages/pynbody.md +88 -0
  750. package/src/skills/science/references/packages/pysam.md +88 -0
  751. package/src/skills/science/references/packages/pyscf.md +88 -0
  752. package/src/skills/science/references/packages/q-e.md +73 -0
  753. package/src/skills/science/references/packages/qibo.md +73 -0
  754. package/src/skills/science/references/packages/qiskit.md +73 -0
  755. package/src/skills/science/references/packages/quantica-jl.md +73 -0
  756. package/src/skills/science/references/packages/quantumoptics-jl.md +73 -0
  757. package/src/skills/science/references/packages/quimb.md +73 -0
  758. package/src/skills/science/references/packages/qulacs.md +73 -0
  759. package/src/skills/science/references/packages/qutip.md +73 -0
  760. package/src/skills/science/references/packages/rdkit.md +88 -0
  761. package/src/skills/science/references/packages/rmg-py.md +73 -0
  762. package/src/skills/science/references/packages/root.md +73 -0
  763. package/src/skills/science/references/packages/scanpy.md +88 -0
  764. package/src/skills/science/references/packages/scikit-allel.md +88 -0
  765. package/src/skills/science/references/packages/scikit-bio.md +88 -0
  766. package/src/skills/science/references/packages/scqubits.md +73 -0
  767. package/src/skills/science/references/packages/scuff-em.md +80 -0
  768. package/src/skills/science/references/packages/scvi-tools.md +73 -0
  769. package/src/skills/science/references/packages/seissol.md +73 -0
  770. package/src/skills/science/references/packages/sfepy.md +80 -0
  771. package/src/skills/science/references/packages/sisl.md +73 -0
  772. package/src/skills/science/references/packages/smilei.md +80 -0
  773. package/src/skills/science/references/packages/snakemake.md +88 -0
  774. package/src/skills/science/references/packages/specfem3d-globe.md +80 -0
  775. package/src/skills/science/references/packages/specutils.md +88 -0
  776. package/src/skills/science/references/packages/spglib.md +80 -0
  777. package/src/skills/science/references/packages/squidpy.md +88 -0
  778. package/src/skills/science/references/packages/starry.md +88 -0
  779. package/src/skills/science/references/packages/strawberryfields.md +73 -0
  780. package/src/skills/science/references/packages/su2.md +80 -0
  781. package/src/skills/science/references/packages/sunny-jl.md +73 -0
  782. package/src/skills/science/references/packages/sw4.md +73 -0
  783. package/src/skills/science/references/packages/swift.md +88 -0
  784. package/src/skills/science/references/packages/tdnegf.md +73 -0
  785. package/src/skills/science/references/packages/tenpy.md +73 -0
  786. package/src/skills/science/references/packages/thermo.md +73 -0
  787. package/src/skills/science/references/packages/tkwant.md +73 -0
  788. package/src/skills/science/references/packages/tvb-root.md +73 -0
  789. package/src/skills/science/references/packages/uproot5.md +73 -0
  790. package/src/skills/science/references/packages/vampire.md +80 -0
  791. package/src/skills/science/references/packages/wannier_tools.md +73 -0
  792. package/src/skills/science/references/packages/warpx.md +80 -0
  793. package/src/skills/science/references/packages/wrf.md +73 -0
  794. package/src/skills/science/references/packages/xtb.md +88 -0
  795. package/src/skills/science/references/packages/yt.md +73 -0
  796. package/src/skills/science/references/science-task-brief-template.md +71 -0
  797. package/src/skills/scout/SKILL.md +83 -425
  798. package/src/skills/scout/references/literature-scout-template.md +5 -24
  799. package/src/skills/scout/references/operational-guidance.md +191 -0
  800. package/src/skills/scout/references/paper-triage-playbook.md +11 -35
  801. package/src/skills/write/SKILL.md +744 -1246
  802. package/src/skills/write/references/experiments_analysis_patterns.md +129 -0
  803. package/src/skills/write/references/oral_package_patterns.md +252 -0
  804. package/src/skills/write/references/oral_writing_principles.md +291 -0
  805. package/src/skills/write/references/section_rewrite_checklist.md +234 -0
  806. package/src/tui/dist/app/AppContainer.js +1314 -27
  807. package/src/tui/dist/components/Composer.js +26 -1
  808. package/src/tui/dist/components/ConfigScreen.js +2 -1
  809. package/src/tui/dist/components/InputPrompt.js +25 -9
  810. package/src/tui/dist/components/MainContent.js +18 -3
  811. package/src/tui/dist/components/QuestScreen.js +3 -2
  812. package/src/tui/dist/components/UtilityScreen.js +37 -0
  813. package/src/tui/dist/hooks/useSafeInput.js +10 -0
  814. package/src/tui/dist/index.js +13 -1
  815. package/src/tui/dist/layouts/DefaultAppLayout.js +11 -8
  816. package/src/tui/dist/lib/api.js +89 -1
  817. package/src/tui/package.json +1 -1
  818. package/src/ui/dist/assets/{AnalysisPlugin-BCKAfjba.js → AnalysisPlugin-CA94NGmI.js} +1 -1
  819. package/src/ui/dist/assets/CliPlugin-DHBzphZU.js +79 -0
  820. package/src/ui/dist/assets/CodeEditorPlugin-BOFwD2rn.js +2 -0
  821. package/src/ui/dist/assets/{CodeViewerPlugin-CbaFRrUU.js → CodeViewerPlugin-CqDpgjik.js} +4 -4
  822. package/src/ui/dist/assets/{DocViewerPlugin-DAjLVeQD.js → DocViewerPlugin-UDBgt8-4.js} +3 -3
  823. package/src/ui/dist/assets/GitCommitViewerPlugin-BmHtZ0bZ.js +6 -0
  824. package/src/ui/dist/assets/{GitDiffViewerPlugin-CQACjoAA.js → GitDiffViewerPlugin-CAxjNorQ.js} +2 -2
  825. package/src/ui/dist/assets/{GitSnapshotViewer-0r4nLPke.js → GitSnapshotViewer-CweA6VON.js} +2 -2
  826. package/src/ui/dist/assets/{ImageViewerPlugin-nBOmI2v_.js → ImageViewerPlugin-C8wHGvGN.js} +5 -5
  827. package/src/ui/dist/assets/LabPlugin-COyyLUol.js +32 -0
  828. package/src/ui/dist/assets/{LatexPlugin-ZwtV8pIp.js → LatexPlugin-BQjAaA5J.js} +4 -4
  829. package/src/ui/dist/assets/{MarkdownViewerPlugin-DKqVfKyW.js → MarkdownViewerPlugin-Dy1NE2dI.js} +3 -3
  830. package/src/ui/dist/assets/{MarketplacePlugin-BwxStZ9D.js → MarketplacePlugin-DMIZtEJ2.js} +2 -2
  831. package/src/ui/dist/assets/NotebookEditor-CFHMq_Qt.js +91 -0
  832. package/src/ui/dist/assets/{NotebookEditor-DB9N_T9q.js → NotebookEditor-WFyd8Ybt.js} +3 -3
  833. package/src/ui/dist/assets/{PdfLoader-eWBONbQP.js → PdfLoader-CLE5u5TS.js} +3 -3
  834. package/src/ui/dist/assets/{PdfMarkdownPlugin-D22YOZL3.js → PdfMarkdownPlugin-_iNK_H83.js} +1 -1
  835. package/src/ui/dist/assets/PdfViewerPlugin-DgWsbInT.js +22 -0
  836. package/src/ui/dist/assets/SearchPlugin-DrZmn5iw.js +11 -0
  837. package/src/ui/dist/assets/{TextViewerPlugin-C5xqeeUH.js → TextViewerPlugin-D1-T3aC7.js} +4 -4
  838. package/src/ui/dist/assets/branding/runner-claude.svg +107 -0
  839. package/src/ui/dist/assets/branding/runner-codex.svg +10 -0
  840. package/src/ui/dist/assets/branding/runner-kimi.svg +14 -0
  841. package/src/ui/dist/assets/branding/runner-opencode.svg +7 -0
  842. package/src/ui/dist/assets/cli-store-CoZ-x5Ip.js +1 -0
  843. package/src/ui/dist/assets/{code-WlFHE7z_.js → code-DbsmSd3Y.js} +1 -1
  844. package/src/ui/dist/assets/file-diff-panel-DsvyRz47.js +1 -0
  845. package/src/ui/dist/assets/{wrap-text-BC-Hltpd.js → file-jump-queue-DeQBikaw.js} +3 -3
  846. package/src/ui/dist/assets/{file-socket-CfQPKQKj.js → file-socket-DA5XIx88.js} +1 -1
  847. package/src/ui/dist/assets/fonts/ds-fonts.css +50 -4
  848. package/src/ui/dist/assets/images/deepxiv/register-guide.png +0 -0
  849. package/src/ui/dist/assets/index-39vY9LmZ.js +1 -0
  850. package/src/ui/dist/assets/{index-CwNu1aH4.js → index-BsO46tJA.js} +1 -1
  851. package/src/ui/dist/assets/index-CHzJ2xtB.js +3530 -0
  852. package/src/ui/dist/assets/index-DH-zxoZ3.css +33 -0
  853. package/src/ui/dist/assets/{plugin-notebook-HbW2K-1c.js → plugin-notebook-JRhysCqj.js} +2 -2
  854. package/src/ui/dist/assets/{project-sync-C9IdzdZW.js → project-sync-DPmWKmKD.js} +1 -1
  855. package/src/ui/dist/assets/{zoom-out-E_gaeAxL.js → zoom-out-DAukFWen.js} +3 -3
  856. package/src/ui/dist/index.html +3 -3
  857. package/src/skills/analysis-campaign/references/artifact-orchestration.md +0 -58
  858. package/src/skills/baseline/references/memory-playbook.md +0 -40
  859. package/src/skills/baseline/references/publishable-baseline-package.md +0 -30
  860. package/src/skills/write/references/outline-evidence-contract-example.md +0 -107
  861. package/src/skills/write/references/paper-experiment-matrix-template.md +0 -131
  862. package/src/skills/write/references/paper-section-playbook.md +0 -64
  863. package/src/skills/write/references/reviewer-first-writing.md +0 -64
  864. package/src/skills/write/references/revision-checklist.md +0 -70
  865. package/src/skills/write/references/section-contracts.md +0 -82
  866. package/src/skills/write/references/sentence-level-proofing.md +0 -49
  867. package/src/ui/dist/assets/AiManusChatView-Bv-Z8YpU.js +0 -204
  868. package/src/ui/dist/assets/CliPlugin-BCKcpc35.js +0 -109
  869. package/src/ui/dist/assets/CodeEditorPlugin-DbOfSJ8K.js +0 -2
  870. package/src/ui/dist/assets/GitCommitViewerPlugin-CIUqbUDO.js +0 -1
  871. package/src/ui/dist/assets/LabCopilotPanel-BHxOxF4z.js +0 -14
  872. package/src/ui/dist/assets/LabPlugin-BKoZGs95.js +0 -22
  873. package/src/ui/dist/assets/NotebookEditor-BEQhaQbt.js +0 -81
  874. package/src/ui/dist/assets/PdfViewerPlugin-c-RK9DLM.js +0 -17
  875. package/src/ui/dist/assets/SearchPlugin-CxF9ytAx.js +0 -16
  876. package/src/ui/dist/assets/VNCViewer-BoLGLnHz.js +0 -11
  877. package/src/ui/dist/assets/bot-DREQOxzP.js +0 -6
  878. package/src/ui/dist/assets/chevron-up-C9Qpx4DE.js +0 -6
  879. package/src/ui/dist/assets/file-content-BZMz3RYp.js +0 -1
  880. package/src/ui/dist/assets/file-diff-panel-CQhw0jS2.js +0 -1
  881. package/src/ui/dist/assets/file-jump-queue-DA-SdG__.js +0 -1
  882. package/src/ui/dist/assets/git-commit-horizontal-DxZ8DCZh.js +0 -6
  883. package/src/ui/dist/assets/image-Bgl4VIyx.js +0 -6
  884. package/src/ui/dist/assets/index-BpV6lusQ.css +0 -33
  885. package/src/ui/dist/assets/index-CBNVuWcP.js +0 -2496
  886. package/src/ui/dist/assets/index-DrUnlf6K.js +0 -1
  887. package/src/ui/dist/assets/index-NW-h8VzN.js +0 -1
  888. package/src/ui/dist/assets/pdf-effect-queue-J8OnM0jE.js +0 -6
  889. package/src/ui/dist/assets/popover-CLc0pPP8.js +0 -1
  890. package/src/ui/dist/assets/select-Cs2PmzwL.js +0 -11
  891. package/src/ui/dist/assets/sigma-ClKcHAXm.js +0 -6
  892. package/src/ui/dist/assets/trash-DwpbFr3w.js +0 -11
  893. package/src/ui/dist/assets/useCliAccess-NQ8m0Let.js +0 -1
  894. package/src/ui/dist/assets/useFileDiffOverlay-FuhcnKiw.js +0 -1
@@ -0,0 +1,206 @@
1
+ schema_version: 1
2
+ id: aisb.t3.018_cotsynth
3
+ name: 'CoT-based Synthesizer: Enhancing LLM Performance through Answer Synthesis'
4
+ version: 0.1.0
5
+ one_line: 'Inference-time answer-synthesis benchmark: generate diverse CoT candidate
6
+ responses from 8B-scale policy models, then synthesize a superior answer via a trained
7
+ Synthesizer-8B, evaluating accuracy on GSM8k, MATH500, and WikiTQ and ROUGE-L on FeTaQA.
8
+
9
+ '
10
+ task_description: 'This benchmark reproduces the CoT-based Synthesizer paper (ACL
11
+ 2025). The core workflow is: (1) use a policy model (e.g. Llama3-8B-Instruct) to
12
+ sample N diverse candidate responses per question via temperature/top-p decoding,
13
+ (2) feed candidates into a Synthesizer model that performs CoT-based analysis and
14
+ synthesis to produce a refined final answer, and (3) evaluate on four benchmarks
15
+ — GSM8k (exact-match accuracy), MATH500 (exact-match accuracy via DART-Math evaluator),
16
+ WikiTQ (exact-match accuracy), and FeTaQA (ROUGE-L recall). The snapshot includes
17
+ bundled evaluation test data in the data/ folder, inference code using vLLM, evaluation
18
+ scripts, a fine-tuning pipeline, and a data-generation pipeline for creating synthesis
19
+ training data. A pre-trained Synthesizer-8B checkpoint and training data are available
20
+ on Hugging Face. The workflow is inference-heavy: each evaluation requires serving
21
+ an 8B model via vLLM and running multi-sample generation. MATH evaluation additionally
22
+ requires installing the dart-math package from GitHub. The benchmark compares against
23
+ Self-Consistency, Universal Self-Consistency, Best-of-N with reward models, and
24
+ LMCOR baselines.
25
+
26
+ '
27
+ capability_tags:
28
+ - research_code_optimization
29
+ - large_language_models
30
+ - answer_synthesis
31
+ - mathematical_reasoning
32
+ - table_qa
33
+ - chain_of_thought
34
+ - inference_scaling
35
+ aisb_direction: T3
36
+ track_fit:
37
+ - paper_track
38
+ - benchmark_track
39
+ task_mode: evaluation_driven
40
+ requires_execution: true
41
+ requires_paper: true
42
+ integrity_level: cas_plus_canary
43
+ snapshot_status: runnable
44
+ support_level: advanced
45
+ cost_band: high
46
+ time_band: 1d+
47
+ difficulty: hard
48
+ data_access: public
49
+ primary_outputs:
50
+ - accuracy_gsm8k
51
+ - accuracy_math500
52
+ - accuracy_wikitq
53
+ - rouge_l_fetaqa
54
+ - synthesized_answers
55
+ - vote_report
56
+ launch_profiles:
57
+ - id: quick_check
58
+ label: Quick Check
59
+ description: 'Run synthesis inference on a single dataset (e.g. MATH500 with pre-generated
60
+ candidate responses) and evaluate accuracy. Verifies that the vLLM serving and
61
+ evaluation pipeline work end-to-end.
62
+
63
+ '
64
+ - id: synthesis_eval
65
+ label: Full Synthesis Evaluation
66
+ description: 'Run the complete answer-synthesis and evaluation workflow across all
67
+ four benchmarks (GSM8k, MATH500, WikiTQ, FeTaQA) using the Synthesizer-8B model
68
+ or Llama3.1-70B as the synthesizer. Requires generating candidate responses from
69
+ each policy model first.
70
+
71
+ '
72
+ - id: data_pipeline
73
+ label: Data Generation Pipeline
74
+ description: 'Run the two-stage data generation pipeline (sampling.py → synthesizer.py
75
+ → filter) to produce training data for Synthesizer-8B. Requires a 70B response
76
+ LLM and is compute-intensive.
77
+
78
+ '
79
+ dataset_download:
80
+ primary_method: mixed
81
+ sources:
82
+ - kind: bundled
83
+ url: null
84
+ access: public
85
+ note: 'Test sets for GSM8k, MATH500, WikiTQ, and FeTaQA are included in the data/
86
+ folder of the snapshot archive.
87
+
88
+ '
89
+ - kind: huggingface
90
+ url: https://huggingface.co/datasets/BoHanMint/Synthesizer-8B-math-train-data
91
+ access: public
92
+ note: 'Pre-generated synthesis training data (295k MATH, 87k WikiTQ). Only needed
93
+ if re-training Synthesizer-8B.
94
+
95
+ '
96
+ - kind: huggingface
97
+ url: https://huggingface.co/BoHanMint/Synthesizer-8B-math
98
+ access: public
99
+ note: 'Pre-trained Synthesizer-8B-math checkpoint. Required for inference unless
100
+ training from scratch.
101
+
102
+ '
103
+ - kind: github
104
+ url: https://github.com/hkust-nlp/dart-math
105
+ access: public
106
+ note: 'DART-Math evaluation library required for MATH500 exact-match scoring.
107
+ Must be pip-installed separately.
108
+
109
+ '
110
+ notes:
111
+ - Bundled test data is small (a few MB). Model checkpoints are ~16 GB for the 8B
112
+ model.
113
+ - If running baselines with Llama3.1-70B as synthesizer, ~140 GB of model weights
114
+ are needed.
115
+ credential_requirements:
116
+ mode: optional
117
+ items:
118
+ - HuggingFace token (if gated model access is needed for Llama3 weights)
119
+ - OpenAI API key (only if evaluating GPT-4o as a policy model)
120
+ - GLM-4-Plus API key (only if evaluating GLM-4-Plus as a policy model)
121
+ notes:
122
+ - Core evaluation with open-source models requires no credentials.
123
+ - API keys are only needed to reproduce the full paper results with API-based policy
124
+ models.
125
+ resources:
126
+ minimum:
127
+ cpu_cores: 16
128
+ ram_gb: 64
129
+ disk_gb: 150
130
+ gpu_count: 1
131
+ gpu_vram_gb: 24
132
+ recommended:
133
+ cpu_cores: 32
134
+ ram_gb: 128
135
+ disk_gb: 300
136
+ gpu_count: 2
137
+ gpu_vram_gb: 48
138
+ environment:
139
+ python: '3.11'
140
+ cuda: null
141
+ pytorch: null
142
+ flash_attn: null
143
+ key_packages:
144
+ - deepspeed==0.15.2
145
+ - vllm==0.5.3
146
+ - transformers==4.43.1
147
+ - rouge-score
148
+ - dart-math (pip install from GitHub)
149
+ notes:
150
+ - vLLM is the primary inference engine; synthesis_infer.py defaults to tensor_parallel_size=1
151
+ and max_model_len=8192.
152
+ - MATH evaluation requires dart-math installed via `pip install -e .` from the dart-math
153
+ repo clone.
154
+ - FeTaQA evaluation requires rouge-score package.
155
+ - The code hardcodes CUDA_VISIBLE_DEVICES in several scripts; adjust for your GPU
156
+ topology.
157
+ - See requirements.txt in the snapshot for the full dependency set.
158
+ risk_flags:
159
+ - external_package_dependency
160
+ - large_model_weights
161
+ - hardcoded_gpu_ids
162
+ risk_notes:
163
+ - MATH evaluation depends on the external dart-math package (GitHub clone + pip install
164
+ -e). If unavailable, MATH500 scoring will fail.
165
+ - Scripts hardcode CUDA_VISIBLE_DEVICES (e.g. "0" in inference, "3" in eval). Must
166
+ be adjusted for multi-GPU or different hardware.
167
+ - Full paper reproduction requires serving 70B models for candidate generation baselines,
168
+ which needs ≥2×80GB GPUs.
169
+ - API-based policy model evaluation (GPT-4o, GLM-4-Plus) incurs real monetary cost
170
+ and requires API credentials.
171
+ - No runtime execution was performed during packaging; metric values are not yet validated.
172
+ recommended_when: 'Use this benchmark when you want an inference-heavy LLM evaluation
173
+ task focused on answer-synthesis strategies for mathematical reasoning and table
174
+ QA. Good fit for studying inference-time scaling, multi-response aggregation, and
175
+ CoT-based post-processing with 8B-class models. All evaluation data is bundled and
176
+ pre-trained checkpoints are publicly available on Hugging Face.
177
+
178
+ '
179
+ not_recommended_when: 'Do not use this if you cannot serve 8B-class models on GPU
180
+ (minimum 24 GB VRAM), if you need a benchmark without model-serving overhead, or
181
+ if you need fully self-contained evaluation without any external package dependencies
182
+ (dart-math is required for MATH scoring).
183
+
184
+ '
185
+ paper:
186
+ title: 'CoT-based Synthesizer: Enhancing LLM Performance through Answer Synthesis'
187
+ venue: ACL 2025
188
+ year: 2025
189
+ url: https://arxiv.org/abs/2501.01668
190
+ download:
191
+ url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.018_cotsynth.zip
192
+ archive_type: zip
193
+ local_dir_name: paper-18-CoTSynth
194
+ provider: github_release
195
+ repo: ResearAI/DeepScientist
196
+ tag: aisb-v0.0.1
197
+ asset_name: aisb.t3.018_cotsynth.zip
198
+ sha256: 245a9c52e66e83cc77844c2375cf61a65cb6823fd84bdaf9142687559498a885
199
+ size_bytes: 925436
200
+ commercial:
201
+ annual_fee: null
202
+ display:
203
+ palette_seed: teal-gold-synthesis
204
+ art_style: benchmark-notebook
205
+ accent_priority: high
206
+ image_path: ../image/018_aisb.t3.018_cotsynth.jpg
@@ -0,0 +1,162 @@
1
+ schema_version: 1
2
+ id: aisb.t3.018_cotsynth
3
+ name: 'CoT合成器:通过答案合成提升大语言模型性能'
4
+ version: 0.1.0
5
+ one_line: '推理时答案合成基准测试:从8B规模策略模型生成多样化的CoT候选响应,然后通过训练好的Synthesizer-8B合成更优答案,在GSM8k、MATH500、WikiTQ和FeTaQA上评估准确率。
6
+
7
+ '
8
+ task_description: '该基准测试复现了CoT合成器论文(ACL 2025)。核心工作流程为:(1)使用策略模型(如Llama3-8B-Instruct)通过温度/top-p解码为每个问题采样N个多样化候选响应,(2)将候选响应输入合成器模型,执行基于CoT的分析与合成以产生精炼的最终答案,(3)在四个基准测试上评估——GSM8k(精确匹配准确率)、MATH500(通过DART-Math评估器的精确匹配准确率)、WikiTQ(精确匹配准确率)和FeTaQA(ROUGE-L召回率)。快照包含data/文件夹中的捆绑评估测试数据、使用vLLM的推理代码、评估脚本、微调流程以及用于创建合成训练数据的数据生成流程。预训练的Synthesizer-8B检查点及训练数据可在Hugging Face获取。该工作流程为推理密集型:每次评估需要通过vLLM服务8B模型并运行多样本生成。MATH评估还需从GitHub安装dart-math包。该基准测试与自洽性、通用自洽性、基于奖励模型的Best-of-N以及LMCOR基线进行比较。
9
+
10
+ '
11
+ capability_tags:
12
+ - research_code_optimization
13
+ - large_language_models
14
+ - answer_synthesis
15
+ - mathematical_reasoning
16
+ - table_qa
17
+ - chain_of_thought
18
+ - inference_scaling
19
+ aisb_direction: T3
20
+ track_fit:
21
+ - paper_track
22
+ - benchmark_track
23
+ task_mode: evaluation_driven
24
+ requires_execution: true
25
+ requires_paper: true
26
+ integrity_level: cas_plus_canary
27
+ snapshot_status: runnable
28
+ support_level: advanced
29
+ cost_band: high
30
+ time_band: 1d+
31
+ difficulty: hard
32
+ data_access: public
33
+ primary_outputs:
34
+ - accuracy_gsm8k
35
+ - accuracy_math500
36
+ - accuracy_wikitq
37
+ - rouge_l_fetaqa
38
+ - synthesized_answers
39
+ - vote_report
40
+ launch_profiles:
41
+ - id: quick_check
42
+ label: 快速检查
43
+ description: '在单个数据集(如使用预生成候选响应的MATH500)上运行合成推理并评估准确率。验证vLLM服务和评估流程能够端到端正常运行。
44
+
45
+ '
46
+ - id: synthesis_eval
47
+ label: 完整合成评估
48
+ description: '使用Synthesizer-8B模型或Llama3.1-70B作为合成器,在所有四个基准测试(GSM8k、MATH500、WikiTQ、FeTaQA)上运行完整的答案合成与评估工作流程。需要先从每个策略模型生成候选响应。
49
+
50
+ '
51
+ - id: data_pipeline
52
+ label: 数据生成流程
53
+ description: '运行两阶段数据生成流程(sampling.py → synthesizer.py → filter)以生成Synthesizer-8B的训练数据。需要70B响应LLM且计算密集。
54
+
55
+ '
56
+ dataset_download:
57
+ primary_method: mixed
58
+ sources:
59
+ - kind: bundled
60
+ url: null
61
+ access: public
62
+ note: 'GSM8k、MATH500、WikiTQ和FeTaQA的测试集包含在快照压缩包的data/文件夹中。
63
+
64
+ '
65
+ - kind: huggingface
66
+ url: https://huggingface.co/datasets/BoHanMint/Synthesizer-8B-math-train-data
67
+ access: public
68
+ note: '预生成的合成训练数据(295k MATH、87k WikiTQ)。仅在重新训练Synthesizer-8B时需要。
69
+
70
+ '
71
+ - kind: huggingface
72
+ url: https://huggingface.co/BoHanMint/Synthesizer-8B-math
73
+ access: public
74
+ note: '预训练的Synthesizer-8B-math检查点。除非从头训练,否则推理时必需。
75
+
76
+ '
77
+ - kind: github
78
+ url: https://github.com/hkust-nlp/dart-math
79
+ access: public
80
+ note: 'MATH500精确匹配评分所需的DART-Math评估库。必须单独pip安装。
81
+
82
+ '
83
+ notes:
84
+ - 捆绑测试数据很小(仅几MB)。模型检查点约16 GB(8B模型)。
85
+ - 如使用Llama3.1-70B作为合成器运行基线,需要约140 GB的模型权重。
86
+ credential_requirements:
87
+ mode: optional
88
+ items:
89
+ - HuggingFace token(如需访问门控模型以获取Llama3权重)
90
+ - OpenAI API key(仅在评估GPT-4o作为策略模型时需要)
91
+ - GLM-4-Plus API key(仅在评估GLM-4-Plus作为策略模型时需要)
92
+ notes:
93
+ - 使用开源模型的核心评估无需凭据。
94
+ - 仅在复现基于API的策略模型的完整论文结果时需要API密钥。
95
+ resources:
96
+ minimum:
97
+ cpu_cores: 16
98
+ ram_gb: 64
99
+ disk_gb: 150
100
+ gpu_count: 1
101
+ gpu_vram_gb: 24
102
+ recommended:
103
+ cpu_cores: 32
104
+ ram_gb: 128
105
+ disk_gb: 300
106
+ gpu_count: 2
107
+ gpu_vram_gb: 48
108
+ environment:
109
+ python: '3.11'
110
+ cuda: null
111
+ pytorch: null
112
+ flash_attn: null
113
+ key_packages:
114
+ - deepspeed==0.15.2
115
+ - vllm==0.5.3
116
+ - transformers==4.43.1
117
+ - rouge-score
118
+ - dart-math (pip install from GitHub)
119
+ notes:
120
+ - vLLM是主要推理引擎;synthesis_infer.py默认tensor_parallel_size=1,max_model_len=8192。
121
+ - MATH评估需要先克隆dart-math仓库,再通过`pip install -e .`进行安装。
122
+ - FeTaQA评估需要rouge-score包。
123
+ - 代码在多个脚本中硬编码了CUDA_VISIBLE_DEVICES;请根据您的GPU拓扑进行调整。
124
+ - 快照中的requirements.txt包含完整的依赖列表。
125
+ risk_flags:
126
+ - external_package_dependency
127
+ - large_model_weights
128
+ - hardcoded_gpu_ids
129
+ risk_notes:
130
+ - MATH评估依赖外部dart-math包(GitHub克隆 + pip install -e)。如不可用,MATH500评分将失败。
131
+ - 脚本硬编码了CUDA_VISIBLE_DEVICES(如推理中为"0",评估中为"3")。在多GPU或不同硬件上必须调整。
132
+ - 完整论文复现需要服务70B模型以生成候选响应基线,需要≥2×80GB GPU。
133
+ - 基于API的策略模型评估(GPT-4o、GLM-4-Plus)会产生实际货币成本,需要API凭据。
134
+ - 打包过程中未执行运行时验证;指标值尚未确认。
135
+ recommended_when: '当您需要一个推理密集型的LLM评估任务,重点关注数学推理和表格问答的答案合成策略时使用此基准测试。非常适合研究推理时扩展、多响应聚合以及基于CoT的8B类模型后处理。所有评估数据已捆绑,预训练检查点可在Hugging Face公开获取。
136
+
137
+ '
138
+ not_recommended_when: '如果无法在GPU上服务8B类模型(最低24 GB显存)、需要无模型服务开销的基准测试,或需要完全自包含的评估而不依赖任何外部包(dart-math是MATH评分必需的),请勿使用此基准测试。
139
+
140
+ '
141
+ paper:
142
+ title: 'CoT-based Synthesizer: Enhancing LLM Performance through Answer Synthesis'
143
+ venue: ACL 2025
144
+ year: 2025
145
+ url: https://arxiv.org/abs/2501.01668
146
+ download:
147
+ url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.018_cotsynth.zip
148
+ archive_type: zip
149
+ local_dir_name: paper-18-CoTSynth
150
+ provider: github_release
151
+ repo: ResearAI/DeepScientist
152
+ tag: aisb-v0.0.1
153
+ asset_name: aisb.t3.018_cotsynth.zip
154
+ sha256: 245a9c52e66e83cc77844c2375cf61a65cb6823fd84bdaf9142687559498a885
155
+ size_bytes: 925436
156
+ commercial:
157
+ annual_fee: null
158
+ display:
159
+ palette_seed: teal-gold-synthesis
160
+ art_style: benchmark-notebook
161
+ accent_priority: high
162
+ image_path: ../image/018_aisb.t3.018_cotsynth.jpg
@@ -0,0 +1,211 @@
1
+ schema_version: 1
2
+ id: aisb.t3.019_dyscaleut
3
+ name: Dynamic Scaling of Unit Tests for Code Reward Modeling
4
+ version: 0.1.0
5
+ one_line: 'Generate and dynamically scale LLM-produced unit tests as reward signals
6
+ for best-of-N code solution selection, evaluated via pass@1 on HumanEval Plus, MBPP
7
+ Plus, and LiveCodeBench.
8
+
9
+ '
10
+ task_description: 'This benchmark reproduces the CodeRM pipeline for improving code
11
+ generation through scaled unit-test reward modeling. The core workflow is: (1) a
12
+ policy LLM generates N candidate code solutions per programming problem, (2) a reward
13
+ LLM (or the fine-tuned CodeRM-8B unit test generator) produces M unit tests per
14
+ problem, (3) unit tests are executed against solutions inside a Docker sandbox,
15
+ and (4) the best solution is selected via variance-weighted majority voting over
16
+ execution results. A dynamic scaling mechanism allocates more unit tests to harder
17
+ problems using a trained difficulty classifier. The primary metric is pass@1 (best-of-N
18
+ accuracy) computed by evaluation/calculate_result.py. The local snapshot includes
19
+ the evaluation/execution code, bundled benchmark data, pre-computed inference results,
20
+ and a Docker image specification for sandboxed code execution. The inference and
21
+ preprocessing steps (generating new solutions/unit tests from scratch with policy/reward
22
+ LLMs) require external model access and are partially covered by bundled scripts
23
+ but may need adaptation. A pre-computed output.tar.gz from Google Drive can substitute
24
+ for the inference+execution steps.
25
+
26
+ '
27
+ capability_tags:
28
+ - research_code_optimization
29
+ - code_generation
30
+ - reward_modeling
31
+ - unit_test_generation
32
+ - evaluation
33
+ aisb_direction: T3
34
+ track_fit:
35
+ - paper_track
36
+ - benchmark_track
37
+ task_mode: experiment_driven
38
+ requires_execution: true
39
+ requires_paper: true
40
+ integrity_level: cas_plus_canary
41
+ snapshot_status: partial
42
+ support_level: advanced
43
+ cost_band: high
44
+ time_band: 1d+
45
+ difficulty: hard
46
+ data_access: public
47
+ primary_outputs:
48
+ - pass_at_1
49
+ - scaled_unit_tests
50
+ - reward_scores
51
+ launch_profiles:
52
+ - id: quick_check
53
+ label: Quick Check
54
+ description: 'Run evaluation/calculate_result.py on the bundled or downloaded pre-computed
55
+ execution results to verify pass@1 on a single benchmark/model combination. No
56
+ GPU or Docker required.
57
+
58
+ '
59
+ - id: code_reward_eval
60
+ label: Code Reward Eval
61
+ description: 'Pull the Docker sandbox image, execute unit tests against candidate
62
+ solutions using evaluation/evaluate.py, then compute pass@1 with evaluation/calculate_result.py.
63
+ Requires Docker and moderate compute. Uses bundled benchmark data and pre-generated
64
+ solutions/unit tests; does not require LLM inference.
65
+
66
+ '
67
+ - id: full_pipeline
68
+ label: Full Pipeline (Inference + Eval)
69
+ description: 'Run end-to-end: multi-process LLM inference to generate solutions
70
+ and unit tests, preprocessing/merging, Docker-based execution, and final pass@1
71
+ calculation. Requires GPU access for LLM inference (CodeRM-8B or larger models)
72
+ and Docker for execution.
73
+
74
+ '
75
+ dataset_download:
76
+ primary_method: mixed
77
+ sources:
78
+ - kind: huggingface
79
+ url: https://huggingface.co/datasets/KAKA22/CodeRM-UnitTest
80
+ access: public
81
+ note: 60k synthetic Python unit tests used to train CodeRM-8B.
82
+ - kind: huggingface
83
+ url: https://huggingface.co/KAKA22/CodeRM-8B
84
+ access: public
85
+ note: Fine-tuned 8B unit test generator model weights.
86
+ - kind: google_drive
87
+ url: https://drive.google.com/drive/folders/1-wUvy9Ox49V5CY38TMjCr5RlLysapyyj?usp=sharing
88
+ access: public
89
+ note: 'Pre-computed execution output (output.tar.gz) that can replace Steps 1-3
90
+ of the pipeline.
91
+
92
+ '
93
+ - kind: bundled
94
+ url: null
95
+ access: local
96
+ note: 'Benchmark data (HumanEval Plus, MBPP Plus, LiveCodeBench) and pre-generated
97
+ inference results are included under data/benchmark/ and data/result/ in the
98
+ snapshot.
99
+
100
+ '
101
+ notes:
102
+ - Training dataset is ~60k examples; model weights are ~16 GB; pre-computed outputs
103
+ vary by benchmark.
104
+ - Benchmarks themselves (HumanEval Plus, MBPP Plus, LiveCodeBench) are public.
105
+ credential_requirements:
106
+ mode: none
107
+ items: []
108
+ notes:
109
+ - No API keys required if using bundled data and CodeRM-8B for inference.
110
+ - If replicating GPT-4o-mini or GPT-3.5 policy/reward experiments, OpenAI API keys
111
+ are needed.
112
+ resources:
113
+ minimum:
114
+ cpu_cores: 16
115
+ ram_gb: 64
116
+ disk_gb: 150
117
+ gpu_count: 1
118
+ gpu_vram_gb: 24
119
+ recommended:
120
+ cpu_cores: 32
121
+ ram_gb: 128
122
+ disk_gb: 300
123
+ gpu_count: 2
124
+ gpu_vram_gb: 48
125
+ environment:
126
+ python: null
127
+ cuda: null
128
+ pytorch: null
129
+ flash_attn: null
130
+ key_packages:
131
+ - vllm
132
+ - transformers
133
+ - docker
134
+ notes:
135
+ - 'The snapshot includes a dedicated Docker execution environment (kaka0605/exec_unit_test:24.12.30)
136
+ for sandboxed large-scale code execution of generated unit tests.
137
+
138
+ '
139
+ - See docker_source/Dockerfile and docker_source/requirements.txt for the sandbox
140
+ dependencies.
141
+ - See bundled README and requirements for the host-side dependency set.
142
+ - Inference uses multi-process Python (inference/inference_mp.py) and likely requires
143
+ vLLM or similar for efficient serving.
144
+ risk_flags:
145
+ - docker_required
146
+ - partial_snapshot
147
+ - external_model_for_full_replication
148
+ - code_execution_sandbox
149
+ risk_notes:
150
+ - 'The Docker sandbox (kaka0605/exec_unit_test:24.12.30) must be pulled or built locally
151
+ before running evaluation/evaluate.py. Without Docker, only the final calculate_result.py
152
+ step works on pre-computed outputs.
153
+
154
+ '
155
+ - 'The preprocessing scripts (preprocess/) are present but the full inference pipeline
156
+ requires serving a policy LLM and a reward LLM, which is not fully automated in
157
+ the snapshot.
158
+
159
+ '
160
+ - 'exec_main.py executes arbitrary generated Python code with a timeout mechanism
161
+ via signal.SIGALRM; this should only be run inside the provided Docker sandbox or
162
+ an equivalent isolated environment.
163
+
164
+ '
165
+ - 'No benchmark execution was performed during the packaging pass; metric values are
166
+ not yet validated.
167
+
168
+ '
169
+ recommended_when: 'Use this benchmark when you want to study how scaling LLM-generated
170
+ unit tests improves code reward signal quality and best-of-N code selection, or
171
+ when you need a code-generation task with real unit-test execution in the evaluation
172
+ loop. Also suitable for evaluating lightweight unit test generators against larger
173
+ teacher models.
174
+
175
+ '
176
+ not_recommended_when: 'Do not use this if you cannot provide Docker-based containerized
177
+ code execution, if you need a text-only reward-model benchmark without code execution,
178
+ or if you lack GPU resources for LLM inference (if you only need a quick metric
179
+ check, use the quick_check profile with pre-computed outputs instead).
180
+
181
+ '
182
+ paper:
183
+ title: Dynamic Scaling of Unit Tests for Code Reward Modeling
184
+ authors:
185
+ - Zeyao Ma
186
+ - Xiaokang Zhang
187
+ - Jing Zhang
188
+ - Jifan Yu
189
+ - Sijia Luo
190
+ - Jie Tang
191
+ venue: ACL 2025
192
+ year: 2025
193
+ url: https://arxiv.org/abs/2501.01054
194
+ homepage: https://code-reward-model.github.io/
195
+ download:
196
+ url: https://github.com/ResearAI/DeepScientist/releases/download/aisb-v0.0.1/aisb.t3.019_dyscaleut.zip
197
+ archive_type: zip
198
+ local_dir_name: paper-19-DyScaleUT
199
+ provider: github_release
200
+ repo: ResearAI/DeepScientist
201
+ tag: aisb-v0.0.1
202
+ asset_name: aisb.t3.019_dyscaleut.zip
203
+ sha256: 2eee5573353ade5e13c254f7372a3294b71459ee7c668205f27f2852347c141f
204
+ size_bytes: 60766
205
+ commercial:
206
+ annual_fee: null
207
+ display:
208
+ palette_seed: olive-ink-runtime
209
+ art_style: code-lab
210
+ accent_priority: high
211
+ image_path: ../image/019_aisb.t3.019_dyscaleut.jpg